1 /*        $NetBSD: tsc.c,v 1.62 2025/05/06 04:34:59 imil Exp $        */
2 
3 /*-
4  * Copyright (c) 2008, 2020 The NetBSD Foundation, Inc.
5  * All rights reserved.
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice, this list of conditions and the following disclaimer.
12  * 2. Redistributions in binary form must reproduce the above copyright
13  *    notice, this list of conditions and the following disclaimer in the
14  *    documentation and/or other materials provided with the distribution.
15  *
16  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
17  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
18  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
19  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
20  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
21  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
22  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
23  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
24  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
25  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
26  * POSSIBILITY OF SUCH DAMAGE.
27  */
28 
29 #include <sys/cdefs.h>
30 __KERNEL_RCSID(0, "$NetBSD: tsc.c,v 1.62 2025/05/06 04:34:59 imil Exp $");
31 
32 #include <sys/param.h>
33 #include <sys/systm.h>
34 #include <sys/time.h>
35 #include <sys/timetc.h>
36 #include <sys/lwp.h>
37 #include <sys/atomic.h>
38 #include <sys/kernel.h>
39 #include <sys/cpu.h>
40 #include <sys/xcall.h>
41 #include <sys/lock.h>
42 #ifdef BOOTCYCLETIME
43 #include <sys/bootcycletime.h>
44 #endif
45 
46 #include <machine/cpu_counter.h>
47 #include <machine/cpuvar.h>
48 #include <machine/cpufunc.h>
49 #include <machine/specialreg.h>
50 #include <machine/cputypes.h>
51 
52 #include "tsc.h"
53 
54 #define   TSC_SYNC_ROUNDS               1000
55 #define   ABS(a)                        ((a) >= 0 ? (a) : -(a))
56 
57 static u_int        tsc_get_timecount(struct timecounter *);
58 
59 static void         tsc_delay(unsigned int);
60 
61 static uint64_t     tsc_dummy_cacheline __cacheline_aligned;
62 uint64_t  tsc_freq __read_mostly;       /* exported for sysctl */
63 #ifdef BOOTCYCLETIME
64 extern uint32_t     starttsc_lo;
65 extern uint32_t     starttsc_hi;
66 #endif
67 static int64_t      tsc_drift_max = 1000;         /* max cycles */
68 static int64_t      tsc_drift_observed;
69 uint64_t  (*rdtsc)(void) = rdtsc_cpuid;
70 uint64_t  (*cpu_counter)(void) = cpu_counter_cpuid;
71 uint32_t  (*cpu_counter32)(void) = cpu_counter32_cpuid;
72 
73 int tsc_user_enabled = 1;
74 
75 static volatile int64_t       tsc_sync_val;
76 static volatile struct cpu_info         *tsc_sync_cpu;
77 
78 static struct timecounter tsc_timecounter = {
79           .tc_get_timecount = tsc_get_timecount,
80           .tc_counter_mask = ~0U,
81           .tc_name = "TSC",
82           .tc_quality = 3000,
83 };
84 
85 bool
tsc_is_invariant(void)86 tsc_is_invariant(void)
87 {
88           struct cpu_info *ci;
89           uint32_t descs[4];
90           uint32_t family;
91           bool invariant;
92 
93           if (!cpu_hascounter())
94                     return false;
95 
96           ci = curcpu();
97           invariant = false;
98 
99           if (cpu_vendor == CPUVENDOR_INTEL) {
100                     /*
101                      * From Intel(tm) 64 and IA-32 Architectures Software
102                      * Developer's Manual Volume 3A: System Programming Guide,
103                      * Part 1, 17.13 TIME_STAMP COUNTER, these are the processors
104                      * where the TSC is known invariant:
105                      *
106                      * Pentium 4, Intel Xeon (family 0f, models 03 and higher)
107                      * Core Solo and Core Duo processors (family 06, model 0e)
108                      * Xeon 5100 series and Core 2 Duo (family 06, model 0f)
109                      * Core 2 and Xeon (family 06, model 17)
110                      * Atom (family 06, model 1c)
111                      *
112                      * We'll also assume that it's safe on the Pentium, and
113                      * that it's safe on P-II and P-III Xeons due to the
114                      * typical configuration of those systems.
115                      *
116                      */
117                     switch (CPUID_TO_BASEFAMILY(ci->ci_signature)) {
118                     case 0x05:
119                               invariant = true;
120                               break;
121                     case 0x06:
122                               invariant = CPUID_TO_MODEL(ci->ci_signature) == 0x0e ||
123                                   CPUID_TO_MODEL(ci->ci_signature) == 0x0f ||
124                                   CPUID_TO_MODEL(ci->ci_signature) == 0x17 ||
125                                   CPUID_TO_MODEL(ci->ci_signature) == 0x1c;
126                               break;
127                     case 0x0f:
128                               invariant = CPUID_TO_MODEL(ci->ci_signature) >= 0x03;
129                               break;
130                     }
131           } else if (cpu_vendor == CPUVENDOR_AMD) {
132                     /*
133                      * TSC and Power Management Events on AMD Processors
134                      * Nov 2, 2005 Rich Brunner, AMD Fellow
135                      * http://lkml.org/lkml/2005/11/4/173
136                      *
137                      * See Appendix E.4.7 CPUID Fn8000_0007_EDX Advanced Power
138                      * Management Features, AMD64 Architecture Programmer's
139                      * Manual Volume 3: General-Purpose and System Instructions.
140                      * The check is done below.
141                      */
142 
143                     /*
144                      * AMD Errata 778: Processor Core Time Stamp Counters May
145                      * Experience Drift
146                      *
147                      * This affects all family 15h and family 16h processors.
148                      */
149                     switch (CPUID_TO_FAMILY(ci->ci_signature)) {
150                     case 0x15:
151                     case 0x16:
152                               return false;
153                     }
154           }
155 
156           /*
157            * The best way to check whether the TSC counter is invariant or not
158            * is to check CPUID 80000007.
159            */
160           family = CPUID_TO_BASEFAMILY(ci->ci_signature);
161           if (((cpu_vendor == CPUVENDOR_INTEL) || (cpu_vendor == CPUVENDOR_AMD))
162               && ((family == 0x06) || (family == 0x0f))) {
163                     x86_cpuid(0x80000000, descs);
164                     if (descs[0] >= 0x80000007) {
165                               x86_cpuid(0x80000007, descs);
166                               invariant = (descs[3] & CPUID_APM_ITSC) != 0;
167                     }
168           }
169 
170           return invariant;
171 }
172 
173 /* Setup function pointers for rdtsc() and timecounter(9). */
174 void
tsc_setfunc(struct cpu_info * ci)175 tsc_setfunc(struct cpu_info *ci)
176 {
177           bool use_lfence, use_mfence;
178 
179           use_lfence = use_mfence = false;
180 
181           /*
182            * XXX On AMD, we might be able to use lfence for some cases:
183            *   a) if MSR_DE_CFG exist and the bit 1 is set.
184            *   b) family == 0x0f or 0x11. Those have no MSR_DE_CFG and
185            *      lfence is always serializing.
186            *
187            * We don't use it because the test result showed mfence was better
188            * than lfence with MSR_DE_CFG.
189            */
190           if (cpu_vendor == CPUVENDOR_AMD)
191                     use_mfence = true;
192           else if (cpu_vendor == CPUVENDOR_INTEL)
193                     use_lfence = true;
194 
195           /* LFENCE and MFENCE are applicable if SSE2 is set. */
196           if ((ci->ci_feat_val[0] & CPUID_SSE2) == 0)
197                     use_lfence = use_mfence = false;
198 
199 #define TSC_SETFUNC(fence)                                                            \
200           do {                                                                                  \
201                     rdtsc = rdtsc_##fence;                                                      \
202                     cpu_counter = cpu_counter_##fence;                                \
203                     cpu_counter32 = cpu_counter32_##fence;                            \
204           } while (/* CONSTCOND */ 0)
205 
206           if (use_lfence)
207                     TSC_SETFUNC(lfence);
208           else if (use_mfence)
209                     TSC_SETFUNC(mfence);
210           else
211                     TSC_SETFUNC(cpuid);
212 
213           aprint_verbose_dev(ci->ci_dev, "Use %s to serialize rdtsc\n",
214               use_lfence ? "lfence" : (use_mfence ? "mfence" : "cpuid"));
215 }
216 
217 /*
218  * Initialize timecounter(9) and DELAY() function of TSC.
219  *
220  * This function is called after all secondary processors were brought up
221  * and drift has been measured, and after any other potential delay funcs
222  * have been installed (e.g. lapic_delay()).
223  */
224 void
tsc_tc_init(void)225 tsc_tc_init(void)
226 {
227           struct cpu_info *ci;
228           bool invariant;
229 
230           if (!cpu_hascounter())
231                     return;
232 
233           ci = curcpu();
234           tsc_freq = ci->ci_data.cpu_cc_freq;
235           invariant = tsc_is_invariant();
236           if (!invariant) {
237                     aprint_debug("TSC not known invariant on this CPU\n");
238                     tsc_timecounter.tc_quality = -100;
239           } else if (tsc_drift_observed > tsc_drift_max) {
240                     aprint_error("ERROR: %lld cycle TSC drift observed\n",
241                         (long long)tsc_drift_observed);
242                     tsc_timecounter.tc_quality = -100;
243                     invariant = false;
244           } else if (vm_guest == VM_GUEST_NO) {
245                     delay_func = tsc_delay;
246           } else if (vm_guest == VM_GUEST_VIRTUALBOX) {
247                     tsc_timecounter.tc_quality = -100;
248           }
249 
250           if (tsc_freq != 0) {
251                     tsc_timecounter.tc_frequency = tsc_freq;
252                     tc_init(&tsc_timecounter);
253           }
254 }
255 
256 /*
257  * Record drift (in clock cycles).  Called during AP startup.
258  */
259 void
tsc_sync_drift(int64_t drift)260 tsc_sync_drift(int64_t drift)
261 {
262 
263           if (drift < 0)
264                     drift = -drift;
265           if (drift > tsc_drift_observed)
266                     tsc_drift_observed = drift;
267 }
268 
269 /*
270  * Called during startup of APs, by the boot processor.  Interrupts
271  * are disabled on entry.
272  */
273 static void __noinline
tsc_read_bp(struct cpu_info * ci,uint64_t * bptscp,uint64_t * aptscp)274 tsc_read_bp(struct cpu_info *ci, uint64_t *bptscp, uint64_t *aptscp)
275 {
276           uint64_t bptsc;
277 
278           if (atomic_swap_ptr(&tsc_sync_cpu, ci) != NULL) {
279                     panic("tsc_sync_bp: 1");
280           }
281 
282           /* Prepare a cache miss for the other side. */
283           (void)atomic_swap_uint((void *)&tsc_dummy_cacheline, 0);
284 
285           /* Flag our readiness. */
286           atomic_or_uint(&ci->ci_flags, CPUF_SYNCTSC);
287 
288           /* Wait for other side then read our TSC. */
289           while ((ci->ci_flags & CPUF_SYNCTSC) != 0) {
290                     __insn_barrier();
291           }
292           bptsc = rdtsc();
293 
294           /* Wait for the results to come in. */
295           while (tsc_sync_cpu == ci) {
296                     x86_pause();
297           }
298           if (tsc_sync_cpu != NULL) {
299                     panic("tsc_sync_bp: 2");
300           }
301 
302           *bptscp = bptsc;
303           *aptscp = tsc_sync_val;
304 }
305 
306 void
tsc_sync_bp(struct cpu_info * ci)307 tsc_sync_bp(struct cpu_info *ci)
308 {
309           int64_t bptsc, aptsc, val, diff;
310 
311           if (!cpu_hascounter())
312                     return;
313 
314           val = INT64_MAX;
315           for (int i = 0; i < TSC_SYNC_ROUNDS; i++) {
316                     tsc_read_bp(ci, &bptsc, &aptsc);
317                     diff = bptsc - aptsc;
318                     if (ABS(diff) < ABS(val)) {
319                               val = diff;
320                     }
321           }
322 
323           ci->ci_data.cpu_cc_skew = val;
324 }
325 
326 /*
327  * Called during startup of AP, by the AP itself.  Interrupts are
328  * disabled on entry.
329  */
330 static void __noinline
tsc_post_ap(struct cpu_info * ci)331 tsc_post_ap(struct cpu_info *ci)
332 {
333           uint64_t tsc;
334 
335           /* Wait for go-ahead from primary. */
336           while ((ci->ci_flags & CPUF_SYNCTSC) == 0) {
337                     __insn_barrier();
338           }
339 
340           /* Instruct primary to read its counter. */
341           atomic_and_uint(&ci->ci_flags, ~CPUF_SYNCTSC);
342 
343           /* Suffer a cache miss, then read TSC. */
344           __insn_barrier();
345           tsc = tsc_dummy_cacheline;
346           __insn_barrier();
347           tsc += rdtsc();
348 
349           /* Post result.  Ensure the whole value goes out atomically. */
350           (void)atomic_swap_64(&tsc_sync_val, tsc);
351 
352           if (atomic_swap_ptr(&tsc_sync_cpu, NULL) != ci) {
353                     panic("tsc_sync_ap");
354           }
355 }
356 
357 void
tsc_sync_ap(struct cpu_info * ci)358 tsc_sync_ap(struct cpu_info *ci)
359 {
360 
361           if (!cpu_hascounter())
362                     return;
363 
364           for (int i = 0; i < TSC_SYNC_ROUNDS; i++) {
365                     tsc_post_ap(ci);
366           }
367 }
368 
369 static void
tsc_apply_cpu(void * arg1,void * arg2)370 tsc_apply_cpu(void *arg1, void *arg2)
371 {
372           bool enable = arg1 != NULL;
373           if (enable) {
374                     lcr4(rcr4() & ~CR4_TSD);
375           } else {
376                     lcr4(rcr4() | CR4_TSD);
377           }
378 }
379 
380 void
tsc_user_enable(void)381 tsc_user_enable(void)
382 {
383           uint64_t xc;
384 
385           xc = xc_broadcast(0, tsc_apply_cpu, (void *)true, NULL);
386           xc_wait(xc);
387 }
388 
389 void
tsc_user_disable(void)390 tsc_user_disable(void)
391 {
392           uint64_t xc;
393 
394           xc = xc_broadcast(0, tsc_apply_cpu, (void *)false, NULL);
395           xc_wait(xc);
396 }
397 
398 uint64_t
cpu_frequency(struct cpu_info * ci)399 cpu_frequency(struct cpu_info *ci)
400 {
401 
402           return ci->ci_data.cpu_cc_freq;
403 }
404 
405 int
cpu_hascounter(void)406 cpu_hascounter(void)
407 {
408 
409           return cpu_feature[0] & CPUID_TSC;
410 }
411 
412 static void
tsc_delay(unsigned int us)413 tsc_delay(unsigned int us)
414 {
415           uint64_t start, delta;
416 
417           start = cpu_counter();
418           delta = (uint64_t)us * tsc_freq / 1000000;
419 
420           while ((cpu_counter() - start) < delta) {
421                     x86_pause();
422           }
423 }
424 
425 static u_int
tsc_get_timecount(struct timecounter * tc)426 tsc_get_timecount(struct timecounter *tc)
427 {
428 #if defined(_LP64) && defined(DIAGNOSTIC) /* requires atomic 64-bit store */
429           static __cpu_simple_lock_t lock = __SIMPLELOCK_UNLOCKED;
430           static int lastwarn;
431           uint64_t cur, prev;
432           lwp_t *l = curlwp;
433           int ticks;
434 
435           /*
436            * Previous value must be read before the counter and stored to
437            * after, because this routine can be called from interrupt context
438            * and may run over the top of an existing invocation.  Ordering is
439            * guaranteed by "volatile" on md_tsc.
440            */
441           prev = l->l_md.md_tsc;
442           cur = cpu_counter();
443           if (__predict_false(cur < prev) && (cur >> 63) == (prev >> 63) &&
444               __cpu_simple_lock_try(&lock)) {
445                     ticks = getticks();
446                     if (ticks - lastwarn >= hz) {
447                               printf(
448                                   "WARNING: %s TSC went backwards by %u - "
449                                   "change sysctl(7) kern.timecounter?\n",
450                                   cpu_name(curcpu()), (unsigned)(prev - cur));
451                               lastwarn = ticks;
452                     }
453                     __cpu_simple_unlock(&lock);
454           }
455           l->l_md.md_tsc = cur;
456           return (uint32_t)cur;
457 #else
458           return cpu_counter32();
459 #endif
460 }
461 
462 /*
463  * tsc has been reset; zero the cached tsc of every lwp in the system
464  * so we don't spuriously report that the tsc has gone backward.
465  * Caller must ensure all LWPs are quiescent (except the current one,
466  * obviously) and interrupts are blocked while we update this.
467  */
468 void
tsc_tc_reset(void)469 tsc_tc_reset(void)
470 {
471           struct lwp *l;
472 
473           LIST_FOREACH(l, &alllwp, l_list)
474                     l->l_md.md_tsc = 0;
475 }
476 
477 #ifdef BOOTCYCLETIME
478 /* Returns the kernel boot time in milliseconds. */
479 uint64_t
bootcycletime(void)480 bootcycletime(void)
481 {
482           KASSERT(curcpu_stable());
483           KASSERT(CPU_IS_PRIMARY(curcpu()));
484           return (rdtsc() - ((uint64_t)starttsc_hi << 32 | starttsc_lo)) /
485               (curcpu()->ci_data.cpu_cc_freq / 1000);
486 }
487 #endif
488