1 /* $NetBSD: tsc.c,v 1.62 2025/05/06 04:34:59 imil Exp $ */
2
3 /*-
4 * Copyright (c) 2008, 2020 The NetBSD Foundation, Inc.
5 * All rights reserved.
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
9 * are met:
10 * 1. Redistributions of source code must retain the above copyright
11 * notice, this list of conditions and the following disclaimer.
12 * 2. Redistributions in binary form must reproduce the above copyright
13 * notice, this list of conditions and the following disclaimer in the
14 * documentation and/or other materials provided with the distribution.
15 *
16 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
17 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
18 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
19 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
20 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
21 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
22 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
23 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
24 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
25 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
26 * POSSIBILITY OF SUCH DAMAGE.
27 */
28
29 #include <sys/cdefs.h>
30 __KERNEL_RCSID(0, "$NetBSD: tsc.c,v 1.62 2025/05/06 04:34:59 imil Exp $");
31
32 #include <sys/param.h>
33 #include <sys/systm.h>
34 #include <sys/time.h>
35 #include <sys/timetc.h>
36 #include <sys/lwp.h>
37 #include <sys/atomic.h>
38 #include <sys/kernel.h>
39 #include <sys/cpu.h>
40 #include <sys/xcall.h>
41 #include <sys/lock.h>
42 #ifdef BOOTCYCLETIME
43 #include <sys/bootcycletime.h>
44 #endif
45
46 #include <machine/cpu_counter.h>
47 #include <machine/cpuvar.h>
48 #include <machine/cpufunc.h>
49 #include <machine/specialreg.h>
50 #include <machine/cputypes.h>
51
52 #include "tsc.h"
53
/* Number of handshake rounds run by tsc_sync_bp() and tsc_sync_ap(). */
#define TSC_SYNC_ROUNDS 1000
/* Absolute value.  NB: the argument is evaluated more than once. */
#define ABS(a) ((a) >= 0 ? (a) : -(a))

static u_int tsc_get_timecount(struct timecounter *);

static void tsc_delay(unsigned int);

/*
 * Scratch word written by tsc_read_bp() and read by tsc_post_ap() so that
 * the AP takes a cache miss right before it samples its TSC.
 */
static uint64_t tsc_dummy_cacheline __cacheline_aligned;
uint64_t tsc_freq __read_mostly; /* exported for sysctl */
#ifdef BOOTCYCLETIME
/* Low and high 32-bit halves of the start TSC value used by bootcycletime(). */
extern uint32_t starttsc_lo;
extern uint32_t starttsc_hi;
#endif
static int64_t tsc_drift_max = 1000; /* max cycles */
/* Largest absolute drift reported so far through tsc_sync_drift(). */
static int64_t tsc_drift_observed;
/*
 * Counter read routines.  They start out as the cpuid-serialized variants
 * and are re-pointed by tsc_setfunc().
 */
uint64_t (*rdtsc)(void) = rdtsc_cpuid;
uint64_t (*cpu_counter)(void) = cpu_counter_cpuid;
uint32_t (*cpu_counter32)(void) = cpu_counter32_cpuid;

/*
 * Not referenced anywhere else in this file.
 * NOTE(review): presumably the knob behind tsc_user_enable() and
 * tsc_user_disable() -- confirm against the users of this variable.
 */
int tsc_user_enabled = 1;

/* AP's TSC sample: posted by tsc_post_ap(), collected by tsc_read_bp(). */
static volatile int64_t tsc_sync_val;
/*
 * CPU whose handshake round is in flight: set by tsc_read_bp(), reset to
 * NULL by tsc_post_ap() once the sample has been posted.
 */
static volatile struct cpu_info *tsc_sync_cpu;
77
/*
 * timecounter(9) descriptor for the TSC.  tc_frequency is filled in by
 * tsc_tc_init(), which may also lower tc_quality to -100 before the
 * counter is registered with tc_init().
 */
static struct timecounter tsc_timecounter = {
	.tc_get_timecount = tsc_get_timecount,
	.tc_counter_mask = ~0U,
	.tc_name = "TSC",
	.tc_quality = 3000,
};
84
85 bool
tsc_is_invariant(void)86 tsc_is_invariant(void)
87 {
88 struct cpu_info *ci;
89 uint32_t descs[4];
90 uint32_t family;
91 bool invariant;
92
93 if (!cpu_hascounter())
94 return false;
95
96 ci = curcpu();
97 invariant = false;
98
99 if (cpu_vendor == CPUVENDOR_INTEL) {
100 /*
101 * From Intel(tm) 64 and IA-32 Architectures Software
102 * Developer's Manual Volume 3A: System Programming Guide,
103 * Part 1, 17.13 TIME_STAMP COUNTER, these are the processors
104 * where the TSC is known invariant:
105 *
106 * Pentium 4, Intel Xeon (family 0f, models 03 and higher)
107 * Core Solo and Core Duo processors (family 06, model 0e)
108 * Xeon 5100 series and Core 2 Duo (family 06, model 0f)
109 * Core 2 and Xeon (family 06, model 17)
110 * Atom (family 06, model 1c)
111 *
112 * We'll also assume that it's safe on the Pentium, and
113 * that it's safe on P-II and P-III Xeons due to the
114 * typical configuration of those systems.
115 *
116 */
117 switch (CPUID_TO_BASEFAMILY(ci->ci_signature)) {
118 case 0x05:
119 invariant = true;
120 break;
121 case 0x06:
122 invariant = CPUID_TO_MODEL(ci->ci_signature) == 0x0e ||
123 CPUID_TO_MODEL(ci->ci_signature) == 0x0f ||
124 CPUID_TO_MODEL(ci->ci_signature) == 0x17 ||
125 CPUID_TO_MODEL(ci->ci_signature) == 0x1c;
126 break;
127 case 0x0f:
128 invariant = CPUID_TO_MODEL(ci->ci_signature) >= 0x03;
129 break;
130 }
131 } else if (cpu_vendor == CPUVENDOR_AMD) {
132 /*
133 * TSC and Power Management Events on AMD Processors
134 * Nov 2, 2005 Rich Brunner, AMD Fellow
135 * http://lkml.org/lkml/2005/11/4/173
136 *
137 * See Appendix E.4.7 CPUID Fn8000_0007_EDX Advanced Power
138 * Management Features, AMD64 Architecture Programmer's
139 * Manual Volume 3: General-Purpose and System Instructions.
140 * The check is done below.
141 */
142
143 /*
144 * AMD Errata 778: Processor Core Time Stamp Counters May
145 * Experience Drift
146 *
147 * This affects all family 15h and family 16h processors.
148 */
149 switch (CPUID_TO_FAMILY(ci->ci_signature)) {
150 case 0x15:
151 case 0x16:
152 return false;
153 }
154 }
155
156 /*
157 * The best way to check whether the TSC counter is invariant or not
158 * is to check CPUID 80000007.
159 */
160 family = CPUID_TO_BASEFAMILY(ci->ci_signature);
161 if (((cpu_vendor == CPUVENDOR_INTEL) || (cpu_vendor == CPUVENDOR_AMD))
162 && ((family == 0x06) || (family == 0x0f))) {
163 x86_cpuid(0x80000000, descs);
164 if (descs[0] >= 0x80000007) {
165 x86_cpuid(0x80000007, descs);
166 invariant = (descs[3] & CPUID_APM_ITSC) != 0;
167 }
168 }
169
170 return invariant;
171 }
172
173 /* Setup function pointers for rdtsc() and timecounter(9). */
174 void
tsc_setfunc(struct cpu_info * ci)175 tsc_setfunc(struct cpu_info *ci)
176 {
177 bool use_lfence, use_mfence;
178
179 use_lfence = use_mfence = false;
180
181 /*
182 * XXX On AMD, we might be able to use lfence for some cases:
183 * a) if MSR_DE_CFG exist and the bit 1 is set.
184 * b) family == 0x0f or 0x11. Those have no MSR_DE_CFG and
185 * lfence is always serializing.
186 *
187 * We don't use it because the test result showed mfence was better
188 * than lfence with MSR_DE_CFG.
189 */
190 if (cpu_vendor == CPUVENDOR_AMD)
191 use_mfence = true;
192 else if (cpu_vendor == CPUVENDOR_INTEL)
193 use_lfence = true;
194
195 /* LFENCE and MFENCE are applicable if SSE2 is set. */
196 if ((ci->ci_feat_val[0] & CPUID_SSE2) == 0)
197 use_lfence = use_mfence = false;
198
199 #define TSC_SETFUNC(fence) \
200 do { \
201 rdtsc = rdtsc_##fence; \
202 cpu_counter = cpu_counter_##fence; \
203 cpu_counter32 = cpu_counter32_##fence; \
204 } while (/* CONSTCOND */ 0)
205
206 if (use_lfence)
207 TSC_SETFUNC(lfence);
208 else if (use_mfence)
209 TSC_SETFUNC(mfence);
210 else
211 TSC_SETFUNC(cpuid);
212
213 aprint_verbose_dev(ci->ci_dev, "Use %s to serialize rdtsc\n",
214 use_lfence ? "lfence" : (use_mfence ? "mfence" : "cpuid"));
215 }
216
217 /*
218 * Initialize timecounter(9) and DELAY() function of TSC.
219 *
220 * This function is called after all secondary processors were brought up
221 * and drift has been measured, and after any other potential delay funcs
222 * have been installed (e.g. lapic_delay()).
223 */
224 void
tsc_tc_init(void)225 tsc_tc_init(void)
226 {
227 struct cpu_info *ci;
228 bool invariant;
229
230 if (!cpu_hascounter())
231 return;
232
233 ci = curcpu();
234 tsc_freq = ci->ci_data.cpu_cc_freq;
235 invariant = tsc_is_invariant();
236 if (!invariant) {
237 aprint_debug("TSC not known invariant on this CPU\n");
238 tsc_timecounter.tc_quality = -100;
239 } else if (tsc_drift_observed > tsc_drift_max) {
240 aprint_error("ERROR: %lld cycle TSC drift observed\n",
241 (long long)tsc_drift_observed);
242 tsc_timecounter.tc_quality = -100;
243 invariant = false;
244 } else if (vm_guest == VM_GUEST_NO) {
245 delay_func = tsc_delay;
246 } else if (vm_guest == VM_GUEST_VIRTUALBOX) {
247 tsc_timecounter.tc_quality = -100;
248 }
249
250 if (tsc_freq != 0) {
251 tsc_timecounter.tc_frequency = tsc_freq;
252 tc_init(&tsc_timecounter);
253 }
254 }
255
256 /*
257 * Record drift (in clock cycles). Called during AP startup.
258 */
259 void
tsc_sync_drift(int64_t drift)260 tsc_sync_drift(int64_t drift)
261 {
262
263 if (drift < 0)
264 drift = -drift;
265 if (drift > tsc_drift_observed)
266 tsc_drift_observed = drift;
267 }
268
/*
 * Called during startup of APs, by the boot processor.  Interrupts
 * are disabled on entry.
 *
 * Runs one handshake round against the CPU described by ci, which is
 * expected to be executing tsc_post_ap(), and hands back both samples:
 *
 *	*bptscp	this CPU's TSC, read as soon as the other side clears
 *		CPUF_SYNCTSC
 *	*aptscp	the value the other side posted in tsc_sync_val
 *
 * Panics if tsc_sync_cpu is not in the expected state before or after
 * the round.
 */
static void __noinline
tsc_read_bp(struct cpu_info *ci, uint64_t *bptscp, uint64_t *aptscp)
{
	uint64_t bptsc;

	/* Claim the sync slot; it must be idle (NULL) between rounds. */
	if (atomic_swap_ptr(&tsc_sync_cpu, ci) != NULL) {
		panic("tsc_sync_bp: 1");
	}

	/* Prepare a cache miss for the other side. */
	(void)atomic_swap_uint((void *)&tsc_dummy_cacheline, 0);

	/* Flag our readiness. */
	atomic_or_uint(&ci->ci_flags, CPUF_SYNCTSC);

	/* Wait for other side then read our TSC. */
	while ((ci->ci_flags & CPUF_SYNCTSC) != 0) {
		__insn_barrier();
	}
	bptsc = rdtsc();

	/* Wait for the results to come in. */
	while (tsc_sync_cpu == ci) {
		x86_pause();
	}
	/* The other side releases the slot by storing NULL. */
	if (tsc_sync_cpu != NULL) {
		panic("tsc_sync_bp: 2");
	}

	*bptscp = bptsc;
	*aptscp = tsc_sync_val;
}
305
306 void
tsc_sync_bp(struct cpu_info * ci)307 tsc_sync_bp(struct cpu_info *ci)
308 {
309 int64_t bptsc, aptsc, val, diff;
310
311 if (!cpu_hascounter())
312 return;
313
314 val = INT64_MAX;
315 for (int i = 0; i < TSC_SYNC_ROUNDS; i++) {
316 tsc_read_bp(ci, &bptsc, &aptsc);
317 diff = bptsc - aptsc;
318 if (ABS(diff) < ABS(val)) {
319 val = diff;
320 }
321 }
322
323 ci->ci_data.cpu_cc_skew = val;
324 }
325
/*
 * Called during startup of AP, by the AP itself.  Interrupts are
 * disabled on entry.
 *
 * Runs one handshake round: waits for CPUF_SYNCTSC to be raised in
 * ci->ci_flags, clears it, samples the local TSC, publishes the sample
 * in tsc_sync_val and finally releases tsc_sync_cpu by storing NULL.
 * Panics if tsc_sync_cpu did not point at ci.
 */
static void __noinline
tsc_post_ap(struct cpu_info *ci)
{
	uint64_t tsc;

	/* Wait for go-ahead from primary. */
	while ((ci->ci_flags & CPUF_SYNCTSC) == 0) {
		__insn_barrier();
	}

	/* Instruct primary to read its counter. */
	atomic_and_uint(&ci->ci_flags, ~CPUF_SYNCTSC);

	/*
	 * Suffer a cache miss, then read TSC.  The loaded word is added
	 * into the sample -- presumably so that the load cannot be
	 * dropped; tsc_read_bp() stores 0 there.
	 */
	__insn_barrier();
	tsc = tsc_dummy_cacheline;
	__insn_barrier();
	tsc += rdtsc();

	/* Post result.  Ensure the whole value goes out atomically. */
	(void)atomic_swap_64(&tsc_sync_val, tsc);

	/* Release the slot; it must still have been ours. */
	if (atomic_swap_ptr(&tsc_sync_cpu, NULL) != ci) {
		panic("tsc_sync_ap");
	}
}
356
357 void
tsc_sync_ap(struct cpu_info * ci)358 tsc_sync_ap(struct cpu_info *ci)
359 {
360
361 if (!cpu_hascounter())
362 return;
363
364 for (int i = 0; i < TSC_SYNC_ROUNDS; i++) {
365 tsc_post_ap(ci);
366 }
367 }
368
369 static void
tsc_apply_cpu(void * arg1,void * arg2)370 tsc_apply_cpu(void *arg1, void *arg2)
371 {
372 bool enable = arg1 != NULL;
373 if (enable) {
374 lcr4(rcr4() & ~CR4_TSD);
375 } else {
376 lcr4(rcr4() | CR4_TSD);
377 }
378 }
379
380 void
tsc_user_enable(void)381 tsc_user_enable(void)
382 {
383 uint64_t xc;
384
385 xc = xc_broadcast(0, tsc_apply_cpu, (void *)true, NULL);
386 xc_wait(xc);
387 }
388
389 void
tsc_user_disable(void)390 tsc_user_disable(void)
391 {
392 uint64_t xc;
393
394 xc = xc_broadcast(0, tsc_apply_cpu, (void *)false, NULL);
395 xc_wait(xc);
396 }
397
398 uint64_t
cpu_frequency(struct cpu_info * ci)399 cpu_frequency(struct cpu_info *ci)
400 {
401
402 return ci->ci_data.cpu_cc_freq;
403 }
404
405 int
cpu_hascounter(void)406 cpu_hascounter(void)
407 {
408
409 return cpu_feature[0] & CPUID_TSC;
410 }
411
412 static void
tsc_delay(unsigned int us)413 tsc_delay(unsigned int us)
414 {
415 uint64_t start, delta;
416
417 start = cpu_counter();
418 delta = (uint64_t)us * tsc_freq / 1000000;
419
420 while ((cpu_counter() - start) < delta) {
421 x86_pause();
422 }
423 }
424
425 static u_int
tsc_get_timecount(struct timecounter * tc)426 tsc_get_timecount(struct timecounter *tc)
427 {
428 #if defined(_LP64) && defined(DIAGNOSTIC) /* requires atomic 64-bit store */
429 static __cpu_simple_lock_t lock = __SIMPLELOCK_UNLOCKED;
430 static int lastwarn;
431 uint64_t cur, prev;
432 lwp_t *l = curlwp;
433 int ticks;
434
435 /*
436 * Previous value must be read before the counter and stored to
437 * after, because this routine can be called from interrupt context
438 * and may run over the top of an existing invocation. Ordering is
439 * guaranteed by "volatile" on md_tsc.
440 */
441 prev = l->l_md.md_tsc;
442 cur = cpu_counter();
443 if (__predict_false(cur < prev) && (cur >> 63) == (prev >> 63) &&
444 __cpu_simple_lock_try(&lock)) {
445 ticks = getticks();
446 if (ticks - lastwarn >= hz) {
447 printf(
448 "WARNING: %s TSC went backwards by %u - "
449 "change sysctl(7) kern.timecounter?\n",
450 cpu_name(curcpu()), (unsigned)(prev - cur));
451 lastwarn = ticks;
452 }
453 __cpu_simple_unlock(&lock);
454 }
455 l->l_md.md_tsc = cur;
456 return (uint32_t)cur;
457 #else
458 return cpu_counter32();
459 #endif
460 }
461
462 /*
463 * tsc has been reset; zero the cached tsc of every lwp in the system
464 * so we don't spuriously report that the tsc has gone backward.
465 * Caller must ensure all LWPs are quiescent (except the current one,
466 * obviously) and interrupts are blocked while we update this.
467 */
468 void
tsc_tc_reset(void)469 tsc_tc_reset(void)
470 {
471 struct lwp *l;
472
473 LIST_FOREACH(l, &alllwp, l_list)
474 l->l_md.md_tsc = 0;
475 }
476
#ifdef BOOTCYCLETIME
/*
 * Return the time, in milliseconds, that has elapsed between the TSC
 * value held in starttsc_hi:starttsc_lo and now, using the current CPU's
 * cycle counter frequency for the conversion.
 *
 * Must be called on the primary CPU (asserted).  Returns 0 when the
 * frequency is not yet known or is below 1000 Hz, since the conversion
 * would otherwise divide by zero.
 */
uint64_t
bootcycletime(void)
{
	uint64_t start, cycles_per_ms;

	KASSERT(curcpu_stable());
	KASSERT(CPU_IS_PRIMARY(curcpu()));

	cycles_per_ms = curcpu()->ci_data.cpu_cc_freq / 1000;
	if (cycles_per_ms == 0)
		return 0;

	start = (uint64_t)starttsc_hi << 32 | starttsc_lo;
	return (rdtsc() - start) / cycles_per_ms;
}
#endif
488