xref: /freebsd-13-stable/sys/x86/x86/pvclock.c (revision 3bc80996974a61a4223eae4c1ccd47b6ee32a48a)
1 /*-
2  * Copyright (c) 2009 Adrian Chadd
3  * Copyright (c) 2012 Spectra Logic Corporation
4  * Copyright (c) 2014 Bryan Venteicher
5  * All rights reserved.
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice, this list of conditions and the following disclaimer.
12  * 2. Redistributions in binary form must reproduce the above copyright
13  *    notice, this list of conditions and the following disclaimer in the
14  *    documentation and/or other materials provided with the distribution.
15  *
16  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
20  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
22  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
24  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
26  * SUCH DAMAGE.
27  */
28 
29 #include <sys/cdefs.h>
30 #include <sys/param.h>
31 #include <sys/systm.h>
32 #include <sys/bus.h>
33 #include <sys/clock.h>
34 #include <sys/conf.h>
35 #include <sys/fcntl.h>
36 #include <sys/limits.h>
37 #include <sys/mman.h>
38 #include <sys/proc.h>
39 #include <sys/smp.h>
40 #include <sys/sysctl.h>
41 #include <sys/vdso.h>
42 
43 #include <vm/vm.h>
44 #include <vm/pmap.h>
45 
46 #include <machine/atomic.h>
47 #include <machine/cpufunc.h>
48 #include <machine/md_var.h>
49 #include <machine/pvclock.h>
50 
/*
 * High-water mark of the system time values handed out so far.  Readers that
 * cannot rely on PVCLOCK_FLAG_TSC_STABLE clamp their result against it, which
 * keeps the kernel codepath monotonically non-decreasing; the vDSO codepath
 * only receives a snapshot of it and therefore merely approximates the same
 * guarantee.  Barring hypervisor bug(s) or residual TSC jitter that slips past
 * the hypervisor, this should in theory never be needed.
 */
static volatile uint64_t pvclock_last_systime;
59 
60 static uint64_t		 pvclock_getsystime(struct pvclock *pvc);
61 static void		 pvclock_read_time_info(
62     struct pvclock_vcpu_time_info *ti, uint64_t *ns, uint8_t *flags);
63 static void		 pvclock_read_wall_clock(struct pvclock_wall_clock *wc,
64     struct timespec *ts);
65 static u_int		 pvclock_tc_get_timecount(struct timecounter *tc);
66 static uint32_t		 pvclock_tc_vdso_timehands(
67     struct vdso_timehands *vdso_th, struct timecounter *tc);
68 #ifdef COMPAT_FREEBSD32
69 static uint32_t		 pvclock_tc_vdso_timehands32(
70     struct vdso_timehands32 *vdso_th, struct timecounter *tc);
71 #endif
72 
73 static d_open_t		 pvclock_cdev_open;
74 static d_mmap_t		 pvclock_cdev_mmap;
75 
76 static struct cdevsw	 pvclock_cdev_cdevsw = {
77 	.d_version =	D_VERSION,
78 	.d_name =	PVCLOCK_CDEVNAME,
79 	.d_open =	pvclock_cdev_open,
80 	.d_mmap =	pvclock_cdev_mmap,
81 };
82 
83 void
pvclock_resume(void)84 pvclock_resume(void)
85 {
86 	atomic_store_rel_64(&pvclock_last_systime, 0);
87 }
88 
89 uint64_t
pvclock_tsc_freq(struct pvclock_vcpu_time_info * ti)90 pvclock_tsc_freq(struct pvclock_vcpu_time_info *ti)
91 {
92 	uint64_t freq;
93 
94 	freq = (1000000000ULL << 32) / ti->tsc_to_system_mul;
95 	if (ti->tsc_shift < 0)
96 		freq <<= -ti->tsc_shift;
97 	else
98 		freq >>= ti->tsc_shift;
99 	return (freq);
100 }
101 
102 static void
pvclock_read_time_info(struct pvclock_vcpu_time_info * ti,uint64_t * ns,uint8_t * flags)103 pvclock_read_time_info(struct pvclock_vcpu_time_info *ti,
104     uint64_t *ns, uint8_t *flags)
105 {
106 	uint64_t delta;
107 	uint32_t version;
108 
109 	do {
110 		version = atomic_load_acq_32(&ti->version);
111 		delta = rdtsc_ordered() - ti->tsc_timestamp;
112 		*ns = ti->system_time + pvclock_scale_delta(delta,
113 		    ti->tsc_to_system_mul, ti->tsc_shift);
114 		*flags = ti->flags;
115 		atomic_thread_fence_acq();
116 	} while ((ti->version & 1) != 0 || ti->version != version);
117 }
118 
119 static void
pvclock_read_wall_clock(struct pvclock_wall_clock * wc,struct timespec * ts)120 pvclock_read_wall_clock(struct pvclock_wall_clock *wc, struct timespec *ts)
121 {
122 	uint32_t version;
123 
124 	do {
125 		version = atomic_load_acq_32(&wc->version);
126 		ts->tv_sec = wc->sec;
127 		ts->tv_nsec = wc->nsec;
128 		atomic_thread_fence_acq();
129 	} while ((wc->version & 1) != 0 || wc->version != version);
130 }
131 
132 static uint64_t
pvclock_getsystime(struct pvclock * pvc)133 pvclock_getsystime(struct pvclock *pvc)
134 {
135 	uint64_t now, last, ret;
136 	uint8_t flags;
137 
138 	critical_enter();
139 	pvclock_read_time_info(&pvc->timeinfos[curcpu], &now, &flags);
140 	ret = now;
141 	if ((flags & PVCLOCK_FLAG_TSC_STABLE) == 0) {
142 		last = atomic_load_acq_64(&pvclock_last_systime);
143 		do {
144 			if (last > now) {
145 				ret = last;
146 				break;
147 			}
148 		} while (!atomic_fcmpset_rel_64(&pvclock_last_systime, &last,
149 		    now));
150 	}
151 	critical_exit();
152 	return (ret);
153 }
154 
155 /*
156  * NOTE: Transitional-only; this should be removed after 'dev/xen/timer/timer.c'
157  * has been migrated to the 'struct pvclock' API.
158  */
159 uint64_t
pvclock_get_timecount(struct pvclock_vcpu_time_info * ti)160 pvclock_get_timecount(struct pvclock_vcpu_time_info *ti)
161 {
162 	uint64_t now, last, ret;
163 	uint8_t flags;
164 
165 	pvclock_read_time_info(ti, &now, &flags);
166 	ret = now;
167 	if ((flags & PVCLOCK_FLAG_TSC_STABLE) == 0) {
168 		last = atomic_load_acq_64(&pvclock_last_systime);
169 		do {
170 			if (last > now) {
171 				ret = last;
172 				break;
173 			}
174 		} while (!atomic_fcmpset_rel_64(&pvclock_last_systime, &last,
175 		    now));
176 	}
177 	return (ret);
178 }
179 
/*
 * Public wrapper around pvclock_read_wall_clock(): fills '*ts' from the wall
 * clock record 'wc'.
 *
 * NOTE: Transitional-only; to be removed once 'dev/xen/timer/timer.c' has
 * been migrated to the 'struct pvclock' API.
 */
void
pvclock_get_wallclock(struct pvclock_wall_clock *wc, struct timespec *ts)
{

	pvclock_read_wall_clock(wc, ts);
}
189 
190 static int
pvclock_cdev_open(struct cdev * dev,int oflags,int devtype,struct thread * td)191 pvclock_cdev_open(struct cdev *dev, int oflags, int devtype, struct thread *td)
192 {
193 	if (oflags & FWRITE)
194 		return (EPERM);
195 	return (0);
196 }
197 
198 static int
pvclock_cdev_mmap(struct cdev * dev,vm_ooffset_t offset,vm_paddr_t * paddr,int nprot,vm_memattr_t * memattr)199 pvclock_cdev_mmap(struct cdev *dev, vm_ooffset_t offset, vm_paddr_t *paddr,
200     int nprot, vm_memattr_t *memattr)
201 {
202 	if (offset >= mp_ncpus * sizeof(struct pvclock_vcpu_time_info))
203 		return (EINVAL);
204 	if (PROT_EXTRACT(nprot) != PROT_READ)
205 		return (EACCES);
206 	*paddr = vtophys((uintptr_t)dev->si_drv1 + offset);
207 	*memattr = VM_MEMATTR_DEFAULT;
208 	return (0);
209 }
210 
211 static u_int
pvclock_tc_get_timecount(struct timecounter * tc)212 pvclock_tc_get_timecount(struct timecounter *tc)
213 {
214 	struct pvclock *pvc = tc->tc_priv;
215 
216 	return (pvclock_getsystime(pvc) & UINT_MAX);
217 }
218 
219 static uint32_t
pvclock_tc_vdso_timehands(struct vdso_timehands * vdso_th,struct timecounter * tc)220 pvclock_tc_vdso_timehands(struct vdso_timehands *vdso_th,
221     struct timecounter *tc)
222 {
223 	struct pvclock *pvc = tc->tc_priv;
224 
225 	if (pvc->cdev == NULL)
226 		return (0);
227 
228 	vdso_th->th_algo = VDSO_TH_ALGO_X86_PVCLK;
229 	vdso_th->th_x86_shift = 0;
230 	vdso_th->th_x86_hpet_idx = 0;
231 	vdso_th->th_x86_pvc_last_systime =
232 	    atomic_load_acq_64(&pvclock_last_systime);
233 	vdso_th->th_x86_pvc_stable_mask = !pvc->vdso_force_unstable &&
234 	    pvc->stable_flag_supported ? PVCLOCK_FLAG_TSC_STABLE : 0;
235 	bzero(vdso_th->th_res, sizeof(vdso_th->th_res));
236 	return ((amd_feature & AMDID_RDTSCP) != 0 ||
237 	    ((vdso_th->th_x86_pvc_stable_mask & PVCLOCK_FLAG_TSC_STABLE) != 0 &&
238 	    pvc->vdso_enable_without_rdtscp));
239 }
240 
#ifdef COMPAT_FREEBSD32
/*
 * 32-bit compat counterpart of pvclock_tc_vdso_timehands(): fills the
 * pvclock-specific members of a 'struct vdso_timehands32'.
 *
 * Returns 0 without touching '*vdso_th' when no cdev exists.  Otherwise the
 * structure is populated and the return value is 1 when either RDTSCP is
 * advertised in 'amd_feature', or the stable flag is exposed to the vDSO and
 * 'vdso_enable_without_rdtscp' is set; 0 in all remaining cases.
 */
static uint32_t
pvclock_tc_vdso_timehands32(struct vdso_timehands32 *vdso_th,
    struct timecounter *tc)
{
	struct pvclock *pvc;

	pvc = tc->tc_priv;
	if (pvc->cdev == NULL)
		return (0);

	vdso_th->th_algo = VDSO_TH_ALGO_X86_PVCLK;
	vdso_th->th_x86_shift = 0;
	vdso_th->th_x86_hpet_idx = 0;
	/* The 64-bit value is written through the start of the member. */
	*(uint64_t *)&vdso_th->th_x86_pvc_last_systime[0] =
	    atomic_load_acq_64(&pvclock_last_systime);
	/* Only expose the stable flag if supported and not suppressed. */
	if (!pvc->vdso_force_unstable && pvc->stable_flag_supported)
		vdso_th->th_x86_pvc_stable_mask = PVCLOCK_FLAG_TSC_STABLE;
	else
		vdso_th->th_x86_pvc_stable_mask = 0;
	bzero(vdso_th->th_res, sizeof(vdso_th->th_res));

	if ((amd_feature & AMDID_RDTSCP) != 0)
		return (1);
	if ((vdso_th->th_x86_pvc_stable_mask & PVCLOCK_FLAG_TSC_STABLE) == 0)
		return (0);
	return (pvc->vdso_enable_without_rdtscp ? 1 : 0);
}
#endif
264 
265 void
pvclock_gettime(struct pvclock * pvc,struct timespec * ts)266 pvclock_gettime(struct pvclock *pvc, struct timespec *ts)
267 {
268 	struct timespec system_ts;
269 	uint64_t system_ns;
270 
271 	pvclock_read_wall_clock(pvc->get_wallclock(pvc->get_wallclock_arg), ts);
272 	system_ns = pvclock_getsystime(pvc);
273 	system_ts.tv_sec = system_ns / 1000000000ULL;
274 	system_ts.tv_nsec = system_ns % 1000000000ULL;
275 	timespecadd(ts, &system_ts, ts);
276 }
277 
278 void
pvclock_init(struct pvclock * pvc,device_t dev,const char * tc_name,int tc_quality,u_int tc_flags)279 pvclock_init(struct pvclock *pvc, device_t dev, const char *tc_name,
280     int tc_quality, u_int tc_flags)
281 {
282 	struct make_dev_args mda;
283 	int err;
284 
285 	KASSERT(((uintptr_t)pvc->timeinfos & PAGE_MASK) == 0,
286 	    ("Specified time info page(s) address is not page-aligned."));
287 
288 	/* Set up vDSO stable-flag suppression test facility: */
289 	pvc->vdso_force_unstable = false;
290 	SYSCTL_ADD_BOOL(device_get_sysctl_ctx(dev),
291 	    SYSCTL_CHILDREN(device_get_sysctl_tree(dev)), OID_AUTO,
292 	    "vdso_force_unstable", CTLFLAG_RW, &pvc->vdso_force_unstable, 0,
293 	    "Forcibly deassert stable flag in vDSO codepath");
294 
295 	/*
296 	 * Make it possible to use the vDSO page even when the hypervisor does
297 	 * not support the rdtscp instruction.  This is disabled by default for
298 	 * compatibility with old libc.
299 	 */
300 	pvc->vdso_enable_without_rdtscp = false;
301 	SYSCTL_ADD_BOOL(device_get_sysctl_ctx(dev),
302 	    SYSCTL_CHILDREN(device_get_sysctl_tree(dev)), OID_AUTO,
303 	    "vdso_enable_without_rdtscp", CTLFLAG_RWTUN,
304 	    &pvc->vdso_enable_without_rdtscp, 0,
305 	    "Allow the use of a vDSO when rdtscp is not available");
306 
307 	/* Set up timecounter and timecounter-supporting members: */
308 	pvc->tc.tc_get_timecount = pvclock_tc_get_timecount;
309 	pvc->tc.tc_poll_pps = NULL;
310 	pvc->tc.tc_counter_mask = ~0U;
311 	pvc->tc.tc_frequency = 1000000000ULL;
312 	pvc->tc.tc_name = tc_name;
313 	pvc->tc.tc_quality = tc_quality;
314 	pvc->tc.tc_flags = tc_flags;
315 	pvc->tc.tc_priv = pvc;
316 	pvc->tc.tc_fill_vdso_timehands = pvclock_tc_vdso_timehands;
317 #ifdef COMPAT_FREEBSD32
318 	pvc->tc.tc_fill_vdso_timehands32 = pvclock_tc_vdso_timehands32;
319 #endif
320 
321 	/* Set up cdev for userspace mmapping of vCPU 0 time info page: */
322 	make_dev_args_init(&mda);
323 	mda.mda_devsw = &pvclock_cdev_cdevsw;
324 	mda.mda_uid = UID_ROOT;
325 	mda.mda_gid = GID_WHEEL;
326 	mda.mda_mode = 0444;
327 	mda.mda_si_drv1 = pvc->timeinfos;
328 	err = make_dev_s(&mda, &pvc->cdev, PVCLOCK_CDEVNAME);
329 	if (err != 0) {
330 		device_printf(dev, "Could not create /dev/%s, error %d. Fast "
331 		    "time of day will be unavailable for this timecounter.\n",
332 		    PVCLOCK_CDEVNAME, err);
333 		KASSERT(pvc->cdev == NULL,
334 		    ("Failed make_dev_s() unexpectedly inited cdev."));
335 	}
336 
337 	/* Register timecounter: */
338 	tc_init(&pvc->tc);
339 
340 	/*
341 	 * Register wallclock:
342 	 *     The RTC registration API expects a resolution in microseconds;
343 	 *     pvclock's 1ns resolution is rounded up to 1us.
344 	 */
345 	clock_register(dev, 1);
346 }
347 
/*
 * Tear down a pvclock instance.
 *
 * Always fails with EBUSY: a registered timecounter cannot be withdrawn,
 * because 'tc_init()' has no teardown counterpart.
 */
int
pvclock_destroy(struct pvclock *pvc)
{

	return (EBUSY);
}
357