xref: /freebsd-13-stable/sys/amd64/vmm/vmm_dev.c (revision 7756dcc46f3b1059e68fdd0099ca26618d5d8d62)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause
3  *
4  * Copyright (c) 2011 NetApp, Inc.
5  * All rights reserved.
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice, this list of conditions and the following disclaimer.
12  * 2. Redistributions in binary form must reproduce the above copyright
13  *    notice, this list of conditions and the following disclaimer in the
14  *    documentation and/or other materials provided with the distribution.
15  *
16  * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19  * ARE DISCLAIMED.  IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
20  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
22  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
24  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
26  * SUCH DAMAGE.
27  */
28 
29 #include <sys/cdefs.h>
30 #include "opt_bhyve_snapshot.h"
31 
32 #include <sys/param.h>
33 #include <sys/kernel.h>
34 #include <sys/jail.h>
35 #include <sys/queue.h>
36 #include <sys/lock.h>
37 #include <sys/mutex.h>
38 #include <sys/malloc.h>
39 #include <sys/conf.h>
40 #include <sys/sysctl.h>
41 #include <sys/libkern.h>
42 #include <sys/ioccom.h>
43 #include <sys/mman.h>
44 #include <sys/uio.h>
45 #include <sys/proc.h>
46 
47 #include <vm/vm.h>
48 #include <vm/pmap.h>
49 #include <vm/vm_map.h>
50 #include <vm/vm_object.h>
51 
52 #include <machine/vmparam.h>
53 #include <machine/vmm.h>
54 #include <machine/vmm_dev.h>
55 #include <machine/vmm_instruction_emul.h>
56 #include <machine/vmm_snapshot.h>
57 #include <x86/apicreg.h>
58 
59 #include "vmm_lapic.h"
60 #include "vmm_stat.h"
61 #include "vmm_mem.h"
62 #include "io/ppt.h"
63 #include "io/vatpic.h"
64 #include "io/vioapic.h"
65 #include "io/vhpet.h"
66 #include "io/vrtc.h"
67 
68 #ifdef COMPAT_FREEBSD13
/*
 * Argument structure for the VM_STATS_OLD compatibility ioctl.  The
 * handler in vmmdev_ioctl() reads the leading int as the vcpu id, stamps
 * 'tv' with getmicrotime() and has vmm_stat_copy() fill 'statbuf' starting
 * at statistic index 0, reporting the count in 'num_entries'.
 */
69 struct vm_stats_old {
70 	int		cpuid;				/* in: vcpu id */
71 	int		num_entries;			/* out: entries stored in statbuf */
72 	struct timeval	tv;				/* out: set by getmicrotime() */
73 	uint64_t	statbuf[MAX_VM_STATS];		/* out: statistic values */
74 };
75 
/* Same ioctl number as VM_STATS, distinguished by the argument type. */
76 #define	VM_STATS_OLD \
77 	_IOWR('v', IOCNUM_VM_STATS, struct vm_stats_old)
78 #endif
79 
/*
 * Per-segment state for a named (non-system) memory segment.  Entries are
 * linked on the owning VM's 'devmem' list and looked up by 'segid' in
 * get_memseg().
 */
80 struct devmem_softc {
81 	int	segid;			/* memory segment id this entry describes */
82 	char	*name;			/* segment name; freed in vmmdev_destroy() */
83 	struct cdev *cdev;		/* must be NULL by the time vmmdev_destroy() runs */
84 	struct vmmdev_softc *sc;	/* presumably the owning VM softc - TODO confirm */
85 	SLIST_ENTRY(devmem_softc) link;	/* linkage on vmmdev_softc.devmem */
86 };
87 
/*
 * Per-VM state hung off the VM's control cdev (retrieved through
 * cdev->si_drv1 by vmmdev_lookup2()).
 */
88 struct vmmdev_softc {
89 	struct vm	*vm;		/* vm instance cookie */
90 	struct cdev	*cdev;		/* NULL once the VM is scheduled for destruction */
91 	struct ucred	*ucred;		/* checked with cr_cansee() in vmmdev_lookup() */
92 	SLIST_ENTRY(vmmdev_softc) link;	/* linkage on the global 'head' list */
93 	SLIST_HEAD(, devmem_softc) devmem;	/* named memory segments of this VM */
94 	int		flags;		/* VSC_* flags */
95 };
/* Set in 'flags' while the softc is on the global 'head' list. */
96 #define	VSC_LINKED		0x01
97 
/* List of VM softcs; searched by name in vmmdev_lookup(). */
98 static SLIST_HEAD(, vmmdev_softc) head;
99 
/* Bit tested against a prison's pr_allow in vmm_priv_check(). */
100 static unsigned pr_allow_flag;
/* Held around vmmdev_lookup() calls and around removals from 'head'. */
101 static struct mtx vmmdev_mtx;
102 MTX_SYSINIT(vmmdev_mtx, &vmmdev_mtx, "vmm device mutex", MTX_DEF);
103 
/* malloc(9) type used for every allocation made by this file. */
104 static MALLOC_DEFINE(M_VMMDEV, "vmmdev", "vmmdev");
105 
106 SYSCTL_DECL(_hw_vmm);
107 
/* Forward declarations for functions used before their definitions. */
108 static int vmm_priv_check(struct ucred *ucred);
109 static int devmem_create_cdev(const char *vmname, int id, char *devmem);
110 static void devmem_destroy(void *arg);
110 static void devmem_destroy(void *arg);
111 
112 static int
vmm_priv_check(struct ucred * ucred)113 vmm_priv_check(struct ucred *ucred)
114 {
115 
116 	if (jailed(ucred) &&
117 	    !(ucred->cr_prison->pr_allow & pr_allow_flag))
118 		return (EPERM);
119 
120 	return (0);
121 }
122 
123 static int
vcpu_lock_one(struct vcpu * vcpu)124 vcpu_lock_one(struct vcpu *vcpu)
125 {
126 	return (vcpu_set_state(vcpu, VCPU_FROZEN, true));
127 }
128 
129 static void
vcpu_unlock_one(struct vmmdev_softc * sc,int vcpuid,struct vcpu * vcpu)130 vcpu_unlock_one(struct vmmdev_softc *sc, int vcpuid, struct vcpu *vcpu)
131 {
132 	enum vcpu_state state;
133 
134 	state = vcpu_get_state(vcpu, NULL);
135 	if (state != VCPU_FROZEN) {
136 		panic("vcpu %s(%d) has invalid state %d", vm_name(sc->vm),
137 		    vcpuid, state);
138 	}
139 
140 	vcpu_set_state(vcpu, VCPU_IDLE, false);
141 }
142 
143 static int
vcpu_lock_all(struct vmmdev_softc * sc)144 vcpu_lock_all(struct vmmdev_softc *sc)
145 {
146 	struct vcpu *vcpu;
147 	int error;
148 	uint16_t i, j, maxcpus;
149 
150 	vm_slock_vcpus(sc->vm);
151 	maxcpus = vm_get_maxcpus(sc->vm);
152 	for (i = 0; i < maxcpus; i++) {
153 		vcpu = vm_vcpu(sc->vm, i);
154 		if (vcpu == NULL)
155 			continue;
156 		error = vcpu_lock_one(vcpu);
157 		if (error)
158 			break;
159 	}
160 
161 	if (error) {
162 		for (j = 0; j < i; j++) {
163 			vcpu = vm_vcpu(sc->vm, j);
164 			if (vcpu == NULL)
165 				continue;
166 			vcpu_unlock_one(sc, j, vcpu);
167 		}
168 		vm_unlock_vcpus(sc->vm);
169 	}
170 
171 	return (error);
172 }
173 
174 static void
vcpu_unlock_all(struct vmmdev_softc * sc)175 vcpu_unlock_all(struct vmmdev_softc *sc)
176 {
177 	struct vcpu *vcpu;
178 	uint16_t i, maxcpus;
179 
180 	maxcpus = vm_get_maxcpus(sc->vm);
181 	for (i = 0; i < maxcpus; i++) {
182 		vcpu = vm_vcpu(sc->vm, i);
183 		if (vcpu == NULL)
184 			continue;
185 		vcpu_unlock_one(sc, i, vcpu);
186 	}
187 	vm_unlock_vcpus(sc->vm);
188 }
189 
190 static struct vmmdev_softc *
vmmdev_lookup(const char * name)191 vmmdev_lookup(const char *name)
192 {
193 	struct vmmdev_softc *sc;
194 
195 #ifdef notyet	/* XXX kernel is not compiled with invariants */
196 	mtx_assert(&vmmdev_mtx, MA_OWNED);
197 #endif
198 
199 	SLIST_FOREACH(sc, &head, link) {
200 		if (strcmp(name, vm_name(sc->vm)) == 0)
201 			break;
202 	}
203 
204 	if (sc == NULL)
205 		return (NULL);
206 
207 	if (cr_cansee(curthread->td_ucred, sc->ucred))
208 		return (NULL);
209 
210 	return (sc);
211 }
212 
213 static struct vmmdev_softc *
vmmdev_lookup2(struct cdev * cdev)214 vmmdev_lookup2(struct cdev *cdev)
215 {
216 
217 	return (cdev->si_drv1);
218 }
219 
220 static int
vmmdev_rw(struct cdev * cdev,struct uio * uio,int flags)221 vmmdev_rw(struct cdev *cdev, struct uio *uio, int flags)
222 {
223 	int error, off, c, prot;
224 	vm_paddr_t gpa, maxaddr;
225 	void *hpa, *cookie;
226 	struct vmmdev_softc *sc;
227 
228 	error = vmm_priv_check(curthread->td_ucred);
229 	if (error)
230 		return (error);
231 
232 	sc = vmmdev_lookup2(cdev);
233 	if (sc == NULL)
234 		return (ENXIO);
235 
236 	/*
237 	 * Get a read lock on the guest memory map.
238 	 */
239 	vm_slock_memsegs(sc->vm);
240 
241 	prot = (uio->uio_rw == UIO_WRITE ? VM_PROT_WRITE : VM_PROT_READ);
242 	maxaddr = vmm_sysmem_maxaddr(sc->vm);
243 	while (uio->uio_resid > 0 && error == 0) {
244 		gpa = uio->uio_offset;
245 		off = gpa & PAGE_MASK;
246 		c = min(uio->uio_resid, PAGE_SIZE - off);
247 
248 		/*
249 		 * The VM has a hole in its physical memory map. If we want to
250 		 * use 'dd' to inspect memory beyond the hole we need to
251 		 * provide bogus data for memory that lies in the hole.
252 		 *
253 		 * Since this device does not support lseek(2), dd(1) will
254 		 * read(2) blocks of data to simulate the lseek(2).
255 		 */
256 		hpa = vm_gpa_hold_global(sc->vm, gpa, c, prot, &cookie);
257 		if (hpa == NULL) {
258 			if (uio->uio_rw == UIO_READ && gpa < maxaddr)
259 				error = uiomove(__DECONST(void *, zero_region),
260 				    c, uio);
261 			else
262 				error = EFAULT;
263 		} else {
264 			error = uiomove(hpa, c, uio);
265 			vm_gpa_release(cookie);
266 		}
267 	}
268 	vm_unlock_memsegs(sc->vm);
269 	return (error);
270 }
271 
272 CTASSERT(sizeof(((struct vm_memseg *)0)->name) >= VM_MAX_SUFFIXLEN + 1);
273 
274 static int
get_memseg(struct vmmdev_softc * sc,struct vm_memseg * mseg,size_t len)275 get_memseg(struct vmmdev_softc *sc, struct vm_memseg *mseg, size_t len)
276 {
277 	struct devmem_softc *dsc;
278 	int error;
279 	bool sysmem;
280 
281 	error = vm_get_memseg(sc->vm, mseg->segid, &mseg->len, &sysmem, NULL);
282 	if (error || mseg->len == 0)
283 		return (error);
284 
285 	if (!sysmem) {
286 		SLIST_FOREACH(dsc, &sc->devmem, link) {
287 			if (dsc->segid == mseg->segid)
288 				break;
289 		}
290 		KASSERT(dsc != NULL, ("%s: devmem segment %d not found",
291 		    __func__, mseg->segid));
292 		error = copystr(dsc->name, mseg->name, len, NULL);
293 	} else {
294 		bzero(mseg->name, len);
295 	}
296 
297 	return (error);
298 }
299 
300 static int
alloc_memseg(struct vmmdev_softc * sc,struct vm_memseg * mseg,size_t len)301 alloc_memseg(struct vmmdev_softc *sc, struct vm_memseg *mseg, size_t len)
302 {
303 	char *name;
304 	int error;
305 	bool sysmem;
306 
307 	error = 0;
308 	name = NULL;
309 	sysmem = true;
310 
311 	/*
312 	 * The allocation is lengthened by 1 to hold a terminating NUL.  It'll
313 	 * by stripped off when devfs processes the full string.
314 	 */
315 	if (VM_MEMSEG_NAME(mseg)) {
316 		sysmem = false;
317 		name = malloc(len, M_VMMDEV, M_WAITOK);
318 		error = copystr(mseg->name, name, len, NULL);
319 		if (error)
320 			goto done;
321 	}
322 
323 	error = vm_alloc_memseg(sc->vm, mseg->segid, mseg->len, sysmem);
324 	if (error)
325 		goto done;
326 
327 	if (VM_MEMSEG_NAME(mseg)) {
328 		error = devmem_create_cdev(vm_name(sc->vm), mseg->segid, name);
329 		if (error)
330 			vm_free_memseg(sc->vm, mseg->segid);
331 		else
332 			name = NULL;	/* freed when 'cdev' is destroyed */
333 	}
334 done:
335 	free(name, M_VMMDEV);
336 	return (error);
337 }
338 
339 static int
vm_get_register_set(struct vcpu * vcpu,unsigned int count,int * regnum,uint64_t * regval)340 vm_get_register_set(struct vcpu *vcpu, unsigned int count, int *regnum,
341     uint64_t *regval)
342 {
343 	int error, i;
344 
345 	error = 0;
346 	for (i = 0; i < count; i++) {
347 		error = vm_get_register(vcpu, regnum[i], &regval[i]);
348 		if (error)
349 			break;
350 	}
351 	return (error);
352 }
353 
354 static int
vm_set_register_set(struct vcpu * vcpu,unsigned int count,int * regnum,uint64_t * regval)355 vm_set_register_set(struct vcpu *vcpu, unsigned int count, int *regnum,
356     uint64_t *regval)
357 {
358 	int error, i;
359 
360 	error = 0;
361 	for (i = 0; i < count; i++) {
362 		error = vm_set_register(vcpu, regnum[i], regval[i]);
363 		if (error)
364 			break;
365 	}
366 	return (error);
367 }
368 
369 static int
vmmdev_ioctl(struct cdev * cdev,u_long cmd,caddr_t data,int fflag,struct thread * td)370 vmmdev_ioctl(struct cdev *cdev, u_long cmd, caddr_t data, int fflag,
371 	     struct thread *td)
372 {
373 	int error, vcpuid, size;
374 	cpuset_t *cpuset;
375 	struct vmmdev_softc *sc;
376 	struct vcpu *vcpu;
377 	struct vm_register *vmreg;
378 	struct vm_seg_desc *vmsegdesc;
379 	struct vm_register_set *vmregset;
380 	struct vm_run *vmrun;
381 	struct vm_exception *vmexc;
382 	struct vm_lapic_irq *vmirq;
383 	struct vm_lapic_msi *vmmsi;
384 	struct vm_ioapic_irq *ioapic_irq;
385 	struct vm_isa_irq *isa_irq;
386 	struct vm_isa_irq_trigger *isa_irq_trigger;
387 	struct vm_capability *vmcap;
388 	struct vm_pptdev *pptdev;
389 	struct vm_pptdev_mmio *pptmmio;
390 	struct vm_pptdev_msi *pptmsi;
391 	struct vm_pptdev_msix *pptmsix;
392 #ifdef COMPAT_FREEBSD13
393 	struct vm_stats_old *vmstats_old;
394 #endif
395 	struct vm_stats *vmstats;
396 	struct vm_stat_desc *statdesc;
397 	struct vm_x2apic *x2apic;
398 	struct vm_gpa_pte *gpapte;
399 	struct vm_suspend *vmsuspend;
400 	struct vm_gla2gpa *gg;
401 	struct vm_cpuset *vm_cpuset;
402 	struct vm_intinfo *vmii;
403 	struct vm_rtc_time *rtctime;
404 	struct vm_rtc_data *rtcdata;
405 	struct vm_memmap *mm;
406 	struct vm_munmap *mu;
407 	struct vm_cpu_topology *topology;
408 	struct vm_readwrite_kernemu_device *kernemu;
409 	uint64_t *regvals;
410 	int *regnums;
411 	enum { NONE, SINGLE, ALL } vcpus_locked;
412 	bool memsegs_locked;
413 #ifdef BHYVE_SNAPSHOT
414 	struct vm_snapshot_meta *snapshot_meta;
415 #endif
416 
417 	error = vmm_priv_check(curthread->td_ucred);
418 	if (error)
419 		return (error);
420 
421 	sc = vmmdev_lookup2(cdev);
422 	if (sc == NULL)
423 		return (ENXIO);
424 
425 	vcpuid = -1;
426 	vcpu = NULL;
427 	vcpus_locked = NONE;
428 	memsegs_locked = false;
429 
430 	/*
431 	 * For VMM ioctls that operate on a single vCPU, lookup the
432 	 * vcpu.  For VMM ioctls which require one or more vCPUs to
433 	 * not be running, lock necessary vCPUs.
434 	 *
435 	 * XXX fragile, handle with care
436 	 * Most of these assume that the first field of the ioctl data
437 	 * is the vcpuid.
438 	 */
439 	switch (cmd) {
440 	case VM_RUN:
441 	case VM_GET_REGISTER:
442 	case VM_SET_REGISTER:
443 	case VM_GET_SEGMENT_DESCRIPTOR:
444 	case VM_SET_SEGMENT_DESCRIPTOR:
445 	case VM_GET_REGISTER_SET:
446 	case VM_SET_REGISTER_SET:
447 	case VM_INJECT_EXCEPTION:
448 	case VM_GET_CAPABILITY:
449 	case VM_SET_CAPABILITY:
450 	case VM_SET_X2APIC_STATE:
451 	case VM_GLA2GPA:
452 	case VM_GLA2GPA_NOFAULT:
453 	case VM_ACTIVATE_CPU:
454 	case VM_SET_INTINFO:
455 	case VM_GET_INTINFO:
456 	case VM_RESTART_INSTRUCTION:
457 	case VM_GET_KERNEMU_DEV:
458 	case VM_SET_KERNEMU_DEV:
459 		/*
460 		 * ioctls that can operate only on vcpus that are not running.
461 		 */
462 		vcpuid = *(int *)data;
463 		vcpu = vm_alloc_vcpu(sc->vm, vcpuid);
464 		if (vcpu == NULL) {
465 			error = EINVAL;
466 			goto done;
467 		}
468 		error = vcpu_lock_one(vcpu);
469 		if (error)
470 			goto done;
471 		vcpus_locked = SINGLE;
472 		break;
473 
474 #ifdef COMPAT_FREEBSD12
475 	case VM_ALLOC_MEMSEG_FBSD12:
476 #endif
477 	case VM_ALLOC_MEMSEG:
478 	case VM_BIND_PPTDEV:
479 	case VM_UNBIND_PPTDEV:
480 	case VM_MMAP_MEMSEG:
481 	case VM_MUNMAP_MEMSEG:
482 	case VM_REINIT:
483 		/*
484 		 * ioctls that modify the memory map must lock memory
485 		 * segments exclusively.
486 		 */
487 		vm_xlock_memsegs(sc->vm);
488 		memsegs_locked = true;
489 		/* FALLTHROUGH */
490 	case VM_MAP_PPTDEV_MMIO:
491 	case VM_UNMAP_PPTDEV_MMIO:
492 #ifdef BHYVE_SNAPSHOT
493 	case VM_SNAPSHOT_REQ:
494 	case VM_RESTORE_TIME:
495 #endif
496 		/*
497 		 * ioctls that operate on the entire virtual machine must
498 		 * prevent all vcpus from running.
499 		 */
500 		error = vcpu_lock_all(sc);
501 		if (error)
502 			goto done;
503 		vcpus_locked = ALL;
504 		break;
505 
506 #ifdef COMPAT_FREEBSD12
507 	case VM_GET_MEMSEG_FBSD12:
508 #endif
509 	case VM_GET_MEMSEG:
510 	case VM_MMAP_GETNEXT:
511 		/*
512 		 * Lock the memory map while it is being inspected.
513 		 */
514 		vm_slock_memsegs(sc->vm);
515 		memsegs_locked = true;
516 		break;
517 
518 #ifdef COMPAT_FREEBSD13
519 	case VM_STATS_OLD:
520 #endif
521 	case VM_STATS:
522 	case VM_INJECT_NMI:
523 	case VM_LAPIC_IRQ:
524 	case VM_GET_X2APIC_STATE:
525 		/*
526 		 * These do not need the vCPU locked but do operate on
527 		 * a specific vCPU.
528 		 */
529 		vcpuid = *(int *)data;
530 		vcpu = vm_alloc_vcpu(sc->vm, vcpuid);
531 		if (vcpu == NULL) {
532 			error = EINVAL;
533 			goto done;
534 		}
535 		break;
536 
537 	case VM_LAPIC_LOCAL_IRQ:
538 	case VM_SUSPEND_CPU:
539 	case VM_RESUME_CPU:
540 		/*
541 		 * These can either operate on all CPUs via a vcpuid of
542 		 * -1 or on a specific vCPU.
543 		 */
544 		vcpuid = *(int *)data;
545 		if (vcpuid == -1)
546 			break;
547 		vcpu = vm_alloc_vcpu(sc->vm, vcpuid);
548 		if (vcpu == NULL) {
549 			error = EINVAL;
550 			goto done;
551 		}
552 		break;
553 
554 	default:
555 		break;
556 	}
557 
558 	switch(cmd) {
559 	case VM_RUN:
560 		vmrun = (struct vm_run *)data;
561 		error = vm_run(vcpu, &vmrun->vm_exit);
562 		break;
563 	case VM_SUSPEND:
564 		vmsuspend = (struct vm_suspend *)data;
565 		error = vm_suspend(sc->vm, vmsuspend->how);
566 		break;
567 	case VM_REINIT:
568 		error = vm_reinit(sc->vm);
569 		break;
570 	case VM_STAT_DESC: {
571 		statdesc = (struct vm_stat_desc *)data;
572 		error = vmm_stat_desc_copy(statdesc->index,
573 					statdesc->desc, sizeof(statdesc->desc));
574 		break;
575 	}
576 #ifdef COMPAT_FREEBSD13
577 	case VM_STATS_OLD:
578 		vmstats_old = (struct vm_stats_old *)data;
579 		getmicrotime(&vmstats_old->tv);
580 		error = vmm_stat_copy(vcpu, 0,
581 				      nitems(vmstats_old->statbuf),
582 				      &vmstats_old->num_entries,
583 				      vmstats_old->statbuf);
584 		break;
585 #endif
586 	case VM_STATS: {
587 		vmstats = (struct vm_stats *)data;
588 		getmicrotime(&vmstats->tv);
589 		error = vmm_stat_copy(vcpu, vmstats->index,
590 				      nitems(vmstats->statbuf),
591 				      &vmstats->num_entries, vmstats->statbuf);
592 		break;
593 	}
594 	case VM_PPTDEV_MSI:
595 		pptmsi = (struct vm_pptdev_msi *)data;
596 		error = ppt_setup_msi(sc->vm,
597 				      pptmsi->bus, pptmsi->slot, pptmsi->func,
598 				      pptmsi->addr, pptmsi->msg,
599 				      pptmsi->numvec);
600 		break;
601 	case VM_PPTDEV_MSIX:
602 		pptmsix = (struct vm_pptdev_msix *)data;
603 		error = ppt_setup_msix(sc->vm,
604 				       pptmsix->bus, pptmsix->slot,
605 				       pptmsix->func, pptmsix->idx,
606 				       pptmsix->addr, pptmsix->msg,
607 				       pptmsix->vector_control);
608 		break;
609 	case VM_PPTDEV_DISABLE_MSIX:
610 		pptdev = (struct vm_pptdev *)data;
611 		error = ppt_disable_msix(sc->vm, pptdev->bus, pptdev->slot,
612 					 pptdev->func);
613 		break;
614 	case VM_MAP_PPTDEV_MMIO:
615 		pptmmio = (struct vm_pptdev_mmio *)data;
616 		error = ppt_map_mmio(sc->vm, pptmmio->bus, pptmmio->slot,
617 				     pptmmio->func, pptmmio->gpa, pptmmio->len,
618 				     pptmmio->hpa);
619 		break;
620 	case VM_UNMAP_PPTDEV_MMIO:
621 		pptmmio = (struct vm_pptdev_mmio *)data;
622 		error = ppt_unmap_mmio(sc->vm, pptmmio->bus, pptmmio->slot,
623 				       pptmmio->func, pptmmio->gpa, pptmmio->len);
624 		break;
625 	case VM_BIND_PPTDEV:
626 		pptdev = (struct vm_pptdev *)data;
627 		error = vm_assign_pptdev(sc->vm, pptdev->bus, pptdev->slot,
628 					 pptdev->func);
629 		break;
630 	case VM_UNBIND_PPTDEV:
631 		pptdev = (struct vm_pptdev *)data;
632 		error = vm_unassign_pptdev(sc->vm, pptdev->bus, pptdev->slot,
633 					   pptdev->func);
634 		break;
635 	case VM_INJECT_EXCEPTION:
636 		vmexc = (struct vm_exception *)data;
637 		error = vm_inject_exception(vcpu,
638 		    vmexc->vector, vmexc->error_code_valid, vmexc->error_code,
639 		    vmexc->restart_instruction);
640 		break;
641 	case VM_INJECT_NMI:
642 		error = vm_inject_nmi(vcpu);
643 		break;
644 	case VM_LAPIC_IRQ:
645 		vmirq = (struct vm_lapic_irq *)data;
646 		error = lapic_intr_edge(vcpu, vmirq->vector);
647 		break;
648 	case VM_LAPIC_LOCAL_IRQ:
649 		vmirq = (struct vm_lapic_irq *)data;
650 		error = lapic_set_local_intr(sc->vm, vcpu, vmirq->vector);
651 		break;
652 	case VM_LAPIC_MSI:
653 		vmmsi = (struct vm_lapic_msi *)data;
654 		error = lapic_intr_msi(sc->vm, vmmsi->addr, vmmsi->msg);
655 		break;
656 	case VM_IOAPIC_ASSERT_IRQ:
657 		ioapic_irq = (struct vm_ioapic_irq *)data;
658 		error = vioapic_assert_irq(sc->vm, ioapic_irq->irq);
659 		break;
660 	case VM_IOAPIC_DEASSERT_IRQ:
661 		ioapic_irq = (struct vm_ioapic_irq *)data;
662 		error = vioapic_deassert_irq(sc->vm, ioapic_irq->irq);
663 		break;
664 	case VM_IOAPIC_PULSE_IRQ:
665 		ioapic_irq = (struct vm_ioapic_irq *)data;
666 		error = vioapic_pulse_irq(sc->vm, ioapic_irq->irq);
667 		break;
668 	case VM_IOAPIC_PINCOUNT:
669 		*(int *)data = vioapic_pincount(sc->vm);
670 		break;
671 	case VM_SET_KERNEMU_DEV:
672 	case VM_GET_KERNEMU_DEV: {
673 		mem_region_write_t mwrite;
674 		mem_region_read_t mread;
675 		bool arg;
676 
677 		kernemu = (void *)data;
678 
679 		if (kernemu->access_width > 0)
680 			size = (1u << kernemu->access_width);
681 		else
682 			size = 1;
683 
684 		if (kernemu->gpa >= DEFAULT_APIC_BASE && kernemu->gpa < DEFAULT_APIC_BASE + PAGE_SIZE) {
685 			mread = lapic_mmio_read;
686 			mwrite = lapic_mmio_write;
687 		} else if (kernemu->gpa >= VIOAPIC_BASE && kernemu->gpa < VIOAPIC_BASE + VIOAPIC_SIZE) {
688 			mread = vioapic_mmio_read;
689 			mwrite = vioapic_mmio_write;
690 		} else if (kernemu->gpa >= VHPET_BASE && kernemu->gpa < VHPET_BASE + VHPET_SIZE) {
691 			mread = vhpet_mmio_read;
692 			mwrite = vhpet_mmio_write;
693 		} else {
694 			error = EINVAL;
695 			break;
696 		}
697 
698 		if (cmd == VM_SET_KERNEMU_DEV)
699 			error = mwrite(vcpu, kernemu->gpa,
700 			    kernemu->value, size, &arg);
701 		else
702 			error = mread(vcpu, kernemu->gpa,
703 			    &kernemu->value, size, &arg);
704 		break;
705 		}
706 	case VM_ISA_ASSERT_IRQ:
707 		isa_irq = (struct vm_isa_irq *)data;
708 		error = vatpic_assert_irq(sc->vm, isa_irq->atpic_irq);
709 		if (error == 0 && isa_irq->ioapic_irq != -1)
710 			error = vioapic_assert_irq(sc->vm,
711 			    isa_irq->ioapic_irq);
712 		break;
713 	case VM_ISA_DEASSERT_IRQ:
714 		isa_irq = (struct vm_isa_irq *)data;
715 		error = vatpic_deassert_irq(sc->vm, isa_irq->atpic_irq);
716 		if (error == 0 && isa_irq->ioapic_irq != -1)
717 			error = vioapic_deassert_irq(sc->vm,
718 			    isa_irq->ioapic_irq);
719 		break;
720 	case VM_ISA_PULSE_IRQ:
721 		isa_irq = (struct vm_isa_irq *)data;
722 		error = vatpic_pulse_irq(sc->vm, isa_irq->atpic_irq);
723 		if (error == 0 && isa_irq->ioapic_irq != -1)
724 			error = vioapic_pulse_irq(sc->vm, isa_irq->ioapic_irq);
725 		break;
726 	case VM_ISA_SET_IRQ_TRIGGER:
727 		isa_irq_trigger = (struct vm_isa_irq_trigger *)data;
728 		error = vatpic_set_irq_trigger(sc->vm,
729 		    isa_irq_trigger->atpic_irq, isa_irq_trigger->trigger);
730 		break;
731 	case VM_MMAP_GETNEXT:
732 		mm = (struct vm_memmap *)data;
733 		error = vm_mmap_getnext(sc->vm, &mm->gpa, &mm->segid,
734 		    &mm->segoff, &mm->len, &mm->prot, &mm->flags);
735 		break;
736 	case VM_MMAP_MEMSEG:
737 		mm = (struct vm_memmap *)data;
738 		error = vm_mmap_memseg(sc->vm, mm->gpa, mm->segid, mm->segoff,
739 		    mm->len, mm->prot, mm->flags);
740 		break;
741 	case VM_MUNMAP_MEMSEG:
742 		mu = (struct vm_munmap *)data;
743 		error = vm_munmap_memseg(sc->vm, mu->gpa, mu->len);
744 		break;
745 #ifdef COMPAT_FREEBSD12
746 	case VM_ALLOC_MEMSEG_FBSD12:
747 		error = alloc_memseg(sc, (struct vm_memseg *)data,
748 		    sizeof(((struct vm_memseg_fbsd12 *)0)->name));
749 		break;
750 #endif
751 	case VM_ALLOC_MEMSEG:
752 		error = alloc_memseg(sc, (struct vm_memseg *)data,
753 		    sizeof(((struct vm_memseg *)0)->name));
754 		break;
755 #ifdef COMPAT_FREEBSD12
756 	case VM_GET_MEMSEG_FBSD12:
757 		error = get_memseg(sc, (struct vm_memseg *)data,
758 		    sizeof(((struct vm_memseg_fbsd12 *)0)->name));
759 		break;
760 #endif
761 	case VM_GET_MEMSEG:
762 		error = get_memseg(sc, (struct vm_memseg *)data,
763 		    sizeof(((struct vm_memseg *)0)->name));
764 		break;
765 	case VM_GET_REGISTER:
766 		vmreg = (struct vm_register *)data;
767 		error = vm_get_register(vcpu, vmreg->regnum, &vmreg->regval);
768 		break;
769 	case VM_SET_REGISTER:
770 		vmreg = (struct vm_register *)data;
771 		error = vm_set_register(vcpu, vmreg->regnum, vmreg->regval);
772 		break;
773 	case VM_SET_SEGMENT_DESCRIPTOR:
774 		vmsegdesc = (struct vm_seg_desc *)data;
775 		error = vm_set_seg_desc(vcpu,
776 					vmsegdesc->regnum,
777 					&vmsegdesc->desc);
778 		break;
779 	case VM_GET_SEGMENT_DESCRIPTOR:
780 		vmsegdesc = (struct vm_seg_desc *)data;
781 		error = vm_get_seg_desc(vcpu,
782 					vmsegdesc->regnum,
783 					&vmsegdesc->desc);
784 		break;
785 	case VM_GET_REGISTER_SET:
786 		vmregset = (struct vm_register_set *)data;
787 		if (vmregset->count > VM_REG_LAST) {
788 			error = EINVAL;
789 			break;
790 		}
791 		regvals = malloc(sizeof(regvals[0]) * vmregset->count, M_VMMDEV,
792 		    M_WAITOK);
793 		regnums = malloc(sizeof(regnums[0]) * vmregset->count, M_VMMDEV,
794 		    M_WAITOK);
795 		error = copyin(vmregset->regnums, regnums, sizeof(regnums[0]) *
796 		    vmregset->count);
797 		if (error == 0)
798 			error = vm_get_register_set(vcpu,
799 			    vmregset->count, regnums, regvals);
800 		if (error == 0)
801 			error = copyout(regvals, vmregset->regvals,
802 			    sizeof(regvals[0]) * vmregset->count);
803 		free(regvals, M_VMMDEV);
804 		free(regnums, M_VMMDEV);
805 		break;
806 	case VM_SET_REGISTER_SET:
807 		vmregset = (struct vm_register_set *)data;
808 		if (vmregset->count > VM_REG_LAST) {
809 			error = EINVAL;
810 			break;
811 		}
812 		regvals = malloc(sizeof(regvals[0]) * vmregset->count, M_VMMDEV,
813 		    M_WAITOK);
814 		regnums = malloc(sizeof(regnums[0]) * vmregset->count, M_VMMDEV,
815 		    M_WAITOK);
816 		error = copyin(vmregset->regnums, regnums, sizeof(regnums[0]) *
817 		    vmregset->count);
818 		if (error == 0)
819 			error = copyin(vmregset->regvals, regvals,
820 			    sizeof(regvals[0]) * vmregset->count);
821 		if (error == 0)
822 			error = vm_set_register_set(vcpu,
823 			    vmregset->count, regnums, regvals);
824 		free(regvals, M_VMMDEV);
825 		free(regnums, M_VMMDEV);
826 		break;
827 	case VM_GET_CAPABILITY:
828 		vmcap = (struct vm_capability *)data;
829 		error = vm_get_capability(vcpu,
830 					  vmcap->captype,
831 					  &vmcap->capval);
832 		break;
833 	case VM_SET_CAPABILITY:
834 		vmcap = (struct vm_capability *)data;
835 		error = vm_set_capability(vcpu,
836 					  vmcap->captype,
837 					  vmcap->capval);
838 		break;
839 	case VM_SET_X2APIC_STATE:
840 		x2apic = (struct vm_x2apic *)data;
841 		error = vm_set_x2apic_state(vcpu, x2apic->state);
842 		break;
843 	case VM_GET_X2APIC_STATE:
844 		x2apic = (struct vm_x2apic *)data;
845 		error = vm_get_x2apic_state(vcpu, &x2apic->state);
846 		break;
847 	case VM_GET_GPA_PMAP:
848 		gpapte = (struct vm_gpa_pte *)data;
849 		pmap_get_mapping(vmspace_pmap(vm_get_vmspace(sc->vm)),
850 				 gpapte->gpa, gpapte->pte, &gpapte->ptenum);
851 		error = 0;
852 		break;
853 	case VM_GET_HPET_CAPABILITIES:
854 		error = vhpet_getcap((struct vm_hpet_cap *)data);
855 		break;
856 	case VM_GLA2GPA: {
857 		CTASSERT(PROT_READ == VM_PROT_READ);
858 		CTASSERT(PROT_WRITE == VM_PROT_WRITE);
859 		CTASSERT(PROT_EXEC == VM_PROT_EXECUTE);
860 		gg = (struct vm_gla2gpa *)data;
861 		error = vm_gla2gpa(vcpu, &gg->paging, gg->gla,
862 		    gg->prot, &gg->gpa, &gg->fault);
863 		KASSERT(error == 0 || error == EFAULT,
864 		    ("%s: vm_gla2gpa unknown error %d", __func__, error));
865 		break;
866 	}
867 	case VM_GLA2GPA_NOFAULT:
868 		gg = (struct vm_gla2gpa *)data;
869 		error = vm_gla2gpa_nofault(vcpu, &gg->paging, gg->gla,
870 		    gg->prot, &gg->gpa, &gg->fault);
871 		KASSERT(error == 0 || error == EFAULT,
872 		    ("%s: vm_gla2gpa unknown error %d", __func__, error));
873 		break;
874 	case VM_ACTIVATE_CPU:
875 		error = vm_activate_cpu(vcpu);
876 		break;
877 	case VM_GET_CPUS:
878 		error = 0;
879 		vm_cpuset = (struct vm_cpuset *)data;
880 		size = vm_cpuset->cpusetsize;
881 		if (size < 1 || size > CPU_MAXSIZE / NBBY) {
882 			error = ERANGE;
883 			break;
884 		}
885 		cpuset = malloc(max(size, sizeof(cpuset_t)), M_TEMP,
886 		    M_WAITOK | M_ZERO);
887 		if (vm_cpuset->which == VM_ACTIVE_CPUS)
888 			*cpuset = vm_active_cpus(sc->vm);
889 		else if (vm_cpuset->which == VM_SUSPENDED_CPUS)
890 			*cpuset = vm_suspended_cpus(sc->vm);
891 		else if (vm_cpuset->which == VM_DEBUG_CPUS)
892 			*cpuset = vm_debug_cpus(sc->vm);
893 		else
894 			error = EINVAL;
895 		if (error == 0 && size < howmany(CPU_FLS(cpuset), NBBY))
896 			error = ERANGE;
897 		if (error == 0)
898 			error = copyout(cpuset, vm_cpuset->cpus, size);
899 		free(cpuset, M_TEMP);
900 		break;
901 	case VM_SUSPEND_CPU:
902 		error = vm_suspend_cpu(sc->vm, vcpu);
903 		break;
904 	case VM_RESUME_CPU:
905 		error = vm_resume_cpu(sc->vm, vcpu);
906 		break;
907 	case VM_SET_INTINFO:
908 		vmii = (struct vm_intinfo *)data;
909 		error = vm_exit_intinfo(vcpu, vmii->info1);
910 		break;
911 	case VM_GET_INTINFO:
912 		vmii = (struct vm_intinfo *)data;
913 		error = vm_get_intinfo(vcpu, &vmii->info1, &vmii->info2);
914 		break;
915 	case VM_RTC_WRITE:
916 		rtcdata = (struct vm_rtc_data *)data;
917 		error = vrtc_nvram_write(sc->vm, rtcdata->offset,
918 		    rtcdata->value);
919 		break;
920 	case VM_RTC_READ:
921 		rtcdata = (struct vm_rtc_data *)data;
922 		error = vrtc_nvram_read(sc->vm, rtcdata->offset,
923 		    &rtcdata->value);
924 		break;
925 	case VM_RTC_SETTIME:
926 		rtctime = (struct vm_rtc_time *)data;
927 		error = vrtc_set_time(sc->vm, rtctime->secs);
928 		break;
929 	case VM_RTC_GETTIME:
930 		error = 0;
931 		rtctime = (struct vm_rtc_time *)data;
932 		rtctime->secs = vrtc_get_time(sc->vm);
933 		break;
934 	case VM_RESTART_INSTRUCTION:
935 		error = vm_restart_instruction(vcpu);
936 		break;
937 	case VM_SET_TOPOLOGY:
938 		topology = (struct vm_cpu_topology *)data;
939 		error = vm_set_topology(sc->vm, topology->sockets,
940 		    topology->cores, topology->threads, topology->maxcpus);
941 		break;
942 	case VM_GET_TOPOLOGY:
943 		topology = (struct vm_cpu_topology *)data;
944 		vm_get_topology(sc->vm, &topology->sockets, &topology->cores,
945 		    &topology->threads, &topology->maxcpus);
946 		error = 0;
947 		break;
948 #ifdef BHYVE_SNAPSHOT
949 	case VM_SNAPSHOT_REQ:
950 		snapshot_meta = (struct vm_snapshot_meta *)data;
951 		error = vm_snapshot_req(sc->vm, snapshot_meta);
952 		break;
953 	case VM_RESTORE_TIME:
954 		error = vm_restore_time(sc->vm);
955 		break;
956 #endif
957 	default:
958 		error = ENOTTY;
959 		break;
960 	}
961 
962 	if (vcpus_locked == SINGLE)
963 		vcpu_unlock_one(sc, vcpuid, vcpu);
964 	else if (vcpus_locked == ALL)
965 		vcpu_unlock_all(sc);
966 	if (memsegs_locked)
967 		vm_unlock_memsegs(sc->vm);
968 
969 done:
970 	/*
971 	 * Make sure that no handler returns a kernel-internal
972 	 * error value to userspace.
973 	 */
974 	KASSERT(error == ERESTART || error >= 0,
975 	    ("vmmdev_ioctl: invalid error return %d", error));
976 	return (error);
977 }
978 
979 static int
vmmdev_mmap_single(struct cdev * cdev,vm_ooffset_t * offset,vm_size_t mapsize,struct vm_object ** objp,int nprot)980 vmmdev_mmap_single(struct cdev *cdev, vm_ooffset_t *offset, vm_size_t mapsize,
981     struct vm_object **objp, int nprot)
982 {
983 	struct vmmdev_softc *sc;
984 	vm_paddr_t gpa;
985 	size_t len;
986 	vm_ooffset_t segoff, first, last;
987 	int error, found, segid;
988 	bool sysmem;
989 
990 	error = vmm_priv_check(curthread->td_ucred);
991 	if (error)
992 		return (error);
993 
994 	first = *offset;
995 	last = first + mapsize;
996 	if ((nprot & PROT_EXEC) || first < 0 || first >= last)
997 		return (EINVAL);
998 
999 	sc = vmmdev_lookup2(cdev);
1000 	if (sc == NULL) {
1001 		/* virtual machine is in the process of being created */
1002 		return (EINVAL);
1003 	}
1004 
1005 	/*
1006 	 * Get a read lock on the guest memory map.
1007 	 */
1008 	vm_slock_memsegs(sc->vm);
1009 
1010 	gpa = 0;
1011 	found = 0;
1012 	while (!found) {
1013 		error = vm_mmap_getnext(sc->vm, &gpa, &segid, &segoff, &len,
1014 		    NULL, NULL);
1015 		if (error)
1016 			break;
1017 
1018 		if (first >= gpa && last <= gpa + len)
1019 			found = 1;
1020 		else
1021 			gpa += len;
1022 	}
1023 
1024 	if (found) {
1025 		error = vm_get_memseg(sc->vm, segid, &len, &sysmem, objp);
1026 		KASSERT(error == 0 && *objp != NULL,
1027 		    ("%s: invalid memory segment %d", __func__, segid));
1028 		if (sysmem) {
1029 			vm_object_reference(*objp);
1030 			*offset = segoff + (first - gpa);
1031 		} else {
1032 			error = EINVAL;
1033 		}
1034 	}
1035 	vm_unlock_memsegs(sc->vm);
1036 	return (error);
1037 }
1038 
1039 static void
vmmdev_destroy(void * arg)1040 vmmdev_destroy(void *arg)
1041 {
1042 	struct vmmdev_softc *sc = arg;
1043 	struct devmem_softc *dsc;
1044 	int error __diagused;
1045 
1046 	vm_disable_vcpu_creation(sc->vm);
1047 	error = vcpu_lock_all(sc);
1048 	KASSERT(error == 0, ("%s: error %d freezing vcpus", __func__, error));
1049 	vm_unlock_vcpus(sc->vm);
1050 
1051 	while ((dsc = SLIST_FIRST(&sc->devmem)) != NULL) {
1052 		KASSERT(dsc->cdev == NULL, ("%s: devmem not free", __func__));
1053 		SLIST_REMOVE_HEAD(&sc->devmem, link);
1054 		free(dsc->name, M_VMMDEV);
1055 		free(dsc, M_VMMDEV);
1056 	}
1057 
1058 	if (sc->cdev != NULL)
1059 		destroy_dev(sc->cdev);
1060 
1061 	if (sc->vm != NULL)
1062 		vm_destroy(sc->vm);
1063 
1064 	if (sc->ucred != NULL)
1065 		crfree(sc->ucred);
1066 
1067 	if ((sc->flags & VSC_LINKED) != 0) {
1068 		mtx_lock(&vmmdev_mtx);
1069 		SLIST_REMOVE(&head, sc, vmmdev_softc, link);
1070 		mtx_unlock(&vmmdev_mtx);
1071 	}
1072 
1073 	free(sc, M_VMMDEV);
1074 }
1075 
1076 static int
sysctl_vmm_destroy(SYSCTL_HANDLER_ARGS)1077 sysctl_vmm_destroy(SYSCTL_HANDLER_ARGS)
1078 {
1079 	struct devmem_softc *dsc;
1080 	struct vmmdev_softc *sc;
1081 	struct cdev *cdev;
1082 	char *buf;
1083 	int error, buflen;
1084 
1085 	error = vmm_priv_check(req->td->td_ucred);
1086 	if (error)
1087 		return (error);
1088 
1089 	buflen = VM_MAX_NAMELEN + 1;
1090 	buf = malloc(buflen, M_VMMDEV, M_WAITOK | M_ZERO);
1091 	strlcpy(buf, "beavis", buflen);
1092 	error = sysctl_handle_string(oidp, buf, buflen, req);
1093 	if (error != 0 || req->newptr == NULL)
1094 		goto out;
1095 
1096 	mtx_lock(&vmmdev_mtx);
1097 	sc = vmmdev_lookup(buf);
1098 	if (sc == NULL || sc->cdev == NULL) {
1099 		mtx_unlock(&vmmdev_mtx);
1100 		error = EINVAL;
1101 		goto out;
1102 	}
1103 
1104 	/*
1105 	 * Setting 'sc->cdev' to NULL is used to indicate that the VM
1106 	 * is scheduled for destruction.
1107 	 */
1108 	cdev = sc->cdev;
1109 	sc->cdev = NULL;
1110 	mtx_unlock(&vmmdev_mtx);
1111 
1112 	/*
1113 	 * Destroy all cdevs:
1114 	 *
1115 	 * - any new operations on the 'cdev' will return an error (ENXIO).
1116 	 *
1117 	 * - the 'devmem' cdevs are destroyed before the virtual machine 'cdev'
1118 	 */
1119 	SLIST_FOREACH(dsc, &sc->devmem, link) {
1120 		KASSERT(dsc->cdev != NULL, ("devmem cdev already destroyed"));
1121 		destroy_dev(dsc->cdev);
1122 		devmem_destroy(dsc);
1123 	}
1124 	destroy_dev(cdev);
1125 	vmmdev_destroy(sc);
1126 	error = 0;
1127 
1128 out:
1129 	free(buf, M_VMMDEV);
1130 	return (error);
1131 }
1132 SYSCTL_PROC(_hw_vmm, OID_AUTO, destroy,
1133     CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_PRISON | CTLFLAG_MPSAFE,
1134     NULL, 0, sysctl_vmm_destroy, "A",
1135     NULL);
1136 
1137 static struct cdevsw vmmdevsw = {
1138 	.d_name		= "vmmdev",
1139 	.d_version	= D_VERSION,
1140 	.d_ioctl	= vmmdev_ioctl,
1141 	.d_mmap_single	= vmmdev_mmap_single,
1142 	.d_read		= vmmdev_rw,
1143 	.d_write	= vmmdev_rw,
1144 };
1145 
1146 static int
sysctl_vmm_create(SYSCTL_HANDLER_ARGS)1147 sysctl_vmm_create(SYSCTL_HANDLER_ARGS)
1148 {
1149 	struct vm *vm;
1150 	struct cdev *cdev;
1151 	struct vmmdev_softc *sc, *sc2;
1152 	char *buf;
1153 	int error, buflen;
1154 
1155 	error = vmm_priv_check(req->td->td_ucred);
1156 	if (error)
1157 		return (error);
1158 
1159 	buflen = VM_MAX_NAMELEN + 1;
1160 	buf = malloc(buflen, M_VMMDEV, M_WAITOK | M_ZERO);
1161 	strlcpy(buf, "beavis", buflen);
1162 	error = sysctl_handle_string(oidp, buf, buflen, req);
1163 	if (error != 0 || req->newptr == NULL)
1164 		goto out;
1165 
1166 	mtx_lock(&vmmdev_mtx);
1167 	sc = vmmdev_lookup(buf);
1168 	mtx_unlock(&vmmdev_mtx);
1169 	if (sc != NULL) {
1170 		error = EEXIST;
1171 		goto out;
1172 	}
1173 
1174 	error = vm_create(buf, &vm);
1175 	if (error != 0)
1176 		goto out;
1177 
1178 	sc = malloc(sizeof(struct vmmdev_softc), M_VMMDEV, M_WAITOK | M_ZERO);
1179 	sc->ucred = crhold(curthread->td_ucred);
1180 	sc->vm = vm;
1181 	SLIST_INIT(&sc->devmem);
1182 
1183 	/*
1184 	 * Lookup the name again just in case somebody sneaked in when we
1185 	 * dropped the lock.
1186 	 */
1187 	mtx_lock(&vmmdev_mtx);
1188 	sc2 = vmmdev_lookup(buf);
1189 	if (sc2 == NULL) {
1190 		SLIST_INSERT_HEAD(&head, sc, link);
1191 		sc->flags |= VSC_LINKED;
1192 	}
1193 	mtx_unlock(&vmmdev_mtx);
1194 
1195 	if (sc2 != NULL) {
1196 		vmmdev_destroy(sc);
1197 		error = EEXIST;
1198 		goto out;
1199 	}
1200 
1201 	error = make_dev_p(MAKEDEV_CHECKNAME, &cdev, &vmmdevsw, sc->ucred,
1202 	    UID_ROOT, GID_WHEEL, 0600, "vmm/%s", buf);
1203 	if (error != 0) {
1204 		vmmdev_destroy(sc);
1205 		goto out;
1206 	}
1207 
1208 	mtx_lock(&vmmdev_mtx);
1209 	sc->cdev = cdev;
1210 	sc->cdev->si_drv1 = sc;
1211 	mtx_unlock(&vmmdev_mtx);
1212 
1213 out:
1214 	free(buf, M_VMMDEV);
1215 	return (error);
1216 }
1217 SYSCTL_PROC(_hw_vmm, OID_AUTO, create,
1218     CTLTYPE_STRING | CTLFLAG_RW | CTLFLAG_PRISON | CTLFLAG_MPSAFE,
1219     NULL, 0, sysctl_vmm_create, "A",
1220     NULL);
1221 
1222 void
vmmdev_init(void)1223 vmmdev_init(void)
1224 {
1225 	pr_allow_flag = prison_add_allow(NULL, "vmm", NULL,
1226 	    "Allow use of vmm in a jail.");
1227 }
1228 
1229 int
vmmdev_cleanup(void)1230 vmmdev_cleanup(void)
1231 {
1232 	int error;
1233 
1234 	if (SLIST_EMPTY(&head))
1235 		error = 0;
1236 	else
1237 		error = EBUSY;
1238 
1239 	return (error);
1240 }
1241 
1242 static int
devmem_mmap_single(struct cdev * cdev,vm_ooffset_t * offset,vm_size_t len,struct vm_object ** objp,int nprot)1243 devmem_mmap_single(struct cdev *cdev, vm_ooffset_t *offset, vm_size_t len,
1244     struct vm_object **objp, int nprot)
1245 {
1246 	struct devmem_softc *dsc;
1247 	vm_ooffset_t first, last;
1248 	size_t seglen;
1249 	int error;
1250 	bool sysmem;
1251 
1252 	dsc = cdev->si_drv1;
1253 	if (dsc == NULL) {
1254 		/* 'cdev' has been created but is not ready for use */
1255 		return (ENXIO);
1256 	}
1257 
1258 	first = *offset;
1259 	last = *offset + len;
1260 	if ((nprot & PROT_EXEC) || first < 0 || first >= last)
1261 		return (EINVAL);
1262 
1263 	vm_slock_memsegs(dsc->sc->vm);
1264 
1265 	error = vm_get_memseg(dsc->sc->vm, dsc->segid, &seglen, &sysmem, objp);
1266 	KASSERT(error == 0 && !sysmem && *objp != NULL,
1267 	    ("%s: invalid devmem segment %d", __func__, dsc->segid));
1268 
1269 	if (seglen >= last)
1270 		vm_object_reference(*objp);
1271 	else
1272 		error = EINVAL;
1273 
1274 	vm_unlock_memsegs(dsc->sc->vm);
1275 	return (error);
1276 }
1277 
1278 static struct cdevsw devmemsw = {
1279 	.d_name		= "devmem",
1280 	.d_version	= D_VERSION,
1281 	.d_mmap_single	= devmem_mmap_single,
1282 };
1283 
1284 static int
devmem_create_cdev(const char * vmname,int segid,char * devname)1285 devmem_create_cdev(const char *vmname, int segid, char *devname)
1286 {
1287 	struct devmem_softc *dsc;
1288 	struct vmmdev_softc *sc;
1289 	struct cdev *cdev;
1290 	int error;
1291 
1292 	error = make_dev_p(MAKEDEV_CHECKNAME, &cdev, &devmemsw, NULL,
1293 	    UID_ROOT, GID_WHEEL, 0600, "vmm.io/%s.%s", vmname, devname);
1294 	if (error)
1295 		return (error);
1296 
1297 	dsc = malloc(sizeof(struct devmem_softc), M_VMMDEV, M_WAITOK | M_ZERO);
1298 
1299 	mtx_lock(&vmmdev_mtx);
1300 	sc = vmmdev_lookup(vmname);
1301 	KASSERT(sc != NULL, ("%s: vm %s softc not found", __func__, vmname));
1302 	if (sc->cdev == NULL) {
1303 		/* virtual machine is being created or destroyed */
1304 		mtx_unlock(&vmmdev_mtx);
1305 		free(dsc, M_VMMDEV);
1306 		destroy_dev_sched_cb(cdev, NULL, 0);
1307 		return (ENODEV);
1308 	}
1309 
1310 	dsc->segid = segid;
1311 	dsc->name = devname;
1312 	dsc->cdev = cdev;
1313 	dsc->sc = sc;
1314 	SLIST_INSERT_HEAD(&sc->devmem, dsc, link);
1315 	mtx_unlock(&vmmdev_mtx);
1316 
1317 	/* The 'cdev' is ready for use after 'si_drv1' is initialized */
1318 	cdev->si_drv1 = dsc;
1319 	return (0);
1320 }
1321 
1322 static void
devmem_destroy(void * arg)1323 devmem_destroy(void *arg)
1324 {
1325 	struct devmem_softc *dsc = arg;
1326 
1327 	KASSERT(dsc->cdev, ("%s: devmem cdev already destroyed", __func__));
1328 	dsc->cdev = NULL;
1329 	dsc->sc = NULL;
1330 }
1331