xref: /freebsd-11-stable/sys/amd64/vmm/vmm.c (revision 646d316dbc61a2ec6c48cec48981228423f0ca91)
1 /*-
2  * Copyright (c) 2011 NetApp, Inc.
3  * All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice, this list of conditions and the following disclaimer.
10  * 2. Redistributions in binary form must reproduce the above copyright
11  *    notice, this list of conditions and the following disclaimer in the
12  *    documentation and/or other materials provided with the distribution.
13  *
14  * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
15  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17  * ARE DISCLAIMED.  IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
18  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
24  * SUCH DAMAGE.
25  *
26  * $FreeBSD$
27  */
28 
29 #include <sys/cdefs.h>
30 __FBSDID("$FreeBSD$");
31 
32 #include <sys/param.h>
33 #include <sys/systm.h>
34 #include <sys/kernel.h>
35 #include <sys/module.h>
36 #include <sys/sysctl.h>
37 #include <sys/malloc.h>
38 #include <sys/pcpu.h>
39 #include <sys/lock.h>
40 #include <sys/mutex.h>
41 #include <sys/proc.h>
42 #include <sys/rwlock.h>
43 #include <sys/sched.h>
44 #include <sys/smp.h>
45 #include <sys/systm.h>
46 
47 #include <vm/vm.h>
48 #include <vm/vm_object.h>
49 #include <vm/vm_page.h>
50 #include <vm/pmap.h>
51 #include <vm/vm_map.h>
52 #include <vm/vm_extern.h>
53 #include <vm/vm_param.h>
54 
55 #include <machine/cpu.h>
56 #include <machine/pcb.h>
57 #include <machine/smp.h>
58 #include <machine/md_var.h>
59 #include <x86/psl.h>
60 #include <x86/apicreg.h>
61 
62 #include <machine/vmm.h>
63 #include <machine/vmm_dev.h>
64 #include <machine/vmm_instruction_emul.h>
65 
66 #include "vmm_ioport.h"
67 #include "vmm_ktr.h"
68 #include "vmm_host.h"
69 #include "vmm_mem.h"
70 #include "vmm_util.h"
71 #include "vatpic.h"
72 #include "vatpit.h"
73 #include "vhpet.h"
74 #include "vioapic.h"
75 #include "vlapic.h"
76 #include "vpmtmr.h"
77 #include "vrtc.h"
78 #include "vmm_stat.h"
79 #include "vmm_lapic.h"
80 
81 #include "io/ppt.h"
82 #include "io/iommu.h"
83 
84 struct vlapic;
85 
86 /*
87  * Initialization:
88  * (a) allocated when vcpu is created
89  * (i) initialized when vcpu is created and when it is reinitialized
90  * (o) initialized the first time the vcpu is created
91  * (x) initialized before use
92  */
93 struct vcpu {
94 	struct mtx 	mtx;		/* (o) protects 'state' and 'hostcpu' */
95 	enum vcpu_state	state;		/* (o) vcpu state */
96 	int		hostcpu;	/* (o) vcpu's host cpu */
97 	int		reqidle;	/* (i) request vcpu to idle */
98 	struct vlapic	*vlapic;	/* (i) APIC device model */
99 	enum x2apic_state x2apic_state;	/* (i) APIC mode */
100 	uint64_t	exitintinfo;	/* (i) events pending at VM exit */
101 	int		nmi_pending;	/* (i) NMI pending */
102 	int		extint_pending;	/* (i) INTR pending */
103 	int	exception_pending;	/* (i) exception pending */
104 	int	exc_vector;		/* (x) exception collateral */
105 	int	exc_errcode_valid;
106 	uint32_t exc_errcode;
107 	struct savefpu	*guestfpu;	/* (a,i) guest fpu state */
108 	uint64_t	guest_xcr0;	/* (i) guest %xcr0 register */
109 	void		*stats;		/* (a,i) statistics */
110 	struct vm_exit	exitinfo;	/* (x) exit reason and collateral */
111 	uint64_t	nextrip;	/* (x) next instruction to execute */
112 };
113 
/* Convenience wrappers around the spin mutex embedded in 'struct vcpu'. */
#define	vcpu_lock_initialized(v)					\
	mtx_initialized(&((v)->mtx))
#define	vcpu_lock_init(v)						\
	mtx_init(&((v)->mtx), "vcpu lock", 0, MTX_SPIN)
#define	vcpu_lock(v)							\
	mtx_lock_spin(&((v)->mtx))
#define	vcpu_unlock(v)							\
	mtx_unlock_spin(&((v)->mtx))
#define	vcpu_assert_locked(v)						\
	mtx_assert(&((v)->mtx), MA_OWNED)
119 
/* A guest memory segment: a VM object plus its length and kind. */
struct mem_seg {
	size_t			len;	/* segment length in bytes */
	bool			sysmem;	/* true for system memory */
	struct vm_object	*object; /* backing object, NULL if unused */
};
#define	VM_MAX_MEMSEGS	3
126 
127 struct mem_map {
128 	vm_paddr_t	gpa;
129 	size_t		len;
130 	vm_ooffset_t	segoff;
131 	int		segid;
132 	int		prot;
133 	int		flags;
134 };
135 #define	VM_MAX_MEMMAPS	4
136 
137 /*
138  * Initialization:
139  * (o) initialized the first time the VM is created
140  * (i) initialized when VM is created and when it is reinitialized
141  * (x) initialized before use
142  */
143 struct vm {
144 	void		*cookie;		/* (i) cpu-specific data */
145 	void		*iommu;			/* (x) iommu-specific data */
146 	struct vhpet	*vhpet;			/* (i) virtual HPET */
147 	struct vioapic	*vioapic;		/* (i) virtual ioapic */
148 	struct vatpic	*vatpic;		/* (i) virtual atpic */
149 	struct vatpit	*vatpit;		/* (i) virtual atpit */
150 	struct vpmtmr	*vpmtmr;		/* (i) virtual ACPI PM timer */
151 	struct vrtc	*vrtc;			/* (o) virtual RTC */
152 	volatile cpuset_t active_cpus;		/* (i) active vcpus */
153 	int		suspend;		/* (i) stop VM execution */
154 	volatile cpuset_t suspended_cpus; 	/* (i) suspended vcpus */
155 	volatile cpuset_t halted_cpus;		/* (x) cpus in a hard halt */
156 	cpuset_t	rendezvous_req_cpus;	/* (x) rendezvous requested */
157 	cpuset_t	rendezvous_done_cpus;	/* (x) rendezvous finished */
158 	void		*rendezvous_arg;	/* (x) rendezvous func/arg */
159 	vm_rendezvous_func_t rendezvous_func;
160 	struct mtx	rendezvous_mtx;		/* (o) rendezvous lock */
161 	struct mem_map	mem_maps[VM_MAX_MEMMAPS]; /* (i) guest address space */
162 	struct mem_seg	mem_segs[VM_MAX_MEMSEGS]; /* (o) guest memory regions */
163 	struct vmspace	*vmspace;		/* (o) guest's address space */
164 	char		name[VM_MAX_NAMELEN];	/* (o) virtual machine name */
165 	struct vcpu	vcpu[VM_MAXCPU];	/* (i) guest vcpus */
166 	/* The following describe the vm cpu topology */
167 	uint16_t	sockets;		/* (o) num of sockets */
168 	uint16_t	cores;			/* (o) num of cores/socket */
169 	uint16_t	threads;		/* (o) num of threads/core */
170 	uint16_t	maxcpus;		/* (o) max pluggable cpus */
171 };
172 
/* Non-zero once the module load handler has seen vmm_init() succeed. */
static int vmm_initialized;

/*
 * Backend operations vector, selected in vmm_init(). Each wrapper below
 * evaluates to a fallback value (0, NULL or ENXIO) while 'ops' is NULL.
 */
static struct vmm_ops *ops;

#define	VMM_INIT(num)							\
	(ops != NULL ? (*ops->init)(num) : 0)
#define	VMM_CLEANUP()							\
	(ops != NULL ? (*ops->cleanup)() : 0)
#define	VMM_RESUME()							\
	(ops != NULL ? (*ops->resume)() : 0)

#define	VMINIT(vm, pmap)						\
	(ops != NULL ? (*ops->vminit)(vm, pmap): NULL)
#define	VMRUN(vmi, vcpu, rip, pmap, evinfo)				\
	(ops != NULL ? (*ops->vmrun)(vmi, vcpu, rip, pmap, evinfo) : ENXIO)
#define	VMCLEANUP(vmi)							\
	(ops != NULL ? (*ops->vmcleanup)(vmi) : NULL)
#define	VMSPACE_ALLOC(min, max)						\
	(ops != NULL ? (*ops->vmspace_alloc)(min, max) : NULL)
#define	VMSPACE_FREE(vmspace)						\
	(ops != NULL ? (*ops->vmspace_free)(vmspace) : ENXIO)
#define	VMGETREG(vmi, vcpu, num, retval)				\
	(ops != NULL ? (*ops->vmgetreg)(vmi, vcpu, num, retval) : ENXIO)
#define	VMSETREG(vmi, vcpu, num, val)					\
	(ops != NULL ? (*ops->vmsetreg)(vmi, vcpu, num, val) : ENXIO)
#define	VMGETDESC(vmi, vcpu, num, desc)					\
	(ops != NULL ? (*ops->vmgetdesc)(vmi, vcpu, num, desc) : ENXIO)
#define	VMSETDESC(vmi, vcpu, num, desc)					\
	(ops != NULL ? (*ops->vmsetdesc)(vmi, vcpu, num, desc) : ENXIO)
#define	VMGETCAP(vmi, vcpu, num, retval)				\
	(ops != NULL ? (*ops->vmgetcap)(vmi, vcpu, num, retval) : ENXIO)
#define	VMSETCAP(vmi, vcpu, num, val)					\
	(ops != NULL ? (*ops->vmsetcap)(vmi, vcpu, num, val) : ENXIO)
#define	VLAPIC_INIT(vmi, vcpu)						\
	(ops != NULL ? (*ops->vlapic_init)(vmi, vcpu) : NULL)
#define	VLAPIC_CLEANUP(vmi, vlapic)					\
	(ops != NULL ? (*ops->vlapic_cleanup)(vmi, vlapic) : NULL)
204 
/* Set / clear CR0.TS. */
#define	fpu_start_emulating()	load_cr0(rcr0() | CR0_TS)
#define	fpu_stop_emulating()	clts()
207 
208 SDT_PROVIDER_DEFINE(vmm);
209 
210 static MALLOC_DEFINE(M_VM, "vm", "vm");
211 
212 /* statistics */
213 static VMM_STAT(VCPU_TOTAL_RUNTIME, "vcpu total runtime");
214 
215 SYSCTL_NODE(_hw, OID_AUTO, vmm, CTLFLAG_RW, NULL, NULL);
216 
217 /*
218  * Halt the guest if all vcpus are executing a HLT instruction with
219  * interrupts disabled.
220  */
221 static int halt_detection_enabled = 1;
222 SYSCTL_INT(_hw_vmm, OID_AUTO, halt_detection, CTLFLAG_RDTUN,
223     &halt_detection_enabled, 0,
224     "Halt VM if all vcpus execute HLT with interrupts disabled");
225 
226 static int vmm_ipinum;
227 SYSCTL_INT(_hw_vmm, OID_AUTO, ipinum, CTLFLAG_RD, &vmm_ipinum, 0,
228     "IPI vector used for vcpu notifications");
229 
230 static int trace_guest_exceptions;
231 SYSCTL_INT(_hw_vmm, OID_AUTO, trace_guest_exceptions, CTLFLAG_RDTUN,
232     &trace_guest_exceptions, 0,
233     "Trap into hypervisor on all guest exceptions and reflect them back");
234 
235 static void vm_free_memmap(struct vm *vm, int ident);
236 static bool sysmem_mapping(struct vm *vm, struct mem_map *mm);
237 static void vcpu_notify_event_locked(struct vcpu *vcpu, bool lapic_intr);
238 
#ifdef KTR
/*
 * Map a vcpu state to a printable name; any value that is not one of the
 * four states handled below yields "unknown".
 */
static const char *
vcpu_state2str(enum vcpu_state state)
{
	const char *name;

	switch (state) {
	case VCPU_IDLE:
		name = "idle";
		break;
	case VCPU_FROZEN:
		name = "frozen";
		break;
	case VCPU_RUNNING:
		name = "running";
		break;
	case VCPU_SLEEPING:
		name = "sleeping";
		break;
	default:
		name = "unknown";
		break;
	}
	return (name);
}
#endif
258 
259 static void
vcpu_cleanup(struct vm * vm,int i,bool destroy)260 vcpu_cleanup(struct vm *vm, int i, bool destroy)
261 {
262 	struct vcpu *vcpu = &vm->vcpu[i];
263 
264 	VLAPIC_CLEANUP(vm->cookie, vcpu->vlapic);
265 	if (destroy) {
266 		vmm_stat_free(vcpu->stats);
267 		fpu_save_area_free(vcpu->guestfpu);
268 	}
269 }
270 
271 static void
vcpu_init(struct vm * vm,int vcpu_id,bool create)272 vcpu_init(struct vm *vm, int vcpu_id, bool create)
273 {
274 	struct vcpu *vcpu;
275 
276 	KASSERT(vcpu_id >= 0 && vcpu_id < vm->maxcpus,
277 	    ("vcpu_init: invalid vcpu %d", vcpu_id));
278 
279 	vcpu = &vm->vcpu[vcpu_id];
280 
281 	if (create) {
282 		KASSERT(!vcpu_lock_initialized(vcpu), ("vcpu %d already "
283 		    "initialized", vcpu_id));
284 		vcpu_lock_init(vcpu);
285 		vcpu->state = VCPU_IDLE;
286 		vcpu->hostcpu = NOCPU;
287 		vcpu->guestfpu = fpu_save_area_alloc();
288 		vcpu->stats = vmm_stat_alloc();
289 	}
290 
291 	vcpu->vlapic = VLAPIC_INIT(vm->cookie, vcpu_id);
292 	vm_set_x2apic_state(vm, vcpu_id, X2APIC_DISABLED);
293 	vcpu->reqidle = 0;
294 	vcpu->exitintinfo = 0;
295 	vcpu->nmi_pending = 0;
296 	vcpu->extint_pending = 0;
297 	vcpu->exception_pending = 0;
298 	vcpu->guest_xcr0 = XFEATURE_ENABLED_X87;
299 	fpu_save_area_reset(vcpu->guestfpu);
300 	vmm_stat_init(vcpu->stats);
301 }
302 
303 int
vcpu_trace_exceptions(struct vm * vm,int vcpuid)304 vcpu_trace_exceptions(struct vm *vm, int vcpuid)
305 {
306 
307 	return (trace_guest_exceptions);
308 }
309 
310 struct vm_exit *
vm_exitinfo(struct vm * vm,int cpuid)311 vm_exitinfo(struct vm *vm, int cpuid)
312 {
313 	struct vcpu *vcpu;
314 
315 	if (cpuid < 0 || cpuid >= vm->maxcpus)
316 		panic("vm_exitinfo: invalid cpuid %d", cpuid);
317 
318 	vcpu = &vm->vcpu[cpuid];
319 
320 	return (&vcpu->exitinfo);
321 }
322 
/* Resume hook installed in 'vmm_resume_p'; forwards to the backend. */
static void
vmm_resume(void)
{

	VMM_RESUME();
}
328 
329 static int
vmm_init(void)330 vmm_init(void)
331 {
332 	int error;
333 
334 	vmm_host_state_init();
335 
336 	vmm_ipinum = lapic_ipi_alloc(pti ? &IDTVEC(justreturn1_pti) :
337 	    &IDTVEC(justreturn));
338 	if (vmm_ipinum < 0)
339 		vmm_ipinum = IPI_AST;
340 
341 	error = vmm_mem_init();
342 	if (error)
343 		return (error);
344 
345 	if (vmm_is_intel())
346 		ops = &vmm_ops_intel;
347 	else if (vmm_is_amd())
348 		ops = &vmm_ops_amd;
349 	else
350 		return (ENXIO);
351 
352 	vmm_resume_p = vmm_resume;
353 
354 	return (VMM_INIT(vmm_ipinum));
355 }
356 
357 static int
vmm_handler(module_t mod,int what,void * arg)358 vmm_handler(module_t mod, int what, void *arg)
359 {
360 	int error;
361 
362 	switch (what) {
363 	case MOD_LOAD:
364 		vmmdev_init();
365 		error = vmm_init();
366 		if (error == 0)
367 			vmm_initialized = 1;
368 		break;
369 	case MOD_UNLOAD:
370 		error = vmmdev_cleanup();
371 		if (error == 0) {
372 			vmm_resume_p = NULL;
373 			iommu_cleanup();
374 			if (vmm_ipinum != IPI_AST)
375 				lapic_ipi_free(vmm_ipinum);
376 			error = VMM_CLEANUP();
377 			/*
378 			 * Something bad happened - prevent new
379 			 * VMs from being created
380 			 */
381 			if (error)
382 				vmm_initialized = 0;
383 		}
384 		break;
385 	default:
386 		error = 0;
387 		break;
388 	}
389 	return (error);
390 }
391 
392 static moduledata_t vmm_kmod = {
393 	"vmm",
394 	vmm_handler,
395 	NULL
396 };
397 
398 /*
399  * vmm initialization has the following dependencies:
400  *
401  * - VT-x initialization requires smp_rendezvous() and therefore must happen
402  *   after SMP is fully functional (after SI_SUB_SMP).
403  */
404 DECLARE_MODULE(vmm, vmm_kmod, SI_SUB_SMP + 1, SI_ORDER_ANY);
405 MODULE_VERSION(vmm, 1);
406 
407 static void
vm_init(struct vm * vm,bool create)408 vm_init(struct vm *vm, bool create)
409 {
410 	int i;
411 
412 	vm->cookie = VMINIT(vm, vmspace_pmap(vm->vmspace));
413 	vm->iommu = NULL;
414 	vm->vioapic = vioapic_init(vm);
415 	vm->vhpet = vhpet_init(vm);
416 	vm->vatpic = vatpic_init(vm);
417 	vm->vatpit = vatpit_init(vm);
418 	vm->vpmtmr = vpmtmr_init(vm);
419 	if (create)
420 		vm->vrtc = vrtc_init(vm);
421 
422 	CPU_ZERO(&vm->active_cpus);
423 
424 	vm->suspend = 0;
425 	CPU_ZERO(&vm->suspended_cpus);
426 
427 	for (i = 0; i < vm->maxcpus; i++)
428 		vcpu_init(vm, i, create);
429 }
430 
431 /*
432  * The default CPU topology is a single thread per package.
433  */
434 u_int cores_per_package = 1;
435 u_int threads_per_core = 1;
436 
437 int
vm_create(const char * name,struct vm ** retvm)438 vm_create(const char *name, struct vm **retvm)
439 {
440 	struct vm *vm;
441 	struct vmspace *vmspace;
442 
443 	/*
444 	 * If vmm.ko could not be successfully initialized then don't attempt
445 	 * to create the virtual machine.
446 	 */
447 	if (!vmm_initialized)
448 		return (ENXIO);
449 
450 	if (name == NULL || strlen(name) >= VM_MAX_NAMELEN)
451 		return (EINVAL);
452 
453 	vmspace = VMSPACE_ALLOC(0, VM_MAXUSER_ADDRESS);
454 	if (vmspace == NULL)
455 		return (ENOMEM);
456 
457 	vm = malloc(sizeof(struct vm), M_VM, M_WAITOK | M_ZERO);
458 	strcpy(vm->name, name);
459 	vm->vmspace = vmspace;
460 	mtx_init(&vm->rendezvous_mtx, "vm rendezvous lock", 0, MTX_DEF);
461 
462 	vm->sockets = 1;
463 	vm->cores = cores_per_package;	/* XXX backwards compatibility */
464 	vm->threads = threads_per_core;	/* XXX backwards compatibility */
465 	vm->maxcpus = VM_MAXCPU;	/* XXX temp to keep code working */
466 
467 	vm_init(vm, true);
468 
469 	*retvm = vm;
470 	return (0);
471 }
472 
473 void
vm_get_topology(struct vm * vm,uint16_t * sockets,uint16_t * cores,uint16_t * threads,uint16_t * maxcpus)474 vm_get_topology(struct vm *vm, uint16_t *sockets, uint16_t *cores,
475     uint16_t *threads, uint16_t *maxcpus)
476 {
477 	*sockets = vm->sockets;
478 	*cores = vm->cores;
479 	*threads = vm->threads;
480 	*maxcpus = vm->maxcpus;
481 }
482 
483 uint16_t
vm_get_maxcpus(struct vm * vm)484 vm_get_maxcpus(struct vm *vm)
485 {
486 	return (vm->maxcpus);
487 }
488 
489 int
vm_set_topology(struct vm * vm,uint16_t sockets,uint16_t cores,uint16_t threads,uint16_t maxcpus)490 vm_set_topology(struct vm *vm, uint16_t sockets, uint16_t cores,
491     uint16_t threads, uint16_t maxcpus)
492 {
493 	if (maxcpus != 0)
494 		return (EINVAL);	/* XXX remove when supported */
495 	if ((sockets * cores * threads) > vm->maxcpus)
496 		return (EINVAL);
497 	/* XXX need to check sockets * cores * threads == vCPU, how? */
498 	vm->sockets = sockets;
499 	vm->cores = cores;
500 	vm->threads = threads;
501 	vm->maxcpus = VM_MAXCPU;	/* XXX temp to keep code working */
502 	return(0);
503 }
504 
505 static void
vm_cleanup(struct vm * vm,bool destroy)506 vm_cleanup(struct vm *vm, bool destroy)
507 {
508 	struct mem_map *mm;
509 	int i;
510 
511 	ppt_unassign_all(vm);
512 
513 	if (vm->iommu != NULL)
514 		iommu_destroy_domain(vm->iommu);
515 
516 	if (destroy)
517 		vrtc_cleanup(vm->vrtc);
518 	else
519 		vrtc_reset(vm->vrtc);
520 	vpmtmr_cleanup(vm->vpmtmr);
521 	vatpit_cleanup(vm->vatpit);
522 	vhpet_cleanup(vm->vhpet);
523 	vatpic_cleanup(vm->vatpic);
524 	vioapic_cleanup(vm->vioapic);
525 
526 	for (i = 0; i < vm->maxcpus; i++)
527 		vcpu_cleanup(vm, i, destroy);
528 
529 	VMCLEANUP(vm->cookie);
530 
531 	/*
532 	 * System memory is removed from the guest address space only when
533 	 * the VM is destroyed. This is because the mapping remains the same
534 	 * across VM reset.
535 	 *
536 	 * Device memory can be relocated by the guest (e.g. using PCI BARs)
537 	 * so those mappings are removed on a VM reset.
538 	 */
539 	for (i = 0; i < VM_MAX_MEMMAPS; i++) {
540 		mm = &vm->mem_maps[i];
541 		if (destroy || !sysmem_mapping(vm, mm))
542 			vm_free_memmap(vm, i);
543 	}
544 
545 	if (destroy) {
546 		for (i = 0; i < VM_MAX_MEMSEGS; i++)
547 			vm_free_memseg(vm, i);
548 
549 		VMSPACE_FREE(vm->vmspace);
550 		vm->vmspace = NULL;
551 	}
552 }
553 
554 void
vm_destroy(struct vm * vm)555 vm_destroy(struct vm *vm)
556 {
557 	vm_cleanup(vm, true);
558 	free(vm, M_VM);
559 }
560 
561 int
vm_reinit(struct vm * vm)562 vm_reinit(struct vm *vm)
563 {
564 	int error;
565 
566 	/*
567 	 * A virtual machine can be reset only if all vcpus are suspended.
568 	 */
569 	if (CPU_CMP(&vm->suspended_cpus, &vm->active_cpus) == 0) {
570 		vm_cleanup(vm, false);
571 		vm_init(vm, false);
572 		error = 0;
573 	} else {
574 		error = EBUSY;
575 	}
576 
577 	return (error);
578 }
579 
580 const char *
vm_name(struct vm * vm)581 vm_name(struct vm *vm)
582 {
583 	return (vm->name);
584 }
585 
586 int
vm_map_mmio(struct vm * vm,vm_paddr_t gpa,size_t len,vm_paddr_t hpa)587 vm_map_mmio(struct vm *vm, vm_paddr_t gpa, size_t len, vm_paddr_t hpa)
588 {
589 	vm_object_t obj;
590 
591 	if ((obj = vmm_mmio_alloc(vm->vmspace, gpa, len, hpa)) == NULL)
592 		return (ENOMEM);
593 	else
594 		return (0);
595 }
596 
597 int
vm_unmap_mmio(struct vm * vm,vm_paddr_t gpa,size_t len)598 vm_unmap_mmio(struct vm *vm, vm_paddr_t gpa, size_t len)
599 {
600 
601 	vmm_mmio_free(vm->vmspace, gpa, len);
602 	return (0);
603 }
604 
605 /*
606  * Return 'true' if 'gpa' is allocated in the guest address space.
607  *
608  * This function is called in the context of a running vcpu which acts as
609  * an implicit lock on 'vm->mem_maps[]'.
610  */
611 bool
vm_mem_allocated(struct vm * vm,int vcpuid,vm_paddr_t gpa)612 vm_mem_allocated(struct vm *vm, int vcpuid, vm_paddr_t gpa)
613 {
614 	struct mem_map *mm;
615 	int i;
616 
617 #ifdef INVARIANTS
618 	int hostcpu, state;
619 	state = vcpu_get_state(vm, vcpuid, &hostcpu);
620 	KASSERT(state == VCPU_RUNNING && hostcpu == curcpu,
621 	    ("%s: invalid vcpu state %d/%d", __func__, state, hostcpu));
622 #endif
623 
624 	for (i = 0; i < VM_MAX_MEMMAPS; i++) {
625 		mm = &vm->mem_maps[i];
626 		if (mm->len != 0 && gpa >= mm->gpa && gpa < mm->gpa + mm->len)
627 			return (true);		/* 'gpa' is sysmem or devmem */
628 	}
629 
630 	if (ppt_is_mmio(vm, gpa))
631 		return (true);			/* 'gpa' is pci passthru mmio */
632 
633 	return (false);
634 }
635 
636 int
vm_alloc_memseg(struct vm * vm,int ident,size_t len,bool sysmem)637 vm_alloc_memseg(struct vm *vm, int ident, size_t len, bool sysmem)
638 {
639 	struct mem_seg *seg;
640 	vm_object_t obj;
641 
642 	if (ident < 0 || ident >= VM_MAX_MEMSEGS)
643 		return (EINVAL);
644 
645 	if (len == 0 || (len & PAGE_MASK))
646 		return (EINVAL);
647 
648 	seg = &vm->mem_segs[ident];
649 	if (seg->object != NULL) {
650 		if (seg->len == len && seg->sysmem == sysmem)
651 			return (EEXIST);
652 		else
653 			return (EINVAL);
654 	}
655 
656 	obj = vm_object_allocate(OBJT_DEFAULT, len >> PAGE_SHIFT);
657 	if (obj == NULL)
658 		return (ENOMEM);
659 
660 	seg->len = len;
661 	seg->object = obj;
662 	seg->sysmem = sysmem;
663 	return (0);
664 }
665 
666 int
vm_get_memseg(struct vm * vm,int ident,size_t * len,bool * sysmem,vm_object_t * objptr)667 vm_get_memseg(struct vm *vm, int ident, size_t *len, bool *sysmem,
668     vm_object_t *objptr)
669 {
670 	struct mem_seg *seg;
671 
672 	if (ident < 0 || ident >= VM_MAX_MEMSEGS)
673 		return (EINVAL);
674 
675 	seg = &vm->mem_segs[ident];
676 	if (len)
677 		*len = seg->len;
678 	if (sysmem)
679 		*sysmem = seg->sysmem;
680 	if (objptr)
681 		*objptr = seg->object;
682 	return (0);
683 }
684 
685 void
vm_free_memseg(struct vm * vm,int ident)686 vm_free_memseg(struct vm *vm, int ident)
687 {
688 	struct mem_seg *seg;
689 
690 	KASSERT(ident >= 0 && ident < VM_MAX_MEMSEGS,
691 	    ("%s: invalid memseg ident %d", __func__, ident));
692 
693 	seg = &vm->mem_segs[ident];
694 	if (seg->object != NULL) {
695 		vm_object_deallocate(seg->object);
696 		bzero(seg, sizeof(struct mem_seg));
697 	}
698 }
699 
700 int
vm_mmap_memseg(struct vm * vm,vm_paddr_t gpa,int segid,vm_ooffset_t first,size_t len,int prot,int flags)701 vm_mmap_memseg(struct vm *vm, vm_paddr_t gpa, int segid, vm_ooffset_t first,
702     size_t len, int prot, int flags)
703 {
704 	struct mem_seg *seg;
705 	struct mem_map *m, *map;
706 	vm_ooffset_t last;
707 	int i, error;
708 
709 	if (prot == 0 || (prot & ~(VM_PROT_ALL)) != 0)
710 		return (EINVAL);
711 
712 	if (flags & ~VM_MEMMAP_F_WIRED)
713 		return (EINVAL);
714 
715 	if (segid < 0 || segid >= VM_MAX_MEMSEGS)
716 		return (EINVAL);
717 
718 	seg = &vm->mem_segs[segid];
719 	if (seg->object == NULL)
720 		return (EINVAL);
721 
722 	last = first + len;
723 	if (first < 0 || first >= last || last > seg->len)
724 		return (EINVAL);
725 
726 	if ((gpa | first | last) & PAGE_MASK)
727 		return (EINVAL);
728 
729 	map = NULL;
730 	for (i = 0; i < VM_MAX_MEMMAPS; i++) {
731 		m = &vm->mem_maps[i];
732 		if (m->len == 0) {
733 			map = m;
734 			break;
735 		}
736 	}
737 
738 	if (map == NULL)
739 		return (ENOSPC);
740 
741 	error = vm_map_find(&vm->vmspace->vm_map, seg->object, first, &gpa,
742 	    len, 0, VMFS_NO_SPACE, prot, prot, 0);
743 	if (error != KERN_SUCCESS)
744 		return (EFAULT);
745 
746 	vm_object_reference(seg->object);
747 
748 	if (flags & VM_MEMMAP_F_WIRED) {
749 		error = vm_map_wire(&vm->vmspace->vm_map, gpa, gpa + len,
750 		    VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES);
751 		if (error != KERN_SUCCESS) {
752 			vm_map_remove(&vm->vmspace->vm_map, gpa, gpa + len);
753 			return (EFAULT);
754 		}
755 	}
756 
757 	map->gpa = gpa;
758 	map->len = len;
759 	map->segoff = first;
760 	map->segid = segid;
761 	map->prot = prot;
762 	map->flags = flags;
763 	return (0);
764 }
765 
766 int
vm_mmap_getnext(struct vm * vm,vm_paddr_t * gpa,int * segid,vm_ooffset_t * segoff,size_t * len,int * prot,int * flags)767 vm_mmap_getnext(struct vm *vm, vm_paddr_t *gpa, int *segid,
768     vm_ooffset_t *segoff, size_t *len, int *prot, int *flags)
769 {
770 	struct mem_map *mm, *mmnext;
771 	int i;
772 
773 	mmnext = NULL;
774 	for (i = 0; i < VM_MAX_MEMMAPS; i++) {
775 		mm = &vm->mem_maps[i];
776 		if (mm->len == 0 || mm->gpa < *gpa)
777 			continue;
778 		if (mmnext == NULL || mm->gpa < mmnext->gpa)
779 			mmnext = mm;
780 	}
781 
782 	if (mmnext != NULL) {
783 		*gpa = mmnext->gpa;
784 		if (segid)
785 			*segid = mmnext->segid;
786 		if (segoff)
787 			*segoff = mmnext->segoff;
788 		if (len)
789 			*len = mmnext->len;
790 		if (prot)
791 			*prot = mmnext->prot;
792 		if (flags)
793 			*flags = mmnext->flags;
794 		return (0);
795 	} else {
796 		return (ENOENT);
797 	}
798 }
799 
800 static void
vm_free_memmap(struct vm * vm,int ident)801 vm_free_memmap(struct vm *vm, int ident)
802 {
803 	struct mem_map *mm;
804 	int error;
805 
806 	mm = &vm->mem_maps[ident];
807 	if (mm->len) {
808 		error = vm_map_remove(&vm->vmspace->vm_map, mm->gpa,
809 		    mm->gpa + mm->len);
810 		KASSERT(error == KERN_SUCCESS, ("%s: vm_map_remove error %d",
811 		    __func__, error));
812 		bzero(mm, sizeof(struct mem_map));
813 	}
814 }
815 
816 static __inline bool
sysmem_mapping(struct vm * vm,struct mem_map * mm)817 sysmem_mapping(struct vm *vm, struct mem_map *mm)
818 {
819 
820 	if (mm->len != 0 && vm->mem_segs[mm->segid].sysmem)
821 		return (true);
822 	else
823 		return (false);
824 }
825 
826 vm_paddr_t
vmm_sysmem_maxaddr(struct vm * vm)827 vmm_sysmem_maxaddr(struct vm *vm)
828 {
829 	struct mem_map *mm;
830 	vm_paddr_t maxaddr;
831 	int i;
832 
833 	maxaddr = 0;
834 	for (i = 0; i < VM_MAX_MEMMAPS; i++) {
835 		mm = &vm->mem_maps[i];
836 		if (sysmem_mapping(vm, mm)) {
837 			if (maxaddr < mm->gpa + mm->len)
838 				maxaddr = mm->gpa + mm->len;
839 		}
840 	}
841 	return (maxaddr);
842 }
843 
844 static void
vm_iommu_modify(struct vm * vm,bool map)845 vm_iommu_modify(struct vm *vm, bool map)
846 {
847 	int i, sz;
848 	vm_paddr_t gpa, hpa;
849 	struct mem_map *mm;
850 	void *vp, *cookie, *host_domain;
851 
852 	sz = PAGE_SIZE;
853 	host_domain = iommu_host_domain();
854 
855 	for (i = 0; i < VM_MAX_MEMMAPS; i++) {
856 		mm = &vm->mem_maps[i];
857 		if (!sysmem_mapping(vm, mm))
858 			continue;
859 
860 		if (map) {
861 			KASSERT((mm->flags & VM_MEMMAP_F_IOMMU) == 0,
862 			    ("iommu map found invalid memmap %#lx/%#lx/%#x",
863 			    mm->gpa, mm->len, mm->flags));
864 			if ((mm->flags & VM_MEMMAP_F_WIRED) == 0)
865 				continue;
866 			mm->flags |= VM_MEMMAP_F_IOMMU;
867 		} else {
868 			if ((mm->flags & VM_MEMMAP_F_IOMMU) == 0)
869 				continue;
870 			mm->flags &= ~VM_MEMMAP_F_IOMMU;
871 			KASSERT((mm->flags & VM_MEMMAP_F_WIRED) != 0,
872 			    ("iommu unmap found invalid memmap %#lx/%#lx/%#x",
873 			    mm->gpa, mm->len, mm->flags));
874 		}
875 
876 		gpa = mm->gpa;
877 		while (gpa < mm->gpa + mm->len) {
878 			vp = vm_gpa_hold(vm, -1, gpa, PAGE_SIZE, VM_PROT_WRITE,
879 					 &cookie);
880 			KASSERT(vp != NULL, ("vm(%s) could not map gpa %#lx",
881 			    vm_name(vm), gpa));
882 
883 			vm_gpa_release(cookie);
884 
885 			hpa = DMAP_TO_PHYS((uintptr_t)vp);
886 			if (map) {
887 				iommu_create_mapping(vm->iommu, gpa, hpa, sz);
888 				iommu_remove_mapping(host_domain, hpa, sz);
889 			} else {
890 				iommu_remove_mapping(vm->iommu, gpa, sz);
891 				iommu_create_mapping(host_domain, hpa, hpa, sz);
892 			}
893 
894 			gpa += PAGE_SIZE;
895 		}
896 	}
897 
898 	/*
899 	 * Invalidate the cached translations associated with the domain
900 	 * from which pages were removed.
901 	 */
902 	if (map)
903 		iommu_invalidate_tlb(host_domain);
904 	else
905 		iommu_invalidate_tlb(vm->iommu);
906 }
907 
/* Direction-specific shorthands for vm_iommu_modify(). */
#define	vm_iommu_unmap(vm)						\
	vm_iommu_modify((vm), false)
#define	vm_iommu_map(vm)						\
	vm_iommu_modify((vm), true)
910 
/*
 * Unassign passthru device bus/slot/func from 'vm'. When no assigned
 * devices remain afterwards, the guest memory is unmapped from the iommu.
 *
 * Returns 0 on success or the error from ppt_unassign_device().
 */
int
vm_unassign_pptdev(struct vm *vm, int bus, int slot, int func)
{
	int rc;

	rc = ppt_unassign_device(vm, bus, slot, func);
	if (rc != 0)
		return (rc);

	/* The last device is gone. */
	if (ppt_assigned_devices(vm) == 0) {
		vm_iommu_unmap(vm);
	}

	return (0);
}
925 
926 int
vm_assign_pptdev(struct vm * vm,int bus,int slot,int func)927 vm_assign_pptdev(struct vm *vm, int bus, int slot, int func)
928 {
929 	int error;
930 	vm_paddr_t maxaddr;
931 
932 	/* Set up the IOMMU to do the 'gpa' to 'hpa' translation */
933 	if (ppt_assigned_devices(vm) == 0) {
934 		KASSERT(vm->iommu == NULL,
935 		    ("vm_assign_pptdev: iommu must be NULL"));
936 		maxaddr = vmm_sysmem_maxaddr(vm);
937 		vm->iommu = iommu_create_domain(maxaddr);
938 		if (vm->iommu == NULL)
939 			return (ENXIO);
940 		vm_iommu_map(vm);
941 	}
942 
943 	error = ppt_assign_device(vm, bus, slot, func);
944 	return (error);
945 }
946 
947 void *
vm_gpa_hold(struct vm * vm,int vcpuid,vm_paddr_t gpa,size_t len,int reqprot,void ** cookie)948 vm_gpa_hold(struct vm *vm, int vcpuid, vm_paddr_t gpa, size_t len, int reqprot,
949 	    void **cookie)
950 {
951 	int i, count, pageoff;
952 	struct mem_map *mm;
953 	vm_page_t m;
954 #ifdef INVARIANTS
955 	/*
956 	 * All vcpus are frozen by ioctls that modify the memory map
957 	 * (e.g. VM_MMAP_MEMSEG). Therefore 'vm->memmap[]' stability is
958 	 * guaranteed if at least one vcpu is in the VCPU_FROZEN state.
959 	 */
960 	int state;
961 	KASSERT(vcpuid >= -1 && vcpuid < vm->maxcpus, ("%s: invalid vcpuid %d",
962 	    __func__, vcpuid));
963 	for (i = 0; i < vm->maxcpus; i++) {
964 		if (vcpuid != -1 && vcpuid != i)
965 			continue;
966 		state = vcpu_get_state(vm, i, NULL);
967 		KASSERT(state == VCPU_FROZEN, ("%s: invalid vcpu state %d",
968 		    __func__, state));
969 	}
970 #endif
971 	pageoff = gpa & PAGE_MASK;
972 	if (len > PAGE_SIZE - pageoff)
973 		panic("vm_gpa_hold: invalid gpa/len: 0x%016lx/%lu", gpa, len);
974 
975 	count = 0;
976 	for (i = 0; i < VM_MAX_MEMMAPS; i++) {
977 		mm = &vm->mem_maps[i];
978 		if (sysmem_mapping(vm, mm) && gpa >= mm->gpa &&
979 		    gpa < mm->gpa + mm->len) {
980 			count = vm_fault_quick_hold_pages(&vm->vmspace->vm_map,
981 			    trunc_page(gpa), PAGE_SIZE, reqprot, &m, 1);
982 			break;
983 		}
984 	}
985 
986 	if (count == 1) {
987 		*cookie = m;
988 		return ((void *)(PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)) + pageoff));
989 	} else {
990 		*cookie = NULL;
991 		return (NULL);
992 	}
993 }
994 
995 void
vm_gpa_release(void * cookie)996 vm_gpa_release(void *cookie)
997 {
998 	vm_page_t m = cookie;
999 
1000 	vm_page_lock(m);
1001 	vm_page_unhold(m);
1002 	vm_page_unlock(m);
1003 }
1004 
1005 int
vm_get_register(struct vm * vm,int vcpu,int reg,uint64_t * retval)1006 vm_get_register(struct vm *vm, int vcpu, int reg, uint64_t *retval)
1007 {
1008 
1009 	if (vcpu < 0 || vcpu >= vm->maxcpus)
1010 		return (EINVAL);
1011 
1012 	if (reg >= VM_REG_LAST)
1013 		return (EINVAL);
1014 
1015 	return (VMGETREG(vm->cookie, vcpu, reg, retval));
1016 }
1017 
1018 int
vm_set_register(struct vm * vm,int vcpuid,int reg,uint64_t val)1019 vm_set_register(struct vm *vm, int vcpuid, int reg, uint64_t val)
1020 {
1021 	struct vcpu *vcpu;
1022 	int error;
1023 
1024 	if (vcpuid < 0 || vcpuid >= vm->maxcpus)
1025 		return (EINVAL);
1026 
1027 	if (reg >= VM_REG_LAST)
1028 		return (EINVAL);
1029 
1030 	error = VMSETREG(vm->cookie, vcpuid, reg, val);
1031 	if (error || reg != VM_REG_GUEST_RIP)
1032 		return (error);
1033 
1034 	/* Set 'nextrip' to match the value of %rip */
1035 	VCPU_CTR1(vm, vcpuid, "Setting nextrip to %#lx", val);
1036 	vcpu = &vm->vcpu[vcpuid];
1037 	vcpu->nextrip = val;
1038 	return (0);
1039 }
1040 
1041 static bool
is_descriptor_table(int reg)1042 is_descriptor_table(int reg)
1043 {
1044 
1045 	switch (reg) {
1046 	case VM_REG_GUEST_IDTR:
1047 	case VM_REG_GUEST_GDTR:
1048 		return (true);
1049 	default:
1050 		return (false);
1051 	}
1052 }
1053 
1054 static bool
is_segment_register(int reg)1055 is_segment_register(int reg)
1056 {
1057 
1058 	switch (reg) {
1059 	case VM_REG_GUEST_ES:
1060 	case VM_REG_GUEST_CS:
1061 	case VM_REG_GUEST_SS:
1062 	case VM_REG_GUEST_DS:
1063 	case VM_REG_GUEST_FS:
1064 	case VM_REG_GUEST_GS:
1065 	case VM_REG_GUEST_TR:
1066 	case VM_REG_GUEST_LDTR:
1067 		return (true);
1068 	default:
1069 		return (false);
1070 	}
1071 }
1072 
1073 int
vm_get_seg_desc(struct vm * vm,int vcpu,int reg,struct seg_desc * desc)1074 vm_get_seg_desc(struct vm *vm, int vcpu, int reg,
1075 		struct seg_desc *desc)
1076 {
1077 
1078 	if (vcpu < 0 || vcpu >= vm->maxcpus)
1079 		return (EINVAL);
1080 
1081 	if (!is_segment_register(reg) && !is_descriptor_table(reg))
1082 		return (EINVAL);
1083 
1084 	return (VMGETDESC(vm->cookie, vcpu, reg, desc));
1085 }
1086 
1087 int
vm_set_seg_desc(struct vm * vm,int vcpu,int reg,struct seg_desc * desc)1088 vm_set_seg_desc(struct vm *vm, int vcpu, int reg,
1089 		struct seg_desc *desc)
1090 {
1091 	if (vcpu < 0 || vcpu >= vm->maxcpus)
1092 		return (EINVAL);
1093 
1094 	if (!is_segment_register(reg) && !is_descriptor_table(reg))
1095 		return (EINVAL);
1096 
1097 	return (VMSETDESC(vm->cookie, vcpu, reg, desc));
1098 }
1099 
1100 static void
restore_guest_fpustate(struct vcpu * vcpu)1101 restore_guest_fpustate(struct vcpu *vcpu)
1102 {
1103 
1104 	/* flush host state to the pcb */
1105 	fpuexit(curthread);
1106 
1107 	/* restore guest FPU state */
1108 	fpu_stop_emulating();
1109 	fpurestore(vcpu->guestfpu);
1110 
1111 	/* restore guest XCR0 if XSAVE is enabled in the host */
1112 	if (rcr4() & CR4_XSAVE)
1113 		load_xcr(0, vcpu->guest_xcr0);
1114 
1115 	/*
1116 	 * The FPU is now "dirty" with the guest's state so turn on emulation
1117 	 * to trap any access to the FPU by the host.
1118 	 */
1119 	fpu_start_emulating();
1120 }
1121 
1122 static void
save_guest_fpustate(struct vcpu * vcpu)1123 save_guest_fpustate(struct vcpu *vcpu)
1124 {
1125 
1126 	if ((rcr0() & CR0_TS) == 0)
1127 		panic("fpu emulation not enabled in host!");
1128 
1129 	/* save guest XCR0 and restore host XCR0 */
1130 	if (rcr4() & CR4_XSAVE) {
1131 		vcpu->guest_xcr0 = rxcr(0);
1132 		load_xcr(0, vmm_get_host_xcr0());
1133 	}
1134 
1135 	/* save guest FPU state */
1136 	fpu_stop_emulating();
1137 	fpusave(vcpu->guestfpu);
1138 	fpu_start_emulating();
1139 }
1140 
1141 static VMM_STAT(VCPU_IDLE_TICKS, "number of ticks vcpu was idle");
1142 
1143 static int
vcpu_set_state_locked(struct vm * vm,int vcpuid,enum vcpu_state newstate,bool from_idle)1144 vcpu_set_state_locked(struct vm *vm, int vcpuid, enum vcpu_state newstate,
1145     bool from_idle)
1146 {
1147 	struct vcpu *vcpu;
1148 	int error;
1149 
1150 	vcpu = &vm->vcpu[vcpuid];
1151 	vcpu_assert_locked(vcpu);
1152 
1153 	/*
1154 	 * State transitions from the vmmdev_ioctl() must always begin from
1155 	 * the VCPU_IDLE state. This guarantees that there is only a single
1156 	 * ioctl() operating on a vcpu at any point.
1157 	 */
1158 	if (from_idle) {
1159 		while (vcpu->state != VCPU_IDLE) {
1160 			vcpu->reqidle = 1;
1161 			vcpu_notify_event_locked(vcpu, false);
1162 			VCPU_CTR1(vm, vcpuid, "vcpu state change from %s to "
1163 			    "idle requested", vcpu_state2str(vcpu->state));
1164 			msleep_spin(&vcpu->state, &vcpu->mtx, "vmstat", hz);
1165 		}
1166 	} else {
1167 		KASSERT(vcpu->state != VCPU_IDLE, ("invalid transition from "
1168 		    "vcpu idle state"));
1169 	}
1170 
1171 	if (vcpu->state == VCPU_RUNNING) {
1172 		KASSERT(vcpu->hostcpu == curcpu, ("curcpu %d and hostcpu %d "
1173 		    "mismatch for running vcpu", curcpu, vcpu->hostcpu));
1174 	} else {
1175 		KASSERT(vcpu->hostcpu == NOCPU, ("Invalid hostcpu %d for a "
1176 		    "vcpu that is not running", vcpu->hostcpu));
1177 	}
1178 
1179 	/*
1180 	 * The following state transitions are allowed:
1181 	 * IDLE -> FROZEN -> IDLE
1182 	 * FROZEN -> RUNNING -> FROZEN
1183 	 * FROZEN -> SLEEPING -> FROZEN
1184 	 */
1185 	switch (vcpu->state) {
1186 	case VCPU_IDLE:
1187 	case VCPU_RUNNING:
1188 	case VCPU_SLEEPING:
1189 		error = (newstate != VCPU_FROZEN);
1190 		break;
1191 	case VCPU_FROZEN:
1192 		error = (newstate == VCPU_FROZEN);
1193 		break;
1194 	default:
1195 		error = 1;
1196 		break;
1197 	}
1198 
1199 	if (error)
1200 		return (EBUSY);
1201 
1202 	VCPU_CTR2(vm, vcpuid, "vcpu state changed from %s to %s",
1203 	    vcpu_state2str(vcpu->state), vcpu_state2str(newstate));
1204 
1205 	vcpu->state = newstate;
1206 	if (newstate == VCPU_RUNNING)
1207 		vcpu->hostcpu = curcpu;
1208 	else
1209 		vcpu->hostcpu = NOCPU;
1210 
1211 	if (newstate == VCPU_IDLE)
1212 		wakeup(&vcpu->state);
1213 
1214 	return (0);
1215 }
1216 
1217 static void
vcpu_require_state(struct vm * vm,int vcpuid,enum vcpu_state newstate)1218 vcpu_require_state(struct vm *vm, int vcpuid, enum vcpu_state newstate)
1219 {
1220 	int error;
1221 
1222 	if ((error = vcpu_set_state(vm, vcpuid, newstate, false)) != 0)
1223 		panic("Error %d setting state to %d\n", error, newstate);
1224 }
1225 
1226 static void
vcpu_require_state_locked(struct vm * vm,int vcpuid,enum vcpu_state newstate)1227 vcpu_require_state_locked(struct vm *vm, int vcpuid, enum vcpu_state newstate)
1228 {
1229 	int error;
1230 
1231 	if ((error = vcpu_set_state_locked(vm, vcpuid, newstate, false)) != 0)
1232 		panic("Error %d setting state to %d", error, newstate);
1233 }
1234 
1235 static void
vm_set_rendezvous_func(struct vm * vm,vm_rendezvous_func_t func)1236 vm_set_rendezvous_func(struct vm *vm, vm_rendezvous_func_t func)
1237 {
1238 
1239 	KASSERT(mtx_owned(&vm->rendezvous_mtx), ("rendezvous_mtx not locked"));
1240 
1241 	/*
1242 	 * Update 'rendezvous_func' and execute a write memory barrier to
1243 	 * ensure that it is visible across all host cpus. This is not needed
1244 	 * for correctness but it does ensure that all the vcpus will notice
1245 	 * that the rendezvous is requested immediately.
1246 	 */
1247 	vm->rendezvous_func = func;
1248 	wmb();
1249 }
1250 
/*
 * Emit a rendezvous trace record: through VCPU_CTR0() when 'vcpuid' is a
 * non-negative id, through VM_CTR0() otherwise.  'vcpuid' is parenthesized
 * in the comparison so that an expression argument is compared as a whole.
 */
#define	RENDEZVOUS_CTR0(vm, vcpuid, fmt)				\
	do {								\
		if ((vcpuid) >= 0)					\
			VCPU_CTR0(vm, vcpuid, fmt);			\
		else							\
			VM_CTR0(vm, fmt);				\
	} while (0)
1258 
1259 static void
vm_handle_rendezvous(struct vm * vm,int vcpuid)1260 vm_handle_rendezvous(struct vm *vm, int vcpuid)
1261 {
1262 
1263 	KASSERT(vcpuid == -1 || (vcpuid >= 0 && vcpuid < vm->maxcpus),
1264 	    ("vm_handle_rendezvous: invalid vcpuid %d", vcpuid));
1265 
1266 	mtx_lock(&vm->rendezvous_mtx);
1267 	while (vm->rendezvous_func != NULL) {
1268 		/* 'rendezvous_req_cpus' must be a subset of 'active_cpus' */
1269 		CPU_AND(&vm->rendezvous_req_cpus, &vm->active_cpus);
1270 
1271 		if (vcpuid != -1 &&
1272 		    CPU_ISSET(vcpuid, &vm->rendezvous_req_cpus) &&
1273 		    !CPU_ISSET(vcpuid, &vm->rendezvous_done_cpus)) {
1274 			VCPU_CTR0(vm, vcpuid, "Calling rendezvous func");
1275 			(*vm->rendezvous_func)(vm, vcpuid, vm->rendezvous_arg);
1276 			CPU_SET(vcpuid, &vm->rendezvous_done_cpus);
1277 		}
1278 		if (CPU_CMP(&vm->rendezvous_req_cpus,
1279 		    &vm->rendezvous_done_cpus) == 0) {
1280 			VCPU_CTR0(vm, vcpuid, "Rendezvous completed");
1281 			vm_set_rendezvous_func(vm, NULL);
1282 			wakeup(&vm->rendezvous_func);
1283 			break;
1284 		}
1285 		RENDEZVOUS_CTR0(vm, vcpuid, "Wait for rendezvous completion");
1286 		mtx_sleep(&vm->rendezvous_func, &vm->rendezvous_mtx, 0,
1287 		    "vmrndv", 0);
1288 	}
1289 	mtx_unlock(&vm->rendezvous_mtx);
1290 }
1291 
1292 /*
1293  * Emulate a guest 'hlt' by sleeping until the vcpu is ready to run.
1294  */
1295 static int
vm_handle_hlt(struct vm * vm,int vcpuid,bool intr_disabled,bool * retu)1296 vm_handle_hlt(struct vm *vm, int vcpuid, bool intr_disabled, bool *retu)
1297 {
1298 	struct vcpu *vcpu;
1299 	const char *wmesg;
1300 	int t, vcpu_halted, vm_halted;
1301 
1302 	KASSERT(!CPU_ISSET(vcpuid, &vm->halted_cpus), ("vcpu already halted"));
1303 
1304 	vcpu = &vm->vcpu[vcpuid];
1305 	vcpu_halted = 0;
1306 	vm_halted = 0;
1307 
1308 	vcpu_lock(vcpu);
1309 	while (1) {
1310 		/*
1311 		 * Do a final check for pending NMI or interrupts before
1312 		 * really putting this thread to sleep. Also check for
1313 		 * software events that would cause this vcpu to wakeup.
1314 		 *
1315 		 * These interrupts/events could have happened after the
1316 		 * vcpu returned from VMRUN() and before it acquired the
1317 		 * vcpu lock above.
1318 		 */
1319 		if (vm->rendezvous_func != NULL || vm->suspend || vcpu->reqidle)
1320 			break;
1321 		if (vm_nmi_pending(vm, vcpuid))
1322 			break;
1323 		if (!intr_disabled) {
1324 			if (vm_extint_pending(vm, vcpuid) ||
1325 			    vlapic_pending_intr(vcpu->vlapic, NULL)) {
1326 				break;
1327 			}
1328 		}
1329 
1330 		/* Don't go to sleep if the vcpu thread needs to yield */
1331 		if (vcpu_should_yield(vm, vcpuid))
1332 			break;
1333 
1334 		/*
1335 		 * Some Linux guests implement "halt" by having all vcpus
1336 		 * execute HLT with interrupts disabled. 'halted_cpus' keeps
1337 		 * track of the vcpus that have entered this state. When all
1338 		 * vcpus enter the halted state the virtual machine is halted.
1339 		 */
1340 		if (intr_disabled) {
1341 			wmesg = "vmhalt";
1342 			VCPU_CTR0(vm, vcpuid, "Halted");
1343 			if (!vcpu_halted && halt_detection_enabled) {
1344 				vcpu_halted = 1;
1345 				CPU_SET_ATOMIC(vcpuid, &vm->halted_cpus);
1346 			}
1347 			if (CPU_CMP(&vm->halted_cpus, &vm->active_cpus) == 0) {
1348 				vm_halted = 1;
1349 				break;
1350 			}
1351 		} else {
1352 			wmesg = "vmidle";
1353 		}
1354 
1355 		t = ticks;
1356 		vcpu_require_state_locked(vm, vcpuid, VCPU_SLEEPING);
1357 		/*
1358 		 * XXX msleep_spin() cannot be interrupted by signals so
1359 		 * wake up periodically to check pending signals.
1360 		 */
1361 		msleep_spin(vcpu, &vcpu->mtx, wmesg, hz);
1362 		vcpu_require_state_locked(vm, vcpuid, VCPU_FROZEN);
1363 		vmm_stat_incr(vm, vcpuid, VCPU_IDLE_TICKS, ticks - t);
1364 	}
1365 
1366 	if (vcpu_halted)
1367 		CPU_CLR_ATOMIC(vcpuid, &vm->halted_cpus);
1368 
1369 	vcpu_unlock(vcpu);
1370 
1371 	if (vm_halted)
1372 		vm_suspend(vm, VM_SUSPEND_HALT);
1373 
1374 	return (0);
1375 }
1376 
1377 static int
vm_handle_paging(struct vm * vm,int vcpuid,bool * retu)1378 vm_handle_paging(struct vm *vm, int vcpuid, bool *retu)
1379 {
1380 	int rv, ftype;
1381 	struct vm_map *map;
1382 	struct vcpu *vcpu;
1383 	struct vm_exit *vme;
1384 
1385 	vcpu = &vm->vcpu[vcpuid];
1386 	vme = &vcpu->exitinfo;
1387 
1388 	KASSERT(vme->inst_length == 0, ("%s: invalid inst_length %d",
1389 	    __func__, vme->inst_length));
1390 
1391 	ftype = vme->u.paging.fault_type;
1392 	KASSERT(ftype == VM_PROT_READ ||
1393 	    ftype == VM_PROT_WRITE || ftype == VM_PROT_EXECUTE,
1394 	    ("vm_handle_paging: invalid fault_type %d", ftype));
1395 
1396 	if (ftype == VM_PROT_READ || ftype == VM_PROT_WRITE) {
1397 		rv = pmap_emulate_accessed_dirty(vmspace_pmap(vm->vmspace),
1398 		    vme->u.paging.gpa, ftype);
1399 		if (rv == 0) {
1400 			VCPU_CTR2(vm, vcpuid, "%s bit emulation for gpa %#lx",
1401 			    ftype == VM_PROT_READ ? "accessed" : "dirty",
1402 			    vme->u.paging.gpa);
1403 			goto done;
1404 		}
1405 	}
1406 
1407 	map = &vm->vmspace->vm_map;
1408 	rv = vm_fault(map, vme->u.paging.gpa, ftype, VM_FAULT_NORMAL);
1409 
1410 	VCPU_CTR3(vm, vcpuid, "vm_handle_paging rv = %d, gpa = %#lx, "
1411 	    "ftype = %d", rv, vme->u.paging.gpa, ftype);
1412 
1413 	if (rv != KERN_SUCCESS)
1414 		return (EFAULT);
1415 done:
1416 	return (0);
1417 }
1418 
1419 static int
vm_handle_inst_emul(struct vm * vm,int vcpuid,bool * retu)1420 vm_handle_inst_emul(struct vm *vm, int vcpuid, bool *retu)
1421 {
1422 	struct vie *vie;
1423 	struct vcpu *vcpu;
1424 	struct vm_exit *vme;
1425 	uint64_t gla, gpa, cs_base;
1426 	struct vm_guest_paging *paging;
1427 	mem_region_read_t mread;
1428 	mem_region_write_t mwrite;
1429 	enum vm_cpu_mode cpu_mode;
1430 	int cs_d, error, fault;
1431 
1432 	vcpu = &vm->vcpu[vcpuid];
1433 	vme = &vcpu->exitinfo;
1434 
1435 	KASSERT(vme->inst_length == 0, ("%s: invalid inst_length %d",
1436 	    __func__, vme->inst_length));
1437 
1438 	gla = vme->u.inst_emul.gla;
1439 	gpa = vme->u.inst_emul.gpa;
1440 	cs_base = vme->u.inst_emul.cs_base;
1441 	cs_d = vme->u.inst_emul.cs_d;
1442 	vie = &vme->u.inst_emul.vie;
1443 	paging = &vme->u.inst_emul.paging;
1444 	cpu_mode = paging->cpu_mode;
1445 
1446 	VCPU_CTR1(vm, vcpuid, "inst_emul fault accessing gpa %#lx", gpa);
1447 
1448 	/* Fetch, decode and emulate the faulting instruction */
1449 	if (vie->num_valid == 0) {
1450 		error = vmm_fetch_instruction(vm, vcpuid, paging, vme->rip +
1451 		    cs_base, VIE_INST_SIZE, vie, &fault);
1452 	} else {
1453 		/*
1454 		 * The instruction bytes have already been copied into 'vie'
1455 		 */
1456 		error = fault = 0;
1457 	}
1458 	if (error || fault)
1459 		return (error);
1460 
1461 	if (vmm_decode_instruction(vm, vcpuid, gla, cpu_mode, cs_d, vie) != 0) {
1462 		VCPU_CTR1(vm, vcpuid, "Error decoding instruction at %#lx",
1463 		    vme->rip + cs_base);
1464 		*retu = true;	    /* dump instruction bytes in userspace */
1465 		return (0);
1466 	}
1467 
1468 	/*
1469 	 * Update 'nextrip' based on the length of the emulated instruction.
1470 	 */
1471 	vme->inst_length = vie->num_processed;
1472 	vcpu->nextrip += vie->num_processed;
1473 	VCPU_CTR1(vm, vcpuid, "nextrip updated to %#lx after instruction "
1474 	    "decoding", vcpu->nextrip);
1475 
1476 	/* return to userland unless this is an in-kernel emulated device */
1477 	if (gpa >= DEFAULT_APIC_BASE && gpa < DEFAULT_APIC_BASE + PAGE_SIZE) {
1478 		mread = lapic_mmio_read;
1479 		mwrite = lapic_mmio_write;
1480 	} else if (gpa >= VIOAPIC_BASE && gpa < VIOAPIC_BASE + VIOAPIC_SIZE) {
1481 		mread = vioapic_mmio_read;
1482 		mwrite = vioapic_mmio_write;
1483 	} else if (gpa >= VHPET_BASE && gpa < VHPET_BASE + VHPET_SIZE) {
1484 		mread = vhpet_mmio_read;
1485 		mwrite = vhpet_mmio_write;
1486 	} else {
1487 		*retu = true;
1488 		return (0);
1489 	}
1490 
1491 	error = vmm_emulate_instruction(vm, vcpuid, gpa, vie, paging,
1492 	    mread, mwrite, retu);
1493 
1494 	return (error);
1495 }
1496 
1497 static int
vm_handle_suspend(struct vm * vm,int vcpuid,bool * retu)1498 vm_handle_suspend(struct vm *vm, int vcpuid, bool *retu)
1499 {
1500 	int i, done;
1501 	struct vcpu *vcpu;
1502 
1503 	done = 0;
1504 	vcpu = &vm->vcpu[vcpuid];
1505 
1506 	CPU_SET_ATOMIC(vcpuid, &vm->suspended_cpus);
1507 
1508 	/*
1509 	 * Wait until all 'active_cpus' have suspended themselves.
1510 	 *
1511 	 * Since a VM may be suspended at any time including when one or
1512 	 * more vcpus are doing a rendezvous we need to call the rendezvous
1513 	 * handler while we are waiting to prevent a deadlock.
1514 	 */
1515 	vcpu_lock(vcpu);
1516 	while (1) {
1517 		if (CPU_CMP(&vm->suspended_cpus, &vm->active_cpus) == 0) {
1518 			VCPU_CTR0(vm, vcpuid, "All vcpus suspended");
1519 			break;
1520 		}
1521 
1522 		if (vm->rendezvous_func == NULL) {
1523 			VCPU_CTR0(vm, vcpuid, "Sleeping during suspend");
1524 			vcpu_require_state_locked(vm, vcpuid, VCPU_SLEEPING);
1525 			msleep_spin(vcpu, &vcpu->mtx, "vmsusp", hz);
1526 			vcpu_require_state_locked(vm, vcpuid, VCPU_FROZEN);
1527 		} else {
1528 			VCPU_CTR0(vm, vcpuid, "Rendezvous during suspend");
1529 			vcpu_unlock(vcpu);
1530 			vm_handle_rendezvous(vm, vcpuid);
1531 			vcpu_lock(vcpu);
1532 		}
1533 	}
1534 	vcpu_unlock(vcpu);
1535 
1536 	/*
1537 	 * Wakeup the other sleeping vcpus and return to userspace.
1538 	 */
1539 	for (i = 0; i < vm->maxcpus; i++) {
1540 		if (CPU_ISSET(i, &vm->suspended_cpus)) {
1541 			vcpu_notify_event(vm, i, false);
1542 		}
1543 	}
1544 
1545 	*retu = true;
1546 	return (0);
1547 }
1548 
1549 static int
vm_handle_reqidle(struct vm * vm,int vcpuid,bool * retu)1550 vm_handle_reqidle(struct vm *vm, int vcpuid, bool *retu)
1551 {
1552 	struct vcpu *vcpu = &vm->vcpu[vcpuid];
1553 
1554 	vcpu_lock(vcpu);
1555 	KASSERT(vcpu->reqidle, ("invalid vcpu reqidle %d", vcpu->reqidle));
1556 	vcpu->reqidle = 0;
1557 	vcpu_unlock(vcpu);
1558 	*retu = true;
1559 	return (0);
1560 }
1561 
1562 int
vm_suspend(struct vm * vm,enum vm_suspend_how how)1563 vm_suspend(struct vm *vm, enum vm_suspend_how how)
1564 {
1565 	int i;
1566 
1567 	if (how <= VM_SUSPEND_NONE || how >= VM_SUSPEND_LAST)
1568 		return (EINVAL);
1569 
1570 	if (atomic_cmpset_int(&vm->suspend, 0, how) == 0) {
1571 		VM_CTR2(vm, "virtual machine already suspended %d/%d",
1572 		    vm->suspend, how);
1573 		return (EALREADY);
1574 	}
1575 
1576 	VM_CTR1(vm, "virtual machine successfully suspended %d", how);
1577 
1578 	/*
1579 	 * Notify all active vcpus that they are now suspended.
1580 	 */
1581 	for (i = 0; i < vm->maxcpus; i++) {
1582 		if (CPU_ISSET(i, &vm->active_cpus))
1583 			vcpu_notify_event(vm, i, false);
1584 	}
1585 
1586 	return (0);
1587 }
1588 
1589 void
vm_exit_suspended(struct vm * vm,int vcpuid,uint64_t rip)1590 vm_exit_suspended(struct vm *vm, int vcpuid, uint64_t rip)
1591 {
1592 	struct vm_exit *vmexit;
1593 
1594 	KASSERT(vm->suspend > VM_SUSPEND_NONE && vm->suspend < VM_SUSPEND_LAST,
1595 	    ("vm_exit_suspended: invalid suspend type %d", vm->suspend));
1596 
1597 	vmexit = vm_exitinfo(vm, vcpuid);
1598 	vmexit->rip = rip;
1599 	vmexit->inst_length = 0;
1600 	vmexit->exitcode = VM_EXITCODE_SUSPENDED;
1601 	vmexit->u.suspended.how = vm->suspend;
1602 }
1603 
1604 void
vm_exit_rendezvous(struct vm * vm,int vcpuid,uint64_t rip)1605 vm_exit_rendezvous(struct vm *vm, int vcpuid, uint64_t rip)
1606 {
1607 	struct vm_exit *vmexit;
1608 
1609 	KASSERT(vm->rendezvous_func != NULL, ("rendezvous not in progress"));
1610 
1611 	vmexit = vm_exitinfo(vm, vcpuid);
1612 	vmexit->rip = rip;
1613 	vmexit->inst_length = 0;
1614 	vmexit->exitcode = VM_EXITCODE_RENDEZVOUS;
1615 	vmm_stat_incr(vm, vcpuid, VMEXIT_RENDEZVOUS, 1);
1616 }
1617 
1618 void
vm_exit_reqidle(struct vm * vm,int vcpuid,uint64_t rip)1619 vm_exit_reqidle(struct vm *vm, int vcpuid, uint64_t rip)
1620 {
1621 	struct vm_exit *vmexit;
1622 
1623 	vmexit = vm_exitinfo(vm, vcpuid);
1624 	vmexit->rip = rip;
1625 	vmexit->inst_length = 0;
1626 	vmexit->exitcode = VM_EXITCODE_REQIDLE;
1627 	vmm_stat_incr(vm, vcpuid, VMEXIT_REQIDLE, 1);
1628 }
1629 
1630 void
vm_exit_astpending(struct vm * vm,int vcpuid,uint64_t rip)1631 vm_exit_astpending(struct vm *vm, int vcpuid, uint64_t rip)
1632 {
1633 	struct vm_exit *vmexit;
1634 
1635 	vmexit = vm_exitinfo(vm, vcpuid);
1636 	vmexit->rip = rip;
1637 	vmexit->inst_length = 0;
1638 	vmexit->exitcode = VM_EXITCODE_BOGUS;
1639 	vmm_stat_incr(vm, vcpuid, VMEXIT_ASTPENDING, 1);
1640 }
1641 
1642 int
vm_run(struct vm * vm,struct vm_run * vmrun)1643 vm_run(struct vm *vm, struct vm_run *vmrun)
1644 {
1645 	struct vm_eventinfo evinfo;
1646 	int error, vcpuid;
1647 	struct vcpu *vcpu;
1648 	struct pcb *pcb;
1649 	uint64_t tscval;
1650 	struct vm_exit *vme;
1651 	bool retu, intr_disabled;
1652 	pmap_t pmap;
1653 
1654 	vcpuid = vmrun->cpuid;
1655 
1656 	if (vcpuid < 0 || vcpuid >= vm->maxcpus)
1657 		return (EINVAL);
1658 
1659 	if (!CPU_ISSET(vcpuid, &vm->active_cpus))
1660 		return (EINVAL);
1661 
1662 	if (CPU_ISSET(vcpuid, &vm->suspended_cpus))
1663 		return (EINVAL);
1664 
1665 	pmap = vmspace_pmap(vm->vmspace);
1666 	vcpu = &vm->vcpu[vcpuid];
1667 	vme = &vcpu->exitinfo;
1668 	evinfo.rptr = &vm->rendezvous_func;
1669 	evinfo.sptr = &vm->suspend;
1670 	evinfo.iptr = &vcpu->reqidle;
1671 restart:
1672 	critical_enter();
1673 
1674 	KASSERT(!CPU_ISSET(curcpu, &pmap->pm_active),
1675 	    ("vm_run: absurd pm_active"));
1676 
1677 	tscval = rdtsc();
1678 
1679 	pcb = PCPU_GET(curpcb);
1680 	set_pcb_flags(pcb, PCB_FULL_IRET);
1681 
1682 	restore_guest_fpustate(vcpu);
1683 
1684 	vcpu_require_state(vm, vcpuid, VCPU_RUNNING);
1685 	error = VMRUN(vm->cookie, vcpuid, vcpu->nextrip, pmap, &evinfo);
1686 	vcpu_require_state(vm, vcpuid, VCPU_FROZEN);
1687 
1688 	save_guest_fpustate(vcpu);
1689 
1690 	vmm_stat_incr(vm, vcpuid, VCPU_TOTAL_RUNTIME, rdtsc() - tscval);
1691 
1692 	critical_exit();
1693 
1694 	if (error == 0) {
1695 		retu = false;
1696 		vcpu->nextrip = vme->rip + vme->inst_length;
1697 		switch (vme->exitcode) {
1698 		case VM_EXITCODE_REQIDLE:
1699 			error = vm_handle_reqidle(vm, vcpuid, &retu);
1700 			break;
1701 		case VM_EXITCODE_SUSPENDED:
1702 			error = vm_handle_suspend(vm, vcpuid, &retu);
1703 			break;
1704 		case VM_EXITCODE_IOAPIC_EOI:
1705 			vioapic_process_eoi(vm, vcpuid,
1706 			    vme->u.ioapic_eoi.vector);
1707 			break;
1708 		case VM_EXITCODE_RENDEZVOUS:
1709 			vm_handle_rendezvous(vm, vcpuid);
1710 			error = 0;
1711 			break;
1712 		case VM_EXITCODE_HLT:
1713 			intr_disabled = ((vme->u.hlt.rflags & PSL_I) == 0);
1714 			error = vm_handle_hlt(vm, vcpuid, intr_disabled, &retu);
1715 			break;
1716 		case VM_EXITCODE_PAGING:
1717 			error = vm_handle_paging(vm, vcpuid, &retu);
1718 			break;
1719 		case VM_EXITCODE_INST_EMUL:
1720 			error = vm_handle_inst_emul(vm, vcpuid, &retu);
1721 			break;
1722 		case VM_EXITCODE_INOUT:
1723 		case VM_EXITCODE_INOUT_STR:
1724 			error = vm_handle_inout(vm, vcpuid, vme, &retu);
1725 			break;
1726 		case VM_EXITCODE_MONITOR:
1727 		case VM_EXITCODE_MWAIT:
1728 		case VM_EXITCODE_VMINSN:
1729 			vm_inject_ud(vm, vcpuid);
1730 			break;
1731 		default:
1732 			retu = true;	/* handled in userland */
1733 			break;
1734 		}
1735 	}
1736 
1737 	if (error == 0 && retu == false)
1738 		goto restart;
1739 
1740 	VCPU_CTR2(vm, vcpuid, "retu %d/%d", error, vme->exitcode);
1741 
1742 	/* copy the exit information */
1743 	bcopy(vme, &vmrun->vm_exit, sizeof(struct vm_exit));
1744 	return (error);
1745 }
1746 
1747 int
vm_restart_instruction(void * arg,int vcpuid)1748 vm_restart_instruction(void *arg, int vcpuid)
1749 {
1750 	struct vm *vm;
1751 	struct vcpu *vcpu;
1752 	enum vcpu_state state;
1753 	uint64_t rip;
1754 	int error;
1755 
1756 	vm = arg;
1757 	if (vcpuid < 0 || vcpuid >= vm->maxcpus)
1758 		return (EINVAL);
1759 
1760 	vcpu = &vm->vcpu[vcpuid];
1761 	state = vcpu_get_state(vm, vcpuid, NULL);
1762 	if (state == VCPU_RUNNING) {
1763 		/*
1764 		 * When a vcpu is "running" the next instruction is determined
1765 		 * by adding 'rip' and 'inst_length' in the vcpu's 'exitinfo'.
1766 		 * Thus setting 'inst_length' to zero will cause the current
1767 		 * instruction to be restarted.
1768 		 */
1769 		vcpu->exitinfo.inst_length = 0;
1770 		VCPU_CTR1(vm, vcpuid, "restarting instruction at %#lx by "
1771 		    "setting inst_length to zero", vcpu->exitinfo.rip);
1772 	} else if (state == VCPU_FROZEN) {
1773 		/*
1774 		 * When a vcpu is "frozen" it is outside the critical section
1775 		 * around VMRUN() and 'nextrip' points to the next instruction.
1776 		 * Thus instruction restart is achieved by setting 'nextrip'
1777 		 * to the vcpu's %rip.
1778 		 */
1779 		error = vm_get_register(vm, vcpuid, VM_REG_GUEST_RIP, &rip);
1780 		KASSERT(!error, ("%s: error %d getting rip", __func__, error));
1781 		VCPU_CTR2(vm, vcpuid, "restarting instruction by updating "
1782 		    "nextrip from %#lx to %#lx", vcpu->nextrip, rip);
1783 		vcpu->nextrip = rip;
1784 	} else {
1785 		panic("%s: invalid state %d", __func__, state);
1786 	}
1787 	return (0);
1788 }
1789 
1790 int
vm_exit_intinfo(struct vm * vm,int vcpuid,uint64_t info)1791 vm_exit_intinfo(struct vm *vm, int vcpuid, uint64_t info)
1792 {
1793 	struct vcpu *vcpu;
1794 	int type, vector;
1795 
1796 	if (vcpuid < 0 || vcpuid >= vm->maxcpus)
1797 		return (EINVAL);
1798 
1799 	vcpu = &vm->vcpu[vcpuid];
1800 
1801 	if (info & VM_INTINFO_VALID) {
1802 		type = info & VM_INTINFO_TYPE;
1803 		vector = info & 0xff;
1804 		if (type == VM_INTINFO_NMI && vector != IDT_NMI)
1805 			return (EINVAL);
1806 		if (type == VM_INTINFO_HWEXCEPTION && vector >= 32)
1807 			return (EINVAL);
1808 		if (info & VM_INTINFO_RSVD)
1809 			return (EINVAL);
1810 	} else {
1811 		info = 0;
1812 	}
1813 	VCPU_CTR2(vm, vcpuid, "%s: info1(%#lx)", __func__, info);
1814 	vcpu->exitintinfo = info;
1815 	return (0);
1816 }
1817 
/*
 * Exception classes produced by exception_class() and compared by
 * nested_fault().
 */
enum exc_class {
	EXC_BENIGN = 0,
	EXC_CONTRIBUTORY = 1,
	EXC_PAGEFAULT = 2
};
1823 
/* Vector of the Virtualization Exception (Intel specific). */
#define	IDT_VE	20
1825 
1826 static enum exc_class
exception_class(uint64_t info)1827 exception_class(uint64_t info)
1828 {
1829 	int type, vector;
1830 
1831 	KASSERT(info & VM_INTINFO_VALID, ("intinfo must be valid: %#lx", info));
1832 	type = info & VM_INTINFO_TYPE;
1833 	vector = info & 0xff;
1834 
1835 	/* Table 6-4, "Interrupt and Exception Classes", Intel SDM, Vol 3 */
1836 	switch (type) {
1837 	case VM_INTINFO_HWINTR:
1838 	case VM_INTINFO_SWINTR:
1839 	case VM_INTINFO_NMI:
1840 		return (EXC_BENIGN);
1841 	default:
1842 		/*
1843 		 * Hardware exception.
1844 		 *
1845 		 * SVM and VT-x use identical type values to represent NMI,
1846 		 * hardware interrupt and software interrupt.
1847 		 *
1848 		 * SVM uses type '3' for all exceptions. VT-x uses type '3'
1849 		 * for exceptions except #BP and #OF. #BP and #OF use a type
1850 		 * value of '5' or '6'. Therefore we don't check for explicit
1851 		 * values of 'type' to classify 'intinfo' into a hardware
1852 		 * exception.
1853 		 */
1854 		break;
1855 	}
1856 
1857 	switch (vector) {
1858 	case IDT_PF:
1859 	case IDT_VE:
1860 		return (EXC_PAGEFAULT);
1861 	case IDT_DE:
1862 	case IDT_TS:
1863 	case IDT_NP:
1864 	case IDT_SS:
1865 	case IDT_GP:
1866 		return (EXC_CONTRIBUTORY);
1867 	default:
1868 		return (EXC_BENIGN);
1869 	}
1870 }
1871 
1872 static int
nested_fault(struct vm * vm,int vcpuid,uint64_t info1,uint64_t info2,uint64_t * retinfo)1873 nested_fault(struct vm *vm, int vcpuid, uint64_t info1, uint64_t info2,
1874     uint64_t *retinfo)
1875 {
1876 	enum exc_class exc1, exc2;
1877 	int type1, vector1;
1878 
1879 	KASSERT(info1 & VM_INTINFO_VALID, ("info1 %#lx is not valid", info1));
1880 	KASSERT(info2 & VM_INTINFO_VALID, ("info2 %#lx is not valid", info2));
1881 
1882 	/*
1883 	 * If an exception occurs while attempting to call the double-fault
1884 	 * handler the processor enters shutdown mode (aka triple fault).
1885 	 */
1886 	type1 = info1 & VM_INTINFO_TYPE;
1887 	vector1 = info1 & 0xff;
1888 	if (type1 == VM_INTINFO_HWEXCEPTION && vector1 == IDT_DF) {
1889 		VCPU_CTR2(vm, vcpuid, "triple fault: info1(%#lx), info2(%#lx)",
1890 		    info1, info2);
1891 		vm_suspend(vm, VM_SUSPEND_TRIPLEFAULT);
1892 		*retinfo = 0;
1893 		return (0);
1894 	}
1895 
1896 	/*
1897 	 * Table 6-5 "Conditions for Generating a Double Fault", Intel SDM, Vol3
1898 	 */
1899 	exc1 = exception_class(info1);
1900 	exc2 = exception_class(info2);
1901 	if ((exc1 == EXC_CONTRIBUTORY && exc2 == EXC_CONTRIBUTORY) ||
1902 	    (exc1 == EXC_PAGEFAULT && exc2 != EXC_BENIGN)) {
1903 		/* Convert nested fault into a double fault. */
1904 		*retinfo = IDT_DF;
1905 		*retinfo |= VM_INTINFO_VALID | VM_INTINFO_HWEXCEPTION;
1906 		*retinfo |= VM_INTINFO_DEL_ERRCODE;
1907 	} else {
1908 		/* Handle exceptions serially */
1909 		*retinfo = info2;
1910 	}
1911 	return (1);
1912 }
1913 
1914 static uint64_t
vcpu_exception_intinfo(struct vcpu * vcpu)1915 vcpu_exception_intinfo(struct vcpu *vcpu)
1916 {
1917 	uint64_t info = 0;
1918 
1919 	if (vcpu->exception_pending) {
1920 		info = vcpu->exc_vector & 0xff;
1921 		info |= VM_INTINFO_VALID | VM_INTINFO_HWEXCEPTION;
1922 		if (vcpu->exc_errcode_valid) {
1923 			info |= VM_INTINFO_DEL_ERRCODE;
1924 			info |= (uint64_t)vcpu->exc_errcode << 32;
1925 		}
1926 	}
1927 	return (info);
1928 }
1929 
1930 int
vm_entry_intinfo(struct vm * vm,int vcpuid,uint64_t * retinfo)1931 vm_entry_intinfo(struct vm *vm, int vcpuid, uint64_t *retinfo)
1932 {
1933 	struct vcpu *vcpu;
1934 	uint64_t info1, info2;
1935 	int valid;
1936 
1937 	KASSERT(vcpuid >= 0 &&
1938 	    vcpuid < vm->maxcpus, ("invalid vcpu %d", vcpuid));
1939 
1940 	vcpu = &vm->vcpu[vcpuid];
1941 
1942 	info1 = vcpu->exitintinfo;
1943 	vcpu->exitintinfo = 0;
1944 
1945 	info2 = 0;
1946 	if (vcpu->exception_pending) {
1947 		info2 = vcpu_exception_intinfo(vcpu);
1948 		vcpu->exception_pending = 0;
1949 		VCPU_CTR2(vm, vcpuid, "Exception %d delivered: %#lx",
1950 		    vcpu->exc_vector, info2);
1951 	}
1952 
1953 	if ((info1 & VM_INTINFO_VALID) && (info2 & VM_INTINFO_VALID)) {
1954 		valid = nested_fault(vm, vcpuid, info1, info2, retinfo);
1955 	} else if (info1 & VM_INTINFO_VALID) {
1956 		*retinfo = info1;
1957 		valid = 1;
1958 	} else if (info2 & VM_INTINFO_VALID) {
1959 		*retinfo = info2;
1960 		valid = 1;
1961 	} else {
1962 		valid = 0;
1963 	}
1964 
1965 	if (valid) {
1966 		VCPU_CTR4(vm, vcpuid, "%s: info1(%#lx), info2(%#lx), "
1967 		    "retinfo(%#lx)", __func__, info1, info2, *retinfo);
1968 	}
1969 
1970 	return (valid);
1971 }
1972 
1973 int
vm_get_intinfo(struct vm * vm,int vcpuid,uint64_t * info1,uint64_t * info2)1974 vm_get_intinfo(struct vm *vm, int vcpuid, uint64_t *info1, uint64_t *info2)
1975 {
1976 	struct vcpu *vcpu;
1977 
1978 	if (vcpuid < 0 || vcpuid >= vm->maxcpus)
1979 		return (EINVAL);
1980 
1981 	vcpu = &vm->vcpu[vcpuid];
1982 	*info1 = vcpu->exitintinfo;
1983 	*info2 = vcpu_exception_intinfo(vcpu);
1984 	return (0);
1985 }
1986 
1987 int
vm_inject_exception(struct vm * vm,int vcpuid,int vector,int errcode_valid,uint32_t errcode,int restart_instruction)1988 vm_inject_exception(struct vm *vm, int vcpuid, int vector, int errcode_valid,
1989     uint32_t errcode, int restart_instruction)
1990 {
1991 	struct vcpu *vcpu;
1992 	uint64_t regval;
1993 	int error;
1994 
1995 	if (vcpuid < 0 || vcpuid >= vm->maxcpus)
1996 		return (EINVAL);
1997 
1998 	if (vector < 0 || vector >= 32)
1999 		return (EINVAL);
2000 
2001 	/*
2002 	 * A double fault exception should never be injected directly into
2003 	 * the guest. It is a derived exception that results from specific
2004 	 * combinations of nested faults.
2005 	 */
2006 	if (vector == IDT_DF)
2007 		return (EINVAL);
2008 
2009 	vcpu = &vm->vcpu[vcpuid];
2010 
2011 	if (vcpu->exception_pending) {
2012 		VCPU_CTR2(vm, vcpuid, "Unable to inject exception %d due to "
2013 		    "pending exception %d", vector, vcpu->exc_vector);
2014 		return (EBUSY);
2015 	}
2016 
2017 	if (errcode_valid) {
2018 		/*
2019 		 * Exceptions don't deliver an error code in real mode.
2020 		 */
2021 		error = vm_get_register(vm, vcpuid, VM_REG_GUEST_CR0, &regval);
2022 		KASSERT(!error, ("%s: error %d getting CR0", __func__, error));
2023 		if (!(regval & CR0_PE))
2024 			errcode_valid = 0;
2025 	}
2026 
2027 	/*
2028 	 * From section 26.6.1 "Interruptibility State" in Intel SDM:
2029 	 *
2030 	 * Event blocking by "STI" or "MOV SS" is cleared after guest executes
2031 	 * one instruction or incurs an exception.
2032 	 */
2033 	error = vm_set_register(vm, vcpuid, VM_REG_GUEST_INTR_SHADOW, 0);
2034 	KASSERT(error == 0, ("%s: error %d clearing interrupt shadow",
2035 	    __func__, error));
2036 
2037 	if (restart_instruction)
2038 		vm_restart_instruction(vm, vcpuid);
2039 
2040 	vcpu->exception_pending = 1;
2041 	vcpu->exc_vector = vector;
2042 	vcpu->exc_errcode = errcode;
2043 	vcpu->exc_errcode_valid = errcode_valid;
2044 	VCPU_CTR1(vm, vcpuid, "Exception %d pending", vector);
2045 	return (0);
2046 }
2047 
/*
 * Inject exception 'vector' into vcpu 'vcpuid' of the VM passed as
 * 'vmarg', always requesting that the current instruction be restarted.
 * Failure of vm_inject_exception() is caught by an assertion only.
 */
void
vm_inject_fault(void *vmarg, int vcpuid, int vector, int errcode_valid,
    int errcode)
{
	struct vm *vm;
	int error;

	vm = vmarg;

	error = vm_inject_exception(vm, vcpuid, vector, errcode_valid,
	    errcode, 1);
	KASSERT(error == 0, ("vm_inject_exception error %d", error));
}
2062 
2063 void
vm_inject_pf(void * vmarg,int vcpuid,int error_code,uint64_t cr2)2064 vm_inject_pf(void *vmarg, int vcpuid, int error_code, uint64_t cr2)
2065 {
2066 	struct vm *vm;
2067 	int error;
2068 
2069 	vm = vmarg;
2070 	VCPU_CTR2(vm, vcpuid, "Injecting page fault: error_code %#x, cr2 %#lx",
2071 	    error_code, cr2);
2072 
2073 	error = vm_set_register(vm, vcpuid, VM_REG_GUEST_CR2, cr2);
2074 	KASSERT(error == 0, ("vm_set_register(cr2) error %d", error));
2075 
2076 	vm_inject_fault(vm, vcpuid, IDT_PF, 1, error_code);
2077 }
2078 
2079 static VMM_STAT(VCPU_NMI_COUNT, "number of NMIs delivered to vcpu");
2080 
2081 int
vm_inject_nmi(struct vm * vm,int vcpuid)2082 vm_inject_nmi(struct vm *vm, int vcpuid)
2083 {
2084 	struct vcpu *vcpu;
2085 
2086 	if (vcpuid < 0 || vcpuid >= vm->maxcpus)
2087 		return (EINVAL);
2088 
2089 	vcpu = &vm->vcpu[vcpuid];
2090 
2091 	vcpu->nmi_pending = 1;
2092 	vcpu_notify_event(vm, vcpuid, false);
2093 	return (0);
2094 }
2095 
2096 int
vm_nmi_pending(struct vm * vm,int vcpuid)2097 vm_nmi_pending(struct vm *vm, int vcpuid)
2098 {
2099 	struct vcpu *vcpu;
2100 
2101 	if (vcpuid < 0 || vcpuid >= vm->maxcpus)
2102 		panic("vm_nmi_pending: invalid vcpuid %d", vcpuid);
2103 
2104 	vcpu = &vm->vcpu[vcpuid];
2105 
2106 	return (vcpu->nmi_pending);
2107 }
2108 
2109 void
vm_nmi_clear(struct vm * vm,int vcpuid)2110 vm_nmi_clear(struct vm *vm, int vcpuid)
2111 {
2112 	struct vcpu *vcpu;
2113 
2114 	if (vcpuid < 0 || vcpuid >= vm->maxcpus)
2115 		panic("vm_nmi_pending: invalid vcpuid %d", vcpuid);
2116 
2117 	vcpu = &vm->vcpu[vcpuid];
2118 
2119 	if (vcpu->nmi_pending == 0)
2120 		panic("vm_nmi_clear: inconsistent nmi_pending state");
2121 
2122 	vcpu->nmi_pending = 0;
2123 	vmm_stat_incr(vm, vcpuid, VCPU_NMI_COUNT, 1);
2124 }
2125 
2126 static VMM_STAT(VCPU_EXTINT_COUNT, "number of ExtINTs delivered to vcpu");
2127 
2128 int
vm_inject_extint(struct vm * vm,int vcpuid)2129 vm_inject_extint(struct vm *vm, int vcpuid)
2130 {
2131 	struct vcpu *vcpu;
2132 
2133 	if (vcpuid < 0 || vcpuid >= vm->maxcpus)
2134 		return (EINVAL);
2135 
2136 	vcpu = &vm->vcpu[vcpuid];
2137 
2138 	vcpu->extint_pending = 1;
2139 	vcpu_notify_event(vm, vcpuid, false);
2140 	return (0);
2141 }
2142 
2143 int
vm_extint_pending(struct vm * vm,int vcpuid)2144 vm_extint_pending(struct vm *vm, int vcpuid)
2145 {
2146 	struct vcpu *vcpu;
2147 
2148 	if (vcpuid < 0 || vcpuid >= vm->maxcpus)
2149 		panic("vm_extint_pending: invalid vcpuid %d", vcpuid);
2150 
2151 	vcpu = &vm->vcpu[vcpuid];
2152 
2153 	return (vcpu->extint_pending);
2154 }
2155 
2156 void
vm_extint_clear(struct vm * vm,int vcpuid)2157 vm_extint_clear(struct vm *vm, int vcpuid)
2158 {
2159 	struct vcpu *vcpu;
2160 
2161 	if (vcpuid < 0 || vcpuid >= vm->maxcpus)
2162 		panic("vm_extint_pending: invalid vcpuid %d", vcpuid);
2163 
2164 	vcpu = &vm->vcpu[vcpuid];
2165 
2166 	if (vcpu->extint_pending == 0)
2167 		panic("vm_extint_clear: inconsistent extint_pending state");
2168 
2169 	vcpu->extint_pending = 0;
2170 	vmm_stat_incr(vm, vcpuid, VCPU_EXTINT_COUNT, 1);
2171 }
2172 
2173 int
vm_get_capability(struct vm * vm,int vcpu,int type,int * retval)2174 vm_get_capability(struct vm *vm, int vcpu, int type, int *retval)
2175 {
2176 	if (vcpu < 0 || vcpu >= vm->maxcpus)
2177 		return (EINVAL);
2178 
2179 	if (type < 0 || type >= VM_CAP_MAX)
2180 		return (EINVAL);
2181 
2182 	return (VMGETCAP(vm->cookie, vcpu, type, retval));
2183 }
2184 
2185 int
vm_set_capability(struct vm * vm,int vcpu,int type,int val)2186 vm_set_capability(struct vm *vm, int vcpu, int type, int val)
2187 {
2188 	if (vcpu < 0 || vcpu >= vm->maxcpus)
2189 		return (EINVAL);
2190 
2191 	if (type < 0 || type >= VM_CAP_MAX)
2192 		return (EINVAL);
2193 
2194 	return (VMSETCAP(vm->cookie, vcpu, type, val));
2195 }
2196 
2197 struct vlapic *
vm_lapic(struct vm * vm,int cpu)2198 vm_lapic(struct vm *vm, int cpu)
2199 {
2200 	return (vm->vcpu[cpu].vlapic);
2201 }
2202 
2203 struct vioapic *
vm_ioapic(struct vm * vm)2204 vm_ioapic(struct vm *vm)
2205 {
2206 
2207 	return (vm->vioapic);
2208 }
2209 
2210 struct vhpet *
vm_hpet(struct vm * vm)2211 vm_hpet(struct vm *vm)
2212 {
2213 
2214 	return (vm->vhpet);
2215 }
2216 
/*
 * Report whether the PCI device at 'bus'/'slot'/'func' is listed as a
 * passthru device in the kernel environment.
 *
 * Each variable holds a space-separated list of "bus/slot/func" triples,
 * e.g.:
 *	set pptdevs="1/2/3 4/5/6 7/8/9 10/11/12"
 *
 * XXX
 * The length of an environment variable is limited to 128 bytes which
 * puts an upper limit on the number of passthru devices that may be
 * specified using a single environment variable.  Work around this by
 * scanning multiple environment variable names instead of a single
 * one - yuck!
 */
bool
vmm_is_pptdev(int bus, int slot, int func)
{
	const char *names[] = { "pptdevs", "pptdevs2", "pptdevs3", NULL };
	char *env, *tok, *sep;
	int b, s, f, idx;
	bool match;

	match = false;
	idx = 0;
	while (!match && names[idx] != NULL) {
		env = kern_getenv(names[idx]);
		for (tok = env; tok != NULL && *tok != '\0'; tok = sep) {
			/* Temporarily terminate the current triple. */
			sep = strchr(tok, ' ');
			if (sep != NULL)
				*sep = '\0';

			if (sscanf(tok, "%d/%d/%d", &b, &s, &f) == 3 &&
			    b == bus && s == slot && f == func) {
				match = true;
				break;
			}

			/* Restore the separator and step past it. */
			if (sep != NULL)
				*sep++ = ' ';
		}
		freeenv(env);
		idx++;
	}
	return (match);
}
2258 
2259 void *
vm_iommu_domain(struct vm * vm)2260 vm_iommu_domain(struct vm *vm)
2261 {
2262 
2263 	return (vm->iommu);
2264 }
2265 
2266 int
vcpu_set_state(struct vm * vm,int vcpuid,enum vcpu_state newstate,bool from_idle)2267 vcpu_set_state(struct vm *vm, int vcpuid, enum vcpu_state newstate,
2268     bool from_idle)
2269 {
2270 	int error;
2271 	struct vcpu *vcpu;
2272 
2273 	if (vcpuid < 0 || vcpuid >= vm->maxcpus)
2274 		panic("vm_set_run_state: invalid vcpuid %d", vcpuid);
2275 
2276 	vcpu = &vm->vcpu[vcpuid];
2277 
2278 	vcpu_lock(vcpu);
2279 	error = vcpu_set_state_locked(vm, vcpuid, newstate, from_idle);
2280 	vcpu_unlock(vcpu);
2281 
2282 	return (error);
2283 }
2284 
2285 enum vcpu_state
vcpu_get_state(struct vm * vm,int vcpuid,int * hostcpu)2286 vcpu_get_state(struct vm *vm, int vcpuid, int *hostcpu)
2287 {
2288 	struct vcpu *vcpu;
2289 	enum vcpu_state state;
2290 
2291 	if (vcpuid < 0 || vcpuid >= vm->maxcpus)
2292 		panic("vm_get_run_state: invalid vcpuid %d", vcpuid);
2293 
2294 	vcpu = &vm->vcpu[vcpuid];
2295 
2296 	vcpu_lock(vcpu);
2297 	state = vcpu->state;
2298 	if (hostcpu != NULL)
2299 		*hostcpu = vcpu->hostcpu;
2300 	vcpu_unlock(vcpu);
2301 
2302 	return (state);
2303 }
2304 
2305 int
vm_activate_cpu(struct vm * vm,int vcpuid)2306 vm_activate_cpu(struct vm *vm, int vcpuid)
2307 {
2308 
2309 	if (vcpuid < 0 || vcpuid >= vm->maxcpus)
2310 		return (EINVAL);
2311 
2312 	if (CPU_ISSET(vcpuid, &vm->active_cpus))
2313 		return (EBUSY);
2314 
2315 	VCPU_CTR0(vm, vcpuid, "activated");
2316 	CPU_SET_ATOMIC(vcpuid, &vm->active_cpus);
2317 	return (0);
2318 }
2319 
2320 cpuset_t
vm_active_cpus(struct vm * vm)2321 vm_active_cpus(struct vm *vm)
2322 {
2323 
2324 	return (vm->active_cpus);
2325 }
2326 
2327 cpuset_t
vm_suspended_cpus(struct vm * vm)2328 vm_suspended_cpus(struct vm *vm)
2329 {
2330 
2331 	return (vm->suspended_cpus);
2332 }
2333 
2334 void *
vcpu_stats(struct vm * vm,int vcpuid)2335 vcpu_stats(struct vm *vm, int vcpuid)
2336 {
2337 
2338 	return (vm->vcpu[vcpuid].stats);
2339 }
2340 
2341 int
vm_get_x2apic_state(struct vm * vm,int vcpuid,enum x2apic_state * state)2342 vm_get_x2apic_state(struct vm *vm, int vcpuid, enum x2apic_state *state)
2343 {
2344 	if (vcpuid < 0 || vcpuid >= vm->maxcpus)
2345 		return (EINVAL);
2346 
2347 	*state = vm->vcpu[vcpuid].x2apic_state;
2348 
2349 	return (0);
2350 }
2351 
2352 int
vm_set_x2apic_state(struct vm * vm,int vcpuid,enum x2apic_state state)2353 vm_set_x2apic_state(struct vm *vm, int vcpuid, enum x2apic_state state)
2354 {
2355 	if (vcpuid < 0 || vcpuid >= vm->maxcpus)
2356 		return (EINVAL);
2357 
2358 	if (state >= X2APIC_STATE_LAST)
2359 		return (EINVAL);
2360 
2361 	vm->vcpu[vcpuid].x2apic_state = state;
2362 
2363 	vlapic_set_x2apic_state(vm, vcpuid, state);
2364 
2365 	return (0);
2366 }
2367 
2368 /*
2369  * This function is called to ensure that a vcpu "sees" a pending event
2370  * as soon as possible:
2371  * - If the vcpu thread is sleeping then it is woken up.
2372  * - If the vcpu is running on a different host_cpu then an IPI will be directed
2373  *   to the host_cpu to cause the vcpu to trap into the hypervisor.
2374  */
2375 static void
vcpu_notify_event_locked(struct vcpu * vcpu,bool lapic_intr)2376 vcpu_notify_event_locked(struct vcpu *vcpu, bool lapic_intr)
2377 {
2378 	int hostcpu;
2379 
2380 	hostcpu = vcpu->hostcpu;
2381 	if (vcpu->state == VCPU_RUNNING) {
2382 		KASSERT(hostcpu != NOCPU, ("vcpu running on invalid hostcpu"));
2383 		if (hostcpu != curcpu) {
2384 			if (lapic_intr) {
2385 				vlapic_post_intr(vcpu->vlapic, hostcpu,
2386 				    vmm_ipinum);
2387 			} else {
2388 				ipi_cpu(hostcpu, vmm_ipinum);
2389 			}
2390 		} else {
2391 			/*
2392 			 * If the 'vcpu' is running on 'curcpu' then it must
2393 			 * be sending a notification to itself (e.g. SELF_IPI).
2394 			 * The pending event will be picked up when the vcpu
2395 			 * transitions back to guest context.
2396 			 */
2397 		}
2398 	} else {
2399 		KASSERT(hostcpu == NOCPU, ("vcpu state %d not consistent "
2400 		    "with hostcpu %d", vcpu->state, hostcpu));
2401 		if (vcpu->state == VCPU_SLEEPING)
2402 			wakeup_one(vcpu);
2403 	}
2404 }
2405 
2406 void
vcpu_notify_event(struct vm * vm,int vcpuid,bool lapic_intr)2407 vcpu_notify_event(struct vm *vm, int vcpuid, bool lapic_intr)
2408 {
2409 	struct vcpu *vcpu = &vm->vcpu[vcpuid];
2410 
2411 	vcpu_lock(vcpu);
2412 	vcpu_notify_event_locked(vcpu, lapic_intr);
2413 	vcpu_unlock(vcpu);
2414 }
2415 
2416 struct vmspace *
vm_get_vmspace(struct vm * vm)2417 vm_get_vmspace(struct vm *vm)
2418 {
2419 
2420 	return (vm->vmspace);
2421 }
2422 
/*
 * Translate an APIC id into a vcpu id.
 *
 * XXX the apic id is assumed to be numerically identical to the vcpu id,
 * so 'apicid' is returned unchanged and 'vm' is not consulted.
 */
int
vm_apicid2vcpuid(struct vm *vm, int apicid)
{
	int vcpuid;

	vcpuid = apicid;
	return (vcpuid);
}
2431 
2432 void
vm_smp_rendezvous(struct vm * vm,int vcpuid,cpuset_t dest,vm_rendezvous_func_t func,void * arg)2433 vm_smp_rendezvous(struct vm *vm, int vcpuid, cpuset_t dest,
2434     vm_rendezvous_func_t func, void *arg)
2435 {
2436 	int i;
2437 
2438 	/*
2439 	 * Enforce that this function is called without any locks
2440 	 */
2441 	WITNESS_WARN(WARN_PANIC, NULL, "vm_smp_rendezvous");
2442 	KASSERT(vcpuid == -1 || (vcpuid >= 0 && vcpuid < vm->maxcpus),
2443 	    ("vm_smp_rendezvous: invalid vcpuid %d", vcpuid));
2444 
2445 restart:
2446 	mtx_lock(&vm->rendezvous_mtx);
2447 	if (vm->rendezvous_func != NULL) {
2448 		/*
2449 		 * If a rendezvous is already in progress then we need to
2450 		 * call the rendezvous handler in case this 'vcpuid' is one
2451 		 * of the targets of the rendezvous.
2452 		 */
2453 		RENDEZVOUS_CTR0(vm, vcpuid, "Rendezvous already in progress");
2454 		mtx_unlock(&vm->rendezvous_mtx);
2455 		vm_handle_rendezvous(vm, vcpuid);
2456 		goto restart;
2457 	}
2458 	KASSERT(vm->rendezvous_func == NULL, ("vm_smp_rendezvous: previous "
2459 	    "rendezvous is still in progress"));
2460 
2461 	RENDEZVOUS_CTR0(vm, vcpuid, "Initiating rendezvous");
2462 	vm->rendezvous_req_cpus = dest;
2463 	CPU_ZERO(&vm->rendezvous_done_cpus);
2464 	vm->rendezvous_arg = arg;
2465 	vm_set_rendezvous_func(vm, func);
2466 	mtx_unlock(&vm->rendezvous_mtx);
2467 
2468 	/*
2469 	 * Wake up any sleeping vcpus and trigger a VM-exit in any running
2470 	 * vcpus so they handle the rendezvous as soon as possible.
2471 	 */
2472 	for (i = 0; i < vm->maxcpus; i++) {
2473 		if (CPU_ISSET(i, &dest))
2474 			vcpu_notify_event(vm, i, false);
2475 	}
2476 
2477 	vm_handle_rendezvous(vm, vcpuid);
2478 }
2479 
2480 struct vatpic *
vm_atpic(struct vm * vm)2481 vm_atpic(struct vm *vm)
2482 {
2483 	return (vm->vatpic);
2484 }
2485 
2486 struct vatpit *
vm_atpit(struct vm * vm)2487 vm_atpit(struct vm *vm)
2488 {
2489 	return (vm->vatpit);
2490 }
2491 
2492 struct vpmtmr *
vm_pmtmr(struct vm * vm)2493 vm_pmtmr(struct vm *vm)
2494 {
2495 
2496 	return (vm->vpmtmr);
2497 }
2498 
2499 struct vrtc *
vm_rtc(struct vm * vm)2500 vm_rtc(struct vm *vm)
2501 {
2502 
2503 	return (vm->vrtc);
2504 }
2505 
2506 enum vm_reg_name
vm_segment_name(int seg)2507 vm_segment_name(int seg)
2508 {
2509 	static enum vm_reg_name seg_names[] = {
2510 		VM_REG_GUEST_ES,
2511 		VM_REG_GUEST_CS,
2512 		VM_REG_GUEST_SS,
2513 		VM_REG_GUEST_DS,
2514 		VM_REG_GUEST_FS,
2515 		VM_REG_GUEST_GS
2516 	};
2517 
2518 	KASSERT(seg >= 0 && seg < nitems(seg_names),
2519 	    ("%s: invalid segment encoding %d", __func__, seg));
2520 	return (seg_names[seg]);
2521 }
2522 
2523 void
vm_copy_teardown(struct vm * vm,int vcpuid,struct vm_copyinfo * copyinfo,int num_copyinfo)2524 vm_copy_teardown(struct vm *vm, int vcpuid, struct vm_copyinfo *copyinfo,
2525     int num_copyinfo)
2526 {
2527 	int idx;
2528 
2529 	for (idx = 0; idx < num_copyinfo; idx++) {
2530 		if (copyinfo[idx].cookie != NULL)
2531 			vm_gpa_release(copyinfo[idx].cookie);
2532 	}
2533 	bzero(copyinfo, num_copyinfo * sizeof(struct vm_copyinfo));
2534 }
2535 
2536 int
vm_copy_setup(struct vm * vm,int vcpuid,struct vm_guest_paging * paging,uint64_t gla,size_t len,int prot,struct vm_copyinfo * copyinfo,int num_copyinfo,int * fault)2537 vm_copy_setup(struct vm *vm, int vcpuid, struct vm_guest_paging *paging,
2538     uint64_t gla, size_t len, int prot, struct vm_copyinfo *copyinfo,
2539     int num_copyinfo, int *fault)
2540 {
2541 	int error, idx, nused;
2542 	size_t n, off, remaining;
2543 	void *hva, *cookie;
2544 	uint64_t gpa;
2545 
2546 	bzero(copyinfo, sizeof(struct vm_copyinfo) * num_copyinfo);
2547 
2548 	nused = 0;
2549 	remaining = len;
2550 	while (remaining > 0) {
2551 		KASSERT(nused < num_copyinfo, ("insufficient vm_copyinfo"));
2552 		error = vm_gla2gpa(vm, vcpuid, paging, gla, prot, &gpa, fault);
2553 		if (error || *fault)
2554 			return (error);
2555 		off = gpa & PAGE_MASK;
2556 		n = min(remaining, PAGE_SIZE - off);
2557 		copyinfo[nused].gpa = gpa;
2558 		copyinfo[nused].len = n;
2559 		remaining -= n;
2560 		gla += n;
2561 		nused++;
2562 	}
2563 
2564 	for (idx = 0; idx < nused; idx++) {
2565 		hva = vm_gpa_hold(vm, vcpuid, copyinfo[idx].gpa,
2566 		    copyinfo[idx].len, prot, &cookie);
2567 		if (hva == NULL)
2568 			break;
2569 		copyinfo[idx].hva = hva;
2570 		copyinfo[idx].cookie = cookie;
2571 	}
2572 
2573 	if (idx != nused) {
2574 		vm_copy_teardown(vm, vcpuid, copyinfo, num_copyinfo);
2575 		return (EFAULT);
2576 	} else {
2577 		*fault = 0;
2578 		return (0);
2579 	}
2580 }
2581 
2582 void
vm_copyin(struct vm * vm,int vcpuid,struct vm_copyinfo * copyinfo,void * kaddr,size_t len)2583 vm_copyin(struct vm *vm, int vcpuid, struct vm_copyinfo *copyinfo, void *kaddr,
2584     size_t len)
2585 {
2586 	char *dst;
2587 	int idx;
2588 
2589 	dst = kaddr;
2590 	idx = 0;
2591 	while (len > 0) {
2592 		bcopy(copyinfo[idx].hva, dst, copyinfo[idx].len);
2593 		len -= copyinfo[idx].len;
2594 		dst += copyinfo[idx].len;
2595 		idx++;
2596 	}
2597 }
2598 
2599 void
vm_copyout(struct vm * vm,int vcpuid,const void * kaddr,struct vm_copyinfo * copyinfo,size_t len)2600 vm_copyout(struct vm *vm, int vcpuid, const void *kaddr,
2601     struct vm_copyinfo *copyinfo, size_t len)
2602 {
2603 	const char *src;
2604 	int idx;
2605 
2606 	src = kaddr;
2607 	idx = 0;
2608 	while (len > 0) {
2609 		bcopy(src, copyinfo[idx].hva, copyinfo[idx].len);
2610 		len -= copyinfo[idx].len;
2611 		src += copyinfo[idx].len;
2612 		idx++;
2613 	}
2614 }
2615 
/*
 * Return the amount of in-use and wired memory for the VM. Since
 * these are global stats, only return the values for vCPU 0.
 */
2620 VMM_STAT_DECLARE(VMM_MEM_RESIDENT);
2621 VMM_STAT_DECLARE(VMM_MEM_WIRED);
2622 
2623 static void
vm_get_rescnt(struct vm * vm,int vcpu,struct vmm_stat_type * stat)2624 vm_get_rescnt(struct vm *vm, int vcpu, struct vmm_stat_type *stat)
2625 {
2626 
2627 	if (vcpu == 0) {
2628 		vmm_stat_set(vm, vcpu, VMM_MEM_RESIDENT,
2629 	       	    PAGE_SIZE * vmspace_resident_count(vm->vmspace));
2630 	}
2631 }
2632 
2633 static void
vm_get_wiredcnt(struct vm * vm,int vcpu,struct vmm_stat_type * stat)2634 vm_get_wiredcnt(struct vm *vm, int vcpu, struct vmm_stat_type *stat)
2635 {
2636 
2637 	if (vcpu == 0) {
2638 		vmm_stat_set(vm, vcpu, VMM_MEM_WIRED,
2639 	      	    PAGE_SIZE * pmap_wired_count(vmspace_pmap(vm->vmspace)));
2640 	}
2641 }
2642 
2643 VMM_STAT_FUNC(VMM_MEM_RESIDENT, "Resident memory", vm_get_rescnt);
2644 VMM_STAT_FUNC(VMM_MEM_WIRED, "Wired memory", vm_get_wiredcnt);
2645