xref: /trueos/usr.sbin/bhyve/task_switch.c (revision fd9c7ff5320b460878bd4f9a4264a2efbf79bc6c)
1 /*-
2  * Copyright (c) 2014 Neel Natu <neel@freebsd.org>
3  * All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice, this list of conditions and the following disclaimer.
10  * 2. Redistributions in binary form must reproduce the above copyright
11  *    notice, this list of conditions and the following disclaimer in the
12  *    documentation and/or other materials provided with the distribution.
13  *
14  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND
15  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
18  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
24  * SUCH DAMAGE.
25  */
26 
27 #include <sys/cdefs.h>
28 __FBSDID("$FreeBSD$");
29 
30 #include <sys/param.h>
31 #include <sys/_iovec.h>
32 #include <sys/mman.h>
33 
34 #include <x86/psl.h>
35 #include <x86/segments.h>
36 #include <x86/specialreg.h>
37 #include <machine/vmm.h>
38 #include <machine/vmm_instruction_emul.h>
39 
40 #include <stdbool.h>
41 #include <stdio.h>
42 #include <stdlib.h>
43 #include <assert.h>
44 #include <errno.h>
45 
46 #include <vmmapi.h>
47 
48 #include "bhyverun.h"
49 
/*
 * Layout of a 32-bit Task State Segment as it is copied to and from guest
 * memory during task switch emulation.
 *
 * Using 'struct i386tss' is tempting but causes myriad sign extension
 * issues because all of its fields are defined as signed integers.
 *
 * The 'rsvdN' members only pad each 16-bit field out to 32 bits; they are
 * never interpreted by the emulation code in this file.
 */
struct tss32 {
	uint16_t	tss_link;	/* selector of the previous (nesting) task */
	uint16_t	rsvd1;
	uint32_t	tss_esp0;
	uint16_t	tss_ss0;
	uint16_t	rsvd2;
	uint32_t	tss_esp1;
	uint16_t	tss_ss1;
	uint16_t	rsvd3;
	uint32_t	tss_esp2;
	uint16_t	tss_ss2;
	uint16_t	rsvd4;
	uint32_t	tss_cr3;
	uint32_t	tss_eip;
	uint32_t	tss_eflags;
	uint32_t	tss_eax;
	uint32_t	tss_ecx;
	uint32_t	tss_edx;
	uint32_t	tss_ebx;
	uint32_t	tss_esp;
	uint32_t	tss_ebp;
	uint32_t	tss_esi;
	uint32_t	tss_edi;
	uint16_t	tss_es;
	uint16_t	rsvd5;
	uint16_t	tss_cs;
	uint16_t	rsvd6;
	uint16_t	tss_ss;
	uint16_t	rsvd7;
	uint16_t	tss_ds;
	uint16_t	rsvd8;
	uint16_t	tss_fs;
	uint16_t	rsvd9;
	uint16_t	tss_gs;
	uint16_t	rsvd10;
	uint16_t	tss_ldt;
	uint16_t	rsvd11;
	uint16_t	tss_trap;
	uint16_t	tss_iomap;
};
94 CTASSERT(sizeof(struct tss32) == 104);
95 
/*
 * Selector helpers. The low three bits of a selector (RPL and TI) are not
 * part of the descriptor table offset, and each descriptor is 8 bytes.
 *
 * SEL_START: offset of the first byte of the descriptor slot.
 * SEL_LIMIT: offset of the last byte of the descriptor slot.
 * TSS_BUSY:  true if bit 1 of a TSS descriptor type is set.
 */
#define	SEL_START(sel)	((sel) & ~0x7)
#define	SEL_LIMIT(sel)	((sel) | 0x7)
#define	TSS_BUSY(type)	(((type) & 0x2) != 0)
99 
/*
 * Return the current value of guest register 'reg' on 'vcpu'.
 *
 * A failure of vm_get_register() is treated as a programming error and
 * trips an assertion.
 */
static uint64_t
GETREG(struct vmctx *ctx, int vcpu, int reg)
{
	uint64_t val;
	int error;

	error = vm_get_register(ctx, vcpu, reg, &val);
	assert(error == 0);
	return (val);
}
110 
/*
 * Set guest register 'reg' on 'vcpu' to 'val'.
 *
 * A failure of vm_set_register() is treated as a programming error and
 * trips an assertion.
 */
static void
SETREG(struct vmctx *ctx, int vcpu, int reg, uint64_t val)
{
	int error;

	error = vm_set_register(ctx, vcpu, reg, val);
	assert(error == 0);
}
119 
120 static struct seg_desc
usd_to_seg_desc(struct user_segment_descriptor * usd)121 usd_to_seg_desc(struct user_segment_descriptor *usd)
122 {
123 	struct seg_desc seg_desc;
124 
125 	seg_desc.base = (u_int)USD_GETBASE(usd);
126 	if (usd->sd_gran)
127 		seg_desc.limit = (u_int)(USD_GETLIMIT(usd) << 12) | 0xfff;
128 	else
129 		seg_desc.limit = (u_int)USD_GETLIMIT(usd);
130 	seg_desc.access = usd->sd_type | usd->sd_dpl << 5 | usd->sd_p << 7;
131 	seg_desc.access |= usd->sd_xx << 12;
132 	seg_desc.access |= usd->sd_def32 << 14;
133 	seg_desc.access |= usd->sd_gran << 15;
134 
135 	return (seg_desc);
136 }
137 
/*
 * Inject an exception with an error code that is a segment selector.
 * The format of the error code is described in section 6.13, "Error Code",
 * Intel SDM volume 3.
 *
 * Bit 0 (EXT) denotes whether the exception occurred during delivery
 * of an external event like an interrupt.
 *
 * Bit 1 (IDT) indicates whether the selector points to a gate descriptor
 * in the IDT.
 *
 * Bit 2 (GDT/LDT) has the usual interpretation of Table Indicator (TI).
 */
static void
sel_exception(struct vmctx *ctx, int vcpu, int vector, uint16_t sel, int ext)
{
	/*
	 * Bit 2 from the selector is retained as-is in the error code.
	 *
	 * Bit 1 can be safely cleared because none of the selectors
	 * encountered during task switch emulation refer to a task
	 * gate in the IDT.
	 *
	 * Bit 0 is set depending on the value of 'ext'.
	 */
	sel &= ~0x3;
	if (ext)
		sel |= 0x1;
	vm_inject_fault(ctx, vcpu, vector, 1, sel);
}
168 
169 /*
170  * Return 0 if the selector 'sel' in within the limits of the GDT/LDT
171  * and non-zero otherwise.
172  */
173 static int
desc_table_limit_check(struct vmctx * ctx,int vcpu,uint16_t sel)174 desc_table_limit_check(struct vmctx *ctx, int vcpu, uint16_t sel)
175 {
176 	uint64_t base;
177 	uint32_t limit, access;
178 	int error, reg;
179 
180 	reg = ISLDT(sel) ? VM_REG_GUEST_LDTR : VM_REG_GUEST_GDTR;
181 	error = vm_get_desc(ctx, vcpu, reg, &base, &limit, &access);
182 	assert(error == 0);
183 
184 	if (reg == VM_REG_GUEST_LDTR) {
185 		if (SEG_DESC_UNUSABLE(access) || !SEG_DESC_PRESENT(access))
186 			return (-1);
187 	}
188 
189 	if (limit < SEL_LIMIT(sel))
190 		return (-1);
191 	else
192 		return (0);
193 }
194 
195 /*
196  * Read/write the segment descriptor 'desc' into the GDT/LDT slot referenced
197  * by the selector 'sel'.
198  *
199  * Returns 0 on success.
200  * Returns 1 if an exception was injected into the guest.
201  * Returns -1 otherwise.
202  */
203 static int
desc_table_rw(struct vmctx * ctx,int vcpu,struct vm_guest_paging * paging,uint16_t sel,struct user_segment_descriptor * desc,bool doread)204 desc_table_rw(struct vmctx *ctx, int vcpu, struct vm_guest_paging *paging,
205     uint16_t sel, struct user_segment_descriptor *desc, bool doread)
206 {
207 	struct iovec iov[2];
208 	uint64_t base;
209 	uint32_t limit, access;
210 	int error, reg;
211 
212 	reg = ISLDT(sel) ? VM_REG_GUEST_LDTR : VM_REG_GUEST_GDTR;
213 	error = vm_get_desc(ctx, vcpu, reg, &base, &limit, &access);
214 	assert(error == 0);
215 	assert(limit >= SEL_LIMIT(sel));
216 
217 	error = vm_copy_setup(ctx, vcpu, paging, base + SEL_START(sel),
218 	    sizeof(*desc), doread ? PROT_READ : PROT_WRITE, iov, nitems(iov));
219 	if (error == 0) {
220 		if (doread)
221 			vm_copyin(ctx, vcpu, iov, desc, sizeof(*desc));
222 		else
223 			vm_copyout(ctx, vcpu, desc, iov, sizeof(*desc));
224 	}
225 	return (error);
226 }
227 
/*
 * Read the descriptor referenced by 'sel' from the GDT/LDT into 'desc'.
 * Return values are those of desc_table_rw().
 */
static int
desc_table_read(struct vmctx *ctx, int vcpu, struct vm_guest_paging *paging,
    uint16_t sel, struct user_segment_descriptor *desc)
{
	return (desc_table_rw(ctx, vcpu, paging, sel, desc, true));
}
234 
/*
 * Write 'desc' into the GDT/LDT slot referenced by 'sel'.
 * Return values are those of desc_table_rw().
 */
static int
desc_table_write(struct vmctx *ctx, int vcpu, struct vm_guest_paging *paging,
    uint16_t sel, struct user_segment_descriptor *desc)
{
	return (desc_table_rw(ctx, vcpu, paging, sel, desc, false));
}
241 
242 /*
243  * Read the TSS descriptor referenced by 'sel' into 'desc'.
244  *
245  * Returns 0 on success.
246  * Returns 1 if an exception was injected into the guest.
247  * Returns -1 otherwise.
248  */
249 static int
read_tss_descriptor(struct vmctx * ctx,int vcpu,struct vm_task_switch * ts,uint16_t sel,struct user_segment_descriptor * desc)250 read_tss_descriptor(struct vmctx *ctx, int vcpu, struct vm_task_switch *ts,
251     uint16_t sel, struct user_segment_descriptor *desc)
252 {
253 	struct vm_guest_paging sup_paging;
254 	int error;
255 
256 	assert(!ISLDT(sel));
257 	assert(IDXSEL(sel) != 0);
258 
259 	/* Fetch the new TSS descriptor */
260 	if (desc_table_limit_check(ctx, vcpu, sel)) {
261 		if (ts->reason == TSR_IRET)
262 			sel_exception(ctx, vcpu, IDT_TS, sel, ts->ext);
263 		else
264 			sel_exception(ctx, vcpu, IDT_GP, sel, ts->ext);
265 		return (1);
266 	}
267 
268 	sup_paging = ts->paging;
269 	sup_paging.cpl = 0;		/* implicit supervisor mode */
270 	error = desc_table_read(ctx, vcpu, &sup_paging, sel, desc);
271 	return (error);
272 }
273 
/*
 * Return true if 'sd_type' denotes a code segment descriptor, i.e. both
 * bit 4 and bit 3 of the type are set.
 */
static bool
code_desc(int sd_type)
{
	/* code descriptor */
	return ((sd_type & 0x18) == 0x18);
}
280 
/*
 * Return true if 'sd_type' denotes a descriptor that may be loaded into
 * %ss: bit 4 set, bit 3 clear and bit 1 set.
 */
static bool
stack_desc(int sd_type)
{
	/* writable data descriptor */
	return ((sd_type & 0x1A) == 0x12);
}
287 
/*
 * Return true if 'sd_type' denotes a descriptor that may be loaded into a
 * data segment register: either a data descriptor (bit 4 set, bit 3 clear)
 * or a code descriptor with bit 1 set.
 */
static bool
data_desc(int sd_type)
{
	/* data descriptor or a readable code descriptor */
	return ((sd_type & 0x18) == 0x10 || (sd_type & 0x1A) == 0x1A);
}
294 
295 static bool
ldt_desc(int sd_type)296 ldt_desc(int sd_type)
297 {
298 
299 	return (sd_type == SDT_SYSLDT);
300 }
301 
302 /*
303  * Validate the descriptor 'seg_desc' associated with 'segment'.
304  *
305  * Returns 0 on success.
306  * Returns 1 if an exception was injected into the guest.
307  * Returns -1 otherwise.
308  */
309 static int
validate_seg_desc(struct vmctx * ctx,int vcpu,struct vm_task_switch * ts,int segment,struct seg_desc * seg_desc)310 validate_seg_desc(struct vmctx *ctx, int vcpu, struct vm_task_switch *ts,
311     int segment, struct seg_desc *seg_desc)
312 {
313 	struct vm_guest_paging sup_paging;
314 	struct user_segment_descriptor usd;
315 	int error, idtvec;
316 	int cpl, dpl, rpl;
317 	uint16_t sel, cs;
318 	bool ldtseg, codeseg, stackseg, dataseg, conforming;
319 
320 	ldtseg = codeseg = stackseg = dataseg = false;
321 	switch (segment) {
322 	case VM_REG_GUEST_LDTR:
323 		ldtseg = true;
324 		break;
325 	case VM_REG_GUEST_CS:
326 		codeseg = true;
327 		break;
328 	case VM_REG_GUEST_SS:
329 		stackseg = true;
330 		break;
331 	case VM_REG_GUEST_DS:
332 	case VM_REG_GUEST_ES:
333 	case VM_REG_GUEST_FS:
334 	case VM_REG_GUEST_GS:
335 		dataseg = true;
336 		break;
337 	default:
338 		assert(0);
339 	}
340 
341 	/* Get the segment selector */
342 	sel = GETREG(ctx, vcpu, segment);
343 
344 	/* LDT selector must point into the GDT */
345 	if (ldtseg && ISLDT(sel)) {
346 		sel_exception(ctx, vcpu, IDT_TS, sel, ts->ext);
347 		return (1);
348 	}
349 
350 	/* Descriptor table limit check */
351 	if (desc_table_limit_check(ctx, vcpu, sel)) {
352 		sel_exception(ctx, vcpu, IDT_TS, sel, ts->ext);
353 		return (1);
354 	}
355 
356 	/* NULL selector */
357 	if (IDXSEL(sel) == 0) {
358 		/* Code and stack segment selectors cannot be NULL */
359 		if (codeseg || stackseg) {
360 			sel_exception(ctx, vcpu, IDT_TS, sel, ts->ext);
361 			return (1);
362 		}
363 		seg_desc->base = 0;
364 		seg_desc->limit = 0;
365 		seg_desc->access = 0x10000;	/* unusable */
366 		return (0);
367 	}
368 
369 	/* Read the descriptor from the GDT/LDT */
370 	sup_paging = ts->paging;
371 	sup_paging.cpl = 0;	/* implicit supervisor mode */
372 	error = desc_table_read(ctx, vcpu, &sup_paging, sel, &usd);
373 	if (error)
374 		return (error);
375 
376 	/* Verify that the descriptor type is compatible with the segment */
377 	if ((ldtseg && !ldt_desc(usd.sd_type)) ||
378 	    (codeseg && !code_desc(usd.sd_type)) ||
379 	    (dataseg && !data_desc(usd.sd_type)) ||
380 	    (stackseg && !stack_desc(usd.sd_type))) {
381 		sel_exception(ctx, vcpu, IDT_TS, sel, ts->ext);
382 		return (1);
383 	}
384 
385 	/* Segment must be marked present */
386 	if (!usd.sd_p) {
387 		if (ldtseg)
388 			idtvec = IDT_TS;
389 		else if (stackseg)
390 			idtvec = IDT_SS;
391 		else
392 			idtvec = IDT_NP;
393 		sel_exception(ctx, vcpu, idtvec, sel, ts->ext);
394 		return (1);
395 	}
396 
397 	cs = GETREG(ctx, vcpu, VM_REG_GUEST_CS);
398 	cpl = cs & SEL_RPL_MASK;
399 	rpl = sel & SEL_RPL_MASK;
400 	dpl = usd.sd_dpl;
401 
402 	if (stackseg && (rpl != cpl || dpl != cpl)) {
403 		sel_exception(ctx, vcpu, IDT_TS, sel, ts->ext);
404 		return (1);
405 	}
406 
407 	if (codeseg) {
408 		conforming = (usd.sd_type & 0x4) ? true : false;
409 		if ((conforming && (cpl < dpl)) ||
410 		    (!conforming && (cpl != dpl))) {
411 			sel_exception(ctx, vcpu, IDT_TS, sel, ts->ext);
412 			return (1);
413 		}
414 	}
415 
416 	if (dataseg) {
417 		/*
418 		 * A data segment is always non-conforming except when it's
419 		 * descriptor is a readable, conforming code segment.
420 		 */
421 		if (code_desc(usd.sd_type) && (usd.sd_type & 0x4) != 0)
422 			conforming = true;
423 		else
424 			conforming = false;
425 
426 		if (!conforming && (rpl > dpl || cpl > dpl)) {
427 			sel_exception(ctx, vcpu, IDT_TS, sel, ts->ext);
428 			return (1);
429 		}
430 	}
431 	*seg_desc = usd_to_seg_desc(&usd);
432 	return (0);
433 }
434 
435 static void
tss32_save(struct vmctx * ctx,int vcpu,struct vm_task_switch * task_switch,uint32_t eip,struct tss32 * tss,struct iovec * iov)436 tss32_save(struct vmctx *ctx, int vcpu, struct vm_task_switch *task_switch,
437     uint32_t eip, struct tss32 *tss, struct iovec *iov)
438 {
439 
440 	/* General purpose registers */
441 	tss->tss_eax = GETREG(ctx, vcpu, VM_REG_GUEST_RAX);
442 	tss->tss_ecx = GETREG(ctx, vcpu, VM_REG_GUEST_RCX);
443 	tss->tss_edx = GETREG(ctx, vcpu, VM_REG_GUEST_RDX);
444 	tss->tss_ebx = GETREG(ctx, vcpu, VM_REG_GUEST_RBX);
445 	tss->tss_esp = GETREG(ctx, vcpu, VM_REG_GUEST_RSP);
446 	tss->tss_ebp = GETREG(ctx, vcpu, VM_REG_GUEST_RBP);
447 	tss->tss_esi = GETREG(ctx, vcpu, VM_REG_GUEST_RSI);
448 	tss->tss_edi = GETREG(ctx, vcpu, VM_REG_GUEST_RDI);
449 
450 	/* Segment selectors */
451 	tss->tss_es = GETREG(ctx, vcpu, VM_REG_GUEST_ES);
452 	tss->tss_cs = GETREG(ctx, vcpu, VM_REG_GUEST_CS);
453 	tss->tss_ss = GETREG(ctx, vcpu, VM_REG_GUEST_SS);
454 	tss->tss_ds = GETREG(ctx, vcpu, VM_REG_GUEST_DS);
455 	tss->tss_fs = GETREG(ctx, vcpu, VM_REG_GUEST_FS);
456 	tss->tss_gs = GETREG(ctx, vcpu, VM_REG_GUEST_GS);
457 
458 	/* eflags and eip */
459 	tss->tss_eflags = GETREG(ctx, vcpu, VM_REG_GUEST_RFLAGS);
460 	if (task_switch->reason == TSR_IRET)
461 		tss->tss_eflags &= ~PSL_NT;
462 	tss->tss_eip = eip;
463 
464 	/* Copy updated old TSS into guest memory */
465 	vm_copyout(ctx, vcpu, tss, iov, sizeof(struct tss32));
466 }
467 
468 static void
update_seg_desc(struct vmctx * ctx,int vcpu,int reg,struct seg_desc * sd)469 update_seg_desc(struct vmctx *ctx, int vcpu, int reg, struct seg_desc *sd)
470 {
471 	int error;
472 
473 	error = vm_set_desc(ctx, vcpu, reg, sd->base, sd->limit, sd->access);
474 	assert(error == 0);
475 }
476 
477 /*
478  * Update the vcpu registers to reflect the state of the new task.
479  *
480  * Returns 0 on success.
481  * Returns 1 if an exception was injected into the guest.
482  * Returns -1 otherwise.
483  */
484 static int
tss32_restore(struct vmctx * ctx,int vcpu,struct vm_task_switch * ts,uint16_t ot_sel,struct tss32 * tss,struct iovec * iov)485 tss32_restore(struct vmctx *ctx, int vcpu, struct vm_task_switch *ts,
486     uint16_t ot_sel, struct tss32 *tss, struct iovec *iov)
487 {
488 	struct seg_desc seg_desc, seg_desc2;
489 	uint64_t *pdpte, maxphyaddr, reserved;
490 	uint32_t eflags;
491 	int error, i;
492 	bool nested;
493 
494 	nested = false;
495 	if (ts->reason != TSR_IRET && ts->reason != TSR_JMP) {
496 		tss->tss_link = ot_sel;
497 		nested = true;
498 	}
499 
500 	eflags = tss->tss_eflags;
501 	if (nested)
502 		eflags |= PSL_NT;
503 
504 	/* LDTR */
505 	SETREG(ctx, vcpu, VM_REG_GUEST_LDTR, tss->tss_ldt);
506 
507 	/* PBDR */
508 	if (ts->paging.paging_mode != PAGING_MODE_FLAT) {
509 		if (ts->paging.paging_mode == PAGING_MODE_PAE) {
510 			/*
511 			 * XXX Assuming 36-bit MAXPHYADDR.
512 			 */
513 			maxphyaddr = (1UL << 36) - 1;
514 			pdpte = paddr_guest2host(ctx, tss->tss_cr3 & ~0x1f, 32);
515 			for (i = 0; i < 4; i++) {
516 				/* Check reserved bits if the PDPTE is valid */
517 				if (!(pdpte[i] & 0x1))
518 					continue;
519 				/*
520 				 * Bits 2:1, 8:5 and bits above the processor's
521 				 * maximum physical address are reserved.
522 				 */
523 				reserved = ~maxphyaddr | 0x1E6;
524 				if (pdpte[i] & reserved) {
525 					vm_inject_gp(ctx, vcpu);
526 					return (1);
527 				}
528 			}
529 			SETREG(ctx, vcpu, VM_REG_GUEST_PDPTE0, pdpte[0]);
530 			SETREG(ctx, vcpu, VM_REG_GUEST_PDPTE1, pdpte[1]);
531 			SETREG(ctx, vcpu, VM_REG_GUEST_PDPTE2, pdpte[2]);
532 			SETREG(ctx, vcpu, VM_REG_GUEST_PDPTE3, pdpte[3]);
533 		}
534 		SETREG(ctx, vcpu, VM_REG_GUEST_CR3, tss->tss_cr3);
535 		ts->paging.cr3 = tss->tss_cr3;
536 	}
537 
538 	/* eflags and eip */
539 	SETREG(ctx, vcpu, VM_REG_GUEST_RFLAGS, eflags);
540 	SETREG(ctx, vcpu, VM_REG_GUEST_RIP, tss->tss_eip);
541 
542 	/* General purpose registers */
543 	SETREG(ctx, vcpu, VM_REG_GUEST_RAX, tss->tss_eax);
544 	SETREG(ctx, vcpu, VM_REG_GUEST_RCX, tss->tss_ecx);
545 	SETREG(ctx, vcpu, VM_REG_GUEST_RDX, tss->tss_edx);
546 	SETREG(ctx, vcpu, VM_REG_GUEST_RBX, tss->tss_ebx);
547 	SETREG(ctx, vcpu, VM_REG_GUEST_RSP, tss->tss_esp);
548 	SETREG(ctx, vcpu, VM_REG_GUEST_RBP, tss->tss_ebp);
549 	SETREG(ctx, vcpu, VM_REG_GUEST_RSI, tss->tss_esi);
550 	SETREG(ctx, vcpu, VM_REG_GUEST_RDI, tss->tss_edi);
551 
552 	/* Segment selectors */
553 	SETREG(ctx, vcpu, VM_REG_GUEST_ES, tss->tss_es);
554 	SETREG(ctx, vcpu, VM_REG_GUEST_CS, tss->tss_cs);
555 	SETREG(ctx, vcpu, VM_REG_GUEST_SS, tss->tss_ss);
556 	SETREG(ctx, vcpu, VM_REG_GUEST_DS, tss->tss_ds);
557 	SETREG(ctx, vcpu, VM_REG_GUEST_FS, tss->tss_fs);
558 	SETREG(ctx, vcpu, VM_REG_GUEST_GS, tss->tss_gs);
559 
560 	/*
561 	 * If this is a nested task then write out the new TSS to update
562 	 * the previous link field.
563 	 */
564 	if (nested)
565 		vm_copyout(ctx, vcpu, tss, iov, sizeof(*tss));
566 
567 	/* Validate segment descriptors */
568 	error = validate_seg_desc(ctx, vcpu, ts, VM_REG_GUEST_LDTR, &seg_desc);
569 	if (error)
570 		return (error);
571 	update_seg_desc(ctx, vcpu, VM_REG_GUEST_LDTR, &seg_desc);
572 
573 	/*
574 	 * Section "Checks on Guest Segment Registers", Intel SDM, Vol 3.
575 	 *
576 	 * The SS and CS attribute checks on VM-entry are inter-dependent so
577 	 * we need to make sure that both segments are valid before updating
578 	 * either of them. This ensures that the VMCS state can pass the
579 	 * VM-entry checks so the guest can handle any exception injected
580 	 * during task switch emulation.
581 	 */
582 	error = validate_seg_desc(ctx, vcpu, ts, VM_REG_GUEST_CS, &seg_desc);
583 	if (error)
584 		return (error);
585 	error = validate_seg_desc(ctx, vcpu, ts, VM_REG_GUEST_SS, &seg_desc2);
586 	if (error)
587 		return (error);
588 	update_seg_desc(ctx, vcpu, VM_REG_GUEST_CS, &seg_desc);
589 	update_seg_desc(ctx, vcpu, VM_REG_GUEST_SS, &seg_desc2);
590 	ts->paging.cpl = tss->tss_cs & SEL_RPL_MASK;
591 
592 	error = validate_seg_desc(ctx, vcpu, ts, VM_REG_GUEST_DS, &seg_desc);
593 	if (error)
594 		return (error);
595 	update_seg_desc(ctx, vcpu, VM_REG_GUEST_DS, &seg_desc);
596 
597 	error = validate_seg_desc(ctx, vcpu, ts, VM_REG_GUEST_ES, &seg_desc);
598 	if (error)
599 		return (error);
600 	update_seg_desc(ctx, vcpu, VM_REG_GUEST_ES, &seg_desc);
601 
602 	error = validate_seg_desc(ctx, vcpu, ts, VM_REG_GUEST_FS, &seg_desc);
603 	if (error)
604 		return (error);
605 	update_seg_desc(ctx, vcpu, VM_REG_GUEST_FS, &seg_desc);
606 
607 	error = validate_seg_desc(ctx, vcpu, ts, VM_REG_GUEST_GS, &seg_desc);
608 	if (error)
609 		return (error);
610 	update_seg_desc(ctx, vcpu, VM_REG_GUEST_GS, &seg_desc);
611 
612 	return (0);
613 }
614 
615 /*
616  * Push an error code on the stack of the new task. This is needed if the
617  * task switch was triggered by a hardware exception that causes an error
618  * code to be saved (e.g. #PF).
619  *
620  * Returns 0 on success.
621  * Returns 1 if an exception was injected into the guest.
622  * Returns -1 otherwise.
623  */
624 static int
push_errcode(struct vmctx * ctx,int vcpu,struct vm_guest_paging * paging,int task_type,uint32_t errcode)625 push_errcode(struct vmctx *ctx, int vcpu, struct vm_guest_paging *paging,
626     int task_type, uint32_t errcode)
627 {
628 	struct iovec iov[2];
629 	struct seg_desc seg_desc;
630 	int stacksize, bytes, error;
631 	uint64_t gla, cr0, rflags;
632 	uint32_t esp;
633 	uint16_t stacksel;
634 
635 	cr0 = GETREG(ctx, vcpu, VM_REG_GUEST_CR0);
636 	rflags = GETREG(ctx, vcpu, VM_REG_GUEST_RFLAGS);
637 	stacksel = GETREG(ctx, vcpu, VM_REG_GUEST_SS);
638 
639 	error = vm_get_desc(ctx, vcpu, VM_REG_GUEST_SS, &seg_desc.base,
640 	    &seg_desc.limit, &seg_desc.access);
641 	assert(error == 0);
642 
643 	/*
644 	 * Section "Error Code" in the Intel SDM vol 3: the error code is
645 	 * pushed on the stack as a doubleword or word (depending on the
646 	 * default interrupt, trap or task gate size).
647 	 */
648 	if (task_type == SDT_SYS386BSY || task_type == SDT_SYS386TSS)
649 		bytes = 4;
650 	else
651 		bytes = 2;
652 
653 	/*
654 	 * PUSH instruction from Intel SDM vol 2: the 'B' flag in the
655 	 * stack-segment descriptor determines the size of the stack
656 	 * pointer outside of 64-bit mode.
657 	 */
658 	if (SEG_DESC_DEF32(seg_desc.access))
659 		stacksize = 4;
660 	else
661 		stacksize = 2;
662 
663 	esp = GETREG(ctx, vcpu, VM_REG_GUEST_RSP);
664 	esp -= bytes;
665 
666 	if (vie_calculate_gla(paging->cpu_mode, VM_REG_GUEST_SS,
667 	    &seg_desc, esp, bytes, stacksize, PROT_WRITE, &gla)) {
668 		sel_exception(ctx, vcpu, IDT_SS, stacksel, 1);
669 		return (1);
670 	}
671 
672 	if (vie_alignment_check(paging->cpl, bytes, cr0, rflags, gla)) {
673 		vm_inject_ac(ctx, vcpu, 1);
674 		return (1);
675 	}
676 
677 	error = vm_copy_setup(ctx, vcpu, paging, gla, bytes, PROT_WRITE,
678 	    iov, nitems(iov));
679 	if (error)
680 		return (error);
681 
682 	vm_copyout(ctx, vcpu, &errcode, iov, bytes);
683 	SETREG(ctx, vcpu, VM_REG_GUEST_RSP, esp);
684 	return (0);
685 }
686 
/*
 * Evaluate return value from helper functions and potentially return to
 * the VM run loop.
 *  0: success
 * +1: an exception was injected into the guest vcpu
 * -1: unrecoverable/programming error
 *
 * The macro returns from the enclosing function for the +1 and -1 cases
 * and falls through on success. The argument is evaluated more than once
 * so it must be a plain variable without side effects.
 */
#define	CHKERR(x)							\
	do {								\
		assert(((x) == 0) || ((x) == 1) || ((x) == -1));	\
		if ((x) == -1)						\
			return (VMEXIT_ABORT);				\
		else if ((x) == 1)					\
			return (VMEXIT_CONTINUE);			\
	} while (0)
702 
703 int
vmexit_task_switch(struct vmctx * ctx,struct vm_exit * vmexit,int * pvcpu)704 vmexit_task_switch(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu)
705 {
706 	struct seg_desc nt;
707 	struct tss32 oldtss, newtss;
708 	struct vm_task_switch *task_switch;
709 	struct vm_guest_paging *paging, sup_paging;
710 	struct user_segment_descriptor nt_desc, ot_desc;
711 	struct iovec nt_iov[2], ot_iov[2];
712 	uint64_t cr0, ot_base;
713 	uint32_t eip, ot_lim, access;
714 	int error, ext, minlimit, nt_type, ot_type, vcpu;
715 	enum task_switch_reason reason;
716 	uint16_t nt_sel, ot_sel;
717 
718 	task_switch = &vmexit->u.task_switch;
719 	nt_sel = task_switch->tsssel;
720 	ext = vmexit->u.task_switch.ext;
721 	reason = vmexit->u.task_switch.reason;
722 	paging = &vmexit->u.task_switch.paging;
723 	vcpu = *pvcpu;
724 
725 	assert(paging->cpu_mode == CPU_MODE_PROTECTED);
726 
727 	/*
728 	 * Calculate the %eip to store in the old TSS before modifying the
729 	 * 'inst_length'.
730 	 */
731 	eip = vmexit->rip + vmexit->inst_length;
732 
733 	/*
734 	 * Set the 'inst_length' to '0'.
735 	 *
736 	 * If an exception is triggered during emulation of the task switch
737 	 * then the exception handler should return to the instruction that
738 	 * caused the task switch as opposed to the subsequent instruction.
739 	 */
740 	vmexit->inst_length = 0;
741 
742 	/*
743 	 * Section 4.6, "Access Rights" in Intel SDM Vol 3.
744 	 * The following page table accesses are implicitly supervisor mode:
745 	 * - accesses to GDT or LDT to load segment descriptors
746 	 * - accesses to the task state segment during task switch
747 	 */
748 	sup_paging = *paging;
749 	sup_paging.cpl = 0;	/* implicit supervisor mode */
750 
751 	/* Fetch the new TSS descriptor */
752 	error = read_tss_descriptor(ctx, vcpu, task_switch, nt_sel, &nt_desc);
753 	CHKERR(error);
754 
755 	nt = usd_to_seg_desc(&nt_desc);
756 
757 	/* Verify the type of the new TSS */
758 	nt_type = SEG_DESC_TYPE(nt.access);
759 	if (nt_type != SDT_SYS386BSY && nt_type != SDT_SYS386TSS &&
760 	    nt_type != SDT_SYS286BSY && nt_type != SDT_SYS286TSS) {
761 		sel_exception(ctx, vcpu, IDT_TS, nt_sel, ext);
762 		goto done;
763 	}
764 
765 	/* TSS descriptor must have present bit set */
766 	if (!SEG_DESC_PRESENT(nt.access)) {
767 		sel_exception(ctx, vcpu, IDT_NP, nt_sel, ext);
768 		goto done;
769 	}
770 
771 	/*
772 	 * TSS must have a minimum length of 104 bytes for a 32-bit TSS and
773 	 * 44 bytes for a 16-bit TSS.
774 	 */
775 	if (nt_type == SDT_SYS386BSY || nt_type == SDT_SYS386TSS)
776 		minlimit = 104 - 1;
777 	else if (nt_type == SDT_SYS286BSY || nt_type == SDT_SYS286TSS)
778 		minlimit = 44 - 1;
779 	else
780 		minlimit = 0;
781 
782 	assert(minlimit > 0);
783 	if (nt.limit < minlimit) {
784 		sel_exception(ctx, vcpu, IDT_TS, nt_sel, ext);
785 		goto done;
786 	}
787 
788 	/* TSS must be busy if task switch is due to IRET */
789 	if (reason == TSR_IRET && !TSS_BUSY(nt_type)) {
790 		sel_exception(ctx, vcpu, IDT_TS, nt_sel, ext);
791 		goto done;
792 	}
793 
794 	/*
795 	 * TSS must be available (not busy) if task switch reason is
796 	 * CALL, JMP, exception or interrupt.
797 	 */
798 	if (reason != TSR_IRET && TSS_BUSY(nt_type)) {
799 		sel_exception(ctx, vcpu, IDT_GP, nt_sel, ext);
800 		goto done;
801 	}
802 
803 	/* Fetch the new TSS */
804 	error = vm_copy_setup(ctx, vcpu, &sup_paging, nt.base, minlimit + 1,
805 	    PROT_READ | PROT_WRITE, nt_iov, nitems(nt_iov));
806 	CHKERR(error);
807 	vm_copyin(ctx, vcpu, nt_iov, &newtss, minlimit + 1);
808 
809 	/* Get the old TSS selector from the guest's task register */
810 	ot_sel = GETREG(ctx, vcpu, VM_REG_GUEST_TR);
811 	if (ISLDT(ot_sel) || IDXSEL(ot_sel) == 0) {
812 		/*
813 		 * This might happen if a task switch was attempted without
814 		 * ever loading the task register with LTR. In this case the
815 		 * TR would contain the values from power-on:
816 		 * (sel = 0, base = 0, limit = 0xffff).
817 		 */
818 		sel_exception(ctx, vcpu, IDT_TS, ot_sel, task_switch->ext);
819 		goto done;
820 	}
821 
822 	/* Get the old TSS base and limit from the guest's task register */
823 	error = vm_get_desc(ctx, vcpu, VM_REG_GUEST_TR, &ot_base, &ot_lim,
824 	    &access);
825 	assert(error == 0);
826 	assert(!SEG_DESC_UNUSABLE(access) && SEG_DESC_PRESENT(access));
827 	ot_type = SEG_DESC_TYPE(access);
828 	assert(ot_type == SDT_SYS386BSY || ot_type == SDT_SYS286BSY);
829 
830 	/* Fetch the old TSS descriptor */
831 	error = read_tss_descriptor(ctx, vcpu, task_switch, ot_sel, &ot_desc);
832 	CHKERR(error);
833 
834 	/* Get the old TSS */
835 	error = vm_copy_setup(ctx, vcpu, &sup_paging, ot_base, minlimit + 1,
836 	    PROT_READ | PROT_WRITE, ot_iov, nitems(ot_iov));
837 	CHKERR(error);
838 	vm_copyin(ctx, vcpu, ot_iov, &oldtss, minlimit + 1);
839 
840 	/*
841 	 * Clear the busy bit in the old TSS descriptor if the task switch
842 	 * due to an IRET or JMP instruction.
843 	 */
844 	if (reason == TSR_IRET || reason == TSR_JMP) {
845 		ot_desc.sd_type &= ~0x2;
846 		error = desc_table_write(ctx, vcpu, &sup_paging, ot_sel,
847 		    &ot_desc);
848 		CHKERR(error);
849 	}
850 
851 	if (nt_type == SDT_SYS286BSY || nt_type == SDT_SYS286TSS) {
852 		fprintf(stderr, "Task switch to 16-bit TSS not supported\n");
853 		return (VMEXIT_ABORT);
854 	}
855 
856 	/* Save processor state in old TSS */
857 	tss32_save(ctx, vcpu, task_switch, eip, &oldtss, ot_iov);
858 
859 	/*
860 	 * If the task switch was triggered for any reason other than IRET
861 	 * then set the busy bit in the new TSS descriptor.
862 	 */
863 	if (reason != TSR_IRET) {
864 		nt_desc.sd_type |= 0x2;
865 		error = desc_table_write(ctx, vcpu, &sup_paging, nt_sel,
866 		    &nt_desc);
867 		CHKERR(error);
868 	}
869 
870 	/* Update task register to point at the new TSS */
871 	SETREG(ctx, vcpu, VM_REG_GUEST_TR, nt_sel);
872 
873 	/* Update the hidden descriptor state of the task register */
874 	nt = usd_to_seg_desc(&nt_desc);
875 	update_seg_desc(ctx, vcpu, VM_REG_GUEST_TR, &nt);
876 
877 	/* Set CR0.TS */
878 	cr0 = GETREG(ctx, vcpu, VM_REG_GUEST_CR0);
879 	SETREG(ctx, vcpu, VM_REG_GUEST_CR0, cr0 | CR0_TS);
880 
881 	/*
882 	 * We are now committed to the task switch. Any exceptions encountered
883 	 * after this point will be handled in the context of the new task and
884 	 * the saved instruction pointer will belong to the new task.
885 	 */
886 	vmexit->rip = newtss.tss_eip;
887 	assert(vmexit->inst_length == 0);
888 
889 	/* Load processor state from new TSS */
890 	error = tss32_restore(ctx, vcpu, task_switch, ot_sel, &newtss, nt_iov);
891 	CHKERR(error);
892 
893 	/*
894 	 * Section "Interrupt Tasks" in Intel SDM, Vol 3: if an exception
895 	 * caused an error code to be generated, this error code is copied
896 	 * to the stack of the new task.
897 	 */
898 	if (task_switch->errcode_valid) {
899 		assert(task_switch->ext);
900 		assert(task_switch->reason == TSR_IDT_GATE);
901 		error = push_errcode(ctx, vcpu, &task_switch->paging, nt_type,
902 		    task_switch->errcode);
903 		CHKERR(error);
904 	}
905 
906 	/*
907 	 * Treatment of virtual-NMI blocking if NMI is delivered through
908 	 * a task gate.
909 	 *
910 	 * Section "Architectural State Before A VM Exit", Intel SDM, Vol3:
911 	 * If the virtual NMIs VM-execution control is 1, VM entry injects
912 	 * an NMI, and delivery of the NMI causes a task switch that causes
913 	 * a VM exit, virtual-NMI blocking is in effect before the VM exit
914 	 * commences.
915 	 *
916 	 * Thus, virtual-NMI blocking is in effect at the time of the task
917 	 * switch VM exit.
918 	 */
919 
920 	/*
921 	 * Treatment of virtual-NMI unblocking on IRET from NMI handler task.
922 	 *
923 	 * Section "Changes to Instruction Behavior in VMX Non-Root Operation"
924 	 * If "virtual NMIs" control is 1 IRET removes any virtual-NMI blocking.
925 	 * This unblocking of virtual-NMI occurs even if IRET causes a fault.
926 	 *
927 	 * Thus, virtual-NMI blocking is cleared at the time of the task switch
928 	 * VM exit.
929 	 */
930 
931 	/*
932 	 * If the task switch was triggered by an event delivered through
933 	 * the IDT then extinguish the pending event from the vcpu's
934 	 * exitintinfo.
935 	 */
936 	if (task_switch->reason == TSR_IDT_GATE) {
937 		error = vm_set_intinfo(ctx, vcpu, 0);
938 		assert(error == 0);
939 	}
940 
941 	/*
942 	 * XXX should inject debug exception if 'T' bit is 1
943 	 */
944 done:
945 	return (VMEXIT_CONTINUE);
946 }
947