xref: /freebsd-11-stable/sys/amd64/amd64/machdep.c (revision 8d931942265177ba231ffa9cd270d66463307899)
1 /*-
2  * Copyright (c) 2003 Peter Wemm.
3  * Copyright (c) 1992 Terrence R. Lambert.
4  * Copyright (c) 1982, 1987, 1990 The Regents of the University of California.
5  * All rights reserved.
6  *
7  * This code is derived from software contributed to Berkeley by
8  * William Jolitz.
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  * 3. All advertising materials mentioning features or use of this software
19  *    must display the following acknowledgement:
20  *	This product includes software developed by the University of
21  *	California, Berkeley and its contributors.
22  * 4. Neither the name of the University nor the names of its contributors
23  *    may be used to endorse or promote products derived from this software
24  *    without specific prior written permission.
25  *
26  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
27  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
30  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
31  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
32  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
33  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
34  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
35  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
36  * SUCH DAMAGE.
37  *
38  *	from: @(#)machdep.c	7.4 (Berkeley) 6/3/91
39  */
40 
41 #include <sys/cdefs.h>
42 __FBSDID("$FreeBSD$");
43 
44 #include "opt_atpic.h"
45 #include "opt_compat.h"
46 #include "opt_cpu.h"
47 #include "opt_ddb.h"
48 #include "opt_inet.h"
49 #include "opt_isa.h"
50 #include "opt_kstack_pages.h"
51 #include "opt_maxmem.h"
52 #include "opt_mp_watchdog.h"
53 #include "opt_perfmon.h"
54 #include "opt_platform.h"
55 #include "opt_sched.h"
56 
57 #include <sys/param.h>
58 #include <sys/proc.h>
59 #include <sys/systm.h>
60 #include <sys/bio.h>
61 #include <sys/buf.h>
62 #include <sys/bus.h>
63 #include <sys/callout.h>
64 #include <sys/cons.h>
65 #include <sys/cpu.h>
66 #include <sys/efi.h>
67 #include <sys/eventhandler.h>
68 #include <sys/exec.h>
69 #include <sys/imgact.h>
70 #include <sys/kdb.h>
71 #include <sys/kernel.h>
72 #include <sys/ktr.h>
73 #include <sys/linker.h>
74 #include <sys/lock.h>
75 #include <sys/malloc.h>
76 #include <sys/memrange.h>
77 #include <sys/msgbuf.h>
78 #include <sys/mutex.h>
79 #include <sys/pcpu.h>
80 #include <sys/ptrace.h>
81 #include <sys/reboot.h>
82 #include <sys/rwlock.h>
83 #include <sys/sched.h>
84 #include <sys/signalvar.h>
85 #ifdef SMP
86 #include <sys/smp.h>
87 #endif
88 #include <sys/syscallsubr.h>
89 #include <sys/sysctl.h>
90 #include <sys/sysent.h>
91 #include <sys/sysproto.h>
92 #include <sys/ucontext.h>
93 #include <sys/vmmeter.h>
94 
95 #include <vm/vm.h>
96 #include <vm/vm_extern.h>
97 #include <vm/vm_kern.h>
98 #include <vm/vm_page.h>
99 #include <vm/vm_map.h>
100 #include <vm/vm_object.h>
101 #include <vm/vm_pager.h>
102 #include <vm/vm_param.h>
103 #include <vm/vm_phys.h>
104 
105 #ifdef DDB
106 #ifndef KDB
107 #error KDB must be enabled in order for DDB to work!
108 #endif
109 #include <ddb/ddb.h>
110 #include <ddb/db_sym.h>
111 #endif
112 
113 #include <net/netisr.h>
114 
115 #include <machine/clock.h>
116 #include <machine/cpu.h>
117 #include <machine/cputypes.h>
118 #include <machine/frame.h>
119 #include <machine/intr_machdep.h>
120 #include <x86/mca.h>
121 #include <machine/md_var.h>
122 #include <machine/metadata.h>
123 #include <machine/mp_watchdog.h>
124 #include <machine/pc/bios.h>
125 #include <machine/pcb.h>
126 #include <machine/proc.h>
127 #include <machine/reg.h>
128 #include <machine/sigframe.h>
129 #include <machine/specialreg.h>
130 #ifdef PERFMON
131 #include <machine/perfmon.h>
132 #endif
133 #include <machine/tss.h>
134 #include <x86/ucode.h>
135 #ifdef SMP
136 #include <machine/smp.h>
137 #endif
138 #ifdef FDT
139 #include <x86/fdt.h>
140 #endif
141 
142 #ifdef DEV_ATPIC
143 #include <x86/isa/icu.h>
144 #else
145 #include <x86/apicvar.h>
146 #endif
147 
148 #include <isa/isareg.h>
149 #include <isa/rtc.h>
150 #include <x86/init.h>
151 
152 /* Sanity check for __curthread() */
153 CTASSERT(offsetof(struct pcpu, pc_curthread) == 0);
154 
155 /*
156  * The PTI trampoline stack needs enough space for a hardware trapframe and a
157  * couple of scratch registers, as well as the trapframe left behind after an
158  * iret fault.
159  */
160 CTASSERT(PC_PTI_STACK_SZ * sizeof(register_t) >= 2 * sizeof(struct pti_frame) -
161     offsetof(struct pti_frame, pti_rip));
162 
163 extern u_int64_t hammer_time(u_int64_t, u_int64_t);
164 
165 #define	CS_SECURE(cs)		(ISPL(cs) == SEL_UPL)
166 #define	EFL_SECURE(ef, oef)	((((ef) ^ (oef)) & ~PSL_USERCHANGE) == 0)
167 
168 static void cpu_startup(void *);
169 static void get_fpcontext(struct thread *td, mcontext_t *mcp,
170     char *xfpusave, size_t xfpusave_len);
171 static int  set_fpcontext(struct thread *td, mcontext_t *mcp,
172     char *xfpustate, size_t xfpustate_len);
173 SYSINIT(cpu, SI_SUB_CPU, SI_ORDER_FIRST, cpu_startup, NULL);
174 
175 /* Preload data parse function */
176 static caddr_t native_parse_preload_data(u_int64_t);
177 
178 /* Native function to fetch and parse the e820 map */
179 static void native_parse_memmap(caddr_t, vm_paddr_t *, int *);
180 
181 /* Default init_ops implementation. */
182 struct init_ops init_ops = {
183 	.parse_preload_data =	native_parse_preload_data,
184 	.early_clock_source_init =	i8254_init,
185 	.early_delay =			i8254_delay,
186 	.parse_memmap =			native_parse_memmap,
187 #ifdef SMP
188 	.mp_bootaddress =		mp_bootaddress,
189 	.start_all_aps =		native_start_all_aps,
190 #endif
191 	.msi_init =			msi_init,
192 };
193 
194 struct msgbuf *msgbufp;
195 
196 /*
197  * Physical address of the EFI System Table. Stashed from the metadata hints
198  * passed into the kernel and used by the EFI code to call runtime services.
199  */
200 vm_paddr_t efi_systbl_phys;
201 
/*
 * Intel ICH registers.
 *
 * ICH_SMI_EN is the I/O port, at offset 0x30 from ICH_PMBASE, that
 * cpu_startup() reads and writes to clear the LEGACY_USB_EN bit.  The
 * expansion is parenthesized so the macro behaves as a single value when
 * it appears inside a larger expression.
 */
#define	ICH_PMBASE	0x400
#define	ICH_SMI_EN	(ICH_PMBASE + 0x30)
205 
206 int	_udatasel, _ucodesel, _ucode32sel, _ufssel, _ugssel;
207 
208 int cold = 1;
209 
210 long Maxmem = 0;
211 long realmem = 0;
212 
213 /*
214  * The number of PHYSMAP entries must be one less than the number of
215  * PHYSSEG entries because the PHYSMAP entry that spans the largest
216  * physical address that is accessible by ISA DMA is split into two
217  * PHYSSEG entries.
218  */
219 #define	PHYSMAP_SIZE	(2 * (VM_PHYSSEG_MAX - 1))
220 
221 vm_paddr_t phys_avail[PHYSMAP_SIZE + 2];
222 vm_paddr_t dump_avail[PHYSMAP_SIZE + 2];
223 
224 /* must be 2 less so 0 0 can signal end of chunks */
225 #define	PHYS_AVAIL_ARRAY_END (nitems(phys_avail) - 2)
226 #define	DUMP_AVAIL_ARRAY_END (nitems(dump_avail) - 2)
227 
228 struct kva_md_info kmi;
229 
230 static struct trapframe proc0_tf;
231 struct region_descriptor r_gdt, r_idt;
232 
233 struct pcpu __pcpu[MAXCPU];
234 
235 struct mtx icu_lock;
236 
237 struct mem_range_softc mem_range_softc;
238 
239 struct mtx dt_lock;	/* lock for GDT and LDT */
240 
241 void (*vmm_resume_p)(void);
242 
243 static void
cpu_startup(dummy)244 cpu_startup(dummy)
245 	void *dummy;
246 {
247 	uintmax_t memsize;
248 	char *sysenv;
249 
250 	/*
251 	 * On MacBooks, we need to disallow the legacy USB circuit to
252 	 * generate an SMI# because this can cause several problems,
253 	 * namely: incorrect CPU frequency detection and failure to
254 	 * start the APs.
255 	 * We do this by disabling a bit in the SMI_EN (SMI Control and
256 	 * Enable register) of the Intel ICH LPC Interface Bridge.
257 	 */
258 	sysenv = kern_getenv("smbios.system.product");
259 	if (sysenv != NULL) {
260 		if (strncmp(sysenv, "MacBook1,1", 10) == 0 ||
261 		    strncmp(sysenv, "MacBook3,1", 10) == 0 ||
262 		    strncmp(sysenv, "MacBook4,1", 10) == 0 ||
263 		    strncmp(sysenv, "MacBookPro1,1", 13) == 0 ||
264 		    strncmp(sysenv, "MacBookPro1,2", 13) == 0 ||
265 		    strncmp(sysenv, "MacBookPro3,1", 13) == 0 ||
266 		    strncmp(sysenv, "MacBookPro4,1", 13) == 0 ||
267 		    strncmp(sysenv, "Macmini1,1", 10) == 0) {
268 			if (bootverbose)
269 				printf("Disabling LEGACY_USB_EN bit on "
270 				    "Intel ICH.\n");
271 			outl(ICH_SMI_EN, inl(ICH_SMI_EN) & ~0x8);
272 		}
273 		freeenv(sysenv);
274 	}
275 
276 	/*
277 	 * Good {morning,afternoon,evening,night}.
278 	 */
279 	startrtclock();
280 	printcpuinfo();
281 #ifdef PERFMON
282 	perfmon_init();
283 #endif
284 
285 	/*
286 	 * Display physical memory if SMBIOS reports reasonable amount.
287 	 */
288 	memsize = 0;
289 	sysenv = kern_getenv("smbios.memory.enabled");
290 	if (sysenv != NULL) {
291 		memsize = (uintmax_t)strtoul(sysenv, (char **)NULL, 10) << 10;
292 		freeenv(sysenv);
293 	}
294 	if (memsize < ptoa((uintmax_t)vm_cnt.v_free_count))
295 		memsize = ptoa((uintmax_t)Maxmem);
296 	printf("real memory  = %ju (%ju MB)\n", memsize, memsize >> 20);
297 	realmem = atop(memsize);
298 
299 	/*
300 	 * Display any holes after the first chunk of extended memory.
301 	 */
302 	if (bootverbose) {
303 		int indx;
304 
305 		printf("Physical memory chunk(s):\n");
306 		for (indx = 0; phys_avail[indx + 1] != 0; indx += 2) {
307 			vm_paddr_t size;
308 
309 			size = phys_avail[indx + 1] - phys_avail[indx];
310 			printf(
311 			    "0x%016jx - 0x%016jx, %ju bytes (%ju pages)\n",
312 			    (uintmax_t)phys_avail[indx],
313 			    (uintmax_t)phys_avail[indx + 1] - 1,
314 			    (uintmax_t)size, (uintmax_t)size / PAGE_SIZE);
315 		}
316 	}
317 
318 	vm_ksubmap_init(&kmi);
319 
320 	printf("avail memory = %ju (%ju MB)\n",
321 	    ptoa((uintmax_t)vm_cnt.v_free_count),
322 	    ptoa((uintmax_t)vm_cnt.v_free_count) / 1048576);
323 
324 	/*
325 	 * Set up buffers, so they can be used to read disk labels.
326 	 */
327 	bufinit();
328 	vm_pager_bufferinit();
329 
330 	cpu_setregs();
331 }
332 
333 /*
334  * Send an interrupt to process.
335  *
336  * Stack is set up to allow sigcode stored
337  * at top to call routine, followed by call
338  * to sigreturn routine below.  After sigreturn
339  * resets the signal mask, the stack, and the
340  * frame pointer, it returns to the user
341  * specified pc, psl.
342  */
343 void
sendsig(sig_t catcher,ksiginfo_t * ksi,sigset_t * mask)344 sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask)
345 {
346 	struct sigframe sf, *sfp;
347 	struct pcb *pcb;
348 	struct proc *p;
349 	struct thread *td;
350 	struct sigacts *psp;
351 	char *sp;
352 	struct trapframe *regs;
353 	char *xfpusave;
354 	size_t xfpusave_len;
355 	int sig;
356 	int oonstack;
357 
358 	td = curthread;
359 	pcb = td->td_pcb;
360 	p = td->td_proc;
361 	PROC_LOCK_ASSERT(p, MA_OWNED);
362 	sig = ksi->ksi_signo;
363 	psp = p->p_sigacts;
364 	mtx_assert(&psp->ps_mtx, MA_OWNED);
365 	regs = td->td_frame;
366 	oonstack = sigonstack(regs->tf_rsp);
367 
368 	if (cpu_max_ext_state_size > sizeof(struct savefpu) && use_xsave) {
369 		xfpusave_len = cpu_max_ext_state_size - sizeof(struct savefpu);
370 		xfpusave = __builtin_alloca(xfpusave_len);
371 	} else {
372 		xfpusave_len = 0;
373 		xfpusave = NULL;
374 	}
375 
376 	/* Save user context. */
377 	bzero(&sf, sizeof(sf));
378 	sf.sf_uc.uc_sigmask = *mask;
379 	sf.sf_uc.uc_stack = td->td_sigstk;
380 	sf.sf_uc.uc_stack.ss_flags = (td->td_pflags & TDP_ALTSTACK)
381 	    ? ((oonstack) ? SS_ONSTACK : 0) : SS_DISABLE;
382 	sf.sf_uc.uc_mcontext.mc_onstack = (oonstack) ? 1 : 0;
383 	bcopy(regs, &sf.sf_uc.uc_mcontext.mc_rdi, sizeof(*regs));
384 	sf.sf_uc.uc_mcontext.mc_len = sizeof(sf.sf_uc.uc_mcontext); /* magic */
385 	get_fpcontext(td, &sf.sf_uc.uc_mcontext, xfpusave, xfpusave_len);
386 	fpstate_drop(td);
387 	update_pcb_bases(pcb);
388 	sf.sf_uc.uc_mcontext.mc_fsbase = pcb->pcb_fsbase;
389 	sf.sf_uc.uc_mcontext.mc_gsbase = pcb->pcb_gsbase;
390 	bzero(sf.sf_uc.uc_mcontext.mc_spare,
391 	    sizeof(sf.sf_uc.uc_mcontext.mc_spare));
392 	bzero(sf.sf_uc.__spare__, sizeof(sf.sf_uc.__spare__));
393 
394 	/* Allocate space for the signal handler context. */
395 	if ((td->td_pflags & TDP_ALTSTACK) != 0 && !oonstack &&
396 	    SIGISMEMBER(psp->ps_sigonstack, sig)) {
397 		sp = (char *)td->td_sigstk.ss_sp + td->td_sigstk.ss_size;
398 #if defined(COMPAT_43)
399 		td->td_sigstk.ss_flags |= SS_ONSTACK;
400 #endif
401 	} else
402 		sp = (char *)regs->tf_rsp - 128;
403 	if (xfpusave != NULL) {
404 		sp -= xfpusave_len;
405 		sp = (char *)((unsigned long)sp & ~0x3Ful);
406 		sf.sf_uc.uc_mcontext.mc_xfpustate = (register_t)sp;
407 	}
408 	sp -= sizeof(struct sigframe);
409 	/* Align to 16 bytes. */
410 	sfp = (struct sigframe *)((unsigned long)sp & ~0xFul);
411 
412 	/* Build the argument list for the signal handler. */
413 	regs->tf_rdi = sig;			/* arg 1 in %rdi */
414 	regs->tf_rdx = (register_t)&sfp->sf_uc;	/* arg 3 in %rdx */
415 	bzero(&sf.sf_si, sizeof(sf.sf_si));
416 	if (SIGISMEMBER(psp->ps_siginfo, sig)) {
417 		/* Signal handler installed with SA_SIGINFO. */
418 		regs->tf_rsi = (register_t)&sfp->sf_si;	/* arg 2 in %rsi */
419 		sf.sf_ahu.sf_action = (__siginfohandler_t *)catcher;
420 
421 		/* Fill in POSIX parts */
422 		sf.sf_si = ksi->ksi_info;
423 		sf.sf_si.si_signo = sig; /* maybe a translated signal */
424 		regs->tf_rcx = (register_t)ksi->ksi_addr; /* arg 4 in %rcx */
425 	} else {
426 		/* Old FreeBSD-style arguments. */
427 		regs->tf_rsi = ksi->ksi_code;	/* arg 2 in %rsi */
428 		regs->tf_rcx = (register_t)ksi->ksi_addr; /* arg 4 in %rcx */
429 		sf.sf_ahu.sf_handler = catcher;
430 	}
431 	mtx_unlock(&psp->ps_mtx);
432 	PROC_UNLOCK(p);
433 
434 	/*
435 	 * Copy the sigframe out to the user's stack.
436 	 */
437 	if (copyout(&sf, sfp, sizeof(*sfp)) != 0 ||
438 	    (xfpusave != NULL && copyout(xfpusave,
439 	    (void *)sf.sf_uc.uc_mcontext.mc_xfpustate, xfpusave_len)
440 	    != 0)) {
441 #ifdef DEBUG
442 		printf("process %ld has trashed its stack\n", (long)p->p_pid);
443 #endif
444 		PROC_LOCK(p);
445 		sigexit(td, SIGILL);
446 	}
447 
448 	regs->tf_rsp = (long)sfp;
449 	regs->tf_rip = p->p_sysent->sv_sigcode_base;
450 	regs->tf_rflags &= ~(PSL_T | PSL_D);
451 	regs->tf_cs = _ucodesel;
452 	regs->tf_ds = _udatasel;
453 	regs->tf_ss = _udatasel;
454 	regs->tf_es = _udatasel;
455 	regs->tf_fs = _ufssel;
456 	regs->tf_gs = _ugssel;
457 	regs->tf_flags = TF_HASSEGS;
458 	PROC_LOCK(p);
459 	mtx_lock(&psp->ps_mtx);
460 }
461 
462 /*
463  * System call to cleanup state after a signal
464  * has been taken.  Reset signal mask and
465  * stack state from context left by sendsig (above).
466  * Return to previous pc and psl as specified by
467  * context left by sendsig. Check carefully to
468  * make sure that the user has not modified the
469  * state to gain improper privileges.
470  *
471  * MPSAFE
472  */
473 int
sys_sigreturn(td,uap)474 sys_sigreturn(td, uap)
475 	struct thread *td;
476 	struct sigreturn_args /* {
477 		const struct __ucontext *sigcntxp;
478 	} */ *uap;
479 {
480 	ucontext_t uc;
481 	struct pcb *pcb;
482 	struct proc *p;
483 	struct trapframe *regs;
484 	ucontext_t *ucp;
485 	char *xfpustate;
486 	size_t xfpustate_len;
487 	long rflags;
488 	int cs, error, ret;
489 	ksiginfo_t ksi;
490 
491 	pcb = td->td_pcb;
492 	p = td->td_proc;
493 
494 	error = copyin(uap->sigcntxp, &uc, sizeof(uc));
495 	if (error != 0) {
496 		uprintf("pid %d (%s): sigreturn copyin failed\n",
497 		    p->p_pid, td->td_name);
498 		return (error);
499 	}
500 	ucp = &uc;
501 	if ((ucp->uc_mcontext.mc_flags & ~_MC_FLAG_MASK) != 0) {
502 		uprintf("pid %d (%s): sigreturn mc_flags %x\n", p->p_pid,
503 		    td->td_name, ucp->uc_mcontext.mc_flags);
504 		return (EINVAL);
505 	}
506 	regs = td->td_frame;
507 	rflags = ucp->uc_mcontext.mc_rflags;
508 	/*
509 	 * Don't allow users to change privileged or reserved flags.
510 	 */
511 	if (!EFL_SECURE(rflags, regs->tf_rflags)) {
512 		uprintf("pid %d (%s): sigreturn rflags = 0x%lx\n", p->p_pid,
513 		    td->td_name, rflags);
514 		return (EINVAL);
515 	}
516 
517 	/*
518 	 * Don't allow users to load a valid privileged %cs.  Let the
519 	 * hardware check for invalid selectors, excess privilege in
520 	 * other selectors, invalid %eip's and invalid %esp's.
521 	 */
522 	cs = ucp->uc_mcontext.mc_cs;
523 	if (!CS_SECURE(cs)) {
524 		uprintf("pid %d (%s): sigreturn cs = 0x%x\n", p->p_pid,
525 		    td->td_name, cs);
526 		ksiginfo_init_trap(&ksi);
527 		ksi.ksi_signo = SIGBUS;
528 		ksi.ksi_code = BUS_OBJERR;
529 		ksi.ksi_trapno = T_PROTFLT;
530 		ksi.ksi_addr = (void *)regs->tf_rip;
531 		trapsignal(td, &ksi);
532 		return (EINVAL);
533 	}
534 
535 	if ((uc.uc_mcontext.mc_flags & _MC_HASFPXSTATE) != 0) {
536 		xfpustate_len = uc.uc_mcontext.mc_xfpustate_len;
537 		if (xfpustate_len > cpu_max_ext_state_size -
538 		    sizeof(struct savefpu)) {
539 			uprintf("pid %d (%s): sigreturn xfpusave_len = 0x%zx\n",
540 			    p->p_pid, td->td_name, xfpustate_len);
541 			return (EINVAL);
542 		}
543 		xfpustate = __builtin_alloca(xfpustate_len);
544 		error = copyin((const void *)uc.uc_mcontext.mc_xfpustate,
545 		    xfpustate, xfpustate_len);
546 		if (error != 0) {
547 			uprintf(
548 	"pid %d (%s): sigreturn copying xfpustate failed\n",
549 			    p->p_pid, td->td_name);
550 			return (error);
551 		}
552 	} else {
553 		xfpustate = NULL;
554 		xfpustate_len = 0;
555 	}
556 	ret = set_fpcontext(td, &ucp->uc_mcontext, xfpustate, xfpustate_len);
557 	if (ret != 0) {
558 		uprintf("pid %d (%s): sigreturn set_fpcontext err %d\n",
559 		    p->p_pid, td->td_name, ret);
560 		return (ret);
561 	}
562 	bcopy(&ucp->uc_mcontext.mc_rdi, regs, sizeof(*regs));
563 	update_pcb_bases(pcb);
564 	pcb->pcb_fsbase = ucp->uc_mcontext.mc_fsbase;
565 	pcb->pcb_gsbase = ucp->uc_mcontext.mc_gsbase;
566 
567 #if defined(COMPAT_43)
568 	if (ucp->uc_mcontext.mc_onstack & 1)
569 		td->td_sigstk.ss_flags |= SS_ONSTACK;
570 	else
571 		td->td_sigstk.ss_flags &= ~SS_ONSTACK;
572 #endif
573 
574 	kern_sigprocmask(td, SIG_SETMASK, &ucp->uc_sigmask, NULL, 0);
575 	return (EJUSTRETURN);
576 }
577 
#ifdef COMPAT_FREEBSD4
/*
 * FreeBSD 4 compatibility entry point for sigreturn.  The argument
 * structure is reinterpreted as a struct sigreturn_args and the call is
 * forwarded unchanged to sys_sigreturn().
 */
int
freebsd4_sigreturn(struct thread *td, struct freebsd4_sigreturn_args *uap)
{
	struct sigreturn_args *args;

	args = (struct sigreturn_args *)uap;
	return (sys_sigreturn(td, args));
}
#endif
586 
587 /*
588  * Reset registers to default values on exec.
589  */
590 void
exec_setregs(struct thread * td,struct image_params * imgp,u_long stack)591 exec_setregs(struct thread *td, struct image_params *imgp, u_long stack)
592 {
593 	struct trapframe *regs;
594 	struct pcb *pcb;
595 	register_t saved_rflags;
596 
597 	regs = td->td_frame;
598 	pcb = td->td_pcb;
599 
600 	mtx_lock(&dt_lock);
601 	if (td->td_proc->p_md.md_ldt != NULL)
602 		user_ldt_free(td);
603 	else
604 		mtx_unlock(&dt_lock);
605 
606 	update_pcb_bases(pcb);
607 	pcb->pcb_fsbase = 0;
608 	pcb->pcb_gsbase = 0;
609 	clear_pcb_flags(pcb, PCB_32BIT);
610 	pcb->pcb_initial_fpucw = __INITIAL_FPUCW__;
611 
612 	saved_rflags = regs->tf_rflags & PSL_T;
613 	bzero((char *)regs, sizeof(struct trapframe));
614 	regs->tf_rip = imgp->entry_addr;
615 	regs->tf_rsp = ((stack - 8) & ~0xFul) + 8;
616 	regs->tf_rdi = stack;		/* argv */
617 	regs->tf_rflags = PSL_USER | saved_rflags;
618 	regs->tf_ss = _udatasel;
619 	regs->tf_cs = _ucodesel;
620 	regs->tf_ds = _udatasel;
621 	regs->tf_es = _udatasel;
622 	regs->tf_fs = _ufssel;
623 	regs->tf_gs = _ugssel;
624 	regs->tf_flags = TF_HASSEGS;
625 	td->td_retval[1] = 0;
626 
627 	/*
628 	 * Reset the hardware debug registers if they were in use.
629 	 * They won't have any meaning for the newly exec'd process.
630 	 */
631 	if (pcb->pcb_flags & PCB_DBREGS) {
632 		pcb->pcb_dr0 = 0;
633 		pcb->pcb_dr1 = 0;
634 		pcb->pcb_dr2 = 0;
635 		pcb->pcb_dr3 = 0;
636 		pcb->pcb_dr6 = 0;
637 		pcb->pcb_dr7 = 0;
638 		if (pcb == curpcb) {
639 			/*
640 			 * Clear the debug registers on the running
641 			 * CPU, otherwise they will end up affecting
642 			 * the next process we switch to.
643 			 */
644 			reset_dbregs();
645 		}
646 		clear_pcb_flags(pcb, PCB_DBREGS);
647 	}
648 
649 	/*
650 	 * Drop the FP state if we hold it, so that the process gets a
651 	 * clean FP state if it uses the FPU again.
652 	 */
653 	fpstate_drop(td);
654 }
655 
656 void
cpu_setregs(void)657 cpu_setregs(void)
658 {
659 	register_t cr0;
660 
661 	cr0 = rcr0();
662 	/*
663 	 * CR0_MP, CR0_NE and CR0_TS are also set by npx_probe() for the
664 	 * BSP.  See the comments there about why we set them.
665 	 */
666 	cr0 |= CR0_MP | CR0_NE | CR0_TS | CR0_WP | CR0_AM;
667 	load_cr0(cr0);
668 }
669 
670 /*
671  * Initialize amd64 and configure to run kernel
672  */
673 
674 /*
675  * Initialize segments & interrupt table
676  */
677 
678 struct user_segment_descriptor gdt[NGDT * MAXCPU];/* global descriptor tables */
679 static struct gate_descriptor idt0[NIDT];
680 struct gate_descriptor *idt = &idt0[0];	/* interrupt descriptor table */
681 
682 static char dblfault_stack[PAGE_SIZE] __aligned(16);
683 static char mce0_stack[PAGE_SIZE] __aligned(16);
684 static char nmi0_stack[PAGE_SIZE] __aligned(16);
685 static char dbg0_stack[PAGE_SIZE] __aligned(16);
686 CTASSERT(sizeof(struct nmi_pcpu) == 16);
687 
688 struct amd64tss common_tss[MAXCPU];
689 
690 /*
691  * Software prototypes -- in more palatable form.
692  *
693  * Keep GUFS32, GUGS32, GUCODE32 and GUDATA at the same
694  * slots as corresponding segments for i386 kernel.
695  */
696 struct soft_segment_descriptor gdt_segs[] = {
697 /* GNULL_SEL	0 Null Descriptor */
698 {	.ssd_base = 0x0,
699 	.ssd_limit = 0x0,
700 	.ssd_type = 0,
701 	.ssd_dpl = 0,
702 	.ssd_p = 0,
703 	.ssd_long = 0,
704 	.ssd_def32 = 0,
705 	.ssd_gran = 0		},
706 /* GNULL2_SEL	1 Null Descriptor */
707 {	.ssd_base = 0x0,
708 	.ssd_limit = 0x0,
709 	.ssd_type = 0,
710 	.ssd_dpl = 0,
711 	.ssd_p = 0,
712 	.ssd_long = 0,
713 	.ssd_def32 = 0,
714 	.ssd_gran = 0		},
715 /* GUFS32_SEL	2 32 bit %gs Descriptor for user */
716 {	.ssd_base = 0x0,
717 	.ssd_limit = 0xfffff,
718 	.ssd_type = SDT_MEMRWA,
719 	.ssd_dpl = SEL_UPL,
720 	.ssd_p = 1,
721 	.ssd_long = 0,
722 	.ssd_def32 = 1,
723 	.ssd_gran = 1		},
724 /* GUGS32_SEL	3 32 bit %fs Descriptor for user */
725 {	.ssd_base = 0x0,
726 	.ssd_limit = 0xfffff,
727 	.ssd_type = SDT_MEMRWA,
728 	.ssd_dpl = SEL_UPL,
729 	.ssd_p = 1,
730 	.ssd_long = 0,
731 	.ssd_def32 = 1,
732 	.ssd_gran = 1		},
733 /* GCODE_SEL	4 Code Descriptor for kernel */
734 {	.ssd_base = 0x0,
735 	.ssd_limit = 0xfffff,
736 	.ssd_type = SDT_MEMERA,
737 	.ssd_dpl = SEL_KPL,
738 	.ssd_p = 1,
739 	.ssd_long = 1,
740 	.ssd_def32 = 0,
741 	.ssd_gran = 1		},
742 /* GDATA_SEL	5 Data Descriptor for kernel */
743 {	.ssd_base = 0x0,
744 	.ssd_limit = 0xfffff,
745 	.ssd_type = SDT_MEMRWA,
746 	.ssd_dpl = SEL_KPL,
747 	.ssd_p = 1,
748 	.ssd_long = 1,
749 	.ssd_def32 = 0,
750 	.ssd_gran = 1		},
751 /* GUCODE32_SEL	6 32 bit Code Descriptor for user */
752 {	.ssd_base = 0x0,
753 	.ssd_limit = 0xfffff,
754 	.ssd_type = SDT_MEMERA,
755 	.ssd_dpl = SEL_UPL,
756 	.ssd_p = 1,
757 	.ssd_long = 0,
758 	.ssd_def32 = 1,
759 	.ssd_gran = 1		},
760 /* GUDATA_SEL	7 32/64 bit Data Descriptor for user */
761 {	.ssd_base = 0x0,
762 	.ssd_limit = 0xfffff,
763 	.ssd_type = SDT_MEMRWA,
764 	.ssd_dpl = SEL_UPL,
765 	.ssd_p = 1,
766 	.ssd_long = 0,
767 	.ssd_def32 = 1,
768 	.ssd_gran = 1		},
769 /* GUCODE_SEL	8 64 bit Code Descriptor for user */
770 {	.ssd_base = 0x0,
771 	.ssd_limit = 0xfffff,
772 	.ssd_type = SDT_MEMERA,
773 	.ssd_dpl = SEL_UPL,
774 	.ssd_p = 1,
775 	.ssd_long = 1,
776 	.ssd_def32 = 0,
777 	.ssd_gran = 1		},
778 /* GPROC0_SEL	9 Proc 0 Tss Descriptor */
779 {	.ssd_base = 0x0,
780 	.ssd_limit = sizeof(struct amd64tss) + IOPERM_BITMAP_SIZE - 1,
781 	.ssd_type = SDT_SYSTSS,
782 	.ssd_dpl = SEL_KPL,
783 	.ssd_p = 1,
784 	.ssd_long = 0,
785 	.ssd_def32 = 0,
786 	.ssd_gran = 0		},
787 /* Actually, the TSS is a system descriptor which is double size */
788 {	.ssd_base = 0x0,
789 	.ssd_limit = 0x0,
790 	.ssd_type = 0,
791 	.ssd_dpl = 0,
792 	.ssd_p = 0,
793 	.ssd_long = 0,
794 	.ssd_def32 = 0,
795 	.ssd_gran = 0		},
796 /* GUSERLDT_SEL	11 LDT Descriptor */
797 {	.ssd_base = 0x0,
798 	.ssd_limit = 0x0,
799 	.ssd_type = 0,
800 	.ssd_dpl = 0,
801 	.ssd_p = 0,
802 	.ssd_long = 0,
803 	.ssd_def32 = 0,
804 	.ssd_gran = 0		},
805 /* GUSERLDT_SEL	12 LDT Descriptor, double size */
806 {	.ssd_base = 0x0,
807 	.ssd_limit = 0x0,
808 	.ssd_type = 0,
809 	.ssd_dpl = 0,
810 	.ssd_p = 0,
811 	.ssd_long = 0,
812 	.ssd_def32 = 0,
813 	.ssd_gran = 0		},
814 };
815 
816 void
setidt(int idx,inthand_t * func,int typ,int dpl,int ist)817 setidt(int idx, inthand_t *func, int typ, int dpl, int ist)
818 {
819 	struct gate_descriptor *ip;
820 
821 	ip = idt + idx;
822 	ip->gd_looffset = (uintptr_t)func;
823 	ip->gd_selector = GSEL(GCODE_SEL, SEL_KPL);
824 	ip->gd_ist = ist;
825 	ip->gd_xx = 0;
826 	ip->gd_type = typ;
827 	ip->gd_dpl = dpl;
828 	ip->gd_p = 1;
829 	ip->gd_hioffset = ((uintptr_t)func)>>16 ;
830 }
831 
832 extern inthand_t
833 	IDTVEC(div), IDTVEC(dbg), IDTVEC(nmi), IDTVEC(bpt), IDTVEC(ofl),
834 	IDTVEC(bnd), IDTVEC(ill), IDTVEC(dna), IDTVEC(fpusegm),
835 	IDTVEC(tss), IDTVEC(missing), IDTVEC(stk), IDTVEC(prot),
836 	IDTVEC(page), IDTVEC(mchk), IDTVEC(rsvd), IDTVEC(fpu), IDTVEC(align),
837 	IDTVEC(xmm), IDTVEC(dblfault),
838 	IDTVEC(div_pti), IDTVEC(bpt_pti),
839 	IDTVEC(ofl_pti), IDTVEC(bnd_pti), IDTVEC(ill_pti), IDTVEC(dna_pti),
840 	IDTVEC(fpusegm_pti), IDTVEC(tss_pti), IDTVEC(missing_pti),
841 	IDTVEC(stk_pti), IDTVEC(prot_pti), IDTVEC(page_pti),
842 	IDTVEC(rsvd_pti), IDTVEC(fpu_pti), IDTVEC(align_pti),
843 	IDTVEC(xmm_pti),
844 #ifdef KDTRACE_HOOKS
845 	IDTVEC(dtrace_ret), IDTVEC(dtrace_ret_pti),
846 #endif
847 #ifdef XENHVM
848 	IDTVEC(xen_intr_upcall), IDTVEC(xen_intr_upcall_pti),
849 #endif
850 	IDTVEC(fast_syscall), IDTVEC(fast_syscall32),
851 	IDTVEC(fast_syscall_pti);
852 
853 #ifdef DDB
854 /*
855  * Display the index and function name of any IDT entries that don't use
856  * the default 'rsvd' entry point.
857  */
DB_SHOW_COMMAND(idt,db_show_idt)858 DB_SHOW_COMMAND(idt, db_show_idt)
859 {
860 	struct gate_descriptor *ip;
861 	int idx;
862 	uintptr_t func;
863 
864 	ip = idt;
865 	for (idx = 0; idx < NIDT && !db_pager_quit; idx++) {
866 		func = ((long)ip->gd_hioffset << 16 | ip->gd_looffset);
867 		if (func != (uintptr_t)&IDTVEC(rsvd)) {
868 			db_printf("%3d\t", idx);
869 			db_printsym(func, DB_STGY_PROC);
870 			db_printf("\n");
871 		}
872 		ip++;
873 	}
874 }
875 
876 /* Show privileged registers. */
DB_SHOW_COMMAND(sysregs,db_show_sysregs)877 DB_SHOW_COMMAND(sysregs, db_show_sysregs)
878 {
879 	struct {
880 		uint16_t limit;
881 		uint64_t base;
882 	} __packed idtr, gdtr;
883 	uint16_t ldt, tr;
884 
885 	__asm __volatile("sidt %0" : "=m" (idtr));
886 	db_printf("idtr\t0x%016lx/%04x\n",
887 	    (u_long)idtr.base, (u_int)idtr.limit);
888 	__asm __volatile("sgdt %0" : "=m" (gdtr));
889 	db_printf("gdtr\t0x%016lx/%04x\n",
890 	    (u_long)gdtr.base, (u_int)gdtr.limit);
891 	__asm __volatile("sldt %0" : "=r" (ldt));
892 	db_printf("ldtr\t0x%04x\n", ldt);
893 	__asm __volatile("str %0" : "=r" (tr));
894 	db_printf("tr\t0x%04x\n", tr);
895 	db_printf("cr0\t0x%016lx\n", rcr0());
896 	db_printf("cr2\t0x%016lx\n", rcr2());
897 	db_printf("cr3\t0x%016lx\n", rcr3());
898 	db_printf("cr4\t0x%016lx\n", rcr4());
899 	if (rcr4() & CR4_XSAVE)
900 		db_printf("xcr0\t0x%016lx\n", rxcr(0));
901 	db_printf("EFER\t0x%016lx\n", rdmsr(MSR_EFER));
902 	if (cpu_feature2 & (CPUID2_VMX | CPUID2_SMX))
903 		db_printf("FEATURES_CTL\t%016lx\n",
904 		    rdmsr(MSR_IA32_FEATURE_CONTROL));
905 	db_printf("DEBUG_CTL\t0x%016lx\n", rdmsr(MSR_DEBUGCTLMSR));
906 	db_printf("PAT\t0x%016lx\n", rdmsr(MSR_PAT));
907 	db_printf("GSBASE\t0x%016lx\n", rdmsr(MSR_GSBASE));
908 }
909 
/*
 * DDB "show dbregs": print the current values of the hardware debug
 * registers %dr0-%dr3, %dr6 and %dr7, one per line.
 */
DB_SHOW_COMMAND(dbregs, db_show_dbregs)
{

	/* Breakpoint address registers. */
	db_printf("dr0\t0x%016lx\n", rdr0());
	db_printf("dr1\t0x%016lx\n", rdr1());
	db_printf("dr2\t0x%016lx\n", rdr2());
	db_printf("dr3\t0x%016lx\n", rdr3());

	/* The remaining two debug registers. */
	db_printf("dr6\t0x%016lx\n", rdr6());
	db_printf("dr7\t0x%016lx\n", rdr7());
}
920 #endif
921 
922 void
sdtossd(sd,ssd)923 sdtossd(sd, ssd)
924 	struct user_segment_descriptor *sd;
925 	struct soft_segment_descriptor *ssd;
926 {
927 
928 	ssd->ssd_base  = (sd->sd_hibase << 24) | sd->sd_lobase;
929 	ssd->ssd_limit = (sd->sd_hilimit << 16) | sd->sd_lolimit;
930 	ssd->ssd_type  = sd->sd_type;
931 	ssd->ssd_dpl   = sd->sd_dpl;
932 	ssd->ssd_p     = sd->sd_p;
933 	ssd->ssd_long  = sd->sd_long;
934 	ssd->ssd_def32 = sd->sd_def32;
935 	ssd->ssd_gran  = sd->sd_gran;
936 }
937 
938 void
ssdtosd(ssd,sd)939 ssdtosd(ssd, sd)
940 	struct soft_segment_descriptor *ssd;
941 	struct user_segment_descriptor *sd;
942 {
943 
944 	sd->sd_lobase = (ssd->ssd_base) & 0xffffff;
945 	sd->sd_hibase = (ssd->ssd_base >> 24) & 0xff;
946 	sd->sd_lolimit = (ssd->ssd_limit) & 0xffff;
947 	sd->sd_hilimit = (ssd->ssd_limit >> 16) & 0xf;
948 	sd->sd_type  = ssd->ssd_type;
949 	sd->sd_dpl   = ssd->ssd_dpl;
950 	sd->sd_p     = ssd->ssd_p;
951 	sd->sd_long  = ssd->ssd_long;
952 	sd->sd_def32 = ssd->ssd_def32;
953 	sd->sd_gran  = ssd->ssd_gran;
954 }
955 
956 void
ssdtosyssd(ssd,sd)957 ssdtosyssd(ssd, sd)
958 	struct soft_segment_descriptor *ssd;
959 	struct system_segment_descriptor *sd;
960 {
961 
962 	sd->sd_lobase = (ssd->ssd_base) & 0xffffff;
963 	sd->sd_hibase = (ssd->ssd_base >> 24) & 0xfffffffffful;
964 	sd->sd_lolimit = (ssd->ssd_limit) & 0xffff;
965 	sd->sd_hilimit = (ssd->ssd_limit >> 16) & 0xf;
966 	sd->sd_type  = ssd->ssd_type;
967 	sd->sd_dpl   = ssd->ssd_dpl;
968 	sd->sd_p     = ssd->ssd_p;
969 	sd->sd_gran  = ssd->ssd_gran;
970 }
971 
972 #if !defined(DEV_ATPIC) && defined(DEV_ISA)
973 #include <isa/isavar.h>
974 #include <isa/isareg.h>
975 /*
976  * Return a bitmap of the current interrupt requests.  This is 8259-specific
977  * and is only suitable for use at probe time.
978  * This is only here to pacify sio.  It is NOT FATAL if this doesn't work.
979  * It shouldn't be here.  There should probably be an APIC centric
980  * implementation in the apic driver code, if at all.
981  */
982 intrmask_t
isa_irq_pending(void)983 isa_irq_pending(void)
984 {
985 	u_char irr1;
986 	u_char irr2;
987 
988 	irr1 = inb(IO_ICU1);
989 	irr2 = inb(IO_ICU2);
990 	return ((irr2 << 8) | irr1);
991 }
992 #endif
993 
994 u_int basemem;
995 
996 static int
add_physmap_entry(uint64_t base,uint64_t length,vm_paddr_t * physmap,int * physmap_idxp)997 add_physmap_entry(uint64_t base, uint64_t length, vm_paddr_t *physmap,
998     int *physmap_idxp)
999 {
1000 	int i, insert_idx, physmap_idx;
1001 
1002 	physmap_idx = *physmap_idxp;
1003 
1004 	if (length == 0)
1005 		return (1);
1006 
1007 	/*
1008 	 * Find insertion point while checking for overlap.  Start off by
1009 	 * assuming the new entry will be added to the end.
1010 	 *
1011 	 * NB: physmap_idx points to the next free slot.
1012 	 */
1013 	insert_idx = physmap_idx;
1014 	for (i = 0; i <= physmap_idx; i += 2) {
1015 		if (base < physmap[i + 1]) {
1016 			if (base + length <= physmap[i]) {
1017 				insert_idx = i;
1018 				break;
1019 			}
1020 			if (boothowto & RB_VERBOSE)
1021 				printf(
1022 		    "Overlapping memory regions, ignoring second region\n");
1023 			return (1);
1024 		}
1025 	}
1026 
1027 	/* See if we can prepend to the next entry. */
1028 	if (insert_idx <= physmap_idx && base + length == physmap[insert_idx]) {
1029 		physmap[insert_idx] = base;
1030 		return (1);
1031 	}
1032 
1033 	/* See if we can append to the previous entry. */
1034 	if (insert_idx > 0 && base == physmap[insert_idx - 1]) {
1035 		physmap[insert_idx - 1] += length;
1036 		return (1);
1037 	}
1038 
1039 	physmap_idx += 2;
1040 	*physmap_idxp = physmap_idx;
1041 	if (physmap_idx == PHYSMAP_SIZE) {
1042 		printf(
1043 		"Too many segments in the physical address map, giving up\n");
1044 		return (0);
1045 	}
1046 
1047 	/*
1048 	 * Move the last 'N' entries down to make room for the new
1049 	 * entry if needed.
1050 	 */
1051 	for (i = (physmap_idx - 2); i > insert_idx; i -= 2) {
1052 		physmap[i] = physmap[i - 2];
1053 		physmap[i + 1] = physmap[i - 1];
1054 	}
1055 
1056 	/* Insert the new entry. */
1057 	physmap[insert_idx] = base;
1058 	physmap[insert_idx + 1] = base + length;
1059 	return (1);
1060 }
1061 
1062 void
bios_add_smap_entries(struct bios_smap * smapbase,u_int32_t smapsize,vm_paddr_t * physmap,int * physmap_idx)1063 bios_add_smap_entries(struct bios_smap *smapbase, u_int32_t smapsize,
1064                       vm_paddr_t *physmap, int *physmap_idx)
1065 {
1066 	struct bios_smap *smap, *smapend;
1067 
1068 	smapend = (struct bios_smap *)((uintptr_t)smapbase + smapsize);
1069 
1070 	for (smap = smapbase; smap < smapend; smap++) {
1071 		if (boothowto & RB_VERBOSE)
1072 			printf("SMAP type=%02x base=%016lx len=%016lx\n",
1073 			    smap->type, smap->base, smap->length);
1074 
1075 		if (smap->type != SMAP_TYPE_MEMORY)
1076 			continue;
1077 
1078 		if (!add_physmap_entry(smap->base, smap->length, physmap,
1079 		    physmap_idx))
1080 			break;
1081 	}
1082 }
1083 
1084 static void
add_efi_map_entries(struct efi_map_header * efihdr,vm_paddr_t * physmap,int * physmap_idx)1085 add_efi_map_entries(struct efi_map_header *efihdr, vm_paddr_t *physmap,
1086     int *physmap_idx)
1087 {
1088 	struct efi_md *map, *p;
1089 	const char *type;
1090 	size_t efisz;
1091 	int ndesc, i;
1092 
1093 	static const char *types[] = {
1094 		"Reserved",
1095 		"LoaderCode",
1096 		"LoaderData",
1097 		"BootServicesCode",
1098 		"BootServicesData",
1099 		"RuntimeServicesCode",
1100 		"RuntimeServicesData",
1101 		"ConventionalMemory",
1102 		"UnusableMemory",
1103 		"ACPIReclaimMemory",
1104 		"ACPIMemoryNVS",
1105 		"MemoryMappedIO",
1106 		"MemoryMappedIOPortSpace",
1107 		"PalCode",
1108 		"PersistentMemory"
1109 	};
1110 
1111 	/*
1112 	 * Memory map data provided by UEFI via the GetMemoryMap
1113 	 * Boot Services API.
1114 	 */
1115 	efisz = (sizeof(struct efi_map_header) + 0xf) & ~0xf;
1116 	map = (struct efi_md *)((uint8_t *)efihdr + efisz);
1117 
1118 	if (efihdr->descriptor_size == 0)
1119 		return;
1120 	ndesc = efihdr->memory_size / efihdr->descriptor_size;
1121 
1122 	if (boothowto & RB_VERBOSE)
1123 		printf("%23s %12s %12s %8s %4s\n",
1124 		    "Type", "Physical", "Virtual", "#Pages", "Attr");
1125 
1126 	for (i = 0, p = map; i < ndesc; i++,
1127 	    p = efi_next_descriptor(p, efihdr->descriptor_size)) {
1128 		if (boothowto & RB_VERBOSE) {
1129 			if (p->md_type < nitems(types))
1130 				type = types[p->md_type];
1131 			else
1132 				type = "<INVALID>";
1133 			printf("%23s %012lx %12p %08lx ", type, p->md_phys,
1134 			    p->md_virt, p->md_pages);
1135 			if (p->md_attr & EFI_MD_ATTR_UC)
1136 				printf("UC ");
1137 			if (p->md_attr & EFI_MD_ATTR_WC)
1138 				printf("WC ");
1139 			if (p->md_attr & EFI_MD_ATTR_WT)
1140 				printf("WT ");
1141 			if (p->md_attr & EFI_MD_ATTR_WB)
1142 				printf("WB ");
1143 			if (p->md_attr & EFI_MD_ATTR_UCE)
1144 				printf("UCE ");
1145 			if (p->md_attr & EFI_MD_ATTR_WP)
1146 				printf("WP ");
1147 			if (p->md_attr & EFI_MD_ATTR_RP)
1148 				printf("RP ");
1149 			if (p->md_attr & EFI_MD_ATTR_XP)
1150 				printf("XP ");
1151 			if (p->md_attr & EFI_MD_ATTR_NV)
1152 				printf("NV ");
1153 			if (p->md_attr & EFI_MD_ATTR_MORE_RELIABLE)
1154 				printf("MORE_RELIABLE ");
1155 			if (p->md_attr & EFI_MD_ATTR_RO)
1156 				printf("RO ");
1157 			if (p->md_attr & EFI_MD_ATTR_RT)
1158 				printf("RUNTIME");
1159 			printf("\n");
1160 		}
1161 
1162 		switch (p->md_type) {
1163 		case EFI_MD_TYPE_CODE:
1164 		case EFI_MD_TYPE_DATA:
1165 		case EFI_MD_TYPE_BS_CODE:
1166 		case EFI_MD_TYPE_BS_DATA:
1167 		case EFI_MD_TYPE_FREE:
1168 			/*
1169 			 * We're allowed to use any entry with these types.
1170 			 */
1171 			break;
1172 		default:
1173 			continue;
1174 		}
1175 
1176 		if (!add_physmap_entry(p->md_phys, (p->md_pages * PAGE_SIZE),
1177 		    physmap, physmap_idx))
1178 			break;
1179 	}
1180 }
1181 
/*
 * Name of the firmware boot method, exported read-only as the
 * machdep.bootmethod sysctl.  native_parse_memmap() sets it to "UEFI" or
 * "BIOS" depending on which memory map the loader supplied.
 */
1182 static char bootmethod[16] = "";
1183 SYSCTL_STRING(_machdep, OID_AUTO, bootmethod, CTLFLAG_RD, bootmethod, 0,
1184     "System firmware boot method");
1185 
1186 static void
native_parse_memmap(caddr_t kmdp,vm_paddr_t * physmap,int * physmap_idx)1187 native_parse_memmap(caddr_t kmdp, vm_paddr_t *physmap, int *physmap_idx)
1188 {
1189 	struct bios_smap *smap;
1190 	struct efi_map_header *efihdr;
1191 	u_int32_t size;
1192 
1193 	/*
1194 	 * Memory map from INT 15:E820.
1195 	 *
1196 	 * subr_module.c says:
1197 	 * "Consumer may safely assume that size value precedes data."
1198 	 * ie: an int32_t immediately precedes smap.
1199 	 */
1200 
1201 	efihdr = (struct efi_map_header *)preload_search_info(kmdp,
1202 	    MODINFO_METADATA | MODINFOMD_EFI_MAP);
1203 	smap = (struct bios_smap *)preload_search_info(kmdp,
1204 	    MODINFO_METADATA | MODINFOMD_SMAP);
1205 	if (efihdr == NULL && smap == NULL)
1206 		panic("No BIOS smap or EFI map info from loader!");
1207 
1208 	if (efihdr != NULL) {
1209 		add_efi_map_entries(efihdr, physmap, physmap_idx);
1210 		strlcpy(bootmethod, "UEFI", sizeof(bootmethod));
1211 	} else {
1212 		size = *((u_int32_t *)smap - 1);
1213 		bios_add_smap_entries(smap, size, physmap, physmap_idx);
1214 		strlcpy(bootmethod, "BIOS", sizeof(bootmethod));
1215 	}
1216 }
1217 
/*
 * Number of PAGE_SIZE pages in one gigabyte.  getmemsize() uses it to print
 * one progress dot per gigabyte while the boot memory test runs.
 */
1218 #define	PAGES_PER_GB	(1024 * 1024 * 1024 / PAGE_SIZE)
1219 
1220 /*
1221  * Populate the (physmap) array with base/bound pairs describing the
1222  * available physical memory in the system, then test this memory and
1223  * build the phys_avail array describing the actually-available memory.
1224  *
1225  * Total memory size may be set by the kernel environment variable
1226  * hw.physmem or the compile-time define MAXMEM.
1227  *
1228  * XXX first should be vm_paddr_t.
1229  */
1230 static void
getmemsize(caddr_t kmdp,u_int64_t first)1231 getmemsize(caddr_t kmdp, u_int64_t first)
1232 {
	/*
	 * kmdp:  preload metadata handle, handed to init_ops.parse_memmap().
	 * first: upper bound of the physical range [kernphys, first) that
	 *        is kept out of phys_avail; its address is passed to
	 *        pmap_bootstrap(), so it may be updated there.
	 *
	 * Side effects visible in this function: sets basemem, Maxmem,
	 * physmem, msgbufp and fills phys_avail[] and dump_avail[].
	 */
1233 	int i, physmap_idx, pa_indx, da_indx;
1234 	vm_paddr_t pa, physmap[PHYSMAP_SIZE];
1235 	u_long physmem_start, physmem_tunable, memtest;
1236 	pt_entry_t *pte;
1237 	quad_t dcons_addr, dcons_size;
1238 	int page_counter;
1239 
1240 	/*
1241 	 * Tell the physical memory allocator about pages used to store
1242 	 * the kernel and preloaded data.  See kmem_bootstrap_free().
1243 	 */
1244 	vm_phys_add_seg((vm_paddr_t)kernphys, trunc_page(first));
1245 
1246 	bzero(physmap, sizeof(physmap));
1247 	physmap_idx = 0;
1248 
1249 	init_ops.parse_memmap(kmdp, physmap, &physmap_idx);
	/*
	 * add_physmap_entry() keeps the index at the next free slot; step
	 * back one pair so that physmap_idx names the last populated
	 * base/bound pair for the rest of this function.
	 */
1250 	physmap_idx -= 2;
1251 
1252 	/*
1253 	 * Find the 'base memory' segment for SMP
1254 	 */
1255 	basemem = 0;
1256 	for (i = 0; i <= physmap_idx; i += 2) {
1257 		if (physmap[i] <= 0xA0000) {
1258 			basemem = physmap[i + 1] / 1024;
1259 			break;
1260 		}
1261 	}
	/* NOTE(review): the message below has no trailing newline. */
1262 	if (basemem == 0 || basemem > 640) {
1263 		if (bootverbose)
1264 			printf(
1265 		"Memory map doesn't contain a basemem segment, faking it");
1266 		basemem = 640;
1267 	}
1268 
1269 	/*
1270 	 * Make hole for "AP -> long mode" bootstrap code.  The
1271 	 * mp_bootaddress vector is only available when the kernel
1272 	 * is configured to support APs and APs for the system start
1273 	 * in 32bit mode (e.g. SMP bare metal).
1274 	 */
1275 	if (init_ops.mp_bootaddress) {
1276 		if (physmap[1] >= 0x100000000)
1277 			panic(
1278 	"Basemem segment is not suitable for AP bootstrap code!");
1279 		physmap[1] = init_ops.mp_bootaddress(physmap[1] / 1024);
1280 	}
1281 
1282 	/*
1283 	 * Maxmem isn't the "maximum memory", it's one larger than the
1284 	 * highest page of the physical address space.  It should be
1285 	 * called something like "Maxphyspage".  We may adjust this
1286 	 * based on ``hw.physmem'' and the results of the memory test.
1287 	 */
1288 	Maxmem = atop(physmap[physmap_idx + 1]);
1289 
	/* Presumably MAXMEM is in kilobytes, hence / 4 for pages -- confirm. */
1290 #ifdef MAXMEM
1291 	Maxmem = MAXMEM / 4;
1292 #endif
1293 
1294 	if (TUNABLE_ULONG_FETCH("hw.physmem", &physmem_tunable))
1295 		Maxmem = atop(physmem_tunable);
1296 
1297 	/*
1298 	 * The boot memory test is disabled by default, as it takes a
1299 	 * significant amount of time on large-memory systems, and is
1300 	 * unfriendly to virtual machines as it unnecessarily touches all
1301 	 * pages.
1302 	 *
1303 	 * A general name is used as the code may be extended to support
1304 	 * additional tests beyond the current "page present" test.
1305 	 */
1306 	memtest = 0;
1307 	TUNABLE_ULONG_FETCH("hw.memtest.tests", &memtest);
1308 
1309 	/*
1310 	 * Don't allow MAXMEM or hw.physmem to extend the amount of memory
1311 	 * in the system.
1312 	 */
1313 	if (Maxmem > atop(physmap[physmap_idx + 1]))
1314 		Maxmem = atop(physmap[physmap_idx + 1]);
1315 
1316 	if (atop(physmap[physmap_idx + 1]) != Maxmem &&
1317 	    (boothowto & RB_VERBOSE))
1318 		printf("Physical memory use set to %ldK\n", Maxmem * 4);
1319 
1320 	/* call pmap initialization to make new kernel address space */
1321 	pmap_bootstrap(&first);
1322 
1323 	/*
1324 	 * Size up each available chunk of physical memory.
1325 	 *
1326 	 * XXX Some BIOSes corrupt low 64KB between suspend and resume.
1327 	 * By default, mask off the first 16 pages unless we appear to be
1328 	 * running in a VM.
1329 	 */
1330 	physmem_start = (vm_guest > VM_GUEST_NO ? 1 : 16) << PAGE_SHIFT;
1331 	TUNABLE_ULONG_FETCH("hw.physmem.start", &physmem_start);
	/*
	 * Raise the start of the first segment to physmem_start, but never
	 * below PAGE_SIZE and never to the end of the segment or beyond.
	 */
1332 	if (physmap[0] < physmem_start) {
1333 		if (physmem_start < PAGE_SIZE)
1334 			physmap[0] = PAGE_SIZE;
1335 		else if (physmem_start >= physmap[1])
1336 			physmap[0] = round_page(physmap[1] - PAGE_SIZE);
1337 		else
1338 			physmap[0] = round_page(physmem_start);
1339 	}
	/*
	 * Seed phys_avail[0..1] with an empty range at physmap[0] and give
	 * dump_avail[1] the same end address; the page loop below extends
	 * these ends one page at a time.
	 */
1340 	pa_indx = 0;
1341 	da_indx = 1;
1342 	phys_avail[pa_indx++] = physmap[0];
1343 	phys_avail[pa_indx] = physmap[0];
1344 	dump_avail[da_indx] = physmap[0];
1345 	pte = CMAP1;
1346 
1347 	/*
1348 	 * Get dcons buffer address
1349 	 */
1350 	if (getenv_quad("dcons.addr", &dcons_addr) == 0 ||
1351 	    getenv_quad("dcons.size", &dcons_size) == 0)
1352 		dcons_addr = 0;
1353 
1354 	/*
1355 	 * physmap is in bytes, so when converting to page boundaries,
1356 	 * round up the start address and round down the end address.
1357 	 */
1358 	page_counter = 0;
1359 	if (memtest != 0)
1360 		printf("Testing system memory");
1361 	for (i = 0; i <= physmap_idx; i += 2) {
1362 		vm_paddr_t end;
1363 
		/* Clamp each segment to the Maxmem limit. */
1364 		end = ptoa((vm_paddr_t)Maxmem);
1365 		if (physmap[i + 1] < end)
1366 			end = trunc_page(physmap[i + 1]);
1367 		for (pa = round_page(physmap[i]); pa < end; pa += PAGE_SIZE) {
1368 			int tmp, page_bad, full;
1369 			int *ptr = (int *)CADDR1;
1370 
1371 			full = FALSE;
1372 			/*
1373 			 * block out kernel memory as not available.
1374 			 */
1375 			if (pa >= (vm_paddr_t)kernphys && pa < first)
1376 				goto do_dump_avail;
1377 
1378 			/*
1379 			 * block out dcons buffer
1380 			 */
1381 			if (dcons_addr > 0
1382 			    && pa >= trunc_page(dcons_addr)
1383 			    && pa < dcons_addr + dcons_size)
1384 				goto do_dump_avail;
1385 
1386 			page_bad = FALSE;
1387 			if (memtest == 0)
1388 				goto skip_memtest;
1389 
1390 			/*
1391 			 * Print a "." every GB to show we're making
1392 			 * progress.
1393 			 */
1394 			page_counter++;
1395 			if ((page_counter % PAGES_PER_GB) == 0)
1396 				printf(".");
1397 
1398 			/*
1399 			 * map page into kernel: valid, read/write,non-cacheable
1400 			 */
1401 			*pte = pa | PG_V | PG_RW | PG_NC_PWT | PG_NC_PCD;
1402 			invltlb();
1403 
			/* Save the first word so it can be restored afterwards. */
1404 			tmp = *(int *)ptr;
1405 			/*
1406 			 * Test for alternating 1's and 0's
1407 			 */
1408 			*(volatile int *)ptr = 0xaaaaaaaa;
1409 			if (*(volatile int *)ptr != 0xaaaaaaaa)
1410 				page_bad = TRUE;
1411 			/*
1412 			 * Test for alternating 0's and 1's
1413 			 */
1414 			*(volatile int *)ptr = 0x55555555;
1415 			if (*(volatile int *)ptr != 0x55555555)
1416 				page_bad = TRUE;
1417 			/*
1418 			 * Test for all 1's
1419 			 */
1420 			*(volatile int *)ptr = 0xffffffff;
1421 			if (*(volatile int *)ptr != 0xffffffff)
1422 				page_bad = TRUE;
1423 			/*
1424 			 * Test for all 0's
1425 			 */
1426 			*(volatile int *)ptr = 0x0;
1427 			if (*(volatile int *)ptr != 0x0)
1428 				page_bad = TRUE;
1429 			/*
1430 			 * Restore original value.
1431 			 */
1432 			*(int *)ptr = tmp;
1433 
1434 skip_memtest:
1435 			/*
1436 			 * Adjust array of valid/good pages.
1437 			 */
			/* A bad page is left out of both phys_avail and dump_avail. */
1438 			if (page_bad == TRUE)
1439 				continue;
1440 			/*
1441 			 * If this good page is a continuation of the
1442 			 * previous set of good pages, then just increase
1443 			 * the end pointer. Otherwise start a new chunk.
1444 			 * Note that "end" points one higher than end,
1445 			 * making the range >= start and < end.
1446 			 * If we're also doing a speculative memory
1447 			 * test and we at or past the end, bump up Maxmem
1448 			 * so that we keep going. The first bad page
1449 			 * will terminate the loop.
1450 			 */
1451 			if (phys_avail[pa_indx] == pa) {
1452 				phys_avail[pa_indx] += PAGE_SIZE;
1453 			} else {
1454 				pa_indx++;
1455 				if (pa_indx == PHYS_AVAIL_ARRAY_END) {
1456 					printf(
1457 		"Too many holes in the physical address space, giving up\n");
1458 					pa_indx--;
1459 					full = TRUE;
1460 					goto do_dump_avail;
1461 				}
1462 				phys_avail[pa_indx++] = pa;	/* start */
1463 				phys_avail[pa_indx] = pa + PAGE_SIZE; /* end */
1464 			}
1465 			physmem++;
			/*
			 * dump_avail is extended for every page that reaches
			 * this label, including the kernel and dcons pages
			 * that were kept out of phys_avail above.
			 */
1466 do_dump_avail:
1467 			if (dump_avail[da_indx] == pa) {
1468 				dump_avail[da_indx] += PAGE_SIZE;
1469 			} else {
1470 				da_indx++;
1471 				if (da_indx == DUMP_AVAIL_ARRAY_END) {
1472 					da_indx--;
1473 					goto do_next;
1474 				}
1475 				dump_avail[da_indx++] = pa; /* start */
1476 				dump_avail[da_indx] = pa + PAGE_SIZE; /* end */
1477 			}
			/*
			 * Once phys_avail is full, leave the page loop; the
			 * outer segment loop still continues.
			 */
1478 do_next:
1479 			if (full)
1480 				break;
1481 		}
1482 	}
	/* Clear the temporary test mapping and flush the TLB. */
1483 	*pte = 0;
1484 	invltlb();
1485 	if (memtest != 0)
1486 		printf("\n");
1487 
1488 	/*
1489 	 * XXX
1490 	 * The last chunk must contain at least one page plus the message
1491 	 * buffer to avoid complicating other code (message buffer address
1492 	 * calculation, etc.).
1493 	 */
	/* Drop trailing chunks that are too small and deduct them from physmem. */
1494 	while (phys_avail[pa_indx - 1] + PAGE_SIZE +
1495 	    round_page(msgbufsize) >= phys_avail[pa_indx]) {
1496 		physmem -= atop(phys_avail[pa_indx] - phys_avail[pa_indx - 1]);
1497 		phys_avail[pa_indx--] = 0;
1498 		phys_avail[pa_indx--] = 0;
1499 	}
1500 
1501 	Maxmem = atop(phys_avail[pa_indx]);
1502 
1503 	/* Trim off space for the message buffer. */
1504 	phys_avail[pa_indx] -= round_page(msgbufsize);
1505 
1506 	/* Map the message buffer. */
1507 	msgbufp = (struct msgbuf *)PHYS_TO_DMAP(phys_avail[pa_indx]);
1508 }
1509 
1510 static caddr_t
native_parse_preload_data(u_int64_t modulep)1511 native_parse_preload_data(u_int64_t modulep)
1512 {
1513 	caddr_t kmdp;
1514 	char *envp;
1515 #ifdef DDB
1516 	vm_offset_t ksym_start;
1517 	vm_offset_t ksym_end;
1518 #endif
1519 
1520 	preload_metadata = (caddr_t)(uintptr_t)(modulep + KERNBASE);
1521 	preload_bootstrap_relocate(KERNBASE);
1522 	kmdp = preload_search_by_type("elf kernel");
1523 	if (kmdp == NULL)
1524 		kmdp = preload_search_by_type("elf64 kernel");
1525 	boothowto = MD_FETCH(kmdp, MODINFOMD_HOWTO, int);
1526 	envp = MD_FETCH(kmdp, MODINFOMD_ENVP, char *);
1527 	if (envp != NULL)
1528 		envp += KERNBASE;
1529 	init_static_kenv(envp, 0);
1530 #ifdef DDB
1531 	ksym_start = MD_FETCH(kmdp, MODINFOMD_SSYM, uintptr_t);
1532 	ksym_end = MD_FETCH(kmdp, MODINFOMD_ESYM, uintptr_t);
1533 	db_fetch_ksymtab(ksym_start, ksym_end);
1534 #endif
1535 	efi_systbl_phys = MD_FETCH(kmdp, MODINFOMD_FW_HANDLE, vm_paddr_t);
1536 
1537 	return (kmdp);
1538 }
1539 
/*
 * Call kdb_init() and, on KDB kernels, enter the debugger straight away
 * when RB_KDB is set in the boot flags.
 */
static void
amd64_kdb_init(void)
{

	kdb_init();
#ifdef KDB
	if ((boothowto & RB_KDB) != 0) {
		kdb_enter(KDB_WHY_BOOTFLAGS, "Boot flags requested debugger");
	}
#endif
}
1549 
1550 /* Set up the fast syscall stuff */
1551 void
amd64_conf_fast_syscall(void)1552 amd64_conf_fast_syscall(void)
1553 {
1554 	uint64_t msr;
1555 
1556 	msr = rdmsr(MSR_EFER) | EFER_SCE;
1557 	wrmsr(MSR_EFER, msr);
1558 	wrmsr(MSR_LSTAR, pti ? (u_int64_t)IDTVEC(fast_syscall_pti) :
1559 	    (u_int64_t)IDTVEC(fast_syscall));
1560 	wrmsr(MSR_CSTAR, (u_int64_t)IDTVEC(fast_syscall32));
1561 	msr = ((u_int64_t)GSEL(GCODE_SEL, SEL_KPL) << 32) |
1562 	    ((u_int64_t)GSEL(GUCODE32_SEL, SEL_UPL) << 48);
1563 	wrmsr(MSR_STAR, msr);
1564 	wrmsr(MSR_SF_MASK, PSL_NT | PSL_T | PSL_I | PSL_C | PSL_D);
1565 }
1566 
1567 u_int64_t
hammer_time(u_int64_t modulep,u_int64_t physfree)1568 hammer_time(u_int64_t modulep, u_int64_t physfree)
1569 {
	/*
	 * Early machine-dependent initialization: parses the preload data,
	 * builds the GDT, IDT and TSS for CPU 0, sets up thread0 and its
	 * PCB, sizes physical memory via getmemsize() and initializes the
	 * console.
	 *
	 * modulep:  handed to init_ops.parse_preload_data().
	 * physfree: physical address of free memory; advanced below past
	 *           the microcode, the thread0 kernel stack and the dynamic
	 *           per-CPU area, then given to getmemsize().
	 *
	 * Returns the address of thread0's PCB (see the comment at the
	 * return statement).
	 */
1570 	caddr_t kmdp;
1571 	int gsel_tss, x;
1572 	struct pcpu *pc;
1573 	struct nmi_pcpu *np;
1574 	struct xstate_hdr *xhdr;
1575 	u_int64_t rsp0;
1576 	char *env;
1577 	size_t kstack0_sz;
1578 	int late_console;
1579 
1580 	kmdp = init_ops.parse_preload_data(modulep);
1581 
1582 	physfree += ucode_load_bsp(physfree + KERNBASE);
1583 	physfree = roundup2(physfree, PAGE_SIZE);
1584 
1585 	identify_cpu1();
1586 	identify_hypervisor();
1587 	/*
1588 	 * hw.cpu_stdext_disable is ignored by the call, it will be
1589 	 * re-evaluated by the below call to finishidentcpu().
1590 	 */
1591 	identify_cpu2();
1592 
1593 	link_elf_ireloc(kmdp);
1594 
1595 	/*
1596 	 * This may be done better later if it gets more high level
1597 	 * components in it. If so just link td->td_proc here.
1598 	 */
1599 	proc_linkup0(&proc0, &thread0);
1600 
1601 	/* Init basic tunables, hz etc */
1602 	init_param1();
1603 
	/* Carve the zeroed thread0 kernel stack out of physfree. */
1604 	thread0.td_kstack = physfree + KERNBASE;
1605 	thread0.td_kstack_pages = kstack_pages;
1606 	kstack0_sz = thread0.td_kstack_pages * PAGE_SIZE;
1607 	bzero((void *)thread0.td_kstack, kstack0_sz);
1608 	physfree += kstack0_sz;
1609 
1610 	/*
1611 	 * make gdt memory segments
1612 	 */
	/*
	 * The GPROC0_SEL and GUSERLDT_SEL slots (two entries each) are
	 * skipped by this loop; the GPROC0_SEL one is filled in right after
	 * it with ssdtosyssd().
	 */
1613 	for (x = 0; x < NGDT; x++) {
1614 		if (x != GPROC0_SEL && x != (GPROC0_SEL + 1) &&
1615 		    x != GUSERLDT_SEL && x != (GUSERLDT_SEL) + 1)
1616 			ssdtosd(&gdt_segs[x], &gdt[x]);
1617 	}
1618 	gdt_segs[GPROC0_SEL].ssd_base = (uintptr_t)&common_tss[0];
1619 	ssdtosyssd(&gdt_segs[GPROC0_SEL],
1620 	    (struct system_segment_descriptor *)&gdt[GPROC0_SEL]);
1621 
1622 	r_gdt.rd_limit = NGDT * sizeof(gdt[0]) - 1;
1623 	r_gdt.rd_base =  (long) gdt;
1624 	lgdt(&r_gdt);
1625 	pc = &__pcpu[0];
1626 
1627 	wrmsr(MSR_FSBASE, 0);		/* User value */
1628 	wrmsr(MSR_GSBASE, (u_int64_t)pc);
1629 	wrmsr(MSR_KGSBASE, 0);		/* User value while in the kernel */
1630 
1631 	pcpu_init(pc, 0, sizeof(struct pcpu));
1632 	dpcpu_init((void *)(physfree + KERNBASE), 0);
1633 	physfree += DPCPU_SIZE;
1634 	PCPU_SET(prvspace, pc);
1635 	PCPU_SET(curthread, &thread0);
1636 	/* Non-late cninit() and printf() can be moved up to here. */
1637 	PCPU_SET(tssp, &common_tss[0]);
1638 	PCPU_SET(commontssp, &common_tss[0]);
1639 	PCPU_SET(tss, (struct system_segment_descriptor *)&gdt[GPROC0_SEL]);
1640 	PCPU_SET(ldt, (struct system_segment_descriptor *)&gdt[GUSERLDT_SEL]);
1641 	PCPU_SET(fs32p, &gdt[GUFS32_SEL]);
1642 	PCPU_SET(gs32p, &gdt[GUGS32_SEL]);
1643 
1644 	/*
1645 	 * Initialize mutexes.
1646 	 *
1647 	 * icu_lock: in order to allow an interrupt to occur in a critical
1648 	 * 	     section, to set pcpu->ipending (etc...) properly, we
1649 	 *	     must be able to get the icu lock, so it can't be
1650 	 *	     under witness.
1651 	 */
1652 	mutex_init();
1653 	mtx_init(&icu_lock, "icu", NULL, MTX_SPIN | MTX_NOWITNESS);
1654 	mtx_init(&dt_lock, "descriptor tables", NULL, MTX_DEF);
1655 
1656 	/* exceptions */
1657 	pti = pti_get_default();
1658 	TUNABLE_INT_FETCH("vm.pmap.pti", &pti);
1659 
	/*
	 * Point every vector at the reserved handler first, then override
	 * the individual exceptions.  The *_pti entry points are chosen
	 * when pti is set.  The non-zero final arguments (1-4) line up with
	 * the tss_ist1..tss_ist4 stacks configured further down.
	 */
1660 	for (x = 0; x < NIDT; x++)
1661 		setidt(x, pti ? &IDTVEC(rsvd_pti) : &IDTVEC(rsvd), SDT_SYSIGT,
1662 		    SEL_KPL, 0);
1663 	setidt(IDT_DE, pti ? &IDTVEC(div_pti) : &IDTVEC(div), SDT_SYSIGT,
1664 	    SEL_KPL, 0);
1665 	setidt(IDT_DB, &IDTVEC(dbg), SDT_SYSIGT, SEL_KPL, 4);
1666 	setidt(IDT_NMI, &IDTVEC(nmi),  SDT_SYSIGT, SEL_KPL, 2);
1667 	setidt(IDT_BP, pti ? &IDTVEC(bpt_pti) : &IDTVEC(bpt), SDT_SYSIGT,
1668 	    SEL_UPL, 0);
1669 	setidt(IDT_OF, pti ? &IDTVEC(ofl_pti) : &IDTVEC(ofl), SDT_SYSIGT,
1670 	    SEL_UPL, 0);
1671 	setidt(IDT_BR, pti ? &IDTVEC(bnd_pti) : &IDTVEC(bnd), SDT_SYSIGT,
1672 	    SEL_KPL, 0);
1673 	setidt(IDT_UD, pti ? &IDTVEC(ill_pti) : &IDTVEC(ill), SDT_SYSIGT,
1674 	    SEL_KPL, 0);
1675 	setidt(IDT_NM, pti ? &IDTVEC(dna_pti) : &IDTVEC(dna), SDT_SYSIGT,
1676 	    SEL_KPL, 0);
1677 	setidt(IDT_DF, &IDTVEC(dblfault), SDT_SYSIGT, SEL_KPL, 1);
1678 	setidt(IDT_FPUGP, pti ? &IDTVEC(fpusegm_pti) : &IDTVEC(fpusegm),
1679 	    SDT_SYSIGT, SEL_KPL, 0);
1680 	setidt(IDT_TS, pti ? &IDTVEC(tss_pti) : &IDTVEC(tss), SDT_SYSIGT,
1681 	    SEL_KPL, 0);
1682 	setidt(IDT_NP, pti ? &IDTVEC(missing_pti) : &IDTVEC(missing),
1683 	    SDT_SYSIGT, SEL_KPL, 0);
1684 	setidt(IDT_SS, pti ? &IDTVEC(stk_pti) : &IDTVEC(stk), SDT_SYSIGT,
1685 	    SEL_KPL, 0);
1686 	setidt(IDT_GP, pti ? &IDTVEC(prot_pti) : &IDTVEC(prot), SDT_SYSIGT,
1687 	    SEL_KPL, 0);
1688 	setidt(IDT_PF, pti ? &IDTVEC(page_pti) : &IDTVEC(page), SDT_SYSIGT,
1689 	    SEL_KPL, 0);
1690 	setidt(IDT_MF, pti ? &IDTVEC(fpu_pti) : &IDTVEC(fpu), SDT_SYSIGT,
1691 	    SEL_KPL, 0);
1692 	setidt(IDT_AC, pti ? &IDTVEC(align_pti) : &IDTVEC(align), SDT_SYSIGT,
1693 	    SEL_KPL, 0);
1694 	setidt(IDT_MC, &IDTVEC(mchk), SDT_SYSIGT, SEL_KPL, 3);
1695 	setidt(IDT_XF, pti ? &IDTVEC(xmm_pti) : &IDTVEC(xmm), SDT_SYSIGT,
1696 	    SEL_KPL, 0);
1697 #ifdef KDTRACE_HOOKS
1698 	setidt(IDT_DTRACE_RET, pti ? &IDTVEC(dtrace_ret_pti) :
1699 	    &IDTVEC(dtrace_ret), SDT_SYSIGT, SEL_UPL, 0);
1700 #endif
1701 #ifdef XENHVM
1702 	setidt(IDT_EVTCHN, pti ? &IDTVEC(xen_intr_upcall_pti) :
1703 	    &IDTVEC(xen_intr_upcall), SDT_SYSIGT, SEL_KPL, 0);
1704 #endif
1705 	r_idt.rd_limit = sizeof(idt0) - 1;
1706 	r_idt.rd_base = (long) idt;
1707 	lidt(&r_idt);
1708 
1709 	/*
1710 	 * Initialize the clock before the console so that console
1711 	 * initialization can use DELAY().
1712 	 */
1713 	clock_init();
1714 
1715 	/*
1716 	 * Use vt(4) by default for UEFI boot (during the sc(4)/vt(4)
1717 	 * transition).
1718 	 * Once bootblocks have updated, we can test directly for
1719 	 * efi_systbl != NULL here...
1720 	 */
1721 	if (preload_search_info(kmdp, MODINFO_METADATA | MODINFOMD_EFI_MAP)
1722 	    != NULL)
1723 		vty_set_preferred(VTY_VT);
1724 
1725 	TUNABLE_INT_FETCH("hw.ibrs_disable", &hw_ibrs_disable);
1726 	TUNABLE_INT_FETCH("hw.spec_store_bypass_disable", &hw_ssb_disable);
1727 	TUNABLE_INT_FETCH("hw.mds_disable", &hw_mds_disable);
1728 	TUNABLE_INT_FETCH("machdep.mitigations.taa.enable", &x86_taa_enable);
1729 
	/*
	 * NOTE(review): the tunable name is spelled "rndgs" while the
	 * variable is x86_rngds_mitg_enable -- confirm which spelling is
	 * intended before changing either.
	 */
1730 	TUNABLE_INT_FETCH("machdep.mitigations.rndgs.enable",
1731 	    &x86_rngds_mitg_enable);
1732 
1733 	finishidentcpu();	/* Final stage of CPU initialization */
1734 	initializecpu();	/* Initialize CPU registers */
1735 	initializecpucache();
1736 
1737 	/* doublefault stack space, runs on ist1 */
1738 	common_tss[0].tss_ist1 = (long)&dblfault_stack[sizeof(dblfault_stack)];
1739 
1740 	/*
1741 	 * NMI stack, runs on ist2.  The pcpu pointer is stored just
1742 	 * above the start of the ist2 stack.
1743 	 */
1744 	np = ((struct nmi_pcpu *) &nmi0_stack[sizeof(nmi0_stack)]) - 1;
1745 	np->np_pcpu = (register_t) pc;
1746 	common_tss[0].tss_ist2 = (long) np;
1747 
1748 	/*
1749 	 * MC# stack, runs on ist3.  The pcpu pointer is stored just
1750 	 * above the start of the ist3 stack.
1751 	 */
1752 	np = ((struct nmi_pcpu *) &mce0_stack[sizeof(mce0_stack)]) - 1;
1753 	np->np_pcpu = (register_t) pc;
1754 	common_tss[0].tss_ist3 = (long) np;
1755 
1756 	/*
1757 	 * DB# stack, runs on ist4.
1758 	 */
1759 	np = ((struct nmi_pcpu *) &dbg0_stack[sizeof(dbg0_stack)]) - 1;
1760 	np->np_pcpu = (register_t) pc;
1761 	common_tss[0].tss_ist4 = (long) np;
1762 
1763 	/* Set the IO permission bitmap (empty due to tss seg limit) */
1764 	common_tss[0].tss_iobase = sizeof(struct amd64tss) + IOPERM_BITMAP_SIZE;
1765 
1766 	gsel_tss = GSEL(GPROC0_SEL, SEL_KPL);
1767 	ltr(gsel_tss);
1768 
1769 	amd64_conf_fast_syscall();
1770 
1771 	/*
1772 	 * Temporarily forge some valid pointer to PCB, for exception
1773 	 * handlers.  It is reinitialized properly below after FPU is
1774 	 * set up.  Also set up td_critnest to short-cut the page
1775 	 * fault handler.
1776 	 */
1777 	cpu_max_ext_state_size = sizeof(struct savefpu);
1778 	thread0.td_pcb = get_pcb_td(&thread0);
1779 	thread0.td_critnest = 1;
1780 
1781 	/*
1782 	 * The console and kdb should be initialized even earlier than here,
1783 	 * but some console drivers don't work until after getmemsize().
1784 	 * Default to late console initialization to support these drivers.
1785 	 * This loses mainly printf()s in getmemsize() and early debugging.
1786 	 */
1787 	late_console = 1;
1788 	TUNABLE_INT_FETCH("debug.late_console", &late_console);
1789 	if (!late_console) {
1790 		cninit();
1791 		amd64_kdb_init();
1792 	}
1793 
1794 	getmemsize(kmdp, physfree);
1795 	init_param2(physmem);
1796 
1797 	/* now running on new page tables, configured,and u/iom is accessible */
1798 
1799 	if (late_console)
1800 		cninit();
1801 
1802 #ifdef DEV_ISA
1803 #ifdef DEV_ATPIC
1804 	elcr_probe();
1805 	atpic_startup();
1806 #else
1807 	/* Reset and mask the atpics and leave them shut down. */
1808 	atpic_reset();
1809 
1810 	/*
1811 	 * Point the ICU spurious interrupt vectors at the APIC spurious
1812 	 * interrupt handler.
1813 	 */
1814 	setidt(IDT_IO_INTS + 7, IDTVEC(spuriousint), SDT_SYSIGT, SEL_KPL, 0);
1815 	setidt(IDT_IO_INTS + 15, IDTVEC(spuriousint), SDT_SYSIGT, SEL_KPL, 0);
1816 #endif
1817 #else
1818 #error "have you forgotten the isa device?";
1819 #endif
1820 
1821 	if (late_console)
1822 		amd64_kdb_init();
1823 
1824 	msgbufinit(msgbufp, msgbufsize);
1825 	fpuinit();
1826 
1827 	/*
1828 	 * Set up thread0 pcb after fpuinit calculated pcb + fpu save
1829 	 * area size.  Zero out the extended state header in fpu save
1830 	 * area.
1831 	 */
1832 	thread0.td_pcb = get_pcb_td(&thread0);
1833 	thread0.td_pcb->pcb_save = get_pcb_user_save_td(&thread0);
1834 	bzero(get_pcb_user_save_td(&thread0), cpu_max_ext_state_size);
1835 	if (use_xsave) {
1836 		xhdr = (struct xstate_hdr *)(get_pcb_user_save_td(&thread0) +
1837 		    1);
1838 		xhdr->xstate_bv = xsave_mask;
1839 	}
1840 	/* make an initial tss so cpu can get interrupt stack on syscall! */
1841 	rsp0 = (vm_offset_t)thread0.td_pcb;
1842 	/* Ensure the stack is aligned to 16 bytes */
1843 	rsp0 &= ~0xFul;
1844 	common_tss[0].tss_rsp0 = rsp0;
1845 	PCPU_SET(rsp0, rsp0);
1846 	PCPU_SET(pti_rsp0, ((vm_offset_t)PCPU_PTR(pti_stack) +
1847 	    PC_PTI_STACK_SZ * sizeof(uint64_t)) & ~0xful);
1848 	PCPU_SET(curpcb, thread0.td_pcb);
1849 
1850 	/* transfer to user mode */
1851 
1852 	_ucodesel = GSEL(GUCODE_SEL, SEL_UPL);
1853 	_udatasel = GSEL(GUDATA_SEL, SEL_UPL);
1854 	_ucode32sel = GSEL(GUCODE32_SEL, SEL_UPL);
1855 	_ufssel = GSEL(GUFS32_SEL, SEL_UPL);
1856 	_ugssel = GSEL(GUGS32_SEL, SEL_UPL);
1857 
1858 	load_ds(_udatasel);
1859 	load_es(_udatasel);
1860 	load_fs(_ufssel);
1861 
1862 	/* setup proc 0's pcb */
1863 	thread0.td_pcb->pcb_flags = 0;
1864 	thread0.td_frame = &proc0_tf;
1865 
1866         env = kern_getenv("kernelname");
1867 	if (env != NULL)
1868 		strlcpy(kernelname, env, sizeof(kernelname));
1869 
1870 	cpu_probe_amdc1e();
1871 
1872 #ifdef FDT
1873 	x86_init_fdt();
1874 #endif
1875 	thread0.td_critnest = 0;
1876 
	/*
	 * NOTE(review): these three tunables were already fetched earlier
	 * in this function; the second fetch looks redundant -- confirm.
	 */
1877 	TUNABLE_INT_FETCH("hw.ibrs_disable", &hw_ibrs_disable);
1878 	TUNABLE_INT_FETCH("hw.spec_store_bypass_disable", &hw_ssb_disable);
1879 	TUNABLE_INT_FETCH("hw.mds_disable", &hw_mds_disable);
1880 
1881 	/* Location of kernel stack for locore */
1882 	return ((u_int64_t)thread0.td_pcb);
1883 }
1884 
1885 void
cpu_pcpu_init(struct pcpu * pcpu,int cpuid,size_t size)1886 cpu_pcpu_init(struct pcpu *pcpu, int cpuid, size_t size)
1887 {
1888 
1889 	pcpu->pc_acpi_id = 0xffffffff;
1890 }
1891 
1892 static int
smap_sysctl_handler(SYSCTL_HANDLER_ARGS)1893 smap_sysctl_handler(SYSCTL_HANDLER_ARGS)
1894 {
1895 	struct bios_smap *smapbase;
1896 	struct bios_smap_xattr smap;
1897 	caddr_t kmdp;
1898 	uint32_t *smapattr;
1899 	int count, error, i;
1900 
1901 	/* Retrieve the system memory map from the loader. */
1902 	kmdp = preload_search_by_type("elf kernel");
1903 	if (kmdp == NULL)
1904 		kmdp = preload_search_by_type("elf64 kernel");
1905 	smapbase = (struct bios_smap *)preload_search_info(kmdp,
1906 	    MODINFO_METADATA | MODINFOMD_SMAP);
1907 	if (smapbase == NULL)
1908 		return (0);
1909 	smapattr = (uint32_t *)preload_search_info(kmdp,
1910 	    MODINFO_METADATA | MODINFOMD_SMAP_XATTR);
1911 	count = *((uint32_t *)smapbase - 1) / sizeof(*smapbase);
1912 	error = 0;
1913 	for (i = 0; i < count; i++) {
1914 		smap.base = smapbase[i].base;
1915 		smap.length = smapbase[i].length;
1916 		smap.type = smapbase[i].type;
1917 		if (smapattr != NULL)
1918 			smap.xattr = smapattr[i];
1919 		else
1920 			smap.xattr = 0;
1921 		error = SYSCTL_OUT(req, &smap, sizeof(smap));
1922 	}
1923 	return (error);
1924 }
1925 SYSCTL_PROC(_machdep, OID_AUTO, smap, CTLTYPE_OPAQUE|CTLFLAG_RD, NULL, 0,
1926     smap_sysctl_handler, "S,bios_smap_xattr", "Raw BIOS SMAP data");
1927 
1928 static int
efi_map_sysctl_handler(SYSCTL_HANDLER_ARGS)1929 efi_map_sysctl_handler(SYSCTL_HANDLER_ARGS)
1930 {
1931 	struct efi_map_header *efihdr;
1932 	caddr_t kmdp;
1933 	uint32_t efisize;
1934 
1935 	kmdp = preload_search_by_type("elf kernel");
1936 	if (kmdp == NULL)
1937 		kmdp = preload_search_by_type("elf64 kernel");
1938 	efihdr = (struct efi_map_header *)preload_search_info(kmdp,
1939 	    MODINFO_METADATA | MODINFOMD_EFI_MAP);
1940 	if (efihdr == NULL)
1941 		return (0);
1942 	efisize = *((uint32_t *)efihdr - 1);
1943 	return (SYSCTL_OUT(req, efihdr, efisize));
1944 }
1945 SYSCTL_PROC(_machdep, OID_AUTO, efi_map, CTLTYPE_OPAQUE|CTLFLAG_RD, NULL, 0,
1946     efi_map_sysctl_handler, "S,efi_map_header", "Raw EFI Memory Map");
1947 
1948 void
spinlock_enter(void)1949 spinlock_enter(void)
1950 {
1951 	struct thread *td;
1952 	register_t flags;
1953 
1954 	td = curthread;
1955 	if (td->td_md.md_spinlock_count == 0) {
1956 		flags = intr_disable();
1957 		td->td_md.md_spinlock_count = 1;
1958 		td->td_md.md_saved_flags = flags;
1959 	} else
1960 		td->td_md.md_spinlock_count++;
1961 	critical_enter();
1962 }
1963 
1964 void
spinlock_exit(void)1965 spinlock_exit(void)
1966 {
1967 	struct thread *td;
1968 	register_t flags;
1969 
1970 	td = curthread;
1971 	critical_exit();
1972 	flags = td->td_md.md_saved_flags;
1973 	td->td_md.md_spinlock_count--;
1974 	if (td->td_md.md_spinlock_count == 0)
1975 		intr_restore(flags);
1976 }
1977 
1978 /*
1979  * Construct a PCB from a trapframe. This is called from kdb_trap() where
1980  * we want to start a backtrace from the function that caused us to enter
1981  * the debugger. We have the context in the trapframe, but base the trace
1982  * on the PCB. The PCB doesn't have to be perfect, as long as it contains
1983  * enough for a backtrace.
1984  */
1985 void
makectx(struct trapframe * tf,struct pcb * pcb)1986 makectx(struct trapframe *tf, struct pcb *pcb)
1987 {
1988 
1989 	pcb->pcb_r12 = tf->tf_r12;
1990 	pcb->pcb_r13 = tf->tf_r13;
1991 	pcb->pcb_r14 = tf->tf_r14;
1992 	pcb->pcb_r15 = tf->tf_r15;
1993 	pcb->pcb_rbp = tf->tf_rbp;
1994 	pcb->pcb_rbx = tf->tf_rbx;
1995 	pcb->pcb_rip = tf->tf_rip;
1996 	pcb->pcb_rsp = tf->tf_rsp;
1997 }
1998 
1999 int
ptrace_set_pc(struct thread * td,unsigned long addr)2000 ptrace_set_pc(struct thread *td, unsigned long addr)
2001 {
2002 
2003 	td->td_frame->tf_rip = addr;
2004 	set_pcb_flags(td->td_pcb, PCB_FULL_IRET);
2005 	return (0);
2006 }
2007 
2008 int
ptrace_single_step(struct thread * td)2009 ptrace_single_step(struct thread *td)
2010 {
2011 
2012 	PROC_LOCK_ASSERT(td->td_proc, MA_OWNED);
2013 	if ((td->td_frame->tf_rflags & PSL_T) == 0) {
2014 		td->td_frame->tf_rflags |= PSL_T;
2015 		td->td_dbgflags |= TDB_STEP;
2016 	}
2017 	return (0);
2018 }
2019 
2020 int
ptrace_clear_single_step(struct thread * td)2021 ptrace_clear_single_step(struct thread *td)
2022 {
2023 	PROC_LOCK_ASSERT(td->td_proc, MA_OWNED);
2024 	td->td_frame->tf_rflags &= ~PSL_T;
2025 	td->td_dbgflags &= ~TDB_STEP;
2026 	return (0);
2027 }
2028 
2029 int
fill_regs(struct thread * td,struct reg * regs)2030 fill_regs(struct thread *td, struct reg *regs)
2031 {
2032 	struct trapframe *tp;
2033 
2034 	tp = td->td_frame;
2035 	return (fill_frame_regs(tp, regs));
2036 }
2037 
2038 int
fill_frame_regs(struct trapframe * tp,struct reg * regs)2039 fill_frame_regs(struct trapframe *tp, struct reg *regs)
2040 {
2041 
2042 	regs->r_r15 = tp->tf_r15;
2043 	regs->r_r14 = tp->tf_r14;
2044 	regs->r_r13 = tp->tf_r13;
2045 	regs->r_r12 = tp->tf_r12;
2046 	regs->r_r11 = tp->tf_r11;
2047 	regs->r_r10 = tp->tf_r10;
2048 	regs->r_r9  = tp->tf_r9;
2049 	regs->r_r8  = tp->tf_r8;
2050 	regs->r_rdi = tp->tf_rdi;
2051 	regs->r_rsi = tp->tf_rsi;
2052 	regs->r_rbp = tp->tf_rbp;
2053 	regs->r_rbx = tp->tf_rbx;
2054 	regs->r_rdx = tp->tf_rdx;
2055 	regs->r_rcx = tp->tf_rcx;
2056 	regs->r_rax = tp->tf_rax;
2057 	regs->r_rip = tp->tf_rip;
2058 	regs->r_cs = tp->tf_cs;
2059 	regs->r_rflags = tp->tf_rflags;
2060 	regs->r_rsp = tp->tf_rsp;
2061 	regs->r_ss = tp->tf_ss;
2062 	if (tp->tf_flags & TF_HASSEGS) {
2063 		regs->r_ds = tp->tf_ds;
2064 		regs->r_es = tp->tf_es;
2065 		regs->r_fs = tp->tf_fs;
2066 		regs->r_gs = tp->tf_gs;
2067 	} else {
2068 		regs->r_ds = 0;
2069 		regs->r_es = 0;
2070 		regs->r_fs = 0;
2071 		regs->r_gs = 0;
2072 	}
2073 	regs->r_err = 0;
2074 	regs->r_trapno = 0;
2075 	return (0);
2076 }
2077 
2078 int
set_regs(struct thread * td,struct reg * regs)2079 set_regs(struct thread *td, struct reg *regs)
2080 {
2081 	struct trapframe *tp;
2082 	register_t rflags;
2083 
2084 	tp = td->td_frame;
2085 	rflags = regs->r_rflags & 0xffffffff;
2086 	if (!EFL_SECURE(rflags, tp->tf_rflags) || !CS_SECURE(regs->r_cs))
2087 		return (EINVAL);
2088 	tp->tf_r15 = regs->r_r15;
2089 	tp->tf_r14 = regs->r_r14;
2090 	tp->tf_r13 = regs->r_r13;
2091 	tp->tf_r12 = regs->r_r12;
2092 	tp->tf_r11 = regs->r_r11;
2093 	tp->tf_r10 = regs->r_r10;
2094 	tp->tf_r9  = regs->r_r9;
2095 	tp->tf_r8  = regs->r_r8;
2096 	tp->tf_rdi = regs->r_rdi;
2097 	tp->tf_rsi = regs->r_rsi;
2098 	tp->tf_rbp = regs->r_rbp;
2099 	tp->tf_rbx = regs->r_rbx;
2100 	tp->tf_rdx = regs->r_rdx;
2101 	tp->tf_rcx = regs->r_rcx;
2102 	tp->tf_rax = regs->r_rax;
2103 	tp->tf_rip = regs->r_rip;
2104 	tp->tf_cs = regs->r_cs;
2105 	tp->tf_rflags = rflags;
2106 	tp->tf_rsp = regs->r_rsp;
2107 	tp->tf_ss = regs->r_ss;
2108 	if (0) {	/* XXXKIB */
2109 		tp->tf_ds = regs->r_ds;
2110 		tp->tf_es = regs->r_es;
2111 		tp->tf_fs = regs->r_fs;
2112 		tp->tf_gs = regs->r_gs;
2113 		tp->tf_flags = TF_HASSEGS;
2114 	}
2115 	set_pcb_flags(td->td_pcb, PCB_FULL_IRET);
2116 	return (0);
2117 }
2118 
2119 /* XXX check all this stuff! */
2120 /* externalize from sv_xmm */
2121 static void
fill_fpregs_xmm(struct savefpu * sv_xmm,struct fpreg * fpregs)2122 fill_fpregs_xmm(struct savefpu *sv_xmm, struct fpreg *fpregs)
2123 {
2124 	struct envxmm *penv_fpreg = (struct envxmm *)&fpregs->fpr_env;
2125 	struct envxmm *penv_xmm = &sv_xmm->sv_env;
2126 	int i;
2127 
2128 	/* pcb -> fpregs */
2129 	bzero(fpregs, sizeof(*fpregs));
2130 
2131 	/* FPU control/status */
2132 	penv_fpreg->en_cw = penv_xmm->en_cw;
2133 	penv_fpreg->en_sw = penv_xmm->en_sw;
2134 	penv_fpreg->en_tw = penv_xmm->en_tw;
2135 	penv_fpreg->en_opcode = penv_xmm->en_opcode;
2136 	penv_fpreg->en_rip = penv_xmm->en_rip;
2137 	penv_fpreg->en_rdp = penv_xmm->en_rdp;
2138 	penv_fpreg->en_mxcsr = penv_xmm->en_mxcsr;
2139 	penv_fpreg->en_mxcsr_mask = penv_xmm->en_mxcsr_mask;
2140 
2141 	/* FPU registers */
2142 	for (i = 0; i < 8; ++i)
2143 		bcopy(sv_xmm->sv_fp[i].fp_acc.fp_bytes, fpregs->fpr_acc[i], 10);
2144 
2145 	/* SSE registers */
2146 	for (i = 0; i < 16; ++i)
2147 		bcopy(sv_xmm->sv_xmm[i].xmm_bytes, fpregs->fpr_xacc[i], 16);
2148 }
2149 
2150 /* internalize from fpregs into sv_xmm */
2151 static void
set_fpregs_xmm(struct fpreg * fpregs,struct savefpu * sv_xmm)2152 set_fpregs_xmm(struct fpreg *fpregs, struct savefpu *sv_xmm)
2153 {
2154 	struct envxmm *penv_xmm = &sv_xmm->sv_env;
2155 	struct envxmm *penv_fpreg = (struct envxmm *)&fpregs->fpr_env;
2156 	int i;
2157 
2158 	/* fpregs -> pcb */
2159 	/* FPU control/status */
2160 	penv_xmm->en_cw = penv_fpreg->en_cw;
2161 	penv_xmm->en_sw = penv_fpreg->en_sw;
2162 	penv_xmm->en_tw = penv_fpreg->en_tw;
2163 	penv_xmm->en_opcode = penv_fpreg->en_opcode;
2164 	penv_xmm->en_rip = penv_fpreg->en_rip;
2165 	penv_xmm->en_rdp = penv_fpreg->en_rdp;
2166 	penv_xmm->en_mxcsr = penv_fpreg->en_mxcsr;
2167 	penv_xmm->en_mxcsr_mask = penv_fpreg->en_mxcsr_mask & cpu_mxcsr_mask;
2168 
2169 	/* FPU registers */
2170 	for (i = 0; i < 8; ++i)
2171 		bcopy(fpregs->fpr_acc[i], sv_xmm->sv_fp[i].fp_acc.fp_bytes, 10);
2172 
2173 	/* SSE registers */
2174 	for (i = 0; i < 16; ++i)
2175 		bcopy(fpregs->fpr_xacc[i], sv_xmm->sv_xmm[i].xmm_bytes, 16);
2176 }
2177 
2178 /* externalize from td->pcb */
2179 int
fill_fpregs(struct thread * td,struct fpreg * fpregs)2180 fill_fpregs(struct thread *td, struct fpreg *fpregs)
2181 {
2182 
2183 	KASSERT(td == curthread || TD_IS_SUSPENDED(td) ||
2184 	    P_SHOULDSTOP(td->td_proc),
2185 	    ("not suspended thread %p", td));
2186 	fpugetregs(td);
2187 	fill_fpregs_xmm(get_pcb_user_save_td(td), fpregs);
2188 	return (0);
2189 }
2190 
/*
 * Internalize 'fpregs' into the user FPU save area of 'td'.
 *
 * The copy and the following fpuuserinited() call are performed inside
 * a critical section.  Always returns 0.
 */
int
set_fpregs(struct thread *td, struct fpreg *fpregs)
{
	struct savefpu *usersave;

	critical_enter();
	usersave = get_pcb_user_save_td(td);
	set_fpregs_xmm(fpregs, usersave);
	fpuuserinited(td);
	critical_exit();
	return (0);
}
2202 
2203 /*
2204  * Get machine context.
2205  */
2206 int
get_mcontext(struct thread * td,mcontext_t * mcp,int flags)2207 get_mcontext(struct thread *td, mcontext_t *mcp, int flags)
2208 {
2209 	struct pcb *pcb;
2210 	struct trapframe *tp;
2211 
2212 	pcb = td->td_pcb;
2213 	tp = td->td_frame;
2214 	PROC_LOCK(curthread->td_proc);
2215 	mcp->mc_onstack = sigonstack(tp->tf_rsp);
2216 	PROC_UNLOCK(curthread->td_proc);
2217 	mcp->mc_r15 = tp->tf_r15;
2218 	mcp->mc_r14 = tp->tf_r14;
2219 	mcp->mc_r13 = tp->tf_r13;
2220 	mcp->mc_r12 = tp->tf_r12;
2221 	mcp->mc_r11 = tp->tf_r11;
2222 	mcp->mc_r10 = tp->tf_r10;
2223 	mcp->mc_r9  = tp->tf_r9;
2224 	mcp->mc_r8  = tp->tf_r8;
2225 	mcp->mc_rdi = tp->tf_rdi;
2226 	mcp->mc_rsi = tp->tf_rsi;
2227 	mcp->mc_rbp = tp->tf_rbp;
2228 	mcp->mc_rbx = tp->tf_rbx;
2229 	mcp->mc_rcx = tp->tf_rcx;
2230 	mcp->mc_rflags = tp->tf_rflags;
2231 	if (flags & GET_MC_CLEAR_RET) {
2232 		mcp->mc_rax = 0;
2233 		mcp->mc_rdx = 0;
2234 		mcp->mc_rflags &= ~PSL_C;
2235 	} else {
2236 		mcp->mc_rax = tp->tf_rax;
2237 		mcp->mc_rdx = tp->tf_rdx;
2238 	}
2239 	mcp->mc_rip = tp->tf_rip;
2240 	mcp->mc_cs = tp->tf_cs;
2241 	mcp->mc_rsp = tp->tf_rsp;
2242 	mcp->mc_ss = tp->tf_ss;
2243 	mcp->mc_ds = tp->tf_ds;
2244 	mcp->mc_es = tp->tf_es;
2245 	mcp->mc_fs = tp->tf_fs;
2246 	mcp->mc_gs = tp->tf_gs;
2247 	mcp->mc_flags = tp->tf_flags;
2248 	mcp->mc_len = sizeof(*mcp);
2249 	get_fpcontext(td, mcp, NULL, 0);
2250 	update_pcb_bases(pcb);
2251 	mcp->mc_fsbase = pcb->pcb_fsbase;
2252 	mcp->mc_gsbase = pcb->pcb_gsbase;
2253 	mcp->mc_xfpustate = 0;
2254 	mcp->mc_xfpustate_len = 0;
2255 	bzero(mcp->mc_spare, sizeof(mcp->mc_spare));
2256 	return (0);
2257 }
2258 
2259 /*
2260  * Set machine context.
2261  *
2262  * However, we don't set any but the user modifiable flags, and we won't
2263  * touch the cs selector.
2264  */
2265 int
set_mcontext(struct thread * td,mcontext_t * mcp)2266 set_mcontext(struct thread *td, mcontext_t *mcp)
2267 {
2268 	struct pcb *pcb;
2269 	struct trapframe *tp;
2270 	char *xfpustate;
2271 	long rflags;
2272 	int ret;
2273 
2274 	pcb = td->td_pcb;
2275 	tp = td->td_frame;
2276 	if (mcp->mc_len != sizeof(*mcp) ||
2277 	    (mcp->mc_flags & ~_MC_FLAG_MASK) != 0)
2278 		return (EINVAL);
2279 	rflags = (mcp->mc_rflags & PSL_USERCHANGE) |
2280 	    (tp->tf_rflags & ~PSL_USERCHANGE);
2281 	if (mcp->mc_flags & _MC_HASFPXSTATE) {
2282 		if (mcp->mc_xfpustate_len > cpu_max_ext_state_size -
2283 		    sizeof(struct savefpu))
2284 			return (EINVAL);
2285 		xfpustate = __builtin_alloca(mcp->mc_xfpustate_len);
2286 		ret = copyin((void *)mcp->mc_xfpustate, xfpustate,
2287 		    mcp->mc_xfpustate_len);
2288 		if (ret != 0)
2289 			return (ret);
2290 	} else
2291 		xfpustate = NULL;
2292 	ret = set_fpcontext(td, mcp, xfpustate, mcp->mc_xfpustate_len);
2293 	if (ret != 0)
2294 		return (ret);
2295 	tp->tf_r15 = mcp->mc_r15;
2296 	tp->tf_r14 = mcp->mc_r14;
2297 	tp->tf_r13 = mcp->mc_r13;
2298 	tp->tf_r12 = mcp->mc_r12;
2299 	tp->tf_r11 = mcp->mc_r11;
2300 	tp->tf_r10 = mcp->mc_r10;
2301 	tp->tf_r9  = mcp->mc_r9;
2302 	tp->tf_r8  = mcp->mc_r8;
2303 	tp->tf_rdi = mcp->mc_rdi;
2304 	tp->tf_rsi = mcp->mc_rsi;
2305 	tp->tf_rbp = mcp->mc_rbp;
2306 	tp->tf_rbx = mcp->mc_rbx;
2307 	tp->tf_rdx = mcp->mc_rdx;
2308 	tp->tf_rcx = mcp->mc_rcx;
2309 	tp->tf_rax = mcp->mc_rax;
2310 	tp->tf_rip = mcp->mc_rip;
2311 	tp->tf_rflags = rflags;
2312 	tp->tf_rsp = mcp->mc_rsp;
2313 	tp->tf_ss = mcp->mc_ss;
2314 	tp->tf_flags = mcp->mc_flags;
2315 	if (tp->tf_flags & TF_HASSEGS) {
2316 		tp->tf_ds = mcp->mc_ds;
2317 		tp->tf_es = mcp->mc_es;
2318 		tp->tf_fs = mcp->mc_fs;
2319 		tp->tf_gs = mcp->mc_gs;
2320 	}
2321 	set_pcb_flags(pcb, PCB_FULL_IRET);
2322 	if (mcp->mc_flags & _MC_HASBASES) {
2323 		pcb->pcb_fsbase = mcp->mc_fsbase;
2324 		pcb->pcb_gsbase = mcp->mc_gsbase;
2325 	}
2326 	return (0);
2327 }
2328 
2329 static void
get_fpcontext(struct thread * td,mcontext_t * mcp,char * xfpusave,size_t xfpusave_len)2330 get_fpcontext(struct thread *td, mcontext_t *mcp, char *xfpusave,
2331     size_t xfpusave_len)
2332 {
2333 	size_t max_len, len;
2334 
2335 	mcp->mc_ownedfp = fpugetregs(td);
2336 	bcopy(get_pcb_user_save_td(td), &mcp->mc_fpstate[0],
2337 	    sizeof(mcp->mc_fpstate));
2338 	mcp->mc_fpformat = fpuformat();
2339 	if (!use_xsave || xfpusave_len == 0)
2340 		return;
2341 	max_len = cpu_max_ext_state_size - sizeof(struct savefpu);
2342 	len = xfpusave_len;
2343 	if (len > max_len) {
2344 		len = max_len;
2345 		bzero(xfpusave + max_len, len - max_len);
2346 	}
2347 	mcp->mc_flags |= _MC_HASFPXSTATE;
2348 	mcp->mc_xfpustate_len = len;
2349 	bcopy(get_pcb_user_save_td(td) + 1, xfpusave, len);
2350 }
2351 
2352 static int
set_fpcontext(struct thread * td,mcontext_t * mcp,char * xfpustate,size_t xfpustate_len)2353 set_fpcontext(struct thread *td, mcontext_t *mcp, char *xfpustate,
2354     size_t xfpustate_len)
2355 {
2356 	int error;
2357 
2358 	if (mcp->mc_fpformat == _MC_FPFMT_NODEV)
2359 		return (0);
2360 	else if (mcp->mc_fpformat != _MC_FPFMT_XMM)
2361 		return (EINVAL);
2362 	else if (mcp->mc_ownedfp == _MC_FPOWNED_NONE) {
2363 		/* We don't care what state is left in the FPU or PCB. */
2364 		fpstate_drop(td);
2365 		error = 0;
2366 	} else if (mcp->mc_ownedfp == _MC_FPOWNED_FPU ||
2367 	    mcp->mc_ownedfp == _MC_FPOWNED_PCB) {
2368 		error = fpusetregs(td, (struct savefpu *)&mcp->mc_fpstate,
2369 		    xfpustate, xfpustate_len);
2370 	} else
2371 		return (EINVAL);
2372 	return (error);
2373 }
2374 
2375 void
fpstate_drop(struct thread * td)2376 fpstate_drop(struct thread *td)
2377 {
2378 
2379 	KASSERT(PCB_USER_FPU(td->td_pcb), ("fpstate_drop: kernel-owned fpu"));
2380 	critical_enter();
2381 	if (PCPU_GET(fpcurthread) == td)
2382 		fpudrop();
2383 	/*
2384 	 * XXX force a full drop of the fpu.  The above only drops it if we
2385 	 * owned it.
2386 	 *
2387 	 * XXX I don't much like fpugetuserregs()'s semantics of doing a full
2388 	 * drop.  Dropping only to the pcb matches fnsave's behaviour.
2389 	 * We only need to drop to !PCB_INITDONE in sendsig().  But
2390 	 * sendsig() is the only caller of fpugetuserregs()... perhaps we just
2391 	 * have too many layers.
2392 	 */
2393 	clear_pcb_flags(curthread->td_pcb,
2394 	    PCB_FPUINITDONE | PCB_USERFPUINITDONE);
2395 	critical_exit();
2396 }
2397 
2398 int
fill_dbregs(struct thread * td,struct dbreg * dbregs)2399 fill_dbregs(struct thread *td, struct dbreg *dbregs)
2400 {
2401 	struct pcb *pcb;
2402 
2403 	if (td == NULL) {
2404 		dbregs->dr[0] = rdr0();
2405 		dbregs->dr[1] = rdr1();
2406 		dbregs->dr[2] = rdr2();
2407 		dbregs->dr[3] = rdr3();
2408 		dbregs->dr[6] = rdr6();
2409 		dbregs->dr[7] = rdr7();
2410 	} else {
2411 		pcb = td->td_pcb;
2412 		dbregs->dr[0] = pcb->pcb_dr0;
2413 		dbregs->dr[1] = pcb->pcb_dr1;
2414 		dbregs->dr[2] = pcb->pcb_dr2;
2415 		dbregs->dr[3] = pcb->pcb_dr3;
2416 		dbregs->dr[6] = pcb->pcb_dr6;
2417 		dbregs->dr[7] = pcb->pcb_dr7;
2418 	}
2419 	dbregs->dr[4] = 0;
2420 	dbregs->dr[5] = 0;
2421 	dbregs->dr[8] = 0;
2422 	dbregs->dr[9] = 0;
2423 	dbregs->dr[10] = 0;
2424 	dbregs->dr[11] = 0;
2425 	dbregs->dr[12] = 0;
2426 	dbregs->dr[13] = 0;
2427 	dbregs->dr[14] = 0;
2428 	dbregs->dr[15] = 0;
2429 	return (0);
2430 }
2431 
2432 int
set_dbregs(struct thread * td,struct dbreg * dbregs)2433 set_dbregs(struct thread *td, struct dbreg *dbregs)
2434 {
2435 	struct pcb *pcb;
2436 	int i;
2437 
2438 	if (td == NULL) {
2439 		load_dr0(dbregs->dr[0]);
2440 		load_dr1(dbregs->dr[1]);
2441 		load_dr2(dbregs->dr[2]);
2442 		load_dr3(dbregs->dr[3]);
2443 		load_dr6(dbregs->dr[6]);
2444 		load_dr7(dbregs->dr[7]);
2445 	} else {
2446 		/*
2447 		 * Don't let an illegal value for dr7 get set.  Specifically,
2448 		 * check for undefined settings.  Setting these bit patterns
2449 		 * result in undefined behaviour and can lead to an unexpected
2450 		 * TRCTRAP or a general protection fault right here.
2451 		 * Upper bits of dr6 and dr7 must not be set
2452 		 */
2453 		for (i = 0; i < 4; i++) {
2454 			if (DBREG_DR7_ACCESS(dbregs->dr[7], i) == 0x02)
2455 				return (EINVAL);
2456 			if (td->td_frame->tf_cs == _ucode32sel &&
2457 			    DBREG_DR7_LEN(dbregs->dr[7], i) == DBREG_DR7_LEN_8)
2458 				return (EINVAL);
2459 		}
2460 		if ((dbregs->dr[6] & 0xffffffff00000000ul) != 0 ||
2461 		    (dbregs->dr[7] & 0xffffffff00000000ul) != 0)
2462 			return (EINVAL);
2463 
2464 		pcb = td->td_pcb;
2465 
2466 		/*
2467 		 * Don't let a process set a breakpoint that is not within the
2468 		 * process's address space.  If a process could do this, it
2469 		 * could halt the system by setting a breakpoint in the kernel
2470 		 * (if ddb was enabled).  Thus, we need to check to make sure
2471 		 * that no breakpoints are being enabled for addresses outside
2472 		 * process's address space.
2473 		 *
2474 		 * XXX - what about when the watched area of the user's
2475 		 * address space is written into from within the kernel
2476 		 * ... wouldn't that still cause a breakpoint to be generated
2477 		 * from within kernel mode?
2478 		 */
2479 
2480 		if (DBREG_DR7_ENABLED(dbregs->dr[7], 0)) {
2481 			/* dr0 is enabled */
2482 			if (dbregs->dr[0] >= VM_MAXUSER_ADDRESS)
2483 				return (EINVAL);
2484 		}
2485 		if (DBREG_DR7_ENABLED(dbregs->dr[7], 1)) {
2486 			/* dr1 is enabled */
2487 			if (dbregs->dr[1] >= VM_MAXUSER_ADDRESS)
2488 				return (EINVAL);
2489 		}
2490 		if (DBREG_DR7_ENABLED(dbregs->dr[7], 2)) {
2491 			/* dr2 is enabled */
2492 			if (dbregs->dr[2] >= VM_MAXUSER_ADDRESS)
2493 				return (EINVAL);
2494 		}
2495 		if (DBREG_DR7_ENABLED(dbregs->dr[7], 3)) {
2496 			/* dr3 is enabled */
2497 			if (dbregs->dr[3] >= VM_MAXUSER_ADDRESS)
2498 				return (EINVAL);
2499 		}
2500 
2501 		pcb->pcb_dr0 = dbregs->dr[0];
2502 		pcb->pcb_dr1 = dbregs->dr[1];
2503 		pcb->pcb_dr2 = dbregs->dr[2];
2504 		pcb->pcb_dr3 = dbregs->dr[3];
2505 		pcb->pcb_dr6 = dbregs->dr[6];
2506 		pcb->pcb_dr7 = dbregs->dr[7];
2507 
2508 		set_pcb_flags(pcb, PCB_DBREGS);
2509 	}
2510 
2511 	return (0);
2512 }
2513 
/*
 * Load zero into debug registers dr7, dr0-dr3 and dr6 of the current
 * CPU.  dr7 is written first so that the control bits are off before
 * the address registers change.
 */
void
reset_dbregs(void)
{

	/* Turn off the control bits first. */
	load_dr7(0);

	/* Then clear the breakpoint addresses and the status register. */
	load_dr0(0);
	load_dr1(0);
	load_dr2(0);
	load_dr3(0);
	load_dr6(0);
}
2525 
2526 /*
2527  * Return > 0 if a hardware breakpoint has been hit, and the
2528  * breakpoint was in user space.  Return 0, otherwise.
2529  */
2530 int
user_dbreg_trap(register_t dr6)2531 user_dbreg_trap(register_t dr6)
2532 {
2533         u_int64_t dr7;
2534         u_int64_t bp;       /* breakpoint bits extracted from dr6 */
2535         int nbp;            /* number of breakpoints that triggered */
2536         caddr_t addr[4];    /* breakpoint addresses */
2537         int i;
2538 
2539         bp = dr6 & DBREG_DR6_BMASK;
2540         if (bp == 0) {
2541                 /*
2542                  * None of the breakpoint bits are set meaning this
2543                  * trap was not caused by any of the debug registers
2544                  */
2545                 return 0;
2546         }
2547 
2548         dr7 = rdr7();
2549         if ((dr7 & 0x000000ff) == 0) {
2550                 /*
2551                  * all GE and LE bits in the dr7 register are zero,
2552                  * thus the trap couldn't have been caused by the
2553                  * hardware debug registers
2554                  */
2555                 return 0;
2556         }
2557 
2558         nbp = 0;
2559 
2560         /*
2561          * at least one of the breakpoints were hit, check to see
2562          * which ones and if any of them are user space addresses
2563          */
2564 
2565         if (bp & 0x01) {
2566                 addr[nbp++] = (caddr_t)rdr0();
2567         }
2568         if (bp & 0x02) {
2569                 addr[nbp++] = (caddr_t)rdr1();
2570         }
2571         if (bp & 0x04) {
2572                 addr[nbp++] = (caddr_t)rdr2();
2573         }
2574         if (bp & 0x08) {
2575                 addr[nbp++] = (caddr_t)rdr3();
2576         }
2577 
2578         for (i = 0; i < nbp; i++) {
2579                 if (addr[i] < (caddr_t)VM_MAXUSER_ADDRESS) {
2580                         /*
2581                          * addr[i] is in user space
2582                          */
2583                         return nbp;
2584                 }
2585         }
2586 
2587         /*
2588          * None of the breakpoints are in user space.
2589          */
2590         return 0;
2591 }
2592 
2593 /*
2594  * The pcb_flags is only modified by current thread, or by other threads
2595  * when current thread is stopped.  However, current thread may change it
2596  * from the interrupt context in cpu_switch(), or in the trap handler.
2597  * When we read-modify-write pcb_flags from C sources, compiler may generate
2598  * code that is not atomic regarding the interrupt handler.  If a trap or
2599  * interrupt happens and any flag is modified from the handler, it can be
2600  * clobbered with the cached value later.  Therefore, we implement setting
2601  * and clearing flags with single-instruction functions, which do not race
2602  * with possible modification of the flags from the trap or interrupt context,
2603  * because traps and interrupts are executed only on instruction boundary.
2604  */
2605 void
set_pcb_flags_raw(struct pcb * pcb,const u_int flags)2606 set_pcb_flags_raw(struct pcb *pcb, const u_int flags)
2607 {
2608 
2609 	__asm __volatile("orl %1,%0"
2610 	    : "=m" (pcb->pcb_flags) : "ir" (flags), "m" (pcb->pcb_flags)
2611 	    : "cc", "memory");
2612 
2613 }
2614 
2615 /*
2616  * The support for RDFSBASE, WRFSBASE and similar instructions for %gs
2617  * base requires that kernel saves MSR_FSBASE and MSR_{K,}GSBASE into
2618  * pcb if user space modified the bases.  We must save on the context
2619  * switch or if the return to usermode happens through the doreti.
2620  *
2621  * Tracking of both events is performed by the pcb flag PCB_FULL_IRET,
2622  * which have a consequence that the base MSRs must be saved each time
2623  * the PCB_FULL_IRET flag is set.  We disable interrupts to sync with
2624  * context switches.
2625  */
2626 void
set_pcb_flags(struct pcb * pcb,const u_int flags)2627 set_pcb_flags(struct pcb *pcb, const u_int flags)
2628 {
2629 	register_t r;
2630 
2631 	if (curpcb == pcb &&
2632 	    (flags & PCB_FULL_IRET) != 0 &&
2633 	    (pcb->pcb_flags & PCB_FULL_IRET) == 0 &&
2634 	    (cpu_stdext_feature & CPUID_STDEXT_FSGSBASE) != 0) {
2635 		r = intr_disable();
2636 		if ((pcb->pcb_flags & PCB_FULL_IRET) == 0) {
2637 			if (rfs() == _ufssel)
2638 				pcb->pcb_fsbase = rdfsbase();
2639 			if (rgs() == _ugssel)
2640 				pcb->pcb_gsbase = rdmsr(MSR_KGSBASE);
2641 		}
2642 		set_pcb_flags_raw(pcb, flags);
2643 		intr_restore(r);
2644 	} else {
2645 		set_pcb_flags_raw(pcb, flags);
2646 	}
2647 }
2648 
2649 void
clear_pcb_flags(struct pcb * pcb,const u_int flags)2650 clear_pcb_flags(struct pcb *pcb, const u_int flags)
2651 {
2652 
2653 	__asm __volatile("andl %1,%0"
2654 	    : "=m" (pcb->pcb_flags) : "ir" (~flags), "m" (pcb->pcb_flags)
2655 	    : "cc", "memory");
2656 }
2657 
2658 #ifdef KDB
2659 
2660 /*
2661  * Provide inb() and outb() as functions.  They are normally only available as
2662  * inline functions, thus cannot be called from the debugger.
2663  */
2664 
2665 /* silence compiler warnings */
2666 u_char inb_(u_short);
2667 void outb_(u_short, u_char);
2668 
2669 u_char
inb_(u_short port)2670 inb_(u_short port)
2671 {
2672 	return inb(port);
2673 }
2674 
2675 void
outb_(u_short port,u_char data)2676 outb_(u_short port, u_char data)
2677 {
2678 	outb(port, data);
2679 }
2680 
2681 #endif /* KDB */
2682