xref: /freebsd-13-stable/sys/amd64/amd64/machdep.c (revision c0dfdac8c8fa41b4c9cb4c75d2a420140837399e)
1 /*-
2  * SPDX-License-Identifier: BSD-4-Clause
3  *
4  * Copyright (c) 2003 Peter Wemm.
5  * Copyright (c) 1992 Terrence R. Lambert.
6  * Copyright (c) 1982, 1987, 1990 The Regents of the University of California.
7  * All rights reserved.
8  *
9  * This code is derived from software contributed to Berkeley by
10  * William Jolitz.
11  *
12  * Redistribution and use in source and binary forms, with or without
13  * modification, are permitted provided that the following conditions
14  * are met:
15  * 1. Redistributions of source code must retain the above copyright
16  *    notice, this list of conditions and the following disclaimer.
17  * 2. Redistributions in binary form must reproduce the above copyright
18  *    notice, this list of conditions and the following disclaimer in the
19  *    documentation and/or other materials provided with the distribution.
20  * 3. All advertising materials mentioning features or use of this software
21  *    must display the following acknowledgement:
22  *	This product includes software developed by the University of
23  *	California, Berkeley and its contributors.
24  * 4. Neither the name of the University nor the names of its contributors
25  *    may be used to endorse or promote products derived from this software
26  *    without specific prior written permission.
27  *
28  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
29  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
30  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
31  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
32  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
33  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
34  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
35  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
36  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
37  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
38  * SUCH DAMAGE.
39  *
40  *	from: @(#)machdep.c	7.4 (Berkeley) 6/3/91
41  */
42 
43 #include <sys/cdefs.h>
44 #include "opt_atpic.h"
45 #include "opt_cpu.h"
46 #include "opt_ddb.h"
47 #include "opt_inet.h"
48 #include "opt_isa.h"
49 #include "opt_kstack_pages.h"
50 #include "opt_maxmem.h"
51 #include "opt_mp_watchdog.h"
52 #include "opt_pci.h"
53 #include "opt_platform.h"
54 #include "opt_sched.h"
55 
56 #include <sys/param.h>
57 #include <sys/proc.h>
58 #include <sys/systm.h>
59 #include <sys/asan.h>
60 #include <sys/bio.h>
61 #include <sys/buf.h>
62 #include <sys/bus.h>
63 #include <sys/callout.h>
64 #include <sys/cons.h>
65 #include <sys/cpu.h>
66 #include <sys/csan.h>
67 #include <sys/efi.h>
68 #include <sys/eventhandler.h>
69 #include <sys/exec.h>
70 #include <sys/imgact.h>
71 #include <sys/kdb.h>
72 #include <sys/kernel.h>
73 #include <sys/ktr.h>
74 #include <sys/linker.h>
75 #include <sys/lock.h>
76 #include <sys/malloc.h>
77 #include <sys/memrange.h>
78 #include <sys/msgbuf.h>
79 #include <sys/mutex.h>
80 #include <sys/pcpu.h>
81 #include <sys/ptrace.h>
82 #include <sys/reboot.h>
83 #include <sys/reg.h>
84 #include <sys/rwlock.h>
85 #include <sys/sched.h>
86 #include <sys/signalvar.h>
87 #ifdef SMP
88 #include <sys/smp.h>
89 #endif
90 #include <sys/syscallsubr.h>
91 #include <sys/sysctl.h>
92 #include <sys/sysent.h>
93 #include <sys/sysproto.h>
94 #include <sys/ucontext.h>
95 #include <sys/vmmeter.h>
96 
97 #include <vm/vm.h>
98 #include <vm/vm_param.h>
99 #include <vm/vm_extern.h>
100 #include <vm/vm_kern.h>
101 #include <vm/vm_page.h>
102 #include <vm/vm_map.h>
103 #include <vm/vm_object.h>
104 #include <vm/vm_pager.h>
105 #include <vm/vm_phys.h>
106 #include <vm/vm_dumpset.h>
107 
108 #ifdef DDB
109 #ifndef KDB
110 #error KDB must be enabled in order for DDB to work!
111 #endif
112 #include <ddb/ddb.h>
113 #include <ddb/db_sym.h>
114 #endif
115 
116 #include <net/netisr.h>
117 
118 #include <machine/clock.h>
119 #include <machine/cpu.h>
120 #include <machine/cputypes.h>
121 #include <machine/frame.h>
122 #include <machine/intr_machdep.h>
123 #include <x86/mca.h>
124 #include <machine/md_var.h>
125 #include <machine/metadata.h>
126 #include <machine/mp_watchdog.h>
127 #include <machine/pc/bios.h>
128 #include <machine/pcb.h>
129 #include <machine/proc.h>
130 #include <machine/sigframe.h>
131 #include <machine/specialreg.h>
132 #include <machine/trap.h>
133 #include <machine/tss.h>
134 #include <x86/ucode.h>
135 #include <x86/ifunc.h>
136 #ifdef SMP
137 #include <machine/smp.h>
138 #endif
139 #ifdef FDT
140 #include <x86/fdt.h>
141 #endif
142 
143 #ifdef DEV_ATPIC
144 #include <x86/isa/icu.h>
145 #else
146 #include <x86/apicvar.h>
147 #endif
148 
149 #include <isa/isareg.h>
150 #include <isa/rtc.h>
151 #include <x86/init.h>
152 
153 /* Sanity check for __curthread() */
154 CTASSERT(offsetof(struct pcpu, pc_curthread) == 0);
155 
156 /*
157  * The PTI trampoline stack needs enough space for a hardware trapframe and a
158  * couple of scratch registers, as well as the trapframe left behind after an
159  * iret fault.
160  */
161 CTASSERT(PC_PTI_STACK_SZ * sizeof(register_t) >= 2 * sizeof(struct pti_frame) -
162     offsetof(struct pti_frame, pti_rip));
163 
164 extern u_int64_t hammer_time(u_int64_t, u_int64_t);
165 
166 static void cpu_startup(void *);
167 SYSINIT(cpu, SI_SUB_CPU, SI_ORDER_FIRST, cpu_startup, NULL);
168 
169 /* Probe 8254 PIT and TSC. */
170 static void native_clock_source_init(void);
171 
172 /* Preload data parse function */
173 static caddr_t native_parse_preload_data(u_int64_t);
174 
175 /* Native function to fetch and parse the e820 map */
176 static void native_parse_memmap(caddr_t, vm_paddr_t *, int *);
177 
178 /* Default init_ops implementation. */
179 struct init_ops init_ops = {
180 	.parse_preload_data =		native_parse_preload_data,
181 	.early_clock_source_init =	native_clock_source_init,
182 	.early_delay =			i8254_delay,
183 	.parse_memmap =			native_parse_memmap,
184 #ifdef SMP
185 	.start_all_aps =		native_start_all_aps,
186 #endif
187 #ifdef DEV_PCI
188 	.msi_init =			msi_init,
189 #endif
190 };
191 
192 /*
193  * Physical address of the EFI System Table. Stashed from the metadata hints
194  * passed into the kernel and used by the EFI code to call runtime services.
195  */
196 vm_paddr_t efi_systbl_phys;
197 
198 /* Intel ICH registers */
/*
 * ICH_PMBASE is the base used for the ICH power-management register
 * block; ICH_SMI_EN is the SMI Control and Enable register, 0x30 past
 * that base.  The sum is parenthesized so the macro expands safely
 * inside any larger expression.
 */
#define ICH_PMBASE	0x400
#define ICH_SMI_EN	(ICH_PMBASE + 0x30)
201 
202 int	_udatasel, _ucodesel, _ucode32sel, _ufssel, _ugssel;
203 
204 int cold = 1;
205 
206 long Maxmem = 0;
207 long realmem = 0;
208 
209 struct kva_md_info kmi;
210 
211 static struct trapframe proc0_tf;
212 struct region_descriptor r_idt;
213 
214 struct pcpu *__pcpu;
215 struct pcpu temp_bsp_pcpu;
216 
217 struct mtx icu_lock;
218 
219 struct mem_range_softc mem_range_softc;
220 
221 struct mtx dt_lock;	/* lock for GDT and LDT */
222 
223 void (*vmm_resume_p)(void);
224 
225 bool efi_boot;
226 
227 static void
cpu_startup(void * dummy)228 cpu_startup(void *dummy)
229 {
230 	uintmax_t memsize;
231 	char *sysenv;
232 
233 	/*
234 	 * On MacBooks, we need to disallow the legacy USB circuit to
235 	 * generate an SMI# because this can cause several problems,
236 	 * namely: incorrect CPU frequency detection and failure to
237 	 * start the APs.
238 	 * We do this by disabling a bit in the SMI_EN (SMI Control and
239 	 * Enable register) of the Intel ICH LPC Interface Bridge.
240 	 */
241 	sysenv = kern_getenv("smbios.system.product");
242 	if (sysenv != NULL) {
243 		if (strncmp(sysenv, "MacBook1,1", 10) == 0 ||
244 		    strncmp(sysenv, "MacBook3,1", 10) == 0 ||
245 		    strncmp(sysenv, "MacBook4,1", 10) == 0 ||
246 		    strncmp(sysenv, "MacBookPro1,1", 13) == 0 ||
247 		    strncmp(sysenv, "MacBookPro1,2", 13) == 0 ||
248 		    strncmp(sysenv, "MacBookPro3,1", 13) == 0 ||
249 		    strncmp(sysenv, "MacBookPro4,1", 13) == 0 ||
250 		    strncmp(sysenv, "Macmini1,1", 10) == 0) {
251 			if (bootverbose)
252 				printf("Disabling LEGACY_USB_EN bit on "
253 				    "Intel ICH.\n");
254 			outl(ICH_SMI_EN, inl(ICH_SMI_EN) & ~0x8);
255 		}
256 		freeenv(sysenv);
257 	}
258 
259 	/*
260 	 * Good {morning,afternoon,evening,night}.
261 	 */
262 	startrtclock();
263 	printcpuinfo();
264 
265 	/*
266 	 * Display physical memory if SMBIOS reports reasonable amount.
267 	 */
268 	memsize = 0;
269 	sysenv = kern_getenv("smbios.memory.enabled");
270 	if (sysenv != NULL) {
271 		memsize = (uintmax_t)strtoul(sysenv, (char **)NULL, 10) << 10;
272 		freeenv(sysenv);
273 	}
274 	if (memsize < ptoa((uintmax_t)vm_free_count()))
275 		memsize = ptoa((uintmax_t)Maxmem);
276 	printf("real memory  = %ju (%ju MB)\n", memsize, memsize >> 20);
277 	realmem = atop(memsize);
278 
279 	/*
280 	 * Display any holes after the first chunk of extended memory.
281 	 */
282 	if (bootverbose) {
283 		int indx;
284 
285 		printf("Physical memory chunk(s):\n");
286 		for (indx = 0; phys_avail[indx + 1] != 0; indx += 2) {
287 			vm_paddr_t size;
288 
289 			size = phys_avail[indx + 1] - phys_avail[indx];
290 			printf(
291 			    "0x%016jx - 0x%016jx, %ju bytes (%ju pages)\n",
292 			    (uintmax_t)phys_avail[indx],
293 			    (uintmax_t)phys_avail[indx + 1] - 1,
294 			    (uintmax_t)size, (uintmax_t)size / PAGE_SIZE);
295 		}
296 	}
297 
298 	vm_ksubmap_init(&kmi);
299 
300 	printf("avail memory = %ju (%ju MB)\n",
301 	    ptoa((uintmax_t)vm_free_count()),
302 	    ptoa((uintmax_t)vm_free_count()) / 1048576);
303 #ifdef DEV_PCI
304 	if (bootverbose && intel_graphics_stolen_base != 0)
305 		printf("intel stolen mem: base %#jx size %ju MB\n",
306 		    (uintmax_t)intel_graphics_stolen_base,
307 		    (uintmax_t)intel_graphics_stolen_size / 1024 / 1024);
308 #endif
309 
310 	/*
311 	 * Set up buffers, so they can be used to read disk labels.
312 	 */
313 	bufinit();
314 	vm_pager_bufferinit();
315 
316 	cpu_setregs();
317 }
318 
319 static void
late_ifunc_resolve(void * dummy __unused)320 late_ifunc_resolve(void *dummy __unused)
321 {
322 	link_elf_late_ireloc();
323 }
324 SYSINIT(late_ifunc_resolve, SI_SUB_CPU, SI_ORDER_ANY, late_ifunc_resolve, NULL);
325 
326 
327 void
cpu_setregs(void)328 cpu_setregs(void)
329 {
330 	register_t cr0;
331 
332 	cr0 = rcr0();
333 	cr0 |= CR0_MP | CR0_NE | CR0_TS | CR0_WP | CR0_AM;
334 	load_cr0(cr0);
335 }
336 
337 /*
338  * Initialize amd64 and configure to run kernel
339  */
340 
341 /*
342  * Initialize segments & interrupt table
343  */
344 static struct gate_descriptor idt0[NIDT];
345 struct gate_descriptor *idt = &idt0[0];	/* interrupt descriptor table */
346 
347 static char dblfault_stack[DBLFAULT_STACK_SIZE] __aligned(16);
348 static char mce0_stack[MCE_STACK_SIZE] __aligned(16);
349 static char nmi0_stack[NMI_STACK_SIZE] __aligned(16);
350 static char dbg0_stack[DBG_STACK_SIZE] __aligned(16);
351 CTASSERT(sizeof(struct nmi_pcpu) == 16);
352 
353 /*
354  * Software prototypes -- in more palatable form.
355  *
356  * Keep GUFS32, GUGS32, GUCODE32 and GUDATA at the same
357  * slots as corresponding segments for i386 kernel.
358  */
359 struct soft_segment_descriptor gdt_segs[] = {
360 [GNULL_SEL] = { /* 0 Null Descriptor */
361 	.ssd_base = 0x0,
362 	.ssd_limit = 0x0,
363 	.ssd_type = 0,
364 	.ssd_dpl = 0,
365 	.ssd_p = 0,
366 	.ssd_long = 0,
367 	.ssd_def32 = 0,
368 	.ssd_gran = 0		},
369 [GNULL2_SEL] = { /*	1 Null Descriptor */
370 	.ssd_base = 0x0,
371 	.ssd_limit = 0x0,
372 	.ssd_type = 0,
373 	.ssd_dpl = 0,
374 	.ssd_p = 0,
375 	.ssd_long = 0,
376 	.ssd_def32 = 0,
377 	.ssd_gran = 0		},
378 [GUFS32_SEL] = { /* 2 32 bit %gs Descriptor for user */
379 	.ssd_base = 0x0,
380 	.ssd_limit = 0xfffff,
381 	.ssd_type = SDT_MEMRWA,
382 	.ssd_dpl = SEL_UPL,
383 	.ssd_p = 1,
384 	.ssd_long = 0,
385 	.ssd_def32 = 1,
386 	.ssd_gran = 1		},
387 [GUGS32_SEL] = { /* 3 32 bit %fs Descriptor for user */
388 	.ssd_base = 0x0,
389 	.ssd_limit = 0xfffff,
390 	.ssd_type = SDT_MEMRWA,
391 	.ssd_dpl = SEL_UPL,
392 	.ssd_p = 1,
393 	.ssd_long = 0,
394 	.ssd_def32 = 1,
395 	.ssd_gran = 1		},
396 [GCODE_SEL] = { /* 4 Code Descriptor for kernel */
397 	.ssd_base = 0x0,
398 	.ssd_limit = 0xfffff,
399 	.ssd_type = SDT_MEMERA,
400 	.ssd_dpl = SEL_KPL,
401 	.ssd_p = 1,
402 	.ssd_long = 1,
403 	.ssd_def32 = 0,
404 	.ssd_gran = 1		},
405 [GDATA_SEL] = { /* 5 Data Descriptor for kernel */
406 	.ssd_base = 0x0,
407 	.ssd_limit = 0xfffff,
408 	.ssd_type = SDT_MEMRWA,
409 	.ssd_dpl = SEL_KPL,
410 	.ssd_p = 1,
411 	.ssd_long = 1,
412 	.ssd_def32 = 0,
413 	.ssd_gran = 1		},
414 [GUCODE32_SEL] = { /* 6 32 bit Code Descriptor for user */
415 	.ssd_base = 0x0,
416 	.ssd_limit = 0xfffff,
417 	.ssd_type = SDT_MEMERA,
418 	.ssd_dpl = SEL_UPL,
419 	.ssd_p = 1,
420 	.ssd_long = 0,
421 	.ssd_def32 = 1,
422 	.ssd_gran = 1		},
423 [GUDATA_SEL] = { /* 7 32/64 bit Data Descriptor for user */
424 	.ssd_base = 0x0,
425 	.ssd_limit = 0xfffff,
426 	.ssd_type = SDT_MEMRWA,
427 	.ssd_dpl = SEL_UPL,
428 	.ssd_p = 1,
429 	.ssd_long = 0,
430 	.ssd_def32 = 1,
431 	.ssd_gran = 1		},
432 [GUCODE_SEL] = { /* 8 64 bit Code Descriptor for user */
433 	.ssd_base = 0x0,
434 	.ssd_limit = 0xfffff,
435 	.ssd_type = SDT_MEMERA,
436 	.ssd_dpl = SEL_UPL,
437 	.ssd_p = 1,
438 	.ssd_long = 1,
439 	.ssd_def32 = 0,
440 	.ssd_gran = 1		},
441 [GPROC0_SEL] = { /* 9 Proc 0 TSS Descriptor */
442 	.ssd_base = 0x0,
443 	.ssd_limit = sizeof(struct amd64tss) + IOPERM_BITMAP_SIZE - 1,
444 	.ssd_type = SDT_SYSTSS,
445 	.ssd_dpl = SEL_KPL,
446 	.ssd_p = 1,
447 	.ssd_long = 0,
448 	.ssd_def32 = 0,
449 	.ssd_gran = 0		},
450 [GPROC0_SEL + 1] = { /* 10 Proc 0 TSS descriptor, double size */
451 	.ssd_base = 0x0,
452 	.ssd_limit = 0x0,
453 	.ssd_type = 0,
454 	.ssd_dpl = 0,
455 	.ssd_p = 0,
456 	.ssd_long = 0,
457 	.ssd_def32 = 0,
458 	.ssd_gran = 0		},
459 [GUSERLDT_SEL] = { /* 11 LDT Descriptor */
460 	.ssd_base = 0x0,
461 	.ssd_limit = 0x0,
462 	.ssd_type = 0,
463 	.ssd_dpl = 0,
464 	.ssd_p = 0,
465 	.ssd_long = 0,
466 	.ssd_def32 = 0,
467 	.ssd_gran = 0		},
468 [GUSERLDT_SEL + 1] = { /* 12 LDT Descriptor, double size */
469 	.ssd_base = 0x0,
470 	.ssd_limit = 0x0,
471 	.ssd_type = 0,
472 	.ssd_dpl = 0,
473 	.ssd_p = 0,
474 	.ssd_long = 0,
475 	.ssd_def32 = 0,
476 	.ssd_gran = 0		},
477 };
478 _Static_assert(nitems(gdt_segs) == NGDT, "Stale NGDT");
479 
480 void
setidt(int idx,inthand_t * func,int typ,int dpl,int ist)481 setidt(int idx, inthand_t *func, int typ, int dpl, int ist)
482 {
483 	struct gate_descriptor *ip;
484 
485 	ip = idt + idx;
486 	ip->gd_looffset = (uintptr_t)func;
487 	ip->gd_selector = GSEL(GCODE_SEL, SEL_KPL);
488 	ip->gd_ist = ist;
489 	ip->gd_xx = 0;
490 	ip->gd_type = typ;
491 	ip->gd_dpl = dpl;
492 	ip->gd_p = 1;
493 	ip->gd_hioffset = ((uintptr_t)func)>>16 ;
494 }
495 
496 extern inthand_t
497 	IDTVEC(div), IDTVEC(dbg), IDTVEC(nmi), IDTVEC(bpt), IDTVEC(ofl),
498 	IDTVEC(bnd), IDTVEC(ill), IDTVEC(dna), IDTVEC(fpusegm),
499 	IDTVEC(tss), IDTVEC(missing), IDTVEC(stk), IDTVEC(prot),
500 	IDTVEC(page), IDTVEC(mchk), IDTVEC(rsvd), IDTVEC(fpu), IDTVEC(align),
501 	IDTVEC(xmm), IDTVEC(dblfault),
502 	IDTVEC(div_pti), IDTVEC(bpt_pti),
503 	IDTVEC(ofl_pti), IDTVEC(bnd_pti), IDTVEC(ill_pti), IDTVEC(dna_pti),
504 	IDTVEC(fpusegm_pti), IDTVEC(tss_pti), IDTVEC(missing_pti),
505 	IDTVEC(stk_pti), IDTVEC(prot_pti), IDTVEC(page_pti),
506 	IDTVEC(rsvd_pti), IDTVEC(fpu_pti), IDTVEC(align_pti),
507 	IDTVEC(xmm_pti),
508 #ifdef KDTRACE_HOOKS
509 	IDTVEC(dtrace_ret), IDTVEC(dtrace_ret_pti),
510 #endif
511 #ifdef XENHVM
512 	IDTVEC(xen_intr_upcall), IDTVEC(xen_intr_upcall_pti),
513 #endif
514 	IDTVEC(fast_syscall), IDTVEC(fast_syscall32),
515 	IDTVEC(fast_syscall_pti);
516 
517 #ifdef DDB
518 /*
519  * Display the index and function name of any IDT entries that don't use
520  * the default 'rsvd' entry point.
521  */
DB_SHOW_COMMAND(idt,db_show_idt)522 DB_SHOW_COMMAND(idt, db_show_idt)
523 {
524 	struct gate_descriptor *ip;
525 	int idx;
526 	uintptr_t func;
527 
528 	ip = idt;
529 	for (idx = 0; idx < NIDT && !db_pager_quit; idx++) {
530 		func = ((long)ip->gd_hioffset << 16 | ip->gd_looffset);
531 		if (func != (uintptr_t)&IDTVEC(rsvd)) {
532 			db_printf("%3d\t", idx);
533 			db_printsym(func, DB_STGY_PROC);
534 			db_printf("\n");
535 		}
536 		ip++;
537 	}
538 }
539 
540 /* Show privileged registers. */
DB_SHOW_COMMAND(sysregs,db_show_sysregs)541 DB_SHOW_COMMAND(sysregs, db_show_sysregs)
542 {
543 	struct {
544 		uint16_t limit;
545 		uint64_t base;
546 	} __packed idtr, gdtr;
547 	uint16_t ldt, tr;
548 
549 	__asm __volatile("sidt %0" : "=m" (idtr));
550 	db_printf("idtr\t0x%016lx/%04x\n",
551 	    (u_long)idtr.base, (u_int)idtr.limit);
552 	__asm __volatile("sgdt %0" : "=m" (gdtr));
553 	db_printf("gdtr\t0x%016lx/%04x\n",
554 	    (u_long)gdtr.base, (u_int)gdtr.limit);
555 	__asm __volatile("sldt %0" : "=r" (ldt));
556 	db_printf("ldtr\t0x%04x\n", ldt);
557 	__asm __volatile("str %0" : "=r" (tr));
558 	db_printf("tr\t0x%04x\n", tr);
559 	db_printf("cr0\t0x%016lx\n", rcr0());
560 	db_printf("cr2\t0x%016lx\n", rcr2());
561 	db_printf("cr3\t0x%016lx\n", rcr3());
562 	db_printf("cr4\t0x%016lx\n", rcr4());
563 	if (rcr4() & CR4_XSAVE)
564 		db_printf("xcr0\t0x%016lx\n", rxcr(0));
565 	db_printf("EFER\t0x%016lx\n", rdmsr(MSR_EFER));
566 	if (cpu_feature2 & (CPUID2_VMX | CPUID2_SMX))
567 		db_printf("FEATURES_CTL\t%016lx\n",
568 		    rdmsr(MSR_IA32_FEATURE_CONTROL));
569 	db_printf("DEBUG_CTL\t0x%016lx\n", rdmsr(MSR_DEBUGCTLMSR));
570 	db_printf("PAT\t0x%016lx\n", rdmsr(MSR_PAT));
571 	db_printf("GSBASE\t0x%016lx\n", rdmsr(MSR_GSBASE));
572 }
573 
/* Show the debug registers %dr0-%dr3, %dr6 and %dr7. */
DB_SHOW_COMMAND(dbregs, db_show_dbregs)
{

	/* First the four registers dr0..dr3 ... */
	db_printf("dr0\t0x%016lx\n", rdr0());
	db_printf("dr1\t0x%016lx\n", rdr1());
	db_printf("dr2\t0x%016lx\n", rdr2());
	db_printf("dr3\t0x%016lx\n", rdr3());

	/* ... then dr6 and dr7. */
	db_printf("dr6\t0x%016lx\n", rdr6());
	db_printf("dr7\t0x%016lx\n", rdr7());
}
584 #endif
585 
586 void
sdtossd(struct user_segment_descriptor * sd,struct soft_segment_descriptor * ssd)587 sdtossd(struct user_segment_descriptor *sd, struct soft_segment_descriptor *ssd)
588 {
589 
590 	ssd->ssd_base  = (sd->sd_hibase << 24) | sd->sd_lobase;
591 	ssd->ssd_limit = (sd->sd_hilimit << 16) | sd->sd_lolimit;
592 	ssd->ssd_type  = sd->sd_type;
593 	ssd->ssd_dpl   = sd->sd_dpl;
594 	ssd->ssd_p     = sd->sd_p;
595 	ssd->ssd_long  = sd->sd_long;
596 	ssd->ssd_def32 = sd->sd_def32;
597 	ssd->ssd_gran  = sd->sd_gran;
598 }
599 
600 void
ssdtosd(struct soft_segment_descriptor * ssd,struct user_segment_descriptor * sd)601 ssdtosd(struct soft_segment_descriptor *ssd, struct user_segment_descriptor *sd)
602 {
603 
604 	sd->sd_lobase = (ssd->ssd_base) & 0xffffff;
605 	sd->sd_hibase = (ssd->ssd_base >> 24) & 0xff;
606 	sd->sd_lolimit = (ssd->ssd_limit) & 0xffff;
607 	sd->sd_hilimit = (ssd->ssd_limit >> 16) & 0xf;
608 	sd->sd_type  = ssd->ssd_type;
609 	sd->sd_dpl   = ssd->ssd_dpl;
610 	sd->sd_p     = ssd->ssd_p;
611 	sd->sd_long  = ssd->ssd_long;
612 	sd->sd_def32 = ssd->ssd_def32;
613 	sd->sd_gran  = ssd->ssd_gran;
614 }
615 
616 void
ssdtosyssd(struct soft_segment_descriptor * ssd,struct system_segment_descriptor * sd)617 ssdtosyssd(struct soft_segment_descriptor *ssd, struct system_segment_descriptor *sd)
618 {
619 
620 	sd->sd_lobase = (ssd->ssd_base) & 0xffffff;
621 	sd->sd_hibase = (ssd->ssd_base >> 24) & 0xfffffffffful;
622 	sd->sd_lolimit = (ssd->ssd_limit) & 0xffff;
623 	sd->sd_hilimit = (ssd->ssd_limit >> 16) & 0xf;
624 	sd->sd_type  = ssd->ssd_type;
625 	sd->sd_dpl   = ssd->ssd_dpl;
626 	sd->sd_p     = ssd->ssd_p;
627 	sd->sd_gran  = ssd->ssd_gran;
628 }
629 
630 u_int basemem;
631 
632 static int
add_physmap_entry(uint64_t base,uint64_t length,vm_paddr_t * physmap,int * physmap_idxp)633 add_physmap_entry(uint64_t base, uint64_t length, vm_paddr_t *physmap,
634     int *physmap_idxp)
635 {
636 	int i, insert_idx, physmap_idx;
637 
638 	physmap_idx = *physmap_idxp;
639 
640 	if (length == 0)
641 		return (1);
642 
643 	/*
644 	 * Find insertion point while checking for overlap.  Start off by
645 	 * assuming the new entry will be added to the end.
646 	 *
647 	 * NB: physmap_idx points to the next free slot.
648 	 */
649 	insert_idx = physmap_idx;
650 	for (i = 0; i <= physmap_idx; i += 2) {
651 		if (base < physmap[i + 1]) {
652 			if (base + length <= physmap[i]) {
653 				insert_idx = i;
654 				break;
655 			}
656 			if (boothowto & RB_VERBOSE)
657 				printf(
658 		    "Overlapping memory regions, ignoring second region\n");
659 			return (1);
660 		}
661 	}
662 
663 	/* See if we can prepend to the next entry. */
664 	if (insert_idx <= physmap_idx && base + length == physmap[insert_idx]) {
665 		physmap[insert_idx] = base;
666 		return (1);
667 	}
668 
669 	/* See if we can append to the previous entry. */
670 	if (insert_idx > 0 && base == physmap[insert_idx - 1]) {
671 		physmap[insert_idx - 1] += length;
672 		return (1);
673 	}
674 
675 	physmap_idx += 2;
676 	*physmap_idxp = physmap_idx;
677 	if (physmap_idx == PHYS_AVAIL_ENTRIES) {
678 		printf(
679 		"Too many segments in the physical address map, giving up\n");
680 		return (0);
681 	}
682 
683 	/*
684 	 * Move the last 'N' entries down to make room for the new
685 	 * entry if needed.
686 	 */
687 	for (i = (physmap_idx - 2); i > insert_idx; i -= 2) {
688 		physmap[i] = physmap[i - 2];
689 		physmap[i + 1] = physmap[i - 1];
690 	}
691 
692 	/* Insert the new entry. */
693 	physmap[insert_idx] = base;
694 	physmap[insert_idx + 1] = base + length;
695 	return (1);
696 }
697 
698 void
bios_add_smap_entries(struct bios_smap * smapbase,u_int32_t smapsize,vm_paddr_t * physmap,int * physmap_idx)699 bios_add_smap_entries(struct bios_smap *smapbase, u_int32_t smapsize,
700                       vm_paddr_t *physmap, int *physmap_idx)
701 {
702 	struct bios_smap *smap, *smapend;
703 
704 	smapend = (struct bios_smap *)((uintptr_t)smapbase + smapsize);
705 
706 	for (smap = smapbase; smap < smapend; smap++) {
707 		if (boothowto & RB_VERBOSE)
708 			printf("SMAP type=%02x base=%016lx len=%016lx\n",
709 			    smap->type, smap->base, smap->length);
710 
711 		if (smap->type != SMAP_TYPE_MEMORY)
712 			continue;
713 
714 		if (!add_physmap_entry(smap->base, smap->length, physmap,
715 		    physmap_idx))
716 			break;
717 	}
718 }
719 
720 static void
add_efi_map_entries(struct efi_map_header * efihdr,vm_paddr_t * physmap,int * physmap_idx)721 add_efi_map_entries(struct efi_map_header *efihdr, vm_paddr_t *physmap,
722     int *physmap_idx)
723 {
724 	struct efi_md *map, *p;
725 	const char *type;
726 	size_t efisz;
727 	int ndesc, i;
728 
729 	static const char *types[] = {
730 		"Reserved",
731 		"LoaderCode",
732 		"LoaderData",
733 		"BootServicesCode",
734 		"BootServicesData",
735 		"RuntimeServicesCode",
736 		"RuntimeServicesData",
737 		"ConventionalMemory",
738 		"UnusableMemory",
739 		"ACPIReclaimMemory",
740 		"ACPIMemoryNVS",
741 		"MemoryMappedIO",
742 		"MemoryMappedIOPortSpace",
743 		"PalCode",
744 		"PersistentMemory"
745 	};
746 
747 	/*
748 	 * Memory map data provided by UEFI via the GetMemoryMap
749 	 * Boot Services API.
750 	 */
751 	efisz = (sizeof(struct efi_map_header) + 0xf) & ~0xf;
752 	map = (struct efi_md *)((uint8_t *)efihdr + efisz);
753 
754 	if (efihdr->descriptor_size == 0)
755 		return;
756 	ndesc = efihdr->memory_size / efihdr->descriptor_size;
757 
758 	if (boothowto & RB_VERBOSE)
759 		printf("%23s %12s %12s %8s %4s\n",
760 		    "Type", "Physical", "Virtual", "#Pages", "Attr");
761 
762 	for (i = 0, p = map; i < ndesc; i++,
763 	    p = efi_next_descriptor(p, efihdr->descriptor_size)) {
764 		if (boothowto & RB_VERBOSE) {
765 			if (p->md_type < nitems(types))
766 				type = types[p->md_type];
767 			else
768 				type = "<INVALID>";
769 			printf("%23s %012lx %12p %08lx ", type, p->md_phys,
770 			    p->md_virt, p->md_pages);
771 			if (p->md_attr & EFI_MD_ATTR_UC)
772 				printf("UC ");
773 			if (p->md_attr & EFI_MD_ATTR_WC)
774 				printf("WC ");
775 			if (p->md_attr & EFI_MD_ATTR_WT)
776 				printf("WT ");
777 			if (p->md_attr & EFI_MD_ATTR_WB)
778 				printf("WB ");
779 			if (p->md_attr & EFI_MD_ATTR_UCE)
780 				printf("UCE ");
781 			if (p->md_attr & EFI_MD_ATTR_WP)
782 				printf("WP ");
783 			if (p->md_attr & EFI_MD_ATTR_RP)
784 				printf("RP ");
785 			if (p->md_attr & EFI_MD_ATTR_XP)
786 				printf("XP ");
787 			if (p->md_attr & EFI_MD_ATTR_NV)
788 				printf("NV ");
789 			if (p->md_attr & EFI_MD_ATTR_MORE_RELIABLE)
790 				printf("MORE_RELIABLE ");
791 			if (p->md_attr & EFI_MD_ATTR_RO)
792 				printf("RO ");
793 			if (p->md_attr & EFI_MD_ATTR_RT)
794 				printf("RUNTIME");
795 			printf("\n");
796 		}
797 
798 		switch (p->md_type) {
799 		case EFI_MD_TYPE_CODE:
800 		case EFI_MD_TYPE_DATA:
801 		case EFI_MD_TYPE_BS_CODE:
802 		case EFI_MD_TYPE_BS_DATA:
803 		case EFI_MD_TYPE_FREE:
804 			/*
805 			 * We're allowed to use any entry with these types.
806 			 */
807 			break;
808 		default:
809 			continue;
810 		}
811 
812 		if (!add_physmap_entry(p->md_phys, p->md_pages * EFI_PAGE_SIZE,
813 		    physmap, physmap_idx))
814 			break;
815 	}
816 }
817 
818 static char bootmethod[16] = "";
819 SYSCTL_STRING(_machdep, OID_AUTO, bootmethod, CTLFLAG_RD, bootmethod, 0,
820     "System firmware boot method");
821 
822 static void
native_parse_memmap(caddr_t kmdp,vm_paddr_t * physmap,int * physmap_idx)823 native_parse_memmap(caddr_t kmdp, vm_paddr_t *physmap, int *physmap_idx)
824 {
825 	struct bios_smap *smap;
826 	struct efi_map_header *efihdr;
827 	u_int32_t size;
828 
829 	/*
830 	 * Memory map from INT 15:E820.
831 	 *
832 	 * subr_module.c says:
833 	 * "Consumer may safely assume that size value precedes data."
834 	 * ie: an int32_t immediately precedes smap.
835 	 */
836 
837 	efihdr = (struct efi_map_header *)preload_search_info(kmdp,
838 	    MODINFO_METADATA | MODINFOMD_EFI_MAP);
839 	smap = (struct bios_smap *)preload_search_info(kmdp,
840 	    MODINFO_METADATA | MODINFOMD_SMAP);
841 	if (efihdr == NULL && smap == NULL)
842 		panic("No BIOS smap or EFI map info from loader!");
843 
844 	if (efihdr != NULL) {
845 		add_efi_map_entries(efihdr, physmap, physmap_idx);
846 		strlcpy(bootmethod, "UEFI", sizeof(bootmethod));
847 	} else {
848 		size = *((u_int32_t *)smap - 1);
849 		bios_add_smap_entries(smap, size, physmap, physmap_idx);
850 		strlcpy(bootmethod, "BIOS", sizeof(bootmethod));
851 	}
852 }
853 
854 #define	PAGES_PER_GB	(1024 * 1024 * 1024 / PAGE_SIZE)
855 
856 /*
857  * Populate the (physmap) array with base/bound pairs describing the
858  * available physical memory in the system, then test this memory and
859  * build the phys_avail array describing the actually-available memory.
860  *
861  * Total memory size may be set by the kernel environment variable
862  * hw.physmem or the compile-time define MAXMEM.
863  *
864  * XXX first should be vm_paddr_t.
865  */
866 static void
getmemsize(caddr_t kmdp,u_int64_t first)867 getmemsize(caddr_t kmdp, u_int64_t first)
868 {
869 	int i, physmap_idx, pa_indx, da_indx;
870 	vm_paddr_t pa, physmap[PHYS_AVAIL_ENTRIES];
871 	u_long physmem_start, physmem_tunable, memtest;
872 	pt_entry_t *pte;
873 	quad_t dcons_addr, dcons_size;
874 	int page_counter;
875 
876 	/*
877 	 * Tell the physical memory allocator about pages used to store
878 	 * the kernel and preloaded data.  See kmem_bootstrap_free().
879 	 */
880 	vm_phys_early_add_seg((vm_paddr_t)kernphys, trunc_page(first));
881 
882 	bzero(physmap, sizeof(physmap));
883 	physmap_idx = 0;
884 
885 	init_ops.parse_memmap(kmdp, physmap, &physmap_idx);
886 	physmap_idx -= 2;
887 
888 	/*
889 	 * Find the 'base memory' segment for SMP
890 	 */
891 	basemem = 0;
892 	for (i = 0; i <= physmap_idx; i += 2) {
893 		if (physmap[i] <= 0xA0000) {
894 			basemem = physmap[i + 1] / 1024;
895 			break;
896 		}
897 	}
898 	if (basemem == 0 || basemem > 640) {
899 		if (bootverbose)
900 			printf(
901 		"Memory map doesn't contain a basemem segment, faking it");
902 		basemem = 640;
903 	}
904 
905 	/*
906 	 * Maxmem isn't the "maximum memory", it's one larger than the
907 	 * highest page of the physical address space.  It should be
908 	 * called something like "Maxphyspage".  We may adjust this
909 	 * based on ``hw.physmem'' and the results of the memory test.
910 	 */
911 	Maxmem = atop(physmap[physmap_idx + 1]);
912 
913 #ifdef MAXMEM
914 	Maxmem = MAXMEM / 4;
915 #endif
916 
917 	if (TUNABLE_ULONG_FETCH("hw.physmem", &physmem_tunable))
918 		Maxmem = atop(physmem_tunable);
919 
920 	/*
921 	 * The boot memory test is disabled by default, as it takes a
922 	 * significant amount of time on large-memory systems, and is
923 	 * unfriendly to virtual machines as it unnecessarily touches all
924 	 * pages.
925 	 *
926 	 * A general name is used as the code may be extended to support
927 	 * additional tests beyond the current "page present" test.
928 	 */
929 	memtest = 0;
930 	TUNABLE_ULONG_FETCH("hw.memtest.tests", &memtest);
931 
932 	/*
933 	 * Don't allow MAXMEM or hw.physmem to extend the amount of memory
934 	 * in the system.
935 	 */
936 	if (Maxmem > atop(physmap[physmap_idx + 1]))
937 		Maxmem = atop(physmap[physmap_idx + 1]);
938 
939 	if (atop(physmap[physmap_idx + 1]) != Maxmem &&
940 	    (boothowto & RB_VERBOSE))
941 		printf("Physical memory use set to %ldK\n", Maxmem * 4);
942 
943 	/* call pmap initialization to make new kernel address space */
944 	pmap_bootstrap(&first);
945 
946 	/*
947 	 * Size up each available chunk of physical memory.
948 	 *
949 	 * XXX Some BIOSes corrupt low 64KB between suspend and resume.
950 	 * By default, mask off the first 16 pages unless we appear to be
951 	 * running in a VM.
952 	 */
953 	physmem_start = (vm_guest > VM_GUEST_NO ? 1 : 16) << PAGE_SHIFT;
954 	TUNABLE_ULONG_FETCH("hw.physmem.start", &physmem_start);
955 	if (physmap[0] < physmem_start) {
956 		if (physmem_start < PAGE_SIZE)
957 			physmap[0] = PAGE_SIZE;
958 		else if (physmem_start >= physmap[1])
959 			physmap[0] = round_page(physmap[1] - PAGE_SIZE);
960 		else
961 			physmap[0] = round_page(physmem_start);
962 	}
963 	pa_indx = 0;
964 	da_indx = 1;
965 	phys_avail[pa_indx++] = physmap[0];
966 	phys_avail[pa_indx] = physmap[0];
967 	dump_avail[da_indx] = physmap[0];
968 	pte = CMAP1;
969 
970 	/*
971 	 * Get dcons buffer address
972 	 */
973 	if (getenv_quad("dcons.addr", &dcons_addr) == 0 ||
974 	    getenv_quad("dcons.size", &dcons_size) == 0)
975 		dcons_addr = 0;
976 
977 	/*
978 	 * physmap is in bytes, so when converting to page boundaries,
979 	 * round up the start address and round down the end address.
980 	 */
981 	page_counter = 0;
982 	if (memtest != 0)
983 		printf("Testing system memory");
984 	for (i = 0; i <= physmap_idx; i += 2) {
985 		vm_paddr_t end;
986 
987 		end = ptoa((vm_paddr_t)Maxmem);
988 		if (physmap[i + 1] < end)
989 			end = trunc_page(physmap[i + 1]);
990 		for (pa = round_page(physmap[i]); pa < end; pa += PAGE_SIZE) {
991 			int *ptr = (int *)CADDR1;
992 			int tmp;
993 			bool full, page_bad;
994 
995 			full = false;
996 			/*
997 			 * block out kernel memory as not available.
998 			 */
999 			if (pa >= (vm_paddr_t)kernphys && pa < first)
1000 				goto do_dump_avail;
1001 
1002 			/*
1003 			 * block out dcons buffer
1004 			 */
1005 			if (dcons_addr > 0
1006 			    && pa >= trunc_page(dcons_addr)
1007 			    && pa < dcons_addr + dcons_size)
1008 				goto do_dump_avail;
1009 
1010 			page_bad = false;
1011 			if (memtest == 0)
1012 				goto skip_memtest;
1013 
1014 			/*
1015 			 * Print a "." every GB to show we're making
1016 			 * progress.
1017 			 */
1018 			page_counter++;
1019 			if ((page_counter % PAGES_PER_GB) == 0)
1020 				printf(".");
1021 
1022 			/*
1023 			 * map page into kernel: valid, read/write,non-cacheable
1024 			 */
1025 			*pte = pa | PG_V | PG_RW | PG_NC_PWT | PG_NC_PCD;
1026 			invltlb();
1027 
1028 			tmp = *(int *)ptr;
1029 			/*
1030 			 * Test for alternating 1's and 0's
1031 			 */
1032 			*(volatile int *)ptr = 0xaaaaaaaa;
1033 			if (*(volatile int *)ptr != 0xaaaaaaaa)
1034 				page_bad = true;
1035 			/*
1036 			 * Test for alternating 0's and 1's
1037 			 */
1038 			*(volatile int *)ptr = 0x55555555;
1039 			if (*(volatile int *)ptr != 0x55555555)
1040 				page_bad = true;
1041 			/*
1042 			 * Test for all 1's
1043 			 */
1044 			*(volatile int *)ptr = 0xffffffff;
1045 			if (*(volatile int *)ptr != 0xffffffff)
1046 				page_bad = true;
1047 			/*
1048 			 * Test for all 0's
1049 			 */
1050 			*(volatile int *)ptr = 0x0;
1051 			if (*(volatile int *)ptr != 0x0)
1052 				page_bad = true;
1053 			/*
1054 			 * Restore original value.
1055 			 */
1056 			*(int *)ptr = tmp;
1057 
1058 skip_memtest:
1059 			/*
1060 			 * Adjust array of valid/good pages.
1061 			 */
1062 			if (page_bad == true)
1063 				continue;
1064 			/*
1065 			 * If this good page is a continuation of the
1066 			 * previous set of good pages, then just increase
1067 			 * the end pointer. Otherwise start a new chunk.
1068 			 * Note that "end" points one higher than end,
1069 			 * making the range >= start and < end.
1070 			 * If we're also doing a speculative memory
1071 			 * test and we at or past the end, bump up Maxmem
1072 			 * so that we keep going. The first bad page
1073 			 * will terminate the loop.
1074 			 */
1075 			if (phys_avail[pa_indx] == pa) {
1076 				phys_avail[pa_indx] += PAGE_SIZE;
1077 			} else {
1078 				pa_indx++;
1079 				if (pa_indx == PHYS_AVAIL_ENTRIES) {
1080 					printf(
1081 		"Too many holes in the physical address space, giving up\n");
1082 					pa_indx--;
1083 					full = true;
1084 					goto do_dump_avail;
1085 				}
1086 				phys_avail[pa_indx++] = pa;	/* start */
1087 				phys_avail[pa_indx] = pa + PAGE_SIZE; /* end */
1088 			}
1089 			physmem++;
1090 do_dump_avail:
1091 			if (dump_avail[da_indx] == pa) {
1092 				dump_avail[da_indx] += PAGE_SIZE;
1093 			} else {
1094 				da_indx++;
1095 				if (da_indx == PHYS_AVAIL_ENTRIES) {
1096 					da_indx--;
1097 					goto do_next;
1098 				}
1099 				dump_avail[da_indx++] = pa; /* start */
1100 				dump_avail[da_indx] = pa + PAGE_SIZE; /* end */
1101 			}
1102 do_next:
1103 			if (full)
1104 				break;
1105 		}
1106 	}
1107 	*pte = 0;
1108 	invltlb();
1109 	if (memtest != 0)
1110 		printf("\n");
1111 
1112 	/*
1113 	 * XXX
1114 	 * The last chunk must contain at least one page plus the message
1115 	 * buffer to avoid complicating other code (message buffer address
1116 	 * calculation, etc.).
1117 	 */
1118 	while (phys_avail[pa_indx - 1] + PAGE_SIZE +
1119 	    round_page(msgbufsize) >= phys_avail[pa_indx]) {
1120 		physmem -= atop(phys_avail[pa_indx] - phys_avail[pa_indx - 1]);
1121 		phys_avail[pa_indx--] = 0;
1122 		phys_avail[pa_indx--] = 0;
1123 	}
1124 
1125 	Maxmem = atop(phys_avail[pa_indx]);
1126 
1127 	/* Trim off space for the message buffer. */
1128 	phys_avail[pa_indx] -= round_page(msgbufsize);
1129 
1130 	/* Map the message buffer. */
1131 	msgbufp = (struct msgbuf *)PHYS_TO_DMAP(phys_avail[pa_indx]);
1132 }
1133 
1134 static caddr_t
native_parse_preload_data(u_int64_t modulep)1135 native_parse_preload_data(u_int64_t modulep)
1136 {
1137 	caddr_t kmdp;
1138 	char *envp;
1139 #ifdef DDB
1140 	vm_offset_t ksym_start;
1141 	vm_offset_t ksym_end;
1142 #endif
1143 
1144 	preload_metadata = (caddr_t)(uintptr_t)(modulep + KERNBASE);
1145 	preload_bootstrap_relocate(KERNBASE);
1146 	kmdp = preload_search_by_type("elf kernel");
1147 	if (kmdp == NULL)
1148 		kmdp = preload_search_by_type("elf64 kernel");
1149 	boothowto = MD_FETCH(kmdp, MODINFOMD_HOWTO, int);
1150 	envp = MD_FETCH(kmdp, MODINFOMD_ENVP, char *);
1151 	if (envp != NULL)
1152 		envp += KERNBASE;
1153 	init_static_kenv(envp, 0);
1154 #ifdef DDB
1155 	ksym_start = MD_FETCH(kmdp, MODINFOMD_SSYM, uintptr_t);
1156 	ksym_end = MD_FETCH(kmdp, MODINFOMD_ESYM, uintptr_t);
1157 	db_fetch_ksymtab(ksym_start, ksym_end, 0);
1158 #endif
1159 	efi_systbl_phys = MD_FETCH(kmdp, MODINFOMD_FW_HANDLE, vm_paddr_t);
1160 
1161 	return (kmdp);
1162 }
1163 
/*
 * Native clock source setup: initialize the i8254 first, then the TSC.
 */
static void
native_clock_source_init(void)
{

	i8254_init();
	tsc_init();
}
1170 
/*
 * Initialize the kernel debugger framework and, when KDB is compiled in
 * and the boot flags carry RB_KDB, drop into the debugger right away.
 */
static void
amd64_kdb_init(void)
{

	kdb_init();
#ifdef KDB
	if ((boothowto & RB_KDB) != 0) {
		kdb_enter(KDB_WHY_BOOTFLAGS, "Boot flags requested debugger");
	}
#endif
}
1180 
1181 /* Set up the fast syscall stuff */
1182 void
amd64_conf_fast_syscall(void)1183 amd64_conf_fast_syscall(void)
1184 {
1185 	uint64_t msr;
1186 
1187 	msr = rdmsr(MSR_EFER) | EFER_SCE;
1188 	wrmsr(MSR_EFER, msr);
1189 	wrmsr(MSR_LSTAR, pti ? (u_int64_t)IDTVEC(fast_syscall_pti) :
1190 	    (u_int64_t)IDTVEC(fast_syscall));
1191 	wrmsr(MSR_CSTAR, (u_int64_t)IDTVEC(fast_syscall32));
1192 	msr = ((u_int64_t)GSEL(GCODE_SEL, SEL_KPL) << 32) |
1193 	    ((u_int64_t)GSEL(GUCODE32_SEL, SEL_UPL) << 48);
1194 	wrmsr(MSR_STAR, msr);
1195 	wrmsr(MSR_SF_MASK, PSL_NT | PSL_T | PSL_I | PSL_C | PSL_D | PSL_AC);
1196 }
1197 
1198 void
amd64_bsp_pcpu_init1(struct pcpu * pc)1199 amd64_bsp_pcpu_init1(struct pcpu *pc)
1200 {
1201 	struct user_segment_descriptor *gdt;
1202 
1203 	PCPU_SET(prvspace, pc);
1204 	gdt = *PCPU_PTR(gdt);
1205 	PCPU_SET(curthread, &thread0);
1206 	PCPU_SET(tssp, PCPU_PTR(common_tss));
1207 	PCPU_SET(tss, (struct system_segment_descriptor *)&gdt[GPROC0_SEL]);
1208 	PCPU_SET(ldt, (struct system_segment_descriptor *)&gdt[GUSERLDT_SEL]);
1209 	PCPU_SET(fs32p, &gdt[GUFS32_SEL]);
1210 	PCPU_SET(gs32p, &gdt[GUGS32_SEL]);
1211 	PCPU_SET(ucr3_load_mask, PMAP_UCR3_NOMASK);
1212 	PCPU_SET(smp_tlb_gen, 1);
1213 }
1214 
1215 void
amd64_bsp_pcpu_init2(uint64_t rsp0)1216 amd64_bsp_pcpu_init2(uint64_t rsp0)
1217 {
1218 
1219 	PCPU_SET(rsp0, rsp0);
1220 	PCPU_SET(pti_rsp0, ((vm_offset_t)PCPU_PTR(pti_stack) +
1221 	    PC_PTI_STACK_SZ * sizeof(uint64_t)) & ~0xful);
1222 	PCPU_SET(curpcb, thread0.td_pcb);
1223 }
1224 
1225 void
amd64_bsp_ist_init(struct pcpu * pc)1226 amd64_bsp_ist_init(struct pcpu *pc)
1227 {
1228 	struct nmi_pcpu *np;
1229 	struct amd64tss *tssp;
1230 
1231 	tssp = &pc->pc_common_tss;
1232 
1233 	/* doublefault stack space, runs on ist1 */
1234 	np = ((struct nmi_pcpu *)&dblfault_stack[sizeof(dblfault_stack)]) - 1;
1235 	np->np_pcpu = (register_t)pc;
1236 	tssp->tss_ist1 = (long)np;
1237 
1238 	/*
1239 	 * NMI stack, runs on ist2.  The pcpu pointer is stored just
1240 	 * above the start of the ist2 stack.
1241 	 */
1242 	np = ((struct nmi_pcpu *)&nmi0_stack[sizeof(nmi0_stack)]) - 1;
1243 	np->np_pcpu = (register_t)pc;
1244 	tssp->tss_ist2 = (long)np;
1245 
1246 	/*
1247 	 * MC# stack, runs on ist3.  The pcpu pointer is stored just
1248 	 * above the start of the ist3 stack.
1249 	 */
1250 	np = ((struct nmi_pcpu *)&mce0_stack[sizeof(mce0_stack)]) - 1;
1251 	np->np_pcpu = (register_t)pc;
1252 	tssp->tss_ist3 = (long)np;
1253 
1254 	/*
1255 	 * DB# stack, runs on ist4.
1256 	 */
1257 	np = ((struct nmi_pcpu *)&dbg0_stack[sizeof(dbg0_stack)]) - 1;
1258 	np->np_pcpu = (register_t)pc;
1259 	tssp->tss_ist4 = (long)np;
1260 }
1261 
1262 u_int64_t
hammer_time(u_int64_t modulep,u_int64_t physfree)1263 hammer_time(u_int64_t modulep, u_int64_t physfree)
1264 {
1265 	caddr_t kmdp;
1266 	int gsel_tss, x;
1267 	struct pcpu *pc;
1268 	uint64_t cr3, rsp0;
1269 	pml4_entry_t *pml4e;
1270 	pdp_entry_t *pdpe;
1271 	pd_entry_t *pde;
1272 	char *env;
1273 	struct user_segment_descriptor *gdt;
1274 	struct region_descriptor r_gdt;
1275 	size_t kstack0_sz;
1276 	int late_console;
1277 
1278 	TSRAW(&thread0, TS_ENTER, __func__, NULL);
1279 
1280 	/*
1281 	 * Calculate kernphys by inspecting page table created by loader.
1282 	 * The assumptions:
1283 	 * - kernel is mapped at KERNBASE, backed by contiguous phys memory
1284 	 *   aligned at 2M, below 4G (the latter is important for AP startup)
1285 	 * - there is a 2M hole at KERNBASE
1286 	 * - kernel is mapped with 2M superpages
1287 	 * - all participating memory, i.e. kernel, modules, metadata,
1288 	 *   page table is accessible by pre-created 1:1 mapping
1289 	 *   (right now loader creates 1:1 mapping for lower 4G, and all
1290 	 *   memory is from there)
1291 	 * - there is a usable memory block right after the end of the
1292 	 *   mapped kernel and all modules/metadata, pointed to by
1293 	 *   physfree, for early allocations
1294 	 */
1295 	cr3 = rcr3();
1296 	pml4e = (pml4_entry_t *)(cr3 & ~PAGE_MASK) + pmap_pml4e_index(
1297 	    (vm_offset_t)hammer_time);
1298 	pdpe = (pdp_entry_t *)(*pml4e & ~PAGE_MASK) + pmap_pdpe_index(
1299 	    (vm_offset_t)hammer_time);
1300 	pde = (pd_entry_t *)(*pdpe & ~PAGE_MASK) + pmap_pde_index(
1301 	    (vm_offset_t)hammer_time);
1302 	kernphys = (vm_paddr_t)(*pde & ~PDRMASK) -
1303 	    (vm_paddr_t)(((vm_offset_t)hammer_time - KERNBASE) & ~PDRMASK);
1304 
1305 	/* Fix-up for 2M hole */
1306 	physfree += kernphys;
1307 	kernphys += NBPDR;
1308 
1309 	kmdp = init_ops.parse_preload_data(modulep);
1310 
1311 	efi_boot = preload_search_info(kmdp, MODINFO_METADATA |
1312 	    MODINFOMD_EFI_MAP) != NULL;
1313 
1314 	if (!efi_boot) {
1315 		/* Tell the bios to warmboot next time */
1316 		atomic_store_short((u_short *)0x472, 0x1234);
1317 	}
1318 
1319 	physfree += ucode_load_bsp(physfree - kernphys + KERNSTART);
1320 	physfree = roundup2(physfree, PAGE_SIZE);
1321 
1322 	identify_cpu1();
1323 	identify_hypervisor();
1324 	identify_cpu_fixup_bsp();
1325 	identify_cpu2();
1326 	initializecpucache();
1327 
1328 	/*
1329 	 * Check for pti, pcid, and invpcid before ifuncs are
1330 	 * resolved, to correctly select the implementation for
1331 	 * pmap_activate_sw_mode().
1332 	 */
1333 	pti = pti_get_default();
1334 	TUNABLE_INT_FETCH("vm.pmap.pti", &pti);
1335 	TUNABLE_INT_FETCH("vm.pmap.pcid_enabled", &pmap_pcid_enabled);
1336 	if ((cpu_feature2 & CPUID2_PCID) != 0 && pmap_pcid_enabled) {
1337 		invpcid_works = (cpu_stdext_feature &
1338 		    CPUID_STDEXT_INVPCID) != 0;
1339 	} else {
1340 		pmap_pcid_enabled = 0;
1341 	}
1342 
1343 	/*
1344 	 * Now we can do small core initialization, after the PCID
1345 	 * CPU features and user knobs are evaluated.
1346 	 */
1347 	TUNABLE_INT_FETCH("vm.pmap.pcid_invlpg_workaround",
1348 	    &pmap_pcid_invlpg_workaround_uena);
1349 	cpu_init_small_core();
1350 
1351 	if ((cpu_feature2 & CPUID2_XSAVE) != 0) {
1352 		use_xsave = 1;
1353 		TUNABLE_INT_FETCH("hw.use_xsave", &use_xsave);
1354 	}
1355 
1356 	link_elf_ireloc(kmdp);
1357 
1358 	/*
1359 	 * This may be done better later if it gets more high level
1360 	 * components in it. If so just link td->td_proc here.
1361 	 */
1362 	proc_linkup0(&proc0, &thread0);
1363 
1364 	/* Init basic tunables, hz etc */
1365 	init_param1();
1366 
1367 	thread0.td_kstack = physfree - kernphys + KERNSTART;
1368 	thread0.td_kstack_pages = kstack_pages;
1369 	kstack0_sz = thread0.td_kstack_pages * PAGE_SIZE;
1370 	bzero((void *)thread0.td_kstack, kstack0_sz);
1371 	physfree += kstack0_sz;
1372 
1373 	/*
1374 	 * Initialize enough of thread0 for delayed invalidation to
1375 	 * work very early.  Rely on thread0.td_base_pri
1376 	 * zero-initialization, it is reset to PVM at proc0_init().
1377 	 */
1378 	pmap_thread_init_invl_gen(&thread0);
1379 
1380 	pc = &temp_bsp_pcpu;
1381 	pcpu_init(pc, 0, sizeof(struct pcpu));
1382 	gdt = &temp_bsp_pcpu.pc_gdt[0];
1383 
1384 	/*
1385 	 * make gdt memory segments
1386 	 */
1387 	for (x = 0; x < NGDT; x++) {
1388 		if (x != GPROC0_SEL && x != (GPROC0_SEL + 1) &&
1389 		    x != GUSERLDT_SEL && x != (GUSERLDT_SEL) + 1)
1390 			ssdtosd(&gdt_segs[x], &gdt[x]);
1391 	}
1392 	gdt_segs[GPROC0_SEL].ssd_base = (uintptr_t)&pc->pc_common_tss;
1393 	ssdtosyssd(&gdt_segs[GPROC0_SEL],
1394 	    (struct system_segment_descriptor *)&gdt[GPROC0_SEL]);
1395 
1396 	r_gdt.rd_limit = NGDT * sizeof(gdt[0]) - 1;
1397 	r_gdt.rd_base = (long)gdt;
1398 	lgdt(&r_gdt);
1399 
1400 	wrmsr(MSR_FSBASE, 0);		/* User value */
1401 	wrmsr(MSR_GSBASE, (u_int64_t)pc);
1402 	wrmsr(MSR_KGSBASE, 0);		/* User value while in the kernel */
1403 
1404 	dpcpu_init((void *)(physfree - kernphys + KERNSTART), 0);
1405 	physfree += DPCPU_SIZE;
1406 	amd64_bsp_pcpu_init1(pc);
1407 	/* Non-late cninit() and printf() can be moved up to here. */
1408 
1409 	/*
1410 	 * Initialize mutexes.
1411 	 *
1412 	 * icu_lock: in order to allow an interrupt to occur in a critical
1413 	 * 	     section, to set pcpu->ipending (etc...) properly, we
1414 	 *	     must be able to get the icu lock, so it can't be
1415 	 *	     under witness.
1416 	 */
1417 	mutex_init();
1418 	mtx_init(&icu_lock, "icu", NULL, MTX_SPIN | MTX_NOWITNESS);
1419 	mtx_init(&dt_lock, "descriptor tables", NULL, MTX_DEF);
1420 
1421 	/* exceptions */
1422 	for (x = 0; x < NIDT; x++)
1423 		setidt(x, pti ? &IDTVEC(rsvd_pti) : &IDTVEC(rsvd), SDT_SYSIGT,
1424 		    SEL_KPL, 0);
1425 	setidt(IDT_DE, pti ? &IDTVEC(div_pti) : &IDTVEC(div), SDT_SYSIGT,
1426 	    SEL_KPL, 0);
1427 	setidt(IDT_DB, &IDTVEC(dbg), SDT_SYSIGT, SEL_KPL, 4);
1428 	setidt(IDT_NMI, &IDTVEC(nmi),  SDT_SYSIGT, SEL_KPL, 2);
1429 	setidt(IDT_BP, pti ? &IDTVEC(bpt_pti) : &IDTVEC(bpt), SDT_SYSIGT,
1430 	    SEL_UPL, 0);
1431 	setidt(IDT_OF, pti ? &IDTVEC(ofl_pti) : &IDTVEC(ofl), SDT_SYSIGT,
1432 	    SEL_UPL, 0);
1433 	setidt(IDT_BR, pti ? &IDTVEC(bnd_pti) : &IDTVEC(bnd), SDT_SYSIGT,
1434 	    SEL_KPL, 0);
1435 	setidt(IDT_UD, pti ? &IDTVEC(ill_pti) : &IDTVEC(ill), SDT_SYSIGT,
1436 	    SEL_KPL, 0);
1437 	setidt(IDT_NM, pti ? &IDTVEC(dna_pti) : &IDTVEC(dna), SDT_SYSIGT,
1438 	    SEL_KPL, 0);
1439 	setidt(IDT_DF, &IDTVEC(dblfault), SDT_SYSIGT, SEL_KPL, 1);
1440 	setidt(IDT_FPUGP, pti ? &IDTVEC(fpusegm_pti) : &IDTVEC(fpusegm),
1441 	    SDT_SYSIGT, SEL_KPL, 0);
1442 	setidt(IDT_TS, pti ? &IDTVEC(tss_pti) : &IDTVEC(tss), SDT_SYSIGT,
1443 	    SEL_KPL, 0);
1444 	setidt(IDT_NP, pti ? &IDTVEC(missing_pti) : &IDTVEC(missing),
1445 	    SDT_SYSIGT, SEL_KPL, 0);
1446 	setidt(IDT_SS, pti ? &IDTVEC(stk_pti) : &IDTVEC(stk), SDT_SYSIGT,
1447 	    SEL_KPL, 0);
1448 	setidt(IDT_GP, pti ? &IDTVEC(prot_pti) : &IDTVEC(prot), SDT_SYSIGT,
1449 	    SEL_KPL, 0);
1450 	setidt(IDT_PF, pti ? &IDTVEC(page_pti) : &IDTVEC(page), SDT_SYSIGT,
1451 	    SEL_KPL, 0);
1452 	setidt(IDT_MF, pti ? &IDTVEC(fpu_pti) : &IDTVEC(fpu), SDT_SYSIGT,
1453 	    SEL_KPL, 0);
1454 	setidt(IDT_AC, pti ? &IDTVEC(align_pti) : &IDTVEC(align), SDT_SYSIGT,
1455 	    SEL_KPL, 0);
1456 	setidt(IDT_MC, &IDTVEC(mchk), SDT_SYSIGT, SEL_KPL, 3);
1457 	setidt(IDT_XF, pti ? &IDTVEC(xmm_pti) : &IDTVEC(xmm), SDT_SYSIGT,
1458 	    SEL_KPL, 0);
1459 #ifdef KDTRACE_HOOKS
1460 	setidt(IDT_DTRACE_RET, pti ? &IDTVEC(dtrace_ret_pti) :
1461 	    &IDTVEC(dtrace_ret), SDT_SYSIGT, SEL_UPL, 0);
1462 #endif
1463 #ifdef XENHVM
1464 	setidt(IDT_EVTCHN, pti ? &IDTVEC(xen_intr_upcall_pti) :
1465 	    &IDTVEC(xen_intr_upcall), SDT_SYSIGT, SEL_KPL, 0);
1466 #endif
1467 	r_idt.rd_limit = sizeof(idt0) - 1;
1468 	r_idt.rd_base = (long) idt;
1469 	lidt(&r_idt);
1470 
1471 	/*
1472 	 * Use vt(4) by default for UEFI boot (during the sc(4)/vt(4)
1473 	 * transition).
1474 	 * Once bootblocks have updated, we can test directly for
1475 	 * efi_systbl != NULL here...
1476 	 */
1477 	if (efi_boot)
1478 		vty_set_preferred(VTY_VT);
1479 
1480 	TUNABLE_INT_FETCH("hw.ibrs_disable", &hw_ibrs_disable);
1481 	TUNABLE_INT_FETCH("machdep.mitigations.ibrs.disable", &hw_ibrs_disable);
1482 
1483 	TUNABLE_INT_FETCH("hw.spec_store_bypass_disable", &hw_ssb_disable);
1484 	TUNABLE_INT_FETCH("machdep.mitigations.ssb.disable", &hw_ssb_disable);
1485 
1486 	TUNABLE_INT_FETCH("machdep.syscall_ret_flush_l1d",
1487 	    &syscall_ret_l1d_flush_mode);
1488 
1489 	TUNABLE_INT_FETCH("hw.mds_disable", &hw_mds_disable);
1490 	TUNABLE_INT_FETCH("machdep.mitigations.mds.disable", &hw_mds_disable);
1491 
1492 	TUNABLE_INT_FETCH("machdep.mitigations.taa.enable", &x86_taa_enable);
1493 
1494 	TUNABLE_INT_FETCH("machdep.mitigations.rngds.enable",
1495 	    &x86_rngds_mitg_enable);
1496 
1497 	TUNABLE_INT_FETCH("machdep.mitigations.zenbleed.enable",
1498 	    &zenbleed_enable);
1499 	zenbleed_sanitize_enable();
1500 
1501 	finishidentcpu();	/* Final stage of CPU initialization */
1502 
1503 	/*
1504 	 * Initialize the clock before the console so that console
1505 	 * initialization can use DELAY().
1506 	 */
1507 	clock_init();
1508 
1509 	initializecpu();	/* Initialize CPU registers */
1510 
1511 	amd64_bsp_ist_init(pc);
1512 
1513 	/* Set the IO permission bitmap (empty due to tss seg limit) */
1514 	pc->pc_common_tss.tss_iobase = sizeof(struct amd64tss) +
1515 	    IOPERM_BITMAP_SIZE;
1516 
1517 	gsel_tss = GSEL(GPROC0_SEL, SEL_KPL);
1518 	ltr(gsel_tss);
1519 
1520 	amd64_conf_fast_syscall();
1521 
1522 	/*
1523 	 * We initialize the PCB pointer early so that exception
1524 	 * handlers will work.  Also set up td_critnest to short-cut
1525 	 * the page fault handler.
1526 	 */
1527 	cpu_max_ext_state_size = sizeof(struct savefpu);
1528 	set_top_of_stack_td(&thread0);
1529 	thread0.td_pcb = get_pcb_td(&thread0);
1530 	thread0.td_critnest = 1;
1531 
1532 	/*
1533 	 * The console and kdb should be initialized even earlier than here,
1534 	 * but some console drivers don't work until after getmemsize().
1535 	 * Default to late console initialization to support these drivers.
1536 	 * This loses mainly printf()s in getmemsize() and early debugging.
1537 	 */
1538 	late_console = 1;
1539 	TUNABLE_INT_FETCH("debug.late_console", &late_console);
1540 	if (!late_console) {
1541 		cninit();
1542 		amd64_kdb_init();
1543 	}
1544 
1545 	getmemsize(kmdp, physfree);
1546 	init_param2(physmem);
1547 
1548 	/* now running on new page tables, configured,and u/iom is accessible */
1549 
1550 #ifdef DEV_PCI
1551         /* This call might adjust phys_avail[]. */
1552         pci_early_quirks();
1553 #endif
1554 
1555 	if (late_console)
1556 		cninit();
1557 
1558 	/*
1559 	 * Dump the boot metadata. We have to wait for cninit() since console
1560 	 * output is required. If it's grossly incorrect the kernel will never
1561 	 * make it this far.
1562 	 */
1563 	if (getenv_is_true("debug.dump_modinfo_at_boot"))
1564 		preload_dump();
1565 
1566 #ifdef DEV_ISA
1567 #ifdef DEV_ATPIC
1568 	elcr_probe();
1569 	atpic_startup();
1570 #else
1571 	/* Reset and mask the atpics and leave them shut down. */
1572 	atpic_reset();
1573 
1574 	/*
1575 	 * Point the ICU spurious interrupt vectors at the APIC spurious
1576 	 * interrupt handler.
1577 	 */
1578 	setidt(IDT_IO_INTS + 7, IDTVEC(spuriousint), SDT_SYSIGT, SEL_KPL, 0);
1579 	setidt(IDT_IO_INTS + 15, IDTVEC(spuriousint), SDT_SYSIGT, SEL_KPL, 0);
1580 #endif
1581 #else
1582 #error "have you forgotten the isa device?"
1583 #endif
1584 
1585 	if (late_console)
1586 		amd64_kdb_init();
1587 
1588 	msgbufinit(msgbufp, msgbufsize);
1589 	fpuinit();
1590 
1591 	/* make an initial tss so cpu can get interrupt stack on syscall! */
1592 	rsp0 = thread0.td_md.md_stack_base;
1593 	/* Ensure the stack is aligned to 16 bytes */
1594 	rsp0 &= ~0xFul;
1595 	PCPU_PTR(common_tss)->tss_rsp0 = rsp0;
1596 	amd64_bsp_pcpu_init2(rsp0);
1597 
1598 	/* transfer to user mode */
1599 
1600 	_ucodesel = GSEL(GUCODE_SEL, SEL_UPL);
1601 	_udatasel = GSEL(GUDATA_SEL, SEL_UPL);
1602 	_ucode32sel = GSEL(GUCODE32_SEL, SEL_UPL);
1603 	_ufssel = GSEL(GUFS32_SEL, SEL_UPL);
1604 	_ugssel = GSEL(GUGS32_SEL, SEL_UPL);
1605 
1606 	load_ds(_udatasel);
1607 	load_es(_udatasel);
1608 	load_fs(_ufssel);
1609 
1610 	/* setup proc 0's pcb */
1611 	thread0.td_pcb->pcb_flags = 0;
1612 	thread0.td_frame = &proc0_tf;
1613 
1614         env = kern_getenv("kernelname");
1615 	if (env != NULL)
1616 		strlcpy(kernelname, env, sizeof(kernelname));
1617 
1618 	kcsan_cpu_init(0);
1619 
1620 #ifdef FDT
1621 	x86_init_fdt();
1622 #endif
1623 	thread0.td_critnest = 0;
1624 
1625 	kasan_init();
1626 
1627 	TSEXIT();
1628 
1629 	/* Location of kernel stack for locore */
1630 	return (thread0.td_md.md_stack_base);
1631 }
1632 
1633 void
cpu_pcpu_init(struct pcpu * pcpu,int cpuid,size_t size)1634 cpu_pcpu_init(struct pcpu *pcpu, int cpuid, size_t size)
1635 {
1636 
1637 	pcpu->pc_acpi_id = 0xffffffff;
1638 }
1639 
1640 static int
smap_sysctl_handler(SYSCTL_HANDLER_ARGS)1641 smap_sysctl_handler(SYSCTL_HANDLER_ARGS)
1642 {
1643 	struct bios_smap *smapbase;
1644 	struct bios_smap_xattr smap;
1645 	caddr_t kmdp;
1646 	uint32_t *smapattr;
1647 	int count, error, i;
1648 
1649 	/* Retrieve the system memory map from the loader. */
1650 	kmdp = preload_search_by_type("elf kernel");
1651 	if (kmdp == NULL)
1652 		kmdp = preload_search_by_type("elf64 kernel");
1653 	smapbase = (struct bios_smap *)preload_search_info(kmdp,
1654 	    MODINFO_METADATA | MODINFOMD_SMAP);
1655 	if (smapbase == NULL)
1656 		return (0);
1657 	smapattr = (uint32_t *)preload_search_info(kmdp,
1658 	    MODINFO_METADATA | MODINFOMD_SMAP_XATTR);
1659 	count = *((uint32_t *)smapbase - 1) / sizeof(*smapbase);
1660 	error = 0;
1661 	for (i = 0; i < count; i++) {
1662 		smap.base = smapbase[i].base;
1663 		smap.length = smapbase[i].length;
1664 		smap.type = smapbase[i].type;
1665 		if (smapattr != NULL)
1666 			smap.xattr = smapattr[i];
1667 		else
1668 			smap.xattr = 0;
1669 		error = SYSCTL_OUT(req, &smap, sizeof(smap));
1670 	}
1671 	return (error);
1672 }
1673 SYSCTL_PROC(_machdep, OID_AUTO, smap,
1674     CTLTYPE_OPAQUE | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
1675     smap_sysctl_handler, "S,bios_smap_xattr",
1676     "Raw BIOS SMAP data");
1677 
1678 static int
efi_map_sysctl_handler(SYSCTL_HANDLER_ARGS)1679 efi_map_sysctl_handler(SYSCTL_HANDLER_ARGS)
1680 {
1681 	struct efi_map_header *efihdr;
1682 	caddr_t kmdp;
1683 	uint32_t efisize;
1684 
1685 	kmdp = preload_search_by_type("elf kernel");
1686 	if (kmdp == NULL)
1687 		kmdp = preload_search_by_type("elf64 kernel");
1688 	efihdr = (struct efi_map_header *)preload_search_info(kmdp,
1689 	    MODINFO_METADATA | MODINFOMD_EFI_MAP);
1690 	if (efihdr == NULL)
1691 		return (0);
1692 	efisize = *((uint32_t *)efihdr - 1);
1693 	return (SYSCTL_OUT(req, efihdr, efisize));
1694 }
1695 SYSCTL_PROC(_machdep, OID_AUTO, efi_map,
1696     CTLTYPE_OPAQUE | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
1697     efi_map_sysctl_handler, "S,efi_map_header",
1698     "Raw EFI Memory Map");
1699 
1700 void
spinlock_enter(void)1701 spinlock_enter(void)
1702 {
1703 	struct thread *td;
1704 	register_t flags;
1705 
1706 	td = curthread;
1707 	if (td->td_md.md_spinlock_count == 0) {
1708 		flags = intr_disable();
1709 		td->td_md.md_spinlock_count = 1;
1710 		td->td_md.md_saved_flags = flags;
1711 		critical_enter();
1712 	} else
1713 		td->td_md.md_spinlock_count++;
1714 }
1715 
1716 void
spinlock_exit(void)1717 spinlock_exit(void)
1718 {
1719 	struct thread *td;
1720 	register_t flags;
1721 
1722 	td = curthread;
1723 	flags = td->td_md.md_saved_flags;
1724 	td->td_md.md_spinlock_count--;
1725 	if (td->td_md.md_spinlock_count == 0) {
1726 		critical_exit();
1727 		intr_restore(flags);
1728 	}
1729 }
1730 
1731 /*
1732  * Construct a PCB from a trapframe. This is called from kdb_trap() where
1733  * we want to start a backtrace from the function that caused us to enter
1734  * the debugger. We have the context in the trapframe, but base the trace
1735  * on the PCB. The PCB doesn't have to be perfect, as long as it contains
1736  * enough for a backtrace.
1737  */
1738 void
makectx(struct trapframe * tf,struct pcb * pcb)1739 makectx(struct trapframe *tf, struct pcb *pcb)
1740 {
1741 
1742 	pcb->pcb_r12 = tf->tf_r12;
1743 	pcb->pcb_r13 = tf->tf_r13;
1744 	pcb->pcb_r14 = tf->tf_r14;
1745 	pcb->pcb_r15 = tf->tf_r15;
1746 	pcb->pcb_rbp = tf->tf_rbp;
1747 	pcb->pcb_rbx = tf->tf_rbx;
1748 	pcb->pcb_rip = tf->tf_rip;
1749 	pcb->pcb_rsp = tf->tf_rsp;
1750 }
1751 
1752 /*
1753  * The pcb_flags is only modified by current thread, or by other threads
1754  * when current thread is stopped.  However, current thread may change it
1755  * from the interrupt context in cpu_switch(), or in the trap handler.
1756  * When we read-modify-write pcb_flags from C sources, compiler may generate
1757  * code that is not atomic regarding the interrupt handler.  If a trap or
1758  * interrupt happens and any flag is modified from the handler, it can be
1759  * clobbered with the cached value later.  Therefore, we implement setting
1760  * and clearing flags with single-instruction functions, which do not race
1761  * with possible modification of the flags from the trap or interrupt context,
1762  * because traps and interrupts are executed only on instruction boundary.
1763  */
1764 void
set_pcb_flags_raw(struct pcb * pcb,const u_int flags)1765 set_pcb_flags_raw(struct pcb *pcb, const u_int flags)
1766 {
1767 
1768 	__asm __volatile("orl %1,%0"
1769 	    : "=m" (pcb->pcb_flags) : "ir" (flags), "m" (pcb->pcb_flags)
1770 	    : "cc", "memory");
1771 
1772 }
1773 
1774 /*
1775  * The support for RDFSBASE, WRFSBASE and similar instructions for %gs
1776  * base requires that kernel saves MSR_FSBASE and MSR_{K,}GSBASE into
1777  * pcb if user space modified the bases.  We must save on the context
1778  * switch or if the return to usermode happens through the doreti.
1779  *
1780  * Tracking of both events is performed by the pcb flag PCB_FULL_IRET,
1781  * which have a consequence that the base MSRs must be saved each time
1782  * the PCB_FULL_IRET flag is set.  We disable interrupts to sync with
1783  * context switches.
1784  */
1785 static void
set_pcb_flags_fsgsbase(struct pcb * pcb,const u_int flags)1786 set_pcb_flags_fsgsbase(struct pcb *pcb, const u_int flags)
1787 {
1788 	register_t r;
1789 
1790 	if (curpcb == pcb &&
1791 	    (flags & PCB_FULL_IRET) != 0 &&
1792 	    (pcb->pcb_flags & PCB_FULL_IRET) == 0) {
1793 		r = intr_disable();
1794 		if ((pcb->pcb_flags & PCB_FULL_IRET) == 0) {
1795 			if (rfs() == _ufssel)
1796 				pcb->pcb_fsbase = rdfsbase();
1797 			if (rgs() == _ugssel)
1798 				pcb->pcb_gsbase = rdmsr(MSR_KGSBASE);
1799 		}
1800 		set_pcb_flags_raw(pcb, flags);
1801 		intr_restore(r);
1802 	} else {
1803 		set_pcb_flags_raw(pcb, flags);
1804 	}
1805 }
1806 
1807 DEFINE_IFUNC(, void, set_pcb_flags, (struct pcb *, const u_int))
1808 {
1809 
1810 	return ((cpu_stdext_feature & CPUID_STDEXT_FSGSBASE) != 0 ?
1811 	    set_pcb_flags_fsgsbase : set_pcb_flags_raw);
1812 }
1813 
1814 void
clear_pcb_flags(struct pcb * pcb,const u_int flags)1815 clear_pcb_flags(struct pcb *pcb, const u_int flags)
1816 {
1817 
1818 	__asm __volatile("andl %1,%0"
1819 	    : "=m" (pcb->pcb_flags) : "ir" (~flags), "m" (pcb->pcb_flags)
1820 	    : "cc", "memory");
1821 }
1822 
1823 #ifdef KDB
1824 
1825 /*
1826  * Provide inb() and outb() as functions.  They are normally only available as
1827  * inline functions, thus cannot be called from the debugger.
1828  */
1829 
1830 /* silence compiler warnings */
1831 u_char inb_(u_short);
1832 void outb_(u_short, u_char);
1833 
1834 u_char
inb_(u_short port)1835 inb_(u_short port)
1836 {
1837 	return inb(port);
1838 }
1839 
1840 void
outb_(u_short port,u_char data)1841 outb_(u_short port, u_char data)
1842 {
1843 	outb(port, data);
1844 }
1845 
1846 #endif /* KDB */
1847 
1848 #undef memset
1849 #undef memmove
1850 #undef memcpy
1851 
1852 void	*memset_std(void *buf, int c, size_t len);
1853 void	*memset_erms(void *buf, int c, size_t len);
1854 void    *memmove_std(void * _Nonnull dst, const void * _Nonnull src,
1855 	    size_t len);
1856 void    *memmove_erms(void * _Nonnull dst, const void * _Nonnull src,
1857 	    size_t len);
1858 void    *memcpy_std(void * _Nonnull dst, const void * _Nonnull src,
1859 	    size_t len);
1860 void    *memcpy_erms(void * _Nonnull dst, const void * _Nonnull src,
1861 	    size_t len);
1862 
1863 #ifdef KCSAN
1864 /*
1865  * These fail to build as ifuncs when used with KCSAN.
1866  */
1867 void *
memset(void * buf,int c,size_t len)1868 memset(void *buf, int c, size_t len)
1869 {
1870 
1871 	return (memset_std(buf, c, len));
1872 }
1873 
1874 void *
memmove(void * _Nonnull dst,const void * _Nonnull src,size_t len)1875 memmove(void * _Nonnull dst, const void * _Nonnull src, size_t len)
1876 {
1877 
1878 	return (memmove_std(dst, src, len));
1879 }
1880 
1881 void *
memcpy(void * _Nonnull dst,const void * _Nonnull src,size_t len)1882 memcpy(void * _Nonnull dst, const void * _Nonnull src, size_t len)
1883 {
1884 
1885 	return (memcpy_std(dst, src, len));
1886 }
1887 #else
1888 DEFINE_IFUNC(, void *, memset, (void *, int, size_t))
1889 {
1890 
1891 	return ((cpu_stdext_feature & CPUID_STDEXT_ERMS) != 0 ?
1892 	    memset_erms : memset_std);
1893 }
1894 
1895 DEFINE_IFUNC(, void *, memmove, (void * _Nonnull, const void * _Nonnull,
1896     size_t))
1897 {
1898 
1899 	return ((cpu_stdext_feature & CPUID_STDEXT_ERMS) != 0 ?
1900 	    memmove_erms : memmove_std);
1901 }
1902 
1903 DEFINE_IFUNC(, void *, memcpy, (void * _Nonnull, const void * _Nonnull,size_t))
1904 {
1905 
1906 	return ((cpu_stdext_feature & CPUID_STDEXT_ERMS) != 0 ?
1907 	    memcpy_erms : memcpy_std);
1908 }
1909 #endif
1910 
1911 void	pagezero_std(void *addr);
1912 void	pagezero_erms(void *addr);
1913 DEFINE_IFUNC(, void , pagezero, (void *))
1914 {
1915 
1916 	return ((cpu_stdext_feature & CPUID_STDEXT_ERMS) != 0 ?
1917 	    pagezero_erms : pagezero_std);
1918 }
1919