1 /*-
2 * SPDX-License-Identifier: BSD-4-Clause
3 *
4 * Copyright (c) 2003 Peter Wemm.
5 * Copyright (c) 1992 Terrence R. Lambert.
6 * Copyright (c) 1982, 1987, 1990 The Regents of the University of California.
7 * All rights reserved.
8 *
9 * This code is derived from software contributed to Berkeley by
10 * William Jolitz.
11 *
12 * Redistribution and use in source and binary forms, with or without
13 * modification, are permitted provided that the following conditions
14 * are met:
15 * 1. Redistributions of source code must retain the above copyright
16 * notice, this list of conditions and the following disclaimer.
17 * 2. Redistributions in binary form must reproduce the above copyright
18 * notice, this list of conditions and the following disclaimer in the
19 * documentation and/or other materials provided with the distribution.
20 * 3. All advertising materials mentioning features or use of this software
21 * must display the following acknowledgement:
22 * This product includes software developed by the University of
23 * California, Berkeley and its contributors.
24 * 4. Neither the name of the University nor the names of its contributors
25 * may be used to endorse or promote products derived from this software
26 * without specific prior written permission.
27 *
28 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
29 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
30 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
31 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
32 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
33 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
34 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
35 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
36 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
37 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
38 * SUCH DAMAGE.
39 *
40 * from: @(#)machdep.c 7.4 (Berkeley) 6/3/91
41 */
42
43 #include <sys/cdefs.h>
44 #include "opt_atpic.h"
45 #include "opt_cpu.h"
46 #include "opt_ddb.h"
47 #include "opt_inet.h"
48 #include "opt_isa.h"
49 #include "opt_kstack_pages.h"
50 #include "opt_maxmem.h"
51 #include "opt_mp_watchdog.h"
52 #include "opt_pci.h"
53 #include "opt_platform.h"
54 #include "opt_sched.h"
55
56 #include <sys/param.h>
57 #include <sys/proc.h>
58 #include <sys/systm.h>
59 #include <sys/asan.h>
60 #include <sys/bio.h>
61 #include <sys/buf.h>
62 #include <sys/bus.h>
63 #include <sys/callout.h>
64 #include <sys/cons.h>
65 #include <sys/cpu.h>
66 #include <sys/csan.h>
67 #include <sys/efi.h>
68 #include <sys/eventhandler.h>
69 #include <sys/exec.h>
70 #include <sys/imgact.h>
71 #include <sys/kdb.h>
72 #include <sys/kernel.h>
73 #include <sys/ktr.h>
74 #include <sys/linker.h>
75 #include <sys/lock.h>
76 #include <sys/malloc.h>
77 #include <sys/memrange.h>
78 #include <sys/msgbuf.h>
79 #include <sys/mutex.h>
80 #include <sys/pcpu.h>
81 #include <sys/ptrace.h>
82 #include <sys/reboot.h>
83 #include <sys/reg.h>
84 #include <sys/rwlock.h>
85 #include <sys/sched.h>
86 #include <sys/signalvar.h>
87 #ifdef SMP
88 #include <sys/smp.h>
89 #endif
90 #include <sys/syscallsubr.h>
91 #include <sys/sysctl.h>
92 #include <sys/sysent.h>
93 #include <sys/sysproto.h>
94 #include <sys/ucontext.h>
95 #include <sys/vmmeter.h>
96
97 #include <vm/vm.h>
98 #include <vm/vm_param.h>
99 #include <vm/vm_extern.h>
100 #include <vm/vm_kern.h>
101 #include <vm/vm_page.h>
102 #include <vm/vm_map.h>
103 #include <vm/vm_object.h>
104 #include <vm/vm_pager.h>
105 #include <vm/vm_phys.h>
106 #include <vm/vm_dumpset.h>
107
108 #ifdef DDB
109 #ifndef KDB
110 #error KDB must be enabled in order for DDB to work!
111 #endif
112 #include <ddb/ddb.h>
113 #include <ddb/db_sym.h>
114 #endif
115
116 #include <net/netisr.h>
117
118 #include <machine/clock.h>
119 #include <machine/cpu.h>
120 #include <machine/cputypes.h>
121 #include <machine/frame.h>
122 #include <machine/intr_machdep.h>
123 #include <x86/mca.h>
124 #include <machine/md_var.h>
125 #include <machine/metadata.h>
126 #include <machine/mp_watchdog.h>
127 #include <machine/pc/bios.h>
128 #include <machine/pcb.h>
129 #include <machine/proc.h>
130 #include <machine/sigframe.h>
131 #include <machine/specialreg.h>
132 #include <machine/trap.h>
133 #include <machine/tss.h>
134 #include <x86/ucode.h>
135 #include <x86/ifunc.h>
136 #ifdef SMP
137 #include <machine/smp.h>
138 #endif
139 #ifdef FDT
140 #include <x86/fdt.h>
141 #endif
142
143 #ifdef DEV_ATPIC
144 #include <x86/isa/icu.h>
145 #else
146 #include <x86/apicvar.h>
147 #endif
148
149 #include <isa/isareg.h>
150 #include <isa/rtc.h>
151 #include <x86/init.h>
152
153 /* Sanity check for __curthread() */
154 CTASSERT(offsetof(struct pcpu, pc_curthread) == 0);
155
156 /*
157 * The PTI trampoline stack needs enough space for a hardware trapframe and a
158 * couple of scratch registers, as well as the trapframe left behind after an
159 * iret fault.
160 */
161 CTASSERT(PC_PTI_STACK_SZ * sizeof(register_t) >= 2 * sizeof(struct pti_frame) -
162 offsetof(struct pti_frame, pti_rip));
163
164 extern u_int64_t hammer_time(u_int64_t, u_int64_t);
165
166 static void cpu_startup(void *);
167 SYSINIT(cpu, SI_SUB_CPU, SI_ORDER_FIRST, cpu_startup, NULL);
168
169 /* Probe 8254 PIT and TSC. */
170 static void native_clock_source_init(void);
171
172 /* Preload data parse function */
173 static caddr_t native_parse_preload_data(u_int64_t);
174
175 /* Native function to fetch and parse the e820 map */
176 static void native_parse_memmap(caddr_t, vm_paddr_t *, int *);
177
/*
 * Default init_ops implementation.
 *
 * Every hook is wired to the native (bare-metal) routine.  The AP startup
 * hook is only present in SMP kernels and the MSI hook only when PCI
 * support (DEV_PCI) is compiled in.
 */
struct init_ops init_ops = {
	.parse_preload_data =		native_parse_preload_data,
	.early_clock_source_init =	native_clock_source_init,
	.early_delay =			i8254_delay,
	.parse_memmap =			native_parse_memmap,
#ifdef SMP
	.start_all_aps =		native_start_all_aps,
#endif
#ifdef DEV_PCI
	.msi_init =			msi_init,
#endif
};
191
192 /*
193 * Physical address of the EFI System Table. Stashed from the metadata hints
194 * passed into the kernel and used by the EFI code to call runtime services.
195 */
196 vm_paddr_t efi_systbl_phys;
197
/* Intel ICH registers */
#define	ICH_PMBASE	0x400
/*
 * SMI_EN lives at offset 0x30 from ICH_PMBASE.  The expansion is
 * parenthesized so it stays a single operand wherever the macro is used
 * (e.g. next to a multiplicative or shift operator).
 */
#define	ICH_SMI_EN	(ICH_PMBASE + 0x30)
201
202 int _udatasel, _ucodesel, _ucode32sel, _ufssel, _ugssel;
203
204 int cold = 1;
205
206 long Maxmem = 0;
207 long realmem = 0;
208
209 struct kva_md_info kmi;
210
211 static struct trapframe proc0_tf;
212 struct region_descriptor r_idt;
213
214 struct pcpu *__pcpu;
215 struct pcpu temp_bsp_pcpu;
216
217 struct mtx icu_lock;
218
219 struct mem_range_softc mem_range_softc;
220
221 struct mtx dt_lock; /* lock for GDT and LDT */
222
223 void (*vmm_resume_p)(void);
224
225 bool efi_boot;
226
227 static void
cpu_startup(void * dummy)228 cpu_startup(void *dummy)
229 {
230 uintmax_t memsize;
231 char *sysenv;
232
233 /*
234 * On MacBooks, we need to disallow the legacy USB circuit to
235 * generate an SMI# because this can cause several problems,
236 * namely: incorrect CPU frequency detection and failure to
237 * start the APs.
238 * We do this by disabling a bit in the SMI_EN (SMI Control and
239 * Enable register) of the Intel ICH LPC Interface Bridge.
240 */
241 sysenv = kern_getenv("smbios.system.product");
242 if (sysenv != NULL) {
243 if (strncmp(sysenv, "MacBook1,1", 10) == 0 ||
244 strncmp(sysenv, "MacBook3,1", 10) == 0 ||
245 strncmp(sysenv, "MacBook4,1", 10) == 0 ||
246 strncmp(sysenv, "MacBookPro1,1", 13) == 0 ||
247 strncmp(sysenv, "MacBookPro1,2", 13) == 0 ||
248 strncmp(sysenv, "MacBookPro3,1", 13) == 0 ||
249 strncmp(sysenv, "MacBookPro4,1", 13) == 0 ||
250 strncmp(sysenv, "Macmini1,1", 10) == 0) {
251 if (bootverbose)
252 printf("Disabling LEGACY_USB_EN bit on "
253 "Intel ICH.\n");
254 outl(ICH_SMI_EN, inl(ICH_SMI_EN) & ~0x8);
255 }
256 freeenv(sysenv);
257 }
258
259 /*
260 * Good {morning,afternoon,evening,night}.
261 */
262 startrtclock();
263 printcpuinfo();
264
265 /*
266 * Display physical memory if SMBIOS reports reasonable amount.
267 */
268 memsize = 0;
269 sysenv = kern_getenv("smbios.memory.enabled");
270 if (sysenv != NULL) {
271 memsize = (uintmax_t)strtoul(sysenv, (char **)NULL, 10) << 10;
272 freeenv(sysenv);
273 }
274 if (memsize < ptoa((uintmax_t)vm_free_count()))
275 memsize = ptoa((uintmax_t)Maxmem);
276 printf("real memory = %ju (%ju MB)\n", memsize, memsize >> 20);
277 realmem = atop(memsize);
278
279 /*
280 * Display any holes after the first chunk of extended memory.
281 */
282 if (bootverbose) {
283 int indx;
284
285 printf("Physical memory chunk(s):\n");
286 for (indx = 0; phys_avail[indx + 1] != 0; indx += 2) {
287 vm_paddr_t size;
288
289 size = phys_avail[indx + 1] - phys_avail[indx];
290 printf(
291 "0x%016jx - 0x%016jx, %ju bytes (%ju pages)\n",
292 (uintmax_t)phys_avail[indx],
293 (uintmax_t)phys_avail[indx + 1] - 1,
294 (uintmax_t)size, (uintmax_t)size / PAGE_SIZE);
295 }
296 }
297
298 vm_ksubmap_init(&kmi);
299
300 printf("avail memory = %ju (%ju MB)\n",
301 ptoa((uintmax_t)vm_free_count()),
302 ptoa((uintmax_t)vm_free_count()) / 1048576);
303 #ifdef DEV_PCI
304 if (bootverbose && intel_graphics_stolen_base != 0)
305 printf("intel stolen mem: base %#jx size %ju MB\n",
306 (uintmax_t)intel_graphics_stolen_base,
307 (uintmax_t)intel_graphics_stolen_size / 1024 / 1024);
308 #endif
309
310 /*
311 * Set up buffers, so they can be used to read disk labels.
312 */
313 bufinit();
314 vm_pager_bufferinit();
315
316 cpu_setregs();
317 }
318
/*
 * SYSINIT hook that calls link_elf_late_ireloc().  It is registered in the
 * same SI_SUB_CPU subsystem as cpu_startup(), but with SI_ORDER_ANY while
 * cpu_startup() uses SI_ORDER_FIRST.  The argument is unused.
 */
static void
late_ifunc_resolve(void *dummy __unused)
{
	link_elf_late_ireloc();
}
SYSINIT(late_ifunc_resolve, SI_SUB_CPU, SI_ORDER_ANY, late_ifunc_resolve, NULL);
325
326
327 void
cpu_setregs(void)328 cpu_setregs(void)
329 {
330 register_t cr0;
331
332 cr0 = rcr0();
333 cr0 |= CR0_MP | CR0_NE | CR0_TS | CR0_WP | CR0_AM;
334 load_cr0(cr0);
335 }
336
337 /*
338 * Initialize amd64 and configure to run kernel
339 */
340
/*
 * Initialize segments & interrupt table
 */
/* Statically allocated IDT with NIDT gates; 'idt' initially points at it. */
static struct gate_descriptor idt0[NIDT];
struct gate_descriptor *idt = &idt0[0];	/* interrupt descriptor table */

/*
 * Statically allocated, 16-byte aligned stacks.
 * NOTE(review): judging by the names these are the boot CPU's double
 * fault, machine check, NMI and debug exception stacks; the code that
 * installs them is outside this part of the file -- confirm there.
 */
static char dblfault_stack[DBLFAULT_STACK_SIZE] __aligned(16);
static char mce0_stack[MCE_STACK_SIZE] __aligned(16);
static char nmi0_stack[NMI_STACK_SIZE] __aligned(16);
static char dbg0_stack[DBG_STACK_SIZE] __aligned(16);
/* struct nmi_pcpu must stay exactly 16 bytes. */
CTASSERT(sizeof(struct nmi_pcpu) == 16);
352
/*
 * Software prototypes -- in more palatable form.
 *
 * Each entry is the "soft" (unpacked) description of one GDT slot, indexed
 * by its selector constant.  The TSS and LDT descriptors each take two
 * consecutive slots ("double size"), so the second slot of each pair is
 * left zeroed here.
 *
 * Keep GUFS32, GUGS32, GUCODE32 and GUDATA at the same
 * slots as corresponding segments for i386 kernel.
 */
struct soft_segment_descriptor gdt_segs[] = {
[GNULL_SEL] = {		/* 0 Null Descriptor */
	.ssd_base = 0x0,
	.ssd_limit = 0x0,
	.ssd_type = 0,
	.ssd_dpl = 0,
	.ssd_p = 0,
	.ssd_long = 0,
	.ssd_def32 = 0,
	.ssd_gran = 0		},
[GNULL2_SEL] = {	/* 1 Null Descriptor */
	.ssd_base = 0x0,
	.ssd_limit = 0x0,
	.ssd_type = 0,
	.ssd_dpl = 0,
	.ssd_p = 0,
	.ssd_long = 0,
	.ssd_def32 = 0,
	.ssd_gran = 0		},
[GUFS32_SEL] = {	/* 2 32 bit %fs Descriptor for user */
	.ssd_base = 0x0,
	.ssd_limit = 0xfffff,
	.ssd_type = SDT_MEMRWA,
	.ssd_dpl = SEL_UPL,
	.ssd_p = 1,
	.ssd_long = 0,
	.ssd_def32 = 1,
	.ssd_gran = 1		},
[GUGS32_SEL] = {	/* 3 32 bit %gs Descriptor for user */
	.ssd_base = 0x0,
	.ssd_limit = 0xfffff,
	.ssd_type = SDT_MEMRWA,
	.ssd_dpl = SEL_UPL,
	.ssd_p = 1,
	.ssd_long = 0,
	.ssd_def32 = 1,
	.ssd_gran = 1		},
[GCODE_SEL] = {		/* 4 Code Descriptor for kernel */
	.ssd_base = 0x0,
	.ssd_limit = 0xfffff,
	.ssd_type = SDT_MEMERA,
	.ssd_dpl = SEL_KPL,
	.ssd_p = 1,
	.ssd_long = 1,
	.ssd_def32 = 0,
	.ssd_gran = 1		},
[GDATA_SEL] = {		/* 5 Data Descriptor for kernel */
	.ssd_base = 0x0,
	.ssd_limit = 0xfffff,
	.ssd_type = SDT_MEMRWA,
	.ssd_dpl = SEL_KPL,
	.ssd_p = 1,
	.ssd_long = 1,
	.ssd_def32 = 0,
	.ssd_gran = 1		},
[GUCODE32_SEL] = {	/* 6 32 bit Code Descriptor for user */
	.ssd_base = 0x0,
	.ssd_limit = 0xfffff,
	.ssd_type = SDT_MEMERA,
	.ssd_dpl = SEL_UPL,
	.ssd_p = 1,
	.ssd_long = 0,
	.ssd_def32 = 1,
	.ssd_gran = 1		},
[GUDATA_SEL] = {	/* 7 32/64 bit Data Descriptor for user */
	.ssd_base = 0x0,
	.ssd_limit = 0xfffff,
	.ssd_type = SDT_MEMRWA,
	.ssd_dpl = SEL_UPL,
	.ssd_p = 1,
	.ssd_long = 0,
	.ssd_def32 = 1,
	.ssd_gran = 1		},
[GUCODE_SEL] = {	/* 8 64 bit Code Descriptor for user */
	.ssd_base = 0x0,
	.ssd_limit = 0xfffff,
	.ssd_type = SDT_MEMERA,
	.ssd_dpl = SEL_UPL,
	.ssd_p = 1,
	.ssd_long = 1,
	.ssd_def32 = 0,
	.ssd_gran = 1		},
[GPROC0_SEL] = {	/* 9 Proc 0 TSS Descriptor */
	.ssd_base = 0x0,
	/* The limit covers the TSS plus the I/O permission bitmap. */
	.ssd_limit = sizeof(struct amd64tss) + IOPERM_BITMAP_SIZE - 1,
	.ssd_type = SDT_SYSTSS,
	.ssd_dpl = SEL_KPL,
	.ssd_p = 1,
	.ssd_long = 0,
	.ssd_def32 = 0,
	.ssd_gran = 0		},
[GPROC0_SEL + 1] = {	/* 10 Proc 0 TSS descriptor, double size */
	.ssd_base = 0x0,
	.ssd_limit = 0x0,
	.ssd_type = 0,
	.ssd_dpl = 0,
	.ssd_p = 0,
	.ssd_long = 0,
	.ssd_def32 = 0,
	.ssd_gran = 0		},
[GUSERLDT_SEL] = {	/* 11 LDT Descriptor */
	.ssd_base = 0x0,
	.ssd_limit = 0x0,
	.ssd_type = 0,
	.ssd_dpl = 0,
	.ssd_p = 0,
	.ssd_long = 0,
	.ssd_def32 = 0,
	.ssd_gran = 0		},
[GUSERLDT_SEL + 1] = {	/* 12 LDT Descriptor, double size */
	.ssd_base = 0x0,
	.ssd_limit = 0x0,
	.ssd_type = 0,
	.ssd_dpl = 0,
	.ssd_p = 0,
	.ssd_long = 0,
	.ssd_def32 = 0,
	.ssd_gran = 0		},
};
/* The table must describe exactly NGDT slots. */
_Static_assert(nitems(gdt_segs) == NGDT, "Stale NGDT");
479
480 void
setidt(int idx,inthand_t * func,int typ,int dpl,int ist)481 setidt(int idx, inthand_t *func, int typ, int dpl, int ist)
482 {
483 struct gate_descriptor *ip;
484
485 ip = idt + idx;
486 ip->gd_looffset = (uintptr_t)func;
487 ip->gd_selector = GSEL(GCODE_SEL, SEL_KPL);
488 ip->gd_ist = ist;
489 ip->gd_xx = 0;
490 ip->gd_type = typ;
491 ip->gd_dpl = dpl;
492 ip->gd_p = 1;
493 ip->gd_hioffset = ((uintptr_t)func)>>16 ;
494 }
495
496 extern inthand_t
497 IDTVEC(div), IDTVEC(dbg), IDTVEC(nmi), IDTVEC(bpt), IDTVEC(ofl),
498 IDTVEC(bnd), IDTVEC(ill), IDTVEC(dna), IDTVEC(fpusegm),
499 IDTVEC(tss), IDTVEC(missing), IDTVEC(stk), IDTVEC(prot),
500 IDTVEC(page), IDTVEC(mchk), IDTVEC(rsvd), IDTVEC(fpu), IDTVEC(align),
501 IDTVEC(xmm), IDTVEC(dblfault),
502 IDTVEC(div_pti), IDTVEC(bpt_pti),
503 IDTVEC(ofl_pti), IDTVEC(bnd_pti), IDTVEC(ill_pti), IDTVEC(dna_pti),
504 IDTVEC(fpusegm_pti), IDTVEC(tss_pti), IDTVEC(missing_pti),
505 IDTVEC(stk_pti), IDTVEC(prot_pti), IDTVEC(page_pti),
506 IDTVEC(rsvd_pti), IDTVEC(fpu_pti), IDTVEC(align_pti),
507 IDTVEC(xmm_pti),
508 #ifdef KDTRACE_HOOKS
509 IDTVEC(dtrace_ret), IDTVEC(dtrace_ret_pti),
510 #endif
511 #ifdef XENHVM
512 IDTVEC(xen_intr_upcall), IDTVEC(xen_intr_upcall_pti),
513 #endif
514 IDTVEC(fast_syscall), IDTVEC(fast_syscall32),
515 IDTVEC(fast_syscall_pti);
516
517 #ifdef DDB
518 /*
519 * Display the index and function name of any IDT entries that don't use
520 * the default 'rsvd' entry point.
521 */
DB_SHOW_COMMAND(idt,db_show_idt)522 DB_SHOW_COMMAND(idt, db_show_idt)
523 {
524 struct gate_descriptor *ip;
525 int idx;
526 uintptr_t func;
527
528 ip = idt;
529 for (idx = 0; idx < NIDT && !db_pager_quit; idx++) {
530 func = ((long)ip->gd_hioffset << 16 | ip->gd_looffset);
531 if (func != (uintptr_t)&IDTVEC(rsvd)) {
532 db_printf("%3d\t", idx);
533 db_printsym(func, DB_STGY_PROC);
534 db_printf("\n");
535 }
536 ip++;
537 }
538 }
539
540 /* Show privileged registers. */
DB_SHOW_COMMAND(sysregs,db_show_sysregs)541 DB_SHOW_COMMAND(sysregs, db_show_sysregs)
542 {
543 struct {
544 uint16_t limit;
545 uint64_t base;
546 } __packed idtr, gdtr;
547 uint16_t ldt, tr;
548
549 __asm __volatile("sidt %0" : "=m" (idtr));
550 db_printf("idtr\t0x%016lx/%04x\n",
551 (u_long)idtr.base, (u_int)idtr.limit);
552 __asm __volatile("sgdt %0" : "=m" (gdtr));
553 db_printf("gdtr\t0x%016lx/%04x\n",
554 (u_long)gdtr.base, (u_int)gdtr.limit);
555 __asm __volatile("sldt %0" : "=r" (ldt));
556 db_printf("ldtr\t0x%04x\n", ldt);
557 __asm __volatile("str %0" : "=r" (tr));
558 db_printf("tr\t0x%04x\n", tr);
559 db_printf("cr0\t0x%016lx\n", rcr0());
560 db_printf("cr2\t0x%016lx\n", rcr2());
561 db_printf("cr3\t0x%016lx\n", rcr3());
562 db_printf("cr4\t0x%016lx\n", rcr4());
563 if (rcr4() & CR4_XSAVE)
564 db_printf("xcr0\t0x%016lx\n", rxcr(0));
565 db_printf("EFER\t0x%016lx\n", rdmsr(MSR_EFER));
566 if (cpu_feature2 & (CPUID2_VMX | CPUID2_SMX))
567 db_printf("FEATURES_CTL\t%016lx\n",
568 rdmsr(MSR_IA32_FEATURE_CONTROL));
569 db_printf("DEBUG_CTL\t0x%016lx\n", rdmsr(MSR_DEBUGCTLMSR));
570 db_printf("PAT\t0x%016lx\n", rdmsr(MSR_PAT));
571 db_printf("GSBASE\t0x%016lx\n", rdmsr(MSR_GSBASE));
572 }
573
/*
 * DDB "show dbregs": print the current values of debug registers
 * %dr0-%dr3, %dr6 and %dr7.
 */
DB_SHOW_COMMAND(dbregs, db_show_dbregs)
{

	db_printf("dr0\t0x%016lx\n", rdr0());
	db_printf("dr1\t0x%016lx\n", rdr1());
	db_printf("dr2\t0x%016lx\n", rdr2());
	db_printf("dr3\t0x%016lx\n", rdr3());
	db_printf("dr6\t0x%016lx\n", rdr6());
	db_printf("dr7\t0x%016lx\n", rdr7());
}
584 #endif
585
586 void
sdtossd(struct user_segment_descriptor * sd,struct soft_segment_descriptor * ssd)587 sdtossd(struct user_segment_descriptor *sd, struct soft_segment_descriptor *ssd)
588 {
589
590 ssd->ssd_base = (sd->sd_hibase << 24) | sd->sd_lobase;
591 ssd->ssd_limit = (sd->sd_hilimit << 16) | sd->sd_lolimit;
592 ssd->ssd_type = sd->sd_type;
593 ssd->ssd_dpl = sd->sd_dpl;
594 ssd->ssd_p = sd->sd_p;
595 ssd->ssd_long = sd->sd_long;
596 ssd->ssd_def32 = sd->sd_def32;
597 ssd->ssd_gran = sd->sd_gran;
598 }
599
600 void
ssdtosd(struct soft_segment_descriptor * ssd,struct user_segment_descriptor * sd)601 ssdtosd(struct soft_segment_descriptor *ssd, struct user_segment_descriptor *sd)
602 {
603
604 sd->sd_lobase = (ssd->ssd_base) & 0xffffff;
605 sd->sd_hibase = (ssd->ssd_base >> 24) & 0xff;
606 sd->sd_lolimit = (ssd->ssd_limit) & 0xffff;
607 sd->sd_hilimit = (ssd->ssd_limit >> 16) & 0xf;
608 sd->sd_type = ssd->ssd_type;
609 sd->sd_dpl = ssd->ssd_dpl;
610 sd->sd_p = ssd->ssd_p;
611 sd->sd_long = ssd->ssd_long;
612 sd->sd_def32 = ssd->ssd_def32;
613 sd->sd_gran = ssd->ssd_gran;
614 }
615
616 void
ssdtosyssd(struct soft_segment_descriptor * ssd,struct system_segment_descriptor * sd)617 ssdtosyssd(struct soft_segment_descriptor *ssd, struct system_segment_descriptor *sd)
618 {
619
620 sd->sd_lobase = (ssd->ssd_base) & 0xffffff;
621 sd->sd_hibase = (ssd->ssd_base >> 24) & 0xfffffffffful;
622 sd->sd_lolimit = (ssd->ssd_limit) & 0xffff;
623 sd->sd_hilimit = (ssd->ssd_limit >> 16) & 0xf;
624 sd->sd_type = ssd->ssd_type;
625 sd->sd_dpl = ssd->ssd_dpl;
626 sd->sd_p = ssd->ssd_p;
627 sd->sd_gran = ssd->ssd_gran;
628 }
629
630 u_int basemem;
631
/*
 * Add the region [base, base + length) to 'physmap', an array of
 * start/end address pairs kept sorted by address.
 *
 * *physmap_idxp is the index of the next free slot and is advanced by two
 * whenever a new pair is consumed.  A region that directly adjoins an
 * existing entry is merged into it instead of taking a new pair.
 *
 * Returns 1 when the caller may keep adding entries.  That includes the
 * cases where the region was ignored because it is empty or overlaps an
 * existing entry.  Returns 0 once the array is full; in that case
 * *physmap_idxp has already been advanced to PHYS_AVAIL_ENTRIES and the
 * region has not been stored.
 */
static int
add_physmap_entry(uint64_t base, uint64_t length, vm_paddr_t *physmap,
    int *physmap_idxp)
{
	int i, insert_idx, physmap_idx;

	physmap_idx = *physmap_idxp;

	/* An empty region is silently ignored. */
	if (length == 0)
		return (1);

	/*
	 * Find insertion point while checking for overlap.  Start off by
	 * assuming the new entry will be added to the end.
	 *
	 * NB: physmap_idx points to the next free slot.
	 */
	insert_idx = physmap_idx;
	for (i = 0; i <= physmap_idx; i += 2) {
		/* First entry that ends above the new region's base. */
		if (base < physmap[i + 1]) {
			/* New region ends at or before it: insert in front. */
			if (base + length <= physmap[i]) {
				insert_idx = i;
				break;
			}
			if (boothowto & RB_VERBOSE)
				printf(
		    "Overlapping memory regions, ignoring second region\n");
			return (1);
		}
	}

	/* See if we can prepend to the next entry. */
	if (insert_idx <= physmap_idx && base + length == physmap[insert_idx]) {
		physmap[insert_idx] = base;
		return (1);
	}

	/* See if we can append to the previous entry. */
	if (insert_idx > 0 && base == physmap[insert_idx - 1]) {
		physmap[insert_idx - 1] += length;
		return (1);
	}

	/* A new pair is needed; the index is published before the check. */
	physmap_idx += 2;
	*physmap_idxp = physmap_idx;
	if (physmap_idx == PHYS_AVAIL_ENTRIES) {
		printf(
		"Too many segments in the physical address map, giving up\n");
		return (0);
	}

	/*
	 * Move the last 'N' entries down to make room for the new
	 * entry if needed.
	 */
	for (i = (physmap_idx - 2); i > insert_idx; i -= 2) {
		physmap[i] = physmap[i - 2];
		physmap[i + 1] = physmap[i - 1];
	}

	/* Insert the new entry. */
	physmap[insert_idx] = base;
	physmap[insert_idx + 1] = base + length;
	return (1);
}
697
698 void
bios_add_smap_entries(struct bios_smap * smapbase,u_int32_t smapsize,vm_paddr_t * physmap,int * physmap_idx)699 bios_add_smap_entries(struct bios_smap *smapbase, u_int32_t smapsize,
700 vm_paddr_t *physmap, int *physmap_idx)
701 {
702 struct bios_smap *smap, *smapend;
703
704 smapend = (struct bios_smap *)((uintptr_t)smapbase + smapsize);
705
706 for (smap = smapbase; smap < smapend; smap++) {
707 if (boothowto & RB_VERBOSE)
708 printf("SMAP type=%02x base=%016lx len=%016lx\n",
709 smap->type, smap->base, smap->length);
710
711 if (smap->type != SMAP_TYPE_MEMORY)
712 continue;
713
714 if (!add_physmap_entry(smap->base, smap->length, physmap,
715 physmap_idx))
716 break;
717 }
718 }
719
720 static void
add_efi_map_entries(struct efi_map_header * efihdr,vm_paddr_t * physmap,int * physmap_idx)721 add_efi_map_entries(struct efi_map_header *efihdr, vm_paddr_t *physmap,
722 int *physmap_idx)
723 {
724 struct efi_md *map, *p;
725 const char *type;
726 size_t efisz;
727 int ndesc, i;
728
729 static const char *types[] = {
730 "Reserved",
731 "LoaderCode",
732 "LoaderData",
733 "BootServicesCode",
734 "BootServicesData",
735 "RuntimeServicesCode",
736 "RuntimeServicesData",
737 "ConventionalMemory",
738 "UnusableMemory",
739 "ACPIReclaimMemory",
740 "ACPIMemoryNVS",
741 "MemoryMappedIO",
742 "MemoryMappedIOPortSpace",
743 "PalCode",
744 "PersistentMemory"
745 };
746
747 /*
748 * Memory map data provided by UEFI via the GetMemoryMap
749 * Boot Services API.
750 */
751 efisz = (sizeof(struct efi_map_header) + 0xf) & ~0xf;
752 map = (struct efi_md *)((uint8_t *)efihdr + efisz);
753
754 if (efihdr->descriptor_size == 0)
755 return;
756 ndesc = efihdr->memory_size / efihdr->descriptor_size;
757
758 if (boothowto & RB_VERBOSE)
759 printf("%23s %12s %12s %8s %4s\n",
760 "Type", "Physical", "Virtual", "#Pages", "Attr");
761
762 for (i = 0, p = map; i < ndesc; i++,
763 p = efi_next_descriptor(p, efihdr->descriptor_size)) {
764 if (boothowto & RB_VERBOSE) {
765 if (p->md_type < nitems(types))
766 type = types[p->md_type];
767 else
768 type = "<INVALID>";
769 printf("%23s %012lx %12p %08lx ", type, p->md_phys,
770 p->md_virt, p->md_pages);
771 if (p->md_attr & EFI_MD_ATTR_UC)
772 printf("UC ");
773 if (p->md_attr & EFI_MD_ATTR_WC)
774 printf("WC ");
775 if (p->md_attr & EFI_MD_ATTR_WT)
776 printf("WT ");
777 if (p->md_attr & EFI_MD_ATTR_WB)
778 printf("WB ");
779 if (p->md_attr & EFI_MD_ATTR_UCE)
780 printf("UCE ");
781 if (p->md_attr & EFI_MD_ATTR_WP)
782 printf("WP ");
783 if (p->md_attr & EFI_MD_ATTR_RP)
784 printf("RP ");
785 if (p->md_attr & EFI_MD_ATTR_XP)
786 printf("XP ");
787 if (p->md_attr & EFI_MD_ATTR_NV)
788 printf("NV ");
789 if (p->md_attr & EFI_MD_ATTR_MORE_RELIABLE)
790 printf("MORE_RELIABLE ");
791 if (p->md_attr & EFI_MD_ATTR_RO)
792 printf("RO ");
793 if (p->md_attr & EFI_MD_ATTR_RT)
794 printf("RUNTIME");
795 printf("\n");
796 }
797
798 switch (p->md_type) {
799 case EFI_MD_TYPE_CODE:
800 case EFI_MD_TYPE_DATA:
801 case EFI_MD_TYPE_BS_CODE:
802 case EFI_MD_TYPE_BS_DATA:
803 case EFI_MD_TYPE_FREE:
804 /*
805 * We're allowed to use any entry with these types.
806 */
807 break;
808 default:
809 continue;
810 }
811
812 if (!add_physmap_entry(p->md_phys, p->md_pages * EFI_PAGE_SIZE,
813 physmap, physmap_idx))
814 break;
815 }
816 }
817
818 static char bootmethod[16] = "";
819 SYSCTL_STRING(_machdep, OID_AUTO, bootmethod, CTLFLAG_RD, bootmethod, 0,
820 "System firmware boot method");
821
822 static void
native_parse_memmap(caddr_t kmdp,vm_paddr_t * physmap,int * physmap_idx)823 native_parse_memmap(caddr_t kmdp, vm_paddr_t *physmap, int *physmap_idx)
824 {
825 struct bios_smap *smap;
826 struct efi_map_header *efihdr;
827 u_int32_t size;
828
829 /*
830 * Memory map from INT 15:E820.
831 *
832 * subr_module.c says:
833 * "Consumer may safely assume that size value precedes data."
834 * ie: an int32_t immediately precedes smap.
835 */
836
837 efihdr = (struct efi_map_header *)preload_search_info(kmdp,
838 MODINFO_METADATA | MODINFOMD_EFI_MAP);
839 smap = (struct bios_smap *)preload_search_info(kmdp,
840 MODINFO_METADATA | MODINFOMD_SMAP);
841 if (efihdr == NULL && smap == NULL)
842 panic("No BIOS smap or EFI map info from loader!");
843
844 if (efihdr != NULL) {
845 add_efi_map_entries(efihdr, physmap, physmap_idx);
846 strlcpy(bootmethod, "UEFI", sizeof(bootmethod));
847 } else {
848 size = *((u_int32_t *)smap - 1);
849 bios_add_smap_entries(smap, size, physmap, physmap_idx);
850 strlcpy(bootmethod, "BIOS", sizeof(bootmethod));
851 }
852 }
853
854 #define PAGES_PER_GB (1024 * 1024 * 1024 / PAGE_SIZE)
855
856 /*
857 * Populate the (physmap) array with base/bound pairs describing the
858 * available physical memory in the system, then test this memory and
859 * build the phys_avail array describing the actually-available memory.
860 *
861 * Total memory size may be set by the kernel environment variable
862 * hw.physmem or the compile-time define MAXMEM.
863 *
864 * XXX first should be vm_paddr_t.
865 */
866 static void
getmemsize(caddr_t kmdp,u_int64_t first)867 getmemsize(caddr_t kmdp, u_int64_t first)
868 {
869 int i, physmap_idx, pa_indx, da_indx;
870 vm_paddr_t pa, physmap[PHYS_AVAIL_ENTRIES];
871 u_long physmem_start, physmem_tunable, memtest;
872 pt_entry_t *pte;
873 quad_t dcons_addr, dcons_size;
874 int page_counter;
875
876 /*
877 * Tell the physical memory allocator about pages used to store
878 * the kernel and preloaded data. See kmem_bootstrap_free().
879 */
880 vm_phys_early_add_seg((vm_paddr_t)kernphys, trunc_page(first));
881
882 bzero(physmap, sizeof(physmap));
883 physmap_idx = 0;
884
885 init_ops.parse_memmap(kmdp, physmap, &physmap_idx);
886 physmap_idx -= 2;
887
888 /*
889 * Find the 'base memory' segment for SMP
890 */
891 basemem = 0;
892 for (i = 0; i <= physmap_idx; i += 2) {
893 if (physmap[i] <= 0xA0000) {
894 basemem = physmap[i + 1] / 1024;
895 break;
896 }
897 }
898 if (basemem == 0 || basemem > 640) {
899 if (bootverbose)
900 printf(
901 "Memory map doesn't contain a basemem segment, faking it");
902 basemem = 640;
903 }
904
905 /*
906 * Maxmem isn't the "maximum memory", it's one larger than the
907 * highest page of the physical address space. It should be
908 * called something like "Maxphyspage". We may adjust this
909 * based on ``hw.physmem'' and the results of the memory test.
910 */
911 Maxmem = atop(physmap[physmap_idx + 1]);
912
913 #ifdef MAXMEM
914 Maxmem = MAXMEM / 4;
915 #endif
916
917 if (TUNABLE_ULONG_FETCH("hw.physmem", &physmem_tunable))
918 Maxmem = atop(physmem_tunable);
919
920 /*
921 * The boot memory test is disabled by default, as it takes a
922 * significant amount of time on large-memory systems, and is
923 * unfriendly to virtual machines as it unnecessarily touches all
924 * pages.
925 *
926 * A general name is used as the code may be extended to support
927 * additional tests beyond the current "page present" test.
928 */
929 memtest = 0;
930 TUNABLE_ULONG_FETCH("hw.memtest.tests", &memtest);
931
932 /*
933 * Don't allow MAXMEM or hw.physmem to extend the amount of memory
934 * in the system.
935 */
936 if (Maxmem > atop(physmap[physmap_idx + 1]))
937 Maxmem = atop(physmap[physmap_idx + 1]);
938
939 if (atop(physmap[physmap_idx + 1]) != Maxmem &&
940 (boothowto & RB_VERBOSE))
941 printf("Physical memory use set to %ldK\n", Maxmem * 4);
942
943 /* call pmap initialization to make new kernel address space */
944 pmap_bootstrap(&first);
945
946 /*
947 * Size up each available chunk of physical memory.
948 *
949 * XXX Some BIOSes corrupt low 64KB between suspend and resume.
950 * By default, mask off the first 16 pages unless we appear to be
951 * running in a VM.
952 */
953 physmem_start = (vm_guest > VM_GUEST_NO ? 1 : 16) << PAGE_SHIFT;
954 TUNABLE_ULONG_FETCH("hw.physmem.start", &physmem_start);
955 if (physmap[0] < physmem_start) {
956 if (physmem_start < PAGE_SIZE)
957 physmap[0] = PAGE_SIZE;
958 else if (physmem_start >= physmap[1])
959 physmap[0] = round_page(physmap[1] - PAGE_SIZE);
960 else
961 physmap[0] = round_page(physmem_start);
962 }
963 pa_indx = 0;
964 da_indx = 1;
965 phys_avail[pa_indx++] = physmap[0];
966 phys_avail[pa_indx] = physmap[0];
967 dump_avail[da_indx] = physmap[0];
968 pte = CMAP1;
969
970 /*
971 * Get dcons buffer address
972 */
973 if (getenv_quad("dcons.addr", &dcons_addr) == 0 ||
974 getenv_quad("dcons.size", &dcons_size) == 0)
975 dcons_addr = 0;
976
977 /*
978 * physmap is in bytes, so when converting to page boundaries,
979 * round up the start address and round down the end address.
980 */
981 page_counter = 0;
982 if (memtest != 0)
983 printf("Testing system memory");
984 for (i = 0; i <= physmap_idx; i += 2) {
985 vm_paddr_t end;
986
987 end = ptoa((vm_paddr_t)Maxmem);
988 if (physmap[i + 1] < end)
989 end = trunc_page(physmap[i + 1]);
990 for (pa = round_page(physmap[i]); pa < end; pa += PAGE_SIZE) {
991 int *ptr = (int *)CADDR1;
992 int tmp;
993 bool full, page_bad;
994
995 full = false;
996 /*
997 * block out kernel memory as not available.
998 */
999 if (pa >= (vm_paddr_t)kernphys && pa < first)
1000 goto do_dump_avail;
1001
1002 /*
1003 * block out dcons buffer
1004 */
1005 if (dcons_addr > 0
1006 && pa >= trunc_page(dcons_addr)
1007 && pa < dcons_addr + dcons_size)
1008 goto do_dump_avail;
1009
1010 page_bad = false;
1011 if (memtest == 0)
1012 goto skip_memtest;
1013
1014 /*
1015 * Print a "." every GB to show we're making
1016 * progress.
1017 */
1018 page_counter++;
1019 if ((page_counter % PAGES_PER_GB) == 0)
1020 printf(".");
1021
1022 /*
1023 * map page into kernel: valid, read/write,non-cacheable
1024 */
1025 *pte = pa | PG_V | PG_RW | PG_NC_PWT | PG_NC_PCD;
1026 invltlb();
1027
1028 tmp = *(int *)ptr;
1029 /*
1030 * Test for alternating 1's and 0's
1031 */
1032 *(volatile int *)ptr = 0xaaaaaaaa;
1033 if (*(volatile int *)ptr != 0xaaaaaaaa)
1034 page_bad = true;
1035 /*
1036 * Test for alternating 0's and 1's
1037 */
1038 *(volatile int *)ptr = 0x55555555;
1039 if (*(volatile int *)ptr != 0x55555555)
1040 page_bad = true;
1041 /*
1042 * Test for all 1's
1043 */
1044 *(volatile int *)ptr = 0xffffffff;
1045 if (*(volatile int *)ptr != 0xffffffff)
1046 page_bad = true;
1047 /*
1048 * Test for all 0's
1049 */
1050 *(volatile int *)ptr = 0x0;
1051 if (*(volatile int *)ptr != 0x0)
1052 page_bad = true;
1053 /*
1054 * Restore original value.
1055 */
1056 *(int *)ptr = tmp;
1057
1058 skip_memtest:
1059 /*
1060 * Adjust array of valid/good pages.
1061 */
1062 if (page_bad == true)
1063 continue;
1064 /*
1065 * If this good page is a continuation of the
1066 * previous set of good pages, then just increase
1067 * the end pointer. Otherwise start a new chunk.
1068 * Note that "end" points one higher than end,
1069 * making the range >= start and < end.
1070 * If we're also doing a speculative memory
1071 * test and we at or past the end, bump up Maxmem
1072 * so that we keep going. The first bad page
1073 * will terminate the loop.
1074 */
1075 if (phys_avail[pa_indx] == pa) {
1076 phys_avail[pa_indx] += PAGE_SIZE;
1077 } else {
1078 pa_indx++;
1079 if (pa_indx == PHYS_AVAIL_ENTRIES) {
1080 printf(
1081 "Too many holes in the physical address space, giving up\n");
1082 pa_indx--;
1083 full = true;
1084 goto do_dump_avail;
1085 }
1086 phys_avail[pa_indx++] = pa; /* start */
1087 phys_avail[pa_indx] = pa + PAGE_SIZE; /* end */
1088 }
1089 physmem++;
1090 do_dump_avail:
1091 if (dump_avail[da_indx] == pa) {
1092 dump_avail[da_indx] += PAGE_SIZE;
1093 } else {
1094 da_indx++;
1095 if (da_indx == PHYS_AVAIL_ENTRIES) {
1096 da_indx--;
1097 goto do_next;
1098 }
1099 dump_avail[da_indx++] = pa; /* start */
1100 dump_avail[da_indx] = pa + PAGE_SIZE; /* end */
1101 }
1102 do_next:
1103 if (full)
1104 break;
1105 }
1106 }
1107 *pte = 0;
1108 invltlb();
1109 if (memtest != 0)
1110 printf("\n");
1111
1112 /*
1113 * XXX
1114 * The last chunk must contain at least one page plus the message
1115 * buffer to avoid complicating other code (message buffer address
1116 * calculation, etc.).
1117 */
1118 while (phys_avail[pa_indx - 1] + PAGE_SIZE +
1119 round_page(msgbufsize) >= phys_avail[pa_indx]) {
1120 physmem -= atop(phys_avail[pa_indx] - phys_avail[pa_indx - 1]);
1121 phys_avail[pa_indx--] = 0;
1122 phys_avail[pa_indx--] = 0;
1123 }
1124
1125 Maxmem = atop(phys_avail[pa_indx]);
1126
1127 /* Trim off space for the message buffer. */
1128 phys_avail[pa_indx] -= round_page(msgbufsize);
1129
1130 /* Map the message buffer. */
1131 msgbufp = (struct msgbuf *)PHYS_TO_DMAP(phys_avail[pa_indx]);
1132 }
1133
1134 static caddr_t
native_parse_preload_data(u_int64_t modulep)1135 native_parse_preload_data(u_int64_t modulep)
1136 {
1137 caddr_t kmdp;
1138 char *envp;
1139 #ifdef DDB
1140 vm_offset_t ksym_start;
1141 vm_offset_t ksym_end;
1142 #endif
1143
1144 preload_metadata = (caddr_t)(uintptr_t)(modulep + KERNBASE);
1145 preload_bootstrap_relocate(KERNBASE);
1146 kmdp = preload_search_by_type("elf kernel");
1147 if (kmdp == NULL)
1148 kmdp = preload_search_by_type("elf64 kernel");
1149 boothowto = MD_FETCH(kmdp, MODINFOMD_HOWTO, int);
1150 envp = MD_FETCH(kmdp, MODINFOMD_ENVP, char *);
1151 if (envp != NULL)
1152 envp += KERNBASE;
1153 init_static_kenv(envp, 0);
1154 #ifdef DDB
1155 ksym_start = MD_FETCH(kmdp, MODINFOMD_SSYM, uintptr_t);
1156 ksym_end = MD_FETCH(kmdp, MODINFOMD_ESYM, uintptr_t);
1157 db_fetch_ksymtab(ksym_start, ksym_end, 0);
1158 #endif
1159 efi_systbl_phys = MD_FETCH(kmdp, MODINFOMD_FW_HANDLE, vm_paddr_t);
1160
1161 return (kmdp);
1162 }
1163
/*
 * Initialize the native clock sources: the i8254 first, then the TSC.
 */
static void
native_clock_source_init(void)
{

	i8254_init();
	tsc_init();
}
1170
/*
 * Initialize the kernel debugger framework and, when KDB is compiled in
 * and the boot flags carry RB_KDB, drop into the debugger right away.
 */
static void
amd64_kdb_init(void)
{

	kdb_init();
#ifdef KDB
	if ((boothowto & RB_KDB) != 0) {
		kdb_enter(KDB_WHY_BOOTFLAGS,
		    "Boot flags requested debugger");
	}
#endif
}
1180
1181 /* Set up the fast syscall stuff */
1182 void
amd64_conf_fast_syscall(void)1183 amd64_conf_fast_syscall(void)
1184 {
1185 uint64_t msr;
1186
1187 msr = rdmsr(MSR_EFER) | EFER_SCE;
1188 wrmsr(MSR_EFER, msr);
1189 wrmsr(MSR_LSTAR, pti ? (u_int64_t)IDTVEC(fast_syscall_pti) :
1190 (u_int64_t)IDTVEC(fast_syscall));
1191 wrmsr(MSR_CSTAR, (u_int64_t)IDTVEC(fast_syscall32));
1192 msr = ((u_int64_t)GSEL(GCODE_SEL, SEL_KPL) << 32) |
1193 ((u_int64_t)GSEL(GUCODE32_SEL, SEL_UPL) << 48);
1194 wrmsr(MSR_STAR, msr);
1195 wrmsr(MSR_SF_MASK, PSL_NT | PSL_T | PSL_I | PSL_C | PSL_D | PSL_AC);
1196 }
1197
1198 void
amd64_bsp_pcpu_init1(struct pcpu * pc)1199 amd64_bsp_pcpu_init1(struct pcpu *pc)
1200 {
1201 struct user_segment_descriptor *gdt;
1202
1203 PCPU_SET(prvspace, pc);
1204 gdt = *PCPU_PTR(gdt);
1205 PCPU_SET(curthread, &thread0);
1206 PCPU_SET(tssp, PCPU_PTR(common_tss));
1207 PCPU_SET(tss, (struct system_segment_descriptor *)&gdt[GPROC0_SEL]);
1208 PCPU_SET(ldt, (struct system_segment_descriptor *)&gdt[GUSERLDT_SEL]);
1209 PCPU_SET(fs32p, &gdt[GUFS32_SEL]);
1210 PCPU_SET(gs32p, &gdt[GUGS32_SEL]);
1211 PCPU_SET(ucr3_load_mask, PMAP_UCR3_NOMASK);
1212 PCPU_SET(smp_tlb_gen, 1);
1213 }
1214
1215 void
amd64_bsp_pcpu_init2(uint64_t rsp0)1216 amd64_bsp_pcpu_init2(uint64_t rsp0)
1217 {
1218
1219 PCPU_SET(rsp0, rsp0);
1220 PCPU_SET(pti_rsp0, ((vm_offset_t)PCPU_PTR(pti_stack) +
1221 PC_PTI_STACK_SZ * sizeof(uint64_t)) & ~0xful);
1222 PCPU_SET(curpcb, thread0.td_pcb);
1223 }
1224
1225 void
amd64_bsp_ist_init(struct pcpu * pc)1226 amd64_bsp_ist_init(struct pcpu *pc)
1227 {
1228 struct nmi_pcpu *np;
1229 struct amd64tss *tssp;
1230
1231 tssp = &pc->pc_common_tss;
1232
1233 /* doublefault stack space, runs on ist1 */
1234 np = ((struct nmi_pcpu *)&dblfault_stack[sizeof(dblfault_stack)]) - 1;
1235 np->np_pcpu = (register_t)pc;
1236 tssp->tss_ist1 = (long)np;
1237
1238 /*
1239 * NMI stack, runs on ist2. The pcpu pointer is stored just
1240 * above the start of the ist2 stack.
1241 */
1242 np = ((struct nmi_pcpu *)&nmi0_stack[sizeof(nmi0_stack)]) - 1;
1243 np->np_pcpu = (register_t)pc;
1244 tssp->tss_ist2 = (long)np;
1245
1246 /*
1247 * MC# stack, runs on ist3. The pcpu pointer is stored just
1248 * above the start of the ist3 stack.
1249 */
1250 np = ((struct nmi_pcpu *)&mce0_stack[sizeof(mce0_stack)]) - 1;
1251 np->np_pcpu = (register_t)pc;
1252 tssp->tss_ist3 = (long)np;
1253
1254 /*
1255 * DB# stack, runs on ist4.
1256 */
1257 np = ((struct nmi_pcpu *)&dbg0_stack[sizeof(dbg0_stack)]) - 1;
1258 np->np_pcpu = (register_t)pc;
1259 tssp->tss_ist4 = (long)np;
1260 }
1261
1262 u_int64_t
hammer_time(u_int64_t modulep,u_int64_t physfree)1263 hammer_time(u_int64_t modulep, u_int64_t physfree)
1264 {
1265 caddr_t kmdp;
1266 int gsel_tss, x;
1267 struct pcpu *pc;
1268 uint64_t cr3, rsp0;
1269 pml4_entry_t *pml4e;
1270 pdp_entry_t *pdpe;
1271 pd_entry_t *pde;
1272 char *env;
1273 struct user_segment_descriptor *gdt;
1274 struct region_descriptor r_gdt;
1275 size_t kstack0_sz;
1276 int late_console;
1277
1278 TSRAW(&thread0, TS_ENTER, __func__, NULL);
1279
1280 /*
1281 * Calculate kernphys by inspecting page table created by loader.
1282 * The assumptions:
1283 * - kernel is mapped at KERNBASE, backed by contiguous phys memory
1284 * aligned at 2M, below 4G (the latter is important for AP startup)
1285 * - there is a 2M hole at KERNBASE
1286 * - kernel is mapped with 2M superpages
1287 * - all participating memory, i.e. kernel, modules, metadata,
1288 * page table is accessible by pre-created 1:1 mapping
1289 * (right now loader creates 1:1 mapping for lower 4G, and all
1290 * memory is from there)
1291 * - there is a usable memory block right after the end of the
1292 * mapped kernel and all modules/metadata, pointed to by
1293 * physfree, for early allocations
1294 */
1295 cr3 = rcr3();
1296 pml4e = (pml4_entry_t *)(cr3 & ~PAGE_MASK) + pmap_pml4e_index(
1297 (vm_offset_t)hammer_time);
1298 pdpe = (pdp_entry_t *)(*pml4e & ~PAGE_MASK) + pmap_pdpe_index(
1299 (vm_offset_t)hammer_time);
1300 pde = (pd_entry_t *)(*pdpe & ~PAGE_MASK) + pmap_pde_index(
1301 (vm_offset_t)hammer_time);
1302 kernphys = (vm_paddr_t)(*pde & ~PDRMASK) -
1303 (vm_paddr_t)(((vm_offset_t)hammer_time - KERNBASE) & ~PDRMASK);
1304
1305 /* Fix-up for 2M hole */
1306 physfree += kernphys;
1307 kernphys += NBPDR;
1308
1309 kmdp = init_ops.parse_preload_data(modulep);
1310
1311 efi_boot = preload_search_info(kmdp, MODINFO_METADATA |
1312 MODINFOMD_EFI_MAP) != NULL;
1313
1314 if (!efi_boot) {
1315 /* Tell the bios to warmboot next time */
1316 atomic_store_short((u_short *)0x472, 0x1234);
1317 }
1318
1319 physfree += ucode_load_bsp(physfree - kernphys + KERNSTART);
1320 physfree = roundup2(physfree, PAGE_SIZE);
1321
1322 identify_cpu1();
1323 identify_hypervisor();
1324 identify_cpu_fixup_bsp();
1325 identify_cpu2();
1326 initializecpucache();
1327
1328 /*
1329 * Check for pti, pcid, and invpcid before ifuncs are
1330 * resolved, to correctly select the implementation for
1331 * pmap_activate_sw_mode().
1332 */
1333 pti = pti_get_default();
1334 TUNABLE_INT_FETCH("vm.pmap.pti", &pti);
1335 TUNABLE_INT_FETCH("vm.pmap.pcid_enabled", &pmap_pcid_enabled);
1336 if ((cpu_feature2 & CPUID2_PCID) != 0 && pmap_pcid_enabled) {
1337 invpcid_works = (cpu_stdext_feature &
1338 CPUID_STDEXT_INVPCID) != 0;
1339 } else {
1340 pmap_pcid_enabled = 0;
1341 }
1342
1343 /*
1344 * Now we can do small core initialization, after the PCID
1345 * CPU features and user knobs are evaluated.
1346 */
1347 TUNABLE_INT_FETCH("vm.pmap.pcid_invlpg_workaround",
1348 &pmap_pcid_invlpg_workaround_uena);
1349 cpu_init_small_core();
1350
1351 if ((cpu_feature2 & CPUID2_XSAVE) != 0) {
1352 use_xsave = 1;
1353 TUNABLE_INT_FETCH("hw.use_xsave", &use_xsave);
1354 }
1355
1356 link_elf_ireloc(kmdp);
1357
1358 /*
1359 * This may be done better later if it gets more high level
1360 * components in it. If so just link td->td_proc here.
1361 */
1362 proc_linkup0(&proc0, &thread0);
1363
1364 /* Init basic tunables, hz etc */
1365 init_param1();
1366
1367 thread0.td_kstack = physfree - kernphys + KERNSTART;
1368 thread0.td_kstack_pages = kstack_pages;
1369 kstack0_sz = thread0.td_kstack_pages * PAGE_SIZE;
1370 bzero((void *)thread0.td_kstack, kstack0_sz);
1371 physfree += kstack0_sz;
1372
1373 /*
1374 * Initialize enough of thread0 for delayed invalidation to
1375 * work very early. Rely on thread0.td_base_pri
1376 * zero-initialization, it is reset to PVM at proc0_init().
1377 */
1378 pmap_thread_init_invl_gen(&thread0);
1379
1380 pc = &temp_bsp_pcpu;
1381 pcpu_init(pc, 0, sizeof(struct pcpu));
1382 gdt = &temp_bsp_pcpu.pc_gdt[0];
1383
1384 /*
1385 * make gdt memory segments
1386 */
1387 for (x = 0; x < NGDT; x++) {
1388 if (x != GPROC0_SEL && x != (GPROC0_SEL + 1) &&
1389 x != GUSERLDT_SEL && x != (GUSERLDT_SEL) + 1)
1390 ssdtosd(&gdt_segs[x], &gdt[x]);
1391 }
1392 gdt_segs[GPROC0_SEL].ssd_base = (uintptr_t)&pc->pc_common_tss;
1393 ssdtosyssd(&gdt_segs[GPROC0_SEL],
1394 (struct system_segment_descriptor *)&gdt[GPROC0_SEL]);
1395
1396 r_gdt.rd_limit = NGDT * sizeof(gdt[0]) - 1;
1397 r_gdt.rd_base = (long)gdt;
1398 lgdt(&r_gdt);
1399
1400 wrmsr(MSR_FSBASE, 0); /* User value */
1401 wrmsr(MSR_GSBASE, (u_int64_t)pc);
1402 wrmsr(MSR_KGSBASE, 0); /* User value while in the kernel */
1403
1404 dpcpu_init((void *)(physfree - kernphys + KERNSTART), 0);
1405 physfree += DPCPU_SIZE;
1406 amd64_bsp_pcpu_init1(pc);
1407 /* Non-late cninit() and printf() can be moved up to here. */
1408
1409 /*
1410 * Initialize mutexes.
1411 *
1412 * icu_lock: in order to allow an interrupt to occur in a critical
1413 * section, to set pcpu->ipending (etc...) properly, we
1414 * must be able to get the icu lock, so it can't be
1415 * under witness.
1416 */
1417 mutex_init();
1418 mtx_init(&icu_lock, "icu", NULL, MTX_SPIN | MTX_NOWITNESS);
1419 mtx_init(&dt_lock, "descriptor tables", NULL, MTX_DEF);
1420
1421 /* exceptions */
1422 for (x = 0; x < NIDT; x++)
1423 setidt(x, pti ? &IDTVEC(rsvd_pti) : &IDTVEC(rsvd), SDT_SYSIGT,
1424 SEL_KPL, 0);
1425 setidt(IDT_DE, pti ? &IDTVEC(div_pti) : &IDTVEC(div), SDT_SYSIGT,
1426 SEL_KPL, 0);
1427 setidt(IDT_DB, &IDTVEC(dbg), SDT_SYSIGT, SEL_KPL, 4);
1428 setidt(IDT_NMI, &IDTVEC(nmi), SDT_SYSIGT, SEL_KPL, 2);
1429 setidt(IDT_BP, pti ? &IDTVEC(bpt_pti) : &IDTVEC(bpt), SDT_SYSIGT,
1430 SEL_UPL, 0);
1431 setidt(IDT_OF, pti ? &IDTVEC(ofl_pti) : &IDTVEC(ofl), SDT_SYSIGT,
1432 SEL_UPL, 0);
1433 setidt(IDT_BR, pti ? &IDTVEC(bnd_pti) : &IDTVEC(bnd), SDT_SYSIGT,
1434 SEL_KPL, 0);
1435 setidt(IDT_UD, pti ? &IDTVEC(ill_pti) : &IDTVEC(ill), SDT_SYSIGT,
1436 SEL_KPL, 0);
1437 setidt(IDT_NM, pti ? &IDTVEC(dna_pti) : &IDTVEC(dna), SDT_SYSIGT,
1438 SEL_KPL, 0);
1439 setidt(IDT_DF, &IDTVEC(dblfault), SDT_SYSIGT, SEL_KPL, 1);
1440 setidt(IDT_FPUGP, pti ? &IDTVEC(fpusegm_pti) : &IDTVEC(fpusegm),
1441 SDT_SYSIGT, SEL_KPL, 0);
1442 setidt(IDT_TS, pti ? &IDTVEC(tss_pti) : &IDTVEC(tss), SDT_SYSIGT,
1443 SEL_KPL, 0);
1444 setidt(IDT_NP, pti ? &IDTVEC(missing_pti) : &IDTVEC(missing),
1445 SDT_SYSIGT, SEL_KPL, 0);
1446 setidt(IDT_SS, pti ? &IDTVEC(stk_pti) : &IDTVEC(stk), SDT_SYSIGT,
1447 SEL_KPL, 0);
1448 setidt(IDT_GP, pti ? &IDTVEC(prot_pti) : &IDTVEC(prot), SDT_SYSIGT,
1449 SEL_KPL, 0);
1450 setidt(IDT_PF, pti ? &IDTVEC(page_pti) : &IDTVEC(page), SDT_SYSIGT,
1451 SEL_KPL, 0);
1452 setidt(IDT_MF, pti ? &IDTVEC(fpu_pti) : &IDTVEC(fpu), SDT_SYSIGT,
1453 SEL_KPL, 0);
1454 setidt(IDT_AC, pti ? &IDTVEC(align_pti) : &IDTVEC(align), SDT_SYSIGT,
1455 SEL_KPL, 0);
1456 setidt(IDT_MC, &IDTVEC(mchk), SDT_SYSIGT, SEL_KPL, 3);
1457 setidt(IDT_XF, pti ? &IDTVEC(xmm_pti) : &IDTVEC(xmm), SDT_SYSIGT,
1458 SEL_KPL, 0);
1459 #ifdef KDTRACE_HOOKS
1460 setidt(IDT_DTRACE_RET, pti ? &IDTVEC(dtrace_ret_pti) :
1461 &IDTVEC(dtrace_ret), SDT_SYSIGT, SEL_UPL, 0);
1462 #endif
1463 #ifdef XENHVM
1464 setidt(IDT_EVTCHN, pti ? &IDTVEC(xen_intr_upcall_pti) :
1465 &IDTVEC(xen_intr_upcall), SDT_SYSIGT, SEL_KPL, 0);
1466 #endif
1467 r_idt.rd_limit = sizeof(idt0) - 1;
1468 r_idt.rd_base = (long) idt;
1469 lidt(&r_idt);
1470
1471 /*
1472 * Use vt(4) by default for UEFI boot (during the sc(4)/vt(4)
1473 * transition).
1474 * Once bootblocks have updated, we can test directly for
1475 * efi_systbl != NULL here...
1476 */
1477 if (efi_boot)
1478 vty_set_preferred(VTY_VT);
1479
1480 TUNABLE_INT_FETCH("hw.ibrs_disable", &hw_ibrs_disable);
1481 TUNABLE_INT_FETCH("machdep.mitigations.ibrs.disable", &hw_ibrs_disable);
1482
1483 TUNABLE_INT_FETCH("hw.spec_store_bypass_disable", &hw_ssb_disable);
1484 TUNABLE_INT_FETCH("machdep.mitigations.ssb.disable", &hw_ssb_disable);
1485
1486 TUNABLE_INT_FETCH("machdep.syscall_ret_flush_l1d",
1487 &syscall_ret_l1d_flush_mode);
1488
1489 TUNABLE_INT_FETCH("hw.mds_disable", &hw_mds_disable);
1490 TUNABLE_INT_FETCH("machdep.mitigations.mds.disable", &hw_mds_disable);
1491
1492 TUNABLE_INT_FETCH("machdep.mitigations.taa.enable", &x86_taa_enable);
1493
1494 TUNABLE_INT_FETCH("machdep.mitigations.rngds.enable",
1495 &x86_rngds_mitg_enable);
1496
1497 TUNABLE_INT_FETCH("machdep.mitigations.zenbleed.enable",
1498 &zenbleed_enable);
1499 zenbleed_sanitize_enable();
1500
1501 finishidentcpu(); /* Final stage of CPU initialization */
1502
1503 /*
1504 * Initialize the clock before the console so that console
1505 * initialization can use DELAY().
1506 */
1507 clock_init();
1508
1509 initializecpu(); /* Initialize CPU registers */
1510
1511 amd64_bsp_ist_init(pc);
1512
1513 /* Set the IO permission bitmap (empty due to tss seg limit) */
1514 pc->pc_common_tss.tss_iobase = sizeof(struct amd64tss) +
1515 IOPERM_BITMAP_SIZE;
1516
1517 gsel_tss = GSEL(GPROC0_SEL, SEL_KPL);
1518 ltr(gsel_tss);
1519
1520 amd64_conf_fast_syscall();
1521
1522 /*
1523 * We initialize the PCB pointer early so that exception
1524 * handlers will work. Also set up td_critnest to short-cut
1525 * the page fault handler.
1526 */
1527 cpu_max_ext_state_size = sizeof(struct savefpu);
1528 set_top_of_stack_td(&thread0);
1529 thread0.td_pcb = get_pcb_td(&thread0);
1530 thread0.td_critnest = 1;
1531
1532 /*
1533 * The console and kdb should be initialized even earlier than here,
1534 * but some console drivers don't work until after getmemsize().
1535 * Default to late console initialization to support these drivers.
1536 * This loses mainly printf()s in getmemsize() and early debugging.
1537 */
1538 late_console = 1;
1539 TUNABLE_INT_FETCH("debug.late_console", &late_console);
1540 if (!late_console) {
1541 cninit();
1542 amd64_kdb_init();
1543 }
1544
1545 getmemsize(kmdp, physfree);
1546 init_param2(physmem);
1547
1548 /* now running on new page tables, configured,and u/iom is accessible */
1549
1550 #ifdef DEV_PCI
1551 /* This call might adjust phys_avail[]. */
1552 pci_early_quirks();
1553 #endif
1554
1555 if (late_console)
1556 cninit();
1557
1558 /*
1559 * Dump the boot metadata. We have to wait for cninit() since console
1560 * output is required. If it's grossly incorrect the kernel will never
1561 * make it this far.
1562 */
1563 if (getenv_is_true("debug.dump_modinfo_at_boot"))
1564 preload_dump();
1565
1566 #ifdef DEV_ISA
1567 #ifdef DEV_ATPIC
1568 elcr_probe();
1569 atpic_startup();
1570 #else
1571 /* Reset and mask the atpics and leave them shut down. */
1572 atpic_reset();
1573
1574 /*
1575 * Point the ICU spurious interrupt vectors at the APIC spurious
1576 * interrupt handler.
1577 */
1578 setidt(IDT_IO_INTS + 7, IDTVEC(spuriousint), SDT_SYSIGT, SEL_KPL, 0);
1579 setidt(IDT_IO_INTS + 15, IDTVEC(spuriousint), SDT_SYSIGT, SEL_KPL, 0);
1580 #endif
1581 #else
1582 #error "have you forgotten the isa device?"
1583 #endif
1584
1585 if (late_console)
1586 amd64_kdb_init();
1587
1588 msgbufinit(msgbufp, msgbufsize);
1589 fpuinit();
1590
1591 /* make an initial tss so cpu can get interrupt stack on syscall! */
1592 rsp0 = thread0.td_md.md_stack_base;
1593 /* Ensure the stack is aligned to 16 bytes */
1594 rsp0 &= ~0xFul;
1595 PCPU_PTR(common_tss)->tss_rsp0 = rsp0;
1596 amd64_bsp_pcpu_init2(rsp0);
1597
1598 /* transfer to user mode */
1599
1600 _ucodesel = GSEL(GUCODE_SEL, SEL_UPL);
1601 _udatasel = GSEL(GUDATA_SEL, SEL_UPL);
1602 _ucode32sel = GSEL(GUCODE32_SEL, SEL_UPL);
1603 _ufssel = GSEL(GUFS32_SEL, SEL_UPL);
1604 _ugssel = GSEL(GUGS32_SEL, SEL_UPL);
1605
1606 load_ds(_udatasel);
1607 load_es(_udatasel);
1608 load_fs(_ufssel);
1609
1610 /* setup proc 0's pcb */
1611 thread0.td_pcb->pcb_flags = 0;
1612 thread0.td_frame = &proc0_tf;
1613
1614 env = kern_getenv("kernelname");
1615 if (env != NULL)
1616 strlcpy(kernelname, env, sizeof(kernelname));
1617
1618 kcsan_cpu_init(0);
1619
1620 #ifdef FDT
1621 x86_init_fdt();
1622 #endif
1623 thread0.td_critnest = 0;
1624
1625 kasan_init();
1626
1627 TSEXIT();
1628
1629 /* Location of kernel stack for locore */
1630 return (thread0.td_md.md_stack_base);
1631 }
1632
1633 void
cpu_pcpu_init(struct pcpu * pcpu,int cpuid,size_t size)1634 cpu_pcpu_init(struct pcpu *pcpu, int cpuid, size_t size)
1635 {
1636
1637 pcpu->pc_acpi_id = 0xffffffff;
1638 }
1639
1640 static int
smap_sysctl_handler(SYSCTL_HANDLER_ARGS)1641 smap_sysctl_handler(SYSCTL_HANDLER_ARGS)
1642 {
1643 struct bios_smap *smapbase;
1644 struct bios_smap_xattr smap;
1645 caddr_t kmdp;
1646 uint32_t *smapattr;
1647 int count, error, i;
1648
1649 /* Retrieve the system memory map from the loader. */
1650 kmdp = preload_search_by_type("elf kernel");
1651 if (kmdp == NULL)
1652 kmdp = preload_search_by_type("elf64 kernel");
1653 smapbase = (struct bios_smap *)preload_search_info(kmdp,
1654 MODINFO_METADATA | MODINFOMD_SMAP);
1655 if (smapbase == NULL)
1656 return (0);
1657 smapattr = (uint32_t *)preload_search_info(kmdp,
1658 MODINFO_METADATA | MODINFOMD_SMAP_XATTR);
1659 count = *((uint32_t *)smapbase - 1) / sizeof(*smapbase);
1660 error = 0;
1661 for (i = 0; i < count; i++) {
1662 smap.base = smapbase[i].base;
1663 smap.length = smapbase[i].length;
1664 smap.type = smapbase[i].type;
1665 if (smapattr != NULL)
1666 smap.xattr = smapattr[i];
1667 else
1668 smap.xattr = 0;
1669 error = SYSCTL_OUT(req, &smap, sizeof(smap));
1670 }
1671 return (error);
1672 }
1673 SYSCTL_PROC(_machdep, OID_AUTO, smap,
1674 CTLTYPE_OPAQUE | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
1675 smap_sysctl_handler, "S,bios_smap_xattr",
1676 "Raw BIOS SMAP data");
1677
1678 static int
efi_map_sysctl_handler(SYSCTL_HANDLER_ARGS)1679 efi_map_sysctl_handler(SYSCTL_HANDLER_ARGS)
1680 {
1681 struct efi_map_header *efihdr;
1682 caddr_t kmdp;
1683 uint32_t efisize;
1684
1685 kmdp = preload_search_by_type("elf kernel");
1686 if (kmdp == NULL)
1687 kmdp = preload_search_by_type("elf64 kernel");
1688 efihdr = (struct efi_map_header *)preload_search_info(kmdp,
1689 MODINFO_METADATA | MODINFOMD_EFI_MAP);
1690 if (efihdr == NULL)
1691 return (0);
1692 efisize = *((uint32_t *)efihdr - 1);
1693 return (SYSCTL_OUT(req, efihdr, efisize));
1694 }
1695 SYSCTL_PROC(_machdep, OID_AUTO, efi_map,
1696 CTLTYPE_OPAQUE | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
1697 efi_map_sysctl_handler, "S,efi_map_header",
1698 "Raw EFI Memory Map");
1699
1700 void
spinlock_enter(void)1701 spinlock_enter(void)
1702 {
1703 struct thread *td;
1704 register_t flags;
1705
1706 td = curthread;
1707 if (td->td_md.md_spinlock_count == 0) {
1708 flags = intr_disable();
1709 td->td_md.md_spinlock_count = 1;
1710 td->td_md.md_saved_flags = flags;
1711 critical_enter();
1712 } else
1713 td->td_md.md_spinlock_count++;
1714 }
1715
1716 void
spinlock_exit(void)1717 spinlock_exit(void)
1718 {
1719 struct thread *td;
1720 register_t flags;
1721
1722 td = curthread;
1723 flags = td->td_md.md_saved_flags;
1724 td->td_md.md_spinlock_count--;
1725 if (td->td_md.md_spinlock_count == 0) {
1726 critical_exit();
1727 intr_restore(flags);
1728 }
1729 }
1730
1731 /*
1732 * Construct a PCB from a trapframe. This is called from kdb_trap() where
1733 * we want to start a backtrace from the function that caused us to enter
1734 * the debugger. We have the context in the trapframe, but base the trace
1735 * on the PCB. The PCB doesn't have to be perfect, as long as it contains
1736 * enough for a backtrace.
1737 */
1738 void
makectx(struct trapframe * tf,struct pcb * pcb)1739 makectx(struct trapframe *tf, struct pcb *pcb)
1740 {
1741
1742 pcb->pcb_r12 = tf->tf_r12;
1743 pcb->pcb_r13 = tf->tf_r13;
1744 pcb->pcb_r14 = tf->tf_r14;
1745 pcb->pcb_r15 = tf->tf_r15;
1746 pcb->pcb_rbp = tf->tf_rbp;
1747 pcb->pcb_rbx = tf->tf_rbx;
1748 pcb->pcb_rip = tf->tf_rip;
1749 pcb->pcb_rsp = tf->tf_rsp;
1750 }
1751
1752 /*
1753 * The pcb_flags is only modified by current thread, or by other threads
1754 * when current thread is stopped. However, current thread may change it
1755 * from the interrupt context in cpu_switch(), or in the trap handler.
1756 * When we read-modify-write pcb_flags from C sources, compiler may generate
1757 * code that is not atomic regarding the interrupt handler. If a trap or
1758 * interrupt happens and any flag is modified from the handler, it can be
1759 * clobbered with the cached value later. Therefore, we implement setting
1760 * and clearing flags with single-instruction functions, which do not race
1761 * with possible modification of the flags from the trap or interrupt context,
1762 * because traps and interrupts are executed only on instruction boundary.
1763 */
1764 void
set_pcb_flags_raw(struct pcb * pcb,const u_int flags)1765 set_pcb_flags_raw(struct pcb *pcb, const u_int flags)
1766 {
1767
1768 __asm __volatile("orl %1,%0"
1769 : "=m" (pcb->pcb_flags) : "ir" (flags), "m" (pcb->pcb_flags)
1770 : "cc", "memory");
1771
1772 }
1773
1774 /*
1775 * The support for RDFSBASE, WRFSBASE and similar instructions for %gs
1776 * base requires that kernel saves MSR_FSBASE and MSR_{K,}GSBASE into
1777 * pcb if user space modified the bases. We must save on the context
1778 * switch or if the return to usermode happens through the doreti.
1779 *
1780 * Tracking of both events is performed by the pcb flag PCB_FULL_IRET,
1781 * which have a consequence that the base MSRs must be saved each time
1782 * the PCB_FULL_IRET flag is set. We disable interrupts to sync with
1783 * context switches.
1784 */
1785 static void
set_pcb_flags_fsgsbase(struct pcb * pcb,const u_int flags)1786 set_pcb_flags_fsgsbase(struct pcb *pcb, const u_int flags)
1787 {
1788 register_t r;
1789
1790 if (curpcb == pcb &&
1791 (flags & PCB_FULL_IRET) != 0 &&
1792 (pcb->pcb_flags & PCB_FULL_IRET) == 0) {
1793 r = intr_disable();
1794 if ((pcb->pcb_flags & PCB_FULL_IRET) == 0) {
1795 if (rfs() == _ufssel)
1796 pcb->pcb_fsbase = rdfsbase();
1797 if (rgs() == _ugssel)
1798 pcb->pcb_gsbase = rdmsr(MSR_KGSBASE);
1799 }
1800 set_pcb_flags_raw(pcb, flags);
1801 intr_restore(r);
1802 } else {
1803 set_pcb_flags_raw(pcb, flags);
1804 }
1805 }
1806
1807 DEFINE_IFUNC(, void, set_pcb_flags, (struct pcb *, const u_int))
1808 {
1809
1810 return ((cpu_stdext_feature & CPUID_STDEXT_FSGSBASE) != 0 ?
1811 set_pcb_flags_fsgsbase : set_pcb_flags_raw);
1812 }
1813
1814 void
clear_pcb_flags(struct pcb * pcb,const u_int flags)1815 clear_pcb_flags(struct pcb *pcb, const u_int flags)
1816 {
1817
1818 __asm __volatile("andl %1,%0"
1819 : "=m" (pcb->pcb_flags) : "ir" (~flags), "m" (pcb->pcb_flags)
1820 : "cc", "memory");
1821 }
1822
#ifdef KDB

/*
 * inb() and outb() are normally available only as inline functions and
 * so cannot be called from the debugger.  Provide real functions that
 * wrap them.
 */

/* Prototypes, to silence compiler warnings. */
u_char inb_(u_short);
void outb_(u_short, u_char);

/* Read and return one byte from the given I/O port. */
u_char
inb_(u_short port)
{

	return (inb(port));
}

/* Write one byte to the given I/O port. */
void
outb_(u_short port, u_char data)
{

	outb(port, data);
}

#endif /* KDB */
1847
1848 #undef memset
1849 #undef memmove
1850 #undef memcpy
1851
1852 void *memset_std(void *buf, int c, size_t len);
1853 void *memset_erms(void *buf, int c, size_t len);
1854 void *memmove_std(void * _Nonnull dst, const void * _Nonnull src,
1855 size_t len);
1856 void *memmove_erms(void * _Nonnull dst, const void * _Nonnull src,
1857 size_t len);
1858 void *memcpy_std(void * _Nonnull dst, const void * _Nonnull src,
1859 size_t len);
1860 void *memcpy_erms(void * _Nonnull dst, const void * _Nonnull src,
1861 size_t len);
1862
1863 #ifdef KCSAN
1864 /*
1865 * These fail to build as ifuncs when used with KCSAN.
1866 */
1867 void *
memset(void * buf,int c,size_t len)1868 memset(void *buf, int c, size_t len)
1869 {
1870
1871 return (memset_std(buf, c, len));
1872 }
1873
1874 void *
memmove(void * _Nonnull dst,const void * _Nonnull src,size_t len)1875 memmove(void * _Nonnull dst, const void * _Nonnull src, size_t len)
1876 {
1877
1878 return (memmove_std(dst, src, len));
1879 }
1880
1881 void *
memcpy(void * _Nonnull dst,const void * _Nonnull src,size_t len)1882 memcpy(void * _Nonnull dst, const void * _Nonnull src, size_t len)
1883 {
1884
1885 return (memcpy_std(dst, src, len));
1886 }
1887 #else
1888 DEFINE_IFUNC(, void *, memset, (void *, int, size_t))
1889 {
1890
1891 return ((cpu_stdext_feature & CPUID_STDEXT_ERMS) != 0 ?
1892 memset_erms : memset_std);
1893 }
1894
1895 DEFINE_IFUNC(, void *, memmove, (void * _Nonnull, const void * _Nonnull,
1896 size_t))
1897 {
1898
1899 return ((cpu_stdext_feature & CPUID_STDEXT_ERMS) != 0 ?
1900 memmove_erms : memmove_std);
1901 }
1902
1903 DEFINE_IFUNC(, void *, memcpy, (void * _Nonnull, const void * _Nonnull,size_t))
1904 {
1905
1906 return ((cpu_stdext_feature & CPUID_STDEXT_ERMS) != 0 ?
1907 memcpy_erms : memcpy_std);
1908 }
1909 #endif
1910
1911 void pagezero_std(void *addr);
1912 void pagezero_erms(void *addr);
1913 DEFINE_IFUNC(, void , pagezero, (void *))
1914 {
1915
1916 return ((cpu_stdext_feature & CPUID_STDEXT_ERMS) != 0 ?
1917 pagezero_erms : pagezero_std);
1918 }
1919