1 /*        $NetBSD: machdep.c,v 1.849 2025/05/05 16:57:41 imil Exp $   */
2 
3 /*
4  * Copyright (c) 1996, 1997, 1998, 2000, 2004, 2006, 2008, 2009, 2017
5  *     The NetBSD Foundation, Inc.
6  * All rights reserved.
7  *
8  * This code is derived from software contributed to The NetBSD Foundation
9  * by Charles M. Hannum, by Jason R. Thorpe of the Numerical Aerospace
10  * Simulation Facility NASA Ames Research Center, by Julio M. Merino Vidal,
11  * by Andrew Doran, and by Maxime Villard.
12  *
13  * Redistribution and use in source and binary forms, with or without
14  * modification, are permitted provided that the following conditions
15  * are met:
16  * 1. Redistributions of source code must retain the above copyright
17  *    notice, this list of conditions and the following disclaimer.
18  * 2. Redistributions in binary form must reproduce the above copyright
19  *    notice, this list of conditions and the following disclaimer in the
20  *    documentation and/or other materials provided with the distribution.
21  *
22  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
23  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
24  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
25  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
26  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
27  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
28  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
29  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
30  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
31  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
32  * POSSIBILITY OF SUCH DAMAGE.
33  */
34 
35 /*
36  * Copyright (c) 1982, 1987, 1990 The Regents of the University of California.
37  * All rights reserved.
38  *
39  * This code is derived from software contributed to Berkeley by
40  * William Jolitz.
41  *
42  * Redistribution and use in source and binary forms, with or without
43  * modification, are permitted provided that the following conditions
44  * are met:
45  * 1. Redistributions of source code must retain the above copyright
46  *    notice, this list of conditions and the following disclaimer.
47  * 2. Redistributions in binary form must reproduce the above copyright
48  *    notice, this list of conditions and the following disclaimer in the
49  *    documentation and/or other materials provided with the distribution.
50  * 3. Neither the name of the University nor the names of its contributors
51  *    may be used to endorse or promote products derived from this software
52  *    without specific prior written permission.
53  *
54  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
55  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
56  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
57  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
58  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
59  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
60  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
61  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
62  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
63  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
64  * SUCH DAMAGE.
65  *
66  *        @(#)machdep.c       7.4 (Berkeley) 6/3/91
67  */
68 
69 #include <sys/cdefs.h>
70 __KERNEL_RCSID(0, "$NetBSD: machdep.c,v 1.849 2025/05/05 16:57:41 imil Exp $");
71 
72 #include "opt_beep.h"
73 #include "opt_compat_freebsd.h"
74 #include "opt_compat_netbsd.h"
75 #include "opt_cpureset_delay.h"
76 #include "opt_ddb.h"
77 #include "opt_kgdb.h"
78 #include "opt_mtrr.h"
79 #include "opt_modular.h"
80 #include "opt_multiboot.h"
81 #include "opt_multiprocessor.h"
82 #include "opt_physmem.h"
83 #include "opt_realmem.h"
84 #include "opt_user_ldt.h"
85 #include "opt_xen.h"
86 #include "isa.h"
87 #include "pci.h"
88 
89 #include <sys/param.h>
90 #include <sys/systm.h>
91 #include <sys/signal.h>
92 #include <sys/signalvar.h>
93 #include <sys/kernel.h>
94 #include <sys/cpu.h>
95 #include <sys/exec.h>
96 #include <sys/fcntl.h>
97 #include <sys/reboot.h>
98 #include <sys/conf.h>
99 #include <sys/kauth.h>
100 #include <sys/msgbuf.h>
101 #include <sys/mount.h>
102 #include <sys/syscallargs.h>
103 #include <sys/core.h>
104 #include <sys/kcore.h>
105 #include <sys/ucontext.h>
106 #include <sys/ras.h>
107 #include <sys/ksyms.h>
108 #include <sys/device.h>
109 #include <sys/timevar.h>
110 
111 #ifdef KGDB
112 #include <sys/kgdb.h>
113 #endif
114 
115 #include <dev/cons.h>
116 #include <dev/mm.h>
117 
118 #include <uvm/uvm.h>
119 #include <uvm/uvm_page.h>
120 
121 #include <sys/sysctl.h>
122 
123 #include <x86/efi.h>
124 
125 #include <machine/cpu.h>
126 #include <machine/cpu_rng.h>
127 #include <machine/cpufunc.h>
128 #include <machine/cpuvar.h>
129 #include <machine/gdt.h>
130 #include <machine/intr.h>
131 #include <machine/kcore.h>
132 #include <machine/pio.h>
133 #include <machine/psl.h>
134 #include <machine/reg.h>
135 #include <machine/specialreg.h>
136 #include <machine/bootinfo.h>
137 #include <machine/mtrr.h>
138 #include <machine/pmap_private.h>
139 #include <x86/x86/tsc.h>
140 
141 #include <x86/bootspace.h>
142 #include <x86/fpu.h>
143 #include <x86/dbregs.h>
144 #include <x86/machdep.h>
145 
146 #include <machine/multiboot.h>
147 
148 #ifdef XEN
149 #include <xen/evtchn.h>
150 #include <xen/xen.h>
151 #include <xen/hypervisor.h>
152 #endif
153 
154 #include <dev/isa/isareg.h>
155 #include <machine/isa_machdep.h>
156 #include <dev/ic/i8042reg.h>
157 
158 #include <ddb/db_active.h>
159 
160 #ifdef DDB
161 #include <machine/db_machdep.h>
162 #include <ddb/db_extern.h>
163 #endif
164 
165 #include "acpica.h"
166 #include "bioscall.h"
167 
168 #if NBIOSCALL > 0
169 #include <machine/bioscall.h>
170 #endif
171 
172 #if NACPICA > 0
173 #include <dev/acpi/acpivar.h>
174 #define ACPI_MACHDEP_PRIVATE
175 #include <machine/acpi_machdep.h>
176 #else
177 #include <machine/i82489var.h>
178 #endif
179 
180 #include "isa.h"
181 #include "isadma.h"
182 #include "ksyms.h"
183 
184 #include "cardbus.h"
185 #if NCARDBUS > 0
186 /* For rbus_min_start hint. */
187 #include <sys/bus.h>
188 #include <dev/cardbus/rbus.h>
189 #include <machine/rbus_machdep.h>
190 #endif
191 
192 #include "mca.h"
193 #if NMCA > 0
194 #include <machine/mca_machdep.h>        /* for mca_busprobe() */
195 #endif
196 
197 #ifdef MULTIPROCESSOR                   /* XXX */
198 #include <machine/mpbiosvar.h>          /* XXX */
199 #endif                                  /* XXX */
200 
/* the following is used externally (sysctl_hw) */
char machine[] = "i386";                /* CPU "architecture" */
char machine_arch[] = "i386";           /* machine == machine_arch */

/*
 * Time, in milliseconds, that cpu_reboot() waits after printing
 * "rebooting..." and before calling cpu_reset().  The wait is skipped
 * when the value is not positive.  A kernel built with CPURESET_DELAY
 * defined uses that value instead of the default.
 */
#ifdef CPURESET_DELAY
int cpureset_delay = CPURESET_DELAY;
#else
int cpureset_delay = 2000; /* default to 2s */
#endif

/*
 * MTRR operations table; only present in kernels built with MTRR.
 * NOTE(review): not assigned in the part of the file reviewed here --
 * presumably installed by the MTRR support code.
 */
#ifdef MTRR
const struct mtrr_funcs *mtrr_funcs;
#endif

/*
 * CPU classification and feature/quirk flags.
 * NOTE(review): none of these is written in the part of the file reviewed
 * here; confirm their exact meaning against the code that sets them.
 */
int cpu_class;
int use_pae;
int i386_fpu_fdivbug;

int i386_use_fxsave;
int i386_has_sse;
int i386_has_sse2;

/*
 * Virtual (vaddr_t) and physical (paddr_t) addresses recorded for the
 * IDT, GDT and LDT.  NOTE(review): assigned outside the part of the file
 * reviewed here.
 */
vaddr_t idt_vaddr;
paddr_t idt_paddr;
vaddr_t gdt_vaddr;
paddr_t gdt_paddr;
vaddr_t ldt_vaddr;
paddr_t ldt_paddr;

/* NOTE(review): presumably the address of the alternate "pentium" IDT. */
vaddr_t pentium_idt_vaddr;

/* Submap of kernel_map used for physio; allocated by cpu_startup(). */
struct vm_map *phys_map = NULL;
233 
/* Defined elsewhere; declared here for use by this file. */
extern struct bootspace bootspace;

extern paddr_t lowmem_rsvd;
extern paddr_t avail_start, avail_end;
#ifdef XENPV
/* Only declared for Xen PV kernels. */
extern paddr_t pmap_pa_start, pmap_pa_end;
void hypervisor_callback(void);
void failsafe_callback(void);
#endif

/*
 * Size of memory segments, before any memory is stolen.
 */
phys_ram_seg_t mem_clusters[VM_PHYSSEG_MAX];
int mem_cluster_cnt = 0;

/* Prototypes for functions that have no header-file declaration here. */
void init_bootspace(void);
void init386(paddr_t);
void initgdt(union descriptor *);

static void i386_proc0_pcb_ldt_init(void);

/*
 * esym:  virtual address of the end of the symbol table as reported by
 *        the boot loader, or NULL when none was passed; stored by
 *        native_loader().
 * eblob: virtual address of the end of the loaded modules, taken from a
 *        BTINFO_MODULELIST bootinfo entry by native_loader().
 */
int *esym;
int *eblob;
/* Boot flags; written by native_loader() and cpu_reboot(). */
extern int boothowto;
259 
260 #ifndef XENPV
261 
/*
 * Base memory reported by BIOS.
 * A kernel built with REALBASEMEM uses that value; otherwise the variable
 * starts at 0, which native_loader() treats as "not configured" and
 * replaces with the value handed over by the boot loader.
 */
#ifndef REALBASEMEM
int biosbasemem = 0;
#else
int biosbasemem = REALBASEMEM;
#endif

/*
 * Extended memory reported by BIOS.
 * Same convention as biosbasemem, with REALEXTMEM as the build-time
 * override.
 */
#ifndef REALEXTMEM
int biosextmem = 0;
#else
int biosextmem = REALEXTMEM;
#endif

/*
 * Set if any boot-loader set biosbasemem/biosextmem.
 * native_loader() stores 1 here whenever it fills in either variable.
 */
int biosmem_implicit;
278 
/*
 * Representation of the bootinfo structure constructed by a NetBSD native
 * boot loader.  Only used by native_loader().
 *
 * bs_naddrs is the number of pointers stored in bs_addrs.  native_loader()
 * treats each pointer as the address of a bootinfo record that begins with
 * a struct btinfo_common header.
 *
 * bs_addrs is a flexible array member: the boot loader supplies as many
 * entries as bs_naddrs says, so the array has no fixed bound.  The member
 * keeps the same offset it had when it was declared with a dummy length
 * of one; only sizeof(struct bootinfo_source) changes, and this structure
 * is only ever accessed through a pointer.
 */
struct bootinfo_source {
	uint32_t bs_naddrs;
	void *bs_addrs[];
};
287 
/*
 * Only called by locore.S; no need to be in a header file.
 * Arguments, in order: boothowto, bootdev, bootinfo, esym, biosextmem,
 * biosbasemem (see the definition below).
 */
void native_loader(int, int, struct bootinfo_source *, paddr_t, int, int);
290 
291 /*
292  * Called as one of the very first things during system startup (just after
293  * the boot loader gave control to the kernel image), this routine is in
294  * charge of retrieving the parameters passed in by the boot loader and
295  * storing them in the appropriate kernel variables.
296  *
297  * WARNING: Because the kernel has not yet relocated itself to KERNBASE,
298  * special care has to be taken when accessing memory because absolute
299  * addresses (referring to kernel symbols) do not work.  So:
300  *
301  *     1) Avoid jumps to absolute addresses (such as gotos and switches).
302  *     2) To access global variables use their physical address, which
303  *        can be obtained using the RELOC macro.
304  */
305 void
native_loader(int bl_boothowto,int bl_bootdev,struct bootinfo_source * bl_bootinfo,paddr_t bl_esym,int bl_biosextmem,int bl_biosbasemem)306 native_loader(int bl_boothowto, int bl_bootdev,
307     struct bootinfo_source *bl_bootinfo, paddr_t bl_esym,
308     int bl_biosextmem, int bl_biosbasemem)
309 {
310 #define RELOC(type, x) ((type)((vaddr_t)(x) - KERNBASE))
311 
312           *RELOC(int *, &boothowto) = bl_boothowto;
313 
314           /*
315            * The boot loader provides a physical, non-relocated address
316            * for the symbols table's end.  We need to convert it to a
317            * virtual address.
318            */
319           if (bl_esym != 0)
320                     *RELOC(int **, &esym) = (int *)((vaddr_t)bl_esym + KERNBASE);
321           else
322                     *RELOC(int **, &esym) = 0;
323 
324           /*
325            * Copy bootinfo entries (if any) from the boot loader's
326            * representation to the kernel's bootinfo space.
327            */
328           if (bl_bootinfo != NULL) {
329                     size_t i;
330                     uint8_t *data;
331                     struct bootinfo *bidest;
332                     struct btinfo_modulelist *bi;
333 
334                     bidest = RELOC(struct bootinfo *, &bootinfo);
335 
336                     data = &bidest->bi_data[0];
337 
338                     for (i = 0; i < bl_bootinfo->bs_naddrs; i++) {
339                               struct btinfo_common *bc;
340 
341                               bc = bl_bootinfo->bs_addrs[i];
342 
343                               if ((data + bc->len) >
344                                   (&bidest->bi_data[0] + BOOTINFO_MAXSIZE))
345                                         break;
346 
347                               memcpy(data, bc, bc->len);
348                               /*
349                                * If any modules were loaded, record where they
350                                * end.  We'll need to skip over them.
351                                */
352                               bi = (struct btinfo_modulelist *)data;
353                               if (bi->common.type == BTINFO_MODULELIST) {
354                                         *RELOC(int **, &eblob) =
355                                             (int *)(bi->endpa + KERNBASE);
356                               }
357                               data += bc->len;
358                     }
359                     bidest->bi_nentries = i;
360           }
361 
362           /*
363            * Configure biosbasemem and biosextmem only if they were not
364            * explicitly given during the kernel's build.
365            */
366           if (*RELOC(int *, &biosbasemem) == 0) {
367                     *RELOC(int *, &biosbasemem) = bl_biosbasemem;
368                     *RELOC(int *, &biosmem_implicit) = 1;
369           }
370           if (*RELOC(int *, &biosextmem) == 0) {
371                     *RELOC(int *, &biosextmem) = bl_biosextmem;
372                     *RELOC(int *, &biosmem_implicit) = 1;
373           }
374 #undef RELOC
375 }
376 
377 #endif /* XENPV */
378 
379 /*
380  * Machine-dependent startup code
381  */
382 void
cpu_startup(void)383 cpu_startup(void)
384 {
385           int x, y;
386           vaddr_t minaddr, maxaddr;
387           psize_t sz;
388 
389           /*
390            * For console drivers that require uvm and pmap to be initialized,
391            * we'll give them one more chance here...
392            */
393           consinit();
394 
395           /*
396            * Initialize error message buffer (et end of core).
397            */
398           if (msgbuf_p_cnt == 0)
399                     panic("msgbuf paddr map has not been set up");
400           for (x = 0, sz = 0; x < msgbuf_p_cnt; sz += msgbuf_p_seg[x++].sz)
401                     continue;
402 
403           msgbuf_vaddr = uvm_km_alloc(kernel_map, sz, 0, UVM_KMF_VAONLY);
404           if (msgbuf_vaddr == 0)
405                     panic("failed to valloc msgbuf_vaddr");
406 
407           for (y = 0, sz = 0; y < msgbuf_p_cnt; y++) {
408                     for (x = 0; x < btoc(msgbuf_p_seg[y].sz); x++, sz += PAGE_SIZE)
409                               pmap_kenter_pa((vaddr_t)msgbuf_vaddr + sz,
410                                   msgbuf_p_seg[y].paddr + x * PAGE_SIZE,
411                                   VM_PROT_READ|VM_PROT_WRITE, 0);
412           }
413 
414           pmap_update(pmap_kernel());
415 
416           initmsgbuf((void *)msgbuf_vaddr, sz);
417 
418 #ifdef MULTIBOOT
419           multiboot1_print_info();
420           multiboot2_print_info();
421 #endif
422 
423 #if NCARDBUS > 0
424           /* Tell RBUS how much RAM we have, so it can use heuristics. */
425           rbus_min_start_hint(ctob((psize_t)physmem));
426 #endif
427 
428           minaddr = 0;
429 
430           /*
431            * Allocate a submap for physio
432            */
433           phys_map = uvm_km_suballoc(kernel_map, &minaddr, &maxaddr,
434               VM_PHYS_SIZE, 0, false, NULL);
435 
436           /* Say hello. */
437           banner();
438 
439           /* Safe for i/o port / memory space allocation to use malloc now. */
440 #if NISA > 0 || NPCI > 0
441           x86_bus_space_mallocok();
442 #endif
443 
444           gdt_init();
445           i386_proc0_pcb_ldt_init();
446 
447           cpu_init_tss(&cpu_info_primary);
448 #ifndef XENPV
449           ltr(cpu_info_primary.ci_tss_sel);
450 #endif
451 
452           x86_startup();
453 }
454 
455 /*
456  * Set up proc0's PCB and LDT.
457  */
458 static void
i386_proc0_pcb_ldt_init(void)459 i386_proc0_pcb_ldt_init(void)
460 {
461           struct lwp *l = &lwp0;
462           struct pcb *pcb = lwp_getpcb(l);
463 
464           pcb->pcb_cr0 = rcr0() & ~CR0_TS;
465           pcb->pcb_esp0 = uvm_lwp_getuarea(l) + USPACE - 16;
466           pcb->pcb_iopl = IOPL_KPL;
467           l->l_md.md_regs = (struct trapframe *)pcb->pcb_esp0 - 1;
468           memcpy(&pcb->pcb_fsd, &gdtstore[GUDATA_SEL], sizeof(pcb->pcb_fsd));
469           memcpy(&pcb->pcb_gsd, &gdtstore[GUDATA_SEL], sizeof(pcb->pcb_gsd));
470           pcb->pcb_dbregs = NULL;
471 
472 #ifndef XENPV
473           lldt(GSEL(GLDT_SEL, SEL_KPL));
474 #else
475           HYPERVISOR_fpu_taskswitch(1);
476           HYPERVISOR_stack_switch(GSEL(GDATA_SEL, SEL_KPL), pcb->pcb_esp0);
477 #endif
478 }
479 
480 #ifdef XENPV
481 /* used in assembly */
482 void i386_switch_context(lwp_t *);
483 void i386_tls_switch(lwp_t *);
484 
485 /*
486  * Switch context:
487  * - switch stack pointer for user->kernel transition
488  */
489 void
i386_switch_context(lwp_t * l)490 i386_switch_context(lwp_t *l)
491 {
492           struct pcb *pcb;
493 
494           pcb = lwp_getpcb(l);
495 
496           HYPERVISOR_stack_switch(GSEL(GDATA_SEL, SEL_KPL), pcb->pcb_esp0);
497 
498           struct physdev_set_iopl set_iopl;
499           set_iopl.iopl = pcb->pcb_iopl;
500           HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl);
501 }
502 
503 void
i386_tls_switch(lwp_t * l)504 i386_tls_switch(lwp_t *l)
505 {
506           struct cpu_info *ci = curcpu();
507           struct pcb *pcb = lwp_getpcb(l);
508 
509           /*
510            * Raise the IPL to IPL_HIGH. XXX Still needed?
511            */
512           (void)splhigh();
513 
514           /* Update TLS segment pointers */
515           update_descriptor(&ci->ci_gdt[GUFS_SEL],
516               (union descriptor *)&pcb->pcb_fsd);
517           update_descriptor(&ci->ci_gdt[GUGS_SEL],
518               (union descriptor *)&pcb->pcb_gsd);
519 }
520 #endif /* XENPV */
521 
/* XXX */
/*
 * IDTVEC(name) builds the symbol name of a low-level entry point by
 * handing `X' and `name' to __CONCAT (e.g. IDTVEC(syscall) is presumably
 * Xsyscall -- __CONCAT itself is defined elsewhere).
 */
#define IDTVEC(name)          __CONCAT(X, name)
/* Function type of such an entry point: no arguments, no return value. */
typedef void (vector)(void);
525 
526 #ifndef XENPV
527 static void         tss_init(struct i386tss *, void *, void *);
528 
529 static void
tss_init(struct i386tss * tss,void * stack,void * func)530 tss_init(struct i386tss *tss, void *stack, void *func)
531 {
532           KASSERT(curcpu()->ci_pmap == pmap_kernel());
533 
534           memset(tss, 0, sizeof *tss);
535           tss->tss_esp0 = tss->tss_esp = (int)((char *)stack + USPACE - 16);
536           tss->tss_ss0 = GSEL(GDATA_SEL, SEL_KPL);
537           tss->__tss_cs = GSEL(GCODE_SEL, SEL_KPL);
538           tss->tss_fs = GSEL(GCPU_SEL, SEL_KPL);
539           tss->tss_gs = tss->__tss_es = tss->__tss_ds =
540               tss->__tss_ss = GSEL(GDATA_SEL, SEL_KPL);
541           /* %cr3 contains the value associated to pmap_kernel */
542           tss->tss_cr3 = rcr3();
543           tss->tss_esp = (int)((char *)stack + USPACE - 16);
544           tss->tss_ldt = GSEL(GLDT_SEL, SEL_KPL);
545           tss->__tss_eflags = PSL_MBO | PSL_NT;   /* XXX not needed? */
546           tss->__tss_eip = (int)func;
547 }
548 
/*
 * Entry points installed as task entry addresses by cpu_set_tss_gates():
 * the tss_trap08 handler for vector 8 and, on DDB+MULTIPROCESSOR kernels,
 * the two DDB IPI handlers (the x2apic variant is chosen when x2apic_mode
 * is set).  ddb_vec is the IDT vector used for the DDB IPI.
 */
extern vector IDTVEC(tss_trap08);
#if defined(DDB) && defined(MULTIPROCESSOR)
extern vector Xintr_ddbipi, Xintr_x2apic_ddbipi;
extern int ddb_vec;
#endif
554 
555 void
cpu_set_tss_gates(struct cpu_info * ci)556 cpu_set_tss_gates(struct cpu_info *ci)
557 {
558           struct segment_descriptor sd;
559           void *doubleflt_stack;
560           idt_descriptor_t *idt;
561 
562           doubleflt_stack = (void *)uvm_km_alloc(kernel_map, USPACE, 0,
563               UVM_KMF_WIRED);
564           tss_init(&ci->ci_tss->dblflt_tss, doubleflt_stack, IDTVEC(tss_trap08));
565 
566           setsegment(&sd, &ci->ci_tss->dblflt_tss, sizeof(struct i386tss) - 1,
567               SDT_SYS386TSS, SEL_KPL, 0, 0);
568           ci->ci_gdt[GTRAPTSS_SEL].sd = sd;
569 
570           idt = cpu_info_primary.ci_idtvec.iv_idt;
571           set_idtgate(&idt[8], NULL, 0, SDT_SYSTASKGT, SEL_KPL,
572               GSEL(GTRAPTSS_SEL, SEL_KPL));
573 
574 #if defined(DDB) && defined(MULTIPROCESSOR)
575           /*
576            * Set up separate handler for the DDB IPI, so that it doesn't
577            * stomp on a possibly corrupted stack.
578            *
579            * XXX overwriting the gate set in db_machine_init.
580            * Should rearrange the code so that it's set only once.
581            */
582           void *ddbipi_stack;
583 
584           ddbipi_stack = (void *)uvm_km_alloc(kernel_map, USPACE, 0,
585               UVM_KMF_WIRED);
586           tss_init(&ci->ci_tss->ddbipi_tss, ddbipi_stack,
587               x2apic_mode ? Xintr_x2apic_ddbipi : Xintr_ddbipi);
588 
589           setsegment(&sd, &ci->ci_tss->ddbipi_tss, sizeof(struct i386tss) - 1,
590               SDT_SYS386TSS, SEL_KPL, 0, 0);
591           ci->ci_gdt[GIPITSS_SEL].sd = sd;
592 
593           set_idtgate(&idt[ddb_vec], NULL, 0, SDT_SYSTASKGT, SEL_KPL,
594               GSEL(GIPITSS_SEL, SEL_KPL));
595 #endif
596 }
597 #endif /* XENPV */
598 
599 /*
600  * Set up TSS and I/O bitmap.
601  */
602 void
cpu_init_tss(struct cpu_info * ci)603 cpu_init_tss(struct cpu_info *ci)
604 {
605           struct cpu_tss *cputss;
606 
607           cputss = (struct cpu_tss *)uvm_km_alloc(kernel_map,
608               sizeof(struct cpu_tss), 0, UVM_KMF_WIRED|UVM_KMF_ZERO);
609 
610           cputss->tss.tss_iobase = IOMAP_INVALOFF << 16;
611 #ifndef XENPV
612           cputss->tss.tss_ss0 = GSEL(GDATA_SEL, SEL_KPL);
613           cputss->tss.tss_ldt = GSEL(GLDT_SEL, SEL_KPL);
614           cputss->tss.tss_cr3 = rcr3();
615 #endif
616 
617           ci->ci_tss = cputss;
618 #ifndef XENPV
619           ci->ci_tss_sel = tss_alloc(&cputss->tss);
620 #endif
621 }
622 
623 void *
getframe(struct lwp * l,int sig,int * onstack)624 getframe(struct lwp *l, int sig, int *onstack)
625 {
626           struct proc *p = l->l_proc;
627           struct trapframe *tf = l->l_md.md_regs;
628 
629           /* Do we need to jump onto the signal stack? */
630           *onstack = (l->l_sigstk.ss_flags & (SS_DISABLE | SS_ONSTACK)) == 0
631               && (SIGACTION(p, sig).sa_flags & SA_ONSTACK) != 0;
632           if (*onstack)
633                     return (char *)l->l_sigstk.ss_sp + l->l_sigstk.ss_size;
634           return (void *)tf->tf_esp;
635 }
636 
637 /*
638  * Build context to run handler in.  We invoke the handler
639  * directly, only returning via the trampoline.  Note the
640  * trampoline version numbers are coordinated with machine-
641  * dependent code in libc.
642  */
643 void
buildcontext(struct lwp * l,int sel,void * catcher,void * fp)644 buildcontext(struct lwp *l, int sel, void *catcher, void *fp)
645 {
646           struct trapframe *tf = l->l_md.md_regs;
647 
648           tf->tf_gs = GSEL(GUGS_SEL, SEL_UPL);
649           tf->tf_fs = GSEL(GUFS_SEL, SEL_UPL);
650           tf->tf_es = GSEL(GUDATA_SEL, SEL_UPL);
651           tf->tf_ds = GSEL(GUDATA_SEL, SEL_UPL);
652           tf->tf_eip = (int)catcher;
653           tf->tf_cs = GSEL(sel, SEL_UPL);
654           tf->tf_eflags &= ~PSL_CLEARSIG;
655           tf->tf_esp = (int)fp;
656           tf->tf_ss = GSEL(GUDATA_SEL, SEL_UPL);
657 
658           /* Ensure FP state is reset. */
659           fpu_sigreset(l);
660 }
661 
662 void
sendsig_siginfo(const ksiginfo_t * ksi,const sigset_t * mask)663 sendsig_siginfo(const ksiginfo_t *ksi, const sigset_t *mask)
664 {
665           struct lwp *l = curlwp;
666           struct proc *p = l->l_proc;
667           struct pmap *pmap = vm_map_pmap(&p->p_vmspace->vm_map);
668           int sel = pmap->pm_hiexec > I386_MAX_EXE_ADDR ?
669               GUCODEBIG_SEL : GUCODE_SEL;
670           struct sigacts *ps = p->p_sigacts;
671           int onstack, error;
672           int sig = ksi->ksi_signo;
673           struct sigframe_siginfo *fp = getframe(l, sig, &onstack), frame;
674           sig_t catcher = SIGACTION(p, sig).sa_handler;
675 
676           KASSERT(mutex_owned(p->p_lock));
677 
678           fp--;
679           fp = (struct sigframe_siginfo *)((uintptr_t)fp & ~STACK_ALIGNBYTES);
680 
681           memset(&frame, 0, sizeof(frame));
682           frame.sf_ra = (int)ps->sa_sigdesc[sig].sd_tramp;
683           frame.sf_signum = sig;
684           frame.sf_sip = &fp->sf_si;
685           frame.sf_ucp = &fp->sf_uc;
686           frame.sf_si._info = ksi->ksi_info;
687           frame.sf_uc.uc_flags = _UC_SIGMASK|_UC_VM;
688           frame.sf_uc.uc_sigmask = *mask;
689           frame.sf_uc.uc_link = l->l_ctxlink;
690           frame.sf_uc.uc_flags |= (l->l_sigstk.ss_flags & SS_ONSTACK)
691               ? _UC_SETSTACK : _UC_CLRSTACK;
692 
693           sendsig_reset(l, sig);
694 
695           mutex_exit(p->p_lock);
696           cpu_getmcontext(l, &frame.sf_uc.uc_mcontext, &frame.sf_uc.uc_flags);
697           error = copyout(&frame, fp, sizeof(frame));
698           mutex_enter(p->p_lock);
699 
700           if (error != 0) {
701                     /*
702                      * Process has trashed its stack; give it an illegal
703                      * instruction to halt it in its tracks.
704                      */
705                     sigexit(l, SIGILL);
706                     /* NOTREACHED */
707           }
708 
709           buildcontext(l, sel, catcher, fp);
710 
711           /* Remember that we're now on the signal stack. */
712           if (onstack)
713                     l->l_sigstk.ss_flags |= SS_ONSTACK;
714 }
715 
716 static void
maybe_dump(int howto)717 maybe_dump(int howto)
718 {
719           int s;
720 
721           /* Disable interrupts. */
722           s = splhigh();
723 
724           /* Do a dump if requested. */
725           if ((howto & (RB_DUMP | RB_HALT)) == RB_DUMP)
726                     dumpsys();
727 
728           splx(s);
729 }
730 
731 void
cpu_reboot(int howto,char * bootstr)732 cpu_reboot(int howto, char *bootstr)
733 {
734           static bool syncdone = false;
735           int s = IPL_NONE;
736 
737           if (cold) {
738                     howto |= RB_HALT;
739                     goto haltsys;
740           }
741 
742           boothowto = howto;
743 
744           /* XXX used to dump after vfs_shutdown() and before
745            * detaching devices / shutdown hooks / pmf_system_shutdown().
746            */
747           maybe_dump(howto);
748 
749           /*
750            * If we've panic'd, don't make the situation potentially
751            * worse by syncing or unmounting the file systems.
752            */
753           if ((howto & RB_NOSYNC) == 0 && panicstr == NULL) {
754                     if (!syncdone) {
755                               syncdone = true;
756                               /* XXX used to force unmount as well, here */
757                               vfs_sync_all(curlwp);
758                     }
759 
760                     while (vfs_unmountall1(curlwp, false, false) ||
761                            config_detach_all(boothowto) ||
762                            vfs_unmount_forceone(curlwp))
763                               ;         /* do nothing */
764           } else {
765                     if (!db_active)
766                               suspendsched();
767           }
768 
769           pmf_system_shutdown(boothowto);
770 
771           s = splhigh();
772 
773           /* amd64 maybe_dump() */
774 
775 haltsys:
776           doshutdownhooks();
777 
778           if ((howto & RB_POWERDOWN) == RB_POWERDOWN) {
779 #if NACPICA > 0
780                     if (s != IPL_NONE)
781                               splx(s);
782 
783                     acpi_enter_sleep_state(ACPI_STATE_S5);
784 #else
785                     __USE(s);
786 #endif
787 #ifdef XEN
788                     if (vm_guest == VM_GUEST_XENPV ||
789                         vm_guest == VM_GUEST_XENPVH ||
790                         vm_guest == VM_GUEST_XENPVHVM)
791                               HYPERVISOR_shutdown();
792 #endif /* XEN */
793           }
794 
795 #ifdef MULTIPROCESSOR
796           cpu_broadcast_halt();
797 #endif /* MULTIPROCESSOR */
798 
799           if (howto & RB_HALT) {
800 #if NACPICA > 0
801                     acpi_disable();
802 #endif
803 
804                     printf("\n");
805                     printf("The operating system has halted.\n");
806                     printf("Please press any key to reboot.\n\n");
807 
808 #ifdef BEEP_ONHALT
809                     {
810                               int c;
811                               for (c = BEEP_ONHALT_COUNT; c > 0; c--) {
812                                         sysbeep(BEEP_ONHALT_PITCH,
813                                                   BEEP_ONHALT_PERIOD * hz / 1000);
814                                         delay(BEEP_ONHALT_PERIOD * 1000);
815                                         sysbeep(0, BEEP_ONHALT_PERIOD * hz / 1000);
816                                         delay(BEEP_ONHALT_PERIOD * 1000);
817                               }
818                     }
819 #endif
820 
821                     cnpollc(1);         /* for proper keyboard command handling */
822                     if (cngetc() == 0) {
823                               /* no console attached, so just hlt */
824                               printf("No keyboard - cannot reboot after all.\n");
825                               for(;;) {
826                                         x86_hlt();
827                               }
828                     }
829                     cnpollc(0);
830           }
831 
832           printf("rebooting...\n");
833           if (cpureset_delay > 0)
834                     delay(cpureset_delay * 1000);
835           cpu_reset();
836           for(;;) ;
837           /*NOTREACHED*/
838 }
839 
840 /*
841  * Clear registers on exec
842  */
843 void
setregs(struct lwp * l,struct exec_package * pack,vaddr_t stack)844 setregs(struct lwp *l, struct exec_package *pack, vaddr_t stack)
845 {
846           struct pmap *pmap = vm_map_pmap(&l->l_proc->p_vmspace->vm_map);
847           struct pcb *pcb = lwp_getpcb(l);
848           struct trapframe *tf;
849 
850 #ifdef USER_LDT
851           pmap_ldt_cleanup(l);
852 #endif
853 
854           fpu_clear(l, pack->ep_osversion >= 699002600
855               ? __INITIAL_NPXCW__ : __NetBSD_COMPAT_NPXCW__);
856 
857           memcpy(&pcb->pcb_fsd, &gdtstore[GUDATA_SEL], sizeof(pcb->pcb_fsd));
858           memcpy(&pcb->pcb_gsd, &gdtstore[GUDATA_SEL], sizeof(pcb->pcb_gsd));
859 
860           x86_dbregs_clear(l);
861 
862           tf = l->l_md.md_regs;
863           memset(tf, 0, sizeof(*tf));
864 
865           tf->tf_trapno = T_ASTFLT;
866           tf->tf_gs = GSEL(GUGS_SEL, SEL_UPL);
867           tf->tf_fs = GSEL(GUFS_SEL, SEL_UPL);
868           tf->tf_es = LSEL(LUDATA_SEL, SEL_UPL);
869           tf->tf_ds = LSEL(LUDATA_SEL, SEL_UPL);
870           tf->tf_edi = 0;
871           tf->tf_esi = 0;
872           tf->tf_ebp = 0;
873           tf->tf_ebx = l->l_proc->p_psstrp;
874           tf->tf_edx = 0;
875           tf->tf_ecx = 0;
876           tf->tf_eax = 0;
877           tf->tf_eip = pack->ep_entry;
878           tf->tf_cs = pmap->pm_hiexec > I386_MAX_EXE_ADDR ?
879               LSEL(LUCODEBIG_SEL, SEL_UPL) : LSEL(LUCODE_SEL, SEL_UPL);
880           tf->tf_eflags = PSL_USERSET;
881           tf->tf_esp = stack;
882           tf->tf_ss = LSEL(LUDATA_SEL, SEL_UPL);
883 }
884 
/*
 * Initialize segments and descriptor tables
 */

/*
 * gdtstore is indexed by GDT selector index in this file (for example
 * gdtstore[GUDATA_SEL] is copied into a PCB's %fs/%gs descriptors).
 * NOTE(review): ldtstore and pentium_idt are not used in the part of the
 * file reviewed here; all three pointers are assigned elsewhere.
 */
union descriptor *gdtstore, *ldtstore;
union descriptor *pentium_idt;
/* NOTE(review): presumably the address of lwp0's uarea; defined elsewhere. */
extern vaddr_t lwp0uarea;
892 
893 void
setgate(struct gate_descriptor * gd,void * func,int args,int type,int dpl,int sel)894 setgate(struct gate_descriptor *gd, void *func, int args, int type, int dpl,
895     int sel)
896 {
897 
898           gd->gd_looffset = (int)func;
899           gd->gd_selector = sel;
900           gd->gd_stkcpy = args;
901           gd->gd_xx = 0;
902           gd->gd_type = type;
903           gd->gd_dpl = dpl;
904           gd->gd_p = 1;
905           gd->gd_hioffset = (int)func >> 16;
906 }
907 
908 void
unsetgate(struct gate_descriptor * gd)909 unsetgate(struct gate_descriptor *gd)
910 {
911 
912           gd->gd_p = 0;
913           gd->gd_hioffset = 0;
914           gd->gd_looffset = 0;
915           gd->gd_selector = 0;
916           gd->gd_xx = 0;
917           gd->gd_stkcpy = 0;
918           gd->gd_type = 0;
919           gd->gd_dpl = 0;
920 }
921 
922 void
setregion(struct region_descriptor * rd,void * base,size_t limit)923 setregion(struct region_descriptor *rd, void *base, size_t limit)
924 {
925 
926           rd->rd_limit = (int)limit;
927           rd->rd_base = (int)base;
928 }
929 
930 void
setsegment(struct segment_descriptor * sd,const void * base,size_t limit,int type,int dpl,int def32,int gran)931 setsegment(struct segment_descriptor *sd, const void *base, size_t limit,
932     int type, int dpl, int def32, int gran)
933 {
934 
935           sd->sd_lolimit = (int)limit;
936           sd->sd_lobase = (int)base;
937           sd->sd_type = type;
938           sd->sd_dpl = dpl;
939           sd->sd_p = 1;
940           sd->sd_hilimit = (int)limit >> 16;
941           sd->sd_xx = 0;
942           sd->sd_def32 = def32;
943           sd->sd_gran = gran;
944           sd->sd_hibase = (int)base >> 24;
945 }
946 
947 /* XXX */
/* Entry point that init386() installs as IDT vector 128 (system calls). */
948 extern vector IDTVEC(syscall);
/* Entry-point table; init386() installs elements 0-31 as IDT vectors 0-31. */
949 extern vector *IDTVEC(exceptions)[];
950 #ifdef XENPV
/* Boot GDT page that init386() remaps read-write once it is no longer used. */
951 extern union descriptor tmpgdt[];
952 #endif
953 
954 void
cpu_init_idt(struct cpu_info * ci)955 cpu_init_idt(struct cpu_info *ci)
956 {
957           struct region_descriptor region;
958           struct idt_vec *iv;
959           idt_descriptor_t *idt;
960 
961           iv = &ci->ci_idtvec;
962           idt = iv->iv_idt_pentium;
963           setregion(&region, idt, NIDT * sizeof(idt[0]) - 1);
964           lidt(&region);
965 }
966 
967 /*
968  * initgdt(tgdt)
969  *
970  *        Initialize a temporary Global Descriptor Table (GDT) using
971  *        storage space at tgdt.
972  *
973  *        1. Set up segment descriptors for our purposes, including a
974  *           CPU-local segment descriptor pointing at &cpu_info_primary.
975  *
976  *        2. Load the address into the Global Descriptor Table Register.
977  *
978  *        3. Set up segment selectors for all the segment registers using
979  *           it so that %fs-relative addressing works for the CPU-local
980  *           data.
981  *
982  *        After this point, CPUVAR(...), curcpu(), and curlwp will work.
983  *
984  *        Eventually the kernel will switch to a second temporary GDT
985  *        allocated with pmap_bootstrap_valloc in pmap_bootstrap, and
986  *        then to permanent GDT allocated with uvm_km(9) in gdt_init.
987  *        But the first temporary GDT is needed now to get us going with
988  *        early access to curcpu() and curlwp before we enter kernel
989  *        main.
990  *
991  *        XXX The purpose of each of the segment descriptors should be
992  *        written down somewhere in a single place that can be cross-
993  *        referenced.
994  *
995  *        References:
996  *
997  *        - Intel 64 and IA-32 Architectures Software Developer's Manual,
998  *          Volume 3: System Programming Guide, Order Number 325384,
999  *          April 2022, Sec. 3.5.1 `Segment Descriptor Tables',
1000  *          pp. 3-14 through 3-16.
1001  */
1002 void
initgdt(union descriptor * tgdt)1003 initgdt(union descriptor *tgdt)
1004 {
1005           KASSERT(tgdt != NULL);
1006 
1007           gdtstore = tgdt;
1008 #ifdef XENPV
1009           u_long    frames[16];
1010 #else
1011           struct region_descriptor region;
1012           memset(gdtstore, 0, NGDT * sizeof(*gdtstore));
1013 #endif
1014 
1015           /* make gdt gates and memory segments */
1016           setsegment(&gdtstore[GCODE_SEL].sd, 0, 0xfffff,
1017               SDT_MEMERA, SEL_KPL, 1, 1);
1018           setsegment(&gdtstore[GDATA_SEL].sd, 0, 0xfffff,
1019               SDT_MEMRWA, SEL_KPL, 1, 1);
1020           setsegment(&gdtstore[GUCODE_SEL].sd, 0, x86_btop(I386_MAX_EXE_ADDR) - 1,
1021               SDT_MEMERA, SEL_UPL, 1, 1);
1022           setsegment(&gdtstore[GUCODEBIG_SEL].sd, 0, 0xfffff,
1023               SDT_MEMERA, SEL_UPL, 1, 1);
1024           setsegment(&gdtstore[GUDATA_SEL].sd, 0, 0xfffff,
1025               SDT_MEMRWA, SEL_UPL, 1, 1);
1026 #if NBIOSCALL > 0 && !defined(XENPV)
1027           /* bios trampoline GDT entries */
1028           setsegment(&gdtstore[GBIOSCODE_SEL].sd, 0, 0xfffff,
1029               SDT_MEMERA, SEL_KPL, 0, 0);
1030           setsegment(&gdtstore[GBIOSDATA_SEL].sd, 0, 0xfffff,
1031               SDT_MEMRWA, SEL_KPL, 0, 0);
1032 #endif
1033           setsegment(&gdtstore[GCPU_SEL].sd, &cpu_info_primary,
1034               sizeof(struct cpu_info) - 1, SDT_MEMRWA, SEL_KPL, 1, 0);
1035 
1036 #ifndef XENPV
1037           setregion(&region, gdtstore, NGDT * sizeof(gdtstore[0]) - 1);
1038           lgdt(&region);
1039 #else /* !XENPV */
1040           /*
1041            * We jumpstart the bootstrap process a bit so we can update
1042            * page permissions. This is done redundantly later from
1043            * x86_xpmap.c:xen_locore() - harmless.
1044            */
1045           xpmap_phys_to_machine_mapping =
1046               (unsigned long *)xen_start_info.mfn_list;
1047 
1048           frames[0] = xpmap_ptom((uint32_t)gdtstore - KERNBASE) >> PAGE_SHIFT;
1049           {         /*
1050                      * Enter the gdt page RO into the kernel map. We can't
1051                      * use pmap_kenter_pa() here, because %fs is not
1052                      * usable until the gdt is loaded, and %fs is used as
1053                      * the base pointer for curcpu() and curlwp(), both of
1054                      * which are in the callpath of pmap_kenter_pa().
1055                      * So we mash up our own - this is MD code anyway.
1056                      */
1057                     extern pt_entry_t xpmap_pg_nx;
1058                     pt_entry_t pte;
1059 
1060                     pte = pmap_pa2pte((vaddr_t)gdtstore - KERNBASE);
1061                     pte |= xpmap_pg_nx | PTE_P;
1062 
1063                     if (HYPERVISOR_update_va_mapping((vaddr_t)gdtstore, pte,
1064                         UVMF_INVLPG) < 0) {
1065                               panic("gdt page RO update failed.\n");
1066                     }
1067           }
1068 
1069           if (HYPERVISOR_set_gdt(frames, NGDT /* XXX is it right ? */))
1070                     panic("HYPERVISOR_set_gdt failed!\n");
1071 
1072           lgdt_finish();
1073 #endif /* !XENPV */
1074 }
1075 
1076 #if !defined(XENPV)  && NBIOSCALL > 0
1077 static void
init386_pte0(void)1078 init386_pte0(void)
1079 {
1080           paddr_t paddr;
1081           vaddr_t vaddr;
1082 
1083           paddr = 4 * PAGE_SIZE;
1084           vaddr = (vaddr_t)vtopte(0);
1085           pmap_kenter_pa(vaddr, paddr, VM_PROT_ALL, 0);
1086           pmap_update(pmap_kernel());
1087           /* make sure it is clean before using */
1088           memset((void *)vaddr, 0, PAGE_SIZE);
1089 }
1090 #endif /* !XENPV && NBIOSCALL > 0 */
1091 
1092 #ifndef XENPV
/*
 * init386_ksyms()
 *
 *	Hand the kernel symbol table to ksyms.  Sources are tried in
 *	order: the multiboot helpers (MULTIBOOT), the table placed after
 *	`end' for a PVH boot that is not a Xen PVH guest (XEN), the
 *	table placed after `end' when the boot loader passed no
 *	BTINFO_SYMTAB record, and finally that record itself.
 *
 *	Does nothing unless NKSYMS, DDB or MODULAR is configured.
 */
static void
init386_ksyms(void)
{
#if NKSYMS || defined(DDB) || defined(MODULAR)
	extern int end;
	int *const syms_after_end = ((int *)&end) + 1;
	struct btinfo_symtab *symtab;

#ifdef DDB
	db_machine_init();
#endif

#if defined(MULTIBOOT)
	/* The second helper is only consulted when the first finds nothing. */
	if (multiboot1_ksyms_addsyms_elf() || multiboot2_ksyms_addsyms_elf())
		return;
#endif

#ifdef XEN
	if (pvh_boot && vm_guest != VM_GUEST_XENPVH) {
		ksyms_addsyms_elf(0, syms_after_end, esym);
		return;
	}
#endif

	symtab = lookup_bootinfo(BTINFO_SYMTAB);
	if (symtab == NULL) {
		/* The word stored at `end' is passed as the first argument. */
		ksyms_addsyms_elf(*(int *)&end, syms_after_end, esym);
		return;
	}

	/* Rebase the boot loader's addresses by KERNBASE before use. */
	symtab->ssym += KERNBASE;
	symtab->esym += KERNBASE;
	ksyms_addsyms_elf(symtab->nsym, (int *)symtab->ssym, (int *)symtab->esym);
#endif
}
1129 #endif /* XENPV */
1130 
1131 void
init_bootspace(void)1132 init_bootspace(void)
1133 {
1134           extern char __rodata_start;
1135           extern char __data_start;
1136           extern char __kernel_end;
1137           size_t i = 0;
1138 
1139           memset(&bootspace, 0, sizeof(bootspace));
1140 
1141           bootspace.head.va = KERNTEXTOFF;
1142           bootspace.head.pa = KERNTEXTOFF - KERNBASE;
1143           bootspace.head.sz = 0;
1144 
1145           bootspace.segs[i].type = BTSEG_TEXT;
1146           bootspace.segs[i].va = KERNTEXTOFF;
1147           bootspace.segs[i].pa = KERNTEXTOFF - KERNBASE;
1148           bootspace.segs[i].sz = (size_t)&__rodata_start - KERNTEXTOFF;
1149           i++;
1150 
1151           bootspace.segs[i].type = BTSEG_RODATA;
1152           bootspace.segs[i].va = (vaddr_t)&__rodata_start;
1153           bootspace.segs[i].pa = (paddr_t)(vaddr_t)&__rodata_start - KERNBASE;
1154           bootspace.segs[i].sz = (size_t)&__data_start - (size_t)&__rodata_start;
1155           i++;
1156 
1157           bootspace.segs[i].type = BTSEG_DATA;
1158           bootspace.segs[i].va = (vaddr_t)&__data_start;
1159           bootspace.segs[i].pa = (paddr_t)(vaddr_t)&__data_start - KERNBASE;
1160           bootspace.segs[i].sz = (size_t)&__kernel_end - (size_t)&__data_start;
1161           i++;
1162 
1163           bootspace.boot.va = (vaddr_t)&__kernel_end;
1164           bootspace.boot.pa = (paddr_t)(vaddr_t)&__kernel_end - KERNBASE;
1165           bootspace.boot.sz = (size_t)(atdevbase + IOM_SIZE) -
1166               (size_t)&__kernel_end;
1167 
1168           /* Virtual address of the top level page */
1169           bootspace.pdir = (vaddr_t)(PDPpaddr + KERNBASE);
1170 }
1171 
/*
 * init386(first_avail)
 *
 *        Machine-dependent early bootstrap of the boot processor.
 *        first_avail is a page-aligned physical address (asserted
 *        below) that becomes avail_start.
 *
 *        In order, as visible in the body: attach lwp0's uarea, probe
 *        the CPU and initialize its MSRs, bring up the console, call
 *        pmap_bootstrap(), seed the RNG, load physical memory into
 *        UVM, map and zero the IDT/GDT/LDT pages, install the
 *        exception and syscall gates, load the new descriptor tables,
 *        load kernel symbols, set up interrupt handling, raise the SPL
 *        to IPL_HIGH and call x86_enable_intr().
 *
 *        The statement order matters throughout; see the constraints
 *        spelled out in the comments below before moving anything.
 */
1172 void
init386(paddr_t first_avail)1173 init386(paddr_t first_avail)
1174 {
1175           extern void consinit(void);
1176           int x;
1177 #ifndef XENPV
1178           extern paddr_t local_apic_pa;
1179           union descriptor *tgdt;
1180           struct region_descriptor region;
1181 #if NBIOSCALL > 0
1182           extern int biostramp_image_size;
1183           extern u_char biostramp_image[];
1184 #endif
1185 #endif /* !XENPV */
1186           struct pcb *pcb;
1187           struct idt_vec *iv;
1188           idt_descriptor_t *idt;
1189 
1190           KASSERT(first_avail % PAGE_SIZE == 0);
1191 
1192 #ifdef XENPV
1193           KASSERT(HYPERVISOR_shared_info != NULL);
1194           cpu_info_primary.ci_vcpu = &HYPERVISOR_shared_info->vcpu_info[0];
1195 #endif
1196 
1197 #ifdef XEN
1198           if (pvh_boot)
1199                     xen_parse_cmdline(XEN_PARSE_BOOTFLAGS, NULL);
1200 #endif
1201 
1202           uvm_lwp_setuarea(&lwp0, lwp0uarea);
1203 
1204           cpu_probe(&cpu_info_primary);
1205 
1206           /*
1207            * Initialize the no-execute bit on cpu0, if supported.
1208            *
1209            * Note: The call to cpu_init_msrs for secondary CPUs happens
1210            * in cpu_hatch.
1211            */
1212           cpu_init_msrs(&cpu_info_primary, true);
1213 
1214 #ifndef XENPV
1215           cpu_speculation_init(&cpu_info_primary);
1216 #endif
1217 
               /* Record whether this kernel was built with PAE. */
1218 #ifdef PAE
1219           use_pae = 1;
1220 #else
1221           use_pae = 0;
1222 #endif
1223 
1224           pcb = lwp_getpcb(&lwp0);
1225 #ifdef XENPV
1226           pcb->pcb_cr3 = PDPpaddr;
1227 #endif
1228 
1229 #if defined(PAE) && !defined(XENPV)
1230           /*
1231            * Save VA and PA of L3 PD of boot processor (for Xen, this is done
1232            * in xen_locore())
1233            */
1234           cpu_info_primary.ci_pae_l3_pdirpa = rcr3();
1235           cpu_info_primary.ci_pae_l3_pdir = (pd_entry_t *)(rcr3() + KERNBASE);
1236 #endif
1237 
1238           uvm_md_init();
1239 
1240           /*
1241            * Start with 2 color bins -- this is just a guess to get us
1242            * started.  We'll recolor when we determine the largest cache
1243            * sizes on the system.
1244            */
1245           uvmexp.ncolors = 2;
1246 
1247           avail_start = first_avail;
1248 
1249 #ifndef XENPV
1250           /*
1251            * Low memory reservations:
1252            * Page 0:          BIOS data
1253            * Page 1:          BIOS callback
1254            * Page 2:          MP bootstrap code (MP_TRAMPOLINE)
1255            * Page 3:          ACPI wakeup code (ACPI_WAKEUP_ADDR)
1256            * Page 4:          Temporary page table for 0MB-4MB
1257            * Page 5:          Temporary page directory
1258            */
1259           lowmem_rsvd = 6 * PAGE_SIZE;
1260 #else /* XENPV */
1261           /* Parse Xen command line (replace bootinfo) */
1262           xen_parse_cmdline(XEN_PARSE_BOOTFLAGS, NULL);
1263 
1264           /* Use the dummy page as a gdt */
1265           extern vaddr_t xen_dummy_page;
1266           gdtstore = (void *)xen_dummy_page;
1267 
1268           /* Determine physical address space */
1269           avail_end = ctob((paddr_t)xen_start_info.nr_pages);
1270           pmap_pa_start = (KERNTEXTOFF - KERNBASE);
1271           pmap_pa_end = pmap_pa_start + ctob((paddr_t)xen_start_info.nr_pages);
1272           mem_clusters[0].start = avail_start;
1273           mem_clusters[0].size = avail_end - avail_start;
1274           mem_cluster_cnt++;
1275           physmem += xen_start_info.nr_pages;
1276           uvmexp.wired += atop(avail_start);
1277 
1278           /*
1279            * initgdt() has to be done before consinit(), so that %fs is properly
1280            * initialised. initgdt() uses pmap_kenter_pa so it can't be called
1281            * before the above variables are set.
                *
                * NOTE(review): initgdt() as defined in this file calls
                * pmap_pa2pte() and xpmap_ptom() rather than pmap_kenter_pa(),
                * and its own comment says pmap_kenter_pa() cannot be used
                * there; confirm which of the variables above it depends on.
1282            */
1283           initgdt(gdtstore);
1284 
1285           mutex_init(&pte_lock, MUTEX_DEFAULT, IPL_VM);
1286 #endif /* XENPV */
1287 
1288 #if NISA > 0 || NPCI > 0
1289           x86_bus_space_init();
1290 #endif
1291 
1292           consinit();         /* XXX SHOULD NOT BE DONE HERE */
1293 
1294 #ifdef DEBUG_MEMLOAD
1295           printf("mem_cluster_count: %d\n", mem_cluster_cnt);
1296 #endif
1297 
1298           /*
1299            * Call pmap initialization to make new kernel address space.
1300            * We must do this before loading pages into the VM system.
1301            */
1302           pmap_bootstrap((vaddr_t)atdevbase + IOM_SIZE);
1303 
1304           /*
1305            * Initialize RNG to get entropy ASAP either from CPU
1306            * RDRAND/RDSEED or from seed on disk.  Constraints:
1307            *
1308            * - Must happen after cpu_init_msrs so that curcpu() and
1309            *   curlwp work.
1310            *
1311            * - Must happen after consinit so we have the opportunity to
1312            *   print useful feedback.
1313            *
1314            * - On KASLR kernels, must happen after pmap_bootstrap because
1315            *   x86_rndseed requires access to the direct map.
1316            */
1317           cpu_rng_init();
1318           x86_rndseed();
1319 
1320 #ifndef XENPV
1321           /* Initialize the memory clusters. */
1322           init_x86_clusters();
1323 
1324           /* Internalize the physical pages into the VM system. */
1325           init_x86_vm(avail_start);
1326 #else /* XENPV */
               /* One physical segment, avail_start..avail_end, all of it free. */
1327           uvm_page_physload(atop(avail_start), atop(avail_end),
1328               atop(avail_start), atop(avail_end),
1329               VM_FREELIST_DEFAULT);
1330 
1331           /* Reclaim the boot gdt page - see locore.s */
1332           {
1333                     extern pt_entry_t xpmap_pg_nx;
1334                     pt_entry_t pte;
1335 
1336                     pte = pmap_pa2pte((vaddr_t)tmpgdt - KERNBASE);
1337                     pte |= PTE_W | xpmap_pg_nx | PTE_P;
1338 
1339                     if (HYPERVISOR_update_va_mapping((vaddr_t)tmpgdt, pte, UVMF_INVLPG) < 0) {
1340                               panic("tmpgdt page relaim RW update failed.\n");
1341                     }
1342           }
1343 #endif /* !XENPV */
1344 
1345           init_x86_msgbuf();
1346 
1347 #if !defined(XENPV) && NBIOSCALL > 0
1348           /*
1349            * XXX Remove this
1350            *
1351            * Setup a temporary Page Table Entry to allow identity mappings of
1352            * the real mode address. This is required by bioscall.
1353            */
1354           init386_pte0();
1355 
               /* Identity-map the trampoline page and copy the image into it. */
1356           KASSERT(biostramp_image_size <= PAGE_SIZE);
1357           pmap_kenter_pa((vaddr_t)BIOSTRAMP_BASE, (paddr_t)BIOSTRAMP_BASE,
1358               VM_PROT_ALL, 0);
1359           pmap_update(pmap_kernel());
1360           memcpy((void *)BIOSTRAMP_BASE, biostramp_image, biostramp_image_size);
1361 
1362           /* Needed early, for bioscall() */
1363           cpu_info_primary.ci_pmap = pmap_kernel();
1364 #endif
1365 
1366 #ifndef XENPV
               /* Map the local APIC page read-write and zero it. */
1367           pmap_kenter_pa(local_apic_va, local_apic_pa,
1368               VM_PROT_READ|VM_PROT_WRITE, 0);
1369           pmap_update(pmap_kernel());
1370           memset((void *)local_apic_va, 0, PAGE_SIZE);
1371 #endif
1372 
               /* Map the IDT, GDT and LDT pages read-write and zero them. */
1373           pmap_kenter_pa(idt_vaddr, idt_paddr, VM_PROT_READ|VM_PROT_WRITE, 0);
1374           pmap_kenter_pa(gdt_vaddr, gdt_paddr, VM_PROT_READ|VM_PROT_WRITE, 0);
1375           pmap_kenter_pa(ldt_vaddr, ldt_paddr, VM_PROT_READ|VM_PROT_WRITE, 0);
1376           pmap_update(pmap_kernel());
1377           memset((void *)idt_vaddr, 0, PAGE_SIZE);
1378           memset((void *)gdt_vaddr, 0, PAGE_SIZE);
1379           memset((void *)ldt_vaddr, 0, PAGE_SIZE);
1380 
               /* Second, read-only mapping of the same IDT page. */
1381           pmap_kenter_pa(pentium_idt_vaddr, idt_paddr, VM_PROT_READ, 0);
1382           pmap_update(pmap_kernel());
1383           iv = &(cpu_info_primary.ci_idtvec);
1384           idt_vec_init_cpu_md(iv, cpu_index(&cpu_info_primary));
1385           idt = (idt_descriptor_t *)iv->iv_idt;
1386 
1387 #ifndef XENPV
1388           /*
1389            * Switch from the initial temporary GDT that was allocated on
1390            * the stack by our caller, start.  That temporary GDT will be
1391            * popped off the stack when init386 returns before start calls
1392            * main, so we need to use a second temporary GDT allocated in
1393            * pmap_bootstrap with pmap_bootstrap_valloc/palloc to make
1394            * sure at least the CPU-local data area, used by CPUVAR(...),
1395            * curcpu(), and curlwp via %fs-relative addressing, will
1396            * continue to work.
1397            *
1398            * Later, in gdt_init via cpu_startup, we will finally allocate
1399            * a permanent GDT with uvm_km(9).
1400            *
1401            * The content of the second temporary GDT is the same as the
1402            * content of the initial GDT, initialized in initgdt, except
1403            * for the LDT descriptor (GLDT_SEL), which is set here to
1404            * describe the new temporary LDT at ldt_vaddr.
1405            */
1406           tgdt = gdtstore;
1407           gdtstore = (union descriptor *)gdt_vaddr;
1408           ldtstore = (union descriptor *)ldt_vaddr;
1409 
1410           memcpy(gdtstore, tgdt, NGDT * sizeof(*gdtstore));
1411 
1412           setsegment(&gdtstore[GLDT_SEL].sd, ldtstore,
1413               NLDT * sizeof(ldtstore[0]) - 1, SDT_SYSLDT, SEL_KPL, 0, 0);
1414 #else
1415           HYPERVISOR_set_callbacks(
1416               GSEL(GCODE_SEL, SEL_KPL), (unsigned long)hypervisor_callback,
1417               GSEL(GCODE_SEL, SEL_KPL), (unsigned long)failsafe_callback);
1418 
1419           ldtstore = (union descriptor *)ldt_vaddr;
1420 #endif /* XENPV */
1421 
1422           /* make ldt gates and memory segments */
1423           ldtstore[LUCODE_SEL] = gdtstore[GUCODE_SEL];
1424           ldtstore[LUCODEBIG_SEL] = gdtstore[GUCODEBIG_SEL];
1425           ldtstore[LUDATA_SEL] = gdtstore[GUDATA_SEL];
1426 
1427           /* exceptions */
1428           for (x = 0; x < 32; x++) {
1429                     /* Reset to default. Special cases below */
1430                     int sel;
1431 #ifdef XENPV
1432                     sel = SEL_XEN;
1433 #else
1434                     sel = SEL_KPL;
1435 #endif /* XENPV */
1436 
1437                     idt_vec_reserve(iv, x);
1438 
1439                     switch (x) {
1440 #ifdef XENPV
1441                     case 2:  /* NMI */
1442                     case 18: /* MCA */
1443                               sel |= 0x4; /* Auto EOI/mask */
1444                               break;
1445 #endif /* XENPV */
                         /*
                          * Vectors 3 and 4 get SEL_UPL instead of the default;
                          * presumably so user mode may raise them directly --
                          * confirm against set_idtgate().
                          */
1446                     case 3:
1447                     case 4:
1448                               sel = SEL_UPL;
1449                               break;
1450                     default:
1451                               break;
1452                     }
1453                     set_idtgate(&idt[x], IDTVEC(exceptions)[x], 0, SDT_SYS386IGT,
1454                         sel, GSEL(GCODE_SEL, SEL_KPL));
1455           }
1456 
1457           /* new-style interrupt gate for syscalls */
1458           idt_vec_reserve(iv, 128);
1459           set_idtgate(&idt[128], &IDTVEC(syscall), 0, SDT_SYS386IGT, SEL_UPL,
1460               GSEL(GCODE_SEL, SEL_KPL));
1461 
1462 #ifndef XENPV
1463           /*
1464            * Activate the second temporary GDT, allocated in
1465            * pmap_bootstrap with pmap_bootstrap_valloc/palloc, and
1466            * initialized with the content of the initial temporary GDT in
1467            * initgdt, plus an updated LDT.
1468            *
1469            * This ensures the %fs-relative addressing for the CPU-local
1470            * area used by CPUVAR(...), curcpu(), and curlwp will continue
1471            * to work after init386 returns and the initial temporary GDT
1472            * is popped off, before we call main and later create a
1473            * permanent GDT in gdt_init via cpu_startup.
1474            */
1475           setregion(&region, gdtstore, NGDT * sizeof(gdtstore[0]) - 1);
1476           lgdt(&region);
1477 #endif
1478 
               /* Load the LDT selector, then the IDT built above. */
1479           lldt(GSEL(GLDT_SEL, SEL_KPL));
1480           cpu_init_idt(&cpu_info_primary);
1481 
1482 #ifdef XENPV
1483           xen_init_ksyms();
1484 #else /* !XENPV */
1485 #ifdef XEN
1486           if (vm_guest == VM_GUEST_XENPVH)
1487                     xen_init_ksyms();
1488           else
1489 #endif /* XEN */
1490                     init386_ksyms();
1491 #endif /* XENPV */
1492 
1493 #if NMCA > 0
1494           /*
1495            * check for MCA bus, needed to be done before ISA stuff - if
1496            * MCA is detected, ISA needs to use level triggered interrupts
1497            * by default
1498            * And we do not search for MCA using bioscall() on EFI systems
1499            * that lacks it (they lack MCA too, anyway).
1500            */
1501           if (lookup_bootinfo(BTINFO_EFI) == NULL && vm_guest != VM_GUEST_XENPVH)
1502                     mca_busprobe();
1503 #endif
1504 
1505 #ifdef XENPV
1506           extern int tmpstk;
1507           cpu_info_primary.ci_intrstack = &tmpstk;
1508           events_default_setup();
1509 #else
1510           intr_default_setup();
1511 #endif
1512 
               /* Block everything at IPL_HIGH before turning interrupts on. */
1513           splraise(IPL_HIGH);
1514           x86_enable_intr();
1515 
1516 #ifdef DDB
1517           if (boothowto & RB_KDB)
1518                     Debugger();
1519 #endif
1520 #ifdef KGDB
1521           kgdb_port_init();
1522           if (boothowto & RB_KDB) {
1523                     kgdb_debug_init = 1;
1524                     kgdb_connect(1);
1525           }
1526 #endif
1527 
               /* With less than 2MB of memory, warn and wait for a key press. */
1528           if (physmem < btoc(2 * 1024 * 1024)) {
1529                     printf("warning: too little memory available; "
1530                            "have %lu bytes, want %lu bytes\n"
1531                            "running in degraded mode\n"
1532                            "press a key to confirm\n\n",
1533                            (unsigned long)ptoa(physmem), 2*1024*1024UL);
1534                     cngetc();
1535           }
1536 
1537           pcb->pcb_dbregs = NULL;
1538           x86_dbregs_init();
1539 }
1540 
1541 #include <dev/ic/mc146818reg.h>                   /* for NVRAM POST */
1542 #include <i386/isa/nvram.h>             /* for NVRAM POST */
1543 
1544 void
cpu_reset(void)1545 cpu_reset(void)
1546 {
1547 #ifdef XENPV
1548           HYPERVISOR_reboot();
1549           for (;;);
1550 #else /* XENPV */
1551           struct region_descriptor region;
1552           idt_descriptor_t *idt;
1553 
1554           idt = (idt_descriptor_t *)cpu_info_primary.ci_idtvec.iv_idt;
1555           x86_disable_intr();
1556 
1557           /*
1558            * Ensure the NVRAM reset byte contains something vaguely sane.
1559            */
1560 
1561           outb(IO_RTC, NVRAM_RESET);
1562           outb(IO_RTC+1, NVRAM_RESET_RST);
1563 
1564           /*
1565            * Reset AMD Geode SC1100.
1566            *
1567            * 1) Write PCI Configuration Address Register (0xcf8) to
1568            *    select Function 0, Register 0x44: Bridge Configuration,
1569            *    GPIO and LPC Configuration Register Space, Reset
1570            *    Control Register.
1571            *
1572            * 2) Write 0xf to PCI Configuration Data Register (0xcfc)
1573            *    to reset IDE controller, IDE bus, and PCI bus, and
1574            *    to trigger a system-wide reset.
1575            *
1576            * See AMD Geode SC1100 Processor Data Book, Revision 2.0,
1577            * sections 6.3.1, 6.3.2, and 6.4.1.
1578            */
1579           if (cpu_info_primary.ci_signature == 0x540) {
1580                     outl(0xcf8, 0x80009044);
1581                     outl(0xcfc, 0xf);
1582           }
1583 
1584           x86_reset();
1585 
1586           /*
1587            * Try to cause a triple fault and watchdog reset by making the IDT
1588            * invalid and causing a fault.
1589            */
1590           memset((void *)idt, 0, NIDT * sizeof(idt[0]));
1591           setregion(&region, idt, NIDT * sizeof(idt[0]) - 1);
1592           lidt(&region);
1593           breakpoint();
1594 
1595 #if 0
1596           /*
1597            * Try to cause a triple fault and watchdog reset by unmapping the
1598            * entire address space and doing a TLB flush.
1599            */
1600           memset((void *)PTD, 0, PAGE_SIZE);
1601           tlbflush();
1602 #endif
1603 
1604           for (;;);
1605 #endif /* XENPV */
1606 }
1607 
1608 void
cpu_getmcontext(struct lwp * l,mcontext_t * mcp,unsigned int * flags)1609 cpu_getmcontext(struct lwp *l, mcontext_t *mcp, unsigned int *flags)
1610 {
1611           const struct trapframe *tf = l->l_md.md_regs;
1612           __greg_t *gr = mcp->__gregs;
1613           __greg_t ras_eip;
1614 
1615           /* Save register context. */
1616           gr[_REG_GS]  = tf->tf_gs;
1617           gr[_REG_FS]  = tf->tf_fs;
1618           gr[_REG_ES]  = tf->tf_es;
1619           gr[_REG_DS]  = tf->tf_ds;
1620           gr[_REG_EFL] = tf->tf_eflags;
1621 
1622           gr[_REG_EDI]    = tf->tf_edi;
1623           gr[_REG_ESI]    = tf->tf_esi;
1624           gr[_REG_EBP]    = tf->tf_ebp;
1625           gr[_REG_EBX]    = tf->tf_ebx;
1626           gr[_REG_EDX]    = tf->tf_edx;
1627           gr[_REG_ECX]    = tf->tf_ecx;
1628           gr[_REG_EAX]    = tf->tf_eax;
1629           gr[_REG_EIP]    = tf->tf_eip;
1630           gr[_REG_CS]     = tf->tf_cs;
1631           gr[_REG_ESP]    = tf->tf_esp;
1632           gr[_REG_UESP]   = tf->tf_esp;
1633           gr[_REG_SS]     = tf->tf_ss;
1634           gr[_REG_TRAPNO] = tf->tf_trapno;
1635           gr[_REG_ERR]    = tf->tf_err;
1636 
1637           if ((ras_eip = (__greg_t)ras_lookup(l->l_proc,
1638               (void *) gr[_REG_EIP])) != -1)
1639                     gr[_REG_EIP] = ras_eip;
1640 
1641           *flags |= _UC_CPU;
1642 
1643           mcp->_mc_tlsbase = (uintptr_t)l->l_private;
1644           *flags |= _UC_TLSBASE;
1645 
1646           /*
1647            * Save floating point register context.
1648            *
1649            * If the cpu doesn't support fxsave we must still write to
1650            * the entire 512 byte area - otherwise we leak kernel memory
1651            * contents to userspace.
1652            * It wouldn't matter if we were doing the copyout here.
1653            * So we might as well convert to fxsave format.
1654            */
1655           __CTASSERT(sizeof (struct fxsave) ==
1656               sizeof mcp->__fpregs.__fp_reg_set.__fp_xmm_state);
1657           process_read_fpregs_xmm(l, (struct fxsave *)
1658               &mcp->__fpregs.__fp_reg_set.__fp_xmm_state);
1659           memset(&mcp->__fpregs.__fp_pad, 0, sizeof mcp->__fpregs.__fp_pad);
1660           *flags |= _UC_FXSAVE | _UC_FPU;
1661 }
1662 
1663 int
cpu_mcontext_validate(struct lwp * l,const mcontext_t * mcp)1664 cpu_mcontext_validate(struct lwp *l, const mcontext_t *mcp)
1665 {
1666           const __greg_t *gr = mcp->__gregs;
1667           struct trapframe *tf = l->l_md.md_regs;
1668 
1669           /*
1670            * Check for security violations.  If we're returning
1671            * to protected mode, the CPU will validate the segment
1672            * registers automatically and generate a trap on
1673            * violations.  We handle the trap, rather than doing
1674            * all of the checking here.
1675            */
1676           if (((gr[_REG_EFL] ^ tf->tf_eflags) & PSL_USERSTATIC) ||
1677               !USERMODE(gr[_REG_CS]))
1678                     return EINVAL;
1679 
1680           return 0;
1681 }
1682 
1683 int
cpu_setmcontext(struct lwp * l,const mcontext_t * mcp,unsigned int flags)1684 cpu_setmcontext(struct lwp *l, const mcontext_t *mcp, unsigned int flags)
1685 {
1686           struct trapframe *tf = l->l_md.md_regs;
1687           const __greg_t *gr = mcp->__gregs;
1688           struct proc *p = l->l_proc;
1689           int error;
1690 
1691           /* Restore register context, if any. */
1692           if ((flags & _UC_CPU) != 0) {
1693                     error = cpu_mcontext_validate(l, mcp);
1694                     if (error)
1695                               return error;
1696 
1697                     tf->tf_gs = gr[_REG_GS];
1698                     tf->tf_fs = gr[_REG_FS];
1699                     tf->tf_es = gr[_REG_ES];
1700                     tf->tf_ds = gr[_REG_DS];
1701                     /* Only change the user-alterable part of eflags */
1702                     tf->tf_eflags &= ~PSL_USER;
1703                     tf->tf_eflags |= (gr[_REG_EFL] & PSL_USER);
1704 
1705                     tf->tf_edi    = gr[_REG_EDI];
1706                     tf->tf_esi    = gr[_REG_ESI];
1707                     tf->tf_ebp    = gr[_REG_EBP];
1708                     tf->tf_ebx    = gr[_REG_EBX];
1709                     tf->tf_edx    = gr[_REG_EDX];
1710                     tf->tf_ecx    = gr[_REG_ECX];
1711                     tf->tf_eax    = gr[_REG_EAX];
1712                     tf->tf_eip    = gr[_REG_EIP];
1713                     tf->tf_cs     = gr[_REG_CS];
1714                     tf->tf_esp    = gr[_REG_UESP];
1715                     tf->tf_ss     = gr[_REG_SS];
1716           }
1717 
1718           if ((flags & _UC_TLSBASE) != 0)
1719                     lwp_setprivate(l, (void *)(uintptr_t)mcp->_mc_tlsbase);
1720 
1721           /* Restore floating point register context, if given. */
1722           if ((flags & _UC_FPU) != 0) {
1723                     __CTASSERT(sizeof (struct fxsave) ==
1724                         sizeof mcp->__fpregs.__fp_reg_set.__fp_xmm_state);
1725                     __CTASSERT(sizeof (struct save87) ==
1726                         sizeof mcp->__fpregs.__fp_reg_set.__fpchip_state);
1727 
1728                     if (flags & _UC_FXSAVE) {
1729                               process_write_fpregs_xmm(l, (const struct fxsave *)
1730                                             &mcp->__fpregs.__fp_reg_set.__fp_xmm_state);
1731                     } else {
1732                               process_write_fpregs_s87(l, (const struct save87 *)
1733                                             &mcp->__fpregs.__fp_reg_set.__fpchip_state);
1734                     }
1735           }
1736 
1737           mutex_enter(p->p_lock);
1738           if (flags & _UC_SETSTACK)
1739                     l->l_sigstk.ss_flags |= SS_ONSTACK;
1740           if (flags & _UC_CLRSTACK)
1741                     l->l_sigstk.ss_flags &= ~SS_ONSTACK;
1742           mutex_exit(p->p_lock);
1743           return (0);
1744 }
1745 
1746 #define   DEV_IO 14           /* iopl for compat_10 */
1747 
1748 int
mm_md_open(dev_t dev,int flag,int mode,struct lwp * l)1749 mm_md_open(dev_t dev, int flag, int mode, struct lwp *l)
1750 {
1751 
1752           switch (minor(dev)) {
1753           case DEV_IO:
1754                     /*
1755                      * This is done by i386_iopl(3) now.
1756                      *
1757                      * #if defined(COMPAT_10) || defined(COMPAT_FREEBSD)
1758                      */
1759                     if (flag & FWRITE) {
1760                               struct trapframe *fp;
1761                               int error;
1762 
1763                               error = kauth_authorize_machdep(l->l_cred,
1764                                   KAUTH_MACHDEP_IOPL, NULL, NULL, NULL, NULL);
1765                               if (error)
1766                                         return (error);
1767                               fp = curlwp->l_md.md_regs;
1768                               fp->tf_eflags |= PSL_IOPL;
1769                     }
1770                     break;
1771           default:
1772                     break;
1773           }
1774           return 0;
1775 }
1776 
1777 #ifdef PAE
1778 void
cpu_alloc_l3_page(struct cpu_info * ci)1779 cpu_alloc_l3_page(struct cpu_info *ci)
1780 {
1781           int ret;
1782           struct pglist pg;
1783           struct vm_page *vmap;
1784 
1785           KASSERT(ci != NULL);
1786           /*
1787            * Allocate a page for the per-CPU L3 PD. cr3 being 32 bits, PA musts
1788            * resides below the 4GB boundary.
1789            */
1790           ret = uvm_pglistalloc(PAGE_SIZE, 0, 0x100000000ULL, 32, 0, &pg, 1, 0);
1791           vmap = TAILQ_FIRST(&pg);
1792 
1793           if (ret != 0 || vmap == NULL)
1794                     panic("%s: failed to allocate L3 pglist for CPU %d (ret %d)\n",
1795                               __func__, cpu_index(ci), ret);
1796 
1797           ci->ci_pae_l3_pdirpa = VM_PAGE_TO_PHYS(vmap);
1798 
1799           ci->ci_pae_l3_pdir = (paddr_t *)uvm_km_alloc(kernel_map, PAGE_SIZE, 0,
1800                     UVM_KMF_VAONLY | UVM_KMF_NOWAIT);
1801           if (ci->ci_pae_l3_pdir == NULL)
1802                     panic("%s: failed to allocate L3 PD for CPU %d\n",
1803                               __func__, cpu_index(ci));
1804 
1805           pmap_kenter_pa((vaddr_t)ci->ci_pae_l3_pdir, ci->ci_pae_l3_pdirpa,
1806                     VM_PROT_READ | VM_PROT_WRITE, 0);
1807 
1808           pmap_update(pmap_kernel());
1809 }
1810 #endif /* PAE */
1811 
1812 static void
idt_vec_copy(struct idt_vec * dst,struct idt_vec * src)1813 idt_vec_copy(struct idt_vec *dst, struct idt_vec *src)
1814 {
1815           idt_descriptor_t *idt_dst;
1816 
1817           idt_dst = dst->iv_idt;
1818           memcpy(idt_dst, src->iv_idt, PAGE_SIZE);
1819           memcpy(dst->iv_allocmap, src->iv_allocmap, sizeof(dst->iv_allocmap));
1820 }
1821 
1822 void
idt_vec_init_cpu_md(struct idt_vec * iv,cpuid_t cid)1823 idt_vec_init_cpu_md(struct idt_vec *iv, cpuid_t cid)
1824 {
1825           vaddr_t va_idt, va_pentium_idt;
1826           struct vm_page *pg;
1827 
1828           if (idt_vec_is_pcpu() &&
1829               cid != cpu_index(&cpu_info_primary)) {
1830                     va_idt = uvm_km_alloc(kernel_map, PAGE_SIZE,
1831                         0, UVM_KMF_VAONLY);
1832                     pg = uvm_pagealloc(NULL, 0, NULL, UVM_PGA_ZERO);
1833                     if (pg == NULL) {
1834                               panic("failed to allocate pcpu idt PA");
1835                     }
1836                     pmap_kenter_pa(va_idt, VM_PAGE_TO_PHYS(pg),
1837                         VM_PROT_READ|VM_PROT_WRITE, 0);
1838                     pmap_update(pmap_kernel());
1839 
1840                     memset((void *)va_idt, 0, PAGE_SIZE);
1841 
1842                     /* pentium f00f bug stuff */
1843                     va_pentium_idt = uvm_km_alloc(kernel_map, PAGE_SIZE,
1844                         0, UVM_KMF_VAONLY);
1845                     pmap_kenter_pa(va_pentium_idt, VM_PAGE_TO_PHYS(pg),
1846                         VM_PROT_READ, 0);
1847                     pmap_update(pmap_kernel());
1848 
1849                     iv->iv_idt = (void *)va_idt;
1850                     iv->iv_idt_pentium = (void *)va_pentium_idt;
1851 
1852                     idt_vec_copy(iv, &(cpu_info_primary.ci_idtvec));
1853           } else {
1854                     iv->iv_idt = (void *)idt_vaddr;
1855                     iv->iv_idt_pentium = (void *)pentium_idt_vaddr;
1856           }
1857 }
1858