1 /*        $NetBSD: machdep.c,v 1.376 2025/04/30 15:30:53 imil Exp $   */
2 
3 /*
4  * Copyright (c) 1996, 1997, 1998, 2000, 2006, 2007, 2008, 2011
5  *     The NetBSD Foundation, Inc.
6  * All rights reserved.
7  *
8  * This code is derived from software contributed to The NetBSD Foundation
9  * by Charles M. Hannum and by Jason R. Thorpe of the Numerical Aerospace
10  * Simulation Facility, NASA Ames Research Center.
11  *
12  * This code is derived from software contributed to The NetBSD Foundation
13  * by Coyote Point Systems, Inc. which was written under contract to Coyote
14  * Point by Jed Davis and Devon O'Dell.
15  *
16  * Redistribution and use in source and binary forms, with or without
17  * modification, are permitted provided that the following conditions
18  * are met:
19  * 1. Redistributions of source code must retain the above copyright
20  *    notice, this list of conditions and the following disclaimer.
21  * 2. Redistributions in binary form must reproduce the above copyright
22  *    notice, this list of conditions and the following disclaimer in the
23  *    documentation and/or other materials provided with the distribution.
24  *
25  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
26  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
27  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
28  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
29  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
30  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
31  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
32  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
33  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
34  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
35  * POSSIBILITY OF SUCH DAMAGE.
36  */
37 
38 /*
39  * Copyright (c) 2006 Mathieu Ropert <mro@adviseo.fr>
40  *
41  * Permission to use, copy, modify, and distribute this software for any
42  * purpose with or without fee is hereby granted, provided that the above
43  * copyright notice and this permission notice appear in all copies.
44  *
45  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
46  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
47  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
48  * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
49  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
50  * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
51  * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
52  */
53 
54 /*
55  * Copyright (c) 2007 Manuel Bouyer.
56  *
57  * Redistribution and use in source and binary forms, with or without
58  * modification, are permitted provided that the following conditions
59  * are met:
60  * 1. Redistributions of source code must retain the above copyright
61  *    notice, this list of conditions and the following disclaimer.
62  * 2. Redistributions in binary form must reproduce the above copyright
63  *    notice, this list of conditions and the following disclaimer in the
64  *    documentation and/or other materials provided with the distribution.
65  *
66  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
67  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
68  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
69  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
70  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
71  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
72  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
73  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
74  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
75  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
76  */
77 
78 /*
79  * Copyright (c) 1982, 1987, 1990 The Regents of the University of California.
80  * All rights reserved.
81  *
82  * This code is derived from software contributed to Berkeley by
83  * William Jolitz.
84  *
85  * Redistribution and use in source and binary forms, with or without
86  * modification, are permitted provided that the following conditions
87  * are met:
88  * 1. Redistributions of source code must retain the above copyright
89  *    notice, this list of conditions and the following disclaimer.
90  * 2. Redistributions in binary form must reproduce the above copyright
91  *    notice, this list of conditions and the following disclaimer in the
92  *    documentation and/or other materials provided with the distribution.
93  * 3. Neither the name of the University nor the names of its contributors
94  *    may be used to endorse or promote products derived from this software
95  *    without specific prior written permission.
96  *
97  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
98  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
99  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
100  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
101  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
102  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
103  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
104  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
105  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
106  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
107  * SUCH DAMAGE.
108  *
109  *        @(#)machdep.c       7.4 (Berkeley) 6/3/91
110  */
111 
112 #include <sys/cdefs.h>
113 __KERNEL_RCSID(0, "$NetBSD: machdep.c,v 1.376 2025/04/30 15:30:53 imil Exp $");
114 
115 #include "opt_modular.h"
116 #include "opt_user_ldt.h"
117 #include "opt_ddb.h"
118 #include "opt_kgdb.h"
119 #include "opt_cpureset_delay.h"
120 #include "opt_mtrr.h"
121 #include "opt_realmem.h"
122 #include "opt_xen.h"
123 #include "opt_svs.h"
124 #include "opt_kaslr.h"
125 #ifndef XENPV
126 #include "opt_physmem.h"
127 #endif
128 #include "isa.h"
129 #include "pci.h"
130 
131 #include <sys/param.h>
132 #include <sys/systm.h>
133 #include <sys/signal.h>
134 #include <sys/signalvar.h>
135 #include <sys/kernel.h>
136 #include <sys/cpu.h>
137 #include <sys/exec.h>
138 #include <sys/exec_aout.h>    /* for MID_* */
139 #include <sys/reboot.h>
140 #include <sys/conf.h>
141 #include <sys/msgbuf.h>
142 #include <sys/mount.h>
143 #include <sys/core.h>
144 #include <sys/kcore.h>
145 #include <sys/ucontext.h>
146 #include <machine/kcore.h>
147 #include <sys/ras.h>
148 #include <sys/syscallargs.h>
149 #include <sys/ksyms.h>
150 #include <sys/device.h>
151 #include <sys/lwp.h>
152 #include <sys/proc.h>
153 #include <sys/asan.h>
154 #include <sys/csan.h>
155 #include <sys/msan.h>
156 #include <sys/module.h>
157 #include <sys/timevar.h>
158 
159 #ifdef KGDB
160 #include <sys/kgdb.h>
161 #endif
162 
163 #include <lib/libkern/entpool.h> /* XXX */
164 
165 #include <dev/cons.h>
166 #include <dev/mm.h>
167 
168 #include <uvm/uvm.h>
169 #include <uvm/uvm_page.h>
170 
171 #include <sys/sysctl.h>
172 
173 #include <machine/cpu.h>
174 #include <machine/cpu_rng.h>
175 #include <machine/cpufunc.h>
176 #include <machine/gdt.h>
177 #include <machine/intr.h>
178 #include <machine/pio.h>
179 #include <machine/psl.h>
180 #include <machine/reg.h>
181 #include <machine/specialreg.h>
182 #include <machine/bootinfo.h>
183 #include <x86/fpu.h>
184 #include <x86/dbregs.h>
185 #include <machine/mtrr.h>
186 #include <machine/mpbiosvar.h>
187 #include <machine/pmap_private.h>
188 
189 #include <x86/bootspace.h>
190 #include <x86/cputypes.h>
191 #include <x86/cpuvar.h>
192 #include <x86/machdep.h>
193 #include <x86/x86/tsc.h>
194 
195 #include <dev/isa/isareg.h>
196 #include <machine/isa_machdep.h>
197 #include <dev/ic/i8042reg.h>
198 
199 #ifdef XEN
200 #include <xen/xen.h>
201 #include <xen/hypervisor.h>
202 #include <xen/evtchn.h>
203 #include <xen/include/public/version.h>
204 #include <xen/include/public/vcpu.h>
205 #endif /* XEN */
206 
207 #include <ddb/db_active.h>
208 
209 #ifdef DDB
210 #include <machine/db_machdep.h>
211 #include <ddb/db_extern.h>
212 #include <ddb/db_output.h>
213 #include <ddb/db_interface.h>
214 #endif
215 
216 #include "acpica.h"
217 
218 #if NACPICA > 0
219 #include <dev/acpi/acpivar.h>
220 #define ACPI_MACHDEP_PRIVATE
221 #include <machine/acpi_machdep.h>
222 #else
223 #include <machine/i82489var.h>
224 #endif
225 
226 #include "isa.h"
227 #include "isadma.h"
228 #include "ksyms.h"
229 
230 /* the following is used externally (sysctl_hw) */
231 char machine[] = "amd64";               /* CPU "architecture" */
232 char machine_arch[] = "x86_64";                   /* machine == machine_arch */
233 
234 #ifdef CPURESET_DELAY
235 int cpureset_delay = CPURESET_DELAY;
236 #else
237 int cpureset_delay = 2000; /* default to 2s */
238 #endif
239 
240 int cpu_class = CPUCLASS_686;
241 
242 #ifdef MTRR
243 const struct mtrr_funcs *mtrr_funcs;
244 #endif
245 
246 int cpu_class;
247 int use_pae;
248 
249 #ifndef NO_SPARSE_DUMP
250 int sparse_dump = 1;
251 
252 paddr_t max_paddr = 0;
253 unsigned char *sparse_dump_physmap;
254 #endif
255 
256 char *dump_headerbuf, *dump_headerbuf_ptr;
257 #define dump_headerbuf_size PAGE_SIZE
258 #define dump_headerbuf_end (dump_headerbuf + dump_headerbuf_size)
259 #define dump_headerbuf_avail (dump_headerbuf_end - dump_headerbuf_ptr)
260 daddr_t dump_header_blkno;
261 
262 size_t dump_nmemsegs;
263 size_t dump_npages;
264 size_t dump_header_size;
265 size_t dump_totalbytesleft;
266 
267 vaddr_t idt_vaddr;
268 paddr_t idt_paddr;
269 vaddr_t gdt_vaddr;
270 paddr_t gdt_paddr;
271 vaddr_t ldt_vaddr;
272 paddr_t ldt_paddr;
273 
274 static struct vm_map module_map_store;
275 extern struct bootspace bootspace;
276 extern struct slotspace slotspace;
277 
278 vaddr_t vm_min_kernel_address __read_mostly = VM_MIN_KERNEL_ADDRESS_DEFAULT;
279 vaddr_t vm_max_kernel_address __read_mostly = VM_MAX_KERNEL_ADDRESS_DEFAULT;
280 pd_entry_t *pte_base __read_mostly;
281 
282 struct vm_map *phys_map = NULL;
283 
284 extern paddr_t lowmem_rsvd;
285 extern paddr_t avail_start, avail_end;
286 #ifdef XENPV
287 extern paddr_t pmap_pa_start, pmap_pa_end;
288 #endif
289 
290 struct nmistore {
291           uint64_t cr3;
292           uint64_t scratch;
293 } __packed;
294 
295 /*
296  * Size of memory segments, before any memory is stolen.
297  */
298 phys_ram_seg_t mem_clusters[VM_PHYSSEG_MAX];
299 int mem_cluster_cnt;
300 
301 int cpu_dump(void);
302 int cpu_dumpsize(void);
303 u_long cpu_dump_mempagecnt(void);
304 void dodumpsys(void);
305 void dumpsys(void);
306 
307 static void x86_64_proc0_pcb_ldt_init(void);
308 
309 void dump_misc_init(void);
310 void dump_seg_prep(void);
311 int dump_seg_iter(int (*)(paddr_t, paddr_t));
312 
313 #ifndef NO_SPARSE_DUMP
314 void sparse_dump_reset(void);
315 void sparse_dump_mark(void);
316 void cpu_dump_prep_sparse(void);
317 #endif
318 
319 void dump_header_start(void);
320 int dump_header_flush(void);
321 int dump_header_addbytes(const void*, size_t);
322 int dump_header_addseg(paddr_t, paddr_t);
323 int dump_header_finish(void);
324 
325 int dump_seg_count_range(paddr_t, paddr_t);
326 int dumpsys_seg(paddr_t, paddr_t);
327 
328 void init_bootspace(void);
329 void init_slotspace(void);
330 void init_x86_64(paddr_t);
331 
332 /*
333  * Machine-dependent startup code
334  */
335 void
cpu_startup(void)336 cpu_startup(void)
337 {
338           int x, y;
339           vaddr_t minaddr, maxaddr;
340           psize_t sz;
341 
342           /*
343            * For console drivers that require uvm and pmap to be initialized,
344            * we'll give them one more chance here...
345            */
346           consinit();
347 
348           /*
349            * Initialize error message buffer (at end of core).
350            */
351           if (msgbuf_p_cnt == 0)
352                     panic("msgbuf paddr map has not been set up");
353           for (x = 0, sz = 0; x < msgbuf_p_cnt; sz += msgbuf_p_seg[x++].sz)
354                     continue;
355 
356           msgbuf_vaddr = uvm_km_alloc(kernel_map, sz, 0, UVM_KMF_VAONLY);
357           if (msgbuf_vaddr == 0)
358                     panic("failed to valloc msgbuf_vaddr");
359 
360           for (y = 0, sz = 0; y < msgbuf_p_cnt; y++) {
361                     for (x = 0; x < btoc(msgbuf_p_seg[y].sz); x++, sz += PAGE_SIZE)
362                               pmap_kenter_pa((vaddr_t)msgbuf_vaddr + sz,
363                                   msgbuf_p_seg[y].paddr + x * PAGE_SIZE,
364                                   VM_PROT_READ|VM_PROT_WRITE, 0);
365           }
366 
367           pmap_update(pmap_kernel());
368 
369           initmsgbuf((void *)msgbuf_vaddr, round_page(sz));
370 
371           minaddr = 0;
372 
373           /*
374            * Allocate a submap for physio.
375            */
376           phys_map = uvm_km_suballoc(kernel_map, &minaddr, &maxaddr,
377               VM_PHYS_SIZE, 0, false, NULL);
378 
379           /*
380            * Create the module map.
381            *
382            * The kernel uses RIP-relative addressing with a maximum offset of
383            * 2GB. Because of that, we can't put the kernel modules in kernel_map
384            * (like i386 does), since kernel_map is too far away in memory from
385            * the kernel sections. So we have to create a special module_map.
386            *
387            * The module map is taken as what is left of the bootstrap memory
388            * created in locore/prekern.
389            */
390           uvm_map_setup(&module_map_store, bootspace.smodule,
391               bootspace.emodule, 0);
392           module_map_store.pmap = pmap_kernel();
393           module_map = &module_map_store;
394 
395           /* Say hello. */
396           banner();
397 
398 #if NISA > 0 || NPCI > 0
399           /* Safe for i/o port / memory space allocation to use malloc now. */
400           x86_bus_space_mallocok();
401 #endif
402 
403 #ifdef __HAVE_PCPU_AREA
404           cpu_pcpuarea_init(&cpu_info_primary);
405 #endif
406           gdt_init();
407           x86_64_proc0_pcb_ldt_init();
408 
409           cpu_init_tss(&cpu_info_primary);
410 #if !defined(XENPV)
411           ltr(cpu_info_primary.ci_tss_sel);
412 #endif
413 
414           x86_startup();
415 }
416 
417 #ifdef XENPV
418 /* used in assembly */
419 void hypervisor_callback(void);
420 void failsafe_callback(void);
421 void x86_64_switch_context(struct pcb *);
422 void x86_64_tls_switch(struct lwp *);
423 
424 void
x86_64_switch_context(struct pcb * new)425 x86_64_switch_context(struct pcb *new)
426 {
427           HYPERVISOR_stack_switch(GSEL(GDATA_SEL, SEL_KPL), new->pcb_rsp0);
428           struct physdev_set_iopl set_iopl;
429           set_iopl.iopl = new->pcb_iopl;
430           HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl);
431 }
432 
433 void
x86_64_tls_switch(struct lwp * l)434 x86_64_tls_switch(struct lwp *l)
435 {
436           struct cpu_info *ci = curcpu();
437           struct pcb *pcb = lwp_getpcb(l);
438           struct trapframe *tf = l->l_md.md_regs;
439           uint64_t zero = 0;
440 
441           /*
442            * Raise the IPL to IPL_HIGH. XXX Still needed?
443            */
444           (void)splhigh();
445 
446           /* Update segment registers */
447           if (pcb->pcb_flags & PCB_COMPAT32) {
448                     update_descriptor(&ci->ci_gdt[GUFS_SEL], &pcb->pcb_fs);
449                     update_descriptor(&ci->ci_gdt[GUGS_SEL], &pcb->pcb_gs);
450                     setds(GSEL(GUDATA32_SEL, SEL_UPL));
451                     setes(GSEL(GUDATA32_SEL, SEL_UPL));
452                     setfs(GSEL(GUDATA32_SEL, SEL_UPL));
453                     HYPERVISOR_set_segment_base(SEGBASE_GS_USER_SEL, tf->tf_gs);
454           } else {
455                     update_descriptor(&ci->ci_gdt[GUFS_SEL], &zero);
456                     update_descriptor(&ci->ci_gdt[GUGS_SEL], &zero);
457                     setds(GSEL(GUDATA_SEL, SEL_UPL));
458                     setes(GSEL(GUDATA_SEL, SEL_UPL));
459                     setfs(0);
460                     HYPERVISOR_set_segment_base(SEGBASE_GS_USER_SEL, 0);
461                     HYPERVISOR_set_segment_base(SEGBASE_FS, pcb->pcb_fs);
462                     HYPERVISOR_set_segment_base(SEGBASE_GS_USER, pcb->pcb_gs);
463           }
464 }
465 #endif /* XENPV */
466 
467 /*
468  * Set up proc0's PCB and LDT.
469  */
470 static void
x86_64_proc0_pcb_ldt_init(void)471 x86_64_proc0_pcb_ldt_init(void)
472 {
473           struct lwp *l = &lwp0;
474           struct pcb *pcb = lwp_getpcb(l);
475 
476           pcb->pcb_flags = 0;
477           pcb->pcb_fs = 0;
478           pcb->pcb_gs = 0;
479           pcb->pcb_rsp0 = (uvm_lwp_getuarea(l) + USPACE - 16) & ~0xf;
480           pcb->pcb_iopl = IOPL_KPL;
481           pcb->pcb_dbregs = NULL;
482           pcb->pcb_cr0 = rcr0() & ~CR0_TS;
483           l->l_md.md_regs = (struct trapframe *)pcb->pcb_rsp0 - 1;
484 
485 #if !defined(XENPV)
486           lldt(GSYSSEL(GLDT_SEL, SEL_KPL));
487 #else
488           xen_set_ldt((vaddr_t)ldtstore, LDT_SIZE >> 3);
489           /* Reset TS bit and set kernel stack for interrupt handlers */
490           HYPERVISOR_fpu_taskswitch(1);
491           HYPERVISOR_stack_switch(GSEL(GDATA_SEL, SEL_KPL), pcb->pcb_rsp0);
492           struct physdev_set_iopl set_iopl;
493           set_iopl.iopl = pcb->pcb_iopl;
494           HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl);
495 #endif
496 }
497 
498 /*
499  * Set up TSS and I/O bitmap.
500  */
501 void
cpu_init_tss(struct cpu_info * ci)502 cpu_init_tss(struct cpu_info *ci)
503 {
504 #ifdef __HAVE_PCPU_AREA
505           const cpuid_t cid = cpu_index(ci);
506 #endif
507           struct cpu_tss *cputss;
508           struct nmistore *store;
509           uintptr_t p;
510 
511 #ifdef __HAVE_PCPU_AREA
512           cputss = (struct cpu_tss *)&pcpuarea->ent[cid].tss;
513 #else
514           cputss = (struct cpu_tss *)uvm_km_alloc(kernel_map,
515               sizeof(struct cpu_tss), 0, UVM_KMF_WIRED|UVM_KMF_ZERO);
516 #endif
517 
518           cputss->tss.tss_iobase = IOMAP_INVALOFF << 16;
519 
520           /* DDB stack */
521 #ifdef __HAVE_PCPU_AREA
522           p = (vaddr_t)&pcpuarea->ent[cid].ist0;
523 #else
524           p = uvm_km_alloc(kernel_map, PAGE_SIZE, 0, UVM_KMF_WIRED|UVM_KMF_ZERO);
525 #endif
526           cputss->tss.tss_ist[0] = p + PAGE_SIZE - 16;
527 
528           /* double fault */
529 #ifdef __HAVE_PCPU_AREA
530           p = (vaddr_t)&pcpuarea->ent[cid].ist1;
531 #else
532           p = uvm_km_alloc(kernel_map, PAGE_SIZE, 0, UVM_KMF_WIRED|UVM_KMF_ZERO);
533 #endif
534           cputss->tss.tss_ist[1] = p + PAGE_SIZE - 16;
535 
536           /* NMI - store a structure at the top of the stack */
537 #ifdef __HAVE_PCPU_AREA
538           p = (vaddr_t)&pcpuarea->ent[cid].ist2;
539 #else
540           p = uvm_km_alloc(kernel_map, PAGE_SIZE, 0, UVM_KMF_WIRED|UVM_KMF_ZERO);
541 #endif
542           cputss->tss.tss_ist[2] = p + PAGE_SIZE - sizeof(struct nmistore);
543           store = (struct nmistore *)(p + PAGE_SIZE - sizeof(struct nmistore));
544           store->cr3 = pmap_pdirpa(pmap_kernel(), 0);
545 
546           /* DB */
547 #ifdef __HAVE_PCPU_AREA
548           p = (vaddr_t)&pcpuarea->ent[cid].ist3;
549 #else
550           p = uvm_km_alloc(kernel_map, PAGE_SIZE, 0, UVM_KMF_WIRED|UVM_KMF_ZERO);
551 #endif
552           cputss->tss.tss_ist[3] = p + PAGE_SIZE - 16;
553 
554           ci->ci_tss = cputss;
555           ci->ci_tss_sel = tss_alloc(&cputss->tss);
556 }
557 
558 void
buildcontext(struct lwp * l,void * catcher,void * f)559 buildcontext(struct lwp *l, void *catcher, void *f)
560 {
561           struct trapframe *tf = l->l_md.md_regs;
562 
563           tf->tf_ds = GSEL(GUDATA_SEL, SEL_UPL);
564           tf->tf_es = GSEL(GUDATA_SEL, SEL_UPL);
565           tf->tf_fs = GSEL(GUDATA_SEL, SEL_UPL);
566           tf->tf_gs = GSEL(GUDATA_SEL, SEL_UPL);
567 
568           tf->tf_rip = (uint64_t)catcher;
569           tf->tf_cs = GSEL(GUCODE_SEL, SEL_UPL);
570           tf->tf_rflags &= ~PSL_CLEARSIG;
571           tf->tf_rsp = (uint64_t)f;
572           tf->tf_ss = GSEL(GUDATA_SEL, SEL_UPL);
573 
574           /* Ensure FP state is sane */
575           fpu_sigreset(l);
576 }
577 
578 void
sendsig_sigcontext(const ksiginfo_t * ksi,const sigset_t * mask)579 sendsig_sigcontext(const ksiginfo_t *ksi, const sigset_t *mask)
580 {
581 
582           printf("sendsig_sigcontext: illegal\n");
583           sigexit(curlwp, SIGILL);
584 }
585 
586 void
sendsig_siginfo(const ksiginfo_t * ksi,const sigset_t * mask)587 sendsig_siginfo(const ksiginfo_t *ksi, const sigset_t *mask)
588 {
589           struct lwp *l = curlwp;
590           struct proc *p = l->l_proc;
591           struct sigacts *ps = p->p_sigacts;
592           int onstack, error;
593           int sig = ksi->ksi_signo;
594           struct sigframe_siginfo *fp, frame;
595           sig_t catcher = SIGACTION(p, sig).sa_handler;
596           struct trapframe *tf = l->l_md.md_regs;
597           char *sp;
598 
599           KASSERT(mutex_owned(p->p_lock));
600 
601           /* Do we need to jump onto the signal stack? */
602           onstack =
603               (l->l_sigstk.ss_flags & (SS_DISABLE | SS_ONSTACK)) == 0 &&
604               (SIGACTION(p, sig).sa_flags & SA_ONSTACK) != 0;
605 
606           /* Allocate space for the signal handler context. */
607           if (onstack)
608                     sp = ((char *)l->l_sigstk.ss_sp + l->l_sigstk.ss_size);
609           else
610                     /* AMD64 ABI 128-bytes "red zone". */
611                     sp = (char *)tf->tf_rsp - 128;
612 
613           sp -= sizeof(struct sigframe_siginfo);
614           /* Round down the stackpointer to a multiple of 16 for the ABI. */
615           fp = (struct sigframe_siginfo *)(((unsigned long)sp &
616                     ~STACK_ALIGNBYTES) - 8);
617 
618           memset(&frame, 0, sizeof(frame));
619           frame.sf_ra = (uint64_t)ps->sa_sigdesc[sig].sd_tramp;
620           frame.sf_si._info = ksi->ksi_info;
621           frame.sf_uc.uc_flags = _UC_SIGMASK;
622           frame.sf_uc.uc_sigmask = *mask;
623           frame.sf_uc.uc_link = l->l_ctxlink;
624           frame.sf_uc.uc_flags |= (l->l_sigstk.ss_flags & SS_ONSTACK)
625               ? _UC_SETSTACK : _UC_CLRSTACK;
626           sendsig_reset(l, sig);
627 
628           mutex_exit(p->p_lock);
629           cpu_getmcontext(l, &frame.sf_uc.uc_mcontext, &frame.sf_uc.uc_flags);
630           /* Copyout all the fp regs, the signal handler might expect them. */
631           error = copyout(&frame, fp, sizeof frame);
632           mutex_enter(p->p_lock);
633 
634           if (error != 0) {
635                     /*
636                      * Process has trashed its stack; give it an illegal
637                      * instruction to halt it in its tracks.
638                      */
639                     sigexit(l, SIGILL);
640                     /* NOTREACHED */
641           }
642 
643           buildcontext(l, catcher, fp);
644 
645           tf->tf_rdi = sig;
646           tf->tf_rsi = (uint64_t)&fp->sf_si;
647           tf->tf_rdx = tf->tf_r15 = (uint64_t)&fp->sf_uc;
648 
649           /* Remember that we're now on the signal stack. */
650           if (onstack)
651                     l->l_sigstk.ss_flags |= SS_ONSTACK;
652 
653           if ((vaddr_t)catcher >= VM_MAXUSER_ADDRESS) {
654                     /*
655                      * process has given an invalid address for the
656                      * handler. Stop it, but do not do it before so
657                      * we can return the right info to userland (or in core dump)
658                      */
659                     sigexit(l, SIGILL);
660                     /* NOTREACHED */
661           }
662 }
663 
664 struct pcb dumppcb;
665 
666 void
cpu_reboot(int howto,char * bootstr)667 cpu_reboot(int howto, char *bootstr)
668 {
669           static bool syncdone = false;
670           int s = IPL_NONE;
671           __USE(s); /* ugly otherwise */
672 
673           if (cold) {
674                     howto |= RB_HALT;
675                     goto haltsys;
676           }
677 
678           boothowto = howto;
679 
680           /* i386 maybe_dump() */
681 
682           /*
683            * If we've panic'd, don't make the situation potentially
684            * worse by syncing or unmounting the file systems.
685            */
686           if ((howto & RB_NOSYNC) == 0 && panicstr == NULL) {
687                     if (!syncdone) {
688                               syncdone = true;
689                               /* XXX used to force unmount as well, here */
690                               vfs_sync_all(curlwp);
691                     }
692 
693                     while (vfs_unmountall1(curlwp, false, false) ||
694                            config_detach_all(boothowto) ||
695                            vfs_unmount_forceone(curlwp))
696                               ;         /* do nothing */
697           } else {
698                     if (!db_active)
699                               suspendsched();
700           }
701 
702           pmf_system_shutdown(boothowto);
703 
704           /* Disable interrupts. */
705           s = splhigh();
706 
707           /* Do a dump if requested. */
708           if ((howto & (RB_DUMP | RB_HALT)) == RB_DUMP)
709                     dumpsys();
710 
711 haltsys:
712           doshutdownhooks();
713 
714         if ((howto & RB_POWERDOWN) == RB_POWERDOWN) {
715 #if NACPICA > 0
716                     if (s != IPL_NONE)
717                               splx(s);
718 
719                     acpi_enter_sleep_state(ACPI_STATE_S5);
720 #endif
721 #ifdef XEN
722                     if (vm_guest == VM_GUEST_XENPV ||
723                         vm_guest == VM_GUEST_XENPVH ||
724                         vm_guest == VM_GUEST_XENPVHVM)
725                               HYPERVISOR_shutdown();
726 #endif /* XEN */
727           }
728 
729           cpu_broadcast_halt();
730 
731           if (howto & RB_HALT) {
732 #if NACPICA > 0
733                     acpi_disable();
734 #endif
735 
736                     printf("\n");
737                     printf("The operating system has halted.\n");
738                     printf("Please press any key to reboot.\n\n");
739                     cnpollc(1);         /* for proper keyboard command handling */
740                     if (cngetc() == 0) {
741                               /* no console attached, so just hlt */
742                               printf("No keyboard - cannot reboot after all.\n");
743                               for(;;) {
744                                         x86_hlt();
745                               }
746                     }
747                     cnpollc(0);
748           }
749 
750           printf("rebooting...\n");
751           if (cpureset_delay > 0)
752                     delay(cpureset_delay * 1000);
753           cpu_reset();
754           for(;;) ;
755           /*NOTREACHED*/
756 }
757 
758 /*
759  * XXXfvdl share dumpcode.
760  */
761 
762 /*
763  * Perform assorted dump-related initialization tasks.  Assumes that
764  * the maximum physical memory address will not increase afterwards.
765  */
766 void
dump_misc_init(void)767 dump_misc_init(void)
768 {
769 #ifndef NO_SPARSE_DUMP
770           int i;
771 #endif
772 
773           if (dump_headerbuf != NULL)
774                     return; /* already called */
775 
776 #ifndef NO_SPARSE_DUMP
777           for (i = 0; i < mem_cluster_cnt; ++i) {
778                     paddr_t top = mem_clusters[i].start + mem_clusters[i].size;
779                     if (max_paddr < top)
780                               max_paddr = top;
781           }
782 #ifdef DEBUG
783           printf("dump_misc_init: max_paddr = 0x%lx\n",
784               (unsigned long)max_paddr);
785 #endif
786           if (max_paddr == 0) {
787                     printf("Your machine does not initialize mem_clusters; "
788                         "sparse_dumps disabled\n");
789                     sparse_dump = 0;
790           } else {
791                     sparse_dump_physmap = (void *)uvm_km_alloc(kernel_map,
792                         roundup(max_paddr / (PAGE_SIZE * NBBY), PAGE_SIZE),
793                         PAGE_SIZE, UVM_KMF_WIRED|UVM_KMF_ZERO);
794           }
795 #endif
796           dump_headerbuf = (void *)uvm_km_alloc(kernel_map,
797               dump_headerbuf_size,
798               PAGE_SIZE, UVM_KMF_WIRED|UVM_KMF_ZERO);
799           /* XXXjld should check for failure here, disable dumps if so. */
800 }
801 
802 #ifndef NO_SPARSE_DUMP
803 /*
804  * Clear the set of pages to include in a sparse dump.
805  */
806 void
sparse_dump_reset(void)807 sparse_dump_reset(void)
808 {
809           memset(sparse_dump_physmap, 0,
810               roundup(max_paddr / (PAGE_SIZE * NBBY), PAGE_SIZE));
811 }
812 
813 /*
814  * Include or exclude pages in a sparse dump.
815  */
816 void
sparse_dump_mark(void)817 sparse_dump_mark(void)
818 {
819           paddr_t p, pstart, pend;
820           struct vm_page *pg;
821           int i;
822           uvm_physseg_t upm;
823 
824           /*
825            * Mark all memory pages, then unmark pages that are uninteresting.
826            * Dereferenceing pg->uobject might crash again if another CPU
827            * frees the object out from under us, but we can't lock anything
828            * so it's a risk we have to take.
829            */
830 
831           for (i = 0; i < mem_cluster_cnt; ++i) {
832                     pstart = mem_clusters[i].start / PAGE_SIZE;
833                     pend = pstart + mem_clusters[i].size / PAGE_SIZE;
834 
835                     for (p = pstart; p < pend; p++) {
836                               setbit(sparse_dump_physmap, p);
837                     }
838           }
839         for (upm = uvm_physseg_get_first();
840                uvm_physseg_valid_p(upm);
841                upm = uvm_physseg_get_next(upm)) {
842                     paddr_t pfn;
843 
844                     /*
845                      * We assume that seg->start to seg->end are
846                      * uvm_page_physload()ed
847                      */
848                     for (pfn = uvm_physseg_get_start(upm);
849                          pfn < uvm_physseg_get_end(upm);
850                          pfn++) {
851                               pg = PHYS_TO_VM_PAGE(ptoa(pfn));
852 
853                               if (pg->uanon || (pg->flags & PG_FREE) ||
854                                   (pg->uobject && pg->uobject->pgops)) {
855                                         p = VM_PAGE_TO_PHYS(pg) / PAGE_SIZE;
856                                         clrbit(sparse_dump_physmap, p);
857                               }
858                     }
859           }
860 }
861 
/*
 * Machine-dependent choice of what goes into a sparse dump: start from
 * an empty page set, then let sparse_dump_mark() select the pages.
 */
void
cpu_dump_prep_sparse(void)
{
	/* Begin with nothing selected. */
	sparse_dump_reset();

	/* XXX could the alternate recursive page table be skipped? */
	sparse_dump_mark();

	/*
	 * Further pruning could happen here: memory for I/O buffers, for
	 * example.  The kernel text could also be unmarked, but gdb would
	 * be upset.
	 */
}
875 #endif
876 
877 /*
878  * Abstractly iterate over the collection of memory segments to be
879  * dumped; the callback lacks the customary environment-pointer
880  * argument because none of the current users really need one.
881  *
882  * To be used only after dump_seg_prep is called to set things up.
883  */
884 int
dump_seg_iter(int (* callback)(paddr_t,paddr_t))885 dump_seg_iter(int (*callback)(paddr_t, paddr_t))
886 {
887           int error, i;
888 
889 #define CALLBACK(start,size) do {     \
890           error = callback(start,size); \
891           if (error)                    \
892                     return error;         \
893 } while(0)
894 
895           for (i = 0; i < mem_cluster_cnt; ++i) {
896 #ifndef NO_SPARSE_DUMP
897                     /*
898                      * The bitmap is scanned within each memory segment,
899                      * rather than over its entire domain, in case any
900                      * pages outside of the memory proper have been mapped
901                      * into kva; they might be devices that wouldn't
902                      * appreciate being arbitrarily read, and including
903                      * them could also break the assumption that a sparse
904                      * dump will always be smaller than a full one.
905                      */
906                     if (sparse_dump && sparse_dump_physmap) {
907                               paddr_t p, sp_start, sp_end;
908                               int lastset;
909 
910                               sp_start = mem_clusters[i].start;
911                               sp_end = sp_start + mem_clusters[i].size;
912                               sp_start = rounddown(sp_start, PAGE_SIZE); /* unnecessary? */
913                               lastset = 0;
914                               for (p = sp_start; p < sp_end; p += PAGE_SIZE) {
915                                         int thisset = isset(sparse_dump_physmap,
916                                             p/PAGE_SIZE);
917 
918                                         if (!lastset && thisset)
919                                                   sp_start = p;
920                                         if (lastset && !thisset)
921                                                   CALLBACK(sp_start, p - sp_start);
922                                         lastset = thisset;
923                               }
924                               if (lastset)
925                                         CALLBACK(sp_start, p - sp_start);
926                     } else
927 #endif
928                               CALLBACK(mem_clusters[i].start, mem_clusters[i].size);
929           }
930           return 0;
931 #undef CALLBACK
932 }
933 
934 /*
935  * Prepare for an impending core dump: decide what's being dumped and
936  * how much space it will take up.
937  */
938 void
dump_seg_prep(void)939 dump_seg_prep(void)
940 {
941 #ifndef NO_SPARSE_DUMP
942           if (sparse_dump && sparse_dump_physmap)
943                     cpu_dump_prep_sparse();
944 #endif
945 
946           dump_nmemsegs = 0;
947           dump_npages = 0;
948           dump_seg_iter(dump_seg_count_range);
949 
950           dump_header_size = ALIGN(sizeof(kcore_seg_t)) +
951               ALIGN(sizeof(cpu_kcore_hdr_t)) +
952               ALIGN(dump_nmemsegs * sizeof(phys_ram_seg_t));
953           dump_header_size = roundup(dump_header_size, dbtob(1));
954 
955           /*
956            * savecore(8) will read this to decide how many pages to
957            * copy, and cpu_dumpconf has already used the pessimistic
958            * value to set dumplo, so it's time to tell the truth.
959            */
960           dumpsize = dump_npages; /* XXX could these just be one variable? */
961 }
962 
963 int
dump_seg_count_range(paddr_t start,paddr_t size)964 dump_seg_count_range(paddr_t start, paddr_t size)
965 {
966           ++dump_nmemsegs;
967           dump_npages += size / PAGE_SIZE;
968           return 0;
969 }
970 
971 /*
972  * A sparse dump's header may be rather large, due to the number of
973  * "segments" emitted.  These routines manage a simple output buffer,
974  * so that the header can be written to disk incrementally.
975  */
976 void
dump_header_start(void)977 dump_header_start(void)
978 {
979           dump_headerbuf_ptr = dump_headerbuf;
980           dump_header_blkno = dumplo;
981 }
982 
983 int
dump_header_flush(void)984 dump_header_flush(void)
985 {
986           const struct bdevsw *bdev;
987           size_t to_write;
988           int error;
989 
990           bdev = bdevsw_lookup(dumpdev);
991           to_write = roundup(dump_headerbuf_ptr - dump_headerbuf, dbtob(1));
992           error = bdev->d_dump(dumpdev, dump_header_blkno,
993               dump_headerbuf, to_write);
994           dump_header_blkno += btodb(to_write);
995           dump_headerbuf_ptr = dump_headerbuf;
996           return error;
997 }
998 
999 int
dump_header_addbytes(const void * vptr,size_t n)1000 dump_header_addbytes(const void* vptr, size_t n)
1001 {
1002           const char* ptr = vptr;
1003           int error;
1004 
1005           while (n > dump_headerbuf_avail) {
1006                     memcpy(dump_headerbuf_ptr, ptr, dump_headerbuf_avail);
1007                     ptr += dump_headerbuf_avail;
1008                     n -= dump_headerbuf_avail;
1009                     dump_headerbuf_ptr = dump_headerbuf_end;
1010                     error = dump_header_flush();
1011                     if (error)
1012                               return error;
1013           }
1014           memcpy(dump_headerbuf_ptr, ptr, n);
1015           dump_headerbuf_ptr += n;
1016 
1017           return 0;
1018 }
1019 
1020 int
dump_header_addseg(paddr_t start,paddr_t size)1021 dump_header_addseg(paddr_t start, paddr_t size)
1022 {
1023           phys_ram_seg_t seg = { start, size };
1024           int error;
1025 
1026           error = dump_header_addbytes(&seg, sizeof(seg));
1027           if (error) {
1028                     printf("[seg 0x%"PRIxPADDR" bytes 0x%"PRIxPSIZE" failed,"
1029                         " error=%d] ", start, size, error);
1030           }
1031           return error;
1032 }
1033 
1034 int
dump_header_finish(void)1035 dump_header_finish(void)
1036 {
1037           int error;
1038 
1039           memset(dump_headerbuf_ptr, 0, dump_headerbuf_avail);
1040           error = dump_header_flush();
1041           if (error)
1042                     printf("[finish failed, error=%d] ", error);
1043           return error;
1044 }
1045 
1046 
/*
 * Variables that /sbin/savecore needs.
 */
uint32_t	dumpmag = 0x8fca0101;	/* magic number */
int		dumpsize = 0;		/* size of the dump, in pages */
long		dumplo = 0;		/* dump offset, in disk blocks */
1053 
1054 /*
1055  * cpu_dumpsize: calculate size of machine-dependent kernel core dump headers
1056  * for a full (non-sparse) dump.
1057  */
1058 int
cpu_dumpsize(void)1059 cpu_dumpsize(void)
1060 {
1061           int size;
1062 
1063           size = ALIGN(sizeof(kcore_seg_t)) + ALIGN(sizeof(cpu_kcore_hdr_t)) +
1064               ALIGN(mem_cluster_cnt * sizeof(phys_ram_seg_t));
1065           if (roundup(size, dbtob(1)) != dbtob(1))
1066                     return (-1);
1067 
1068           return (1);
1069 }
1070 
1071 /*
1072  * cpu_dump_mempagecnt: calculate the size of RAM (in pages) to be dumped
1073  * for a full (non-sparse) dump.
1074  */
1075 u_long
cpu_dump_mempagecnt(void)1076 cpu_dump_mempagecnt(void)
1077 {
1078           u_long i, n;
1079 
1080           n = 0;
1081           for (i = 0; i < mem_cluster_cnt; i++)
1082                     n += atop(mem_clusters[i].size);
1083           return (n);
1084 }
1085 
1086 /*
1087  * cpu_dump: dump the machine-dependent kernel core dump headers.
1088  */
1089 int
cpu_dump(void)1090 cpu_dump(void)
1091 {
1092           kcore_seg_t seg;
1093           cpu_kcore_hdr_t cpuhdr;
1094           const struct bdevsw *bdev;
1095           int error;
1096 
1097           bdev = bdevsw_lookup(dumpdev);
1098           if (bdev == NULL) {
1099                     printf("[device 0x%llx ENXIO] ", (unsigned long long)dumpdev);
1100                     return ENXIO;
1101           }
1102 
1103           /*
1104            * Generate a segment header.
1105            */
1106           CORE_SETMAGIC(seg, KCORE_MAGIC, MID_MACHINE, CORE_CPU);
1107           seg.c_size = dump_header_size - ALIGN(sizeof(seg));
1108           error = dump_header_addbytes(&seg, ALIGN(sizeof(seg)));
1109           if (error) {
1110                     printf("[segment header %zu bytes failed, error=%d] ",
1111                         ALIGN(sizeof(seg)), error);
1112                     /* blithely proceed (can't fail?) */
1113           }
1114 
1115           /*
1116            * Add the machine-dependent header info.
1117            */
1118           cpuhdr.ptdpaddr = PDPpaddr;
1119           cpuhdr.nmemsegs = dump_nmemsegs;
1120           error = dump_header_addbytes(&cpuhdr, ALIGN(sizeof(cpuhdr)));
1121           if (error) {
1122                     printf("[MD header %zu bytes failed, error=%d] ",
1123                         ALIGN(sizeof(cpuhdr)), error);
1124                     /* blithely proceed (can't fail?) */
1125           }
1126 
1127           /*
1128            * Write out the memory segment descriptors.
1129            */
1130           return dump_seg_iter(dump_header_addseg);
1131 }
1132 
1133 /*
1134  * Doadump comes here after turning off memory management and
1135  * getting on the dump stack, either when called above, or by
1136  * the auto-restart code.
1137  */
1138 #define BYTES_PER_DUMP  PAGE_SIZE /* must be a multiple of pagesize XXX small */
1139 static vaddr_t dumpspace;
1140 
1141 vaddr_t
reserve_dumppages(vaddr_t p)1142 reserve_dumppages(vaddr_t p)
1143 {
1144 
1145           dumpspace = p;
1146           return (p + BYTES_PER_DUMP);
1147 }
1148 
1149 int
dumpsys_seg(paddr_t maddr,paddr_t bytes)1150 dumpsys_seg(paddr_t maddr, paddr_t bytes)
1151 {
1152           u_long i, m, n;
1153           daddr_t blkno;
1154           const struct bdevsw *bdev;
1155           int (*dump)(dev_t, daddr_t, void *, size_t);
1156           int error;
1157 
1158           if (dumpdev == NODEV)
1159                     return ENODEV;
1160           bdev = bdevsw_lookup(dumpdev);
1161           if (bdev == NULL || bdev->d_psize == NULL)
1162                     return ENODEV;
1163 
1164           dump = bdev->d_dump;
1165 
1166           blkno = dump_header_blkno;
1167           for (i = 0; i < bytes; i += n, dump_totalbytesleft -= n) {
1168                     /* Print out how many MBs we have left to go. */
1169                     if ((dump_totalbytesleft % (1024*1024)) == 0)
1170                               printf_nolog("%lu ", (unsigned long)
1171                                   (dump_totalbytesleft / (1024 * 1024)));
1172 
1173                     /* Limit size for next transfer. */
1174                     n = bytes - i;
1175                     if (n > BYTES_PER_DUMP)
1176                               n = BYTES_PER_DUMP;
1177 
1178                     for (m = 0; m < n; m += NBPG)
1179                               pmap_kenter_pa(dumpspace + m, maddr + m,
1180                                   VM_PROT_READ, 0);
1181                     pmap_update(pmap_kernel());
1182 
1183                     error = (*dump)(dumpdev, blkno, (void *)dumpspace, n);
1184                     pmap_kremove_local(dumpspace, n);
1185                     if (error)
1186                               return error;
1187                     maddr += n;
1188                     blkno += btodb(n);            /* XXX? */
1189 
1190 #if 0     /* XXX this doesn't work.  grr. */
1191                     /* operator aborting dump? */
1192                     if (sget() != NULL)
1193                               return EINTR;
1194 #endif
1195           }
1196           dump_header_blkno = blkno;
1197 
1198           return 0;
1199 }
1200 
1201 void
dodumpsys(void)1202 dodumpsys(void)
1203 {
1204           const struct bdevsw *bdev;
1205           int dumpend, psize;
1206           int error;
1207 
1208           if (dumpdev == NODEV)
1209                     return;
1210 
1211           bdev = bdevsw_lookup(dumpdev);
1212           if (bdev == NULL || bdev->d_psize == NULL)
1213                     return;
1214           /*
1215            * For dumps during autoconfiguration,
1216            * if dump device has already configured...
1217            */
1218           if (dumpsize == 0)
1219                     cpu_dumpconf();
1220 
1221           printf("\ndumping to dev %llu,%llu (offset=%ld, size=%d):",
1222               (unsigned long long)major(dumpdev),
1223               (unsigned long long)minor(dumpdev), dumplo, dumpsize);
1224 
1225           if (dumplo <= 0 || dumpsize <= 0) {
1226                     printf(" not possible\n");
1227                     return;
1228           }
1229 
1230           psize = bdev_size(dumpdev);
1231           printf("\ndump ");
1232           if (psize == -1) {
1233                     printf("area unavailable\n");
1234                     return;
1235           }
1236 
1237 #if 0     /* XXX this doesn't work.  grr. */
1238           /* toss any characters present prior to dump */
1239           while (sget() != NULL); /*syscons and pccons differ */
1240 #endif
1241 
1242           dump_seg_prep();
1243           dumpend = dumplo + btodb(dump_header_size) + ctod(dump_npages);
1244           if (dumpend > psize) {
1245                     printf("failed: insufficient space (%d < %d)\n",
1246                         psize, dumpend);
1247                     goto failed;
1248           }
1249 
1250           dump_header_start();
1251           if ((error = cpu_dump()) != 0)
1252                     goto err;
1253           if ((error = dump_header_finish()) != 0)
1254                     goto err;
1255 
1256           if (dump_header_blkno != dumplo + btodb(dump_header_size)) {
1257                     printf("BAD header size (%ld [written] != %ld [expected])\n",
1258                         (long)(dump_header_blkno - dumplo),
1259                         (long)btodb(dump_header_size));
1260                     goto failed;
1261           }
1262 
1263           dump_totalbytesleft = roundup(ptoa(dump_npages), BYTES_PER_DUMP);
1264           error = dump_seg_iter(dumpsys_seg);
1265 
1266           if (error == 0 && dump_header_blkno != dumpend) {
1267                     printf("BAD dump size (%ld [written] != %ld [expected])\n",
1268                         (long)(dumpend - dumplo),
1269                         (long)(dump_header_blkno - dumplo));
1270                     goto failed;
1271           }
1272 
1273 err:
1274           switch (error) {
1275 
1276           case ENXIO:
1277                     printf("device bad\n");
1278                     break;
1279 
1280           case EFAULT:
1281                     printf("device not ready\n");
1282                     break;
1283 
1284           case EINVAL:
1285                     printf("area improper\n");
1286                     break;
1287 
1288           case EIO:
1289                     printf("i/o error\n");
1290                     break;
1291 
1292           case EINTR:
1293                     printf("aborted from console\n");
1294                     break;
1295 
1296           case 0:
1297                     printf("succeeded\n");
1298                     break;
1299 
1300           default:
1301                     printf("error %d\n", error);
1302                     break;
1303           }
1304 failed:
1305           printf("\n\n");
1306           delay(5000000);               /* 5 seconds */
1307 }
1308 
1309 /*
1310  * This is called by main to set dumplo and dumpsize.
1311  * Dumps always skip the first PAGE_SIZE of disk space
1312  * in case there might be a disk label stored there.
1313  * If there is extra space, put dump at the end to
1314  * reduce the chance that swapping trashes it.
1315  *
1316  * Sparse dumps can't placed as close to the end as possible, because
1317  * savecore(8) has to know where to start reading in the dump device
1318  * before it has access to any of the crashed system's state.
1319  *
1320  * Note also that a sparse dump will never be larger than a full one:
1321  * in order to add a phys_ram_seg_t to the header, at least one page
1322  * must be removed.
1323  */
1324 void
cpu_dumpconf(void)1325 cpu_dumpconf(void)
1326 {
1327           int nblks, dumpblks;          /* size of dump area */
1328 
1329           if (dumpdev == NODEV)
1330                     goto bad;
1331           nblks = bdev_size(dumpdev);
1332           if (nblks <= ctod(1))
1333                     goto bad;
1334 
1335           dumpblks = cpu_dumpsize();
1336           if (dumpblks < 0)
1337                     goto bad;
1338 
1339           /* dumpsize is in page units, and doesn't include headers. */
1340           dumpsize = cpu_dump_mempagecnt();
1341 
1342           dumpblks += ctod(dumpsize);
1343 
1344           /* If dump won't fit (incl. room for possible label), punt. */
1345           if (dumpblks > (nblks - ctod(1))) {
1346 #ifndef NO_SPARSE_DUMP
1347                     /* A sparse dump might (and hopefully will) fit. */
1348                     dumplo = ctod(1);
1349 #else
1350                     /* But if we're not configured for that, punt. */
1351                     goto bad;
1352 #endif
1353           } else {
1354                     /* Put dump at end of partition */
1355                     dumplo = nblks - dumpblks;
1356           }
1357 
1358 
1359           /* Now that we've decided this will work, init ancillary stuff. */
1360           dump_misc_init();
1361           return;
1362 
1363  bad:
1364           dumpsize = 0;
1365 }
1366 
1367 /*
1368  * Clear registers on exec
1369  */
1370 void
setregs(struct lwp * l,struct exec_package * pack,vaddr_t stack)1371 setregs(struct lwp *l, struct exec_package *pack, vaddr_t stack)
1372 {
1373           struct pcb *pcb = lwp_getpcb(l);
1374           struct trapframe *tf;
1375 
1376 #ifdef USER_LDT
1377           pmap_ldt_cleanup(l);
1378 #endif
1379 
1380           fpu_clear(l, pack->ep_osversion >= 699002600
1381               ? __NetBSD_NPXCW__ : __NetBSD_COMPAT_NPXCW__);
1382           x86_dbregs_clear(l);
1383 
1384           kpreempt_disable();
1385           pcb->pcb_flags = 0;
1386           l->l_proc->p_flag &= ~PK_32;
1387           l->l_md.md_flags = MDL_IRET;
1388           cpu_segregs64_zero(l);
1389           kpreempt_enable();
1390 
1391           tf = l->l_md.md_regs;
1392           memset(tf, 0, sizeof(*tf));
1393 
1394           tf->tf_trapno = T_ASTFLT;
1395           tf->tf_ds = GSEL(GUDATA_SEL, SEL_UPL);
1396           tf->tf_es = GSEL(GUDATA_SEL, SEL_UPL);
1397           tf->tf_rdi = 0;
1398           tf->tf_rsi = 0;
1399           tf->tf_rbp = 0;
1400           tf->tf_rbx = l->l_proc->p_psstrp;
1401           tf->tf_rdx = 0;
1402           tf->tf_rcx = 0;
1403           tf->tf_rax = 0;
1404           tf->tf_rip = pack->ep_entry;
1405           tf->tf_cs = LSEL(LUCODE_SEL, SEL_UPL);
1406           tf->tf_rflags = PSL_USERSET;
1407           tf->tf_rsp = stack;
1408           tf->tf_ss = LSEL(LUDATA_SEL, SEL_UPL);
1409 }
1410 
/*
 * Segment and descriptor table initialization.
 */
char *gdtstore;
char *ldtstore;
1416 
1417 void
setgate(struct gate_descriptor * gd,void * func,int ist,int type,int dpl,int sel)1418 setgate(struct gate_descriptor *gd, void *func,
1419     int ist, int type, int dpl, int sel)
1420 {
1421           vaddr_t vaddr;
1422 
1423           vaddr = ((vaddr_t)gd) & ~PAGE_MASK;
1424 
1425           kpreempt_disable();
1426           pmap_changeprot_local(vaddr, VM_PROT_READ|VM_PROT_WRITE);
1427 
1428           gd->gd_looffset = (uint64_t)func & 0xffff;
1429           gd->gd_selector = sel;
1430           gd->gd_ist = ist;
1431           gd->gd_type = type;
1432           gd->gd_dpl = dpl;
1433           gd->gd_p = 1;
1434           gd->gd_hioffset = (uint64_t)func >> 16;
1435           gd->gd_zero = 0;
1436           gd->gd_xx1 = 0;
1437           gd->gd_xx2 = 0;
1438           gd->gd_xx3 = 0;
1439 
1440           pmap_changeprot_local(vaddr, VM_PROT_READ);
1441           kpreempt_enable();
1442 }
1443 
1444 void
unsetgate(struct gate_descriptor * gd)1445 unsetgate(struct gate_descriptor *gd)
1446 {
1447           vaddr_t vaddr;
1448 
1449           vaddr = ((vaddr_t)gd) & ~PAGE_MASK;
1450 
1451           kpreempt_disable();
1452           pmap_changeprot_local(vaddr, VM_PROT_READ|VM_PROT_WRITE);
1453 
1454           memset(gd, 0, sizeof (*gd));
1455 
1456           pmap_changeprot_local(vaddr, VM_PROT_READ);
1457           kpreempt_enable();
1458 }
1459 
1460 void
setregion(struct region_descriptor * rd,void * base,uint16_t limit)1461 setregion(struct region_descriptor *rd, void *base, uint16_t limit)
1462 {
1463           rd->rd_limit = limit;
1464           rd->rd_base = (uint64_t)base;
1465 }
1466 
1467 /*
1468  * Note that the base and limit fields are ignored in long mode.
1469  */
1470 void
set_mem_segment(struct mem_segment_descriptor * sd,void * base,size_t limit,int type,int dpl,int gran,int def32,int is64)1471 set_mem_segment(struct mem_segment_descriptor *sd, void *base, size_t limit,
1472           int type, int dpl, int gran, int def32, int is64)
1473 {
1474           sd->sd_lolimit = (unsigned)limit;
1475           sd->sd_lobase = (unsigned long)base;
1476           sd->sd_type = type;
1477           sd->sd_dpl = dpl;
1478           sd->sd_p = 1;
1479           sd->sd_hilimit = (unsigned)limit >> 16;
1480           sd->sd_avl = 0;
1481           sd->sd_long = is64;
1482           sd->sd_def32 = def32;
1483           sd->sd_gran = gran;
1484           sd->sd_hibase = (unsigned long)base >> 24;
1485 }
1486 
1487 void
set_sys_segment(struct sys_segment_descriptor * sd,void * base,size_t limit,int type,int dpl,int gran)1488 set_sys_segment(struct sys_segment_descriptor *sd, void *base, size_t limit,
1489           int type, int dpl, int gran)
1490 {
1491           memset(sd, 0, sizeof *sd);
1492           sd->sd_lolimit = (unsigned)limit;
1493           sd->sd_lobase = (uint64_t)base;
1494           sd->sd_type = type;
1495           sd->sd_dpl = dpl;
1496           sd->sd_p = 1;
1497           sd->sd_hilimit = (unsigned)limit >> 16;
1498           sd->sd_gran = gran;
1499           sd->sd_hibase = (uint64_t)base >> 24;
1500 }
1501 
1502 void
cpu_init_idt(struct cpu_info * ci)1503 cpu_init_idt(struct cpu_info *ci)
1504 {
1505           struct region_descriptor region;
1506           idt_descriptor_t *idt;
1507 
1508           idt = ci->ci_idtvec.iv_idt;
1509           setregion(&region, idt, NIDT * sizeof(idt[0]) - 1);
1510           lidt(&region);
1511 }
1512 
1513 #define   IDTVEC(name)        __CONCAT(X, name)
1514 typedef void (vector)(void);
1515 extern vector IDTVEC(syscall);
1516 extern vector IDTVEC(syscall32);
1517 extern vector IDTVEC(osyscall);
1518 extern vector *x86_exceptions[];
1519 
#ifndef XENPV
/*
 * Register the kernel symbol table with ksyms_addsyms_elf().  Compiled
 * to an empty function unless NKSYMS, DDB or MODULAR is configured.
 *
 * If the bootloader supplied a BTINFO_SYMTAB record, it is used;
 * otherwise the symbols are taken to sit right after `end', up to
 * `esym'.
 */
static void
init_x86_64_ksyms(void)
{
#if NKSYMS || defined(DDB) || defined(MODULAR)
	extern int end;
	extern int *esym;
	struct btinfo_symtab *symtab;
	vaddr_t tssym, tesym;

#ifdef DDB
	db_machine_init();
#endif

	symtab = lookup_bootinfo(BTINFO_SYMTAB);
	if (symtab == NULL) {
		/* No bootinfo record: look just past the kernel image. */
		uintptr_t endp = (uintptr_t)(void *)&end;
		long *symhdr = (long *)endp;

#ifdef XEN
		/*
		 * cpu_probe() / identify_hypervisor() overrides
		 * VM_GUEST_GENPVH, we can't rely on
		 * vm_guest == VM_GUEST_GENPVH
		 */
		if (pvh_boot && vm_guest != VM_GUEST_XENPVH) {
			/* The word at `end' is not read in this case. */
			ksyms_addsyms_elf(0, symhdr + 1, esym);
			return;
		}
#endif
		/* The word at `end' is passed as the first argument. */
		ksyms_addsyms_elf(*symhdr, symhdr + 1, esym);
		return;
	}

#ifdef KASLR
	tssym = bootspace.head.va;
	tesym = bootspace.head.va; /* (unused...) */
#else
	tssym = (vaddr_t)symtab->ssym + KERNBASE;
	tesym = (vaddr_t)symtab->esym + KERNBASE;
#endif
	ksyms_addsyms_elf(symtab->nsym, (void *)tssym, (void *)tesym);
#endif
}
#endif /* XENPV */
1560 
1561 void __noasan
init_bootspace(void)1562 init_bootspace(void)
1563 {
1564           extern char __rodata_start;
1565           extern char __data_start;
1566           extern char __kernel_end;
1567           size_t i = 0;
1568 
1569           memset(&bootspace, 0, sizeof(bootspace));
1570 
1571           bootspace.head.va = KERNTEXTOFF;
1572           bootspace.head.pa = KERNTEXTOFF - KERNBASE;
1573           bootspace.head.sz = 0;
1574 
1575           bootspace.segs[i].type = BTSEG_TEXT;
1576           bootspace.segs[i].va = KERNTEXTOFF;
1577           bootspace.segs[i].pa = KERNTEXTOFF - KERNBASE;
1578           bootspace.segs[i].sz = (size_t)&__rodata_start - KERNTEXTOFF;
1579           i++;
1580 
1581           bootspace.segs[i].type = BTSEG_RODATA;
1582           bootspace.segs[i].va = (vaddr_t)&__rodata_start;
1583           bootspace.segs[i].pa = (paddr_t)&__rodata_start - KERNBASE;
1584           bootspace.segs[i].sz = (size_t)&__data_start - (size_t)&__rodata_start;
1585           i++;
1586 
1587           bootspace.segs[i].type = BTSEG_DATA;
1588           bootspace.segs[i].va = (vaddr_t)&__data_start;
1589           bootspace.segs[i].pa = (paddr_t)&__data_start - KERNBASE;
1590           bootspace.segs[i].sz = (size_t)&__kernel_end - (size_t)&__data_start;
1591           i++;
1592 
1593           bootspace.boot.va = (vaddr_t)&__kernel_end;
1594           bootspace.boot.pa = (paddr_t)&__kernel_end - KERNBASE;
1595           bootspace.boot.sz = (size_t)(atdevbase + IOM_SIZE) -
1596               (size_t)&__kernel_end;
1597 
1598           /* In locore.S, we allocated a tmp va. We will use it now. */
1599           bootspace.spareva = KERNBASE + NKL2_KIMG_ENTRIES * NBPD_L2;
1600 
1601           /* Virtual address of the L4 page. */
1602           bootspace.pdir = (vaddr_t)(PDPpaddr + KERNBASE);
1603 
1604           /* Kernel module map. */
1605           bootspace.smodule = (vaddr_t)atdevbase + IOM_SIZE;
1606           bootspace.emodule = KERNBASE + NKL2_KIMG_ENTRIES * NBPD_L2;
1607 }
1608 
1609 static void
init_pte(void)1610 init_pte(void)
1611 {
1612 #ifndef XENPV
1613           extern uint32_t nox_flag;
1614           pd_entry_t *pdir = (pd_entry_t *)bootspace.pdir;
1615           pdir[L4_SLOT_PTE] = PDPpaddr | PTE_W | ((uint64_t)nox_flag << 32) |
1616               PTE_P;
1617 #endif
1618 
1619           extern pd_entry_t *normal_pdes[3];
1620           normal_pdes[0] = L2_BASE;
1621           normal_pdes[1] = L3_BASE;
1622           normal_pdes[2] = L4_BASE;
1623 }
1624 
1625 void
init_slotspace(void)1626 init_slotspace(void)
1627 {
1628           /*
1629            * XXX Too early to use cprng(9), or even entropy_extract.
1630            */
1631           struct entpool pool;
1632           size_t randhole;
1633           vaddr_t randva;
1634           uint64_t sample;
1635           vaddr_t va;
1636 
1637           memset(&pool, 0, sizeof pool);
1638           cpu_rng_early_sample(&sample);
1639           entpool_enter(&pool, &sample, sizeof sample);
1640 
1641           memset(&slotspace, 0, sizeof(slotspace));
1642 
1643           /* User. [256, because we want to land in >= 256] */
1644           slotspace.area[SLAREA_USER].sslot = 0;
1645           slotspace.area[SLAREA_USER].nslot = PDIR_SLOT_USERLIM+1;
1646           slotspace.area[SLAREA_USER].active = true;
1647 
1648 #ifdef XENPV
1649           /* PTE. */
1650           slotspace.area[SLAREA_PTE].sslot = PDIR_SLOT_PTE;
1651           slotspace.area[SLAREA_PTE].nslot = 1;
1652           slotspace.area[SLAREA_PTE].active = true;
1653 #endif
1654 
1655 #ifdef __HAVE_PCPU_AREA
1656           /* Per-CPU. */
1657           slotspace.area[SLAREA_PCPU].sslot = PDIR_SLOT_PCPU;
1658           slotspace.area[SLAREA_PCPU].nslot = 1;
1659           slotspace.area[SLAREA_PCPU].active = true;
1660 #endif
1661 
1662 #ifdef __HAVE_DIRECT_MAP
1663           /* Direct Map. [Randomized later] */
1664           slotspace.area[SLAREA_DMAP].active = false;
1665 #endif
1666 
1667 #ifdef XENPV
1668           /* Hypervisor. */
1669           slotspace.area[SLAREA_HYPV].sslot = 256;
1670           slotspace.area[SLAREA_HYPV].nslot = 17;
1671           slotspace.area[SLAREA_HYPV].active = true;
1672 #endif
1673 
1674 #ifdef KASAN
1675           /* ASAN. */
1676           slotspace.area[SLAREA_ASAN].sslot = L4_SLOT_KASAN;
1677           slotspace.area[SLAREA_ASAN].nslot = NL4_SLOT_KASAN;
1678           slotspace.area[SLAREA_ASAN].active = true;
1679 #endif
1680 
1681 #ifdef KMSAN
1682           /* MSAN. */
1683           slotspace.area[SLAREA_MSAN].sslot = L4_SLOT_KMSAN;
1684           slotspace.area[SLAREA_MSAN].nslot = NL4_SLOT_KMSAN;
1685           slotspace.area[SLAREA_MSAN].active = true;
1686 #endif
1687 
1688           /* Kernel. */
1689           slotspace.area[SLAREA_KERN].sslot = L4_SLOT_KERNBASE;
1690           slotspace.area[SLAREA_KERN].nslot = 1;
1691           slotspace.area[SLAREA_KERN].active = true;
1692 
1693           /* Main. */
1694           cpu_rng_early_sample(&sample);
1695           entpool_enter(&pool, &sample, sizeof sample);
1696           entpool_extract(&pool, &randhole, sizeof randhole);
1697           entpool_extract(&pool, &randva, sizeof randva);
1698           va = slotspace_rand(SLAREA_MAIN, NKL4_MAX_ENTRIES * NBPD_L4,
1699               NBPD_L4, randhole, randva); /* TODO: NBPD_L1 */
1700           vm_min_kernel_address = va;
1701           vm_max_kernel_address = va + NKL4_MAX_ENTRIES * NBPD_L4;
1702 
1703 #ifndef XENPV
1704           /* PTE. */
1705           cpu_rng_early_sample(&sample);
1706           entpool_enter(&pool, &sample, sizeof sample);
1707           entpool_extract(&pool, &randhole, sizeof randhole);
1708           entpool_extract(&pool, &randva, sizeof randva);
1709           va = slotspace_rand(SLAREA_PTE, NBPD_L4, NBPD_L4, randhole, randva);
1710           pte_base = (pd_entry_t *)va;
1711 #endif
1712 
1713           explicit_memset(&pool, 0, sizeof pool);
1714 }
1715 
1716 void
init_x86_64(paddr_t first_avail)1717 init_x86_64(paddr_t first_avail)
1718 {
1719           extern void consinit(void);
1720           struct region_descriptor region;
1721           struct mem_segment_descriptor *ldt_segp;
1722           struct idt_vec *iv;
1723           idt_descriptor_t *idt;
1724           int x;
1725           struct pcb *pcb;
1726           extern vaddr_t lwp0uarea;
1727 #ifndef XENPV
1728           extern paddr_t local_apic_pa;
1729 #endif
1730 
1731           KASSERT(first_avail % PAGE_SIZE == 0);
1732 
1733 #ifdef XENPV
1734           KASSERT(HYPERVISOR_shared_info != NULL);
1735           cpu_info_primary.ci_vcpu = &HYPERVISOR_shared_info->vcpu_info[0];
1736 #endif
1737 
1738 #ifdef XEN
1739           if (pvh_boot)
1740                     xen_parse_cmdline(XEN_PARSE_BOOTFLAGS, NULL);
1741 #endif
1742           init_pte();
1743 
1744           uvm_lwp_setuarea(&lwp0, lwp0uarea);
1745 
1746           cpu_probe(&cpu_info_primary);
1747 #ifdef SVS
1748           svs_init();
1749 #endif
1750 
1751           /*
1752            * Initialize MSRs on cpu0:
1753            *
1754            * - Enables SYSCALL/SYSRET.
1755            *
1756            * - Sets up %fs and %gs so that %gs points to the current
1757            *   struct cpu_info as needed for CPUVAR(...), curcpu(), and
1758            *   curlwp.
1759            *
1760            * - Enables the no-execute bit if supported.
1761            *
1762            * Thus, after this point, CPUVAR(...), curcpu(), and curlwp
1763            * will work on cpu0.
1764            *
1765            * Note: The call to cpu_init_msrs for secondary CPUs happens
1766            * in cpu_hatch.
1767            */
1768           cpu_init_msrs(&cpu_info_primary, true);
1769 
1770 #ifndef XENPV
1771           cpu_speculation_init(&cpu_info_primary);
1772 #endif
1773 
1774           use_pae = 1; /* PAE always enabled in long mode */
1775 
1776           pcb = lwp_getpcb(&lwp0);
1777 #ifdef XENPV
1778           mutex_init(&pte_lock, MUTEX_DEFAULT, IPL_VM);
1779           pcb->pcb_cr3 = xen_start_info.pt_base - KERNBASE;
1780 #else
1781           pcb->pcb_cr3 = PDPpaddr;
1782 #endif
1783 
1784 #if NISA > 0 || NPCI > 0
1785           x86_bus_space_init();
1786 #endif
1787 
1788           pat_init(&cpu_info_primary);
1789 
1790           consinit();         /* XXX SHOULD NOT BE DONE HERE */
1791 
1792           /*
1793            * Initialize PAGE_SIZE-dependent variables.
1794            */
1795           uvm_md_init();
1796 
1797           uvmexp.ncolors = 2;
1798 
1799           avail_start = first_avail;
1800 
1801 #ifndef XENPV
1802           /*
1803            * Low memory reservations:
1804            * Page 0:          BIOS data
1805            * Page 1:          BIOS callback (not used yet, for symmetry with i386)
1806            * Page 2:          MP bootstrap code (MP_TRAMPOLINE)
1807            * Page 3:          ACPI wakeup code (ACPI_WAKEUP_ADDR)
1808            * Page 4:          Temporary page table for 0MB-4MB
1809            * Page 5:          Temporary page directory
1810            * Page 6:          Temporary page map level 3
1811            * Page 7:          Temporary page map level 4
1812            */
1813           lowmem_rsvd = 8 * PAGE_SIZE;
1814 
1815           /* Initialize the memory clusters (needed in pmap_bootstrap). */
1816           init_x86_clusters();
1817 #else
1818           /* Parse Xen command line (replace bootinfo) */
1819           xen_parse_cmdline(XEN_PARSE_BOOTFLAGS, NULL);
1820 
1821           avail_end = ctob(xen_start_info.nr_pages);
1822           pmap_pa_start = (KERNTEXTOFF - KERNBASE);
1823           pmap_pa_end = avail_end;
1824 #endif
1825 
1826           /*
1827            * Call pmap initialization to make new kernel address space.
1828            * We must do this before loading pages into the VM system.
1829            */
1830           pmap_bootstrap(VM_MIN_KERNEL_ADDRESS);
1831 
1832           /*
1833            * Initialize RNG to get entropy ASAP either from CPU
1834            * RDRAND/RDSEED or from seed on disk.  Constraints:
1835            *
1836            * - Must happen after cpu_init_msrs so that curcpu() and
1837            *   curlwp work.
1838            *
1839            * - Must happen after consinit so we have the opportunity to
1840            *   print useful feedback.
1841            *
1842            * - On KASLR kernels, must happen after pmap_bootstrap because
1843            *   x86_rndseed requires access to the direct map.
1844            */
1845           cpu_rng_init();
1846           x86_rndseed();
1847 
1848 #ifndef XENPV
1849           /* Internalize the physical pages into the VM system. */
1850           init_x86_vm(avail_start);
1851 #else
1852           physmem = xen_start_info.nr_pages;
1853           uvm_page_physload(atop(avail_start), atop(avail_end),
1854               atop(avail_start), atop(avail_end), VM_FREELIST_DEFAULT);
1855 #endif
1856 
1857           init_x86_msgbuf();
1858 
1859           kasan_init();
1860           kcsan_init();
1861           kmsan_init((void *)lwp0uarea);
1862 
1863           pmap_growkernel(VM_MIN_KERNEL_ADDRESS + 32 * 1024 * 1024);
1864 
1865           kpreempt_disable();
1866 
1867 #ifndef XENPV
1868           pmap_kenter_pa(local_apic_va, local_apic_pa,
1869               VM_PROT_READ|VM_PROT_WRITE, 0);
1870           pmap_update(pmap_kernel());
1871           memset((void *)local_apic_va, 0, PAGE_SIZE);
1872 #endif
1873 
1874           pmap_kenter_pa(idt_vaddr, idt_paddr, VM_PROT_READ|VM_PROT_WRITE, 0);
1875           pmap_kenter_pa(gdt_vaddr, gdt_paddr, VM_PROT_READ|VM_PROT_WRITE, 0);
1876           pmap_kenter_pa(ldt_vaddr, ldt_paddr, VM_PROT_READ|VM_PROT_WRITE, 0);
1877           pmap_update(pmap_kernel());
1878           memset((void *)idt_vaddr, 0, PAGE_SIZE);
1879           memset((void *)gdt_vaddr, 0, PAGE_SIZE);
1880           memset((void *)ldt_vaddr, 0, PAGE_SIZE);
1881 
1882 #ifndef XENPV
1883           pmap_changeprot_local(idt_vaddr, VM_PROT_READ);
1884 #endif
1885 
1886           pmap_update(pmap_kernel());
1887 
1888           iv = &(cpu_info_primary.ci_idtvec);
1889           idt_vec_init_cpu_md(iv, cpu_index(&cpu_info_primary));
1890           idt = iv->iv_idt;
1891           gdtstore = (char *)gdt_vaddr;
1892           ldtstore = (char *)ldt_vaddr;
1893 
1894           /*
1895            * Make GDT gates and memory segments.
1896            */
1897           set_mem_segment(GDT_ADDR_MEM(gdtstore, GCODE_SEL), 0,
1898               0xfffff, SDT_MEMERA, SEL_KPL, 1, 0, 1);
1899 
1900           set_mem_segment(GDT_ADDR_MEM(gdtstore, GDATA_SEL), 0,
1901               0xfffff, SDT_MEMRWA, SEL_KPL, 1, 0, 1);
1902 
1903           set_mem_segment(GDT_ADDR_MEM(gdtstore, GUCODE_SEL), 0,
1904               x86_btop(VM_MAXUSER_ADDRESS) - 1, SDT_MEMERA, SEL_UPL, 1, 0, 1);
1905 
1906           set_mem_segment(GDT_ADDR_MEM(gdtstore, GUDATA_SEL), 0,
1907               x86_btop(VM_MAXUSER_ADDRESS) - 1, SDT_MEMRWA, SEL_UPL, 1, 0, 1);
1908 
1909 #ifndef XENPV
1910           set_sys_segment(GDT_ADDR_SYS(gdtstore, GLDT_SEL), ldtstore,
1911               LDT_SIZE - 1, SDT_SYSLDT, SEL_KPL, 0);
1912 #endif
1913 
1914           /*
1915            * Make LDT memory segments.
1916            */
1917           *(struct mem_segment_descriptor *)(ldtstore + LUCODE_SEL) =
1918               *GDT_ADDR_MEM(gdtstore, GUCODE_SEL);
1919           *(struct mem_segment_descriptor *)(ldtstore + LUDATA_SEL) =
1920               *GDT_ADDR_MEM(gdtstore, GUDATA_SEL);
1921 
1922           /*
1923            * 32 bit GDT entries.
1924            */
1925           set_mem_segment(GDT_ADDR_MEM(gdtstore, GUCODE32_SEL), 0,
1926               x86_btop(VM_MAXUSER_ADDRESS32) - 1, SDT_MEMERA, SEL_UPL, 1, 1, 0);
1927 
1928           set_mem_segment(GDT_ADDR_MEM(gdtstore, GUDATA32_SEL), 0,
1929               x86_btop(VM_MAXUSER_ADDRESS32) - 1, SDT_MEMRWA, SEL_UPL, 1, 1, 0);
1930 
1931           set_mem_segment(GDT_ADDR_MEM(gdtstore, GUFS_SEL), 0,
1932               x86_btop(VM_MAXUSER_ADDRESS32) - 1, SDT_MEMRWA, SEL_UPL, 1, 1, 0);
1933 
1934           set_mem_segment(GDT_ADDR_MEM(gdtstore, GUGS_SEL), 0,
1935               x86_btop(VM_MAXUSER_ADDRESS32) - 1, SDT_MEMRWA, SEL_UPL, 1, 1, 0);
1936 
1937           /*
1938            * 32 bit LDT entries.
1939            */
1940           ldt_segp = (struct mem_segment_descriptor *)(ldtstore + LUCODE32_SEL);
1941           set_mem_segment(ldt_segp, 0, x86_btop(VM_MAXUSER_ADDRESS32) - 1,
1942               SDT_MEMERA, SEL_UPL, 1, 1, 0);
1943           ldt_segp = (struct mem_segment_descriptor *)(ldtstore + LUDATA32_SEL);
1944           set_mem_segment(ldt_segp, 0, x86_btop(VM_MAXUSER_ADDRESS32) - 1,
1945               SDT_MEMRWA, SEL_UPL, 1, 1, 0);
1946 
1947           /* CPU-specific IDT exceptions. */
1948           for (x = 0; x < NCPUIDT; x++) {
1949                     int sel, ist;
1950 
1951                     /* Reset to default. Special cases below */
1952                     sel = SEL_KPL;
1953                     ist = 0;
1954 
1955                     idt_vec_reserve(iv, x);
1956 
1957                     switch (x) {
1958                     case 1:   /* DB */
1959                               ist = 4;
1960                               break;
1961                     case 2:   /* NMI */
1962                               ist = 3;
1963                               break;
1964                     case 3:
1965                     case 4:
1966                               sel = SEL_UPL;
1967                               break;
1968                     case 8:   /* double fault */
1969                               ist = 2;
1970                               break;
1971 #ifdef XENPV
1972                     case 18: /* MCA */
1973                               sel |= 0x4; /* Auto EOI/mask */
1974                               break;
1975 #endif /* XENPV */
1976                     default:
1977                               break;
1978                     }
1979 
1980                     set_idtgate(&idt[x], x86_exceptions[x], ist, SDT_SYS386IGT,
1981                         sel, GSEL(GCODE_SEL, SEL_KPL));
1982           }
1983 
1984           /* new-style interrupt gate for syscalls */
1985           idt_vec_reserve(iv, 128);
1986           set_idtgate(&idt[128], &IDTVEC(osyscall), 0, SDT_SYS386IGT, SEL_UPL,
1987               GSEL(GCODE_SEL, SEL_KPL));
1988 
1989           kpreempt_enable();
1990 
1991           setregion(&region, gdtstore, DYNSEL_START - 1);
1992           lgdt(&region);
1993 
1994 #ifdef XENPV
1995           /* Init Xen callbacks and syscall handlers */
1996           if (HYPERVISOR_set_callbacks(
1997               (unsigned long) hypervisor_callback,
1998               (unsigned long) failsafe_callback,
1999               (unsigned long) Xsyscall))
2000                     panic("HYPERVISOR_set_callbacks() failed");
2001 #endif /* XENPV */
2002 
2003           cpu_init_idt(&cpu_info_primary);
2004 
2005 #ifdef XENPV
2006           xen_init_ksyms();
2007 #else /* XENPV */
2008 #ifdef XEN
2009           if (vm_guest == VM_GUEST_XENPVH)
2010                     xen_init_ksyms();
2011           else
2012 #endif /* XEN */
2013                     init_x86_64_ksyms();
2014 #endif /* XENPV */
2015 
2016 #ifndef XENPV
2017           intr_default_setup();
2018 #else
2019           events_default_setup();
2020 #endif
2021 
2022           splraise(IPL_HIGH);
2023           x86_enable_intr();
2024 
2025 #ifdef DDB
2026           if (boothowto & RB_KDB)
2027                     Debugger();
2028 #endif
2029 #ifdef KGDB
2030           kgdb_port_init();
2031           if (boothowto & RB_KDB) {
2032                     kgdb_debug_init = 1;
2033                     kgdb_connect(1);
2034           }
2035 #endif
2036 
2037           pcb->pcb_dbregs = NULL;
2038           x86_dbregs_init();
2039 }
2040 
2041 void
cpu_reset(void)2042 cpu_reset(void)
2043 {
2044 #ifndef XENPV
2045           idt_descriptor_t *idt;
2046           vaddr_t vaddr;
2047 
2048           idt = cpu_info_primary.ci_idtvec.iv_idt;
2049           vaddr = (vaddr_t)idt;
2050 #endif
2051 
2052           x86_disable_intr();
2053 
2054 #ifdef XENPV
2055           HYPERVISOR_reboot();
2056 #else
2057 
2058           x86_reset();
2059 
2060           /*
2061            * Try to cause a triple fault and watchdog reset by making the IDT
2062            * invalid and causing a fault.
2063            */
2064           kpreempt_disable();
2065           pmap_changeprot_local(vaddr, VM_PROT_READ|VM_PROT_WRITE);
2066           memset((void *)idt, 0, NIDT * sizeof(idt[0]));
2067           kpreempt_enable();
2068           breakpoint();
2069 
2070 #if 0
2071           /*
2072            * Try to cause a triple fault and watchdog reset by unmapping the
2073            * entire address space and doing a TLB flush.
2074            */
2075           memset((void *)PTD, 0, PAGE_SIZE);
2076           tlbflush();
2077 #endif
2078 #endif    /* XENPV */
2079 
2080           for (;;);
2081 }
2082 
2083 void
cpu_getmcontext(struct lwp * l,mcontext_t * mcp,unsigned int * flags)2084 cpu_getmcontext(struct lwp *l, mcontext_t *mcp, unsigned int *flags)
2085 {
2086           const struct trapframe *tf = l->l_md.md_regs;
2087           __greg_t ras_rip;
2088 
2089           mcp->__gregs[_REG_RDI] = tf->tf_rdi;
2090           mcp->__gregs[_REG_RSI] = tf->tf_rsi;
2091           mcp->__gregs[_REG_RDX] = tf->tf_rdx;
2092           mcp->__gregs[_REG_R10] = tf->tf_r10;
2093           mcp->__gregs[_REG_R8]  = tf->tf_r8;
2094           mcp->__gregs[_REG_R9]  = tf->tf_r9;
2095           /* argX not touched */
2096           mcp->__gregs[_REG_RCX] = tf->tf_rcx;
2097           mcp->__gregs[_REG_R11] = tf->tf_r11;
2098           mcp->__gregs[_REG_R12] = tf->tf_r12;
2099           mcp->__gregs[_REG_R13] = tf->tf_r13;
2100           mcp->__gregs[_REG_R14] = tf->tf_r14;
2101           mcp->__gregs[_REG_R15] = tf->tf_r15;
2102           mcp->__gregs[_REG_RBP] = tf->tf_rbp;
2103           mcp->__gregs[_REG_RBX] = tf->tf_rbx;
2104           mcp->__gregs[_REG_RAX] = tf->tf_rax;
2105           mcp->__gregs[_REG_GS]  = 0;
2106           mcp->__gregs[_REG_FS]  = 0;
2107           mcp->__gregs[_REG_ES]  = GSEL(GUDATA_SEL, SEL_UPL);
2108           mcp->__gregs[_REG_DS]  = GSEL(GUDATA_SEL, SEL_UPL);
2109           mcp->__gregs[_REG_TRAPNO] = tf->tf_trapno;
2110           mcp->__gregs[_REG_ERR] = tf->tf_err;
2111           mcp->__gregs[_REG_RIP] = tf->tf_rip;
2112           mcp->__gregs[_REG_CS]  = LSEL(LUCODE_SEL, SEL_UPL);
2113           mcp->__gregs[_REG_RFLAGS] = tf->tf_rflags;
2114           mcp->__gregs[_REG_RSP] = tf->tf_rsp;
2115           mcp->__gregs[_REG_SS]  = LSEL(LUDATA_SEL, SEL_UPL);
2116 
2117           if ((ras_rip = (__greg_t)ras_lookup(l->l_proc,
2118               (void *) mcp->__gregs[_REG_RIP])) != -1)
2119                     mcp->__gregs[_REG_RIP] = ras_rip;
2120 
2121           *flags |= _UC_CPU;
2122 
2123           mcp->_mc_tlsbase = (uintptr_t)l->l_private;
2124           *flags |= _UC_TLSBASE;
2125 
2126           process_read_fpregs_xmm(l, (struct fxsave *)&mcp->__fpregs);
2127           *flags |= _UC_FPU;
2128 }
2129 
2130 int
cpu_setmcontext(struct lwp * l,const mcontext_t * mcp,unsigned int flags)2131 cpu_setmcontext(struct lwp *l, const mcontext_t *mcp, unsigned int flags)
2132 {
2133           struct trapframe *tf = l->l_md.md_regs;
2134           const __greg_t *gr = mcp->__gregs;
2135           struct proc *p = l->l_proc;
2136           int error;
2137           int64_t rflags;
2138 
2139           CTASSERT(sizeof (mcontext_t) == 26 * 8 + 8 + 512);
2140 
2141           if ((flags & _UC_CPU) != 0) {
2142                     error = cpu_mcontext_validate(l, mcp);
2143                     if (error != 0)
2144                               return error;
2145 
2146                     tf->tf_rdi  = gr[_REG_RDI];
2147                     tf->tf_rsi  = gr[_REG_RSI];
2148                     tf->tf_rdx  = gr[_REG_RDX];
2149                     tf->tf_r10  = gr[_REG_R10];
2150                     tf->tf_r8   = gr[_REG_R8];
2151                     tf->tf_r9   = gr[_REG_R9];
2152                     /* argX not touched */
2153                     tf->tf_rcx  = gr[_REG_RCX];
2154                     tf->tf_r11  = gr[_REG_R11];
2155                     tf->tf_r12  = gr[_REG_R12];
2156                     tf->tf_r13  = gr[_REG_R13];
2157                     tf->tf_r14  = gr[_REG_R14];
2158                     tf->tf_r15  = gr[_REG_R15];
2159                     tf->tf_rbp  = gr[_REG_RBP];
2160                     tf->tf_rbx  = gr[_REG_RBX];
2161                     tf->tf_rax  = gr[_REG_RAX];
2162                     tf->tf_gs   = 0;
2163                     tf->tf_fs   = 0;
2164                     tf->tf_es   = GSEL(GUDATA_SEL, SEL_UPL);
2165                     tf->tf_ds   = GSEL(GUDATA_SEL, SEL_UPL);
2166                     /* trapno, err not touched */
2167                     tf->tf_rip  = gr[_REG_RIP];
2168                     tf->tf_cs   = LSEL(LUCODE_SEL, SEL_UPL);
2169                     rflags = tf->tf_rflags;
2170                     rflags &= ~PSL_USER;
2171                     tf->tf_rflags = rflags | (gr[_REG_RFLAGS] & PSL_USER);
2172                     tf->tf_rsp  = gr[_REG_RSP];
2173                     tf->tf_ss   = LSEL(LUDATA_SEL, SEL_UPL);
2174 
2175                     l->l_md.md_flags |= MDL_IRET;
2176           }
2177 
2178           if ((flags & _UC_FPU) != 0)
2179                     process_write_fpregs_xmm(l, (const struct fxsave *)&mcp->__fpregs);
2180 
2181           if ((flags & _UC_TLSBASE) != 0)
2182                     lwp_setprivate(l, (void *)(uintptr_t)mcp->_mc_tlsbase);
2183 
2184           mutex_enter(p->p_lock);
2185           if (flags & _UC_SETSTACK)
2186                     l->l_sigstk.ss_flags |= SS_ONSTACK;
2187           if (flags & _UC_CLRSTACK)
2188                     l->l_sigstk.ss_flags &= ~SS_ONSTACK;
2189           mutex_exit(p->p_lock);
2190 
2191           return 0;
2192 }
2193 
2194 int
cpu_mcontext_validate(struct lwp * l,const mcontext_t * mcp)2195 cpu_mcontext_validate(struct lwp *l, const mcontext_t *mcp)
2196 {
2197           struct proc *p __diagused = l->l_proc;
2198           struct trapframe *tf = l->l_md.md_regs;
2199           const __greg_t *gr;
2200           uint16_t sel;
2201 
2202           KASSERT((p->p_flag & PK_32) == 0);
2203           gr = mcp->__gregs;
2204 
2205           if (((gr[_REG_RFLAGS] ^ tf->tf_rflags) & PSL_USERSTATIC) != 0)
2206                     return EINVAL;
2207 
2208           sel = gr[_REG_ES] & 0xffff;
2209           if (sel != 0 && !VALID_USER_DSEL(sel))
2210                     return EINVAL;
2211 
2212           sel = gr[_REG_FS] & 0xffff;
2213           if (sel != 0 && !VALID_USER_DSEL(sel))
2214                     return EINVAL;
2215 
2216           sel = gr[_REG_GS] & 0xffff;
2217           if (sel != 0 && !VALID_USER_DSEL(sel))
2218                     return EINVAL;
2219 
2220           sel = gr[_REG_DS] & 0xffff;
2221           if (!VALID_USER_DSEL(sel))
2222                     return EINVAL;
2223 
2224 #ifndef XENPV
2225           sel = gr[_REG_SS] & 0xffff;
2226           if (!VALID_USER_DSEL(sel))
2227                     return EINVAL;
2228 
2229           sel = gr[_REG_CS] & 0xffff;
2230           if (!VALID_USER_CSEL(sel))
2231                     return EINVAL;
2232 #endif
2233 
2234           if (gr[_REG_RIP] >= VM_MAXUSER_ADDRESS)
2235                     return EINVAL;
2236 
2237           return 0;
2238 }
2239 
2240 int
mm_md_kernacc(void * ptr,vm_prot_t prot,bool * handled)2241 mm_md_kernacc(void *ptr, vm_prot_t prot, bool *handled)
2242 {
2243           const vaddr_t v = (vaddr_t)ptr;
2244           vaddr_t kva, kva_end;
2245           size_t i;
2246 
2247           kva = bootspace.head.va;
2248           kva_end = kva + bootspace.head.sz;
2249           if (v >= kva && v < kva_end) {
2250                     *handled = true;
2251                     return 0;
2252           }
2253 
2254           for (i = 0; i < BTSPACE_NSEGS; i++) {
2255                     kva = bootspace.segs[i].va;
2256                     kva_end = kva + bootspace.segs[i].sz;
2257                     if (v < kva || v >= kva_end)
2258                               continue;
2259                     *handled = true;
2260                     if (bootspace.segs[i].type == BTSEG_TEXT ||
2261                         bootspace.segs[i].type == BTSEG_RODATA) {
2262                               if (prot & VM_PROT_WRITE) {
2263                                         return EFAULT;
2264                               }
2265                     }
2266                     return 0;
2267           }
2268 
2269           kva = bootspace.boot.va;
2270           kva_end = kva + bootspace.boot.sz;
2271           if (v >= kva && v < kva_end) {
2272                     *handled = true;
2273                     return 0;
2274           }
2275 
2276           if (v >= bootspace.smodule && v < bootspace.emodule) {
2277                     *handled = true;
2278                     if (!uvm_map_checkprot(module_map, v, v + 1, prot)) {
2279                               return EFAULT;
2280                     }
2281           } else {
2282                     *handled = false;
2283           }
2284           return 0;
2285 }
2286 
2287 /*
2288  * Zero out a 64bit LWP's segments registers. Used when exec'ing a new
2289  * 64bit program.
2290  */
2291 void
cpu_segregs64_zero(struct lwp * l)2292 cpu_segregs64_zero(struct lwp *l)
2293 {
2294           struct trapframe * const tf = l->l_md.md_regs;
2295           struct pcb *pcb;
2296           uint64_t zero = 0;
2297 
2298           KASSERT(kpreempt_disabled());
2299           KASSERT((l->l_proc->p_flag & PK_32) == 0);
2300           KASSERT(l == curlwp);
2301 
2302           pcb = lwp_getpcb(l);
2303 
2304           tf->tf_fs = 0;
2305           tf->tf_gs = 0;
2306           setds(GSEL(GUDATA_SEL, SEL_UPL));
2307           setes(GSEL(GUDATA_SEL, SEL_UPL));
2308           setfs(0);
2309           setusergs(0);
2310 
2311 #ifndef XENPV
2312           wrmsr(MSR_FSBASE, 0);
2313           wrmsr(MSR_KERNELGSBASE, 0);
2314 #else
2315           HYPERVISOR_set_segment_base(SEGBASE_FS, 0);
2316           HYPERVISOR_set_segment_base(SEGBASE_GS_USER, 0);
2317 #endif
2318 
2319           pcb->pcb_fs = 0;
2320           pcb->pcb_gs = 0;
2321           update_descriptor(&curcpu()->ci_gdt[GUFS_SEL], &zero);
2322           update_descriptor(&curcpu()->ci_gdt[GUGS_SEL], &zero);
2323 }
2324 
2325 /*
2326  * Zero out a 32bit LWP's segments registers. Used when exec'ing a new
2327  * 32bit program.
2328  */
2329 void
cpu_segregs32_zero(struct lwp * l)2330 cpu_segregs32_zero(struct lwp *l)
2331 {
2332           struct trapframe * const tf = l->l_md.md_regs;
2333           struct pcb *pcb;
2334           uint64_t zero = 0;
2335 
2336           KASSERT(kpreempt_disabled());
2337           KASSERT(l->l_proc->p_flag & PK_32);
2338           KASSERT(l == curlwp);
2339 
2340           pcb = lwp_getpcb(l);
2341 
2342           tf->tf_fs = 0;
2343           tf->tf_gs = 0;
2344           setds(GSEL(GUDATA32_SEL, SEL_UPL));
2345           setes(GSEL(GUDATA32_SEL, SEL_UPL));
2346           setfs(0);
2347           setusergs(0);
2348           pcb->pcb_fs = 0;
2349           pcb->pcb_gs = 0;
2350           update_descriptor(&curcpu()->ci_gdt[GUFS_SEL], &zero);
2351           update_descriptor(&curcpu()->ci_gdt[GUGS_SEL], &zero);
2352 }
2353 
2354 /*
2355  * Load an LWP's TLS context, possibly changing the %fs and %gs selectors.
2356  * Used only for 32-bit processes.
2357  */
2358 void
cpu_fsgs_reload(struct lwp * l,int fssel,int gssel)2359 cpu_fsgs_reload(struct lwp *l, int fssel, int gssel)
2360 {
2361           struct trapframe *tf;
2362           struct pcb *pcb;
2363 
2364           KASSERT(l->l_proc->p_flag & PK_32);
2365           KASSERT(l == curlwp);
2366 
2367           tf = l->l_md.md_regs;
2368           fssel &= 0xFFFF;
2369           gssel &= 0xFFFF;
2370 
2371           pcb = lwp_getpcb(l);
2372           kpreempt_disable();
2373           update_descriptor(&curcpu()->ci_gdt[GUFS_SEL], &pcb->pcb_fs);
2374           update_descriptor(&curcpu()->ci_gdt[GUGS_SEL], &pcb->pcb_gs);
2375 
2376 #ifdef XENPV
2377           setusergs(gssel);
2378 #endif
2379 
2380           tf->tf_fs = fssel;
2381           tf->tf_gs = gssel;
2382           kpreempt_enable();
2383 }
2384 
2385 bool
mm_md_direct_mapped_io(void * addr,paddr_t * paddr)2386 mm_md_direct_mapped_io(void *addr, paddr_t *paddr)
2387 {
2388           vaddr_t va = (vaddr_t)addr;
2389 
2390 #ifdef __HAVE_DIRECT_MAP
2391           if (va >= PMAP_DIRECT_BASE && va < PMAP_DIRECT_END) {
2392                     *paddr = PMAP_DIRECT_UNMAP(va);
2393                     return true;
2394           }
2395 #else
2396           __USE(va);
2397 #endif
2398 
2399           return false;
2400 }
2401 
2402 bool
mm_md_direct_mapped_phys(paddr_t paddr,vaddr_t * vaddr)2403 mm_md_direct_mapped_phys(paddr_t paddr, vaddr_t *vaddr)
2404 {
2405 #ifdef __HAVE_DIRECT_MAP
2406           *vaddr = PMAP_DIRECT_MAP(paddr);
2407           return true;
2408 #else
2409           return false;
2410 #endif
2411 }
2412 
2413 static void
idt_vec_copy(struct idt_vec * dst,struct idt_vec * src)2414 idt_vec_copy(struct idt_vec *dst, struct idt_vec *src)
2415 {
2416           idt_descriptor_t *idt_dst;
2417 
2418           idt_dst = dst->iv_idt;
2419 
2420           kpreempt_disable();
2421           pmap_changeprot_local((vaddr_t)idt_dst, VM_PROT_READ|VM_PROT_WRITE);
2422 
2423           memcpy(idt_dst, src->iv_idt, PAGE_SIZE);
2424           memcpy(dst->iv_allocmap, src->iv_allocmap, sizeof(dst->iv_allocmap));
2425 
2426           pmap_changeprot_local((vaddr_t)idt_dst, VM_PROT_READ);
2427           kpreempt_enable();
2428 }
2429 
2430 void
idt_vec_init_cpu_md(struct idt_vec * iv,cpuid_t cid)2431 idt_vec_init_cpu_md(struct idt_vec *iv, cpuid_t cid)
2432 {
2433           vaddr_t va;
2434 
2435           if (cid != cpu_index(&cpu_info_primary) &&
2436               idt_vec_is_pcpu()) {
2437 #ifdef __HAVE_PCPU_AREA
2438                     va = (vaddr_t)&pcpuarea->ent[cid].idt;
2439 #else
2440                     struct vm_page *pg;
2441 
2442                     va = uvm_km_alloc(kernel_map, PAGE_SIZE, 0,
2443                         UVM_KMF_VAONLY);
2444                     pg = uvm_pagealloc(NULL, 0, NULL, UVM_PGA_ZERO);
2445                     if (pg == NULL) {
2446                               panic("failed to allocate a page for IDT");
2447                     }
2448                     pmap_kenter_pa(va, VM_PAGE_TO_PHYS(pg),
2449                         VM_PROT_READ|VM_PROT_WRITE, 0);
2450                     pmap_update(pmap_kernel());
2451 #endif
2452 
2453                     memset((void *)va, 0, PAGE_SIZE);
2454 #ifndef XENPV
2455                     pmap_changeprot_local(va, VM_PROT_READ);
2456 #endif
2457                     pmap_update(pmap_kernel());
2458 
2459                     iv->iv_idt = (void *)va;
2460                     idt_vec_copy(iv, &(cpu_info_primary.ci_idtvec));
2461           } else {
2462                     iv->iv_idt = (void *)idt_vaddr;
2463           }
2464 }
2465