1 /*        $NetBSD: x86_machdep.c,v 1.158 2025/04/30 05:15:08 imil Exp $         */
2 
3 /*-
4  * Copyright (c) 2002, 2006, 2007 YAMAMOTO Takashi,
5  * Copyright (c) 2005, 2008, 2009, 2019, 2023 The NetBSD Foundation, Inc.
6  * All rights reserved.
7  *
8  * This code is derived from software contributed to The NetBSD Foundation
9  * by Julio M. Merino Vidal, and Andrew Doran.
10  *
11  * Redistribution and use in source and binary forms, with or without
12  * modification, are permitted provided that the following conditions
13  * are met:
14  * 1. Redistributions of source code must retain the above copyright
15  *    notice, this list of conditions and the following disclaimer.
16  * 2. Redistributions in binary form must reproduce the above copyright
17  *    notice, this list of conditions and the following disclaimer in the
18  *    documentation and/or other materials provided with the distribution.
19  *
20  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
21  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
22  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
23  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
24  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
25  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
26  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
27  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
28  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
29  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
30  * POSSIBILITY OF SUCH DAMAGE.
31  */
32 
33 #include <sys/cdefs.h>
34 __KERNEL_RCSID(0, "$NetBSD: x86_machdep.c,v 1.158 2025/04/30 05:15:08 imil Exp $");
35 
36 #include "opt_modular.h"
37 #include "opt_physmem.h"
38 #include "opt_splash.h"
39 #include "opt_kaslr.h"
40 #include "opt_svs.h"
41 #include "opt_xen.h"
42 
43 #include <sys/types.h>
44 #include <sys/param.h>
45 #include <sys/systm.h>
46 #include <sys/kcore.h>
47 #include <sys/errno.h>
48 #include <sys/kauth.h>
49 #include <sys/mutex.h>
50 #include <sys/cpu.h>
51 #include <sys/intr.h>
52 #include <sys/atomic.h>
53 #include <sys/module.h>
54 #include <sys/sysctl.h>
55 #include <sys/extent.h>
56 #include <sys/rnd.h>
57 
58 #include <x86/bootspace.h>
59 #include <x86/cpuvar.h>
60 #include <x86/cputypes.h>
61 #include <x86/efi.h>
62 #include <x86/machdep.h>
63 #include <x86/nmi.h>
64 #include <x86/pio.h>
65 
66 #include <dev/splash/splash.h>
67 #include <dev/isa/isareg.h>
68 #include <dev/ic/i8042reg.h>
69 #include <dev/mm.h>
70 
71 #include <machine/bootinfo.h>
72 #include <machine/pmap_private.h>
73 #include <machine/vmparam.h>
74 
75 #include <uvm/uvm_extern.h>
76 
77 #include "tsc.h"
78 
79 #include "acpica.h"
80 #include "ioapic.h"
81 #include "lapic.h"
82 
83 #if NACPICA > 0
84 #include <dev/acpi/acpivar.h>
85 #endif
86 
87 #if NIOAPIC > 0 || NACPICA > 0
88 #include <machine/i82093var.h>
89 #endif
90 
91 #include "opt_md.h"
92 #if defined(MEMORY_DISK_HOOKS) && defined(MEMORY_DISK_DYNAMIC)
93 #include <dev/md.h>
94 #endif
95 
/* Idle-loop routine currently in use; installed by x86_cpu_idle_set(). */
void (*x86_cpu_idle)(void);
/*
 * True if a remote CPU that is idling has to be kicked for it to notice
 * a reschedule request (consulted by cpu_need_resched()).
 */
static bool x86_cpu_idle_ipi;
/* Name of the idle mechanism; exported as sysctl "idle-mechanism". */
static char x86_cpu_idle_text[16];

/*
 * NOTE(review): not referenced in this part of the file; presumably
 * gates user LDT support -- confirm against its users.
 */
static bool x86_user_ldt_enabled __read_mostly = false;
101 
102 #ifdef XEN
103 
104 #include <xen/xen.h>
105 #include <xen/hypervisor.h>
106 #endif
107 
/*
 * Hooks for the delay routine and for clock initialization.  They are
 * initialized to the i8254 implementations on native kernels and to the
 * Xen implementations on XENPV kernels.
 */
#ifndef XENPV
void (*delay_func)(unsigned int) = i8254_delay;
void (*x86_initclock_func)(void) = i8254_initclocks;
#else /* XENPV */
void (*delay_func)(unsigned int) = xen_delay;
void (*x86_initclock_func)(void) = xen_initclocks;
#endif
115 
116 
117 /* --------------------------------------------------------------------- */
118 
119 /*
120  * Main bootinfo structure.  This is filled in by the bootstrap process
121  * done in locore.S based on the information passed by the boot loader.
122  */
123 struct bootinfo bootinfo;
124 
125 /* --------------------------------------------------------------------- */
126 
/*
 * NOTE(review): presumably true when the kernel was booted through EFI;
 * it is not assigned in this part of the file -- confirm.
 */
bool bootmethod_efi;

/* NOTE(review): kauth listener handle; its registration is not shown here. */
static kauth_listener_t x86_listener;

/*
 * Physical memory bounds defined elsewhere.  avail_end is raised by
 * x86_add_cluster() and set by x86_fake_clusters() to the end of the
 * highest RAM cluster recorded.
 */
extern paddr_t lowmem_rsvd, avail_start, avail_end;

/*
 * NOTE(review): kernel message buffer bookkeeping (virtual address,
 * physical segments and segment count), presumably filled in by
 * init_x86_msgbuf() -- confirm.
 */
vaddr_t msgbuf_vaddr;

struct msgbuf_p_seg msgbuf_p_seg[VM_PHYSSEG_MAX];

unsigned int msgbuf_p_cnt = 0;

void init_x86_msgbuf(void);
140 
141 /*
142  * Given the type of a bootinfo entry, looks for a matching item inside
143  * the bootinfo structure.  If found, returns a pointer to it (which must
144  * then be casted to the appropriate bootinfo_* type); otherwise, returns
145  * NULL.
146  */
147 void *
lookup_bootinfo(int type)148 lookup_bootinfo(int type)
149 {
150           bool found;
151           int i;
152           struct btinfo_common *bic;
153 
154           bic = (struct btinfo_common *)(bootinfo.bi_data);
155           found = FALSE;
156           for (i = 0; i < bootinfo.bi_nentries && !found; i++) {
157                     if (bic->type == type)
158                               found = TRUE;
159                     else
160                               bic = (struct btinfo_common *)
161                                   ((uint8_t *)bic + bic->len);
162           }
163 
164           return found ? bic : NULL;
165 }
166 
167 #ifdef notyet
168 /*
169  * List the available bootinfo entries.
170  */
171 static const char *btinfo_str[] = {
172           BTINFO_STR
173 };
174 
175 void
aprint_bootinfo(void)176 aprint_bootinfo(void)
177 {
178           int i;
179           struct btinfo_common *bic;
180 
181           aprint_normal("bootinfo:");
182           bic = (struct btinfo_common *)(bootinfo.bi_data);
183           for (i = 0; i < bootinfo.bi_nentries; i++) {
184                     if (bic->type >= 0 && bic->type < __arraycount(btinfo_str))
185                               aprint_normal(" %s", btinfo_str[bic->type]);
186                     else
187                               aprint_normal(" %d", bic->type);
188                     bic = (struct btinfo_common *)
189                         ((uint8_t *)bic + bic->len);
190           }
191           aprint_normal("\n");
192 }
193 #endif
194 
195 /*
196  * mm_md_physacc: check if given pa is accessible.
197  */
198 int
mm_md_physacc(paddr_t pa,vm_prot_t prot)199 mm_md_physacc(paddr_t pa, vm_prot_t prot)
200 {
201           extern phys_ram_seg_t mem_clusters[VM_PHYSSEG_MAX];
202           extern int mem_cluster_cnt;
203           int i;
204 
205           for (i = 0; i < mem_cluster_cnt; i++) {
206                     const phys_ram_seg_t *seg = &mem_clusters[i];
207                     paddr_t lstart = seg->start;
208 
209                     if (lstart <= pa && pa - lstart <= seg->size) {
210                               return 0;
211                     }
212           }
213           return kauth_authorize_machdep(kauth_cred_get(),
214               KAUTH_MACHDEP_UNMANAGEDMEM, NULL, NULL, NULL, NULL);
215 }
216 
217 #ifdef MODULAR
218 /*
219  * Push any modules loaded by the boot loader.
220  */
221 void
module_init_md(void)222 module_init_md(void)
223 {
224           struct btinfo_modulelist *biml;
225           struct bi_modulelist_entry *bi, *bimax;
226 
227           biml = lookup_bootinfo(BTINFO_MODULELIST);
228           if (biml == NULL) {
229                     aprint_debug("No module info at boot\n");
230                     return;
231           }
232 
233           bi = (struct bi_modulelist_entry *)((uint8_t *)biml + sizeof(*biml));
234           bimax = bi + biml->num;
235           for (; bi < bimax; bi++) {
236                     switch (bi->type) {
237                     case BI_MODULE_ELF:
238                               aprint_debug("Prep module path=%s len=%d pa=%x\n",
239                                   bi->path, bi->len, bi->base);
240                               KASSERT(trunc_page(bi->base) == bi->base);
241                               module_prime(bi->path,
242 #ifdef KASLR
243                                   (void *)PMAP_DIRECT_MAP((uintptr_t)bi->base),
244 #else
245                                   (void *)((uintptr_t)bi->base + KERNBASE),
246 #endif
247                                   bi->len);
248                               break;
249                     case BI_MODULE_IMAGE:
250 #ifdef SPLASHSCREEN
251                               aprint_debug("Splash image path=%s len=%d pa=%x\n",
252                                   bi->path, bi->len, bi->base);
253                               KASSERT(trunc_page(bi->base) == bi->base);
254                               splash_setimage(
255 #ifdef KASLR
256                                   (void *)PMAP_DIRECT_MAP((uintptr_t)bi->base),
257 #else
258                                   (void *)((uintptr_t)bi->base + KERNBASE),
259 #endif
260                                   bi->len);
261 #endif
262                               break;
263                     case BI_MODULE_RND:
264                               /* handled in x86_rndseed */
265                               break;
266                     case BI_MODULE_FS:
267                               aprint_debug("File-system image path=%s len=%d pa=%x\n",
268                                   bi->path, bi->len, bi->base);
269                               KASSERT(trunc_page(bi->base) == bi->base);
270 #if defined(MEMORY_DISK_HOOKS) && defined(MEMORY_DISK_DYNAMIC)
271                               md_root_setconf(
272 #ifdef KASLR
273                                   (void *)PMAP_DIRECT_MAP((uintptr_t)bi->base),
274 #else
275                                   (void *)((uintptr_t)bi->base + KERNBASE),
276 #endif
277                                   bi->len);
278 #endif
279                               break;
280                     default:
281                               aprint_debug("Skipping non-ELF module\n");
282                               break;
283                     }
284           }
285 }
286 #endif    /* MODULAR */
287 
288 void
x86_rndseed(void)289 x86_rndseed(void)
290 {
291           struct btinfo_modulelist *biml;
292           struct bi_modulelist_entry *bi, *bimax;
293 
294           biml = lookup_bootinfo(BTINFO_MODULELIST);
295           if (biml == NULL) {
296                     aprint_debug("No module info at boot\n");
297                     return;
298           }
299 
300           bi = (struct bi_modulelist_entry *)((uint8_t *)biml + sizeof(*biml));
301           bimax = bi + biml->num;
302           for (; bi < bimax; bi++) {
303                     switch (bi->type) {
304                     case BI_MODULE_RND:
305                               aprint_debug("Random seed data path=%s len=%d pa=%x\n",
306                                              bi->path, bi->len, bi->base);
307                               KASSERT(trunc_page(bi->base) == bi->base);
308                               rnd_seed(
309 #ifdef KASLR
310                                   (void *)PMAP_DIRECT_MAP((uintptr_t)bi->base),
311 #else
312                                   (void *)((uintptr_t)bi->base + KERNBASE),
313 #endif
314                                    bi->len);
315                     }
316           }
317 }
318 
319 void
cpu_need_resched(struct cpu_info * ci,struct lwp * l,int flags)320 cpu_need_resched(struct cpu_info *ci, struct lwp *l, int flags)
321 {
322 
323           KASSERT(kpreempt_disabled());
324 
325           if ((flags & RESCHED_IDLE) != 0) {
326                     if ((flags & RESCHED_REMOTE) != 0 &&
327                         x86_cpu_idle_ipi != false) {
328                               cpu_kick(ci);
329                     }
330                     return;
331           }
332 
333 #ifdef __HAVE_PREEMPTION
334           if ((flags & RESCHED_KPREEMPT) != 0) {
335                     if ((flags & RESCHED_REMOTE) != 0) {
336 #ifdef XENPV
337                               xen_send_ipi(ci, XEN_IPI_KPREEMPT);
338 #else
339                               x86_send_ipi(ci, X86_IPI_KPREEMPT);
340 #endif
341                     } else {
342                               softint_trigger(1 << SIR_PREEMPT);
343                     }
344                     return;
345           }
346 #endif
347 
348           KASSERT((flags & RESCHED_UPREEMPT) != 0);
349           if ((flags & RESCHED_REMOTE) != 0) {
350                     cpu_kick(ci);
351           } else {
352                     aston(l);
353           }
354 }
355 
356 void
cpu_signotify(struct lwp * l)357 cpu_signotify(struct lwp *l)
358 {
359 
360           KASSERT(kpreempt_disabled());
361 
362           if (l->l_cpu != curcpu()) {
363                     cpu_kick(l->l_cpu);
364           } else {
365                     aston(l);
366           }
367 }
368 
369 void
cpu_need_proftick(struct lwp * l)370 cpu_need_proftick(struct lwp *l)
371 {
372 
373           KASSERT(kpreempt_disabled());
374           KASSERT(l->l_cpu == curcpu());
375 
376           l->l_pflag |= LP_OWEUPC;
377           aston(l);
378 }
379 
380 bool
cpu_intr_p(void)381 cpu_intr_p(void)
382 {
383           int idepth;
384           long pctr;
385           lwp_t *l;
386 
387           l = curlwp;
388           if (__predict_false(l->l_cpu == NULL)) {
389                     KASSERT(l == &lwp0);
390                     return false;
391           }
392           do {
393                     pctr = lwp_pctr();
394                     idepth = l->l_cpu->ci_idepth;
395           } while (__predict_false(pctr != lwp_pctr()));
396 
397           return idepth >= 0;
398 }
399 
400 #ifdef __HAVE_PREEMPTION
401 /*
402  * Called to check MD conditions that would prevent preemption, and to
403  * arrange for those conditions to be rechecked later.
404  */
405 bool
cpu_kpreempt_enter(uintptr_t where,int s)406 cpu_kpreempt_enter(uintptr_t where, int s)
407 {
408           struct pcb *pcb;
409           lwp_t *l;
410 
411           KASSERT(kpreempt_disabled());
412           l = curlwp;
413 
414           /*
415            * If SPL raised, can't go.  Note this implies that spin
416            * mutexes at IPL_NONE are _not_ valid to use.
417            */
418           if (s > IPL_PREEMPT) {
419                     softint_trigger(1 << SIR_PREEMPT);
420                     return false;
421           }
422 
423           /* Must save cr2 or it could be clobbered. */
424           pcb = lwp_getpcb(l);
425           pcb->pcb_cr2 = rcr2();
426 
427           return true;
428 }
429 
430 /*
431  * Called after returning from a kernel preemption, and called with
432  * preemption disabled.
433  */
434 void
cpu_kpreempt_exit(uintptr_t where)435 cpu_kpreempt_exit(uintptr_t where)
436 {
437           extern char x86_copyfunc_start, x86_copyfunc_end;
438           struct pcb *pcb;
439 
440           KASSERT(kpreempt_disabled());
441 
442           /*
443            * If we interrupted any of the copy functions we must reload
444            * the pmap when resuming, as they cannot tolerate it being
445            * swapped out.
446            */
447           if (where >= (uintptr_t)&x86_copyfunc_start &&
448               where < (uintptr_t)&x86_copyfunc_end) {
449                     pmap_load();
450           }
451 
452           /* Restore cr2 only after the pmap, as pmap_load can block. */
453           pcb = lwp_getpcb(curlwp);
454           lcr2(pcb->pcb_cr2);
455 }
456 
457 /*
458  * Return true if preemption is disabled for MD reasons.  Must be called
459  * with preemption disabled, and thus is only for diagnostic checks.
460  */
461 bool
cpu_kpreempt_disabled(void)462 cpu_kpreempt_disabled(void)
463 {
464 
465           return curcpu()->ci_ilevel > IPL_NONE;
466 }
467 #endif    /* __HAVE_PREEMPTION */
468 
469 SYSCTL_SETUP(sysctl_machdep_cpu_idle, "sysctl machdep cpu_idle")
470 {
471           const struct sysctlnode       *mnode, *node;
472 
473           sysctl_createv(NULL, 0, NULL, &mnode,
474               CTLFLAG_PERMANENT, CTLTYPE_NODE, "machdep", NULL,
475               NULL, 0, NULL, 0, CTL_MACHDEP, CTL_EOL);
476 
477           sysctl_createv(NULL, 0, &mnode, &node,
478                            CTLFLAG_PERMANENT, CTLTYPE_STRING, "idle-mechanism",
479                            SYSCTL_DESCR("Mechanism used for the idle loop."),
480                            NULL, 0, x86_cpu_idle_text, 0,
481                            CTL_CREATE, CTL_EOL);
482 }
483 
484 void
x86_cpu_idle_init(void)485 x86_cpu_idle_init(void)
486 {
487 
488 #ifndef XENPV
489           if ((cpu_feature[1] & CPUID2_MONITOR) == 0)
490                     x86_cpu_idle_set(x86_cpu_idle_halt, "halt", true);
491           else
492                     x86_cpu_idle_set(x86_cpu_idle_mwait, "mwait", false);
493 #else
494           x86_cpu_idle_set(x86_cpu_idle_xen, "xen", true);
495 #endif
496 }
497 
498 void
x86_cpu_idle_get(void (** func)(void),char * text,size_t len)499 x86_cpu_idle_get(void (**func)(void), char *text, size_t len)
500 {
501 
502           *func = x86_cpu_idle;
503 
504           (void)strlcpy(text, x86_cpu_idle_text, len);
505 }
506 
507 void
x86_cpu_idle_set(void (* func)(void),const char * text,bool ipi)508 x86_cpu_idle_set(void (*func)(void), const char *text, bool ipi)
509 {
510 
511           x86_cpu_idle = func;
512           x86_cpu_idle_ipi = ipi;
513 
514           (void)strlcpy(x86_cpu_idle_text, text, sizeof(x86_cpu_idle_text));
515 }
516 
517 #ifndef XENPV
518 
/* Convert a count of kilobytes, resp. megabytes, to bytes (as size_t). */
#define KBTOB(x)    ((size_t)(x) * 1024UL)
#define MBTOB(x)    ((size_t)(x) * 1024UL * 1024UL)
521 
522 static struct {
523           int freelist;
524           uint64_t limit;
525 } x86_freelists[VM_NFREELIST] = {
526           { VM_FREELIST_DEFAULT, 0 },
527 #ifdef VM_FREELIST_FIRST1T
528           /* 40-bit addresses needed for modern graphics. */
529           { VM_FREELIST_FIRST1T,        1ULL * 1024 * 1024 * 1024 * 1024 },
530 #endif
531 #ifdef VM_FREELIST_FIRST64G
532           /* 36-bit addresses needed for oldish graphics. */
533           { VM_FREELIST_FIRST64G, 64ULL * 1024 * 1024 * 1024 },
534 #endif
535 #ifdef VM_FREELIST_FIRST4G
536           /* 32-bit addresses needed for PCI 32-bit DMA and old graphics. */
537           { VM_FREELIST_FIRST4G,  4ULL * 1024 * 1024 * 1024 },
538 #endif
539           /* 30-bit addresses needed for ancient graphics. */
540           { VM_FREELIST_FIRST1G,        1ULL * 1024 * 1024 * 1024 },
541           /* 24-bit addresses needed for ISA DMA. */
542           { VM_FREELIST_FIRST16,        16 * 1024 * 1024 },
543 };
544 
545 int
x86_select_freelist(uint64_t maxaddr)546 x86_select_freelist(uint64_t maxaddr)
547 {
548           unsigned int i;
549 
550           if (avail_end <= maxaddr)
551                     return VM_NFREELIST;
552 
553           for (i = 0; i < __arraycount(x86_freelists); i++) {
554                     if ((x86_freelists[i].limit - 1) <= maxaddr)
555                               return x86_freelists[i].freelist;
556           }
557 
558           panic("no freelist for maximum address %"PRIx64, maxaddr);
559 }
560 
561 static int
x86_add_cluster(uint64_t seg_start,uint64_t seg_end,uint32_t type)562 x86_add_cluster(uint64_t seg_start, uint64_t seg_end, uint32_t type)
563 {
564           extern struct extent *iomem_ex;
565           const uint64_t endext = MAXIOMEM + 1;
566           uint64_t new_physmem = 0;
567           phys_ram_seg_t *cluster;
568           int i;
569 
570           if (seg_end > MAXPHYSMEM) {
571                     aprint_verbose("WARNING: skipping large memory map entry: "
572                         "0x%"PRIx64"/0x%"PRIx64"/0x%x\n",
573                         seg_start, (seg_end - seg_start), type);
574                     return 0;
575           }
576 
577           /*
578            * XXX: Chop the last page off the size so that it can fit in avail_end.
579            */
580           if (seg_end == MAXPHYSMEM)
581                     seg_end -= PAGE_SIZE;
582 
583           if (seg_end <= seg_start)
584                     return 0;
585 
586           for (i = 0; i < mem_cluster_cnt; i++) {
587                     cluster = &mem_clusters[i];
588                     if ((cluster->start == round_page(seg_start)) &&
589                         (cluster->size == trunc_page(seg_end) - cluster->start)) {
590 #ifdef DEBUG_MEMLOAD
591                               printf("WARNING: skipping duplicate segment entry\n");
592 #endif
593                               return 0;
594                     }
595           }
596 
597           /*
598            * This cluster is used by RAM. If it is included in the iomem extent,
599            * allocate it from there, so that we won't unintentionally reuse it
600            * later with extent_alloc_region. A way to avoid collision (with UVM
601            * for example).
602            *
603            * This is done before the addresses are page rounded just to make
604            * sure we get them all.
605            */
606           if (seg_start < endext) {
607                     uint64_t io_end;
608 
609                     if (seg_end > endext)
610                               io_end = endext;
611                     else
612                               io_end = seg_end;
613 
614                     if (iomem_ex != NULL && extent_alloc_region(iomem_ex, seg_start,
615                         io_end - seg_start, EX_NOWAIT)) {
616                               /* XXX What should we do? */
617                               printf("WARNING: CAN't ALLOCATE MEMORY SEGMENT "
618                                   "(0x%"PRIx64"/0x%"PRIx64"/0x%x) FROM "
619                                   "IOMEM EXTENT MAP!\n",
620                                   seg_start, seg_end - seg_start, type);
621                               return 0;
622                     }
623           }
624 
625           /* If it's not free memory, skip it. */
626           if (type != BIM_Memory)
627                     return 0;
628 
629           if (mem_cluster_cnt >= VM_PHYSSEG_MAX) {
630                     printf("WARNING: too many memory segments"
631                         "(increase VM_PHYSSEG_MAX)");
632                     return -1;
633           }
634 
635 #ifdef PHYSMEM_MAX_ADDR
636           if (seg_start >= MBTOB(PHYSMEM_MAX_ADDR))
637                     return 0;
638           if (seg_end > MBTOB(PHYSMEM_MAX_ADDR))
639                     seg_end = MBTOB(PHYSMEM_MAX_ADDR);
640 #endif
641 
642           seg_start = round_page(seg_start);
643           seg_end = trunc_page(seg_end);
644 
645           if (seg_start == seg_end)
646                     return 0;
647 
648           cluster = &mem_clusters[mem_cluster_cnt];
649           cluster->start = seg_start;
650           if (iomem_ex != NULL)
651                     new_physmem = physmem + atop(seg_end - seg_start);
652 
653 #ifdef PHYSMEM_MAX_SIZE
654           if (iomem_ex != NULL) {
655                     if (physmem >= atop(MBTOB(PHYSMEM_MAX_SIZE)))
656                               return 0;
657                     if (new_physmem > atop(MBTOB(PHYSMEM_MAX_SIZE))) {
658                               seg_end = seg_start + MBTOB(PHYSMEM_MAX_SIZE) - ptoa(physmem);
659                               new_physmem = atop(MBTOB(PHYSMEM_MAX_SIZE));
660                     }
661           }
662 #endif
663 
664           cluster->size = seg_end - seg_start;
665 
666           if (iomem_ex != NULL) {
667                     if (avail_end < seg_end)
668                               avail_end = seg_end;
669                     physmem = new_physmem;
670           }
671           mem_cluster_cnt++;
672 
673           return 0;
674 }
675 
676 static int
x86_parse_clusters(struct btinfo_memmap * bim)677 x86_parse_clusters(struct btinfo_memmap *bim)
678 {
679           uint64_t seg_start, seg_end;
680           uint64_t addr, size;
681           uint32_t type;
682           int x;
683 
684           KASSERT(bim != NULL);
685           KASSERT(bim->num > 0);
686 
687 #ifdef DEBUG_MEMLOAD
688           printf("MEMMAP: %s MEMORY MAP (%d ENTRIES):\n",
689               lookup_bootinfo(BTINFO_EFIMEMMAP) != NULL ? "UEFI" : "BIOS",
690               bim->num);
691 #endif
692 
693           for (x = 0; x < bim->num; x++) {
694                     addr = bim->entry[x].addr;
695                     size = bim->entry[x].size;
696                     type = bim->entry[x].type;
697 #ifdef DEBUG_MEMLOAD
698                     printf("MEMMAP: 0x%016" PRIx64 "-0x%016" PRIx64
699                         "\n\tsize=0x%016" PRIx64 ", type=%d(%s)\n",
700                         addr, addr + size - 1, size, type,
701                         (type == BIM_Memory) ?  "Memory" :
702                         (type == BIM_Reserved) ?  "Reserved" :
703                         (type == BIM_ACPI) ? "ACPI" :
704                         (type == BIM_NVS) ? "NVS" :
705                         (type == BIM_PMEM) ? "Persistent" :
706                         (type == BIM_PRAM) ? "Persistent (Legacy)" :
707                         "unknown");
708 #endif
709 
710                     /* If the segment is not memory, skip it. */
711                     switch (type) {
712                     case BIM_Memory:
713                     case BIM_ACPI:
714                     case BIM_NVS:
715                               break;
716                     default:
717                               continue;
718                     }
719 
720                     /* If the segment is smaller than a page, skip it. */
721                     if (size < PAGE_SIZE)
722                               continue;
723 
724                     seg_start = addr;
725                     seg_end = addr + size;
726 
727                     /*
728                      * XXX XXX: Avoid the ISA I/O MEM.
729                      *
730                      * Some laptops (for example, Toshiba Satellite2550X) report
731                      * this area as valid.
732                      */
733                     if (seg_start < IOM_END && seg_end > IOM_BEGIN) {
734                               printf("WARNING: memory map entry overlaps "
735                                   "with ``Compatibility Holes'': "
736                                   "0x%"PRIx64"/0x%"PRIx64"/0x%x\n", seg_start,
737                                   seg_end - seg_start, type);
738 
739                               if (x86_add_cluster(seg_start, IOM_BEGIN, type) == -1)
740                                         break;
741                               if (x86_add_cluster(IOM_END, seg_end, type) == -1)
742                                         break;
743                     } else {
744                               if (x86_add_cluster(seg_start, seg_end, type) == -1)
745                                         break;
746                     }
747           }
748 
749           return 0;
750 }
751 
752 static int
x86_fake_clusters(void)753 x86_fake_clusters(void)
754 {
755           extern struct extent *iomem_ex;
756           phys_ram_seg_t *cluster;
757           KASSERT(mem_cluster_cnt == 0);
758 
759           /*
760            * Allocate the physical addresses used by RAM from the iomem extent
761            * map. This is done before the addresses are page rounded just to make
762            * sure we get them all.
763            */
764           if (extent_alloc_region(iomem_ex, 0, KBTOB(biosbasemem), EX_NOWAIT)) {
765                     /* XXX What should we do? */
766                     printf("WARNING: CAN'T ALLOCATE BASE MEMORY FROM "
767                         "IOMEM EXTENT MAP!\n");
768           }
769 
770           cluster = &mem_clusters[0];
771           cluster->start = 0;
772           cluster->size = trunc_page(KBTOB(biosbasemem));
773           physmem += atop(cluster->size);
774 
775           if (extent_alloc_region(iomem_ex, IOM_END, KBTOB(biosextmem),
776               EX_NOWAIT)) {
777                     /* XXX What should we do? */
778                     printf("WARNING: CAN'T ALLOCATE EXTENDED MEMORY FROM "
779                         "IOMEM EXTENT MAP!\n");
780           }
781 
782 #if NISADMA > 0
783           /*
784            * Some motherboards/BIOSes remap the 384K of RAM that would
785            * normally be covered by the ISA hole to the end of memory
786            * so that it can be used.  However, on a 16M system, this
787            * would cause bounce buffers to be allocated and used.
788            * This is not desirable behaviour, as more than 384K of
789            * bounce buffers might be allocated.  As a work-around,
790            * we round memory down to the nearest 1M boundary if
791            * we're using any isadma devices and the remapped memory
792            * is what puts us over 16M.
793            */
794           if (biosextmem > (15*1024) && biosextmem < (16*1024)) {
795                     char pbuf[9];
796 
797                     format_bytes(pbuf, sizeof(pbuf), biosextmem - (15*1024));
798                     printf("Warning: ignoring %s of remapped memory\n", pbuf);
799                     biosextmem = (15*1024);
800           }
801 #endif
802 
803           cluster = &mem_clusters[1];
804           cluster->start = IOM_END;
805           cluster->size = trunc_page(KBTOB(biosextmem));
806           physmem += atop(cluster->size);
807 
808           mem_cluster_cnt = 2;
809 
810           avail_end = IOM_END + trunc_page(KBTOB(biosextmem));
811 
812           return 0;
813 }
814 
815 /*
816  * x86_load_region: load the physical memory region from seg_start to seg_end
817  * into the VM system.
818  */
819 static void
x86_load_region(uint64_t seg_start,uint64_t seg_end)820 x86_load_region(uint64_t seg_start, uint64_t seg_end)
821 {
822           unsigned int i;
823           uint64_t tmp;
824 
825           i = __arraycount(x86_freelists);
826           while (i--) {
827                     if (x86_freelists[i].limit <= seg_start)
828                               continue;
829                     if (x86_freelists[i].freelist == VM_FREELIST_DEFAULT)
830                               continue;
831                     tmp = MIN(x86_freelists[i].limit, seg_end);
832                     if (tmp == seg_start)
833                               continue;
834 
835 #ifdef DEBUG_MEMLOAD
836                     printf("loading freelist %d 0x%"PRIx64"-0x%"PRIx64
837                         " (0x%"PRIx64"-0x%"PRIx64")\n", x86_freelists[i].freelist,
838                         seg_start, tmp, (uint64_t)atop(seg_start),
839                         (uint64_t)atop(tmp));
840 #endif
841 
842                     uvm_page_physload(atop(seg_start), atop(tmp), atop(seg_start),
843                         atop(tmp), x86_freelists[i].freelist);
844                     seg_start = tmp;
845           }
846 
847           if (seg_start != seg_end) {
848 #ifdef DEBUG_MEMLOAD
849                     printf("loading default 0x%"PRIx64"-0x%"PRIx64
850                         " (0x%"PRIx64"-0x%"PRIx64")\n", seg_start, seg_end,
851                         (uint64_t)atop(seg_start), (uint64_t)atop(seg_end));
852 #endif
853                     uvm_page_physload(atop(seg_start), atop(seg_end),
854                         atop(seg_start), atop(seg_end), VM_FREELIST_DEFAULT);
855           }
856 }
857 
858 #ifdef XEN
859 static void
x86_add_xen_clusters(void)860 x86_add_xen_clusters(void)
861 {
862           if (hvm_start_info->memmap_entries > 0) {
863                     struct hvm_memmap_table_entry *map_entry;
864                     map_entry = (void *)((uintptr_t)hvm_start_info->memmap_paddr + KERNBASE);
865                     for (int i = 0; i < hvm_start_info->memmap_entries; i++) {
866                               if (map_entry[i].size < PAGE_SIZE)
867                                         continue;
868                               switch (map_entry[i].type) {
869                               case XEN_HVM_MEMMAP_TYPE_RAM:
870                                         x86_add_cluster(map_entry[i].addr,
871                                             map_entry[i].addr + map_entry[i].size,
872                                             BIM_Memory);
873                                         break;
874                               case XEN_HVM_MEMMAP_TYPE_ACPI:
875                                         x86_add_cluster(map_entry[i].addr,
876                                             map_entry[i].addr + map_entry[i].size,
877                                             BIM_ACPI);
878                                         break;
879                               }
880                     }
881           } else {
882                     struct xen_memory_map memmap;
883                     static struct _xen_mmap {
884                               struct btinfo_memmap bim;
885                               struct bi_memmap_entry map[128]; /* same as FreeBSD */
886                     } __packed xen_mmap;
887                     int err;
888 
889                     memmap.nr_entries = 128;
890                     set_xen_guest_handle(memmap.buffer, &xen_mmap.bim.entry[0]);
891                     if ((err = HYPERVISOR_memory_op(XENMEM_memory_map, &memmap))
892                         < 0)
893                               panic("XENMEM_memory_map %d", err);
894                     xen_mmap.bim.num = memmap.nr_entries;
895                     x86_parse_clusters(&xen_mmap.bim);
896           }
897 }
898 #endif /* XEN */
899 /*
900  * init_x86_clusters: retrieve the memory clusters provided by the BIOS, and
901  * initialize mem_clusters.
902  */
903 void
init_x86_clusters(void)904 init_x86_clusters(void)
905 {
906           struct btinfo_memmap *bim;
907           struct btinfo_efimemmap *biem;
908 
909           /*
910            * Check to see if we have a memory map from the BIOS (passed to us by
911            * the boot program).
912            */
913 #ifdef XEN
914           if (pvh_boot) {
915                     x86_add_xen_clusters();
916           }
917 #endif /* XEN */
918 
919 #ifdef i386
920           extern int biosmem_implicit;
921           biem = lookup_bootinfo(BTINFO_EFIMEMMAP);
922           if (biem != NULL)
923                     bim = efi_get_e820memmap();
924           else
925                     bim = lookup_bootinfo(BTINFO_MEMMAP);
926           if ((biosmem_implicit || (biosbasemem == 0 && biosextmem == 0)) &&
927               bim != NULL && bim->num > 0)
928                     x86_parse_clusters(bim);
929 #else
930 #if !defined(REALBASEMEM) && !defined(REALEXTMEM)
931           biem = lookup_bootinfo(BTINFO_EFIMEMMAP);
932           if (biem != NULL)
933                     bim = efi_get_e820memmap();
934           else
935                     bim = lookup_bootinfo(BTINFO_MEMMAP);
936           if (bim != NULL && bim->num > 0)
937                     x86_parse_clusters(bim);
938 #else
939           (void)bim, (void)biem;
940 #endif
941 #endif
942 
943           if (mem_cluster_cnt == 0) {
944                     /*
945                      * If x86_parse_clusters didn't find any valid segment, create
946                      * fake clusters.
947                      */
948                     x86_fake_clusters();
949           }
950 }
951 
952 /*
953  * init_x86_vm: initialize the VM system on x86. We basically internalize as
954  * many physical pages as we can, starting at lowmem_rsvd, but we don't
955  * internalize the kernel physical pages (from pa_kstart to pa_kend).
956  */
957 int
init_x86_vm(paddr_t pa_kend)958 init_x86_vm(paddr_t pa_kend)
959 {
960           extern struct bootspace bootspace;
961           paddr_t pa_kstart = bootspace.head.pa;
962           uint64_t seg_start, seg_end;
963           uint64_t seg_start1, seg_end1;
964           int x;
965           unsigned i;
966 
967           for (i = 0; i < __arraycount(x86_freelists); i++) {
968                     if (avail_end < x86_freelists[i].limit)
969                               x86_freelists[i].freelist = VM_FREELIST_DEFAULT;
970           }
971 
972           /*
973            * Now, load the memory clusters (which have already been rounded and
974            * truncated) into the VM system.
975            *
976            * NOTE: we assume that memory starts at 0.
977            */
978           for (x = 0; x < mem_cluster_cnt; x++) {
979                     const phys_ram_seg_t *cluster = &mem_clusters[x];
980 
981                     seg_start = cluster->start;
982                     seg_end = cluster->start + cluster->size;
983                     seg_start1 = 0;
984                     seg_end1 = 0;
985 
986 #ifdef DEBUG_MEMLOAD
987                     printf("segment %" PRIx64 " - %" PRIx64 "\n",
988                         seg_start, seg_end);
989 #endif
990 
991                     /* Skip memory before our available starting point. */
992                     if (seg_end <= lowmem_rsvd) {
993 #ifdef DEBUG_MEMLOAD
994                               printf("discard segment below starting point "
995                                   "%" PRIx64 " - %" PRIx64 "\n", seg_start, seg_end);
996 #endif
997                               continue;
998                     }
999 
1000                     if (seg_start <= lowmem_rsvd && lowmem_rsvd < seg_end) {
1001                               seg_start = lowmem_rsvd;
1002                               if (seg_start == seg_end) {
1003 #ifdef DEBUG_MEMLOAD
1004                                         printf("discard segment below starting point "
1005                                             "%" PRIx64 " - %" PRIx64 "\n",
1006                                             seg_start, seg_end);
1007 
1008 
1009 #endif
1010                                         continue;
1011                               }
1012                     }
1013 
1014                     /*
1015                      * If this segment contains the kernel, split it in two, around
1016                      * the kernel.
1017                      *  [seg_start                       seg_end]
1018                      *             [pa_kstart  pa_kend]
1019                      */
1020                     if (seg_start <= pa_kstart && pa_kend <= seg_end) {
1021 #ifdef DEBUG_MEMLOAD
1022                               printf("split kernel overlapping to "
1023                                   "%" PRIx64 " - %" PRIxPADDR " and "
1024                                   "%" PRIxPADDR " - %" PRIx64 "\n",
1025                                   seg_start, pa_kstart, pa_kend, seg_end);
1026 #endif
1027                               seg_start1 = pa_kend;
1028                               seg_end1 = seg_end;
1029                               seg_end = pa_kstart;
1030                               KASSERT(seg_end < seg_end1);
1031                     }
1032 
1033                     /*
1034                      * Discard a segment inside the kernel
1035                      *  [pa_kstart                       pa_kend]
1036                      *             [seg_start  seg_end]
1037                      */
1038                     if (pa_kstart < seg_start && seg_end < pa_kend) {
1039 #ifdef DEBUG_MEMLOAD
1040                               printf("discard complete kernel overlap "
1041                                   "%" PRIx64 " - %" PRIx64 "\n", seg_start, seg_end);
1042 #endif
1043                               continue;
1044                     }
1045 
1046                     /*
1047                      * Discard leading hunk that overlaps the kernel
1048                      *  [pa_kstart             pa_kend]
1049                      *            [seg_start            seg_end]
1050                      */
1051                     if (pa_kstart < seg_start &&
1052                         seg_start < pa_kend &&
1053                         pa_kend < seg_end) {
1054 #ifdef DEBUG_MEMLOAD
1055                               printf("discard leading kernel overlap "
1056                                   "%" PRIx64 " - %" PRIxPADDR "\n",
1057                                   seg_start, pa_kend);
1058 #endif
1059                               seg_start = pa_kend;
1060                     }
1061 
1062                     /*
1063                      * Discard trailing hunk that overlaps the kernel
1064                      *             [pa_kstart            pa_kend]
1065                      *  [seg_start              seg_end]
1066                      */
1067                     if (seg_start < pa_kstart &&
1068                         pa_kstart < seg_end &&
1069                         seg_end < pa_kend) {
1070 #ifdef DEBUG_MEMLOAD
1071                               printf("discard trailing kernel overlap "
1072                                   "%" PRIxPADDR " - %" PRIx64 "\n",
1073                                   pa_kstart, seg_end);
1074 #endif
1075                               seg_end = pa_kstart;
1076                     }
1077 
1078                     /* First hunk */
1079                     if (seg_start != seg_end) {
1080                               x86_load_region(seg_start, seg_end);
1081                     }
1082 
1083                     /* Second hunk */
1084                     if (seg_start1 != seg_end1) {
1085                               x86_load_region(seg_start1, seg_end1);
1086                     }
1087           }
1088 
1089           return 0;
1090 }
1091 
1092 #endif /* !XENPV */
1093 
1094 void
init_x86_msgbuf(void)1095 init_x86_msgbuf(void)
1096 {
1097           /* Message buffer is located at end of core. */
1098           psize_t sz = round_page(MSGBUFSIZE);
1099           psize_t reqsz = sz;
1100           uvm_physseg_t x;
1101 
1102 search_again:
1103           for (x = uvm_physseg_get_first();
1104                uvm_physseg_valid_p(x);
1105                x = uvm_physseg_get_next(x)) {
1106 
1107                     if (ctob(uvm_physseg_get_avail_end(x)) == avail_end)
1108                               break;
1109           }
1110 
1111           if (uvm_physseg_valid_p(x) == false)
1112                     panic("init_x86_msgbuf: can't find end of memory");
1113 
1114           /* Shrink so it'll fit in the last segment. */
1115           if (uvm_physseg_get_avail_end(x) - uvm_physseg_get_avail_start(x) < atop(sz))
1116                     sz = ctob(uvm_physseg_get_avail_end(x) - uvm_physseg_get_avail_start(x));
1117 
1118           msgbuf_p_seg[msgbuf_p_cnt].sz = sz;
1119           msgbuf_p_seg[msgbuf_p_cnt++].paddr = ctob(uvm_physseg_get_avail_end(x)) - sz;
1120           uvm_physseg_unplug(uvm_physseg_get_end(x) - atop(sz), atop(sz));
1121 
1122           /* Now find where the new avail_end is. */
1123           avail_end = ctob(uvm_physseg_get_highest_frame());
1124 
1125           if (sz == reqsz)
1126                     return;
1127 
1128           reqsz -= sz;
1129           if (msgbuf_p_cnt == VM_PHYSSEG_MAX) {
1130                     /* No more segments available, bail out. */
1131                     printf("WARNING: MSGBUFSIZE (%zu) too large, using %zu.\n",
1132                         (size_t)MSGBUFSIZE, (size_t)(MSGBUFSIZE - reqsz));
1133                     return;
1134           }
1135 
1136           sz = reqsz;
1137           goto search_again;
1138 }
1139 
1140 void
x86_reset(void)1141 x86_reset(void)
1142 {
1143           uint8_t b;
1144 
1145 #if NACPICA > 0
1146           /*
1147            * If ACPI is active, try to reset using the reset register
1148            * defined in the FADT.
1149            */
1150           if (acpi_active) {
1151                     if (acpi_reset() == 0) {
1152                               delay(500000); /* wait 0.5 sec to see if that did it */
1153                     }
1154           }
1155 #endif
1156 
1157           /*
1158            * The keyboard controller has 4 random output pins, one of which is
1159            * connected to the RESET pin on the CPU in many PCs.  We tell the
1160            * keyboard controller to pulse this line a couple of times.
1161            */
1162           outb(IO_KBD + KBCMDP, KBC_PULSE0);
1163           delay(100000);
1164           outb(IO_KBD + KBCMDP, KBC_PULSE0);
1165           delay(100000);
1166 
1167           /*
1168            * Attempt to force a reset via the Reset Control register at
1169            * I/O port 0xcf9.  Bit 2 forces a system reset when it
1170            * transitions from 0 to 1.  Bit 1 selects the type of reset
1171            * to attempt: 0 selects a "soft" reset, and 1 selects a
1172            * "hard" reset.  We try a "hard" reset.  The first write sets
1173            * bit 1 to select a "hard" reset and clears bit 2.  The
1174            * second write forces a 0 -> 1 transition in bit 2 to trigger
1175            * a reset.
1176            */
1177           outb(0xcf9, 0x2);
1178           outb(0xcf9, 0x6);
1179           DELAY(500000);      /* wait 0.5 sec to see if that did it */
1180 
1181           /*
1182            * Attempt to force a reset via the Fast A20 and Init register
1183            * at I/O port 0x92. Bit 1 serves as an alternate A20 gate.
1184            * Bit 0 asserts INIT# when set to 1. We are careful to only
1185            * preserve bit 1 while setting bit 0. We also must clear bit
1186            * 0 before setting it if it isn't already clear.
1187            */
1188           b = inb(0x92);
1189           if (b != 0xff) {
1190                     if ((b & 0x1) != 0)
1191                               outb(0x92, b & 0xfe);
1192                     outb(0x92, b | 0x1);
1193                     DELAY(500000);      /* wait 0.5 sec to see if that did it */
1194           }
1195 }
1196 
1197 static int
x86_listener_cb(kauth_cred_t cred,kauth_action_t action,void * cookie,void * arg0,void * arg1,void * arg2,void * arg3)1198 x86_listener_cb(kauth_cred_t cred, kauth_action_t action, void *cookie,
1199     void *arg0, void *arg1, void *arg2, void *arg3)
1200 {
1201           int result;
1202 
1203           result = KAUTH_RESULT_DEFER;
1204 
1205           switch (action) {
1206           case KAUTH_MACHDEP_IOPERM_GET:
1207                     result = KAUTH_RESULT_ALLOW;
1208                     break;
1209 
1210           case KAUTH_MACHDEP_LDT_GET:
1211           case KAUTH_MACHDEP_LDT_SET:
1212                     if (x86_user_ldt_enabled) {
1213                               result = KAUTH_RESULT_ALLOW;
1214                     }
1215                     break;
1216 
1217           default:
1218                     break;
1219           }
1220 
1221           return result;
1222 }
1223 
1224 void
machdep_init(void)1225 machdep_init(void)
1226 {
1227 
1228           x86_listener = kauth_listen_scope(KAUTH_SCOPE_MACHDEP,
1229               x86_listener_cb, NULL);
1230 }
1231 
/*
 * x86_startup: x86 common startup routine
 *
 * called by cpu_startup.  Initializes NMI handling, except on XENPV
 * kernels where this function does nothing.
 */
void
x86_startup(void)
{
#ifndef XENPV
	nmi_init();
#endif
}
1245 
1246 const char *
get_booted_kernel(void)1247 get_booted_kernel(void)
1248 {
1249           const struct btinfo_bootpath *bibp = lookup_bootinfo(BTINFO_BOOTPATH);
1250           return bibp ? bibp->bootpath : NULL;
1251 }
1252 
1253 /*
1254  * machine dependent system variables.
1255  */
1256 static int
sysctl_machdep_booted_kernel(SYSCTLFN_ARGS)1257 sysctl_machdep_booted_kernel(SYSCTLFN_ARGS)
1258 {
1259           struct btinfo_bootpath *bibp;
1260           struct sysctlnode node;
1261 
1262           bibp = lookup_bootinfo(BTINFO_BOOTPATH);
1263           if (!bibp)
1264                     return ENOENT; /* ??? */
1265 
1266           node = *rnode;
1267           node.sysctl_data = bibp->bootpath;
1268           node.sysctl_size = sizeof(bibp->bootpath);
1269           return sysctl_lookup(SYSCTLFN_CALL(&node));
1270 }
1271 
1272 static int
sysctl_machdep_bootmethod(SYSCTLFN_ARGS)1273 sysctl_machdep_bootmethod(SYSCTLFN_ARGS)
1274 {
1275           struct sysctlnode node;
1276           char buf[5];
1277 
1278           node = *rnode;
1279           node.sysctl_data = buf;
1280           if (bootmethod_efi)
1281                     memcpy(node.sysctl_data, "UEFI", 5);
1282           else
1283                     memcpy(node.sysctl_data, "BIOS", 5);
1284 
1285           return sysctl_lookup(SYSCTLFN_CALL(&node));
1286 }
1287 
1288 
1289 static int
sysctl_machdep_diskinfo(SYSCTLFN_ARGS)1290 sysctl_machdep_diskinfo(SYSCTLFN_ARGS)
1291 {
1292           struct sysctlnode node;
1293           extern struct bi_devmatch *x86_alldisks;
1294           extern int x86_ndisks;
1295 
1296           if (x86_alldisks == NULL)
1297                     return EOPNOTSUPP;
1298 
1299           node = *rnode;
1300           node.sysctl_data = x86_alldisks;
1301           node.sysctl_size = sizeof(struct disklist) +
1302               (x86_ndisks - 1) * sizeof(struct nativedisk_info);
1303           return sysctl_lookup(SYSCTLFN_CALL(&node));
1304 }
1305 
1306 #ifndef XENPV
1307 static int
sysctl_machdep_tsc_enable(SYSCTLFN_ARGS)1308 sysctl_machdep_tsc_enable(SYSCTLFN_ARGS)
1309 {
1310           struct sysctlnode node;
1311           int error, val;
1312 
1313           val = *(int *)rnode->sysctl_data;
1314 
1315           node = *rnode;
1316           node.sysctl_data = &val;
1317 
1318           error = sysctl_lookup(SYSCTLFN_CALL(&node));
1319           if (error != 0 || newp == NULL)
1320                     return error;
1321 
1322           if (val == 1) {
1323                     tsc_user_enable();
1324           } else if (val == 0) {
1325                     tsc_user_disable();
1326           } else {
1327                     error = EINVAL;
1328           }
1329           if (error)
1330                     return error;
1331 
1332           *(int *)rnode->sysctl_data = val;
1333 
1334           return 0;
1335 }
1336 #endif
1337 
1338 static const char * const vm_guest_name[VM_LAST] = {
1339           [VM_GUEST_NO] =               "none",
1340           [VM_GUEST_VM] =               "generic",
1341           [VM_GUEST_XENPV] =  "XenPV",
1342           [VM_GUEST_XENPVH] = "XenPVH",
1343           [VM_GUEST_XENHVM] = "XenHVM",
1344           [VM_GUEST_XENPVHVM] =         "XenPVHVM",
1345           [VM_GUEST_GENPVH] = "GenPVH",
1346           [VM_GUEST_HV] =               "Hyper-V",
1347           [VM_GUEST_VMWARE] = "VMware",
1348           [VM_GUEST_KVM] =    "KVM",
1349           [VM_GUEST_VIRTUALBOX] =       "VirtualBox",
1350           [VM_GUEST_NVMM] =   "NVMM",
1351 };
1352 
1353 static int
sysctl_machdep_hypervisor(SYSCTLFN_ARGS)1354 sysctl_machdep_hypervisor(SYSCTLFN_ARGS)
1355 {
1356           struct sysctlnode node;
1357           const char *t = NULL;
1358           char buf[64];
1359 
1360           node = *rnode;
1361           node.sysctl_data = buf;
1362           if (vm_guest >= VM_GUEST_NO && vm_guest < VM_LAST)
1363                     t = vm_guest_name[vm_guest];
1364           if (t == NULL)
1365                     t = "unknown";
1366           strlcpy(buf, t, sizeof(buf));
1367           return sysctl_lookup(SYSCTLFN_CALL(&node));
1368 }
1369 
1370 static void
const_sysctl(struct sysctllog ** clog,const char * name,int type,u_quad_t value,int tag)1371 const_sysctl(struct sysctllog **clog, const char *name, int type,
1372     u_quad_t value, int tag)
1373 {
1374           (sysctl_createv)(clog, 0, NULL, NULL,
1375                            CTLFLAG_PERMANENT | CTLFLAG_IMMEDIATE,
1376                            type, name, NULL, NULL, value, NULL, 0,
1377                            CTL_MACHDEP, tag, CTL_EOL);
1378 }
1379 
1380 SYSCTL_SETUP(sysctl_machdep_setup, "sysctl machdep subtree setup")
1381 {
1382           extern uint64_t tsc_freq;
1383 #ifndef XENPV
1384           extern int tsc_user_enabled;
1385 #endif
1386           extern int sparse_dump;
1387 
1388           sysctl_createv(clog, 0, NULL, NULL,
1389                            CTLFLAG_PERMANENT,
1390                            CTLTYPE_NODE, "machdep", NULL,
1391                            NULL, 0, NULL, 0,
1392                            CTL_MACHDEP, CTL_EOL);
1393 
1394           sysctl_createv(clog, 0, NULL, NULL,
1395                            CTLFLAG_PERMANENT,
1396                            CTLTYPE_STRUCT, "console_device", NULL,
1397                            sysctl_consdev, 0, NULL, sizeof(dev_t),
1398                            CTL_MACHDEP, CPU_CONSDEV, CTL_EOL);
1399           sysctl_createv(clog, 0, NULL, NULL,
1400                            CTLFLAG_PERMANENT,
1401                            CTLTYPE_STRING, "booted_kernel", NULL,
1402                            sysctl_machdep_booted_kernel, 0, NULL, 0,
1403                            CTL_MACHDEP, CPU_BOOTED_KERNEL, CTL_EOL);
1404           sysctl_createv(clog, 0, NULL, NULL,
1405                            CTLFLAG_PERMANENT,
1406                            CTLTYPE_STRING, "bootmethod", NULL,
1407                            sysctl_machdep_bootmethod, 0, NULL, 0,
1408                            CTL_MACHDEP, CTL_CREATE, CTL_EOL);
1409           sysctl_createv(clog, 0, NULL, NULL,
1410                            CTLFLAG_PERMANENT,
1411                            CTLTYPE_STRUCT, "diskinfo", NULL,
1412                            sysctl_machdep_diskinfo, 0, NULL, 0,
1413                            CTL_MACHDEP, CPU_DISKINFO, CTL_EOL);
1414           sysctl_createv(clog, 0, NULL, NULL,
1415                            CTLFLAG_PERMANENT,
1416                            CTLTYPE_STRING, "cpu_brand", NULL,
1417                            NULL, 0, cpu_brand_string, 0,
1418                            CTL_MACHDEP, CTL_CREATE, CTL_EOL);
1419           sysctl_createv(clog, 0, NULL, NULL,
1420                            CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
1421                            CTLTYPE_INT, "sparse_dump", NULL,
1422                            NULL, 0, &sparse_dump, 0,
1423                            CTL_MACHDEP, CTL_CREATE, CTL_EOL);
1424           sysctl_createv(clog, 0, NULL, NULL,
1425                            CTLFLAG_PERMANENT,
1426                            CTLTYPE_QUAD, "tsc_freq", NULL,
1427                            NULL, 0, &tsc_freq, 0,
1428                            CTL_MACHDEP, CTL_CREATE, CTL_EOL);
1429           sysctl_createv(clog, 0, NULL, NULL,
1430                            CTLFLAG_PERMANENT,
1431                            CTLTYPE_INT, "pae",
1432                            SYSCTL_DESCR("Whether the kernel uses PAE"),
1433                            NULL, 0, &use_pae, 0,
1434                            CTL_MACHDEP, CTL_CREATE, CTL_EOL);
1435 #ifndef XENPV
1436           sysctl_createv(clog, 0, NULL, NULL,
1437                            CTLFLAG_READWRITE,
1438                            CTLTYPE_INT, "tsc_user_enable",
1439                            SYSCTL_DESCR("RDTSC instruction enabled in usermode"),
1440                            sysctl_machdep_tsc_enable, 0, &tsc_user_enabled, 0,
1441                            CTL_MACHDEP, CTL_CREATE, CTL_EOL);
1442 #endif
1443           sysctl_createv(clog, 0, NULL, NULL,
1444                            CTLFLAG_PERMANENT,
1445                            CTLTYPE_STRING, "hypervisor", NULL,
1446                            sysctl_machdep_hypervisor, 0, NULL, 0,
1447                            CTL_MACHDEP, CTL_CREATE, CTL_EOL);
1448 #ifdef SVS
1449           const struct sysctlnode *svs_rnode = NULL;
1450           sysctl_createv(clog, 0, NULL, &svs_rnode,
1451                            CTLFLAG_PERMANENT,
1452                            CTLTYPE_NODE, "svs", NULL,
1453                            NULL, 0, NULL, 0,
1454                            CTL_MACHDEP, CTL_CREATE);
1455           sysctl_createv(clog, 0, &svs_rnode, NULL,
1456                            CTLFLAG_PERMANENT,
1457                            CTLTYPE_BOOL, "enabled",
1458                            SYSCTL_DESCR("Whether the kernel uses SVS"),
1459                            NULL, 0, &svs_enabled, 0,
1460                            CTL_CREATE, CTL_EOL);
1461           sysctl_createv(clog, 0, &svs_rnode, NULL,
1462                            CTLFLAG_PERMANENT,
1463                            CTLTYPE_BOOL, "pcid",
1464                            SYSCTL_DESCR("Whether SVS uses PCID"),
1465                            NULL, 0, &svs_pcid, 0,
1466                            CTL_CREATE, CTL_EOL);
1467 #endif
1468 
1469           sysctl_createv(clog, 0, NULL, NULL,
1470                            CTLFLAG_READWRITE,
1471                            CTLTYPE_BOOL, "user_ldt",
1472                            SYSCTL_DESCR("Whether USER_LDT is enabled"),
1473                            NULL, 0, &x86_user_ldt_enabled, 0,
1474                            CTL_MACHDEP, CTL_CREATE, CTL_EOL);
1475 
1476 #ifndef XENPV
1477           void sysctl_speculation_init(struct sysctllog **);
1478           sysctl_speculation_init(clog);
1479 #endif
1480 
1481           /* None of these can ever change once the system has booted */
1482           const_sysctl(clog, "fpu_present", CTLTYPE_INT, i386_fpu_present,
1483               CPU_FPU_PRESENT);
1484           const_sysctl(clog, "osfxsr", CTLTYPE_INT, i386_use_fxsave,
1485               CPU_OSFXSR);
1486           const_sysctl(clog, "sse", CTLTYPE_INT, i386_has_sse,
1487               CPU_SSE);
1488           const_sysctl(clog, "sse2", CTLTYPE_INT, i386_has_sse2,
1489               CPU_SSE2);
1490 
1491           const_sysctl(clog, "fpu_save", CTLTYPE_INT, x86_fpu_save,
1492               CPU_FPU_SAVE);
1493           const_sysctl(clog, "fpu_save_size", CTLTYPE_INT, x86_fpu_save_size,
1494               CPU_FPU_SAVE_SIZE);
1495           const_sysctl(clog, "xsave_features", CTLTYPE_QUAD, x86_xsave_features,
1496               CPU_XSAVE_FEATURES);
1497 
1498 #ifndef XENPV
1499           const_sysctl(clog, "biosbasemem", CTLTYPE_INT, biosbasemem,
1500               CPU_BIOSBASEMEM);
1501           const_sysctl(clog, "biosextmem", CTLTYPE_INT, biosextmem,
1502               CPU_BIOSEXTMEM);
1503 #endif
1504 }
1505 
1506 /* Here for want of a better place */
1507 #if defined(DOM0OPS) || !defined(XENPV)
1508 struct pic *
intr_findpic(int num)1509 intr_findpic(int num)
1510 {
1511 #if NIOAPIC > 0
1512           struct ioapic_softc *pic;
1513 
1514           pic = ioapic_find_bybase(num);
1515           if (pic != NULL)
1516                     return &pic->sc_pic;
1517 #endif
1518           if (num < NUM_LEGACY_IRQS)
1519                     return &i8259_pic;
1520 
1521           return NULL;
1522 }
1523 #endif
1524 
/*
 * cpu_initclocks: start the clocks on the boot CPU through the hook
 * stored in x86_initclock_func.
 */
void
cpu_initclocks(void)
{
	/*
	 * Re-calibrate TSC on boot CPU using most accurate time source,
	 * thus making accurate TSC available for x86_initclock_func().
	 */
	cpu_get_tsc_freq(curcpu());

	/* Now start the clocks on this CPU (the boot CPU). */
	x86_initclock_func();
}
1538 
/*
 * x86_cpu_is_lcall: check whether the seven bytes at user address "ip"
 * match a specific lcall pattern: opcode 0x9a, four zero bytes, one byte
 * that is 0x07 or 0x87, and a final zero byte.
 *
 * Returns 0 on a match, EINVAL on a mismatch, or the error from copyin()
 * if the bytes cannot be fetched.
 */
int
x86_cpu_is_lcall(const void *ip)
{
	static const uint8_t lcall[] = { 0x9a, 0, 0, 0, 0 };
	uint8_t insn[sizeof(lcall) + 2];
	int error;

	error = copyin(ip, insn, sizeof(insn));
	if (error != 0)
		return error;

	/* Opcode plus the four zero bytes that follow it. */
	if (memcmp(insn, lcall, sizeof(lcall)) != 0)
		return EINVAL;

	/* The last byte must be zero. */
	if (insn[sizeof(insn) - 1] != 0)
		return EINVAL;

	/* The byte before it: 0x07 (NetBSD) or 0x87 (BSD/OS). */
	if (insn[sizeof(insn) - 2] == 0x07 || insn[sizeof(insn) - 2] == 0x87)
		return 0;

	return EINVAL;
}
1561