xref: /dragonfly/sys/platform/pc64/x86_64/machdep.c (revision 2b3f93ea6d1f70880f3e87f3c2cbe0dc0bfc9332)
1 /*-
2  * Copyright (c) 1982, 1987, 1990 The Regents of the University of California.
3  * Copyright (c) 1992 Terrence R. Lambert.
4  * Copyright (c) 2003 Peter Wemm.
5  * Copyright (c) 2008-2017 The DragonFly Project.
6  * All rights reserved.
7  *
8  * This code is derived from software contributed to Berkeley by
9  * William Jolitz.
10  *
11  * Redistribution and use in source and binary forms, with or without
12  * modification, are permitted provided that the following conditions
13  * are met:
14  * 1. Redistributions of source code must retain the above copyright
15  *    notice, this list of conditions and the following disclaimer.
16  * 2. Redistributions in binary form must reproduce the above copyright
17  *    notice, this list of conditions and the following disclaimer in the
18  *    documentation and/or other materials provided with the distribution.
19  * 3. All advertising materials mentioning features or use of this software
20  *    must display the following acknowledgement:
21  *        This product includes software developed by the University of
22  *        California, Berkeley and its contributors.
23  * 4. Neither the name of the University nor the names of its contributors
24  *    may be used to endorse or promote products derived from this software
25  *    without specific prior written permission.
26  *
27  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
28  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
30  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
31  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
32  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
33  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
34  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
35  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
36  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
37  * SUCH DAMAGE.
38  *
39  * from: @(#)machdep.c        7.4 (Berkeley) 6/3/91
40  * $FreeBSD: src/sys/i386/i386/machdep.c,v 1.385.2.30 2003/05/31 08:48:05 alc Exp $
41  */
42 
43 #include "use_isa.h"
44 #include "opt_cpu.h"
45 #include "opt_ddb.h"
46 #include "opt_inet.h"
47 #include "opt_maxmem.h"
48 #include "opt_msgbuf.h"
49 #include "opt_swap.h"
50 
51 #include <sys/param.h>
52 #include <sys/systm.h>
53 #include <sys/sysmsg.h>
54 #include <sys/signalvar.h>
55 #include <sys/kernel.h>
56 #include <sys/linker.h>
57 #include <sys/malloc.h>
58 #include <sys/proc.h>
59 #include <sys/caps.h>
60 #include <sys/buf.h>
61 #include <sys/reboot.h>
62 #include <sys/mbuf.h>
63 #include <sys/msgbuf.h>
64 #include <sys/sysent.h>
65 #include <sys/sysctl.h>
66 #include <sys/vmmeter.h>
67 #include <sys/bus.h>
68 #include <sys/usched.h>
69 #include <sys/reg.h>
70 #include <sys/sbuf.h>
71 #include <sys/ctype.h>
72 #include <sys/serialize.h>
73 #include <sys/systimer.h>
74 
75 #include <vm/vm.h>
76 #include <vm/vm_param.h>
77 #include <sys/lock.h>
78 #include <vm/vm_kern.h>
79 #include <vm/vm_object.h>
80 #include <vm/vm_page.h>
81 #include <vm/vm_map.h>
82 #include <vm/vm_pager.h>
83 #include <vm/vm_extern.h>
84 
85 #include <sys/thread2.h>
86 #include <sys/mplock2.h>
87 
88 #include <sys/exec.h>
89 #include <sys/cons.h>
90 
91 #include <sys/efi.h>
92 
93 #include <ddb/ddb.h>
94 
95 #include <machine/cpu.h>
96 #include <machine/clock.h>
97 #include <machine/specialreg.h>
98 #if 0 /* JG */
99 #include <machine/bootinfo.h>
100 #endif
101 #include <machine/md_var.h>
102 #include <machine/metadata.h>
103 #include <machine/pc/bios.h>
104 #include <machine/pcb_ext.h>
105 #include <machine/globaldata.h>                   /* CPU_prvspace */
106 #include <machine/smp.h>
107 #include <machine/cputypes.h>
108 #include <machine/intr_machdep.h>
109 #include <machine/framebuffer.h>
110 
111 #ifdef OLD_BUS_ARCH
112 #include <bus/isa/isa_device.h>
113 #endif
114 #include <machine_base/isa/isa_intr.h>
115 #include <bus/isa/rtc.h>
116 #include <sys/random.h>
117 #include <sys/ptrace.h>
118 #include <machine/sigframe.h>
119 
120 #include <sys/machintr.h>
121 #include <machine_base/icu/icu_abi.h>
122 #include <machine_base/icu/elcr_var.h>
123 #include <machine_base/apic/lapic.h>
124 #include <machine_base/apic/ioapic.h>
125 #include <machine_base/apic/ioapic_abi.h>
126 #include <machine/mptable.h>
127 
128 #define PHYSMAP_ENTRIES                 10
129 #define MAXBUFSTRUCTSIZE      ((size_t)512 * 1024 * 1024)
130 
131 extern u_int64_t hammer_time(u_int64_t, u_int64_t);
132 
133 extern void printcpuinfo(void);         /* XXX header file */
134 extern void identify_cpu(void);
135 extern void panicifcpuunsupported(void);
136 
137 static void cpu_startup(void *);
138 static void pic_finish(void *);
139 static void cpu_finish(void *);
140 
141 static void set_fpregs_xmm(struct save87 *, struct savexmm *);
142 static void fill_fpregs_xmm(struct savexmm *, struct save87 *);
143 static void init_locks(void);
144 
145 extern void pcpu_timer_always(struct intrframe *);
146 
147 SYSINIT(cpu, SI_BOOT2_START_CPU, SI_ORDER_FIRST, cpu_startup, NULL);
148 SYSINIT(pic_finish, SI_BOOT2_FINISH_PIC, SI_ORDER_FIRST, pic_finish, NULL);
149 SYSINIT(cpu_finish, SI_BOOT2_FINISH_CPU, SI_ORDER_FIRST, cpu_finish, NULL);
150 
151 #ifdef DDB
152 extern vm_offset_t ksym_start, ksym_end;
153 #endif
154 
155 struct privatespace CPU_prvspace_bsp __aligned(4096);
156 struct privatespace *CPU_prvspace[MAXCPU] = { &CPU_prvspace_bsp };
157 
158 vm_paddr_t efi_systbl_phys;
159 int       _udatasel, _ucodesel, _ucode32sel;
160 u_long    atdevbase;
161 int64_t tsc_offsets[MAXCPU];
162 cpumask_t smp_idleinvl_mask;
163 cpumask_t smp_idleinvl_reqs;
164 
165 /* MWAIT hint (EAX), or one of the CPU_MWAIT_HINT_* selectors */
166 __read_mostly static int cpu_mwait_halt_global;
167 __read_mostly static int clock_debug1;
168 __read_mostly static int flame_poll_debug;
169 
170 SYSCTL_INT(_debug, OID_AUTO, flame_poll_debug,
171           CTLFLAG_RW, &flame_poll_debug, 0, "");
172 TUNABLE_INT("debug.flame_poll_debug", &flame_poll_debug);
173 
174 #if defined(SWTCH_OPTIM_STATS)
175 extern int swtch_optim_stats;
176 SYSCTL_INT(_debug, OID_AUTO, swtch_optim_stats,
177           CTLFLAG_RD, &swtch_optim_stats, 0, "");
178 SYSCTL_INT(_debug, OID_AUTO, tlb_flush_count,
179           CTLFLAG_RD, &tlb_flush_count, 0, "");
180 #endif
181 SYSCTL_INT(_debug, OID_AUTO, clock_debug1,
182           CTLFLAG_RW, &clock_debug1, 0, "");
183 SYSCTL_INT(_hw, OID_AUTO, cpu_mwait_halt,
184           CTLFLAG_RD, &cpu_mwait_halt_global, 0, "");
185 SYSCTL_INT(_hw, OID_AUTO, cpu_mwait_spin,
186           CTLFLAG_RD, &cpu_mwait_spin, 0, "monitor/mwait target state");
187 
188 #define CPU_MWAIT_HAS_CX      \
189           ((cpu_feature2 & CPUID2_MON) && \
190            (cpu_mwait_feature & CPUID_MWAIT_EXT))
191 
192 #define CPU_MWAIT_CX_NAMELEN  16
193 
194 #define CPU_MWAIT_C1                    1
195 #define CPU_MWAIT_C2                    2
196 #define CPU_MWAIT_C3                    3
197 #define CPU_MWAIT_CX_MAX      8
198 
199 #define CPU_MWAIT_HINT_AUTO   -1        /* C1 and C2 */
200 #define CPU_MWAIT_HINT_AUTODEEP         -2        /* C3+ */
201 
202 SYSCTL_NODE(_machdep, OID_AUTO, mwait, CTLFLAG_RW, 0, "MWAIT features");
203 SYSCTL_NODE(_machdep_mwait, OID_AUTO, CX, CTLFLAG_RW, 0, "MWAIT Cx settings");
204 
205 struct cpu_mwait_cx {
206           int                           subcnt;
207           char                          name[4];
208           struct sysctl_ctx_list        sysctl_ctx;
209           struct sysctl_oid   *sysctl_tree;
210 };
211 static struct cpu_mwait_cx    cpu_mwait_cx_info[CPU_MWAIT_CX_MAX];
212 static char                             cpu_mwait_cx_supported[256];
213 
214 static int                              cpu_mwait_c1_hints_cnt;
215 static int                              cpu_mwait_hints_cnt;
216 static int                              *cpu_mwait_hints;
217 
218 static int                              cpu_mwait_deep_hints_cnt;
219 static int                              *cpu_mwait_deep_hints;
220 
221 #define CPU_IDLE_REPEAT_DEFAULT         750
222 
223 static u_int                            cpu_idle_repeat = CPU_IDLE_REPEAT_DEFAULT;
224 static u_long                           cpu_idle_repeat_max = CPU_IDLE_REPEAT_DEFAULT;
225 static u_int                            cpu_mwait_repeat_shift = 1;
226 
227 #define CPU_MWAIT_C3_PREAMBLE_BM_ARB    0x1
228 #define CPU_MWAIT_C3_PREAMBLE_BM_STS    0x2
229 
230 static int                              cpu_mwait_c3_preamble =
231                                             CPU_MWAIT_C3_PREAMBLE_BM_ARB |
232                                             CPU_MWAIT_C3_PREAMBLE_BM_STS;
233 
234 SYSCTL_STRING(_machdep_mwait_CX, OID_AUTO, supported, CTLFLAG_RD,
235     cpu_mwait_cx_supported, 0, "MWAIT supported C states");
236 SYSCTL_INT(_machdep_mwait_CX, OID_AUTO, c3_preamble, CTLFLAG_RD,
237     &cpu_mwait_c3_preamble, 0, "C3+ preamble mask");
238 
239 static int          cpu_mwait_cx_select_sysctl(SYSCTL_HANDLER_ARGS,
240                         int *, boolean_t);
241 static int          cpu_mwait_cx_idle_sysctl(SYSCTL_HANDLER_ARGS);
242 static int          cpu_mwait_cx_pcpu_idle_sysctl(SYSCTL_HANDLER_ARGS);
243 static int          cpu_mwait_cx_spin_sysctl(SYSCTL_HANDLER_ARGS);
244 
245 SYSCTL_PROC(_machdep_mwait_CX, OID_AUTO, idle, CTLTYPE_STRING|CTLFLAG_RW,
246     NULL, 0, cpu_mwait_cx_idle_sysctl, "A", "");
247 SYSCTL_PROC(_machdep_mwait_CX, OID_AUTO, spin, CTLTYPE_STRING|CTLFLAG_RW,
248     NULL, 0, cpu_mwait_cx_spin_sysctl, "A", "");
249 SYSCTL_UINT(_machdep_mwait_CX, OID_AUTO, repeat_shift, CTLFLAG_RW,
250     &cpu_mwait_repeat_shift, 0, "");
251 
252 long physmem = 0;
253 
254 u_long ebda_addr = 0;
255 
256 int imcr_present = 0;
257 
258 int naps = 0; /* # of Applications processors */
259 
260 u_int base_memory;
261 
262 static int
sysctl_hw_physmem(SYSCTL_HANDLER_ARGS)263 sysctl_hw_physmem(SYSCTL_HANDLER_ARGS)
264 {
265           u_long pmem = ctob(physmem);
266           int error;
267 
268           error = sysctl_handle_long(oidp, &pmem, 0, req);
269 
270           return (error);
271 }
272 
273 SYSCTL_PROC(_hw, HW_PHYSMEM, physmem, CTLTYPE_ULONG|CTLFLAG_RD,
274           0, 0, sysctl_hw_physmem, "LU",
275           "Total system memory in bytes (number of pages * page size)");
276 
277 static int
sysctl_hw_usermem(SYSCTL_HANDLER_ARGS)278 sysctl_hw_usermem(SYSCTL_HANDLER_ARGS)
279 {
280           u_long usermem = ctob(physmem - vmstats.v_wire_count);
281           int error;
282 
283           error = sysctl_handle_long(oidp, &usermem, 0, req);
284 
285           return (error);
286 }
287 
288 SYSCTL_PROC(_hw, HW_USERMEM, usermem, CTLTYPE_ULONG|CTLFLAG_RD,
289           0, 0, sysctl_hw_usermem, "LU", "");
290 
291 static int
sysctl_hw_availpages(SYSCTL_HANDLER_ARGS)292 sysctl_hw_availpages(SYSCTL_HANDLER_ARGS)
293 {
294           int error;
295           u_long availpages;
296 
297           availpages = x86_64_btop(avail_end - avail_start);
298           error = sysctl_handle_long(oidp, &availpages, 0, req);
299 
300           return (error);
301 }
302 
303 SYSCTL_PROC(_hw, OID_AUTO, availpages, CTLTYPE_ULONG|CTLFLAG_RD,
304           0, 0, sysctl_hw_availpages, "LU", "");
305 
306 vm_paddr_t Maxmem;
307 vm_paddr_t Realmem;
308 
309 /*
310  * The number of PHYSMAP entries must be one less than the number of
311  * PHYSSEG entries because the PHYSMAP entry that spans the largest
312  * physical address that is accessible by ISA DMA is split into two
313  * PHYSSEG entries.
314  */
315 vm_phystable_t phys_avail[VM_PHYSSEG_MAX + 1];
316 vm_phystable_t dump_avail[VM_PHYSSEG_MAX + 1];
317 
318 /* must be 1 less so 0 0 can signal end of chunks */
319 #define PHYS_AVAIL_ARRAY_END (NELEM(phys_avail) - 1)
320 #define DUMP_AVAIL_ARRAY_END (NELEM(dump_avail) - 1)
321 
322 static vm_offset_t buffer_sva, buffer_eva;
323 vm_offset_t clean_sva, clean_eva;
324 static vm_offset_t pager_sva, pager_eva;
325 static struct trapframe proc0_tf;
326 
327 static void cpu_implement_smap(void);
328 
329 static void
cpu_startup(void * dummy)330 cpu_startup(void *dummy)
331 {
332           caddr_t v;
333           vm_size_t size = 0;
334           vm_offset_t firstaddr;
335 
336           /*
337            * Good {morning,afternoon,evening,night}.
338            */
339           kprintf("%s", version);
340           startrtclock();
341           printcpuinfo();
342           panicifcpuunsupported();
343           if (cpu_stdext_feature & CPUID_STDEXT_SMAP)
344                     cpu_implement_smap();
345 
346           kprintf("real memory  = %ju (%ju MB)\n",
347                     (intmax_t)Realmem,
348                     (intmax_t)Realmem / 1024 / 1024);
349           /*
350            * Display any holes after the first chunk of extended memory.
351            */
352           if (bootverbose) {
353                     int indx;
354 
355                     kprintf("Physical memory chunk(s):\n");
356                     for (indx = 0; phys_avail[indx].phys_end != 0; ++indx) {
357                               vm_paddr_t size1;
358 
359                               size1 = phys_avail[indx].phys_end -
360                                         phys_avail[indx].phys_beg;
361 
362                               kprintf("0x%08jx - 0x%08jx, %ju bytes (%ju pages)\n",
363                                         (intmax_t)phys_avail[indx].phys_beg,
364                                         (intmax_t)phys_avail[indx].phys_end - 1,
365                                         (intmax_t)size1,
366                                         (intmax_t)(size1 / PAGE_SIZE));
367                     }
368           }
369 
370           /*
371            * Allocate space for system data structures.
372            * The first available kernel virtual address is in "v".
373            * As pages of kernel virtual memory are allocated, "v" is incremented.
374            * As pages of memory are allocated and cleared,
375            * "firstaddr" is incremented.
376            * An index into the kernel page table corresponding to the
377            * virtual memory address maintained in "v" is kept in "mapaddr".
378            */
379 
380           /*
381            * Make two passes.  The first pass calculates how much memory is
382            * needed and allocates it.  The second pass assigns virtual
383            * addresses to the various data structures.
384            */
385           firstaddr = 0;
386 again:
387           v = (caddr_t)firstaddr;
388 
389 #define   valloc(name, type, num) \
390               (name) = (type *)v; v = (caddr_t)((name)+(num))
391 #define   valloclim(name, type, num, lim) \
392               (name) = (type *)v; v = (caddr_t)((lim) = ((name)+(num)))
393 
394           /*
395            * Calculate nbuf such that maxbufspace uses approximately 1/20
396            * of physical memory by default, with a minimum of 50 buffers.
397            *
398            * The calculation is made after discounting 128MB.
399            *
400            * NOTE: maxbufspace is (nbuf * NBUFCALCSIZE) (NBUFCALCSIZE ~= 16KB).
401            *         nbuf = (kbytes / factor) would cover all of memory.
402            */
403           if (nbuf == 0) {
404                     long factor = NBUFCALCSIZE / 1024;                /* KB/nbuf */
405                     long kbytes = physmem * (PAGE_SIZE / 1024);       /* physmem */
406 
407                     nbuf = 50;
408                     if (kbytes > 128 * 1024)
409                               nbuf += (kbytes - 128 * 1024) / (factor * 20);
410                     if (maxbcache && nbuf > maxbcache / NBUFCALCSIZE)
411                               nbuf = maxbcache / NBUFCALCSIZE;
412                     if ((size_t)nbuf * sizeof(struct buf) > MAXBUFSTRUCTSIZE) {
413                               kprintf("Warning: nbuf capped at %ld due to the "
414                                         "reasonability limit\n", nbuf);
415                               nbuf = MAXBUFSTRUCTSIZE / sizeof(struct buf);
416                     }
417           }
418 
419           /*
420            * Do not allow the buffer_map to be more then 1/2 the size of the
421            * kernel_map.
422            */
423           if (nbuf > (virtual_end - virtual_start +
424                         virtual2_end - virtual2_start) / (MAXBSIZE * 2)) {
425                     nbuf = (virtual_end - virtual_start +
426                               virtual2_end - virtual2_start) / (MAXBSIZE * 2);
427                     kprintf("Warning: nbufs capped at %ld due to kvm\n", nbuf);
428           }
429 
430           /*
431            * Do not allow the buffer_map to use more than 50% of available
432            * physical-equivalent memory.  Since the VM pages which back
433            * individual buffers are typically wired, having too many bufs
434            * can prevent the system from paging properly.
435            */
436           if (nbuf > physmem * PAGE_SIZE / (NBUFCALCSIZE * 2)) {
437                     nbuf = physmem * PAGE_SIZE / (NBUFCALCSIZE * 2);
438                     kprintf("Warning: nbufs capped at %ld due to physmem\n", nbuf);
439           }
440 
441           /*
442            * Do not allow the sizeof(struct buf) * nbuf to exceed 1/4 of
443            * the valloc space which is just the virtual_end - virtual_start
444            * section.  This is typically ~2GB regardless of the amount of
445            * memory, so we use 500MB as a metric.
446            *
447            * This is because we use valloc() to allocate the buf header array.
448            *
449            * NOTE: buffer space in bytes is limited by vfs.*bufspace sysctls.
450            */
451           if (nbuf > (virtual_end - virtual_start) / (sizeof(struct buf) * 4)) {
452                     nbuf = (virtual_end - virtual_start) /
453                            (sizeof(struct buf) * 4);
454                     kprintf("Warning: nbufs capped at %ld due to "
455                               "valloc considerations\n",
456                               nbuf);
457           }
458 
459           nswbuf_mem = lmax(lmin(nbuf / 32, 512), 8);
460 #ifdef NSWBUF_MIN
461           if (nswbuf_mem < NSWBUF_MIN)
462                     nswbuf_mem = NSWBUF_MIN;
463 #endif
464           nswbuf_kva = lmax(lmin(nbuf / 4, 512), 16);
465 #ifdef NSWBUF_MIN
466           if (nswbuf_kva < NSWBUF_MIN)
467                     nswbuf_kva = NSWBUF_MIN;
468 #endif
469 
470           valloc(swbuf_mem, struct buf, nswbuf_mem);
471           valloc(swbuf_kva, struct buf, nswbuf_kva);
472           valloc(buf, struct buf, nbuf);
473 
474           /*
475            * End of first pass, size has been calculated so allocate memory
476            */
477           if (firstaddr == 0) {
478                     size = (vm_size_t)(v - firstaddr);
479                     firstaddr = kmem_alloc(kernel_map, round_page(size),
480                                                VM_SUBSYS_BUF);
481                     if (firstaddr == 0)
482                               panic("startup: no room for tables");
483                     goto again;
484           }
485 
486           /*
487            * End of second pass, addresses have been assigned
488            *
489            * nbuf is an int, make sure we don't overflow the field.
490            *
491            * On 64-bit systems we always reserve maximal allocations for
492            * buffer cache buffers and there are no fragmentation issues,
493            * so the KVA segment does not have to be excessively oversized.
494            */
495           if ((vm_size_t)(v - firstaddr) != size)
496                     panic("startup: table size inconsistency");
497 
498           kmem_suballoc(kernel_map, clean_map, &clean_sva, &clean_eva,
499                           ((vm_offset_t)(nbuf + 16) * MAXBSIZE) +
500                           ((nswbuf_mem + nswbuf_kva) * MAXPHYS) + pager_map_size);
501           kmem_suballoc(clean_map, buffer_map, &buffer_sva, &buffer_eva,
502                           ((vm_offset_t)(nbuf + 16) * MAXBSIZE));
503           buffer_map->system_map = 1;
504           kmem_suballoc(clean_map, pager_map, &pager_sva, &pager_eva,
505                           ((vm_offset_t)(nswbuf_mem + nswbuf_kva) * MAXPHYS) +
506                           pager_map_size);
507           pager_map->system_map = 1;
508           kprintf("avail memory = %ju (%ju MB)\n",
509                     (uintmax_t)ptoa(vmstats.v_free_count + vmstats.v_dma_pages),
510                     (uintmax_t)ptoa(vmstats.v_free_count + vmstats.v_dma_pages) /
511                     1024 / 1024);
512 }
513 
514 struct cpu_idle_stat {
515           int       hint;
516           int       reserved;
517           u_long    halt;
518           u_long    spin;
519           u_long    repeat;
520           u_long    repeat_last;
521           u_long    repeat_delta;
522           u_long    mwait_cx[CPU_MWAIT_CX_MAX];
523 } __cachealign;
524 
525 #define CPU_IDLE_STAT_HALT    -1
526 #define CPU_IDLE_STAT_SPIN    -2
527 
528 static struct cpu_idle_stat   cpu_idle_stats[MAXCPU];
529 
530 static int
sysctl_cpu_idle_cnt(SYSCTL_HANDLER_ARGS)531 sysctl_cpu_idle_cnt(SYSCTL_HANDLER_ARGS)
532 {
533           int idx = arg2, cpu, error;
534           u_long val = 0;
535 
536           if (idx == CPU_IDLE_STAT_HALT) {
537                     for (cpu = 0; cpu < ncpus; ++cpu)
538                               val += cpu_idle_stats[cpu].halt;
539           } else if (idx == CPU_IDLE_STAT_SPIN) {
540                     for (cpu = 0; cpu < ncpus; ++cpu)
541                               val += cpu_idle_stats[cpu].spin;
542           } else {
543                     KASSERT(idx >= 0 && idx < CPU_MWAIT_CX_MAX,
544                         ("invalid index %d", idx));
545                     for (cpu = 0; cpu < ncpus; ++cpu)
546                               val += cpu_idle_stats[cpu].mwait_cx[idx];
547           }
548 
549           error = sysctl_handle_quad(oidp, &val, 0, req);
550         if (error || req->newptr == NULL)
551                   return error;
552 
553           if (idx == CPU_IDLE_STAT_HALT) {
554                     for (cpu = 0; cpu < ncpus; ++cpu)
555                               cpu_idle_stats[cpu].halt = 0;
556                     cpu_idle_stats[0].halt = val;
557           } else if (idx == CPU_IDLE_STAT_SPIN) {
558                     for (cpu = 0; cpu < ncpus; ++cpu)
559                               cpu_idle_stats[cpu].spin = 0;
560                     cpu_idle_stats[0].spin = val;
561           } else {
562                     KASSERT(idx >= 0 && idx < CPU_MWAIT_CX_MAX,
563                         ("invalid index %d", idx));
564                     for (cpu = 0; cpu < ncpus; ++cpu)
565                               cpu_idle_stats[cpu].mwait_cx[idx] = 0;
566                     cpu_idle_stats[0].mwait_cx[idx] = val;
567           }
568           return 0;
569 }
570 
571 static void
cpu_mwait_attach(void)572 cpu_mwait_attach(void)
573 {
574           struct sbuf sb;
575           int hint_idx, i;
576 
577           if (!CPU_MWAIT_HAS_CX)
578                     return;
579 
580           if (cpu_vendor_id == CPU_VENDOR_INTEL &&
581               (CPUID_TO_FAMILY(cpu_id) > 0xf ||
582                (CPUID_TO_FAMILY(cpu_id) == 0x6 &&
583                 CPUID_TO_MODEL(cpu_id) >= 0xf))) {
584                     int bm_sts = 1;
585 
586                     /*
587                      * Pentium dual-core, Core 2 and beyond do not need any
588                      * additional activities to enter deep C-state, i.e. C3(+).
589                      */
590                     cpu_mwait_cx_no_bmarb();
591 
592                     TUNABLE_INT_FETCH("machdep.cpu.mwait.bm_sts", &bm_sts);
593                     if (!bm_sts)
594                               cpu_mwait_cx_no_bmsts();
595           }
596 
597           sbuf_new(&sb, cpu_mwait_cx_supported,
598               sizeof(cpu_mwait_cx_supported), SBUF_FIXEDLEN);
599 
600           for (i = 0; i < CPU_MWAIT_CX_MAX; ++i) {
601                     struct cpu_mwait_cx *cx = &cpu_mwait_cx_info[i];
602                     int sub;
603 
604                     ksnprintf(cx->name, sizeof(cx->name), "C%d", i);
605 
606                     sysctl_ctx_init(&cx->sysctl_ctx);
607                     cx->sysctl_tree = SYSCTL_ADD_NODE(&cx->sysctl_ctx,
608                         SYSCTL_STATIC_CHILDREN(_machdep_mwait), OID_AUTO,
609                         cx->name, CTLFLAG_RW, NULL, "Cx control/info");
610                     if (cx->sysctl_tree == NULL)
611                               continue;
612 
613                     cx->subcnt = CPUID_MWAIT_CX_SUBCNT(cpu_mwait_extemu, i);
614                     SYSCTL_ADD_INT(&cx->sysctl_ctx,
615                         SYSCTL_CHILDREN(cx->sysctl_tree), OID_AUTO,
616                         "subcnt", CTLFLAG_RD, &cx->subcnt, 0,
617                         "sub-state count");
618                     SYSCTL_ADD_PROC(&cx->sysctl_ctx,
619                         SYSCTL_CHILDREN(cx->sysctl_tree), OID_AUTO,
620                         "entered", (CTLTYPE_QUAD | CTLFLAG_RW), 0,
621                         i, sysctl_cpu_idle_cnt, "Q", "# of times entered");
622 
623                     for (sub = 0; sub < cx->subcnt; ++sub)
624                               sbuf_printf(&sb, "C%d/%d ", i, sub);
625           }
626           sbuf_trim(&sb);
627           sbuf_finish(&sb);
628 
629           /*
630            * Non-deep C-states
631            */
632           cpu_mwait_c1_hints_cnt = cpu_mwait_cx_info[CPU_MWAIT_C1].subcnt;
633           for (i = CPU_MWAIT_C1; i < CPU_MWAIT_C3; ++i)
634                     cpu_mwait_hints_cnt += cpu_mwait_cx_info[i].subcnt;
635           cpu_mwait_hints = kmalloc(sizeof(int) * cpu_mwait_hints_cnt,
636                                           M_DEVBUF, M_WAITOK);
637 
638           hint_idx = 0;
639           for (i = CPU_MWAIT_C1; i < CPU_MWAIT_C3; ++i) {
640                     int j, subcnt;
641 
642                     subcnt = cpu_mwait_cx_info[i].subcnt;
643                     for (j = 0; j < subcnt; ++j) {
644                               KASSERT(hint_idx < cpu_mwait_hints_cnt,
645                                   ("invalid mwait hint index %d", hint_idx));
646                               cpu_mwait_hints[hint_idx] = MWAIT_EAX_HINT(i, j);
647                               ++hint_idx;
648                     }
649           }
650           KASSERT(hint_idx == cpu_mwait_hints_cnt,
651               ("mwait hint count %d != index %d",
652                cpu_mwait_hints_cnt, hint_idx));
653 
654           if (bootverbose) {
655                     kprintf("MWAIT hints (%d C1 hints):\n", cpu_mwait_c1_hints_cnt);
656                     for (i = 0; i < cpu_mwait_hints_cnt; ++i) {
657                               int hint = cpu_mwait_hints[i];
658 
659                               kprintf("  C%d/%d hint 0x%04x\n",
660                                   MWAIT_EAX_TO_CX(hint), MWAIT_EAX_TO_CX_SUB(hint),
661                                   hint);
662                     }
663           }
664 
665           /*
666            * Deep C-states
667            */
668           for (i = CPU_MWAIT_C1; i < CPU_MWAIT_CX_MAX; ++i)
669                     cpu_mwait_deep_hints_cnt += cpu_mwait_cx_info[i].subcnt;
670           cpu_mwait_deep_hints = kmalloc(sizeof(int) * cpu_mwait_deep_hints_cnt,
671               M_DEVBUF, M_WAITOK);
672 
673           hint_idx = 0;
674           for (i = CPU_MWAIT_C1; i < CPU_MWAIT_CX_MAX; ++i) {
675                     int j, subcnt;
676 
677                     subcnt = cpu_mwait_cx_info[i].subcnt;
678                     for (j = 0; j < subcnt; ++j) {
679                               KASSERT(hint_idx < cpu_mwait_deep_hints_cnt,
680                                   ("invalid mwait deep hint index %d", hint_idx));
681                               cpu_mwait_deep_hints[hint_idx] = MWAIT_EAX_HINT(i, j);
682                               ++hint_idx;
683                     }
684           }
685           KASSERT(hint_idx == cpu_mwait_deep_hints_cnt,
686               ("mwait deep hint count %d != index %d",
687                cpu_mwait_deep_hints_cnt, hint_idx));
688 
689           if (bootverbose) {
690                     kprintf("MWAIT deep hints:\n");
691                     for (i = 0; i < cpu_mwait_deep_hints_cnt; ++i) {
692                               int hint = cpu_mwait_deep_hints[i];
693 
694                               kprintf("  C%d/%d hint 0x%04x\n",
695                                   MWAIT_EAX_TO_CX(hint), MWAIT_EAX_TO_CX_SUB(hint),
696                                   hint);
697                     }
698           }
699           cpu_idle_repeat_max = 256 * cpu_mwait_deep_hints_cnt;
700 
701           for (i = 0; i < ncpus; ++i) {
702                     char name[16];
703 
704                     ksnprintf(name, sizeof(name), "idle%d", i);
705                     SYSCTL_ADD_PROC(NULL,
706                         SYSCTL_STATIC_CHILDREN(_machdep_mwait_CX), OID_AUTO,
707                         name, (CTLTYPE_STRING | CTLFLAG_RW), &cpu_idle_stats[i],
708                         0, cpu_mwait_cx_pcpu_idle_sysctl, "A", "");
709           }
710 }
711 
712 static void
cpu_finish(void * dummy __unused)713 cpu_finish(void *dummy __unused)
714 {
715           cpu_setregs();
716           cpu_mwait_attach();
717 }
718 
719 static void
pic_finish(void * dummy __unused)720 pic_finish(void *dummy __unused)
721 {
722           /* Log ELCR information */
723           elcr_dump();
724 
725           /* Log MPTABLE information */
726           mptable_pci_int_dump();
727 
728           /* Finalize PCI */
729           MachIntrABI.finalize();
730 }
731 
732 /*
733  * Send an interrupt to process.
734  *
735  * Stack is set up to allow sigcode stored
736  * at top to call routine, followed by kcall
737  * to sigreturn routine below.  After sigreturn
738  * resets the signal mask, the stack, and the
739  * frame pointer, it returns to the user
740  * specified pc, psl.
741  */
742 void
sendsig(sig_t catcher,int sig,sigset_t * mask,u_long code)743 sendsig(sig_t catcher, int sig, sigset_t *mask, u_long code)
744 {
745           struct lwp *lp = curthread->td_lwp;
746           struct proc *p = lp->lwp_proc;
747           struct trapframe *regs;
748           struct sigacts *psp = p->p_sigacts;
749           struct sigframe sf, *sfp;
750           int oonstack;
751           char *sp;
752 
753           regs = lp->lwp_md.md_regs;
754           oonstack = (lp->lwp_sigstk.ss_flags & SS_ONSTACK) ? 1 : 0;
755 
756           /* Save user context */
757           bzero(&sf, sizeof(struct sigframe));
758           sf.sf_uc.uc_sigmask = *mask;
759           sf.sf_uc.uc_stack = lp->lwp_sigstk;
760           sf.sf_uc.uc_mcontext.mc_onstack = oonstack;
761           KKASSERT(__offsetof(struct trapframe, tf_rdi) == 0);
762           /* gcc errors out on optimized bcopy */
763           _bcopy(regs, &sf.sf_uc.uc_mcontext.mc_rdi, sizeof(struct trapframe));
764 
765           /* Make the size of the saved context visible to userland */
766           sf.sf_uc.uc_mcontext.mc_len = sizeof(sf.sf_uc.uc_mcontext);
767 
768           /* Allocate and validate space for the signal handler context. */
769         if ((lp->lwp_flags & LWP_ALTSTACK) != 0 && !oonstack &&
770               SIGISMEMBER(psp->ps_sigonstack, sig)) {
771                     sp = (char *)lp->lwp_sigstk.ss_sp + lp->lwp_sigstk.ss_size -
772                         sizeof(struct sigframe);
773                     lp->lwp_sigstk.ss_flags |= SS_ONSTACK;
774           } else {
775                     /* We take red zone into account */
776                     sp = (char *)regs->tf_rsp - sizeof(struct sigframe) - 128;
777           }
778 
779           /*
780            * XXX AVX needs 64-byte alignment but sigframe has other fields and
781            * the embedded ucontext is not at the front, so aligning this won't
782            * help us.  Fortunately we bcopy in/out of the sigframe, so the
783            * kernel is ok.
784            *
785            * The problem though is if userland winds up trying to use the
786            * context directly.
787            */
788           sfp = (struct sigframe *)((intptr_t)sp & ~(intptr_t)0xF);
789 
790           /* Translate the signal is appropriate */
791           if (p->p_sysent->sv_sigtbl) {
792                     if (sig <= p->p_sysent->sv_sigsize)
793                               sig = p->p_sysent->sv_sigtbl[_SIG_IDX(sig)];
794           }
795 
796           /*
797            * Build the argument list for the signal handler.
798            *
799            * Arguments are in registers (%rdi, %rsi, %rdx, %rcx)
800            */
801           regs->tf_rdi = sig;                               /* argument 1 */
802           regs->tf_rdx = (register_t)&sfp->sf_uc;           /* argument 3 */
803 
804           if (SIGISMEMBER(psp->ps_siginfo, sig)) {
805                     /*
806                      * Signal handler installed with SA_SIGINFO.
807                      *
808                      * action(signo, siginfo, ucontext)
809                      */
810                     regs->tf_rsi = (register_t)&sfp->sf_si; /* argument 2 */
811                     regs->tf_rcx = (register_t)regs->tf_addr; /* argument 4 */
812                     sf.sf_ahu.sf_action = (__siginfohandler_t *)catcher;
813 
814                     /* fill siginfo structure */
815                     sf.sf_si.si_signo = sig;
816                     sf.sf_si.si_pid = psp->ps_frominfo[sig].pid;
817                     sf.sf_si.si_uid = psp->ps_frominfo[sig].uid;
818                     sf.sf_si.si_code = code;
819                     sf.sf_si.si_addr = (void *)regs->tf_addr;
820           } else {
821                     /*
822                      * Old FreeBSD-style arguments.
823                      *
824                      * handler (signo, code, [uc], addr)
825                      */
826                     regs->tf_rsi = (register_t)code;        /* argument 2 */
827                     regs->tf_rcx = (register_t)regs->tf_addr; /* argument 4 */
828                     sf.sf_ahu.sf_handler = catcher;
829           }
830 
831           /*
832            * If we're a vm86 process, we want to save the segment registers.
833            * We also change eflags to be our emulated eflags, not the actual
834            * eflags.
835            */
836 #if 0 /* JG */
837           if (regs->tf_eflags & PSL_VM) {
838                     struct trapframe_vm86 *tf = (struct trapframe_vm86 *)regs;
839                     struct vm86_kernel *vm86 = &lp->lwp_thread->td_pcb->pcb_ext->ext_vm86;
840 
841                     sf.sf_uc.uc_mcontext.mc_gs = tf->tf_vm86_gs;
842                     sf.sf_uc.uc_mcontext.mc_fs = tf->tf_vm86_fs;
843                     sf.sf_uc.uc_mcontext.mc_es = tf->tf_vm86_es;
844                     sf.sf_uc.uc_mcontext.mc_ds = tf->tf_vm86_ds;
845 
846                     if (vm86->vm86_has_vme == 0)
847                               sf.sf_uc.uc_mcontext.mc_eflags =
848                                   (tf->tf_eflags & ~(PSL_VIF | PSL_VIP)) |
849                                   (vm86->vm86_eflags & (PSL_VIF | PSL_VIP));
850 
851                     /*
852                      * Clear PSL_NT to inhibit T_TSSFLT faults on return from
853                      * syscalls made by the signal handler.  This just avoids
854                      * wasting time for our lazy fixup of such faults.  PSL_NT
855                      * does nothing in vm86 mode, but vm86 programs can set it
856                      * almost legitimately in probes for old cpu types.
857                      */
858                     tf->tf_eflags &= ~(PSL_VM | PSL_NT | PSL_VIF | PSL_VIP);
859           }
860 #endif
861 
862           /*
863            * Save the FPU state and reinit the FP unit
864            */
865           npxpush(&sf.sf_uc.uc_mcontext);
866 
867           /*
868            * Copy the sigframe out to the user's stack.
869            */
870           if (copyout(&sf, sfp, sizeof(struct sigframe)) != 0) {
871                     /*
872                      * Something is wrong with the stack pointer.
873                      * ...Kill the process.
874                      */
875                     sigexit(lp, SIGILL);
876           }
877 
878           regs->tf_rsp = (register_t)sfp;
879           regs->tf_rip = trunc_page64(PS_STRINGS - *(p->p_sysent->sv_szsigcode));
880           regs->tf_rip -= SZSIGCODE_EXTRA_BYTES;
881 
882           /*
883            * x86 abi specifies that the direction flag must be cleared
884            * on function entry
885            */
886           regs->tf_rflags &= ~(PSL_T | PSL_D);
887 
888           /*
889            * 64 bit mode has a code and stack selector but
890            * no data or extra selector.  %fs and %gs are not
891            * stored in-context.
892            */
893           regs->tf_cs = _ucodesel;
894           regs->tf_ss = _udatasel;
895           clear_quickret();
896 }
897 
898 /*
899  * Sanitize the trapframe for a virtual kernel passing control to a custom
900  * VM context.  Remove any items that would otherwise create a privilage
901  * issue.
902  *
903  * XXX at the moment we allow userland to set the resume flag.  Is this a
904  * bad idea?
905  */
906 int
cpu_sanitize_frame(struct trapframe * frame)907 cpu_sanitize_frame(struct trapframe *frame)
908 {
909           frame->tf_cs = _ucodesel;
910           frame->tf_ss = _udatasel;
911           /* XXX VM (8086) mode not supported? */
912           frame->tf_rflags &= (PSL_RF | PSL_USERCHANGE | PSL_VM_UNSUPP);
913           frame->tf_rflags |= PSL_RESERVED_DEFAULT | PSL_I;
914 
915           return(0);
916 }
917 
/*
 * Sanitize the tls so loading the descriptor does not blow up
 * on us.  For x86_64 we don't have to do anything, so the argument
 * is left untouched and 0 is returned.
 */
int
cpu_sanitize_tls(struct savetls *tls)
{
	(void)tls;

	return (0);
}
927 
928 /*
929  * sigreturn(ucontext_t *sigcntxp)
930  *
931  * System call to cleanup state after a signal
932  * has been taken.  Reset signal mask and
933  * stack state from context left by sendsig (above).
934  * Return to previous pc and psl as specified by
935  * context left by sendsig. Check carefully to
936  * make sure that the user has not modified the
937  * state to gain improper privileges.
938  *
939  * MPSAFE
940  */
941 #define   EFL_SECURE(ef, oef) ((((ef) ^ (oef)) & ~PSL_USERCHANGE) == 0)
942 #define   CS_SECURE(cs)                 (ISPL(cs) == SEL_UPL)
943 
944 int
sys_sigreturn(struct sysmsg * sysmsg,const struct sigreturn_args * uap)945 sys_sigreturn(struct sysmsg *sysmsg, const struct sigreturn_args *uap)
946 {
947           struct lwp *lp = curthread->td_lwp;
948           struct trapframe *regs;
949           ucontext_t uc;
950           ucontext_t *ucp;
951           register_t rflags;
952           int cs;
953           int error;
954 
955           /*
956            * We have to copy the information into kernel space so userland
957            * can't modify it while we are sniffing it.
958            */
959           regs = lp->lwp_md.md_regs;
960           error = copyin(uap->sigcntxp, &uc, sizeof(uc));
961           if (error)
962                     return (error);
963           ucp = &uc;
964           rflags = ucp->uc_mcontext.mc_rflags;
965 
966           /* VM (8086) mode not supported */
967           rflags &= ~PSL_VM_UNSUPP;
968 
969 #if 0 /* JG */
970           if (eflags & PSL_VM) {
971                     struct trapframe_vm86 *tf = (struct trapframe_vm86 *)regs;
972                     struct vm86_kernel *vm86;
973 
974                     /*
975                      * if pcb_ext == 0 or vm86_inited == 0, the user hasn't
976                      * set up the vm86 area, and we can't enter vm86 mode.
977                      */
978                     if (lp->lwp_thread->td_pcb->pcb_ext == 0)
979                               return (EINVAL);
980                     vm86 = &lp->lwp_thread->td_pcb->pcb_ext->ext_vm86;
981                     if (vm86->vm86_inited == 0)
982                               return (EINVAL);
983 
984                     /* go back to user mode if both flags are set */
985                     if ((eflags & PSL_VIP) && (eflags & PSL_VIF))
986                               trapsignal(lp, SIGBUS, 0);
987 
988                     if (vm86->vm86_has_vme) {
989                               eflags = (tf->tf_eflags & ~VME_USERCHANGE) |
990                                   (eflags & VME_USERCHANGE) | PSL_VM;
991                     } else {
992                               vm86->vm86_eflags = eflags;   /* save VIF, VIP */
993                               eflags = (tf->tf_eflags & ~VM_USERCHANGE) |
994                                   (eflags & VM_USERCHANGE) | PSL_VM;
995                     }
996                     bcopy(&ucp->uc_mcontext.mc_gs, tf, sizeof(struct trapframe));
997                     tf->tf_eflags = eflags;
998                     tf->tf_vm86_ds = tf->tf_ds;
999                     tf->tf_vm86_es = tf->tf_es;
1000                     tf->tf_vm86_fs = tf->tf_fs;
1001                     tf->tf_vm86_gs = tf->tf_gs;
1002                     tf->tf_ds = _udatasel;
1003                     tf->tf_es = _udatasel;
1004                     tf->tf_fs = _udatasel;
1005                     tf->tf_gs = _udatasel;
1006           } else
1007 #endif
1008           {
1009                     /*
1010                      * Don't allow users to change privileged or reserved flags.
1011                      */
1012                     /*
1013                      * XXX do allow users to change the privileged flag PSL_RF.
1014                      * The cpu sets PSL_RF in tf_eflags for faults.  Debuggers
1015                      * should sometimes set it there too.  tf_eflags is kept in
1016                      * the signal context during signal handling and there is no
1017                      * other place to remember it, so the PSL_RF bit may be
1018                      * corrupted by the signal handler without us knowing.
1019                      * Corruption of the PSL_RF bit at worst causes one more or
1020                      * one less debugger trap, so allowing it is fairly harmless.
1021                      */
1022                     if (!EFL_SECURE(rflags & ~PSL_RF, regs->tf_rflags & ~PSL_RF)) {
1023                               kprintf("sigreturn: rflags = 0x%lx\n", (long)rflags);
1024                               return(EINVAL);
1025                     }
1026 
1027                     /*
1028                      * Don't allow users to load a valid privileged %cs.  Let the
1029                      * hardware check for invalid selectors, excess privilege in
1030                      * other selectors, invalid %eip's and invalid %esp's.
1031                      */
1032                     cs = ucp->uc_mcontext.mc_cs;
1033                     if (!CS_SECURE(cs)) {
1034                               kprintf("sigreturn: cs = 0x%x\n", cs);
1035                               trapsignal(lp, SIGBUS, T_PROTFLT);
1036                               return(EINVAL);
1037                     }
1038                     /* gcc errors out on optimized bcopy */
1039                     _bcopy(&ucp->uc_mcontext.mc_rdi, regs,
1040                            sizeof(struct trapframe));
1041           }
1042 
1043           /*
1044            * Restore the FPU state from the frame
1045            */
1046           crit_enter();
1047           npxpop(&ucp->uc_mcontext);
1048 
1049           if (ucp->uc_mcontext.mc_onstack & 1)
1050                     lp->lwp_sigstk.ss_flags |= SS_ONSTACK;
1051           else
1052                     lp->lwp_sigstk.ss_flags &= ~SS_ONSTACK;
1053 
1054           lp->lwp_sigmask = ucp->uc_sigmask;
1055           SIG_CANTMASK(lp->lwp_sigmask);
1056           clear_quickret();
1057           crit_exit();
1058           return(EJUSTRETURN);
1059 }
1060 
/*
 * Machine dependent boot() routine
 *
 * Currently a no-op: nothing machine dependent has been needed here yet.
 * Possibly some stuff might be grafted back here from boot().
 */
void
cpu_boot(int howto)
{
	(void)howto;
}
1071 
/*
 * Shutdown the CPU as much as possible.
 *
 * Executes HLT in an endless loop; this function never returns.
 */
void
cpu_halt(void)
{
	for (;;) {
		__asm__ __volatile("hlt");
	}
}
1081 
1082 /*
1083  * cpu_idle() represents the idle LWKT.  You cannot return from this function
1084  * (unless you want to blow things up!).  Instead we look for runnable threads
1085  * and loop or halt as appropriate.  Giant is not held on entry to the thread.
1086  *
1087  * The main loop is entered with a critical section held, we must release
1088  * the critical section before doing anything else.  lwkt_switch() will
1089  * check for pending interrupts due to entering and exiting its own
1090  * critical section.
1091  *
1092  * NOTE: On an SMP system we rely on a scheduler IPI to wake a HLTed cpu up.
1093  *         However, there are cases where the idlethread will be entered with
1094  *         the possibility that no IPI will occur and in such cases
1095  *         lwkt_switch() sets TDF_IDLE_NOHLT.
1096  *
1097  * NOTE: cpu_idle_repeat determines how many entries into the idle thread
1098  *         must occur before it starts using ACPI halt.
1099  *
1100  * NOTE: Value overridden in hammer_time().
1101  */
1102 static int          cpu_idle_hlt = 2;
1103 SYSCTL_INT(_machdep, OID_AUTO, cpu_idle_hlt, CTLFLAG_RW,
1104     &cpu_idle_hlt, 0, "Idle loop HLT enable");
1105 SYSCTL_INT(_machdep, OID_AUTO, cpu_idle_repeat, CTLFLAG_RW,
1106     &cpu_idle_repeat, 0, "Idle entries before acpi hlt");
1107 
1108 SYSCTL_PROC(_machdep, OID_AUTO, cpu_idle_hltcnt, (CTLTYPE_QUAD | CTLFLAG_RW),
1109     0, CPU_IDLE_STAT_HALT, sysctl_cpu_idle_cnt, "Q", "Idle loop entry halts");
1110 SYSCTL_PROC(_machdep, OID_AUTO, cpu_idle_spincnt, (CTLTYPE_QUAD | CTLFLAG_RW),
1111     0, CPU_IDLE_STAT_SPIN, sysctl_cpu_idle_cnt, "Q", "Idle loop entry spins");
1112 
/*
 * Default idle hook: enable interrupts and halt.
 */
static void
cpu_idle_default_hook(void)
{
	/*
	 * Both instructions are issued from a single asm statement
	 * because we must guarantee that hlt is exactly the instruction
	 * following the sti.
	 */
	__asm __volatile("sti; hlt");
}
1122 
1123 /* Other subsystems (e.g., ACPI) can hook this later. */
1124 void (*cpu_idle_hook)(void) = cpu_idle_default_hook;
1125 
1126 static __inline int
cpu_mwait_cx_hint(struct cpu_idle_stat * stat)1127 cpu_mwait_cx_hint(struct cpu_idle_stat *stat)
1128 {
1129           int hint, cx_idx;
1130           u_int idx;
1131 
1132           hint = stat->hint;
1133           if (hint >= 0)
1134                     goto done;
1135 
1136           idx = (stat->repeat + stat->repeat_last + stat->repeat_delta) >>
1137               cpu_mwait_repeat_shift;
1138           if (idx >= cpu_mwait_c1_hints_cnt) {
1139                     /* Step up faster, once we walked through all C1 states */
1140                     stat->repeat_delta += 1 << (cpu_mwait_repeat_shift + 1);
1141           }
1142           if (hint == CPU_MWAIT_HINT_AUTODEEP) {
1143                     if (idx >= cpu_mwait_deep_hints_cnt)
1144                               idx = cpu_mwait_deep_hints_cnt - 1;
1145                     hint = cpu_mwait_deep_hints[idx];
1146           } else {
1147                     if (idx >= cpu_mwait_hints_cnt)
1148                               idx = cpu_mwait_hints_cnt - 1;
1149                     hint = cpu_mwait_hints[idx];
1150           }
1151 done:
1152           cx_idx = MWAIT_EAX_TO_CX(hint);
1153           if (cx_idx >= 0 && cx_idx < CPU_MWAIT_CX_MAX)
1154                     stat->mwait_cx[cx_idx]++;
1155           return hint;
1156 }
1157 
1158 void
cpu_idle(void)1159 cpu_idle(void)
1160 {
1161           globaldata_t gd = mycpu;
1162           struct cpu_idle_stat *stat = &cpu_idle_stats[gd->gd_cpuid];
1163           struct thread *td __debugvar = gd->gd_curthread;
1164           int reqflags;
1165 
1166           stat->repeat = stat->repeat_last = cpu_idle_repeat_max;
1167 
1168           crit_exit();
1169           KKASSERT(td->td_critcount == 0);
1170 
1171           for (;;) {
1172                     /*
1173                      * See if there are any LWKTs ready to go.
1174                      */
1175                     lwkt_switch();
1176 
1177                     /*
1178                      * When halting inside a cli we must check for reqflags
1179                      * races, particularly [re]schedule requests.  Running
1180                      * splz() does the job.
1181                      *
1182                      * cpu_idle_hlt:
1183                      *        0         Never halt, just spin
1184                      *
1185                      *        1         Always use MONITOR/MWAIT if avail, HLT
1186                      *                  otherwise.
1187                      *
1188                      *                  Better default for modern (Haswell+) Intel
1189                      *                  cpus.
1190                      *
1191                      *        2         Use HLT/MONITOR/MWAIT up to a point and then
1192                      *                  use the ACPI halt (default).  This is a hybrid
1193                      *                  approach.  See machdep.cpu_idle_repeat.
1194                      *
1195                      *                  Better default for modern AMD cpus and older
1196                      *                  Intel cpus.
1197                      *
1198                      *        3         Always use the ACPI halt.  This typically
1199                      *                  eats the least amount of power but the cpu
1200                      *                  will be slow waking up.  Slows down e.g.
1201                      *                  compiles and other pipe/event oriented stuff.
1202                      *
1203                      *                  Usually the best default for AMD cpus.
1204                      *
1205                      *        4         Always use HLT.
1206                      *
1207                      *        5         Always spin.
1208                      *
1209                      * NOTE: Interrupts are enabled and we are not in a critical
1210                      *         section.
1211                      *
1212                      * NOTE: Preemptions do not reset gd_idle_repeat.   Also we
1213                      *         don't bother capping gd_idle_repeat, it is ok if
1214                      *         it overflows (we do make it unsigned, however).
1215                      *
1216                      * Implement optimized invltlb operations when halted
1217                      * in idle.  By setting the bit in smp_idleinvl_mask
1218                      * we inform other cpus that they can set _reqs to
1219                      * request an invltlb.  Current the code to do that
1220                      * sets the bits in _reqs anyway, but then check _mask
1221                      * to determine if they can assume the invltlb will execute.
1222                      *
1223                      * A critical section is required to ensure that interrupts
1224                      * do not fully run until after we've had a chance to execute
1225                      * the request.
1226                      */
1227                     if (gd->gd_idle_repeat == 0) {
1228                               stat->repeat = (stat->repeat + stat->repeat_last) >> 1;
1229                               if (stat->repeat > cpu_idle_repeat_max)
1230                                         stat->repeat = cpu_idle_repeat_max;
1231                               stat->repeat_last = 0;
1232                               stat->repeat_delta = 0;
1233                     }
1234                     ++stat->repeat_last;
1235 
1236                     /*
1237                      * General idle thread halt code
1238                      *
1239                      * IBRS NOTES - IBRS is a SPECTRE mitigation.  When going
1240                      *                  idle, disable IBRS to reduce hyperthread
1241                      *                  overhead.
1242                      */
1243                     ++gd->gd_idle_repeat;
1244 
1245                     switch(cpu_idle_hlt) {
1246                     default:
1247                     case 0:
1248                               /*
1249                                * Always spin
1250                                */
1251                               ;
1252 do_spin:
1253                               splz();
1254                               __asm __volatile("sti");
1255                               stat->spin++;
1256                               crit_enter_gd(gd);
1257                               crit_exit_gd(gd);
1258                               break;
1259                     case 2:
1260                               /*
1261                                * Use MONITOR/MWAIT (or HLT) for a few cycles,
1262                                * then start using the ACPI halt code if we
1263                                * continue to be idle.
1264                                */
1265                               if (gd->gd_idle_repeat >= cpu_idle_repeat)
1266                                         goto do_acpi;
1267                               /* FALL THROUGH */
1268                     case 1:
1269                               /*
1270                                * Always use MONITOR/MWAIT (will use HLT if
1271                                * MONITOR/MWAIT not available).
1272                                */
1273                               if (cpu_mi_feature & CPU_MI_MONITOR) {
1274                                         splz(); /* XXX */
1275                                         reqflags = gd->gd_reqflags;
1276                                         if (reqflags & RQF_IDLECHECK_WK_MASK)
1277                                                   goto do_spin;
1278                                         crit_enter_gd(gd);
1279                                         ATOMIC_CPUMASK_ORBIT(smp_idleinvl_mask, gd->gd_cpuid);
1280                                         /*
1281                                          * IBRS/STIBP
1282                                          */
1283                                         if (pscpu->trampoline.tr_pcb_spec_ctrl[1] &
1284                                             SPEC_CTRL_DUMMY_ENABLE) {
1285                                                   wrmsr(MSR_SPEC_CTRL, pscpu->trampoline.tr_pcb_spec_ctrl[1] & (SPEC_CTRL_IBRS|SPEC_CTRL_STIBP));
1286                                         }
1287                                         cpu_mmw_pause_int(&gd->gd_reqflags, reqflags,
1288                                                               cpu_mwait_cx_hint(stat), 0);
1289                                         if (pscpu->trampoline.tr_pcb_spec_ctrl[0] &
1290                                             SPEC_CTRL_DUMMY_ENABLE) {
1291                                                   wrmsr(MSR_SPEC_CTRL, pscpu->trampoline.tr_pcb_spec_ctrl[0] & (SPEC_CTRL_IBRS|SPEC_CTRL_STIBP));
1292                                         }
1293                                         stat->halt++;
1294                                         ATOMIC_CPUMASK_NANDBIT(smp_idleinvl_mask, gd->gd_cpuid);
1295                                         if (ATOMIC_CPUMASK_TESTANDCLR(smp_idleinvl_reqs,
1296                                                                             gd->gd_cpuid)) {
1297                                                   cpu_invltlb();
1298                                                   cpu_mfence();
1299                                         }
1300                                         crit_exit_gd(gd);
1301                                         break;
1302                               }
1303                               /* FALLTHROUGH */
1304                     case 4:
1305                               /*
1306                                * Use HLT
1307                                */
1308                               __asm __volatile("cli");
1309                               splz();
1310                               crit_enter_gd(gd);
1311                               if ((gd->gd_reqflags & RQF_IDLECHECK_WK_MASK) == 0) {
1312                                         ATOMIC_CPUMASK_ORBIT(smp_idleinvl_mask,
1313                                                                  gd->gd_cpuid);
1314                                         if (pscpu->trampoline.tr_pcb_spec_ctrl[1] &
1315                                             SPEC_CTRL_DUMMY_ENABLE) {
1316                                                   wrmsr(MSR_SPEC_CTRL, pscpu->trampoline.tr_pcb_spec_ctrl[1] & (SPEC_CTRL_IBRS|SPEC_CTRL_STIBP));
1317                                         }
1318                                         cpu_idle_default_hook();
1319                                         if (pscpu->trampoline.tr_pcb_spec_ctrl[0] &
1320                                             SPEC_CTRL_DUMMY_ENABLE) {
1321                                                   wrmsr(MSR_SPEC_CTRL, pscpu->trampoline.tr_pcb_spec_ctrl[0] & (SPEC_CTRL_IBRS|SPEC_CTRL_STIBP));
1322                                         }
1323                                         ATOMIC_CPUMASK_NANDBIT(smp_idleinvl_mask,
1324                                                                    gd->gd_cpuid);
1325                                         if (ATOMIC_CPUMASK_TESTANDCLR(smp_idleinvl_reqs,
1326                                                                             gd->gd_cpuid)) {
1327                                                   cpu_invltlb();
1328                                                   cpu_mfence();
1329                                         }
1330                               }
1331                               __asm __volatile("sti");
1332                               stat->halt++;
1333                               crit_exit_gd(gd);
1334                               break;
1335                     case 3:
1336                               /*
1337                                * Use ACPI halt
1338                                */
1339                               ;
1340 do_acpi:
1341                               __asm __volatile("cli");
1342                               splz();
1343                               crit_enter_gd(gd);
1344                               if ((gd->gd_reqflags & RQF_IDLECHECK_WK_MASK) == 0) {
1345                                         ATOMIC_CPUMASK_ORBIT(smp_idleinvl_mask,
1346                                                                  gd->gd_cpuid);
1347                                         if (pscpu->trampoline.tr_pcb_spec_ctrl[1] &
1348                                             SPEC_CTRL_DUMMY_ENABLE) {
1349                                                   wrmsr(MSR_SPEC_CTRL, pscpu->trampoline.tr_pcb_spec_ctrl[1] & (SPEC_CTRL_IBRS|SPEC_CTRL_STIBP));
1350                                         }
1351                                         cpu_idle_hook();
1352                                         if (pscpu->trampoline.tr_pcb_spec_ctrl[0] &
1353                                             SPEC_CTRL_DUMMY_ENABLE) {
1354                                                   wrmsr(MSR_SPEC_CTRL, pscpu->trampoline.tr_pcb_spec_ctrl[0] & (SPEC_CTRL_IBRS|SPEC_CTRL_STIBP));
1355                                         }
1356                                         ATOMIC_CPUMASK_NANDBIT(smp_idleinvl_mask,
1357                                                                    gd->gd_cpuid);
1358                                         if (ATOMIC_CPUMASK_TESTANDCLR(smp_idleinvl_reqs,
1359                                                                             gd->gd_cpuid)) {
1360                                                   cpu_invltlb();
1361                                                   cpu_mfence();
1362                                         }
1363                               }
1364                               __asm __volatile("sti");
1365                               stat->halt++;
1366                               crit_exit_gd(gd);
1367                               break;
1368                     }
1369           }
1370 }
1371 
1372 /*
1373  * Called from deep ACPI via cpu_idle_hook() (see above) to actually halt
1374  * the cpu in C1.  ACPI might use other halt methods for deeper states
1375  * and not reach here.
1376  *
1377  * For now we always use HLT as we are not sure what ACPI may have actually
1378  * done.  MONITOR/MWAIT might not be appropriate.
1379  *
1380  * NOTE: MONITOR/MWAIT does not appear to throttle AMD cpus, while HLT
1381  *         does.  On Intel, MONITOR/MWAIT does appear to throttle the cpu.
1382  */
1383 void
cpu_idle_halt(void)1384 cpu_idle_halt(void)
1385 {
1386           globaldata_t gd;
1387 
1388           gd = mycpu;
1389 #if 0
1390           /* DISABLED FOR NOW */
1391           struct cpu_idle_stat *stat;
1392           int reqflags;
1393 
1394 
1395           if ((cpu_idle_hlt == 1 || cpu_idle_hlt == 2) &&
1396               (cpu_mi_feature & CPU_MI_MONITOR) &&
1397               cpu_vendor_id != CPU_VENDOR_AMD) {
1398                     /*
1399                      * Use MONITOR/MWAIT
1400                      *
1401                      * (NOTE: On ryzen, MWAIT does not throttle clocks, so we
1402                      *          have to use HLT)
1403                      */
1404                     stat = &cpu_idle_stats[gd->gd_cpuid];
1405                     reqflags = gd->gd_reqflags;
1406                     if ((reqflags & RQF_IDLECHECK_WK_MASK) == 0) {
1407                               __asm __volatile("sti");
1408                               cpu_mmw_pause_int(&gd->gd_reqflags, reqflags,
1409                                                     cpu_mwait_cx_hint(stat), 0);
1410                     } else {
1411                               __asm __volatile("sti; pause");
1412                     }
1413           } else
1414 #endif
1415           {
1416                     /*
1417                      * Use HLT
1418                      */
1419                     if ((gd->gd_reqflags & RQF_IDLECHECK_WK_MASK) == 0)
1420                               __asm __volatile("sti; hlt");
1421                     else
1422                               __asm __volatile("sti; pause");
1423           }
1424 }
1425 
1426 
1427 /*
1428  * Called in a loop indirectly via Xcpustop
1429  */
1430 void
cpu_smp_stopped(void)1431 cpu_smp_stopped(void)
1432 {
1433           globaldata_t gd = mycpu;
1434           volatile __uint64_t *ptr;
1435           __uint64_t ovalue;
1436 
1437           ptr = CPUMASK_ADDR(started_cpus, gd->gd_cpuid);
1438           ovalue = *ptr;
1439           if ((ovalue & CPUMASK_SIMPLE(gd->gd_cpuid & 63)) == 0) {
1440                     if (cpu_mi_feature & CPU_MI_MONITOR) {
1441                               if (cpu_mwait_hints) {
1442                                         cpu_mmw_pause_long(__DEVOLATILE(void *, ptr),
1443                                                      ovalue,
1444                                                      cpu_mwait_hints[
1445                                                             cpu_mwait_hints_cnt - 1], 0);
1446                               } else {
1447                                         cpu_mmw_pause_long(__DEVOLATILE(void *, ptr),
1448                                                      ovalue, 0, 0);
1449                               }
1450                     } else {
1451                               cpu_halt();         /* depend on lapic timer */
1452                     }
1453           }
1454 }
1455 
/*
 * This routine is called if a spinlock has been held through the
 * exponential backoff period and is seriously contested.  On a real cpu
 * we let it spin.
 *
 * Takes no arguments and returns nothing; the only action performed here
 * is a single cpu_pause().
 */
void
cpu_spinlock_contested(void)
{
	cpu_pause();
}
1466 
1467 /*
1468  * Clear registers on exec
1469  */
1470 void
exec_setregs(u_long entry,u_long stack,u_long ps_strings)1471 exec_setregs(u_long entry, u_long stack, u_long ps_strings)
1472 {
1473           struct thread *td = curthread;
1474           struct lwp *lp = td->td_lwp;
1475           struct pcb *pcb = td->td_pcb;
1476           struct trapframe *regs = lp->lwp_md.md_regs;
1477 
1478           user_ldt_free(pcb);
1479 
1480           clear_quickret();
1481           bzero((char *)regs, sizeof(struct trapframe));
1482           regs->tf_rip = entry;
1483           regs->tf_rsp = ((stack - 8) & ~0xFul) + 8; /* align the stack */
1484           regs->tf_rdi = stack;                   /* argv */
1485           regs->tf_rflags = PSL_USER | (regs->tf_rflags & PSL_T);
1486           regs->tf_ss = _udatasel;
1487           regs->tf_cs = _ucodesel;
1488           regs->tf_rbx = ps_strings;
1489 
1490           /*
1491            * Reset the hardware debug registers if they were in use.
1492            * They won't have any meaning for the newly exec'd process.
1493            */
1494           if (pcb->pcb_flags & PCB_DBREGS) {
1495                     pcb->pcb_dr0 = 0;
1496                     pcb->pcb_dr1 = 0;
1497                     pcb->pcb_dr2 = 0;
1498                     pcb->pcb_dr3 = 0;
1499                     pcb->pcb_dr6 = 0;
1500                     pcb->pcb_dr7 = 0; /* JG set bit 10? */
1501                     if (pcb == td->td_pcb) {
1502                               /*
1503                                * Clear the debug registers on the running
1504                                * CPU, otherwise they will end up affecting
1505                                * the next process we switch to.
1506                                */
1507                               reset_dbregs();
1508                     }
1509                     pcb->pcb_flags &= ~PCB_DBREGS;
1510           }
1511 
1512           /*
1513            * Initialize the math emulator (if any) for the current process.
1514            * Actually, just clear the bit that says that the emulator has
1515            * been initialized.  Initialization is delayed until the process
1516            * traps to the emulator (if it is done at all) mainly because
1517            * emulators don't provide an entry point for initialization.
1518            */
1519           pcb->pcb_flags &= ~FP_SOFTFP;
1520 
1521           /*
1522            * NOTE: do not set CR0_TS here.  npxinit() must do it after clearing
1523            *         gd_npxthread.  Otherwise a preemptive interrupt thread
1524            *         may panic in npxdna().
1525            */
1526           crit_enter();
1527           load_cr0(rcr0() | CR0_MP);
1528 
1529           /*
1530            * NOTE: The MSR values must be correct so we can return to
1531            *         userland.  gd_user_fs/gs must be correct so the switch
1532            *         code knows what the current MSR values are.
1533            */
1534           pcb->pcb_fsbase = 0;          /* Values loaded from PCB on switch */
1535           pcb->pcb_gsbase = 0;
1536           mdcpu->gd_user_fs = 0;        /* Cache of current MSR values */
1537           mdcpu->gd_user_gs = 0;
1538           wrmsr(MSR_FSBASE, 0);         /* Set MSR values for return to userland */
1539           wrmsr(MSR_KGSBASE, 0);
1540 
1541           /* Initialize the npx (if any) for the current process. */
1542           npxinit();
1543           crit_exit();
1544 
1545           pcb->pcb_ds = _udatasel;
1546           pcb->pcb_es = _udatasel;
1547           pcb->pcb_fs = _udatasel;
1548           pcb->pcb_gs = _udatasel;
1549 }
1550 
1551 void
cpu_setregs(void)1552 cpu_setregs(void)
1553 {
1554           register_t cr0;
1555 
1556           cr0 = rcr0();
1557           cr0 |= CR0_NE;                          /* Done by npxinit() */
1558           cr0 |= CR0_MP | CR0_TS;                 /* Done at every execve() too. */
1559           cr0 |= CR0_WP | CR0_AM;
1560           load_cr0(cr0);
1561           load_gs(_udatasel);
1562 }
1563 
1564 static int
sysctl_machdep_adjkerntz(SYSCTL_HANDLER_ARGS)1565 sysctl_machdep_adjkerntz(SYSCTL_HANDLER_ARGS)
1566 {
1567           int error;
1568           error = sysctl_handle_int(oidp, oidp->oid_arg1, oidp->oid_arg2,
1569                     req);
1570           if (!error && req->newptr)
1571                     resettodr();
1572           return (error);
1573 }
1574 
/*
 * adjkerntz under _machdep: read/write integer backed by the adjkerntz
 * variable, routed through sysctl_machdep_adjkerntz().
 */
SYSCTL_PROC(_machdep, CPU_ADJKERNTZ, adjkerntz, CTLTYPE_INT|CTLFLAG_RW,
	&adjkerntz, 0, sysctl_machdep_adjkerntz, "I", "");

/* disable_rtc_set under _machdep: read/write integer, no description. */
SYSCTL_INT(_machdep, CPU_DISRTCSET, disable_rtc_set,
	CTLFLAG_RW, &disable_rtc_set, 0, "");

/* The bootinfo node is compiled out. */
#if 0 /* JG */
SYSCTL_STRUCT(_machdep, CPU_BOOTINFO, bootinfo,
	CTLFLAG_RD, &bootinfo, bootinfo, "");
#endif

/* wall_cmos_clock under _machdep: read/write integer, no description. */
SYSCTL_INT(_machdep, CPU_WALLCLOCK, wall_cmos_clock,
	CTLFLAG_RW, &wall_cmos_clock, 0, "");
1588 
1589 static int
efi_map_sysctl_handler(SYSCTL_HANDLER_ARGS)1590 efi_map_sysctl_handler(SYSCTL_HANDLER_ARGS)
1591 {
1592           struct efi_map_header *efihdr;
1593           caddr_t kmdp;
1594           uint32_t efisize;
1595 
1596           kmdp = preload_search_by_type("elf kernel");
1597           if (kmdp == NULL)
1598                     kmdp = preload_search_by_type("elf64 kernel");
1599           efihdr = (struct efi_map_header *)preload_search_info(kmdp,
1600               MODINFO_METADATA | MODINFOMD_EFI_MAP);
1601           if (efihdr == NULL)
1602                     return (0);
1603           efisize = *((uint32_t *)efihdr - 1);
1604           return (SYSCTL_OUT(req, efihdr, efisize));
1605 }
1606 SYSCTL_PROC(_machdep, OID_AUTO, efi_map, CTLTYPE_OPAQUE|CTLFLAG_RD, NULL, 0,
1607     efi_map_sysctl_handler, "S,efi_map_header", "Raw EFI Memory Map");
1608 
1609 /*
1610  * Initialize x86 and configure to run kernel
1611  */
1612 
1613 /*
1614  * Initialize segments & interrupt table
1615  */
1616 
int _default_ldt;
/* GDT storage for cpu 0, MAXGDT_COUNT user-segment-sized slots. */
struct user_segment_descriptor gdt_cpu0[MAXGDT_COUNT];
/* One IDT of NIDT gates for each of the MAXCPU possible cpus. */
struct gate_descriptor idt_arr[MAXCPU][NIDT];
#if 0 /* JG */
union descriptor ldt[NLDT];		/* local descriptor table */
#endif

/* table descriptors - used to load tables by cpu */
struct region_descriptor r_gdt;
struct region_descriptor r_idt_arr[MAXCPU];

/* JG proc0paddr is a virtual address */
void *proc0paddr;
/* JG alignment? */
char proc0paddr_buff[LWKT_THREAD_STACK];
1632 
1633 
1634 /* software prototypes -- in more palatable form */
1635 struct soft_segment_descriptor gdt_segs[] = {
1636 /* GNULL_SEL        0 Null Descriptor */
1637 {         0x0,                          /* segment base address  */
1638           0x0,                          /* length */
1639           0,                            /* segment type */
1640           0,                            /* segment descriptor priority level */
1641           0,                            /* segment descriptor present */
1642           0,                            /* long */
1643           0,                            /* default 32 vs 16 bit size */
1644           0                             /* limit granularity (byte/page units)*/ },
1645 /* GCODE_SEL        1 Code Descriptor for kernel */
1646 {         0x0,                          /* segment base address  */
1647           0xfffff,            /* length - all address space */
1648           SDT_MEMERA,                   /* segment type */
1649           SEL_KPL,            /* segment descriptor priority level */
1650           1,                            /* segment descriptor present */
1651           1,                            /* long */
1652           0,                            /* default 32 vs 16 bit size */
1653           1                             /* limit granularity (byte/page units)*/ },
1654 /* GDATA_SEL        2 Data Descriptor for kernel */
1655 {         0x0,                          /* segment base address  */
1656           0xfffff,            /* length - all address space */
1657           SDT_MEMRWA,                   /* segment type */
1658           SEL_KPL,            /* segment descriptor priority level */
1659           1,                            /* segment descriptor present */
1660           1,                            /* long */
1661           0,                            /* default 32 vs 16 bit size */
1662           1                             /* limit granularity (byte/page units)*/ },
1663 /* GUCODE32_SEL     3 32 bit Code Descriptor for user */
1664 {         0x0,                          /* segment base address  */
1665           0xfffff,            /* length - all address space */
1666           SDT_MEMERA,                   /* segment type */
1667           SEL_UPL,            /* segment descriptor priority level */
1668           1,                            /* segment descriptor present */
1669           0,                            /* long */
1670           1,                            /* default 32 vs 16 bit size */
1671           1                             /* limit granularity (byte/page units)*/ },
1672 /* GUDATA_SEL       4 32/64 bit Data Descriptor for user */
1673 {         0x0,                          /* segment base address  */
1674           0xfffff,            /* length - all address space */
1675           SDT_MEMRWA,                   /* segment type */
1676           SEL_UPL,            /* segment descriptor priority level */
1677           1,                            /* segment descriptor present */
1678           0,                            /* long */
1679           1,                            /* default 32 vs 16 bit size */
1680           1                             /* limit granularity (byte/page units)*/ },
1681 /* GUCODE_SEL       5 64 bit Code Descriptor for user */
1682 {         0x0,                          /* segment base address  */
1683           0xfffff,            /* length - all address space */
1684           SDT_MEMERA,                   /* segment type */
1685           SEL_UPL,            /* segment descriptor priority level */
1686           1,                            /* segment descriptor present */
1687           1,                            /* long */
1688           0,                            /* default 32 vs 16 bit size */
1689           1                             /* limit granularity (byte/page units)*/ },
1690 /* GPROC0_SEL       6 Proc 0 Tss Descriptor */
1691 {
1692           0x0,                          /* segment base address */
1693           sizeof(struct x86_64tss)-1,/* length - all address space */
1694           SDT_SYSTSS,                   /* segment type */
1695           SEL_KPL,            /* segment descriptor priority level */
1696           1,                            /* segment descriptor present */
1697           0,                            /* long */
1698           0,                            /* unused - default 32 vs 16 bit size */
1699           0                             /* limit granularity (byte/page units)*/ },
1700 /* Actually, the TSS is a system descriptor which is double size */
1701 {         0x0,                          /* segment base address  */
1702           0x0,                          /* length */
1703           0,                            /* segment type */
1704           0,                            /* segment descriptor priority level */
1705           0,                            /* segment descriptor present */
1706           0,                            /* long */
1707           0,                            /* default 32 vs 16 bit size */
1708           0                             /* limit granularity (byte/page units)*/ },
1709 /* GUGS32_SEL       8 32 bit GS Descriptor for user */
1710 {         0x0,                          /* segment base address  */
1711           0xfffff,            /* length - all address space */
1712           SDT_MEMRWA,                   /* segment type */
1713           SEL_UPL,            /* segment descriptor priority level */
1714           1,                            /* segment descriptor present */
1715           0,                            /* long */
1716           1,                            /* default 32 vs 16 bit size */
1717           1                             /* limit granularity (byte/page units)*/ },
1718 };
1719 
1720 void
setidt_global(int idx,inthand_t * func,int typ,int dpl,int ist)1721 setidt_global(int idx, inthand_t *func, int typ, int dpl, int ist)
1722 {
1723           int cpu;
1724 
1725           for (cpu = 0; cpu < MAXCPU; ++cpu) {
1726                     struct gate_descriptor *ip = &idt_arr[cpu][idx];
1727 
1728                     ip->gd_looffset = (uintptr_t)func;
1729                     ip->gd_selector = GSEL(GCODE_SEL, SEL_KPL);
1730                     ip->gd_ist = ist;
1731                     ip->gd_xx = 0;
1732                     ip->gd_type = typ;
1733                     ip->gd_dpl = dpl;
1734                     ip->gd_p = 1;
1735                     ip->gd_hioffset = ((uintptr_t)func)>>16 ;
1736           }
1737 }
1738 
1739 void
setidt(int idx,inthand_t * func,int typ,int dpl,int ist,int cpu)1740 setidt(int idx, inthand_t *func, int typ, int dpl, int ist, int cpu)
1741 {
1742           struct gate_descriptor *ip;
1743 
1744           KASSERT(cpu >= 0 && cpu < ncpus, ("invalid cpu %d", cpu));
1745 
1746           ip = &idt_arr[cpu][idx];
1747           ip->gd_looffset = (uintptr_t)func;
1748           ip->gd_selector = GSEL(GCODE_SEL, SEL_KPL);
1749           ip->gd_ist = ist;
1750           ip->gd_xx = 0;
1751           ip->gd_type = typ;
1752           ip->gd_dpl = dpl;
1753           ip->gd_p = 1;
1754           ip->gd_hioffset = ((uintptr_t)func)>>16 ;
1755 }
1756 
1757 #define   IDTVEC(name)        __CONCAT(X,name)
1758 
1759 extern inthand_t
1760           IDTVEC(div), IDTVEC(dbg), IDTVEC(nmi), IDTVEC(bpt), IDTVEC(ofl),
1761           IDTVEC(bnd), IDTVEC(ill), IDTVEC(dna), IDTVEC(fpusegm),
1762           IDTVEC(tss), IDTVEC(missing), IDTVEC(stk), IDTVEC(prot),
1763           IDTVEC(page), IDTVEC(mchk), IDTVEC(fpu), IDTVEC(align),
1764           IDTVEC(xmm), IDTVEC(dblfault),
1765           IDTVEC(fast_syscall), IDTVEC(fast_syscall32);
1766 
1767 extern inthand_t
1768           IDTVEC(rsvd00), IDTVEC(rsvd01), IDTVEC(rsvd02), IDTVEC(rsvd03),
1769           IDTVEC(rsvd04), IDTVEC(rsvd05), IDTVEC(rsvd06), IDTVEC(rsvd07),
1770           IDTVEC(rsvd08), IDTVEC(rsvd09), IDTVEC(rsvd0a), IDTVEC(rsvd0b),
1771           IDTVEC(rsvd0c), IDTVEC(rsvd0d), IDTVEC(rsvd0e), IDTVEC(rsvd0f),
1772           IDTVEC(rsvd10), IDTVEC(rsvd11), IDTVEC(rsvd12), IDTVEC(rsvd13),
1773           IDTVEC(rsvd14), IDTVEC(rsvd15), IDTVEC(rsvd16), IDTVEC(rsvd17),
1774           IDTVEC(rsvd18), IDTVEC(rsvd19), IDTVEC(rsvd1a), IDTVEC(rsvd1b),
1775           IDTVEC(rsvd1c), IDTVEC(rsvd1d), IDTVEC(rsvd1e), IDTVEC(rsvd1f),
1776           IDTVEC(rsvd20), IDTVEC(rsvd21), IDTVEC(rsvd22), IDTVEC(rsvd23),
1777           IDTVEC(rsvd24), IDTVEC(rsvd25), IDTVEC(rsvd26), IDTVEC(rsvd27),
1778           IDTVEC(rsvd28), IDTVEC(rsvd29), IDTVEC(rsvd2a), IDTVEC(rsvd2b),
1779           IDTVEC(rsvd2c), IDTVEC(rsvd2d), IDTVEC(rsvd2e), IDTVEC(rsvd2f),
1780           IDTVEC(rsvd30), IDTVEC(rsvd31), IDTVEC(rsvd32), IDTVEC(rsvd33),
1781           IDTVEC(rsvd34), IDTVEC(rsvd35), IDTVEC(rsvd36), IDTVEC(rsvd37),
1782           IDTVEC(rsvd38), IDTVEC(rsvd39), IDTVEC(rsvd3a), IDTVEC(rsvd3b),
1783           IDTVEC(rsvd3c), IDTVEC(rsvd3d), IDTVEC(rsvd3e), IDTVEC(rsvd3f),
1784           IDTVEC(rsvd40), IDTVEC(rsvd41), IDTVEC(rsvd42), IDTVEC(rsvd43),
1785           IDTVEC(rsvd44), IDTVEC(rsvd45), IDTVEC(rsvd46), IDTVEC(rsvd47),
1786           IDTVEC(rsvd48), IDTVEC(rsvd49), IDTVEC(rsvd4a), IDTVEC(rsvd4b),
1787           IDTVEC(rsvd4c), IDTVEC(rsvd4d), IDTVEC(rsvd4e), IDTVEC(rsvd4f),
1788           IDTVEC(rsvd50), IDTVEC(rsvd51), IDTVEC(rsvd52), IDTVEC(rsvd53),
1789           IDTVEC(rsvd54), IDTVEC(rsvd55), IDTVEC(rsvd56), IDTVEC(rsvd57),
1790           IDTVEC(rsvd58), IDTVEC(rsvd59), IDTVEC(rsvd5a), IDTVEC(rsvd5b),
1791           IDTVEC(rsvd5c), IDTVEC(rsvd5d), IDTVEC(rsvd5e), IDTVEC(rsvd5f),
1792           IDTVEC(rsvd60), IDTVEC(rsvd61), IDTVEC(rsvd62), IDTVEC(rsvd63),
1793           IDTVEC(rsvd64), IDTVEC(rsvd65), IDTVEC(rsvd66), IDTVEC(rsvd67),
1794           IDTVEC(rsvd68), IDTVEC(rsvd69), IDTVEC(rsvd6a), IDTVEC(rsvd6b),
1795           IDTVEC(rsvd6c), IDTVEC(rsvd6d), IDTVEC(rsvd6e), IDTVEC(rsvd6f),
1796           IDTVEC(rsvd70), IDTVEC(rsvd71), IDTVEC(rsvd72), IDTVEC(rsvd73),
1797           IDTVEC(rsvd74), IDTVEC(rsvd75), IDTVEC(rsvd76), IDTVEC(rsvd77),
1798           IDTVEC(rsvd78), IDTVEC(rsvd79), IDTVEC(rsvd7a), IDTVEC(rsvd7b),
1799           IDTVEC(rsvd7c), IDTVEC(rsvd7d), IDTVEC(rsvd7e), IDTVEC(rsvd7f),
1800           IDTVEC(rsvd80), IDTVEC(rsvd81), IDTVEC(rsvd82), IDTVEC(rsvd83),
1801           IDTVEC(rsvd84), IDTVEC(rsvd85), IDTVEC(rsvd86), IDTVEC(rsvd87),
1802           IDTVEC(rsvd88), IDTVEC(rsvd89), IDTVEC(rsvd8a), IDTVEC(rsvd8b),
1803           IDTVEC(rsvd8c), IDTVEC(rsvd8d), IDTVEC(rsvd8e), IDTVEC(rsvd8f),
1804           IDTVEC(rsvd90), IDTVEC(rsvd91), IDTVEC(rsvd92), IDTVEC(rsvd93),
1805           IDTVEC(rsvd94), IDTVEC(rsvd95), IDTVEC(rsvd96), IDTVEC(rsvd97),
1806           IDTVEC(rsvd98), IDTVEC(rsvd99), IDTVEC(rsvd9a), IDTVEC(rsvd9b),
1807           IDTVEC(rsvd9c), IDTVEC(rsvd9d), IDTVEC(rsvd9e), IDTVEC(rsvd9f),
1808           IDTVEC(rsvda0), IDTVEC(rsvda1), IDTVEC(rsvda2), IDTVEC(rsvda3),
1809           IDTVEC(rsvda4), IDTVEC(rsvda5), IDTVEC(rsvda6), IDTVEC(rsvda7),
1810           IDTVEC(rsvda8), IDTVEC(rsvda9), IDTVEC(rsvdaa), IDTVEC(rsvdab),
1811           IDTVEC(rsvdac), IDTVEC(rsvdad), IDTVEC(rsvdae), IDTVEC(rsvdaf),
1812           IDTVEC(rsvdb0), IDTVEC(rsvdb1), IDTVEC(rsvdb2), IDTVEC(rsvdb3),
1813           IDTVEC(rsvdb4), IDTVEC(rsvdb5), IDTVEC(rsvdb6), IDTVEC(rsvdb7),
1814           IDTVEC(rsvdb8), IDTVEC(rsvdb9), IDTVEC(rsvdba), IDTVEC(rsvdbb),
1815           IDTVEC(rsvdbc), IDTVEC(rsvdbd), IDTVEC(rsvdbe), IDTVEC(rsvdbf),
1816           IDTVEC(rsvdc0), IDTVEC(rsvdc1), IDTVEC(rsvdc2), IDTVEC(rsvdc3),
1817           IDTVEC(rsvdc4), IDTVEC(rsvdc5), IDTVEC(rsvdc6), IDTVEC(rsvdc7),
1818           IDTVEC(rsvdc8), IDTVEC(rsvdc9), IDTVEC(rsvdca), IDTVEC(rsvdcb),
1819           IDTVEC(rsvdcc), IDTVEC(rsvdcd), IDTVEC(rsvdce), IDTVEC(rsvdcf),
1820           IDTVEC(rsvdd0), IDTVEC(rsvdd1), IDTVEC(rsvdd2), IDTVEC(rsvdd3),
1821           IDTVEC(rsvdd4), IDTVEC(rsvdd5), IDTVEC(rsvdd6), IDTVEC(rsvdd7),
1822           IDTVEC(rsvdd8), IDTVEC(rsvdd9), IDTVEC(rsvdda), IDTVEC(rsvddb),
1823           IDTVEC(rsvddc), IDTVEC(rsvddd), IDTVEC(rsvdde), IDTVEC(rsvddf),
1824           IDTVEC(rsvde0), IDTVEC(rsvde1), IDTVEC(rsvde2), IDTVEC(rsvde3),
1825           IDTVEC(rsvde4), IDTVEC(rsvde5), IDTVEC(rsvde6), IDTVEC(rsvde7),
1826           IDTVEC(rsvde8), IDTVEC(rsvde9), IDTVEC(rsvdea), IDTVEC(rsvdeb),
1827           IDTVEC(rsvdec), IDTVEC(rsvded), IDTVEC(rsvdee), IDTVEC(rsvdef),
1828           IDTVEC(rsvdf0), IDTVEC(rsvdf1), IDTVEC(rsvdf2), IDTVEC(rsvdf3),
1829           IDTVEC(rsvdf4), IDTVEC(rsvdf5), IDTVEC(rsvdf6), IDTVEC(rsvdf7),
1830           IDTVEC(rsvdf8), IDTVEC(rsvdf9), IDTVEC(rsvdfa), IDTVEC(rsvdfb),
1831           IDTVEC(rsvdfc), IDTVEC(rsvdfd), IDTVEC(rsvdfe), IDTVEC(rsvdff);
1832 
1833 inthand_t *rsvdary[NIDT] = {
1834           &IDTVEC(rsvd00), &IDTVEC(rsvd01), &IDTVEC(rsvd02), &IDTVEC(rsvd03),
1835           &IDTVEC(rsvd04), &IDTVEC(rsvd05), &IDTVEC(rsvd06), &IDTVEC(rsvd07),
1836           &IDTVEC(rsvd08), &IDTVEC(rsvd09), &IDTVEC(rsvd0a), &IDTVEC(rsvd0b),
1837           &IDTVEC(rsvd0c), &IDTVEC(rsvd0d), &IDTVEC(rsvd0e), &IDTVEC(rsvd0f),
1838           &IDTVEC(rsvd10), &IDTVEC(rsvd11), &IDTVEC(rsvd12), &IDTVEC(rsvd13),
1839           &IDTVEC(rsvd14), &IDTVEC(rsvd15), &IDTVEC(rsvd16), &IDTVEC(rsvd17),
1840           &IDTVEC(rsvd18), &IDTVEC(rsvd19), &IDTVEC(rsvd1a), &IDTVEC(rsvd1b),
1841           &IDTVEC(rsvd1c), &IDTVEC(rsvd1d), &IDTVEC(rsvd1e), &IDTVEC(rsvd1f),
1842           &IDTVEC(rsvd20), &IDTVEC(rsvd21), &IDTVEC(rsvd22), &IDTVEC(rsvd23),
1843           &IDTVEC(rsvd24), &IDTVEC(rsvd25), &IDTVEC(rsvd26), &IDTVEC(rsvd27),
1844           &IDTVEC(rsvd28), &IDTVEC(rsvd29), &IDTVEC(rsvd2a), &IDTVEC(rsvd2b),
1845           &IDTVEC(rsvd2c), &IDTVEC(rsvd2d), &IDTVEC(rsvd2e), &IDTVEC(rsvd2f),
1846           &IDTVEC(rsvd30), &IDTVEC(rsvd31), &IDTVEC(rsvd32), &IDTVEC(rsvd33),
1847           &IDTVEC(rsvd34), &IDTVEC(rsvd35), &IDTVEC(rsvd36), &IDTVEC(rsvd37),
1848           &IDTVEC(rsvd38), &IDTVEC(rsvd39), &IDTVEC(rsvd3a), &IDTVEC(rsvd3b),
1849           &IDTVEC(rsvd3c), &IDTVEC(rsvd3d), &IDTVEC(rsvd3e), &IDTVEC(rsvd3f),
1850           &IDTVEC(rsvd40), &IDTVEC(rsvd41), &IDTVEC(rsvd42), &IDTVEC(rsvd43),
1851           &IDTVEC(rsvd44), &IDTVEC(rsvd45), &IDTVEC(rsvd46), &IDTVEC(rsvd47),
1852           &IDTVEC(rsvd48), &IDTVEC(rsvd49), &IDTVEC(rsvd4a), &IDTVEC(rsvd4b),
1853           &IDTVEC(rsvd4c), &IDTVEC(rsvd4d), &IDTVEC(rsvd4e), &IDTVEC(rsvd4f),
1854           &IDTVEC(rsvd50), &IDTVEC(rsvd51), &IDTVEC(rsvd52), &IDTVEC(rsvd53),
1855           &IDTVEC(rsvd54), &IDTVEC(rsvd55), &IDTVEC(rsvd56), &IDTVEC(rsvd57),
1856           &IDTVEC(rsvd58), &IDTVEC(rsvd59), &IDTVEC(rsvd5a), &IDTVEC(rsvd5b),
1857           &IDTVEC(rsvd5c), &IDTVEC(rsvd5d), &IDTVEC(rsvd5e), &IDTVEC(rsvd5f),
1858           &IDTVEC(rsvd60), &IDTVEC(rsvd61), &IDTVEC(rsvd62), &IDTVEC(rsvd63),
1859           &IDTVEC(rsvd64), &IDTVEC(rsvd65), &IDTVEC(rsvd66), &IDTVEC(rsvd67),
1860           &IDTVEC(rsvd68), &IDTVEC(rsvd69), &IDTVEC(rsvd6a), &IDTVEC(rsvd6b),
1861           &IDTVEC(rsvd6c), &IDTVEC(rsvd6d), &IDTVEC(rsvd6e), &IDTVEC(rsvd6f),
1862           &IDTVEC(rsvd70), &IDTVEC(rsvd71), &IDTVEC(rsvd72), &IDTVEC(rsvd73),
1863           &IDTVEC(rsvd74), &IDTVEC(rsvd75), &IDTVEC(rsvd76), &IDTVEC(rsvd77),
1864           &IDTVEC(rsvd78), &IDTVEC(rsvd79), &IDTVEC(rsvd7a), &IDTVEC(rsvd7b),
1865           &IDTVEC(rsvd7c), &IDTVEC(rsvd7d), &IDTVEC(rsvd7e), &IDTVEC(rsvd7f),
1866           &IDTVEC(rsvd80), &IDTVEC(rsvd81), &IDTVEC(rsvd82), &IDTVEC(rsvd83),
1867           &IDTVEC(rsvd84), &IDTVEC(rsvd85), &IDTVEC(rsvd86), &IDTVEC(rsvd87),
1868           &IDTVEC(rsvd88), &IDTVEC(rsvd89), &IDTVEC(rsvd8a), &IDTVEC(rsvd8b),
1869           &IDTVEC(rsvd8c), &IDTVEC(rsvd8d), &IDTVEC(rsvd8e), &IDTVEC(rsvd8f),
1870           &IDTVEC(rsvd90), &IDTVEC(rsvd91), &IDTVEC(rsvd92), &IDTVEC(rsvd93),
1871           &IDTVEC(rsvd94), &IDTVEC(rsvd95), &IDTVEC(rsvd96), &IDTVEC(rsvd97),
1872           &IDTVEC(rsvd98), &IDTVEC(rsvd99), &IDTVEC(rsvd9a), &IDTVEC(rsvd9b),
1873           &IDTVEC(rsvd9c), &IDTVEC(rsvd9d), &IDTVEC(rsvd9e), &IDTVEC(rsvd9f),
1874           &IDTVEC(rsvda0), &IDTVEC(rsvda1), &IDTVEC(rsvda2), &IDTVEC(rsvda3),
1875           &IDTVEC(rsvda4), &IDTVEC(rsvda5), &IDTVEC(rsvda6), &IDTVEC(rsvda7),
1876           &IDTVEC(rsvda8), &IDTVEC(rsvda9), &IDTVEC(rsvdaa), &IDTVEC(rsvdab),
1877           &IDTVEC(rsvdac), &IDTVEC(rsvdad), &IDTVEC(rsvdae), &IDTVEC(rsvdaf),
1878           &IDTVEC(rsvdb0), &IDTVEC(rsvdb1), &IDTVEC(rsvdb2), &IDTVEC(rsvdb3),
1879           &IDTVEC(rsvdb4), &IDTVEC(rsvdb5), &IDTVEC(rsvdb6), &IDTVEC(rsvdb7),
1880           &IDTVEC(rsvdb8), &IDTVEC(rsvdb9), &IDTVEC(rsvdba), &IDTVEC(rsvdbb),
1881           &IDTVEC(rsvdbc), &IDTVEC(rsvdbd), &IDTVEC(rsvdbe), &IDTVEC(rsvdbf),
1882           &IDTVEC(rsvdc0), &IDTVEC(rsvdc1), &IDTVEC(rsvdc2), &IDTVEC(rsvdc3),
1883           &IDTVEC(rsvdc4), &IDTVEC(rsvdc5), &IDTVEC(rsvdc6), &IDTVEC(rsvdc7),
1884           &IDTVEC(rsvdc8), &IDTVEC(rsvdc9), &IDTVEC(rsvdca), &IDTVEC(rsvdcb),
1885           &IDTVEC(rsvdcc), &IDTVEC(rsvdcd), &IDTVEC(rsvdce), &IDTVEC(rsvdcf),
1886           &IDTVEC(rsvdd0), &IDTVEC(rsvdd1), &IDTVEC(rsvdd2), &IDTVEC(rsvdd3),
1887           &IDTVEC(rsvdd4), &IDTVEC(rsvdd5), &IDTVEC(rsvdd6), &IDTVEC(rsvdd7),
1888           &IDTVEC(rsvdd8), &IDTVEC(rsvdd9), &IDTVEC(rsvdda), &IDTVEC(rsvddb),
1889           &IDTVEC(rsvddc), &IDTVEC(rsvddd), &IDTVEC(rsvdde), &IDTVEC(rsvddf),
1890           &IDTVEC(rsvde0), &IDTVEC(rsvde1), &IDTVEC(rsvde2), &IDTVEC(rsvde3),
1891           &IDTVEC(rsvde4), &IDTVEC(rsvde5), &IDTVEC(rsvde6), &IDTVEC(rsvde7),
1892           &IDTVEC(rsvde8), &IDTVEC(rsvde9), &IDTVEC(rsvdea), &IDTVEC(rsvdeb),
1893           &IDTVEC(rsvdec), &IDTVEC(rsvded), &IDTVEC(rsvdee), &IDTVEC(rsvdef),
1894           &IDTVEC(rsvdf0), &IDTVEC(rsvdf1), &IDTVEC(rsvdf2), &IDTVEC(rsvdf3),
1895           &IDTVEC(rsvdf4), &IDTVEC(rsvdf5), &IDTVEC(rsvdf6), &IDTVEC(rsvdf7),
1896           &IDTVEC(rsvdf8), &IDTVEC(rsvdf9), &IDTVEC(rsvdfa), &IDTVEC(rsvdfb),
1897           &IDTVEC(rsvdfc), &IDTVEC(rsvdfd), &IDTVEC(rsvdfe), &IDTVEC(rsvdff)
1898 };
1899 
1900 void
sdtossd(struct user_segment_descriptor * sd,struct soft_segment_descriptor * ssd)1901 sdtossd(struct user_segment_descriptor *sd, struct soft_segment_descriptor *ssd)
1902 {
1903           ssd->ssd_base  = (sd->sd_hibase << 24) | sd->sd_lobase;
1904           ssd->ssd_limit = (sd->sd_hilimit << 16) | sd->sd_lolimit;
1905           ssd->ssd_type  = sd->sd_type;
1906           ssd->ssd_dpl   = sd->sd_dpl;
1907           ssd->ssd_p     = sd->sd_p;
1908           ssd->ssd_def32 = sd->sd_def32;
1909           ssd->ssd_gran  = sd->sd_gran;
1910 }
1911 
1912 void
ssdtosd(struct soft_segment_descriptor * ssd,struct user_segment_descriptor * sd)1913 ssdtosd(struct soft_segment_descriptor *ssd, struct user_segment_descriptor *sd)
1914 {
1915 
1916           sd->sd_lobase = (ssd->ssd_base) & 0xffffff;
1917           sd->sd_hibase = (ssd->ssd_base >> 24) & 0xff;
1918           sd->sd_lolimit = (ssd->ssd_limit) & 0xffff;
1919           sd->sd_hilimit = (ssd->ssd_limit >> 16) & 0xf;
1920           sd->sd_type  = ssd->ssd_type;
1921           sd->sd_dpl   = ssd->ssd_dpl;
1922           sd->sd_p     = ssd->ssd_p;
1923           sd->sd_long  = ssd->ssd_long;
1924           sd->sd_def32 = ssd->ssd_def32;
1925           sd->sd_gran  = ssd->ssd_gran;
1926 }
1927 
1928 void
ssdtosyssd(struct soft_segment_descriptor * ssd,struct system_segment_descriptor * sd)1929 ssdtosyssd(struct soft_segment_descriptor *ssd,
1930     struct system_segment_descriptor *sd)
1931 {
1932 
1933           sd->sd_lobase = (ssd->ssd_base) & 0xffffff;
1934           sd->sd_hibase = (ssd->ssd_base >> 24) & 0xfffffffffful;
1935           sd->sd_lolimit = (ssd->ssd_limit) & 0xffff;
1936           sd->sd_hilimit = (ssd->ssd_limit >> 16) & 0xf;
1937           sd->sd_type  = ssd->ssd_type;
1938           sd->sd_dpl   = ssd->ssd_dpl;
1939           sd->sd_p     = ssd->ssd_p;
1940           sd->sd_gran  = ssd->ssd_gran;
1941 }
1942 
1943 /*
1944  * Populate the (physmap) array with base/bound pairs describing the
1945  * available physical memory in the system, then test this memory and
1946  * build the phys_avail array describing the actually-available memory.
1947  *
1948  * If we cannot accurately determine the physical memory map, then use
1949  * value from the 0xE801 call, and failing that, the RTC.
1950  *
1951  * Total memory size may be set by the kernel environment variable
1952  * hw.physmem or the compile-time define MAXMEM.
1953  *
1954  * Memory is aligned to PHYSMAP_ALIGN which must be a multiple
1955  * of PAGE_SIZE.  This also greatly reduces the memory test time
1956  * which would otherwise be excessive on machines with > 8G of ram.
1957  *
1958  * XXX first should be vm_paddr_t.
1959  */
1960 
1961 #define PHYSMAP_ALIGN                   (vm_paddr_t)(128 * 1024)
1962 #define PHYSMAP_ALIGN_MASK    (vm_paddr_t)(PHYSMAP_ALIGN - 1)
1963 #define PHYSMAP_SIZE                    VM_PHYSSEG_MAX
1964 
1965 vm_paddr_t physmap[PHYSMAP_SIZE];
1966 struct bios_smap *smapbase, *smap, *smapend;
1967 struct efi_map_header *efihdrbase;
1968 u_int32_t smapsize;
1969 
1970 #define PHYSMAP_HANDWAVE      (vm_paddr_t)(2 * 1024 * 1024)
1971 #define PHYSMAP_HANDWAVE_MASK (PHYSMAP_HANDWAVE - 1)
1972 
1973 static void
add_smap_entries(int * physmap_idx)1974 add_smap_entries(int *physmap_idx)
1975 {
1976           int i;
1977 
1978           smapsize = *((u_int32_t *)smapbase - 1);
1979           smapend = (struct bios_smap *)((uintptr_t)smapbase + smapsize);
1980 
1981           for (smap = smapbase; smap < smapend; smap++) {
1982                     if (boothowto & RB_VERBOSE)
1983                               kprintf("SMAP type=%02x base=%016lx len=%016lx\n",
1984                                   smap->type, smap->base, smap->length);
1985 
1986                     if (smap->type != SMAP_TYPE_MEMORY)
1987                               continue;
1988 
1989                     if (smap->length == 0)
1990                               continue;
1991 
1992                     for (i = 0; i <= *physmap_idx; i += 2) {
1993                               if (smap->base < physmap[i + 1]) {
1994                                         if (boothowto & RB_VERBOSE) {
1995                                                   kprintf("Overlapping or non-monotonic "
1996                                                             "memory region, ignoring "
1997                                                             "second region\n");
1998                                         }
1999                                         break;
2000                               }
2001                     }
2002                     if (i <= *physmap_idx)
2003                               continue;
2004 
2005                     Realmem += smap->length;
2006 
2007                     /*
2008                      * NOTE: This little bit of code initially expands
2009                      *         physmap[1] as well as later entries.
2010                      */
2011                     if (smap->base == physmap[*physmap_idx + 1]) {
2012                               physmap[*physmap_idx + 1] += smap->length;
2013                               continue;
2014                     }
2015 
2016                     *physmap_idx += 2;
2017                     if (*physmap_idx == PHYSMAP_SIZE) {
2018                               kprintf("Too many segments in the physical "
2019                                         "address map, giving up\n");
2020                               break;
2021                     }
2022                     physmap[*physmap_idx] = smap->base;
2023                     physmap[*physmap_idx + 1] = smap->base + smap->length;
2024           }
2025 }
2026 
2027 static void
add_efi_map_entries(int * physmap_idx)2028 add_efi_map_entries(int *physmap_idx)
2029 {
2030           struct efi_md *map, *p;
2031           const char *type;
2032           size_t efisz;
2033           int i, ndesc;
2034 
2035           static const char *types[] = {
2036                     "Reserved",
2037                     "LoaderCode",
2038                     "LoaderData",
2039                     "BootServicesCode",
2040                     "BootServicesData",
2041                     "RuntimeServicesCode",
2042                     "RuntimeServicesData",
2043                     "ConventionalMemory",
2044                     "UnusableMemory",
2045                     "ACPIReclaimMemory",
2046                     "ACPIMemoryNVS",
2047                     "MemoryMappedIO",
2048                     "MemoryMappedIOPortSpace",
2049                     "PalCode"
2050            };
2051 
2052           /*
2053            * Memory map data provided by UEFI via the GetMemoryMap
2054            * Boot Services API.
2055            */
2056           efisz = (sizeof(struct efi_map_header) + 0xf) & ~0xf;
2057           map = (struct efi_md *)((uint8_t *)efihdrbase + efisz);
2058 
2059           if (efihdrbase->descriptor_size == 0)
2060                     return;
2061           ndesc = efihdrbase->memory_size / efihdrbase->descriptor_size;
2062 
2063           if (boothowto & RB_VERBOSE)
2064                     kprintf("%23s %12s %12s %8s %4s\n",
2065                         "Type", "Physical", "Virtual", "#Pages", "Attr");
2066 
2067           for (i = 0, p = map; i < ndesc; i++,
2068               p = efi_next_descriptor(p, efihdrbase->descriptor_size)) {
2069                     if (boothowto & RB_VERBOSE) {
2070                               if (p->md_type <= EFI_MD_TYPE_PALCODE)
2071                                         type = types[p->md_type];
2072                               else
2073                                         type = "<INVALID>";
2074                               kprintf("%23s %012lx %12p %08lx ", type, p->md_phys,
2075                                   p->md_virt, p->md_pages);
2076                               if (p->md_attr & EFI_MD_ATTR_UC)
2077                                         kprintf("UC ");
2078                               if (p->md_attr & EFI_MD_ATTR_WC)
2079                                         kprintf("WC ");
2080                               if (p->md_attr & EFI_MD_ATTR_WT)
2081                                         kprintf("WT ");
2082                               if (p->md_attr & EFI_MD_ATTR_WB)
2083                                         kprintf("WB ");
2084                               if (p->md_attr & EFI_MD_ATTR_UCE)
2085                                         kprintf("UCE ");
2086                               if (p->md_attr & EFI_MD_ATTR_WP)
2087                                         kprintf("WP ");
2088                               if (p->md_attr & EFI_MD_ATTR_RP)
2089                                         kprintf("RP ");
2090                               if (p->md_attr & EFI_MD_ATTR_XP)
2091                                         kprintf("XP ");
2092                               if (p->md_attr & EFI_MD_ATTR_RT)
2093                                         kprintf("RUNTIME");
2094                               kprintf("\n");
2095                     }
2096 
2097                     switch (p->md_type) {
2098                     case EFI_MD_TYPE_CODE:
2099                     case EFI_MD_TYPE_DATA:
2100                     case EFI_MD_TYPE_BS_CODE:
2101                     case EFI_MD_TYPE_BS_DATA:
2102                     case EFI_MD_TYPE_FREE:
2103                               /*
2104                                * We're allowed to use any entry with these types.
2105                                */
2106                               break;
2107                     default:
2108                               continue;
2109                     }
2110 
2111                     Realmem += p->md_pages * PAGE_SIZE;
2112 
2113                     /*
2114                      * NOTE: This little bit of code initially expands
2115                      *         physmap[1] as well as later entries.
2116                      */
2117                     if (p->md_phys == physmap[*physmap_idx + 1]) {
2118                               physmap[*physmap_idx + 1] += p->md_pages * PAGE_SIZE;
2119                               continue;
2120                     }
2121 
2122                     *physmap_idx += 2;
2123                     if (*physmap_idx == PHYSMAP_SIZE) {
2124                               kprintf("Too many segments in the physical "
2125                                         "address map, giving up\n");
2126                               break;
2127                     }
2128                     physmap[*physmap_idx] = p->md_phys;
2129                     physmap[*physmap_idx + 1] = p->md_phys + p->md_pages * PAGE_SIZE;
2130            }
2131 }
2132 
2133 struct fb_info efi_fb_info;
2134 static int have_efi_framebuffer = 0;
2135 
2136 static void
efi_fb_init_vaddr(int direct_map)2137 efi_fb_init_vaddr(int direct_map)
2138 {
2139           uint64_t sz;
2140           vm_offset_t addr, v;
2141 
2142           v = efi_fb_info.vaddr;
2143           sz = efi_fb_info.stride * efi_fb_info.height;
2144 
2145           if (direct_map) {
2146                     addr = PHYS_TO_DMAP(efi_fb_info.paddr);
2147                     if (addr >= DMAP_MIN_ADDRESS && addr + sz <= DMapMaxAddress)
2148                               efi_fb_info.vaddr = addr;
2149           } else {
2150                     efi_fb_info.vaddr =
2151                               (vm_offset_t)pmap_mapdev_attr(efi_fb_info.paddr,
2152                                                                   sz,
2153                                                                   PAT_WRITE_COMBINING);
2154           }
2155 }
2156 
2157 static u_int
efifb_color_depth(struct efi_fb * efifb)2158 efifb_color_depth(struct efi_fb *efifb)
2159 {
2160           uint32_t mask;
2161           u_int depth;
2162 
2163           mask = efifb->fb_mask_red | efifb->fb_mask_green |
2164               efifb->fb_mask_blue | efifb->fb_mask_reserved;
2165           if (mask == 0)
2166                     return (0);
2167           for (depth = 1; mask != 1; depth++)
2168                     mask >>= 1;
2169           return (depth);
2170 }
2171 
2172 int
probe_efi_fb(int early)2173 probe_efi_fb(int early)
2174 {
2175           struct efi_fb       *efifb;
2176           caddr_t             kmdp;
2177           u_int               depth;
2178 
2179           if (have_efi_framebuffer) {
2180                     if (!early &&
2181                         (efi_fb_info.vaddr == 0 ||
2182                          efi_fb_info.vaddr == PHYS_TO_DMAP(efi_fb_info.paddr)))
2183                               efi_fb_init_vaddr(0);
2184                     return 0;
2185           }
2186 
2187           kmdp = preload_search_by_type("elf kernel");
2188           if (kmdp == NULL)
2189                     kmdp = preload_search_by_type("elf64 kernel");
2190           efifb = (struct efi_fb *)preload_search_info(kmdp,
2191               MODINFO_METADATA | MODINFOMD_EFI_FB);
2192           if (efifb == NULL)
2193                     return 1;
2194 
2195           depth = efifb_color_depth(efifb);
2196           /*
2197            * Our bootloader should already notice, when we won't be able to
2198            * use the UEFI framebuffer.
2199            */
2200           if (depth != 24 && depth != 32)
2201                     return 1;
2202 
2203           have_efi_framebuffer = 1;
2204 
2205           efi_fb_info.is_vga_boot_display = 1;
2206           efi_fb_info.width = efifb->fb_width;
2207           efi_fb_info.height = efifb->fb_height;
2208           efi_fb_info.depth = depth;
2209           efi_fb_info.stride = efifb->fb_stride * (depth / 8);
2210           efi_fb_info.paddr = efifb->fb_addr;
2211           if (early) {
2212                     efi_fb_info.vaddr = 0;
2213           } else {
2214                     efi_fb_init_vaddr(0);
2215           }
2216           efi_fb_info.fbops.fb_set_par = NULL;
2217           efi_fb_info.fbops.fb_blank = NULL;
2218           efi_fb_info.fbops.fb_debug_enter = NULL;
2219           efi_fb_info.device = NULL;
2220 
2221           return 0;
2222 }
2223 
/*
 * SYSINIT hook: run the non-early EFI framebuffer probe.  The argument is
 * unused and the probe's status is deliberately ignored.
 */
static void
efifb_startup(void *arg)
{
	(void)probe_efi_fb(0);
}

SYSINIT(efi_fb_info, SI_BOOT1_POST, SI_ORDER_FIRST, efifb_startup, NULL);
2231 
2232 static void
getmemsize(caddr_t kmdp,u_int64_t first)2233 getmemsize(caddr_t kmdp, u_int64_t first)
2234 {
2235           int off, physmap_idx, pa_indx, da_indx;
2236           int i, j;
2237           vm_paddr_t pa;
2238           vm_paddr_t msgbuf_size;
2239           u_long physmem_tunable;
2240           pt_entry_t *pte;
2241           quad_t dcons_addr, dcons_size;
2242 
2243           bzero(physmap, sizeof(physmap));
2244           physmap_idx = 0;
2245 
2246           /*
2247            * get memory map from INT 15:E820, kindly supplied by the loader.
2248            *
2249            * subr_module.c says:
2250            * "Consumer may safely assume that size value precedes data."
2251            * ie: an int32_t immediately precedes smap.
2252            */
2253           efihdrbase = (struct efi_map_header *)preload_search_info(kmdp,
2254                          MODINFO_METADATA | MODINFOMD_EFI_MAP);
2255           smapbase = (struct bios_smap *)preload_search_info(kmdp,
2256                        MODINFO_METADATA | MODINFOMD_SMAP);
2257           if (smapbase == NULL && efihdrbase == NULL)
2258                     panic("No BIOS smap or EFI map info from loader!");
2259 
2260           if (efihdrbase == NULL)
2261                     add_smap_entries(&physmap_idx);
2262           else
2263                     add_efi_map_entries(&physmap_idx);
2264 
2265           base_memory = physmap[1] / 1024;
2266           /* make hole for AP bootstrap code */
2267           physmap[1] = mp_bootaddress(base_memory);
2268 
2269           /* Save EBDA address, if any */
2270           ebda_addr = (u_long)(*(u_short *)(KERNBASE + 0x40e));
2271           ebda_addr <<= 4;
2272 
2273           /*
2274            * Maxmem isn't the "maximum memory", it's one larger than the
2275            * highest page of the physical address space.  It should be
2276            * called something like "Maxphyspage".  We may adjust this
2277            * based on ``hw.physmem'' and the results of the memory test.
2278            */
2279           Maxmem = atop(physmap[physmap_idx + 1]);
2280 
2281 #ifdef MAXMEM
2282           Maxmem = MAXMEM / 4;
2283 #endif
2284 
2285           if (TUNABLE_ULONG_FETCH("hw.physmem", &physmem_tunable))
2286                     Maxmem = atop(physmem_tunable);
2287 
2288           /*
2289            * Don't allow MAXMEM or hw.physmem to extend the amount of memory
2290            * in the system.
2291            */
2292           if (Maxmem > atop(physmap[physmap_idx + 1]))
2293                     Maxmem = atop(physmap[physmap_idx + 1]);
2294 
2295           /*
2296            * Blowing out the DMAP will blow up the system.
2297            */
2298           if (Maxmem > atop(DMAP_MAX_ADDRESS - DMAP_MIN_ADDRESS)) {
2299                     kprintf("Limiting Maxmem due to DMAP size\n");
2300                     Maxmem = atop(DMAP_MAX_ADDRESS - DMAP_MIN_ADDRESS);
2301           }
2302 
2303           if (atop(physmap[physmap_idx + 1]) != Maxmem &&
2304               (boothowto & RB_VERBOSE)) {
2305                     kprintf("Physical memory use set to %ldK\n", Maxmem * 4);
2306           }
2307 
2308           /*
2309            * Call pmap initialization to make new kernel address space
2310            *
2311            * Mask off page 0.
2312            */
2313           pmap_bootstrap(&first);
2314           physmap[0] = PAGE_SIZE;
2315 
2316           /*
2317            * Align the physmap to PHYSMAP_ALIGN and cut out anything
2318            * exceeding Maxmem.
2319            */
2320           for (i = j = 0; i <= physmap_idx; i += 2) {
2321                     if (physmap[i+1] > ptoa(Maxmem))
2322                               physmap[i+1] = ptoa(Maxmem);
2323                     physmap[i] = (physmap[i] + PHYSMAP_ALIGN_MASK) &
2324                                    ~PHYSMAP_ALIGN_MASK;
2325                     physmap[i+1] = physmap[i+1] & ~PHYSMAP_ALIGN_MASK;
2326 
2327                     physmap[j] = physmap[i];
2328                     physmap[j+1] = physmap[i+1];
2329 
2330                     if (physmap[i] < physmap[i+1])
2331                               j += 2;
2332           }
2333           physmap_idx = j - 2;
2334 
2335           /*
2336            * Align anything else used in the validation loop.
2337            *
2338            * Also make sure that our 2MB kernel text+data+bss mappings
2339            * do not overlap potentially allocatable space.
2340            */
2341           first = (first + PHYSMAP_ALIGN_MASK) & ~PHYSMAP_ALIGN_MASK;
2342 
2343           /*
2344            * Size up each available chunk of physical memory.
2345            */
2346           pa_indx = 0;
2347           da_indx = 0;
2348           phys_avail[pa_indx].phys_beg = physmap[0];
2349           phys_avail[pa_indx].phys_end = physmap[0];
2350           dump_avail[da_indx].phys_beg = 0;
2351           dump_avail[da_indx].phys_end = physmap[0];
2352           pte = CMAP1;
2353 
2354           /*
2355            * Get dcons buffer address
2356            */
2357           if (kgetenv_quad("dcons.addr", &dcons_addr) == 0 ||
2358               kgetenv_quad("dcons.size", &dcons_size) == 0)
2359                     dcons_addr = 0;
2360 
2361           /*
2362            * Validate the physical memory.  The physical memory segments
2363            * have already been aligned to PHYSMAP_ALIGN which is a multiple
2364            * of PAGE_SIZE.
2365            *
2366            * We no longer perform an exhaustive memory test.  Instead we
2367            * simply test the first and last word in each physmap[]
2368            * segment.
2369            */
2370           for (i = 0; i <= physmap_idx; i += 2) {
2371                     vm_paddr_t end;
2372                     vm_paddr_t incr;
2373 
2374                     end = physmap[i + 1];
2375 
2376                     for (pa = physmap[i]; pa < end; pa += incr) {
2377                               int page_bad, full;
2378                               volatile uint64_t *ptr = (uint64_t *)CADDR1;
2379                               uint64_t tmp;
2380 
2381                               full = FALSE;
2382 
2383                               /*
2384                                * Calculate incr.  Just test the first and
2385                                * last page in each physmap[] segment.
2386                                */
2387                               if (pa == end - PAGE_SIZE)
2388                                         incr = PAGE_SIZE;
2389                               else
2390                                         incr = end - pa - PAGE_SIZE;
2391 
2392                               /*
2393                                * Make sure we don't skip blacked out areas.
2394                                */
2395                               if (pa < 0x200000 && 0x200000 < end) {
2396                                         incr = 0x200000 - pa;
2397                               }
2398                               if (dcons_addr > 0 &&
2399                                   pa < dcons_addr &&
2400                                   dcons_addr < end) {
2401                                         incr = dcons_addr - pa;
2402                               }
2403 
2404                               /*
2405                                * Block out kernel memory as not available.
2406                                */
2407                               if (pa >= 0x200000 && pa < first) {
2408                                         incr = first - pa;
2409                                         if (pa + incr > end)
2410                                                   incr = end - pa;
2411                                         goto do_dump_avail;
2412                               }
2413 
2414                               /*
2415                                * Block out the dcons buffer if it exists.
2416                                */
2417                               if (dcons_addr > 0 &&
2418                                   pa >= trunc_page(dcons_addr) &&
2419                                   pa < dcons_addr + dcons_size) {
2420                                         incr = dcons_addr + dcons_size - pa;
2421                                         incr = (incr + PAGE_MASK) &
2422                                                ~(vm_paddr_t)PAGE_MASK;
2423                                         if (pa + incr > end)
2424                                                   incr = end - pa;
2425                                         goto do_dump_avail;
2426                               }
2427 
2428                               page_bad = FALSE;
2429 
2430                               /*
2431                                * Map the page non-cacheable for the memory
2432                                * test.
2433                                */
2434                               *pte = pa |
2435                                   kernel_pmap->pmap_bits[PG_V_IDX] |
2436                                   kernel_pmap->pmap_bits[PG_RW_IDX] |
2437                                   kernel_pmap->pmap_bits[PG_N_IDX];
2438                               cpu_invlpg(__DEVOLATILE(void *, ptr));
2439                               cpu_mfence();
2440 
2441                               /*
2442                                * Save original value for restoration later.
2443                                */
2444                               tmp = *ptr;
2445 
2446                               /*
2447                                * Test for alternating 1's and 0's
2448                                */
2449                               *ptr = 0xaaaaaaaaaaaaaaaaLLU;
2450                               cpu_mfence();
2451                               if (*ptr != 0xaaaaaaaaaaaaaaaaLLU)
2452                                         page_bad = TRUE;
2453                               /*
2454                                * Test for alternating 0's and 1's
2455                                */
2456                               *ptr = 0x5555555555555555LLU;
2457                               cpu_mfence();
2458                               if (*ptr != 0x5555555555555555LLU)
2459                                         page_bad = TRUE;
2460                               /*
2461                                * Test for all 1's
2462                                */
2463                               *ptr = 0xffffffffffffffffLLU;
2464                               cpu_mfence();
2465                               if (*ptr != 0xffffffffffffffffLLU)
2466                                         page_bad = TRUE;
2467                               /*
2468                                * Test for all 0's
2469                                */
2470                               *ptr = 0x0;
2471                               cpu_mfence();
2472                               if (*ptr != 0x0)
2473                                         page_bad = TRUE;
2474 
2475                               /*
2476                                * Restore original value.
2477                                */
2478                               *ptr = tmp;
2479 
2480                               /*
2481                                * Adjust array of valid/good pages.
2482                                */
2483                               if (page_bad == TRUE) {
2484                                         incr = PAGE_SIZE;
2485                                         continue;
2486                               }
2487 
2488                               /*
2489                                * Collapse page address into phys_avail[].  Do a
2490                                * continuation of the current phys_avail[] index
2491                                * when possible.
2492                                */
2493                               if (phys_avail[pa_indx].phys_end == pa) {
2494                                         /*
2495                                          * Continuation
2496                                          */
2497                                         phys_avail[pa_indx].phys_end += incr;
2498                               } else if (phys_avail[pa_indx].phys_beg ==
2499                                            phys_avail[pa_indx].phys_end) {
2500                                         /*
2501                                          * Current phys_avail is completely empty,
2502                                          * reuse the index.
2503                                          */
2504                                         phys_avail[pa_indx].phys_beg = pa;
2505                                         phys_avail[pa_indx].phys_end = pa + incr;
2506                               } else {
2507                                         /*
2508                                          * Allocate next phys_avail index.
2509                                          */
2510                                         ++pa_indx;
2511                                         if (pa_indx == PHYS_AVAIL_ARRAY_END) {
2512                                                   kprintf(
2513                     "Too many holes in the physical address space, giving up\n");
2514                                                   --pa_indx;
2515                                                   full = TRUE;
2516                                                   goto do_dump_avail;
2517                                         }
2518                                         phys_avail[pa_indx].phys_beg = pa;
2519                                         phys_avail[pa_indx].phys_end = pa + incr;
2520                               }
2521                               physmem += incr / PAGE_SIZE;
2522 
2523                               /*
2524                                * pa available for dumping
2525                                */
2526 do_dump_avail:
2527                               if (dump_avail[da_indx].phys_end == pa) {
2528                                         dump_avail[da_indx].phys_end += incr;
2529                               } else {
2530                                         ++da_indx;
2531                                         if (da_indx == DUMP_AVAIL_ARRAY_END) {
2532                                                   --da_indx;
2533                                                   goto do_next;
2534                                         }
2535                                         dump_avail[da_indx].phys_beg = pa;
2536                                         dump_avail[da_indx].phys_end = pa + incr;
2537                               }
2538 do_next:
2539                               if (full)
2540                                         break;
2541                     }
2542           }
2543           *pte = 0;
2544           cpu_invltlb();
2545           cpu_mfence();
2546 
2547           /*
2548            * The last chunk must contain at least one page plus the message
2549            * buffer to avoid complicating other code (message buffer address
2550            * calculation, etc.).
2551            */
2552           msgbuf_size = (MSGBUF_SIZE + PHYSMAP_ALIGN_MASK) & ~PHYSMAP_ALIGN_MASK;
2553 
2554           while (phys_avail[pa_indx].phys_beg + PHYSMAP_ALIGN + msgbuf_size >=
2555                  phys_avail[pa_indx].phys_end) {
2556                     physmem -= atop(phys_avail[pa_indx].phys_end -
2557                                         phys_avail[pa_indx].phys_beg);
2558                     phys_avail[pa_indx].phys_beg = 0;
2559                     phys_avail[pa_indx].phys_end = 0;
2560                     --pa_indx;
2561           }
2562 
2563           Maxmem = atop(phys_avail[pa_indx].phys_end);
2564 
2565           /* Trim off space for the message buffer. */
2566           phys_avail[pa_indx].phys_end -= msgbuf_size;
2567 
2568           avail_end = phys_avail[pa_indx].phys_end;
2569 
2570           /* Map the message buffer. */
2571           for (off = 0; off < msgbuf_size; off += PAGE_SIZE) {
2572                     pmap_kenter((vm_offset_t)msgbufp + off, avail_end + off);
2573           }
2574 
2575           /*
2576            * Try to get EFI framebuffer working as early as possible.
2577            *
2578            * WARN: Some BIOSes do not list the EFI framebuffer memory, causing
2579            * the pmap probe code to create a DMAP that does not cover its
2580            * physical address space, efi_fb_init_vaddr(1) might not return
2581            * an initialized framebuffer base pointer.  In this situation the
2582            * later efi_fb_init_vaddr(0) call will deal with it.
2583            */
2584           if (have_efi_framebuffer)
2585                     efi_fb_init_vaddr(1);
2586 }
2587 
/*
 * Active machine interrupt ABI.  hammer_time() initially sets it to
 * MachIntrABI_ICU.
 */
2588 struct machintr_abi MachIntrABI;
2589 
2590 /*
2591  * IDT VECTORS:
2592  *        0         Divide by zero
2593  *        1         Debug
2594  *        2         NMI
2595  *        3         BreakPoint
2596  *        4         OverFlow
2597  *        5         Bound-Range
2598  *        6         Invalid OpCode
2599  *        7         Device Not Available (x87)
2600  *        8         Double-Fault
2601  *        9         Coprocessor Segment overrun (unsupported, reserved)
2602  *        10        Invalid-TSS
2603  *        11        Segment not present
2604  *        12        Stack
2605  *        13        General Protection
2606  *        14        Page Fault
2607  *        15        Reserved
2608  *        16        x87 FP Exception pending
2609  *        17        Alignment Check
2610  *        18        Machine Check
2611  *        19        SIMD floating point
2612  *        20-31     reserved
2613  *        32-255    INTn/external sources
2614  */
2615 u_int64_t
hammer_time(u_int64_t modulep,u_int64_t physfree)2616 hammer_time(u_int64_t modulep, u_int64_t physfree)
2617 {
2618           caddr_t kmdp;
2619           int gsel_tss, x, cpu;
2620 #if 0 /* JG */
2621           int metadata_missing, off;
2622 #endif
2623           struct mdglobaldata *gd;
2624           struct privatespace *ps;
2625           u_int64_t msr;
2626 
2627           /*
2628            * Prevent lowering of the ipl if we call tsleep() early.
2629            */
2630           gd = &CPU_prvspace[0]->mdglobaldata;
2631           ps = (struct privatespace *)gd;
2632           bzero(gd, sizeof(*gd));
2633           bzero(&ps->common_tss, sizeof(ps->common_tss));
2634 
2635           /*
2636            * Note: on both UP and SMP curthread must be set non-NULL
2637            * early in the boot sequence because the system assumes
2638            * that 'curthread' is never NULL.
2639            */
2640 
2641           gd->mi.gd_curthread = &thread0;
2642           thread0.td_gd = &gd->mi;
2643 
2644           atdevbase = ISA_HOLE_START + PTOV_OFFSET;
2645 
2646 #if 0 /* JG */
2647           metadata_missing = 0;
2648           if (bootinfo.bi_modulep) {
2649                     preload_metadata = (caddr_t)bootinfo.bi_modulep + KERNBASE;
2650                     preload_bootstrap_relocate(KERNBASE);
2651           } else {
2652                     metadata_missing = 1;
2653           }
2654           if (bootinfo.bi_envp)
2655                     kern_envp = (caddr_t)bootinfo.bi_envp + KERNBASE;
2656 #endif
2657 
2658           preload_metadata = (caddr_t)(uintptr_t)(modulep + PTOV_OFFSET);
2659           preload_bootstrap_relocate(PTOV_OFFSET);
2660           kmdp = preload_search_by_type("elf kernel");
2661           if (kmdp == NULL)
2662                     kmdp = preload_search_by_type("elf64 kernel");
2663           boothowto = MD_FETCH(kmdp, MODINFOMD_HOWTO, int);
2664           kern_envp = MD_FETCH(kmdp, MODINFOMD_ENVP, char *) + PTOV_OFFSET;
2665 #ifdef DDB
2666           ksym_start = MD_FETCH(kmdp, MODINFOMD_SSYM, uintptr_t);
2667           ksym_end = MD_FETCH(kmdp, MODINFOMD_ESYM, uintptr_t);
2668 #endif
2669           efi_systbl_phys = MD_FETCH(kmdp, MODINFOMD_FW_HANDLE, vm_paddr_t);
2670 
2671           if (boothowto & RB_VERBOSE)
2672                     bootverbose++;
2673 
2674           /*
2675            * Default MachIntrABI to ICU
2676            */
2677           MachIntrABI = MachIntrABI_ICU;
2678 
2679           /*
2680            * start with one cpu.  Note: with one cpu, ncpus_fit_mask remain 0.
2681            */
2682           ncpus = 1;
2683           ncpus_fit = 1;
2684           /* Init basic tunables, hz etc */
2685           init_param1();
2686 
2687           /*
2688            * make gdt memory segments
2689            */
2690           gdt_segs[GPROC0_SEL].ssd_base =
2691                     (uintptr_t) &CPU_prvspace[0]->common_tss;
2692 
2693           gd->mi.gd_prvspace = CPU_prvspace[0];
2694 
2695           for (x = 0; x < NGDT; x++) {
2696                     if (x != GPROC0_SEL && x != (GPROC0_SEL + 1))
2697                               ssdtosd(&gdt_segs[x], &gdt_cpu0[x]);
2698           }
2699           ssdtosyssd(&gdt_segs[GPROC0_SEL],
2700               (struct system_segment_descriptor *)&gdt_cpu0[GPROC0_SEL]);
2701 
2702           /*
2703            * WARNING!  Due to an Intel quirk, VMX exits set the gdt[] table
2704            *             limit to 0xFFFF.  To avoid having to do a heavy-weight
2705            *             reload, we just make ours maximally sized.
2706            */
2707           r_gdt.rd_limit = MAXGDT_LIMIT - 1;
2708           r_gdt.rd_base = (long)gdt_cpu0;
2709           lgdt(&r_gdt);
2710 
2711           wrmsr(MSR_FSBASE, 0);                   /* User value */
2712           wrmsr(MSR_GSBASE, (u_int64_t)&gd->mi);
2713           wrmsr(MSR_KGSBASE, 0);                  /* User value while in the kernel */
2714 
2715           mi_gdinit(&gd->mi, 0);
2716           cpu_gdinit(gd, 0);
2717           proc0paddr = proc0paddr_buff;
2718           mi_proc0init(&gd->mi, proc0paddr);
2719           safepri = TDPRI_MAX;
2720 
2721           /* spinlocks and the BGL */
2722           init_locks();
2723 
2724           /* exceptions */
2725           for (x = 0; x < NIDT; x++)
2726                     setidt_global(x, rsvdary[x], SDT_SYSIGT, SEL_KPL, 0);
2727           setidt_global(IDT_DE, &IDTVEC(div),  SDT_SYSIGT, SEL_KPL, 0);
2728           setidt_global(IDT_DB, &IDTVEC(dbg),  SDT_SYSIGT, SEL_KPL, 2);
2729           setidt_global(IDT_NMI, &IDTVEC(nmi),  SDT_SYSIGT, SEL_KPL, 1);
2730           setidt_global(IDT_BP, &IDTVEC(bpt),  SDT_SYSIGT, SEL_UPL, 0);
2731           setidt_global(IDT_OF, &IDTVEC(ofl),  SDT_SYSIGT, SEL_KPL, 0);
2732           setidt_global(IDT_BR, &IDTVEC(bnd),  SDT_SYSIGT, SEL_KPL, 0);
2733           setidt_global(IDT_UD, &IDTVEC(ill),  SDT_SYSIGT, SEL_KPL, 0);
2734           setidt_global(IDT_NM, &IDTVEC(dna),  SDT_SYSIGT, SEL_KPL, 0);
2735           setidt_global(IDT_DF, &IDTVEC(dblfault), SDT_SYSIGT, SEL_KPL, 1);
2736           setidt_global(IDT_FPUGP, &IDTVEC(fpusegm),  SDT_SYSIGT, SEL_KPL, 0);
2737           setidt_global(IDT_TS, &IDTVEC(tss),  SDT_SYSIGT, SEL_KPL, 0);
2738           setidt_global(IDT_NP, &IDTVEC(missing),  SDT_SYSIGT, SEL_KPL, 0);
2739           setidt_global(IDT_SS, &IDTVEC(stk),  SDT_SYSIGT, SEL_KPL, 0);
2740           setidt_global(IDT_GP, &IDTVEC(prot),  SDT_SYSIGT, SEL_KPL, 0);
2741           setidt_global(IDT_PF, &IDTVEC(page),  SDT_SYSIGT, SEL_KPL, 0);
2742           setidt_global(IDT_MF, &IDTVEC(fpu),  SDT_SYSIGT, SEL_KPL, 0);
2743           setidt_global(IDT_AC, &IDTVEC(align), SDT_SYSIGT, SEL_KPL, 0);
2744           setidt_global(IDT_MC, &IDTVEC(mchk),  SDT_SYSIGT, SEL_KPL, 0);
2745           setidt_global(IDT_XF, &IDTVEC(xmm), SDT_SYSIGT, SEL_KPL, 0);
2746 
2747           for (cpu = 0; cpu < MAXCPU; ++cpu) {
2748                     r_idt_arr[cpu].rd_limit = sizeof(idt_arr[cpu]) - 1;
2749                     r_idt_arr[cpu].rd_base = (long) &idt_arr[cpu][0];
2750           }
2751 
2752           lidt(&r_idt_arr[0]);
2753 
2754           /*
2755            * Initialize the console before we print anything out.
2756            */
2757           cninit();
2758 
2759 #if 0 /* JG */
2760           if (metadata_missing)
2761                     kprintf("WARNING: loader(8) metadata is missing!\n");
2762 #endif
2763 
2764 #if       NISA >0
2765           elcr_probe();
2766           isa_defaultirq();
2767 #endif
2768           rand_initialize();
2769 
2770           /*
2771            * Initialize IRQ mapping
2772            *
2773            * NOTE:
2774            * SHOULD be after elcr_probe()
2775            */
2776           MachIntrABI_ICU.initmap();
2777           MachIntrABI_IOAPIC.initmap();
2778 
2779 #ifdef DDB
2780           kdb_init();
2781           if (boothowto & RB_KDB)
2782                     Debugger("Boot flags requested debugger");
2783 #endif
2784 
2785           identify_cpu();               /* Final stage of CPU initialization */
2786           initializecpu(0);   /* Initialize CPU registers */
2787 
2788           /*
2789            * On modern Intel cpus, haswell or later, cpu_idle_hlt=1 is better
2790            * because the cpu does significant power management in MWAIT
2791            * (also suggested is to set sysctl machdep.mwait.CX.idle=AUTODEEP).
2792            *
2793            * On many AMD cpus cpu_idle_hlt=3 is better, because the cpu does
2794            * significant power management only when using ACPI halt mode.
2795            * (However, on Ryzen, mode 4 (HLT) also does power management).
2796            *
2797            * On older AMD or Intel cpus, cpu_idle_hlt=2 is better because ACPI
2798            * is needed to reduce power consumption, but wakeup times are often
2799            * too long.
2800            */
2801           if (cpu_vendor_id == CPU_VENDOR_INTEL &&
2802               CPUID_TO_MODEL(cpu_id) >= 0x3C) {   /* Haswell or later */
2803                     cpu_idle_hlt = 1;
2804           }
2805           if (cpu_vendor_id == CPU_VENDOR_AMD) {
2806                     if (CPUID_TO_FAMILY(cpu_id) >= 0x17) {
2807                               /* Ryzen or later */
2808                               cpu_idle_hlt = 3;
2809                     } else if (CPUID_TO_FAMILY(cpu_id) >= 0x14) {
2810                               /* Bobcat or later */
2811                               cpu_idle_hlt = 3;
2812                     }
2813           }
2814 
2815           TUNABLE_INT_FETCH("hw.apic_io_enable", &ioapic_enable); /* for compat */
2816           TUNABLE_INT_FETCH("hw.ioapic_enable", &ioapic_enable);
2817           TUNABLE_INT_FETCH("hw.lapic_enable", &lapic_enable);
2818           TUNABLE_INT_FETCH("machdep.cpu_idle_hlt", &cpu_idle_hlt);
2819 
2820           /*
2821            * By default always enable the ioapic.  Certain virtual machines
2822            * may not work with the I/O apic enabled and can be specified in
2823            * the case statement below.  On the other hand, if the ioapic is
2824            * disabled for virtual machines which DO work with the I/O apic,
2825            * the virtual machine can implode if we disable the I/O apic.
2826            *
2827            * For now enable the ioapic for all guests.
2828            *
2829            * NOTE: This must be done after identify_cpu(), which sets
2830            *         'cpu_feature2'.
2831            */
2832           if (ioapic_enable < 0) {
2833                     ioapic_enable = 1;
2834                     switch(vmm_guest) {
2835                     case VMM_GUEST_NONE:          /* should be enabled on real HW */
2836                     case VMM_GUEST_KVM: /* must be enabled or VM implodes */
2837                               ioapic_enable = 1;
2838                               break;
2839                     default:            /* enable by default for other VMs */
2840                               ioapic_enable = 1;
2841                               break;
2842                     }
2843           }
2844 
2845           /*
2846            * TSS entry point for interrupts, traps, and exceptions
2847            * (sans NMI).  This will always go to near the top of the pcpu
2848            * trampoline area.  Hardware-pushed data will be copied into
2849            * the trap-frame on entry, and (if necessary) returned to the
2850            * trampoline on exit.
2851            *
2852            * We store some pcb data for the trampoline code above the
2853            * stack the cpu hw pushes into, and arrange things so the
2854            * address of tr_pcb_rsp is the same as the desired top of
2855            * stack.
2856            */
2857           ps->common_tss.tss_rsp0 = (register_t)&ps->trampoline.tr_pcb_rsp;
2858           ps->trampoline.tr_pcb_rsp = ps->common_tss.tss_rsp0;
2859           ps->trampoline.tr_pcb_gs_kernel = (register_t)gd;
2860           ps->trampoline.tr_pcb_cr3 = KPML4phys;  /* adj to user cr3 live */
2861           ps->dbltramp.tr_pcb_gs_kernel = (register_t)gd;
2862           ps->dbltramp.tr_pcb_cr3 = KPML4phys;
2863           ps->dbgtramp.tr_pcb_gs_kernel = (register_t)gd;
2864           ps->dbgtramp.tr_pcb_cr3 = KPML4phys;
2865 
2866           /* double fault stack */
2867           ps->common_tss.tss_ist1 = (register_t)&ps->dbltramp.tr_pcb_rsp;
2868           /* #DB debugger needs its own stack */
2869           ps->common_tss.tss_ist2 = (register_t)&ps->dbgtramp.tr_pcb_rsp;
2870 
2871           /* Set the IO permission bitmap (empty due to tss seg limit) */
2872           ps->common_tss.tss_iobase = sizeof(struct x86_64tss);
2873 
2874           gsel_tss = GSEL(GPROC0_SEL, SEL_KPL);
2875           gd->gd_gdt = &gdt_cpu0[0];
2876           gd->gd_tss_gdt = &gd->gd_gdt[GPROC0_SEL];
2877           gd->gd_common_tssd = *gd->gd_tss_gdt;
2878           ltr(gsel_tss);
2879 
2880           /* Set up the fast syscall stuff */
2881           msr = rdmsr(MSR_EFER) | EFER_SCE;
2882           wrmsr(MSR_EFER, msr);
2883           wrmsr(MSR_LSTAR, (u_int64_t)IDTVEC(fast_syscall));
2884           wrmsr(MSR_CSTAR, (u_int64_t)IDTVEC(fast_syscall32));
2885           msr = ((u_int64_t)GSEL(GCODE_SEL, SEL_KPL) << 32) |
2886                 ((u_int64_t)GSEL(GUCODE32_SEL, SEL_UPL) << 48);
2887           wrmsr(MSR_STAR, msr);
2888           wrmsr(MSR_SF_MASK, PSL_NT|PSL_T|PSL_I|PSL_C|PSL_D|PSL_IOPL|PSL_AC);
2889 
2890           getmemsize(kmdp, physfree);
2891           init_param2(physmem);
2892 
2893           /* now running on new page tables, configured,and u/iom is accessible */
2894 
2895           /* Map the message buffer. */
2896 #if 0 /* JG */
2897           for (off = 0; off < round_page(MSGBUF_SIZE); off += PAGE_SIZE)
2898                     pmap_kenter((vm_offset_t)msgbufp + off, avail_end + off);
2899 #endif
2900 
2901           msgbufinit(msgbufp, MSGBUF_SIZE);
2902 
2903 
2904           /* transfer to user mode */
2905 
2906           _ucodesel = GSEL(GUCODE_SEL, SEL_UPL);
2907           _udatasel = GSEL(GUDATA_SEL, SEL_UPL);
2908           _ucode32sel = GSEL(GUCODE32_SEL, SEL_UPL);
2909 
2910           load_ds(_udatasel);
2911           load_es(_udatasel);
2912           load_fs(_udatasel);
2913 
2914           /* setup proc 0's pcb */
2915           thread0.td_pcb->pcb_flags = 0;
2916           thread0.td_pcb->pcb_cr3 = KPML4phys;
2917           thread0.td_pcb->pcb_cr3_iso = 0;
2918           thread0.td_pcb->pcb_ext = NULL;
2919           lwp0.lwp_md.md_regs = &proc0_tf;        /* XXX needed? */
2920 
2921           /* Location of kernel stack for locore */
2922           return ((u_int64_t)thread0.td_pcb);
2923 }
2924 
2925 /*
2926  * Initialize machine-dependant portions of the global data structure.
2927  * Note that the global data area and cpu0's idlestack in the private
2928  * data space were allocated in locore.
2929  *
2930  * Note: the idlethread's cpl is 0
2931  *
2932  * WARNING!  Called from early boot, 'mycpu' may not work yet.
2933  */
2934 void
cpu_gdinit(struct mdglobaldata * gd,int cpu)2935 cpu_gdinit(struct mdglobaldata *gd, int cpu)
2936 {
2937           if (cpu)
2938                     gd->mi.gd_curthread = &gd->mi.gd_idlethread;
2939 
2940           lwkt_init_thread(&gd->mi.gd_idlethread,
2941                               gd->mi.gd_prvspace->idlestack,
2942                               sizeof(gd->mi.gd_prvspace->idlestack),
2943                               0, &gd->mi);
2944           lwkt_set_comm(&gd->mi.gd_idlethread, "idle_%d", cpu);
2945           gd->mi.gd_idlethread.td_switch = cpu_lwkt_switch;
2946           gd->mi.gd_idlethread.td_sp -= sizeof(void *);
2947           *(void **)gd->mi.gd_idlethread.td_sp = cpu_idle_restore;
2948 }
2949 
2950 /*
2951  * We only have to check for DMAP bounds, the globaldata space is
2952  * actually part of the kernel_map so we don't have to waste time
2953  * checking CPU_prvspace[*].
2954  */
2955 int
is_globaldata_space(vm_offset_t saddr,vm_offset_t eaddr)2956 is_globaldata_space(vm_offset_t saddr, vm_offset_t eaddr)
2957 {
2958 #if 0
2959           if (saddr >= (vm_offset_t)&CPU_prvspace[0] &&
2960               eaddr <= (vm_offset_t)&CPU_prvspace[MAXCPU]) {
2961                     return (TRUE);
2962           }
2963 #endif
2964           if (saddr >= DMAP_MIN_ADDRESS && eaddr <= DMAP_MAX_ADDRESS)
2965                     return (TRUE);
2966           return (FALSE);
2967 }
2968 
2969 struct globaldata *
globaldata_find(int cpu)2970 globaldata_find(int cpu)
2971 {
2972           KKASSERT(cpu >= 0 && cpu < ncpus);
2973           return(&CPU_prvspace[cpu]->mdglobaldata.mi);
2974 }
2975 
2976 /*
2977  * This path should be safe from the SYSRET issue because only stopped threads
2978  * can have their %rip adjusted this way (and all heavy weight thread switches
2979  * clear QUICKREF and thus do not use SYSRET).  However, the code path is
2980  * convoluted so add a safety by forcing %rip to be cannonical.
2981  */
2982 int
ptrace_set_pc(struct lwp * lp,unsigned long addr)2983 ptrace_set_pc(struct lwp *lp, unsigned long addr)
2984 {
2985           if (addr & 0x0000800000000000LLU)
2986                     lp->lwp_md.md_regs->tf_rip = addr | 0xFFFF000000000000LLU;
2987           else
2988                     lp->lwp_md.md_regs->tf_rip = addr & 0x0000FFFFFFFFFFFFLLU;
2989           return (0);
2990 }
2991 
2992 int
ptrace_single_step(struct lwp * lp)2993 ptrace_single_step(struct lwp *lp)
2994 {
2995           lp->lwp_md.md_regs->tf_rflags |= PSL_T;
2996           return (0);
2997 }
2998 
2999 int
fill_regs(struct lwp * lp,struct reg * regs)3000 fill_regs(struct lwp *lp, struct reg *regs)
3001 {
3002           struct trapframe *tp;
3003 
3004           if ((tp = lp->lwp_md.md_regs) == NULL)
3005                     return EINVAL;
3006           bcopy(&tp->tf_rdi, &regs->r_rdi, sizeof(*regs));
3007           return (0);
3008 }
3009 
3010 int
set_regs(struct lwp * lp,struct reg * regs)3011 set_regs(struct lwp *lp, struct reg *regs)
3012 {
3013           struct trapframe *tp;
3014 
3015           tp = lp->lwp_md.md_regs;
3016           if (!EFL_SECURE(regs->r_rflags, tp->tf_rflags) ||
3017               !CS_SECURE(regs->r_cs))
3018                     return (EINVAL);
3019           bcopy(&regs->r_rdi, &tp->tf_rdi, sizeof(*regs));
3020           clear_quickret();
3021           return (0);
3022 }
3023 
3024 static void
fill_fpregs_xmm(struct savexmm * sv_xmm,struct save87 * sv_87)3025 fill_fpregs_xmm(struct savexmm *sv_xmm, struct save87 *sv_87)
3026 {
3027           struct env87 *penv_87 = &sv_87->sv_env;
3028           struct envxmm *penv_xmm = &sv_xmm->sv_env;
3029           int i;
3030 
3031           /* FPU control/status */
3032           penv_87->en_cw = penv_xmm->en_cw;
3033           penv_87->en_sw = penv_xmm->en_sw;
3034           penv_87->en_tw = penv_xmm->en_tw;
3035           penv_87->en_fip = penv_xmm->en_fip;
3036           penv_87->en_fcs = penv_xmm->en_fcs;
3037           penv_87->en_opcode = penv_xmm->en_opcode;
3038           penv_87->en_foo = penv_xmm->en_foo;
3039           penv_87->en_fos = penv_xmm->en_fos;
3040 
3041           /* FPU registers */
3042           for (i = 0; i < 8; ++i)
3043                     sv_87->sv_ac[i] = sv_xmm->sv_fp[i].fp_acc;
3044 }
3045 
3046 static void
set_fpregs_xmm(struct save87 * sv_87,struct savexmm * sv_xmm)3047 set_fpregs_xmm(struct save87 *sv_87, struct savexmm *sv_xmm)
3048 {
3049           struct env87 *penv_87 = &sv_87->sv_env;
3050           struct envxmm *penv_xmm = &sv_xmm->sv_env;
3051           int i;
3052 
3053           /* FPU control/status */
3054           penv_xmm->en_cw = penv_87->en_cw;
3055           penv_xmm->en_sw = penv_87->en_sw;
3056           penv_xmm->en_tw = penv_87->en_tw;
3057           penv_xmm->en_fip = penv_87->en_fip;
3058           penv_xmm->en_fcs = penv_87->en_fcs;
3059           penv_xmm->en_opcode = penv_87->en_opcode;
3060           penv_xmm->en_foo = penv_87->en_foo;
3061           penv_xmm->en_fos = penv_87->en_fos;
3062 
3063           /* FPU registers */
3064           for (i = 0; i < 8; ++i)
3065                     sv_xmm->sv_fp[i].fp_acc = sv_87->sv_ac[i];
3066 }
3067 
3068 int
fill_fpregs(struct lwp * lp,struct fpreg * fpregs)3069 fill_fpregs(struct lwp *lp, struct fpreg *fpregs)
3070 {
3071           if (lp->lwp_thread == NULL || lp->lwp_thread->td_pcb == NULL)
3072                     return EINVAL;
3073           if (cpu_fxsr) {
3074                     fill_fpregs_xmm(&lp->lwp_thread->td_pcb->pcb_save.sv_xmm,
3075                                         (struct save87 *)fpregs);
3076                     return (0);
3077           }
3078           bcopy(&lp->lwp_thread->td_pcb->pcb_save.sv_87, fpregs, sizeof *fpregs);
3079           return (0);
3080 }
3081 
3082 int
set_fpregs(struct lwp * lp,struct fpreg * fpregs)3083 set_fpregs(struct lwp *lp, struct fpreg *fpregs)
3084 {
3085           if (cpu_fxsr) {
3086                     set_fpregs_xmm((struct save87 *)fpregs,
3087                                      &lp->lwp_thread->td_pcb->pcb_save.sv_xmm);
3088                     return (0);
3089           }
3090           bcopy(fpregs, &lp->lwp_thread->td_pcb->pcb_save.sv_87, sizeof *fpregs);
3091           return (0);
3092 }
3093 
3094 int
fill_dbregs(struct lwp * lp,struct dbreg * dbregs)3095 fill_dbregs(struct lwp *lp, struct dbreg *dbregs)
3096 {
3097           struct pcb *pcb;
3098 
3099           if (lp == NULL) {
3100                     dbregs->dr[0] = rdr0();
3101                     dbregs->dr[1] = rdr1();
3102                     dbregs->dr[2] = rdr2();
3103                     dbregs->dr[3] = rdr3();
3104                     dbregs->dr[4] = rdr4();
3105                     dbregs->dr[5] = rdr5();
3106                     dbregs->dr[6] = rdr6();
3107                     dbregs->dr[7] = rdr7();
3108                     return (0);
3109           }
3110           if (lp->lwp_thread == NULL || (pcb = lp->lwp_thread->td_pcb) == NULL)
3111                     return EINVAL;
3112           dbregs->dr[0] = pcb->pcb_dr0;
3113           dbregs->dr[1] = pcb->pcb_dr1;
3114           dbregs->dr[2] = pcb->pcb_dr2;
3115           dbregs->dr[3] = pcb->pcb_dr3;
3116           dbregs->dr[4] = 0;
3117           dbregs->dr[5] = 0;
3118           dbregs->dr[6] = pcb->pcb_dr6;
3119           dbregs->dr[7] = pcb->pcb_dr7;
3120           return (0);
3121 }
3122 
3123 int
set_dbregs(struct lwp * lp,struct dbreg * dbregs)3124 set_dbregs(struct lwp *lp, struct dbreg *dbregs)
3125 {
3126           if (lp == NULL) {
3127                     load_dr0(dbregs->dr[0]);
3128                     load_dr1(dbregs->dr[1]);
3129                     load_dr2(dbregs->dr[2]);
3130                     load_dr3(dbregs->dr[3]);
3131                     load_dr4(dbregs->dr[4]);
3132                     load_dr5(dbregs->dr[5]);
3133                     load_dr6(dbregs->dr[6]);
3134                     load_dr7(dbregs->dr[7]);
3135           } else {
3136                     struct pcb *pcb;
3137                     struct ucred *ucred;
3138                     int i;
3139                     uint64_t mask1, mask2;
3140 
3141                     /*
3142                      * Don't let an illegal value for dr7 get set.    Specifically,
3143                      * check for undefined settings.  Setting these bit patterns
3144                      * result in undefined behaviour and can lead to an unexpected
3145                      * TRCTRAP.
3146                      */
3147                     /* JG this loop looks unreadable */
3148                     /* Check 4 2-bit fields for invalid patterns.
3149                      * These fields are R/Wi, for i = 0..3
3150                      */
3151                     /* Is 10 in LENi allowed when running in compatibility mode? */
3152                     /* Pattern 10 in R/Wi might be used to indicate
3153                      * breakpoint on I/O. Further analysis should be
3154                      * carried to decide if it is safe and useful to
3155                      * provide access to that capability
3156                      */
3157                     for (i = 0, mask1 = 0x3<<16, mask2 = 0x2<<16; i < 4;
3158                          i++, mask1 <<= 4, mask2 <<= 4)
3159                               if ((dbregs->dr[7] & mask1) == mask2)
3160                                         return (EINVAL);
3161 
3162                     pcb = lp->lwp_thread->td_pcb;
3163                     ucred = lp->lwp_proc->p_ucred;
3164 
3165                     /*
3166                      * Don't let a process set a breakpoint that is not within the
3167                      * process's address space.  If a process could do this, it
3168                      * could halt the system by setting a breakpoint in the kernel
3169                      * (if ddb was enabled).  Thus, we need to check to make sure
3170                      * that no breakpoints are being enabled for addresses outside
3171                      * process's address space, unless, perhaps, we were called by
3172                      * uid 0.
3173                      *
3174                      * XXX - what about when the watched area of the user's
3175                      * address space is written into from within the kernel
3176                      * ... wouldn't that still cause a breakpoint to be generated
3177                      * from within kernel mode?
3178                      */
3179 
3180                     if (caps_priv_check(ucred, SYSCAP_RESTRICTEDROOT) != 0) {
3181                               if (dbregs->dr[7] & 0x3) {
3182                                         /* dr0 is enabled */
3183                                         if (dbregs->dr[0] >= VM_MAX_USER_ADDRESS)
3184                                                   return (EINVAL);
3185                               }
3186 
3187                               if (dbregs->dr[7] & (0x3<<2)) {
3188                                         /* dr1 is enabled */
3189                                         if (dbregs->dr[1] >= VM_MAX_USER_ADDRESS)
3190                                                   return (EINVAL);
3191                               }
3192 
3193                               if (dbregs->dr[7] & (0x3<<4)) {
3194                                         /* dr2 is enabled */
3195                                         if (dbregs->dr[2] >= VM_MAX_USER_ADDRESS)
3196                                                   return (EINVAL);
3197                               }
3198 
3199                               if (dbregs->dr[7] & (0x3<<6)) {
3200                                         /* dr3 is enabled */
3201                                         if (dbregs->dr[3] >= VM_MAX_USER_ADDRESS)
3202                                                   return (EINVAL);
3203                               }
3204                     }
3205 
3206                     pcb->pcb_dr0 = dbregs->dr[0];
3207                     pcb->pcb_dr1 = dbregs->dr[1];
3208                     pcb->pcb_dr2 = dbregs->dr[2];
3209                     pcb->pcb_dr3 = dbregs->dr[3];
3210                     pcb->pcb_dr6 = dbregs->dr[6];
3211                     pcb->pcb_dr7 = dbregs->dr[7];
3212 
3213                     pcb->pcb_flags |= PCB_DBREGS;
3214           }
3215 
3216           return (0);
3217 }
3218 
3219 /*
3220  * Return > 0 if a hardware breakpoint has been hit, and the
3221  * breakpoint was in user space.  Return 0, otherwise.
3222  */
3223 int
user_dbreg_trap(void)3224 user_dbreg_trap(void)
3225 {
3226           u_int64_t dr7, dr6; /* debug registers dr6 and dr7 */
3227           u_int64_t bp;       /* breakpoint bits extracted from dr6 */
3228           int nbp;            /* number of breakpoints that triggered */
3229           caddr_t addr[4];    /* breakpoint addresses */
3230           int i;
3231 
3232           dr7 = rdr7();
3233           if ((dr7 & 0xff) == 0) {
3234                     /*
3235                      * all GE and LE bits in the dr7 register are zero,
3236                      * thus the trap couldn't have been caused by the
3237                      * hardware debug registers
3238                      */
3239                     return 0;
3240           }
3241 
3242           nbp = 0;
3243           dr6 = rdr6();
3244           bp = dr6 & 0xf;
3245 
3246           if (bp == 0) {
3247                     /*
3248                      * None of the breakpoint bits are set meaning this
3249                      * trap was not caused by any of the debug registers
3250                      */
3251                     return 0;
3252           }
3253 
3254           /*
3255            * at least one of the breakpoints were hit, check to see
3256            * which ones and if any of them are user space addresses
3257            */
3258 
3259           if (bp & 0x01) {
3260                     addr[nbp++] = (caddr_t)rdr0();
3261           }
3262           if (bp & 0x02) {
3263                     addr[nbp++] = (caddr_t)rdr1();
3264           }
3265           if (bp & 0x04) {
3266                     addr[nbp++] = (caddr_t)rdr2();
3267           }
3268           if (bp & 0x08) {
3269                     addr[nbp++] = (caddr_t)rdr3();
3270           }
3271 
3272           for (i = 0; i < nbp; i++) {
3273                     if (addr[i] < (caddr_t)VM_MAX_USER_ADDRESS) {
3274                               /*
3275                                * addr[i] is in user space
3276                                */
3277                               return nbp;
3278                     }
3279           }
3280 
3281           /*
3282            * None of the breakpoints are in user space.
3283            */
3284           return 0;
3285 }
3286 
3287 
3288 #ifndef DDB
/*
 * Stand-in used when the kernel is built without DDB: there is no
 * debugger to enter, so the request is only logged to the console.
 */
void
Debugger(const char *msg)
{
	kprintf("Debugger(\"%s\") called.\n", msg);
}
3294 #endif /* no DDB */
3295 
3296 #ifdef DDB
3297 
3298 /*
3299  * Provide inb() and outb() as functions.  They are normally only
3300  * available as macros calling inlined functions, thus cannot be
3301  * called inside DDB.
3302  *
3303  * The actual code is stolen from <machine/cpufunc.h>, and de-inlined.
3304  */
3305 
3306 #undef inb
3307 #undef outb
3308 
3309 /* silence compiler warnings */
3310 u_char inb(u_int);
3311 void outb(u_int, u_char);
3312 
3313 u_char
inb(u_int port)3314 inb(u_int port)
3315 {
3316           u_char    data;
3317           /*
3318            * We use %%dx and not %1 here because i/o is done at %dx and not at
3319            * %edx, while gcc generates inferior code (movw instead of movl)
3320            * if we tell it to load (u_short) port.
3321            */
3322           __asm __volatile("inb %%dx,%0" : "=a" (data) : "d" (port));
3323           return (data);
3324 }
3325 
3326 void
outb(u_int port,u_char data)3327 outb(u_int port, u_char data)
3328 {
3329           u_char    al;
3330           /*
3331            * Use an unnecessary assignment to help gcc's register allocator.
3332            * This make a large difference for gcc-1.40 and a tiny difference
3333            * for gcc-2.6.0.  For gcc-1.40, al had to be ``asm("ax")'' for
3334            * best results.  gcc-2.6.0 can't handle this.
3335            */
3336           al = data;
3337           __asm __volatile("outb %0,%%dx" : : "a" (al), "d" (port));
3338 }
3339 
3340 #endif /* DDB */
3341 
3342 
3343 
3344 /*
3345  * initialize all the SMP locks
3346  */
3347 
3348 /* critical region when masking or unmasking interrupts */
3349 struct spinlock_deprecated imen_spinlock;
3350 
3351 /* locks com (tty) data/hardware accesses: a FASTINTR() */
3352 struct spinlock_deprecated com_spinlock;
3353 
3354 /* lock regions around the clock hardware */
3355 struct spinlock_deprecated clock_spinlock;
3356 
3357 static void
init_locks(void)3358 init_locks(void)
3359 {
3360           /*
3361            * Get the initial mplock with a count of 1 for the BSP.
3362            * This uses a LOGICAL cpu ID, ie BSP == 0.
3363            */
3364           cpu_get_initial_mplock();
3365           /* DEPRECATED */
3366           spin_init_deprecated(&imen_spinlock);
3367           spin_init_deprecated(&com_spinlock);
3368           spin_init_deprecated(&clock_spinlock);
3369 
3370           /* our token pool needs to work early */
3371           lwkt_token_pool_init();
3372 }
3373 
3374 boolean_t
cpu_mwait_hint_valid(uint32_t hint)3375 cpu_mwait_hint_valid(uint32_t hint)
3376 {
3377           int cx_idx, sub;
3378 
3379           cx_idx = MWAIT_EAX_TO_CX(hint);
3380           if (cx_idx >= CPU_MWAIT_CX_MAX)
3381                     return FALSE;
3382 
3383           sub = MWAIT_EAX_TO_CX_SUB(hint);
3384           if (sub >= cpu_mwait_cx_info[cx_idx].subcnt)
3385                     return FALSE;
3386 
3387           return TRUE;
3388 }
3389 
3390 void
cpu_mwait_cx_no_bmsts(void)3391 cpu_mwait_cx_no_bmsts(void)
3392 {
3393           atomic_clear_int(&cpu_mwait_c3_preamble, CPU_MWAIT_C3_PREAMBLE_BM_STS);
3394 }
3395 
3396 void
cpu_mwait_cx_no_bmarb(void)3397 cpu_mwait_cx_no_bmarb(void)
3398 {
3399           atomic_clear_int(&cpu_mwait_c3_preamble, CPU_MWAIT_C3_PREAMBLE_BM_ARB);
3400 }
3401 
3402 static int
cpu_mwait_cx_hint2name(int hint,char * name,int namelen,boolean_t allow_auto)3403 cpu_mwait_cx_hint2name(int hint, char *name, int namelen, boolean_t allow_auto)
3404 {
3405           int old_cx_idx, sub = 0;
3406 
3407           if (hint >= 0) {
3408                     old_cx_idx = MWAIT_EAX_TO_CX(hint);
3409                     sub = MWAIT_EAX_TO_CX_SUB(hint);
3410           } else if (hint == CPU_MWAIT_HINT_AUTO) {
3411                     old_cx_idx = allow_auto ? CPU_MWAIT_C2 : CPU_MWAIT_CX_MAX;
3412           } else if (hint == CPU_MWAIT_HINT_AUTODEEP) {
3413                     old_cx_idx = allow_auto ? CPU_MWAIT_C3 : CPU_MWAIT_CX_MAX;
3414           } else {
3415                     old_cx_idx = CPU_MWAIT_CX_MAX;
3416           }
3417 
3418           if (!CPU_MWAIT_HAS_CX)
3419                     strlcpy(name, "NONE", namelen);
3420           else if (allow_auto && hint == CPU_MWAIT_HINT_AUTO)
3421                     strlcpy(name, "AUTO", namelen);
3422           else if (allow_auto && hint == CPU_MWAIT_HINT_AUTODEEP)
3423                     strlcpy(name, "AUTODEEP", namelen);
3424           else if (old_cx_idx >= CPU_MWAIT_CX_MAX ||
3425               sub >= cpu_mwait_cx_info[old_cx_idx].subcnt)
3426                     strlcpy(name, "INVALID", namelen);
3427           else
3428                     ksnprintf(name, namelen, "C%d/%d", old_cx_idx, sub);
3429 
3430           return old_cx_idx;
3431 }
3432 
3433 static int
cpu_mwait_cx_name2hint(char * name,int * hint0,boolean_t allow_auto)3434 cpu_mwait_cx_name2hint(char *name, int *hint0, boolean_t allow_auto)
3435 {
3436           int cx_idx, sub, hint;
3437           char *ptr, *start;
3438 
3439           if (allow_auto && strcmp(name, "AUTO") == 0) {
3440                     hint = CPU_MWAIT_HINT_AUTO;
3441                     cx_idx = CPU_MWAIT_C2;
3442                     goto done;
3443           }
3444           if (allow_auto && strcmp(name, "AUTODEEP") == 0) {
3445                     hint = CPU_MWAIT_HINT_AUTODEEP;
3446                     cx_idx = CPU_MWAIT_C3;
3447                     goto done;
3448           }
3449 
3450           if (strlen(name) < 4 || toupper(name[0]) != 'C')
3451                     return -1;
3452           start = &name[1];
3453           ptr = NULL;
3454 
3455           cx_idx = strtol(start, &ptr, 10);
3456           if (ptr == start || *ptr != '/')
3457                     return -1;
3458           if (cx_idx < 0 || cx_idx >= CPU_MWAIT_CX_MAX)
3459                     return -1;
3460 
3461           start = ptr + 1;
3462           ptr = NULL;
3463 
3464           sub = strtol(start, &ptr, 10);
3465           if (*ptr != '\0')
3466                     return -1;
3467           if (sub < 0 || sub >= cpu_mwait_cx_info[cx_idx].subcnt)
3468                     return -1;
3469 
3470           hint = MWAIT_EAX_HINT(cx_idx, sub);
3471 done:
3472           *hint0 = hint;
3473           return cx_idx;
3474 }
3475 
3476 static int
cpu_mwait_cx_transit(int old_cx_idx,int cx_idx)3477 cpu_mwait_cx_transit(int old_cx_idx, int cx_idx)
3478 {
3479           if (cx_idx >= CPU_MWAIT_C3 && cpu_mwait_c3_preamble)
3480                     return EOPNOTSUPP;
3481           if (old_cx_idx < CPU_MWAIT_C3 && cx_idx >= CPU_MWAIT_C3) {
3482                     int error;
3483 
3484                     error = cputimer_intr_powersave_addreq();
3485                     if (error)
3486                               return error;
3487           } else if (old_cx_idx >= CPU_MWAIT_C3 && cx_idx < CPU_MWAIT_C3) {
3488                     cputimer_intr_powersave_remreq();
3489           }
3490           return 0;
3491 }
3492 
3493 static int
cpu_mwait_cx_select_sysctl(SYSCTL_HANDLER_ARGS,int * hint0,boolean_t allow_auto)3494 cpu_mwait_cx_select_sysctl(SYSCTL_HANDLER_ARGS, int *hint0,
3495     boolean_t allow_auto)
3496 {
3497           int error, cx_idx, old_cx_idx, hint;
3498           char name[CPU_MWAIT_CX_NAMELEN];
3499 
3500           hint = *hint0;
3501           old_cx_idx = cpu_mwait_cx_hint2name(hint, name, sizeof(name),
3502               allow_auto);
3503 
3504           error = sysctl_handle_string(oidp, name, sizeof(name), req);
3505           if (error != 0 || req->newptr == NULL)
3506                     return error;
3507 
3508           if (!CPU_MWAIT_HAS_CX)
3509                     return EOPNOTSUPP;
3510 
3511           cx_idx = cpu_mwait_cx_name2hint(name, &hint, allow_auto);
3512           if (cx_idx < 0)
3513                     return EINVAL;
3514 
3515           error = cpu_mwait_cx_transit(old_cx_idx, cx_idx);
3516           if (error)
3517                     return error;
3518 
3519           *hint0 = hint;
3520           return 0;
3521 }
3522 
3523 static int
cpu_mwait_cx_setname(struct cpu_idle_stat * stat,const char * cx_name)3524 cpu_mwait_cx_setname(struct cpu_idle_stat *stat, const char *cx_name)
3525 {
3526           int error, cx_idx, old_cx_idx, hint;
3527           char name[CPU_MWAIT_CX_NAMELEN];
3528 
3529           KASSERT(CPU_MWAIT_HAS_CX, ("cpu does not support mwait CX extension"));
3530 
3531           hint = stat->hint;
3532           old_cx_idx = cpu_mwait_cx_hint2name(hint, name, sizeof(name), TRUE);
3533 
3534           strlcpy(name, cx_name, sizeof(name));
3535           cx_idx = cpu_mwait_cx_name2hint(name, &hint, TRUE);
3536           if (cx_idx < 0)
3537                     return EINVAL;
3538 
3539           error = cpu_mwait_cx_transit(old_cx_idx, cx_idx);
3540           if (error)
3541                     return error;
3542 
3543           stat->hint = hint;
3544           return 0;
3545 }
3546 
3547 static int
cpu_mwait_cx_idle_sysctl(SYSCTL_HANDLER_ARGS)3548 cpu_mwait_cx_idle_sysctl(SYSCTL_HANDLER_ARGS)
3549 {
3550           int hint = cpu_mwait_halt_global;
3551           int error, cx_idx, cpu;
3552           char name[CPU_MWAIT_CX_NAMELEN], cx_name[CPU_MWAIT_CX_NAMELEN];
3553 
3554           cpu_mwait_cx_hint2name(hint, name, sizeof(name), TRUE);
3555 
3556           error = sysctl_handle_string(oidp, name, sizeof(name), req);
3557           if (error != 0 || req->newptr == NULL)
3558                     return error;
3559 
3560           if (!CPU_MWAIT_HAS_CX)
3561                     return EOPNOTSUPP;
3562 
3563           /* Save name for later per-cpu CX configuration */
3564           strlcpy(cx_name, name, sizeof(cx_name));
3565 
3566           cx_idx = cpu_mwait_cx_name2hint(name, &hint, TRUE);
3567           if (cx_idx < 0)
3568                     return EINVAL;
3569 
3570           /* Change per-cpu CX configuration */
3571           for (cpu = 0; cpu < ncpus; ++cpu) {
3572                     error = cpu_mwait_cx_setname(&cpu_idle_stats[cpu], cx_name);
3573                     if (error)
3574                               return error;
3575           }
3576 
3577           cpu_mwait_halt_global = hint;
3578           return 0;
3579 }
3580 
3581 static int
cpu_mwait_cx_pcpu_idle_sysctl(SYSCTL_HANDLER_ARGS)3582 cpu_mwait_cx_pcpu_idle_sysctl(SYSCTL_HANDLER_ARGS)
3583 {
3584           struct cpu_idle_stat *stat = arg1;
3585           int error;
3586 
3587           error = cpu_mwait_cx_select_sysctl(oidp, arg1, arg2, req,
3588               &stat->hint, TRUE);
3589           return error;
3590 }
3591 
3592 static int
cpu_mwait_cx_spin_sysctl(SYSCTL_HANDLER_ARGS)3593 cpu_mwait_cx_spin_sysctl(SYSCTL_HANDLER_ARGS)
3594 {
3595           int error;
3596 
3597           error = cpu_mwait_cx_select_sysctl(oidp, arg1, arg2, req,
3598               &cpu_mwait_spin, FALSE);
3599           return error;
3600 }
3601 
3602 /*
3603  * This manual debugging code is called unconditionally from Xtimer
3604  * (the per-cpu timer interrupt) whether the current thread is in a
3605  * critical section or not) and can be useful in tracking down lockups.
3606  *
3607  * NOTE: MANUAL DEBUG CODE
3608  */
3609 #if 0
3610 static int saveticks[SMP_MAXCPU];
3611 static int savecounts[SMP_MAXCPU];
3612 #endif
3613 static tsc_uclock_t last_tsc[SMP_MAXCPU];
3614 
3615 void
pcpu_timer_always(struct intrframe * frame)3616 pcpu_timer_always(struct intrframe *frame)
3617 {
3618           globaldata_t gd;
3619           thread_t td;
3620           char *top;
3621           char *bot;
3622           char *rbp;
3623           char *rip;
3624           int n;
3625           tsc_uclock_t tsc;
3626 
3627           if (flame_poll_debug == 0)
3628                     return;
3629           gd = mycpu;
3630           tsc = rdtsc() - last_tsc[gd->gd_cpuid];
3631           if (tsc_frequency == 0 || tsc < tsc_frequency)
3632                     return;
3633           last_tsc[gd->gd_cpuid] = rdtsc();
3634 
3635           td = gd->gd_curthread;
3636           if (td == NULL)
3637                     return;
3638           bot = (char *)td->td_kstack + PAGE_SIZE;        /* skip guard */
3639           top = (char *)td->td_kstack + td->td_kstack_size;
3640           if (bot >= top)
3641                     return;
3642 
3643           rip = (char *)(intptr_t)frame->if_rip;
3644           kprintf("POLL%02d %016lx", gd->gd_cpuid, (intptr_t)rip);
3645           rbp = (char *)(intptr_t)frame->if_rbp;
3646 
3647           for (n = 1; n < 8; ++n) {
3648                     if (rbp < bot || rbp > top - 8 || ((intptr_t)rbp & 7))
3649                               break;
3650                     kprintf("<-%016lx", (intptr_t)*(char **)(rbp + 8));
3651                     if (*(char **)rbp <= rbp)
3652                               break;
3653                     rbp = *(char **)rbp;
3654           }
3655           kprintf("\n");
3656           cpu_sfence();
3657 }
3658 
3659 SET_DECLARE(smap_open, char);
3660 SET_DECLARE(smap_close, char);
3661 
3662 static void
cpu_implement_smap(void)3663 cpu_implement_smap(void)
3664 {
3665           char **scan;
3666 
3667           for (scan = SET_BEGIN(smap_open);                 /* nop -> stac */
3668                scan < SET_LIMIT(smap_open); ++scan) {
3669                     (*scan)[0] = 0x0F;
3670                     (*scan)[1] = 0x01;
3671                     (*scan)[2] = 0xCB;
3672           }
3673           for (scan = SET_BEGIN(smap_close);                /* nop -> clac */
3674                scan < SET_LIMIT(smap_close); ++scan) {
3675                     (*scan)[0] = 0x0F;
3676                     (*scan)[1] = 0x01;
3677                     (*scan)[2] = 0xCA;
3678           }
3679 }
3680 
3681 /*
3682  * From a hard interrupt
3683  */
3684 int
cpu_interrupt_running(struct thread * td)3685 cpu_interrupt_running(struct thread *td)
3686 {
3687           struct mdglobaldata *gd = mdcpu;
3688 
3689           if (clock_debug1 > 0) {
3690                     --clock_debug1;
3691                     kprintf("%d %016lx %016lx %016lx\n",
3692                               ((td->td_flags & TDF_INTTHREAD) != 0),
3693                               gd->gd_ipending[0],
3694                               gd->gd_ipending[1],
3695                               gd->gd_ipending[2]);
3696                     if (td->td_flags & TDF_CLKTHREAD) {
3697                               kprintf("CLKTD %s PREEMPT %s\n",
3698                                         td->td_comm,
3699                                         (td->td_preempted ?
3700                                          td->td_preempted->td_comm : ""));
3701                     } else {
3702                               kprintf("NORTD %s\n", td->td_comm);
3703                     }
3704           }
3705           if ((td->td_flags & TDF_INTTHREAD) ||
3706               gd->gd_ipending[0] ||
3707               gd->gd_ipending[1] ||
3708               gd->gd_ipending[2]) {
3709                     return 1;
3710           } else {
3711                     return 0;
3712           }
3713 }
3714