1 /*        $NetBSD: svs.c,v 1.42 2022/09/24 11:05:18 riastradh Exp $   */
2 
3 /*
4  * Copyright (c) 2018-2020 The NetBSD Foundation, Inc.
5  * All rights reserved.
6  *
7  * This code is derived from software contributed to The NetBSD Foundation
8  * by Maxime Villard.
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  *
19  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
20  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
21  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
22  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
23  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29  * POSSIBILITY OF SUCH DAMAGE.
30  */
31 
32 #include <sys/cdefs.h>
33 __KERNEL_RCSID(0, "$NetBSD: svs.c,v 1.42 2022/09/24 11:05:18 riastradh Exp $");
34 
35 #include "opt_svs.h"
36 #include "opt_user_ldt.h"
37 
38 #include <sys/param.h>
39 #include <sys/systm.h>
40 #include <sys/proc.h>
41 #include <sys/cpu.h>
42 #include <sys/kauth.h>
43 #include <sys/sysctl.h>
44 #include <sys/xcall.h>
45 #include <sys/reboot.h>
46 
47 #include <x86/cputypes.h>
48 
49 #include <machine/cpuvar.h>
50 #include <machine/frameasm.h>
51 #include <machine/gdt.h>
52 #include <machine/pmap_private.h>
53 
54 #include <uvm/uvm.h>
55 #include <uvm/uvm_page.h>
56 
57 /*
58  * Separate Virtual Space
59  *
60  * A per-cpu L4 page is maintained in ci_svs_updirpa. During each context
61  * switch to a user pmap, the lower half of updirpa is populated with the
62  * entries containing the userland pages.
63  *
64  * ~~~~~~~~~~ The UTLS Page ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
65  *
66  * We use a special per-cpu page that we call UTLS, for User Thread Local
67  * Storage. Each CPU has one UTLS page. This page has two VAs:
68  *
69  *  o When the user page tables are loaded in CR3, the VA to access this
70  *    page is &pcpuarea->utls, defined as SVS_UTLS in assembly. This VA is
71  *    _constant_ across CPUs, but in the user page tables this VA points to
72  *    the physical page of the UTLS that is _local_ to the CPU.
73  *
74  *  o When the kernel page tables are loaded in CR3, the VA to access this
75  *    page is ci->ci_svs_utls.
76  *
77  * +----------------------------------------------------------------------+
78  * | CPU0 Local Data                                      (Physical Page) |
79  * | +------------------+                                 +-------------+ |
80  * | | User Page Tables | SVS_UTLS ---------------------> | cpu0's UTLS | |
81  * | +------------------+                                 +-------------+ |
82  * +-------------------------------------------------------------^--------+
83  *                                                               |
84  *                                                               +----------+
85  *                                                                          |
86  * +----------------------------------------------------------------------+ |
87  * | CPU1 Local Data                                      (Physical Page) | |
88  * | +------------------+                                 +-------------+ | |
89  * | | User Page Tables | SVS_UTLS ---------------------> | cpu1's UTLS | | |
90  * | +------------------+                                 +-------------+ | |
91  * +-------------------------------------------------------------^--------+ |
92  *                                                               |          |
93  *   +------------------+                 /----------------------+          |
94  *   | Kern Page Tables | ci->ci_svs_utls                                   |
95  *   +------------------+                 \---------------------------------+
96  *
97  * The goal of the UTLS page is to provide an area where we can store whatever
98  * we want, in a way that it is accessible both when the Kernel and when the
99  * User page tables are loaded in CR3.
100  *
101  * We store in the UTLS page three 64bit values:
102  *
103  *  o UTLS_KPDIRPA: the value we must put in CR3 in order to load the kernel
104  *    page tables.
105  *
106  *  o UTLS_SCRATCH: a dummy place where we temporarily store a value during
107  *    the syscall entry procedure.
108  *
109  *  o UTLS_RSP0: the value we must put in RSP in order to have a stack where
110  *    we can push the register states. This is used only during the syscall
111  *    entry procedure, because there the CPU does not automatically switch
112  *    RSP (it does not use the TSS.rsp0 mechanism described below).
113  *
114  * ~~~~~~~~~~ The Stack Switching Mechanism Without SVS ~~~~~~~~~~~~~~~~~~~~~~
115  *
116  * The kernel stack is per-lwp (pcb_rsp0). When doing a context switch between
117  * two user LWPs, the kernel updates TSS.rsp0 (which is per-cpu) to point to
118  * the stack of the new LWP. Then the execution continues. At some point, the
119  * user LWP we context-switched to will perform a syscall or will receive an
120  * interrupt. There, the CPU will automatically read TSS.rsp0 and use it as a
121  * stack. The kernel then pushes the register states on this stack, and
122  * executes in kernel mode normally.
123  *
124  * TSS.rsp0 is used by the CPU only during ring3->ring0 transitions. Therefore,
125  * when an interrupt is received while we were in kernel mode, the CPU does not
126  * read TSS.rsp0. Instead, it just uses the current stack.
127  *
128  * ~~~~~~~~~~ The Stack Switching Mechanism With SVS ~~~~~~~~~~~~~~~~~~~~~~~~~
129  *
130  * In the pcpu_area structure, pointed to by the "pcpuarea" variable, each CPU
131  * has a two-page rsp0 entry (pcpuarea->ent[cid].rsp0). These two pages do
132  * _not_ have associated physical addresses. They are only two VAs.
133  *
134  * The first page is unmapped and acts as a redzone. The second page is
135  * dynamically kentered into the highest page of the real per-lwp kernel stack;
136  * but pay close attention, it is kentered _only_ in the user page tables.
137  * That is to say, the VA of this second page is mapped when the user page
138  * tables are loaded, but not mapped when the kernel page tables are loaded.
139  *
140  * During a context switch, svs_lwp_switch() gets called first. This function
141  * does the kenter job described above, not in the kernel page tables (that
142  * are currently loaded), but in the user page tables (that are not loaded).
143  *
144  *           VIRTUAL ADDRESSES                     PHYSICAL ADDRESSES
145  *
146  * +-----------------------------+
147  * |      KERNEL PAGE TABLES     |
148  * |    +-------------------+    |                +-------------------+
149  * |    | pcb_rsp0 (page 0) | ------------------> | pcb_rsp0 (page 0) |
150  * |    +-------------------+    |                +-------------------+
151  * |    | pcb_rsp0 (page 1) | ------------------> | pcb_rsp0 (page 1) |
152  * |    +-------------------+    |                +-------------------+
153  * |    | pcb_rsp0 (page 2) | ------------------> | pcb_rsp0 (page 2) |
154  * |    +-------------------+    |                +-------------------+
155  * |    | pcb_rsp0 (page 3) | ------------------> | pcb_rsp0 (page 3) |
156  * |    +-------------------+    |            +-> +-------------------+
157  * +-----------------------------+            |
158  *                                            |
159  * +---------------------------------------+  |
160  * |           USER PAGE TABLES            |  |
161  * | +----------------------------------+  |  |
162  * | | pcpuarea->ent[cid].rsp0 (page 0) |  |  |
163  * | +----------------------------------+  |  |
164  * | | pcpuarea->ent[cid].rsp0 (page 1) | ----+
165  * | +----------------------------------+  |
166  * +---------------------------------------+
167  *
168  * After svs_lwp_switch() gets called, we set pcpuarea->ent[cid].rsp0 (page 1)
169  * in TSS.rsp0. Later, when returning to userland on the lwp we context-
170  * switched to, we will load the user page tables and execute in userland
171  * normally.
172  *
173  * Next time an interrupt or syscall is received, the CPU will automatically
174  * use TSS.rsp0 as a stack. Here it is executing with the user page tables
175  * loaded, and therefore TSS.rsp0 is _mapped_.
176  *
177  * As part of the kernel entry procedure, we now switch CR3 to load the kernel
178  * page tables. Here, we are still using the stack pointer we set in TSS.rsp0.
179  *
180  * Remember that it was only one page of stack which was mapped only in the
181  * user page tables. We just switched to the kernel page tables, so we must
182  * update RSP to be the real per-lwp kernel stack (pcb_rsp0). And we do so,
183  * without touching the stack (since it is now unmapped, touching it would
184  * fault).
185  *
186  * After we updated RSP, we can continue execution exactly as in the non-SVS
187  * case. We don't need to copy the values the CPU pushed on TSS.rsp0: even if
188  * we updated RSP to a totally different VA, this VA points to the same
189  * physical page as TSS.rsp0. So in the end, the values the CPU pushed are
190  * still here even with the new RSP.
191  *
192  * Thanks to this double-kenter optimization, we don't need to copy the
193  * trapframe during each user<->kernel transition.
194  *
195  * ~~~~~~~~~~ Notes On Locking And Synchronization ~~~~~~~~~~~~~~~~~~~~~~~~~~~
196  *
197  *  o Touching ci_svs_updir without holding ci_svs_mtx first is *not*
198  *    allowed.
199  *
200  *  o pm_kernel_cpus contains the set of CPUs that have the pmap loaded
201  *    in their CR3 register. It must *not* be replaced by pm_cpus.
202  *
203  *  o When a context switch on the current CPU is made from a user LWP
204  *    towards a kernel LWP, CR3 is not updated. Therefore, the pmap's
205  *    pm_kernel_cpus still contains the current CPU. It implies that the
206  *    remote CPUs that execute other threads of the user process we just
207  *    left will keep synchronizing us against their changes.
208  *
209  * ~~~~~~~~~~ List Of Areas That Are Removed From Userland ~~~~~~~~~~~~~~~~~~~
210  *
211  *  o PTE Space
212  *  o Direct Map
213  *  o Remote PCPU Areas
214  *  o Kernel Heap
215  *  o Kernel Image
216  *
217  * ~~~~~~~~~~ Todo List ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
218  *
219  * Ordered from highest priority to lowest:
220  *
221  *  o The NMI stack is not double-entered. Therefore if we ever receive an NMI
222  *    and leave it, the content of the stack will be visible to userland (via
223  *    Meltdown). Normally we never leave NMIs, unless a privileged user
224  *    launched PMCs. That's unlikely to happen, our PMC support is pretty
225  *    minimal, and privileged only.
226  *
227  *  o Narrow down the entry points: hide the 'jmp handler' instructions. This
228  *    makes sense on GENERIC_KASLR kernels.
229  */
230 
231 /* -------------------------------------------------------------------------- */
232 
233 /* SVS_ENTER. */
234 extern uint8_t svs_enter, svs_enter_end;
235 static const struct x86_hotpatch_source hp_svs_enter_source = {
236           .saddr = &svs_enter,
237           .eaddr = &svs_enter_end
238 };
239 static const struct x86_hotpatch_descriptor hp_svs_enter_desc = {
240           .name = HP_NAME_SVS_ENTER,
241           .nsrc = 1,
242           .srcs = { &hp_svs_enter_source }
243 };
244 __link_set_add_rodata(x86_hotpatch_descriptors, hp_svs_enter_desc);
245 
246 /* SVS_ENTER_ALT. */
247 extern uint8_t svs_enter_altstack, svs_enter_altstack_end;
248 static const struct x86_hotpatch_source hp_svs_enter_altstack_source = {
249           .saddr = &svs_enter_altstack,
250           .eaddr = &svs_enter_altstack_end
251 };
252 static const struct x86_hotpatch_descriptor hp_svs_enter_altstack_desc = {
253           .name = HP_NAME_SVS_ENTER_ALT,
254           .nsrc = 1,
255           .srcs = { &hp_svs_enter_altstack_source }
256 };
257 __link_set_add_rodata(x86_hotpatch_descriptors, hp_svs_enter_altstack_desc);
258 
259 /* SVS_ENTER_NMI. */
260 extern uint8_t svs_enter_nmi, svs_enter_nmi_end;
261 static const struct x86_hotpatch_source hp_svs_enter_nmi_source = {
262           .saddr = &svs_enter_nmi,
263           .eaddr = &svs_enter_nmi_end
264 };
265 static const struct x86_hotpatch_descriptor hp_svs_enter_nmi_desc = {
266           .name = HP_NAME_SVS_ENTER_NMI,
267           .nsrc = 1,
268           .srcs = { &hp_svs_enter_nmi_source }
269 };
270 __link_set_add_rodata(x86_hotpatch_descriptors, hp_svs_enter_nmi_desc);
271 
272 /* SVS_LEAVE. */
273 extern uint8_t svs_leave, svs_leave_end;
274 static const struct x86_hotpatch_source hp_svs_leave_source = {
275           .saddr = &svs_leave,
276           .eaddr = &svs_leave_end
277 };
278 static const struct x86_hotpatch_descriptor hp_svs_leave_desc = {
279           .name = HP_NAME_SVS_LEAVE,
280           .nsrc = 1,
281           .srcs = { &hp_svs_leave_source }
282 };
283 __link_set_add_rodata(x86_hotpatch_descriptors, hp_svs_leave_desc);
284 
285 /* SVS_LEAVE_ALT. */
286 extern uint8_t svs_leave_altstack, svs_leave_altstack_end;
287 static const struct x86_hotpatch_source hp_svs_leave_altstack_source = {
288           .saddr = &svs_leave_altstack,
289           .eaddr = &svs_leave_altstack_end
290 };
291 static const struct x86_hotpatch_descriptor hp_svs_leave_altstack_desc = {
292           .name = HP_NAME_SVS_LEAVE_ALT,
293           .nsrc = 1,
294           .srcs = { &hp_svs_leave_altstack_source }
295 };
296 __link_set_add_rodata(x86_hotpatch_descriptors, hp_svs_leave_altstack_desc);
297 
298 /* SVS_LEAVE_NMI. */
299 extern uint8_t svs_leave_nmi, svs_leave_nmi_end;
300 static const struct x86_hotpatch_source hp_svs_leave_nmi_source = {
301           .saddr = &svs_leave_nmi,
302           .eaddr = &svs_leave_nmi_end
303 };
304 static const struct x86_hotpatch_descriptor hp_svs_leave_nmi_desc = {
305           .name = HP_NAME_SVS_LEAVE_NMI,
306           .nsrc = 1,
307           .srcs = { &hp_svs_leave_nmi_source }
308 };
309 __link_set_add_rodata(x86_hotpatch_descriptors, hp_svs_leave_nmi_desc);
310 
311 /* -------------------------------------------------------------------------- */
312 
313 bool svs_enabled __read_mostly = false;
314 bool svs_pcid __read_mostly = false;
315 
316 static uint64_t svs_pcid_kcr3 __read_mostly;
317 static uint64_t svs_pcid_ucr3 __read_mostly;
318 
319 struct svs_utls {
320           paddr_t kpdirpa;
321           uint64_t scratch;
322           vaddr_t rsp0;
323 };
324 
325 static pd_entry_t *
svs_tree_add(struct cpu_info * ci,vaddr_t va)326 svs_tree_add(struct cpu_info *ci, vaddr_t va)
327 {
328           extern const vaddr_t ptp_masks[];
329           extern const int ptp_shifts[];
330           pd_entry_t *dstpde;
331           struct vm_page *pg;
332           size_t i, pidx;
333           paddr_t pa;
334 
335           dstpde = ci->ci_svs_updir;
336 
337           for (i = PTP_LEVELS; i > 1; i--) {
338                     pidx = pl_pi(va, i);
339 
340                     if (!pmap_valid_entry(dstpde[pidx])) {
341                               pg = uvm_pagealloc(NULL, 0, NULL, UVM_PGA_ZERO);
342                               if (pg == 0)
343                                         panic("%s: failed to allocate PA for CPU %d\n",
344                                                   __func__, cpu_index(ci));
345                               pa = VM_PAGE_TO_PHYS(pg);
346 
347                               dstpde[pidx] = PTE_P | PTE_W | pa;
348                     }
349 
350                     pa = (paddr_t)(dstpde[pidx] & PTE_FRAME);
351                     dstpde = (pd_entry_t *)PMAP_DIRECT_MAP(pa);
352           }
353 
354           return dstpde;
355 }
356 
357 static void
svs_page_add(struct cpu_info * ci,vaddr_t va,bool global)358 svs_page_add(struct cpu_info *ci, vaddr_t va, bool global)
359 {
360           pd_entry_t *srcpde, *dstpde, pde;
361           size_t idx, pidx;
362           paddr_t pa;
363 
364           /* Create levels L4, L3 and L2. */
365           dstpde = svs_tree_add(ci, va);
366 
367           pidx = pl1_pi(va);
368 
369           /*
370            * If 'va' is in a large page, we need to compute its physical
371            * address manually.
372            */
373           idx = pl2_i(va);
374           srcpde = L2_BASE;
375           if (!pmap_valid_entry(srcpde[idx])) {
376                     panic("%s: L2 page not mapped", __func__);
377           }
378           if (srcpde[idx] & PTE_PS) {
379                     KASSERT(!global);
380                     pa = srcpde[idx] & PTE_2MFRAME;
381                     pa += (paddr_t)(va % NBPD_L2);
382                     pde = (srcpde[idx] & ~(PTE_PS|PTE_2MFRAME)) | pa;
383 
384                     if (pmap_valid_entry(dstpde[pidx])) {
385                               panic("%s: L1 page already mapped", __func__);
386                     }
387                     dstpde[pidx] = pde;
388                     return;
389           }
390 
391           /*
392            * Normal page, just copy the PDE.
393            */
394           idx = pl1_i(va);
395           srcpde = L1_BASE;
396           if (!pmap_valid_entry(srcpde[idx])) {
397                     panic("%s: L1 page not mapped", __func__);
398           }
399           if (pmap_valid_entry(dstpde[pidx])) {
400                     panic("%s: L1 page already mapped", __func__);
401           }
402           dstpde[pidx] = srcpde[idx];
403 
404           /*
405            * If we want a global translation, mark both the src and dst with
406            * PTE_G.
407            */
408           if (global) {
409                     srcpde[idx] |= PTE_G;
410                     dstpde[pidx] |= PTE_G;
411                     tlbflushg();
412           }
413 }
414 
415 static void
svs_rsp0_init(struct cpu_info * ci)416 svs_rsp0_init(struct cpu_info *ci)
417 {
418           const cpuid_t cid = cpu_index(ci);
419           vaddr_t va, rsp0;
420           pd_entry_t *pd;
421           size_t pidx;
422 
423           rsp0 = (vaddr_t)&pcpuarea->ent[cid].rsp0;
424 
425           /* The first page is a redzone. */
426           va = rsp0 + PAGE_SIZE;
427 
428           /* Create levels L4, L3 and L2. */
429           pd = svs_tree_add(ci, va);
430 
431           /* Get the info for L1. */
432           pidx = pl1_i(va % NBPD_L2);
433           if (pmap_valid_entry(pd[pidx])) {
434                     panic("%s: rsp0 page already mapped", __func__);
435           }
436 
437           ci->ci_svs_rsp0_pte = (pt_entry_t *)&pd[pidx];
438           ci->ci_svs_rsp0 = rsp0 + PAGE_SIZE + sizeof(struct trapframe);
439           ci->ci_svs_ursp0 = ci->ci_svs_rsp0 - sizeof(struct trapframe);
440           ci->ci_svs_krsp0 = 0;
441 }
442 
443 static void
svs_utls_init(struct cpu_info * ci)444 svs_utls_init(struct cpu_info *ci)
445 {
446           const vaddr_t utlsva = (vaddr_t)&pcpuarea->utls;
447           struct svs_utls *utls;
448           struct vm_page *pg;
449           pd_entry_t *pd;
450           size_t pidx;
451           paddr_t pa;
452           vaddr_t va;
453 
454           /* Create levels L4, L3 and L2 of the UTLS page. */
455           pd = svs_tree_add(ci, utlsva);
456 
457           /* Allocate L1. */
458           pg = uvm_pagealloc(NULL, 0, NULL, UVM_PGA_ZERO);
459           if (pg == 0)
460                     panic("%s: failed to allocate PA for CPU %d\n", __func__,
461                         cpu_index(ci));
462           pa = VM_PAGE_TO_PHYS(pg);
463 
464           /* Enter L1. */
465           if (pmap_valid_entry(L1_BASE[pl1_i(utlsva)])) {
466                     panic("%s: local page already mapped", __func__);
467           }
468           pidx = pl1_pi(utlsva);
469           if (pmap_valid_entry(pd[pidx])) {
470                     panic("%s: L1 page already mapped", __func__);
471           }
472           pd[pidx] = PTE_P | PTE_W | pmap_pg_nx | pa;
473 
474           /*
475            * Now, allocate a VA in the kernel map, that points to the UTLS
476            * page. After that, the UTLS page will be accessible in kernel
477            * mode via ci_svs_utls.
478            */
479           va = uvm_km_alloc(kernel_map, PAGE_SIZE, 0,
480               UVM_KMF_VAONLY|UVM_KMF_NOWAIT);
481           if (va == 0) {
482                     panic("%s: unable to allocate VA\n", __func__);
483           }
484           pmap_kenter_pa(va, pa, VM_PROT_READ|VM_PROT_WRITE, 0);
485           pmap_update(pmap_kernel());
486 
487           ci->ci_svs_utls = va;
488 
489           /* Initialize the constant fields of the UTLS page */
490           utls = (struct svs_utls *)ci->ci_svs_utls;
491           utls->rsp0 = ci->ci_svs_rsp0;
492 }
493 
494 static void
svs_pcid_init(struct cpu_info * ci)495 svs_pcid_init(struct cpu_info *ci)
496 {
497           if (!svs_pcid) {
498                     return;
499           }
500 
501           svs_pcid_ucr3 = __SHIFTIN(PMAP_PCID_USER, CR3_PCID) | CR3_NO_TLB_FLUSH;
502           svs_pcid_kcr3 = __SHIFTIN(PMAP_PCID_KERN, CR3_PCID) | CR3_NO_TLB_FLUSH;
503 
504           ci->ci_svs_updirpa |= svs_pcid_ucr3;
505 }
506 
507 static void
svs_range_add(struct cpu_info * ci,vaddr_t va,size_t size,bool global)508 svs_range_add(struct cpu_info *ci, vaddr_t va, size_t size, bool global)
509 {
510           size_t i, n;
511 
512           KASSERT(size % PAGE_SIZE == 0);
513           n = size / PAGE_SIZE;
514           for (i = 0; i < n; i++) {
515                     svs_page_add(ci, va + i * PAGE_SIZE, global);
516           }
517 }
518 
519 void
cpu_svs_init(struct cpu_info * ci)520 cpu_svs_init(struct cpu_info *ci)
521 {
522           extern char __text_user_start;
523           extern char __text_user_end;
524           extern vaddr_t idt_vaddr;
525           const cpuid_t cid = cpu_index(ci);
526           struct vm_page *pg;
527 
528           KASSERT(ci != NULL);
529 
530           pg = uvm_pagealloc(NULL, 0, NULL, UVM_PGA_ZERO);
531           if (pg == 0)
532                     panic("%s: failed to allocate L4 PA for CPU %d\n",
533                               __func__, cpu_index(ci));
534           ci->ci_svs_updirpa = VM_PAGE_TO_PHYS(pg);
535 
536           ci->ci_svs_updir = (pt_entry_t *)uvm_km_alloc(kernel_map, PAGE_SIZE, 0,
537                     UVM_KMF_VAONLY | UVM_KMF_NOWAIT);
538           if (ci->ci_svs_updir == NULL)
539                     panic("%s: failed to allocate L4 VA for CPU %d\n",
540                               __func__, cpu_index(ci));
541 
542           pmap_kenter_pa((vaddr_t)ci->ci_svs_updir, ci->ci_svs_updirpa,
543                     VM_PROT_READ | VM_PROT_WRITE, 0);
544 
545           pmap_update(pmap_kernel());
546 
547           mutex_init(&ci->ci_svs_mtx, MUTEX_DEFAULT, IPL_VM);
548 
549           if (cid == cpu_index(&cpu_info_primary) || !idt_vec_is_pcpu())
550                     svs_page_add(ci, idt_vaddr, true);
551           svs_page_add(ci, (vaddr_t)&pcpuarea->ldt, true);
552           svs_range_add(ci, (vaddr_t)&pcpuarea->ent[cid],
553               offsetof(struct pcpu_entry, rsp0), true);
554           svs_range_add(ci, (vaddr_t)&__text_user_start,
555               (vaddr_t)&__text_user_end - (vaddr_t)&__text_user_start, false);
556 
557           svs_rsp0_init(ci);
558           svs_utls_init(ci);
559           svs_pcid_init(ci);
560 
561 #ifdef USER_LDT
562           mutex_enter(&cpu_lock);
563           ci->ci_svs_ldt_sel = ldt_alloc(&pcpuarea->ent[cid].ldt,
564               MAX_USERLDT_SIZE);
565           mutex_exit(&cpu_lock);
566 #endif
567 }
568 
569 void
svs_pmap_sync(struct pmap * pmap,int index)570 svs_pmap_sync(struct pmap *pmap, int index)
571 {
572           CPU_INFO_ITERATOR cii;
573           struct cpu_info *ci;
574           cpuid_t cid;
575 
576           KASSERT(pmap != NULL);
577           KASSERT(pmap != pmap_kernel());
578           KASSERT(pmap_is_user(pmap));
579           KASSERT(mutex_owned(&pmap->pm_lock));
580           KASSERT(kpreempt_disabled());
581           KASSERT(index < PDIR_SLOT_USERLIM);
582 
583           ci = curcpu();
584           cid = cpu_index(ci);
585 
586           mutex_enter(&ci->ci_svs_mtx);
587           KASSERT(kcpuset_isset(pmap->pm_kernel_cpus, cid));
588           ci->ci_svs_updir[index] = pmap->pm_pdir[index];
589           mutex_exit(&ci->ci_svs_mtx);
590 
591           if (!kcpuset_isotherset(pmap->pm_kernel_cpus, cid)) {
592                     return;
593           }
594 
595           for (CPU_INFO_FOREACH(cii, ci)) {
596                     cid = cpu_index(ci);
597 
598                     if (!kcpuset_isset(pmap->pm_kernel_cpus, cid)) {
599                               continue;
600                     }
601 
602                     /* take the lock and check again */
603                     mutex_enter(&ci->ci_svs_mtx);
604                     if (kcpuset_isset(pmap->pm_kernel_cpus, cid)) {
605                               ci->ci_svs_updir[index] = pmap->pm_pdir[index];
606                     }
607                     mutex_exit(&ci->ci_svs_mtx);
608           }
609 }
610 
611 void
svs_ldt_sync(struct pmap * pmap)612 svs_ldt_sync(struct pmap *pmap)
613 {
614           struct cpu_info *ci = curcpu();
615           void *ldt;
616           int sel;
617 
618           KASSERT(kpreempt_disabled());
619 
620           /*
621            * Another LWP could concurrently modify the LDT via x86_set_ldt1().
622            * The LWP will wait for pmap_ldt_sync() to finish before destroying
623            * the outdated LDT.
624            *
625            * We have preemption disabled here, so it is guaranteed that even
626            * if the LDT we are syncing is the outdated one, it is still valid.
627            *
628            * pmap_ldt_sync() will execute later once we have preemption enabled,
629            * and will install the new LDT.
630            */
631           sel = atomic_load_relaxed(&pmap->pm_ldt_sel);
632           if (__predict_false(sel != GSYSSEL(GLDT_SEL, SEL_KPL))) {
633                     ldt = atomic_load_relaxed(&pmap->pm_ldt);
634                     memcpy(&pcpuarea->ent[cpu_index(ci)].ldt, ldt,
635                         MAX_USERLDT_SIZE);
636                     sel = ci->ci_svs_ldt_sel;
637           }
638 
639           lldt(sel);
640 }
641 
642 void
svs_lwp_switch(struct lwp * oldlwp,struct lwp * newlwp)643 svs_lwp_switch(struct lwp *oldlwp, struct lwp *newlwp)
644 {
645           struct cpu_info *ci = curcpu();
646           struct svs_utls *utls;
647           struct pcb *pcb;
648           pt_entry_t *pte;
649           uintptr_t rsp0;
650           vaddr_t va;
651 
652           if (newlwp->l_flag & LW_SYSTEM) {
653                     return;
654           }
655 
656 #ifdef DIAGNOSTIC
657           if (!(oldlwp->l_flag & LW_SYSTEM)) {
658                     pcb = lwp_getpcb(oldlwp);
659                     rsp0 = pcb->pcb_rsp0;
660                     va = rounddown(rsp0, PAGE_SIZE);
661                     KASSERT(ci->ci_svs_krsp0 == rsp0 - sizeof(struct trapframe));
662                     pte = ci->ci_svs_rsp0_pte;
663                     KASSERT(*pte == L1_BASE[pl1_i(va)]);
664           }
665 #endif
666 
667           pcb = lwp_getpcb(newlwp);
668           rsp0 = pcb->pcb_rsp0;
669           va = rounddown(rsp0, PAGE_SIZE);
670 
671           /* Update the kernel rsp0 in cpu_info */
672           ci->ci_svs_krsp0 = rsp0 - sizeof(struct trapframe);
673           KASSERT((ci->ci_svs_krsp0 % PAGE_SIZE) ==
674               (ci->ci_svs_ursp0 % PAGE_SIZE));
675 
676           utls = (struct svs_utls *)ci->ci_svs_utls;
677           utls->scratch = 0;
678 
679           /*
680            * Enter the user rsp0. If we're using PCID we must flush the user VA,
681            * if we aren't it will be flushed during the next CR3 reload.
682            */
683           pte = ci->ci_svs_rsp0_pte;
684           *pte = L1_BASE[pl1_i(va)];
685           if (svs_pcid) {
686                     invpcid(INVPCID_ADDRESS, PMAP_PCID_USER, ci->ci_svs_rsp0);
687           }
688 }
689 
690 /*
691  * We may come here with the pmap unlocked.  If a remote CPU is updating
692  * them at the same time, it's not a problem: the remote CPU will call
693  * svs_pmap_sync afterwards, and our updirpa will be synchronized properly.
694  */
695 void
svs_pdir_switch(struct pmap * pmap)696 svs_pdir_switch(struct pmap *pmap)
697 {
698           struct cpu_info *ci = curcpu();
699           struct svs_utls *utls;
700 
701           KASSERT(kpreempt_disabled());
702           KASSERT(pmap != pmap_kernel());
703           KASSERT(pmap_is_user(pmap));
704 
705           /* Update the info in the UTLS page */
706           utls = (struct svs_utls *)ci->ci_svs_utls;
707           utls->kpdirpa = pmap_pdirpa(pmap, 0) | svs_pcid_kcr3;
708 
709           /* Copy user slots. */
710           mutex_enter(&ci->ci_svs_mtx);
711           svs_quad_copy(ci->ci_svs_updir, pmap->pm_pdir, PDIR_SLOT_USERLIM);
712           mutex_exit(&ci->ci_svs_mtx);
713 
714           if (svs_pcid) {
715                     invpcid(INVPCID_CONTEXT, PMAP_PCID_USER, 0);
716           }
717 }
718 
719 static void
svs_enable(void)720 svs_enable(void)
721 {
722           svs_enabled = true;
723 
724           x86_hotpatch(HP_NAME_SVS_ENTER, 0);
725           x86_hotpatch(HP_NAME_SVS_ENTER_ALT, 0);
726           x86_hotpatch(HP_NAME_SVS_ENTER_NMI, 0);
727 
728           x86_hotpatch(HP_NAME_SVS_LEAVE, 0);
729           x86_hotpatch(HP_NAME_SVS_LEAVE_ALT, 0);
730           x86_hotpatch(HP_NAME_SVS_LEAVE_NMI, 0);
731 }
732 
733 void
svs_init(void)734 svs_init(void)
735 {
736           uint64_t msr;
737 
738           if (cpu_vendor != CPUVENDOR_INTEL) {
739                     return;
740           }
741           if (boothowto & RB_MD3) {
742                     return;
743           }
744           if (cpu_info_primary.ci_feat_val[7] & CPUID_SEF_ARCH_CAP) {
745                     msr = rdmsr(MSR_IA32_ARCH_CAPABILITIES);
746                     if (msr & IA32_ARCH_RDCL_NO) {
747                               /*
748                                * The processor indicates it is not vulnerable to the
749                                * Rogue Data Cache Load (Meltdown) flaw.
750                                */
751                               return;
752                     }
753           }
754 
755           if ((cpu_info_primary.ci_feat_val[1] & CPUID2_PCID) &&
756               (cpu_info_primary.ci_feat_val[5] & CPUID_SEF_INVPCID)) {
757                     svs_pcid = true;
758                     lcr4(rcr4() | CR4_PCIDE);
759           }
760 
761           svs_enable();
762 }
763