1 /*        $NetBSD: fpu.c,v 1.91 2025/04/28 13:01:27 riastradh Exp $   */
2 
3 /*
4  * Copyright (c) 2008, 2019 The NetBSD Foundation, Inc.  All
5  * rights reserved.
6  *
7  * This code is derived from software developed for The NetBSD Foundation
8  * by Andrew Doran and Maxime Villard.
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  *
19  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
20  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
21  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
22  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
23  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29  * POSSIBILITY OF SUCH DAMAGE.
30  */
31 
32 /*
33  * Copyright (c) 1991 The Regents of the University of California.
34  * All rights reserved.
35  *
36  * Redistribution and use in source and binary forms, with or without
37  * modification, are permitted provided that the following conditions
38  * are met:
39  * 1. Redistributions of source code must retain the above copyright
40  *    notice, this list of conditions and the following disclaimer.
41  * 2. Redistributions in binary form must reproduce the above copyright
42  *    notice, this list of conditions and the following disclaimer in the
43  *    documentation and/or other materials provided with the distribution.
44  * 3. Neither the name of the University nor the names of its contributors
45  *    may be used to endorse or promote products derived from this software
46  *    without specific prior written permission.
47  *
48  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
49  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
50  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
51  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
52  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
53  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
54  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
55  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
56  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
57  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
58  * SUCH DAMAGE.
59  *
60  *        @(#)npx.c 7.2 (Berkeley) 5/12/91
61  */
62 
63 /*
64  * Copyright (c) 1994, 1995, 1998 Charles M. Hannum.  All rights reserved.
65  * Copyright (c) 1990 William Jolitz.
66  *
67  * Redistribution and use in source and binary forms, with or without
68  * modification, are permitted provided that the following conditions
69  * are met:
70  * 1. Redistributions of source code must retain the above copyright
71  *    notice, this list of conditions and the following disclaimer.
72  * 2. Redistributions in binary form must reproduce the above copyright
73  *    notice, this list of conditions and the following disclaimer in the
74  *    documentation and/or other materials provided with the distribution.
75  * 3. All advertising materials mentioning features or use of this software
76  *    must display the following acknowledgement:
77  *        This product includes software developed by the University of
78  *        California, Berkeley and its contributors.
79  * 4. Neither the name of the University nor the names of its contributors
80  *    may be used to endorse or promote products derived from this software
81  *    without specific prior written permission.
82  *
83  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
84  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
85  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
86  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
87  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
88  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
89  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
90  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
91  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
92  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
93  * SUCH DAMAGE.
94  *
95  *        @(#)npx.c 7.2 (Berkeley) 5/12/91
96  */
97 
98 #include <sys/cdefs.h>
99 __KERNEL_RCSID(0, "$NetBSD: fpu.c,v 1.91 2025/04/28 13:01:27 riastradh Exp $");
100 
101 #include "opt_ddb.h"
102 #include "opt_multiprocessor.h"
103 
104 #include <sys/param.h>
105 #include <sys/systm.h>
106 #include <sys/conf.h>
107 #include <sys/cpu.h>
108 #include <sys/file.h>
109 #include <sys/proc.h>
110 #include <sys/kernel.h>
111 #include <sys/sysctl.h>
112 #include <sys/xcall.h>
113 
114 #include <machine/cpu.h>
115 #include <machine/cpuvar.h>
116 #include <machine/cputypes.h>
117 #include <machine/intr.h>
118 #include <machine/cpufunc.h>
119 #include <machine/pcb.h>
120 #include <machine/trap.h>
121 #include <machine/specialreg.h>
122 #include <x86/cpu.h>
123 #include <x86/fpu.h>
124 
125 #ifdef DDB
126 #include <ddb/ddb.h>
127 #endif
128 
#if defined(XENPV)
/*
 * On XENPV builds clts() and stts() are implemented by calling
 * HYPERVISOR_fpu_taskswitch() with argument 0 and 1 respectively.
 */
#define	stts()	HYPERVISOR_fpu_taskswitch(1)
#define	clts()	HYPERVISOR_fpu_taskswitch(0)
#endif
133 
134 void fpu_handle_deferred(void);
135 void fpu_switch(struct lwp *, struct lwp *);
136 
137 uint32_t x86_fpu_mxcsr_mask __read_mostly = 0;
138 
139 static const union savefpu safe_fpu_storage __aligned(64) = {
140           .sv_xmm = {
141                     .fx_mxcsr = __SAFE_MXCSR__,
142           },
143 };
144 static const union savefpu zero_fpu_storage __aligned(64);
145 
146 static const void *safe_fpu __read_mostly = &safe_fpu_storage;
147 static const void *zero_fpu __read_mostly = &zero_fpu_storage;
148 
149 /*
150  * x86_fpu_save_separate_p()
151  *
152  *        True if we allocate the FPU save space separately, outside the
153  *        struct pcb itself, because it doesn't fit in a single page.
154  */
155 bool
x86_fpu_save_separate_p(void)156 x86_fpu_save_separate_p(void)
157 {
158 
159           return x86_fpu_save_size >
160               PAGE_SIZE - offsetof(struct pcb, pcb_savefpusmall);
161 }
162 
163 static inline union savefpu *
fpu_lwp_area(struct lwp * l)164 fpu_lwp_area(struct lwp *l)
165 {
166           struct pcb *pcb = lwp_getpcb(l);
167           union savefpu *area = pcb->pcb_savefpu;
168 
169           KASSERT((l->l_flag & LW_SYSTEM) == 0);
170           if (l == curlwp) {
171                     fpu_save();
172           }
173           KASSERT(!(l->l_md.md_flags & MDL_FPU_IN_CPU));
174 
175           return area;
176 }
177 
178 static inline void
fpu_save_lwp(struct lwp * l)179 fpu_save_lwp(struct lwp *l)
180 {
181           struct pcb *pcb = lwp_getpcb(l);
182           union savefpu *area = pcb->pcb_savefpu;
183           int s;
184 
185           s = splvm();
186           if (l->l_md.md_flags & MDL_FPU_IN_CPU) {
187                     KASSERT((l->l_flag & LW_SYSTEM) == 0);
188                     fpu_area_save(area, x86_xsave_features, !(l->l_proc->p_flag & PK_32));
189                     l->l_md.md_flags &= ~MDL_FPU_IN_CPU;
190           }
191           splx(s);
192 }
193 
194 /*
195  * Bring curlwp's FPU state in memory. It will get installed back in the CPU
196  * when returning to userland.
197  */
198 void
fpu_save(void)199 fpu_save(void)
200 {
201           fpu_save_lwp(curlwp);
202 }
203 
/*
 * Reset the FPU with fninit, bracketed by clts()/stts().  The ci
 * argument is not used.
 */
void
fpuinit(struct cpu_info *ci)
{
	/*
	 * Possibly redundant, because the FPU is initialized for each
	 * process anyway -- but it is harmless.
	 */
	clts();
	fninit();
	stts();
}
215 
216 /*
217  * fpuinit_mxcsr_mask()
218  *
219  *        Called once by cpu_init on the primary CPU.  Initializes
220  *        x86_fpu_mxcsr_mask based on the initial FPU state, and
221  *        initializes save_fpu and zero_fpu if necessary when the
222  *        hardware's FPU save size is larger than union savefpu.
223  *
224  *        XXX Rename this function!
225  */
226 void
fpuinit_mxcsr_mask(void)227 fpuinit_mxcsr_mask(void)
228 {
229           /*
230            * If the CPU's x86 fpu save size is larger than union savefpu,
231            * we have to allocate larger buffers for the safe and zero FPU
232            * states used here and by fpu_kern_enter/leave.
233            *
234            * Note: This is NOT the same as x86_fpu_save_separate_p(),
235            * which may have a little more space than union savefpu.
236            */
237           const bool allocfpusave = x86_fpu_save_size > sizeof(union savefpu);
238           vaddr_t va;
239 
240 #if defined XENPV
241           if (x86_fpu_save_separate_p()) {
242                     /*
243                      * XXX Temporary workaround for PR kern/59371 until we
244                      * work out the implications.
245                      */
246                     panic("NetBSD/xen does not support fpu save size %u",
247                         x86_fpu_save_size);
248           }
249 #elif defined __i386__
250           if (x86_fpu_save_separate_p()) {
251                     /*
252                      * XXX Need to teach cpu_uarea_alloc/free to allocate a
253                      * separate fpu save space, and make pcb_savefpu a
254                      * pointer indirection -- currently only done on amd64,
255                      * not on i386.
256                      *
257                      * But the primary motivation on amd64 is the 8192-byte
258                      * TILEDATA state for Intel AMX (Advanced Matrix
259                      * Extensions), which doesn't work in 32-bit mode
260                      * anyway, so on such machines we ought to just disable
261                      * it in the first place and keep x86_fpu_save_size
262                      * down:
263                      *
264                      *        While Intel AMX instructions can be executed
265                      *        only in 64-bit mode, instructions of the XSAVE
266                      *        feature set can operate on TILECFG and TILEDATA
267                      *        in any mode.  It is recommended that only
268                      *        64-bit operating systems enable Intel AMX by
269                      *        setting XCR0[18:17].
270                      *
271                      *        --Intel 64 and IA-32 Architectures Software
272                      *        Developer's Manual, Volume 1: Basic
273                      *        Architecture, Order Number: 253665-087US, March
274                      *        2025, Sec. 13.3 `Enabling the XSAVE feature set
275                      *        and XSAVE-enabled features', p. 13-6.
276                      *        https://cdrdv2.intel.com/v1/dl/getContent/671436
277                      *        https://web.archive.org/web/20250404141850/https://cdrdv2-public.intel.com/851056/253665-087-sdm-vol-1.pdf
278                      *        https://web.archive.org/web/20250404141850if_/https://cdrdv2-public.intel.com/851056/253665-087-sdm-vol-1.pdf#page=324
279                      */
280                     panic("NetBSD/i386 does not support fpu save size %u",
281                         x86_fpu_save_size);
282           }
283 #endif
284 
285 #ifndef XENPV
286           union savefpu fpusave_stack __aligned(64);
287           union savefpu *fpusave;
288           u_long psl;
289 
290           /*
291            * Allocate a temporary save space from the stack if it fits,
292            * or from the heap otherwise, so we can query its mxcsr mask.
293            */
294           if (allocfpusave) {
295                     /*
296                      * Need 64-byte alignment for XSAVE instructions.
297                      * kmem_* doesn't guarantee that and we don't have a
298                      * handy posix_memalign in the kernel unless we hack it
299                      * ourselves with vmem(9), so just ask for page
300                      * alignment with uvm_km(9).
301                      */
302                     __CTASSERT(PAGE_SIZE >= 64);
303                     va = uvm_km_alloc(kernel_map, x86_fpu_save_size, PAGE_SIZE,
304                         UVM_KMF_WIRED|UVM_KMF_ZERO|UVM_KMF_WAITVA);
305                     fpusave = (void *)va;
306           } else {
307                     fpusave = &fpusave_stack;
308                     memset(fpusave, 0, sizeof(*fpusave));
309           }
310 
311           /* Disable interrupts, and enable FPU */
312           psl = x86_read_psl();
313           x86_disable_intr();
314           clts();
315 
316           /* Fill in the FPU area */
317           fxsave(fpusave);
318 
319           /* Restore previous state */
320           stts();
321           x86_write_psl(psl);
322 
323           if (fpusave->sv_xmm.fx_mxcsr_mask == 0) {
324                     x86_fpu_mxcsr_mask = __INITIAL_MXCSR_MASK__;
325           } else {
326                     x86_fpu_mxcsr_mask = fpusave->sv_xmm.fx_mxcsr_mask;
327           }
328 
329           /*
330            * Free the temporary save space.
331            */
332           if (allocfpusave) {
333                     uvm_km_free(kernel_map, va, x86_fpu_save_size, UVM_KMF_WIRED);
334                     fpusave = NULL;
335                     va = 0;
336           }
337 #else
338           /*
339            * XXX XXX XXX: On Xen the FXSAVE above faults. That's because
340            * &fpusave is not 16-byte aligned. Stack alignment problem
341            * somewhere, it seems.
342            */
343           x86_fpu_mxcsr_mask = __INITIAL_MXCSR_MASK__;
344 #endif
345 
346           /*
347            * If necessary, allocate FPU save spaces for safe or zero FPU
348            * state, for fpu_kern_enter/leave.
349            */
350           if (allocfpusave) {
351                     __CTASSERT(PAGE_SIZE >= 64);
352 
353                     va = uvm_km_alloc(kernel_map, x86_fpu_save_size, PAGE_SIZE,
354                         UVM_KMF_WIRED|UVM_KMF_ZERO|UVM_KMF_WAITVA);
355                     memcpy((void *)va, &safe_fpu_storage,
356                         sizeof(safe_fpu_storage));
357                     uvm_km_protect(kernel_map, va, x86_fpu_save_size,
358                         VM_PROT_READ);
359                     safe_fpu = (void *)va;
360 
361                     va = uvm_km_alloc(kernel_map, x86_fpu_save_size, PAGE_SIZE,
362                         UVM_KMF_WIRED|UVM_KMF_ZERO|UVM_KMF_WAITVA);
363                     /*
364                      * No initialization -- just want zeroes!  In fact we
365                      * could share this with other all-zero pages.
366                      */
367                     uvm_km_protect(kernel_map, va, x86_fpu_save_size,
368                         VM_PROT_READ);
369                     zero_fpu = (void *)va;
370           }
371 }
372 
/*
 * Workaround applied before fxrstor/xrstor on AMD CPUs.
 */
static inline void
fpu_errata_amd(void)
{
	uint16_t fsw;

	/*
	 * On AMD FPUs, fxrstor and xrstor leave FIP, FDP, and FOP
	 * unrestored when FSW.ES=0, which leaks other threads'
	 * execution history.
	 *
	 * We clear them by hand by loading a zero (fldummy), and we do
	 * so unconditionally, regardless of FSW.ES.
	 *
	 * The load would fault if the ES bit in the x87 status word
	 * were set, so clear that bit first when it is currently set.
	 *
	 * Newer generations of AMD CPUs have CPUID_Fn80000008_EBX[2],
	 * which indicates that FIP/FDP/FOP are restored (same behavior
	 * as Intel). We're not using it though.
	 */
	fnstsw(&fsw);
	if ((fsw & 0x80) != 0) {
		fnclex();
	}
	fldummy();
}
398 
/*
 * XS64(insn): name of the save/restore primitive to call.  On x86_64
 * this is insn##64 when the variable `is_64bit' in the caller's scope
 * is true, and plain insn otherwise; on other builds it is always
 * plain insn.
 */
#ifndef __x86_64__
#define	XS64(x)	x
#else
#define	XS64(x)	(is_64bit ? x##64 : x)
#endif
404 
405 void
fpu_area_save(void * area,uint64_t xsave_features,bool is_64bit)406 fpu_area_save(void *area, uint64_t xsave_features, bool is_64bit)
407 {
408           switch (x86_fpu_save) {
409           case FPU_SAVE_FSAVE:
410                     fnsave(area);
411                     break;
412           case FPU_SAVE_FXSAVE:
413                     XS64(fxsave)(area);
414                     break;
415           case FPU_SAVE_XSAVE:
416                     XS64(xsave)(area, xsave_features);
417                     break;
418           case FPU_SAVE_XSAVEOPT:
419                     XS64(xsaveopt)(area, xsave_features);
420                     break;
421           }
422 
423           stts();
424 }
425 
426 void
fpu_area_restore(const void * area,uint64_t xsave_features,bool is_64bit)427 fpu_area_restore(const void *area, uint64_t xsave_features, bool is_64bit)
428 {
429           clts();
430 
431           switch (x86_fpu_save) {
432           case FPU_SAVE_FSAVE:
433                     frstor(area);
434                     break;
435           case FPU_SAVE_FXSAVE:
436                     if (cpu_vendor == CPUVENDOR_AMD)
437                               fpu_errata_amd();
438                     XS64(fxrstor)(area);
439                     break;
440           case FPU_SAVE_XSAVE:
441           case FPU_SAVE_XSAVEOPT:
442                     if (cpu_vendor == CPUVENDOR_AMD)
443                               fpu_errata_amd();
444                     XS64(xrstor)(area, xsave_features);
445                     break;
446           }
447 }
448 
449 void
fpu_handle_deferred(void)450 fpu_handle_deferred(void)
451 {
452           struct pcb *pcb = lwp_getpcb(curlwp);
453           fpu_area_restore(pcb->pcb_savefpu, x86_xsave_features,
454               !(curlwp->l_proc->p_flag & PK_32));
455 }
456 
457 void
fpu_switch(struct lwp * oldlwp,struct lwp * newlwp)458 fpu_switch(struct lwp *oldlwp, struct lwp *newlwp)
459 {
460           struct cpu_info *ci __diagused = curcpu();
461           struct pcb *pcb;
462 
463           KASSERTMSG(ci->ci_ilevel >= IPL_SCHED, "cpu%d ilevel=%d",
464               cpu_index(ci), ci->ci_ilevel);
465 
466           if (oldlwp->l_md.md_flags & MDL_FPU_IN_CPU) {
467                     KASSERT(!(oldlwp->l_flag & LW_SYSTEM));
468                     pcb = lwp_getpcb(oldlwp);
469                     fpu_area_save(pcb->pcb_savefpu, x86_xsave_features,
470                         !(oldlwp->l_proc->p_flag & PK_32));
471                     oldlwp->l_md.md_flags &= ~MDL_FPU_IN_CPU;
472           }
473           KASSERT(!(newlwp->l_md.md_flags & MDL_FPU_IN_CPU));
474 }
475 
476 void
fpu_lwp_fork(struct lwp * l1,struct lwp * l2)477 fpu_lwp_fork(struct lwp *l1, struct lwp *l2)
478 {
479           struct pcb *pcb2 = lwp_getpcb(l2);
480           union savefpu *fpu_save;
481 
482           /* Kernel threads have no FPU. */
483           if (__predict_false(l2->l_flag & LW_SYSTEM)) {
484                     return;
485           }
486 
487           /* For init(8). */
488           if (__predict_false(l1->l_flag & LW_SYSTEM)) {
489                     memset(pcb2->pcb_savefpu, 0, x86_fpu_save_size);
490                     return;
491           }
492 
493           fpu_save = fpu_lwp_area(l1);
494           memcpy(pcb2->pcb_savefpu, fpu_save, x86_fpu_save_size);
495           l2->l_md.md_flags &= ~MDL_FPU_IN_CPU;
496 }
497 
498 void
fpu_lwp_abandon(struct lwp * l)499 fpu_lwp_abandon(struct lwp *l)
500 {
501           int s;
502 
503           KASSERT(l == curlwp);
504           s = splvm();
505           l->l_md.md_flags &= ~MDL_FPU_IN_CPU;
506           stts();
507           splx(s);
508 }
509 
510 /* -------------------------------------------------------------------------- */
511 
512 /*
513  * fpu_kern_enter()
514  *
515  *        Begin using the FPU.  Raises to splvm, disabling most
516  *        interrupts and rendering the thread non-preemptible; caller
517  *        should not use this for long periods of time, and must call
518  *        fpu_kern_leave() afterward.  Non-recursive -- you cannot call
519  *        fpu_kern_enter() again without calling fpu_kern_leave() first.
520  *
521  *        Must be used only at IPL_VM or below -- never in IPL_SCHED or
522  *        IPL_HIGH interrupt handlers.
523  */
524 void
fpu_kern_enter(void)525 fpu_kern_enter(void)
526 {
527           struct lwp *l = curlwp;
528           struct cpu_info *ci;
529           int s;
530 
531           s = splvm();
532 
533           ci = curcpu();
534 #if 0
535           /*
536            * Can't assert this because if the caller holds a spin lock at
537            * IPL_VM, and previously held and released a spin lock at
538            * higher IPL, the IPL remains raised above IPL_VM.
539            */
540           KASSERTMSG(ci->ci_ilevel <= IPL_VM || cold, "ilevel=%d",
541               ci->ci_ilevel);
542 #endif
543           KASSERT(ci->ci_kfpu_spl == -1);
544           ci->ci_kfpu_spl = s;
545 
546           /*
547            * If we are in a softint and have a pinned lwp, the fpu state is that
548            * of the pinned lwp, so save it there.
549            */
550           while ((l->l_pflag & LP_INTR) && (l->l_switchto != NULL))
551                     l = l->l_switchto;
552           fpu_save_lwp(l);
553 
554           /*
555            * Clear CR0_TS, which fpu_save_lwp set if it saved anything --
556            * otherwise the CPU will trap if we try to use the FPU under
557            * the false impression that there has been a task switch since
558            * the last FPU usage requiring that we save the FPU state.
559            */
560           clts();
561 
562           /*
563            * Zero the FPU registers and install safe control words.
564            */
565           fpu_area_restore(safe_fpu, x86_xsave_features, /*is_64bit*/false);
566 }
567 
568 /*
569  * fpu_kern_leave()
570  *
571  *        End using the FPU after fpu_kern_enter().
572  */
573 void
fpu_kern_leave(void)574 fpu_kern_leave(void)
575 {
576           struct cpu_info *ci = curcpu();
577           int s;
578 
579 #if 0
580           /*
581            * Can't assert this because if the caller holds a spin lock at
582            * IPL_VM, and previously held and released a spin lock at
583            * higher IPL, the IPL remains raised above IPL_VM.
584            */
585           KASSERT(ci->ci_ilevel == IPL_VM || cold);
586 #endif
587           KASSERT(ci->ci_kfpu_spl != -1);
588 
589           /*
590            * Zero the fpu registers; otherwise we might leak secrets
591            * through Spectre-class attacks to userland, even if there are
592            * no bugs in fpu state management.
593            */
594           fpu_area_restore(zero_fpu, x86_xsave_features, /*is_64bit*/false);
595 
596           /*
597            * Set CR0_TS again so that the kernel can't accidentally use
598            * the FPU.
599            */
600           stts();
601 
602           s = ci->ci_kfpu_spl;
603           ci->ci_kfpu_spl = -1;
604           splx(s);
605 }
606 
607 /* -------------------------------------------------------------------------- */
608 
609 /*
610  * The following table is used to ensure that the FPE_... value
611  * that is passed as a trapcode to the signal handler of the user
612  * process does not have more than one bit set.
613  *
614  * Multiple bits may be set if SSE simd instructions generate errors
615  * on more than one value or if the user process modifies the control
 * word while a status word bit is already set (which is a sign of bad
 * coding).
 * We have no choice but to narrow them down to one bit, since we must
 * not send a trapcode that is not exactly one of the FPE_ macros.
620  *
 * The mechanism has a static table with 128 entries.  Each combination
622  * of the 7 FPU status word exception bits directly translates to a
623  * position in this table, where a single FPE_... value is stored.
624  * This FPE_... value stored there is considered the "most important"
625  * of the exception bits and will be sent as the signal code.  The
626  * precedence of the bits is based upon Intel Document "Numerical
627  * Applications", Chapter "Special Computational Situations".
628  *
629  * The code to choose one of these values does these steps:
630  * 1) Throw away status word bits that cannot be masked.
631  * 2) Throw away the bits currently masked in the control word,
632  *    assuming the user isn't interested in them anymore.
633  * 3) Reinsert status word bit 7 (stack fault) if it is set, which
634  *    cannot be masked but must be preserved.
635  *    'Stack fault' is a sub-class of 'invalid operation'.
636  * 4) Use the remaining bits to point into the trapcode table.
637  *
638  * The 6 maskable bits in order of their preference, as stated in the
639  * above referenced Intel manual:
640  * 1  Invalid operation (FP_X_INV)
641  * 1a   Stack underflow
642  * 1b   Stack overflow
643  * 1c   Operand of unsupported format
644  * 1d   SNaN operand.
645  * 2  QNaN operand (not an exception, irrelevant here)
646  * 3  Any other invalid-operation not mentioned above or zero divide
647  *      (FP_X_INV, FP_X_DZ)
648  * 4  Denormal operand (FP_X_DNML)
649  * 5  Numeric over/underflow (FP_X_OFL, FP_X_UFL)
650  * 6  Inexact result (FP_X_IMP)
651  *
 * NB: the above seems to mix up the mxcsr error bits and the x87 ones.
653  * They are in the same order, but there is no EN_SW_STACK_FAULT in the mmx
654  * status.
655  *
656  * The table is nearly, but not quite, in bit order (ZERODIV and DENORM
657  * are swapped).
658  *
659  * This table assumes that any stack fault is cleared - so that an INVOP
660  * fault will only be reported as FLTSUB once.
661  * This might not happen if the mask is being changed.
662  */
663 #define FPE_xxx1(f) (f & EN_SW_INVOP \
664                     ? (f & EN_SW_STACK_FAULT ? FPE_FLTSUB : FPE_FLTINV) \
665           : f & EN_SW_ZERODIV ? FPE_FLTDIV \
666           : f & EN_SW_DENORM ? FPE_FLTUND \
667           : f & EN_SW_OVERFLOW ? FPE_FLTOVF \
668           : f & EN_SW_UNDERFLOW ? FPE_FLTUND \
669           : f & EN_SW_PRECLOSS ? FPE_FLTRES \
670           : f & EN_SW_STACK_FAULT ? FPE_FLTSUB : 0)
671 #define   FPE_xxx2(f)         FPE_xxx1(f),        FPE_xxx1((f + 1))
672 #define   FPE_xxx4(f)         FPE_xxx2(f),        FPE_xxx2((f + 2))
673 #define   FPE_xxx8(f)         FPE_xxx4(f),        FPE_xxx4((f + 4))
674 #define   FPE_xxx16(f)        FPE_xxx8(f),        FPE_xxx8((f + 8))
675 #define   FPE_xxx32(f)        FPE_xxx16(f),       FPE_xxx16((f + 16))
676 static const uint8_t fpetable[128] = {
677           FPE_xxx32(0), FPE_xxx32(32), FPE_xxx32(64), FPE_xxx32(96)
678 };
679 #undef FPE_xxx1
680 #undef FPE_xxx2
681 #undef FPE_xxx4
682 #undef FPE_xxx8
683 #undef FPE_xxx16
684 #undef FPE_xxx32
685 
686 /*
687  * This is a synchronous trap on either an x87 instruction (due to an unmasked
688  * error on the previous x87 instruction) or on an SSE/SSE2/etc instruction due
689  * to an error on the instruction itself.
690  *
691  * If trap actually generates a signal, then the fpu state is saved and then
692  * copied onto the lwp's user-stack, and then recovered from there when the
693  * signal returns.
694  *
695  * All this code needs to do is save the reason for the trap. For x87 traps the
696  * status word bits need clearing to stop the trap re-occurring. For SSE traps
697  * the mxcsr bits are 'sticky' and need clearing to not confuse a later trap.
698  *
699  * We come here with interrupts disabled.
700  */
701 void
fputrap(struct trapframe * frame)702 fputrap(struct trapframe *frame)
703 {
704           uint32_t statbits;
705           ksiginfo_t ksi;
706 
707           if (__predict_false(!USERMODE(frame->tf_cs))) {
708                     register_t ip = X86_TF_RIP(frame);
709                     char where[128];
710 
711 #ifdef DDB
712                     db_symstr(where, sizeof(where), (db_expr_t)ip, DB_STGY_PROC);
713 #else
714                     snprintf(where, sizeof(where), "%p", (void *)ip);
715 #endif
716                     panic("fpu trap from kernel at %s, trapframe %p\n", where,
717                         frame);
718           }
719 
720           KASSERT(curlwp->l_md.md_flags & MDL_FPU_IN_CPU);
721 
722           if (frame->tf_trapno == T_XMM) {
723                     uint32_t mxcsr;
724                     x86_stmxcsr(&mxcsr);
725                     statbits = mxcsr;
726                     /* Clear the sticky status bits */
727                     mxcsr &= ~0x3f;
728                     x86_ldmxcsr(&mxcsr);
729 
730                     /* Remove masked interrupts and non-status bits */
731                     statbits &= ~(statbits >> 7) & 0x3f;
732                     /* Mark this is an XMM status */
733                     statbits |= 0x10000;
734           } else {
735                     uint16_t cw, sw;
736                     /* Get current control and status words */
737                     fnstcw(&cw);
738                     fnstsw(&sw);
739                     /* Clear any pending exceptions from status word */
740                     fnclex();
741 
742                     /* Remove masked interrupts */
743                     statbits = sw & ~(cw & 0x3f);
744           }
745 
746           /* Doesn't matter now if we get pre-empted */
747           x86_enable_intr();
748 
749           KSI_INIT_TRAP(&ksi);
750           ksi.ksi_signo = SIGFPE;
751           ksi.ksi_addr = (void *)X86_TF_RIP(frame);
752           ksi.ksi_code = fpetable[statbits & 0x7f];
753           ksi.ksi_trap = statbits;
754           (*curlwp->l_proc->p_emul->e_trapsignal)(curlwp, &ksi);
755 }
756 
757 void
fpudna(struct trapframe * frame)758 fpudna(struct trapframe *frame)
759 {
760 #ifdef XENPV
761           /*
762            * Xen produes spurious fpudna traps, just do nothing.
763            */
764           if (USERMODE(frame->tf_cs)) {
765                     clts();
766                     return;
767           }
768 #endif
769           panic("fpudna from %s, ip %p, trapframe %p",
770               USERMODE(frame->tf_cs) ? "userland" : "kernel",
771               (void *)X86_TF_RIP(frame), frame);
772 }
773 
774 /* -------------------------------------------------------------------------- */
775 
776 static inline void
fpu_xstate_reload(union savefpu * fpu_save,uint64_t xstate)777 fpu_xstate_reload(union savefpu *fpu_save, uint64_t xstate)
778 {
779           /*
780            * Force a reload of the given xstate during the next XRSTOR.
781            */
782           if (x86_fpu_save >= FPU_SAVE_XSAVE) {
783                     fpu_save->sv_xsave_hdr.xsh_xstate_bv |= xstate;
784           }
785 }
786 
787 void
fpu_set_default_cw(struct lwp * l,unsigned int x87_cw)788 fpu_set_default_cw(struct lwp *l, unsigned int x87_cw)
789 {
790           union savefpu *fpu_save = fpu_lwp_area(l);
791           struct pcb *pcb = lwp_getpcb(l);
792 
793           if (i386_use_fxsave) {
794                     fpu_save->sv_xmm.fx_cw = x87_cw;
795                     if (x87_cw != __INITIAL_NPXCW__) {
796                               fpu_xstate_reload(fpu_save, XCR0_X87);
797                     }
798           } else {
799                     fpu_save->sv_87.s87_cw = x87_cw;
800           }
801           pcb->pcb_fpu_dflt_cw = x87_cw;
802 }
803 
804 void
fpu_clear(struct lwp * l,unsigned int x87_cw)805 fpu_clear(struct lwp *l, unsigned int x87_cw)
806 {
807           union savefpu *fpu_save;
808           struct pcb *pcb;
809 
810           KASSERT(l == curlwp);
811           fpu_save = fpu_lwp_area(l);
812 
813           switch (x86_fpu_save) {
814           case FPU_SAVE_FSAVE:
815                     memset(&fpu_save->sv_87, 0, x86_fpu_save_size);
816                     fpu_save->sv_87.s87_tw = 0xffff;
817                     fpu_save->sv_87.s87_cw = x87_cw;
818                     break;
819           case FPU_SAVE_FXSAVE:
820                     memset(&fpu_save->sv_xmm, 0, x86_fpu_save_size);
821                     fpu_save->sv_xmm.fx_mxcsr = __INITIAL_MXCSR__;
822                     fpu_save->sv_xmm.fx_mxcsr_mask = x86_fpu_mxcsr_mask;
823                     fpu_save->sv_xmm.fx_cw = x87_cw;
824                     break;
825           case FPU_SAVE_XSAVE:
826           case FPU_SAVE_XSAVEOPT:
827                     memset(&fpu_save->sv_xmm, 0, x86_fpu_save_size);
828                     fpu_save->sv_xmm.fx_mxcsr = __INITIAL_MXCSR__;
829                     fpu_save->sv_xmm.fx_mxcsr_mask = x86_fpu_mxcsr_mask;
830                     fpu_save->sv_xmm.fx_cw = x87_cw;
831                     if (__predict_false(x87_cw != __INITIAL_NPXCW__)) {
832                               fpu_xstate_reload(fpu_save, XCR0_X87);
833                     }
834                     break;
835           }
836 
837           pcb = lwp_getpcb(l);
838           pcb->pcb_fpu_dflt_cw = x87_cw;
839 }
840 
841 void
fpu_sigreset(struct lwp * l)842 fpu_sigreset(struct lwp *l)
843 {
844           union savefpu *fpu_save = fpu_lwp_area(l);
845           struct pcb *pcb = lwp_getpcb(l);
846 
847           /*
848            * For signal handlers the register values don't matter. Just reset
849            * a few fields.
850            */
851           if (i386_use_fxsave) {
852                     fpu_save->sv_xmm.fx_mxcsr = __INITIAL_MXCSR__;
853                     fpu_save->sv_xmm.fx_mxcsr_mask = x86_fpu_mxcsr_mask;
854                     fpu_save->sv_xmm.fx_tw = 0;
855                     fpu_save->sv_xmm.fx_cw = pcb->pcb_fpu_dflt_cw;
856           } else {
857                     fpu_save->sv_87.s87_tw = 0xffff;
858                     fpu_save->sv_87.s87_cw = pcb->pcb_fpu_dflt_cw;
859           }
860 }
861 
862 void
process_write_fpregs_xmm(struct lwp * l,const struct fxsave * fpregs)863 process_write_fpregs_xmm(struct lwp *l, const struct fxsave *fpregs)
864 {
865           union savefpu *fpu_save = fpu_lwp_area(l);
866 
867           if (i386_use_fxsave) {
868                     memcpy(&fpu_save->sv_xmm, fpregs, sizeof(fpu_save->sv_xmm));
869 
870                     /*
871                      * Invalid bits in mxcsr or mxcsr_mask will cause faults.
872                      */
873                     fpu_save->sv_xmm.fx_mxcsr_mask &= x86_fpu_mxcsr_mask;
874                     fpu_save->sv_xmm.fx_mxcsr &= fpu_save->sv_xmm.fx_mxcsr_mask;
875 
876                     fpu_xstate_reload(fpu_save, XCR0_X87 | XCR0_SSE);
877           } else {
878                     process_xmm_to_s87(fpregs, &fpu_save->sv_87);
879           }
880 }
881 
882 void
process_write_fpregs_s87(struct lwp * l,const struct save87 * fpregs)883 process_write_fpregs_s87(struct lwp *l, const struct save87 *fpregs)
884 {
885           union savefpu *fpu_save = fpu_lwp_area(l);
886 
887           if (i386_use_fxsave) {
888                     process_s87_to_xmm(fpregs, &fpu_save->sv_xmm);
889                     fpu_xstate_reload(fpu_save, XCR0_X87 | XCR0_SSE);
890           } else {
891                     memcpy(&fpu_save->sv_87, fpregs, sizeof(fpu_save->sv_87));
892           }
893 }
894 
895 void
process_read_fpregs_xmm(struct lwp * l,struct fxsave * fpregs)896 process_read_fpregs_xmm(struct lwp *l, struct fxsave *fpregs)
897 {
898           union savefpu *fpu_save = fpu_lwp_area(l);
899 
900           if (i386_use_fxsave) {
901                     memcpy(fpregs, &fpu_save->sv_xmm, sizeof(fpu_save->sv_xmm));
902           } else {
903                     memset(fpregs, 0, sizeof(*fpregs));
904                     process_s87_to_xmm(&fpu_save->sv_87, fpregs);
905           }
906 }
907 
908 void
process_read_fpregs_s87(struct lwp * l,struct save87 * fpregs)909 process_read_fpregs_s87(struct lwp *l, struct save87 *fpregs)
910 {
911           union savefpu *fpu_save = fpu_lwp_area(l);
912 
913           if (i386_use_fxsave) {
914                     memset(fpregs, 0, sizeof(*fpregs));
915                     process_xmm_to_s87(&fpu_save->sv_xmm, fpregs);
916           } else {
917                     memcpy(fpregs, &fpu_save->sv_87, sizeof(fpu_save->sv_87));
918           }
919 }
920 
921 int
process_read_xstate(struct lwp * l,struct xstate * xstate)922 process_read_xstate(struct lwp *l, struct xstate *xstate)
923 {
924           union savefpu *fpu_save = fpu_lwp_area(l);
925 
926           if (x86_fpu_save == FPU_SAVE_FSAVE) {
927                     /* Convert from legacy FSAVE format. */
928                     memset(&xstate->xs_fxsave, 0, sizeof(xstate->xs_fxsave));
929                     process_s87_to_xmm(&fpu_save->sv_87, &xstate->xs_fxsave);
930 
931                     /* We only got x87 data. */
932                     xstate->xs_rfbm = XCR0_X87;
933                     xstate->xs_xstate_bv = XCR0_X87;
934                     return 0;
935           }
936 
937           /* Copy the legacy area. */
938           memcpy(&xstate->xs_fxsave, fpu_save->sv_xsave_hdr.xsh_fxsave,
939               sizeof(xstate->xs_fxsave));
940 
941           if (x86_fpu_save == FPU_SAVE_FXSAVE) {
942                     /* FXSAVE means we've got x87 + SSE data. */
943                     xstate->xs_rfbm = XCR0_X87 | XCR0_SSE;
944                     xstate->xs_xstate_bv = XCR0_X87 | XCR0_SSE;
945                     return 0;
946           }
947 
948           /* Copy the bitmap indicating which states are available. */
949           xstate->xs_rfbm = x86_xsave_features & XCR0_FPU;
950           xstate->xs_xstate_bv = fpu_save->sv_xsave_hdr.xsh_xstate_bv;
951           KASSERT(!(xstate->xs_xstate_bv & ~xstate->xs_rfbm));
952 
953 #define COPY_COMPONENT(xcr0_val, xsave_val, field)                              \
954           if (xstate->xs_xstate_bv & xcr0_val) {                                \
955                     KASSERT(x86_xsave_offsets[xsave_val]                        \
956                         >= sizeof(struct xsave_header));                        \
957                     KASSERT(x86_xsave_sizes[xsave_val]                          \
958                         >= sizeof(xstate->field));                                        \
959                     memcpy(&xstate->field,                                                \
960                         (char*)fpu_save + x86_xsave_offsets[xsave_val],         \
961                         sizeof(xstate->field));                                 \
962           }
963 
964           COPY_COMPONENT(XCR0_YMM_Hi128, XSAVE_YMM_Hi128, xs_ymm_hi128);
965           COPY_COMPONENT(XCR0_Opmask, XSAVE_Opmask, xs_opmask);
966           COPY_COMPONENT(XCR0_ZMM_Hi256, XSAVE_ZMM_Hi256, xs_zmm_hi256);
967           COPY_COMPONENT(XCR0_Hi16_ZMM, XSAVE_Hi16_ZMM, xs_hi16_zmm);
968 
969 #undef COPY_COMPONENT
970 
971           return 0;
972 }
973 
974 int
process_verify_xstate(const struct xstate * xstate)975 process_verify_xstate(const struct xstate *xstate)
976 {
977           /* xstate_bv must be a subset of RFBM */
978           if (xstate->xs_xstate_bv & ~xstate->xs_rfbm)
979                     return EINVAL;
980 
981           switch (x86_fpu_save) {
982           case FPU_SAVE_FSAVE:
983                     if ((xstate->xs_rfbm & ~XCR0_X87))
984                               return EINVAL;
985                     break;
986           case FPU_SAVE_FXSAVE:
987                     if ((xstate->xs_rfbm & ~(XCR0_X87 | XCR0_SSE)))
988                               return EINVAL;
989                     break;
990           default:
991                     /* Verify whether no unsupported features are enabled */
992                     if ((xstate->xs_rfbm & ~(x86_xsave_features & XCR0_FPU)) != 0)
993                               return EINVAL;
994           }
995 
996           return 0;
997 }
998 
999 int
process_write_xstate(struct lwp * l,const struct xstate * xstate)1000 process_write_xstate(struct lwp *l, const struct xstate *xstate)
1001 {
1002           union savefpu *fpu_save = fpu_lwp_area(l);
1003 
1004           /* Convert data into legacy FSAVE format. */
1005           if (x86_fpu_save == FPU_SAVE_FSAVE) {
1006                     if (xstate->xs_xstate_bv & XCR0_X87)
1007                               process_xmm_to_s87(&xstate->xs_fxsave, &fpu_save->sv_87);
1008                     return 0;
1009           }
1010 
1011           /* If XSAVE is supported, make sure that xstate_bv is set correctly. */
1012           if (x86_fpu_save >= FPU_SAVE_XSAVE) {
1013                     /*
1014                      * Bit-wise "xstate->xs_rfbm ? xstate->xs_xstate_bv :
1015                      *           fpu_save->sv_xsave_hdr.xsh_xstate_bv"
1016                      */
1017                     fpu_save->sv_xsave_hdr.xsh_xstate_bv =
1018                         (fpu_save->sv_xsave_hdr.xsh_xstate_bv & ~xstate->xs_rfbm) |
1019                         xstate->xs_xstate_bv;
1020           }
1021 
1022           if (xstate->xs_xstate_bv & XCR0_X87) {
1023                     /*
1024                      * X87 state is split into two areas, interspersed with SSE
1025                      * data.
1026                      */
1027                     memcpy(&fpu_save->sv_xmm, &xstate->xs_fxsave, 24);
1028                     memcpy(fpu_save->sv_xmm.fx_87_ac, xstate->xs_fxsave.fx_87_ac,
1029                         sizeof(xstate->xs_fxsave.fx_87_ac));
1030           }
1031 
1032           /*
1033            * Copy MXCSR if either SSE or AVX state is requested, to match the
1034            * XSAVE behavior for those flags.
1035            */
1036           if (xstate->xs_xstate_bv & (XCR0_SSE|XCR0_YMM_Hi128)) {
1037                     /*
1038                      * Invalid bits in mxcsr or mxcsr_mask will cause faults.
1039                      */
1040                     fpu_save->sv_xmm.fx_mxcsr_mask = xstate->xs_fxsave.fx_mxcsr_mask
1041                         & x86_fpu_mxcsr_mask;
1042                     fpu_save->sv_xmm.fx_mxcsr = xstate->xs_fxsave.fx_mxcsr &
1043                         fpu_save->sv_xmm.fx_mxcsr_mask;
1044           }
1045 
1046           if (xstate->xs_xstate_bv & XCR0_SSE) {
1047                     memcpy(&fpu_save->sv_xsave_hdr.xsh_fxsave[160],
1048                         xstate->xs_fxsave.fx_xmm, sizeof(xstate->xs_fxsave.fx_xmm));
1049           }
1050 
1051 #define COPY_COMPONENT(xcr0_val, xsave_val, field)                              \
1052           if (xstate->xs_xstate_bv & xcr0_val) {                                \
1053                     KASSERT(x86_xsave_offsets[xsave_val]                        \
1054                         >= sizeof(struct xsave_header));                        \
1055                     KASSERT(x86_xsave_sizes[xsave_val]                          \
1056                         >= sizeof(xstate->field));                                        \
1057                     memcpy((char *)fpu_save + x86_xsave_offsets[xsave_val],     \
1058                         &xstate->field, sizeof(xstate->field));                 \
1059           }
1060 
1061           COPY_COMPONENT(XCR0_YMM_Hi128, XSAVE_YMM_Hi128, xs_ymm_hi128);
1062           COPY_COMPONENT(XCR0_Opmask, XSAVE_Opmask, xs_opmask);
1063           COPY_COMPONENT(XCR0_ZMM_Hi256, XSAVE_ZMM_Hi256, xs_zmm_hi256);
1064           COPY_COMPONENT(XCR0_Hi16_ZMM, XSAVE_Hi16_ZMM, xs_hi16_zmm);
1065 
1066 #undef COPY_COMPONENT
1067 
1068           return 0;
1069 }
1070