xref: /dragonfly/sys/platform/pc64/x86_64/trap.c (revision f89dddc72568ef3eb11aa11e247b37ce69e77bce)
1 /*-
2  * Copyright (c) 1990, 1993
3  *        The Regents of the University of California.  All rights reserved.
4  * Copyright (C) 1994, David Greenman
5  * Copyright (c) 2008-2018 The DragonFly Project.
6  * Copyright (c) 2008 Jordan Gordeev.
7  *
8  * This code is derived from software contributed to Berkeley by
9  * the University of Utah, and William Jolitz.
10  *
11  * Redistribution and use in source and binary forms, with or without
12  * modification, are permitted provided that the following conditions
13  * are met:
14  * 1. Redistributions of source code must retain the above copyright
15  *    notice, this list of conditions and the following disclaimer.
16  * 2. Redistributions in binary form must reproduce the above copyright
17  *    notice, this list of conditions and the following disclaimer in the
18  *    documentation and/or other materials provided with the distribution.
19  * 3. All advertising materials mentioning features or use of this software
20  *    must display the following acknowledgement:
21  *        This product includes software developed by the University of
22  *        California, Berkeley and its contributors.
23  * 4. Neither the name of the University nor the names of its contributors
24  *    may be used to endorse or promote products derived from this software
25  *    without specific prior written permission.
26  *
27  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
28  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
30  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
31  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
32  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
33  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
34  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
35  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
36  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
37  * SUCH DAMAGE.
38  *
39  * from: @(#)trap.c 7.4 (Berkeley) 5/13/91
40  * $FreeBSD: src/sys/i386/i386/trap.c,v 1.147.2.11 2003/02/27 19:09:59 luoqi Exp $
41  */
42 
43 /*
44  * x86_64 Trap and System call handling
45  */
46 
47 #include "use_isa.h"
48 
49 #include "opt_ddb.h"
50 #include "opt_ktrace.h"
51 
52 #include <machine/frame.h>
53 #include <sys/param.h>
54 #include <sys/systm.h>
55 #include <sys/kernel.h>
56 #include <sys/kerneldump.h>
57 #include <sys/proc.h>
58 #include <sys/pioctl.h>
59 #include <sys/types.h>
60 #include <sys/signal2.h>
61 #include <sys/syscall.h>
62 #include <sys/sysctl.h>
63 #include <sys/sysent.h>
64 #ifdef KTRACE
65 #include <sys/ktrace.h>
66 #endif
67 #include <sys/ktr.h>
68 #include <sys/sysmsg.h>
69 
70 #include <vm/pmap.h>
71 #include <vm/vm.h>
72 #include <vm/vm_extern.h>
73 #include <vm/vm_kern.h>
74 #include <vm/vm_param.h>
75 #include <machine/cpu.h>
76 #include <machine/pcb.h>
77 #include <machine/smp.h>
78 #include <machine/thread.h>
79 #include <machine/clock.h>
80 #include <machine/vmparam.h>
81 #include <machine/md_var.h>
82 #include <machine_base/isa/isa_intr.h>
83 #include <machine_base/apic/lapic.h>
84 
85 #include <ddb/ddb.h>
86 
87 #include <sys/thread2.h>
88 #include <sys/spinlock2.h>
89 
90 /*
91  * These %rip's are used to detect a historical CPU artifact on syscall or
92  * int $3 entry, if not shortcutted in exception.S via
93  * DIRECT_DISALLOW_SS_CPUBUG.
94  */
95 extern void Xbpt(void);
96 extern void Xfast_syscall(void);
97 #define IDTVEC(vec) X##vec
98 
99 extern void trap(struct trapframe *frame);
100 
101 static int trap_pfault(struct trapframe *, int);
102 static void trap_fatal(struct trapframe *, vm_offset_t);
103 void dblfault_handler(struct trapframe *frame);
104 
/*
 * Printable description for each trap type, indexed by trap number.
 * Trap numbers that are not in use map to an empty string.
 *
 * MAX_TRAP_MSG is the index of the last entry and has to be adjusted
 * whenever the table grows or shrinks.
 */
#define MAX_TRAP_MSG	30
static char *trap_msg[] = {
	/*  0 unused       */	"",
	/*  1 T_PRIVINFLT  */	"privileged instruction fault",
	/*  2 unused       */	"",
	/*  3 T_BPTFLT     */	"breakpoint instruction fault",
	/*  4 unused       */	"",
	/*  5 unused       */	"",
	/*  6 T_ARITHTRAP  */	"arithmetic trap",
	/*  7 T_ASTFLT     */	"system forced exception",
	/*  8 unused       */	"",
	/*  9 T_PROTFLT    */	"general protection fault",
	/* 10 T_TRCTRAP    */	"trace trap",
	/* 11 unused       */	"",
	/* 12 T_PAGEFLT    */	"page fault",
	/* 13 unused       */	"",
	/* 14 T_ALIGNFLT   */	"alignment fault",
	/* 15 unused       */	"",
	/* 16 unused       */	"",
	/* 17 unused       */	"",
	/* 18 T_DIVIDE     */	"integer divide fault",
	/* 19 T_NMI        */	"non-maskable interrupt trap",
	/* 20 T_OFLOW      */	"overflow trap",
	/* 21 T_BOUND      */	"FPU bounds check fault",
	/* 22 T_DNA        */	"FPU device not available",
	/* 23 T_DOUBLEFLT  */	"double fault",
	/* 24 T_FPOPFLT    */	"FPU operand fetch fault",
	/* 25 T_TSSFLT     */	"invalid TSS fault",
	/* 26 T_SEGNPFLT   */	"segment not present fault",
	/* 27 T_STKFLT     */	"stack fault",
	/* 28 T_MCHK       */	"machine check trap",
	/* 29 T_XMMFLT     */	"SIMD floating-point exception",
	/* 30 T_RESERVED   */	"reserved (unknown) fault",
};
139 
140 #ifdef DDB
141 static int ddb_on_nmi = 1;
142 SYSCTL_INT(_machdep, OID_AUTO, ddb_on_nmi, CTLFLAG_RW,
143           &ddb_on_nmi, 0, "Go to DDB on NMI");
144 static int ddb_on_seg_fault = 0;
145 SYSCTL_INT(_machdep, OID_AUTO, ddb_on_seg_fault, CTLFLAG_RW,
146           &ddb_on_seg_fault, 0, "Go to DDB on user seg-fault");
147 __read_mostly static int freeze_on_seg_fault = 0;
148 SYSCTL_INT(_machdep, OID_AUTO, freeze_on_seg_fault, CTLFLAG_RW,
149           &freeze_on_seg_fault, 0, "Go to DDB on user seg-fault");
150 #endif
151 static int panic_on_nmi = 1;
152 SYSCTL_INT(_machdep, OID_AUTO, panic_on_nmi, CTLFLAG_RW,
153           &panic_on_nmi, 0, "Panic on NMI");
154 
/*
 * Optional system call debugging.  When compiled in, it records the
 * worst-case system call overhead (inclusive of blocking); the numbers
 * may be inaccurate.  Off by default: uncomment the define to enable.
 */
/*#define SYSCALL_DEBUG*/
#ifdef SYSCALL_DEBUG

/* Second dimension of the timings[] table in struct syscallwc. */
#define SCWC_MAXT	30

/*
 * Worst-case bookkeeping, one instance per SysCallsWorstCase[] slot.
 * NOTE(review): field meanings below are inferred from names only; the
 * code that fills them in is not part of this view -- confirm.
 */
struct syscallwc {
	uint32_t idx;		/* presumably current slot in timings[][] */
	uint32_t dummy;		/* presumably padding for 64-bit alignment */
	uint64_t tot[SYS_MAXSYSCALL];
	uint64_t timings[SYS_MAXSYSCALL][SCWC_MAXT];
} __cachealign;

/* MAXCPU entries. */
struct syscallwc SysCallsWorstCase[MAXCPU];

#endif
174 
175 /*
176  * Passively intercepts the thread switch function to increase
177  * the thread priority from a user priority to a kernel priority, reducing
178  * syscall and trap overhead for the case where no switch occurs.
179  *
180  * Synchronizes td_ucred with p_ucred.  This is used by system calls,
181  * signal handling, faults, AST traps, and anything else that enters the
182  * kernel from userland and provides the kernel with a stable read-only
183  * copy of the process ucred.
184  *
185  * To avoid races with another thread updating p_ucred we obtain p_spin.
186  * The other thread doing the update will obtain both p_token and p_spin.
187  * In the case where the cached cred pointer matches, we will already have
188  * the ref and we don't have to do one blessed thing.
189  */
190 static __inline void
userenter(struct thread * curtd,struct proc * curp)191 userenter(struct thread *curtd, struct proc *curp)
192 {
193           struct ucred *ocred;
194           struct ucred *ncred;
195 
196           curtd->td_release = lwkt_passive_release;
197 
198           if (__predict_false(curtd->td_ucred != curp->p_ucred)) {
199                     spin_lock(&curp->p_spin);
200                     ncred = crhold(curp->p_ucred);
201                     spin_unlock(&curp->p_spin);
202                     ocred = curtd->td_ucred;
203                     curtd->td_ucred = ncred;
204                     if (ocred)
205                               crfree(ocred);
206           }
207 }
208 
209 /*
210  * Handle signals, upcalls, profiling, and other AST's and/or tasks that
211  * must be completed before we can return to or try to return to userland.
212  *
213  * Note that td_sticks is a 64 bit quantity, but there's no point doing 64
214  * arithmatic on the delta calculation so the absolute tick values are
215  * truncated to an integer.
216  */
217 static void
userret(struct lwp * lp,struct trapframe * frame,int sticks)218 userret(struct lwp *lp, struct trapframe *frame, int sticks)
219 {
220           struct proc *p = lp->lwp_proc;
221           int sig;
222           int ptok;
223 
224           /*
225            * Charge system time if profiling.  Note: times are in microseconds.
226            * This may do a copyout and block, so do it first even though it
227            * means some system time will be charged as user time.
228            */
229           if (__predict_false(p->p_flags & P_PROFIL)) {
230                     addupc_task(p, frame->tf_rip,
231                               (u_int)((int)lp->lwp_thread->td_sticks - sticks));
232           }
233 
234 recheck:
235           /*
236            * Specific on-return-to-usermode checks (LWP_MP_WEXIT,
237            * LWP_MP_VNLRU, etc).
238            */
239           if (lp->lwp_mpflags & LWP_MP_URETMASK)
240                     lwpuserret(lp);
241 
242           /*
243            * Block here if we are in a stopped state.
244            */
245           if (__predict_false(STOPLWP(p, lp))) {
246                     lwkt_gettoken(&p->p_token);
247                     tstop();
248                     lwkt_reltoken(&p->p_token);
249                     goto recheck;
250           }
251           while (__predict_false(dump_stop_usertds)) {
252                     tsleep(&dump_stop_usertds, 0, "dumpstp", 0);
253           }
254 
255           /*
256            * Post any pending upcalls.  If running a virtual kernel be sure
257            * to restore the virtual kernel's vmspace before posting the upcall.
258            */
259           if (__predict_false(p->p_flags & (P_SIGVTALRM | P_SIGPROF))) {
260                     lwkt_gettoken(&p->p_token);
261                     if (p->p_flags & P_SIGVTALRM) {
262                               p->p_flags &= ~P_SIGVTALRM;
263                               ksignal(p, SIGVTALRM);
264                     }
265                     if (p->p_flags & P_SIGPROF) {
266                               p->p_flags &= ~P_SIGPROF;
267                               ksignal(p, SIGPROF);
268                     }
269                     lwkt_reltoken(&p->p_token);
270                     goto recheck;
271           }
272 
273           /*
274            * Post any pending signals.  If running a virtual kernel be sure
275            * to restore the virtual kernel's vmspace before posting the signal.
276            *
277            * WARNING!  postsig() can exit and not return.
278            */
279           if (__predict_false((sig = CURSIG_LCK_TRACE(lp, &ptok)) != 0)) {
280                     postsig(sig, ptok);
281                     goto recheck;
282           }
283 
284           /*
285            * In a multi-threaded program it is possible for a thread to change
286            * signal state during a system call which temporarily changes the
287            * signal mask.  In this case postsig() might not be run and we
288            * have to restore the mask ourselves.
289            */
290           if (__predict_false(lp->lwp_flags & LWP_OLDMASK)) {
291                     lp->lwp_flags &= ~LWP_OLDMASK;
292                     lp->lwp_sigmask = lp->lwp_oldsigmask;
293                     goto recheck;
294           }
295 }
296 
297 /*
298  * Cleanup from userenter and any passive release that might have occured.
299  * We must reclaim the current-process designation before we can return
300  * to usermode.  We also handle both LWKT and USER reschedule requests.
301  */
302 static __inline void
userexit(struct lwp * lp)303 userexit(struct lwp *lp)
304 {
305           struct thread *td = lp->lwp_thread;
306           /* globaldata_t gd = td->td_gd; */
307 
308           /*
309            * Handle stop requests at kernel priority.  Any requests queued
310            * after this loop will generate another AST.
311            */
312           while (__predict_false(STOPLWP(lp->lwp_proc, lp))) {
313                     lwkt_gettoken(&lp->lwp_proc->p_token);
314                     tstop();
315                     lwkt_reltoken(&lp->lwp_proc->p_token);
316           }
317 
318           /*
319            * Reduce our priority in preparation for a return to userland.  If
320            * our passive release function was still in place, our priority was
321            * never raised and does not need to be reduced.
322            */
323           lwkt_passive_recover(td);
324 
325           /* WARNING: we may have migrated cpu's */
326           /* gd = td->td_gd; */
327 
328           /*
329            * Become the current user scheduled process if we aren't already,
330            * and deal with reschedule requests and other factors.
331            *
332            * Do a silly hack to avoid RETPOLINE nonsense.
333            */
334           if (lp->lwp_proc->p_usched == &usched_dfly)
335                     dfly_acquire_curproc(lp);
336           else
337                     lp->lwp_proc->p_usched->acquire_curproc(lp);
338 }
339 
340 /*
341  * A page fault on a userspace address is classified as SMAP-induced
342  * if:
343  *        - SMAP is supported
344  *        - kernel mode accessed present data page
345  *        - rflags.AC was cleared
346  */
347 static int
trap_is_smap(struct trapframe * frame)348 trap_is_smap(struct trapframe *frame)
349 {
350         if ((cpu_stdext_feature & CPUID_STDEXT_SMAP) != 0 &&
351             (frame->tf_err & (PGEX_P | PGEX_U | PGEX_I | PGEX_RSV)) == PGEX_P &&
352               (frame->tf_rflags & PSL_AC) == 0) {
353                     return 1;
354           } else {
355                     return 0;
356           }
357 }
358 
359 #if !defined(KTR_KERNENTRY)
360 #define   KTR_KERNENTRY       KTR_ALL
361 #endif
362 KTR_INFO_MASTER(kernentry);
363 KTR_INFO(KTR_KERNENTRY, kernentry, trap, 0,
364            "TRAP(pid %d, tid %d, trapno %ld, eva %lu)",
365            pid_t pid, lwpid_t tid,  register_t trapno, vm_offset_t eva);
366 KTR_INFO(KTR_KERNENTRY, kernentry, trap_ret, 0, "TRAP_RET(pid %d, tid %d)",
367            pid_t pid, lwpid_t tid);
368 KTR_INFO(KTR_KERNENTRY, kernentry, syscall, 0, "SYSC(pid %d, tid %d, nr %ld)",
369            pid_t pid, lwpid_t tid,  register_t trapno);
370 KTR_INFO(KTR_KERNENTRY, kernentry, syscall_ret, 0, "SYSRET(pid %d, tid %d, err %d)",
371            pid_t pid, lwpid_t tid,  int err);
372 KTR_INFO(KTR_KERNENTRY, kernentry, fork_ret, 0, "FORKRET(pid %d, tid %d)",
373            pid_t pid, lwpid_t tid);
374 
375 /*
376  * Exception, fault, and trap interface to the kernel.
377  * This common code is called from assembly language IDT gate entry
378  * routines that prepare a suitable stack frame, and restore this
379  * frame after the exception has been processed.
380  *
381  * This function is also called from doreti in an interlock to handle ASTs.
382  * For example:  hardwareint->INTROUTINE->(set ast)->doreti->trap
383  *
384  * NOTE!  We have to retrieve the fault address prior to potentially
385  *          blocking, including blocking on any token.
386  *
387  * NOTE!  NMI and kernel DBG traps remain on their respective pcpu IST
388  *          stacks if taken from a kernel RPL. trap() cannot block in this
389  *          situation.  DDB entry or a direct report-and-return is ok.
390  *
391  * XXX gd_trap_nesting_level currently prevents lwkt_switch() from panicking
392  * if an attempt is made to switch from a fast interrupt or IPI.
393  */
394 void
trap(struct trapframe * frame)395 trap(struct trapframe *frame)
396 {
397           static struct krate sscpubugrate = { 1 };
398           struct globaldata *gd = mycpu;
399           struct thread *td = gd->gd_curthread;
400           struct lwp *lp = td->td_lwp;
401           struct proc *p;
402           int sticks = 0;
403           int i = 0, ucode = 0, type, code;
404 #ifdef INVARIANTS
405           int crit_count = td->td_critcount;
406           lwkt_tokref_t curstop = td->td_toks_stop;
407 #endif
408           vm_offset_t eva;
409 
410           p = td->td_proc;
411           clear_quickret();
412 
413 #ifdef DDB
414         /*
415            * We need to allow T_DNA faults when the debugger is active since
416            * some dumping paths do large bcopy() which use the floating
417            * point registers for faster copying.
418            */
419           if (db_active && frame->tf_trapno != T_DNA) {
420                     eva = (frame->tf_trapno == T_PAGEFLT ? frame->tf_addr : 0);
421                     ++gd->gd_trap_nesting_level;
422                     trap_fatal(frame, eva);
423                     --gd->gd_trap_nesting_level;
424                     goto out2;
425           }
426 #endif
427 
428           eva = 0;
429 
430           if ((frame->tf_rflags & PSL_I) == 0) {
431                     /*
432                      * Buggy application or kernel code has disabled interrupts
433                      * and then trapped.  Enabling interrupts now is wrong, but
434                      * it is better than running with interrupts disabled until
435                      * they are accidentally enabled later.
436                      */
437 
438                     type = frame->tf_trapno;
439                     if (ISPL(frame->tf_cs) == SEL_UPL) {
440                               /* JG curproc can be NULL */
441                               kprintf(
442                                   "pid %ld (%s): trap %d with interrupts disabled\n",
443                                   (long)curproc->p_pid, curproc->p_comm, type);
444                     } else if ((type == T_STKFLT || type == T_PROTFLT ||
445                                   type == T_SEGNPFLT) &&
446                                  frame->tf_rip == (long)doreti_iret) {
447                               /*
448                                * iretq fault from kernel mode during return to
449                                * userland.
450                                *
451                                * This situation is expected, don't complain.
452                                */
453                     } else if (type != T_NMI && type != T_BPTFLT &&
454                                  type != T_TRCTRAP) {
455                               /*
456                                * XXX not quite right, since this may be for a
457                                * multiple fault in user mode.
458                                */
459                               kprintf("kernel trap %d (%s @ 0x%016jx) with "
460                                         "interrupts disabled\n",
461                                         type,
462                                         td->td_comm,
463                                         frame->tf_rip);
464                     }
465                     cpu_enable_intr();
466           }
467 
468           type = frame->tf_trapno;
469           code = frame->tf_err;
470 
471           if (ISPL(frame->tf_cs) == SEL_UPL) {
472                     /* user trap */
473 
474                     KTR_LOG(kernentry_trap, p->p_pid, lp->lwp_tid,
475                               frame->tf_trapno, eva);
476 
477                     userenter(td, p);
478 
479                     sticks = (int)td->td_sticks;
480                     KASSERT(lp->lwp_md.md_regs == frame,
481                               ("Frame mismatch %p %p", lp->lwp_md.md_regs, frame));
482 
483                     switch (type) {
484                     case T_PRIVINFLT:   /* privileged instruction fault */
485                               i = SIGILL;
486                               ucode = ILL_PRVOPC;
487                               break;
488 
489                     case T_BPTFLT:                /* bpt instruction fault */
490                     case T_TRCTRAP:               /* trace trap */
491                               frame->tf_rflags &= ~PSL_T;
492                               i = SIGTRAP;
493                               ucode = (type == T_TRCTRAP ? TRAP_TRACE : TRAP_BRKPT);
494                               break;
495 
496                     case T_ARITHTRAP:   /* arithmetic trap */
497                               ucode = code;
498                               i = SIGFPE;
499                               break;
500 
501                     case T_ASTFLT:                /* Allow process switch */
502                               mycpu->gd_cnt.v_soft++;
503                               if (mycpu->gd_reqflags & RQF_AST_OWEUPC) {
504                                         atomic_clear_int(&mycpu->gd_reqflags,
505                                                              RQF_AST_OWEUPC);
506                                         addupc_task(p, p->p_prof.pr_addr,
507                                                       p->p_prof.pr_ticks);
508                               }
509                               goto out;
510 
511                     case T_PROTFLT:               /* general protection fault */
512                               i = SIGBUS;
513                               ucode = BUS_OBJERR;
514                               break;
515                     case T_STKFLT:                /* stack fault */
516                     case T_SEGNPFLT:    /* segment not present fault */
517                               i = SIGBUS;
518                               ucode = BUS_ADRERR;
519                               break;
520                     case T_TSSFLT:                /* invalid TSS fault */
521                     case T_DOUBLEFLT:   /* double fault */
522                     default:
523                               i = SIGBUS;
524                               ucode = BUS_OBJERR;
525                               break;
526 
527                     case T_PAGEFLT:               /* page fault */
528                               i = trap_pfault(frame, TRUE);
529 #ifdef DDB
530                               if (frame->tf_rip == 0) {
531                                         /* used for kernel debugging only */
532                                         while (freeze_on_seg_fault)
533                                                   tsleep(p, 0, "freeze", hz * 20);
534                               }
535 #endif
536                               if (i == -1 || i == 0)
537                                         goto out;
538                               if (i == SIGSEGV) {
539                                         ucode = SEGV_MAPERR;
540                               } else {
541                                         i = SIGSEGV;
542                                         ucode = SEGV_ACCERR;
543                               }
544                               break;
545 
546                     case T_DIVIDE:                /* integer divide fault */
547                               ucode = FPE_INTDIV;
548                               i = SIGFPE;
549                               break;
550 
551 #if NISA > 0
552                     case T_NMI:
553                               /* machine/parity/power fail/"kitchen sink" faults */
554                               if (isa_nmi(code) == 0) {
555 #ifdef DDB
556                                         /*
557                                          * NMI can be hooked up to a pushbutton
558                                          * for debugging.
559                                          */
560                                         if (ddb_on_nmi) {
561                                                   kprintf ("NMI ... going to debugger\n");
562                                                   kdb_trap(type, 0, frame);
563                                         }
564 #endif /* DDB */
565                                         goto out2;
566                               } else if (panic_on_nmi)
567                                         panic("NMI indicates hardware failure");
568                               break;
569 #endif /* NISA > 0 */
570 
571                     case T_OFLOW:                 /* integer overflow fault */
572                               ucode = FPE_INTOVF;
573                               i = SIGFPE;
574                               break;
575 
576                     case T_BOUND:                 /* bounds check fault */
577                               ucode = FPE_FLTSUB;
578                               i = SIGFPE;
579                               break;
580 
581                     case T_DNA:
582                               /*
583                                * Virtual kernel intercept - pass the DNA exception
584                                * to the virtual kernel if it asked to handle it.
585                                * This occurs when the virtual kernel is holding
586                                * onto the FP context for a different emulated
587                                * process then the one currently running.
588                                *
589                                * We must still call npxdna() since we may have
590                                * saved FP state that the virtual kernel needs
591                                * to hand over to a different emulated process.
592                                */
593                               if (lp->lwp_vkernel && lp->lwp_vkernel->ve &&
594                                   (td->td_pcb->pcb_flags & FP_VIRTFP)
595                               ) {
596                                         npxdna();
597                                         break;
598                               }
599 
600                               /*
601                                * The kernel may have switched out the FP unit's
602                                * state, causing the user process to take a fault
603                                * when it tries to use the FP unit.  Restore the
604                                * state here
605                                */
606                               if (npxdna()) {
607                                         gd->gd_cnt.v_trap++;
608                                         goto out;
609                               }
610                               i = SIGFPE;
611                               ucode = FPE_FPU_NP_TRAP;
612                               break;
613 
614                     case T_FPOPFLT:               /* FPU operand fetch fault */
615                               ucode = ILL_COPROC;
616                               i = SIGILL;
617                               break;
618 
619                     case T_XMMFLT:                /* SIMD floating-point exception */
620                               ucode = 0; /* XXX */
621                               i = SIGFPE;
622                               break;
623                     }
624           } else {
625                     /* kernel trap */
626 
627                     switch (type) {
628                     case T_PAGEFLT:                         /* page fault */
629                               trap_pfault(frame, FALSE);
630                               goto out2;
631 
632                     case T_DNA:
633                               /*
634                                * The kernel is apparently using fpu for copying.
635                                * XXX this should be fatal unless the kernel has
636                                * registered such use.
637                                */
638                               if (npxdna()) {
639                                         gd->gd_cnt.v_trap++;
640                                         goto out2;
641                               }
642                               break;
643 
644                     case T_STKFLT:                /* stack fault */
645                     case T_PROTFLT:               /* general protection fault */
646                     case T_SEGNPFLT:    /* segment not present fault */
647                               /*
648                                * Invalid segment selectors and out of bounds
649                                * %rip's and %rsp's can be set up in user mode.
650                                * This causes a fault in kernel mode when the
651                                * kernel tries to return to user mode.  We want
652                                * to get this fault so that we can fix the
653                                * problem here and not have to check all the
654                                * selectors and pointers when the user changes
655                                * them.
656                                */
657                               if (mycpu->gd_intr_nesting_level == 0) {
658                                         /*
659                                          * NOTE: in 64-bit mode traps push rsp/ss
660                                          *         even if no ring change occurs.
661                                          */
662                                         if (td->td_pcb->pcb_onfault &&
663                                             td->td_pcb->pcb_onfault_sp ==
664                                             frame->tf_rsp) {
665                                                   frame->tf_rip = (register_t)
666                                                             td->td_pcb->pcb_onfault;
667                                                   goto out2;
668                                         }
669 
670                                         /*
671                                          * If the iretq in doreti faults during
672                                          * return to user, it will be special-cased
673                                          * in IDTVEC(prot) to get here.  We want
674                                          * to 'return' to doreti_iret_fault in
675                                          * ipl.s in approximately the same state we
676                                          * were in at the iretq.
677                                          */
678                                         if (frame->tf_rip == (long)doreti_iret) {
679                                                   frame->tf_rip = (long)doreti_iret_fault;
680                                                   goto out2;
681                                         }
682                               }
683                               break;
684 
685                     case T_TSSFLT:
686                               /*
687                                * PSL_NT can be set in user mode and isn't cleared
688                                * automatically when the kernel is entered.  This
689                                * causes a TSS fault when the kernel attempts to
690                                * `iret' because the TSS link is uninitialized.  We
691                                * want to get this fault so that we can fix the
692                                * problem here and not every time the kernel is
693                                * entered.
694                                */
695                               if (frame->tf_rflags & PSL_NT) {
696                                         frame->tf_rflags &= ~PSL_NT;
697 #if 0
698                                         /* do we need this? */
699                                         if (frame->tf_rip == (long)doreti_iret)
700                                                   frame->tf_rip = (long)doreti_iret_fault;
701 #endif
702                                         goto out2;
703                               }
704                               break;
705 
706                     case T_TRCTRAP:      /* trace trap */
707                               /*
708                                * Detect historical CPU artifact on syscall or int $3
709                                * entry (if not shortcutted in exception.s via
710                                * DIRECT_DISALLOW_SS_CPUBUG).
711                                */
712                               gd->gd_cnt.v_trap++;
713                               if (frame->tf_rip == (register_t)IDTVEC(fast_syscall)) {
714                                         krateprintf(&sscpubugrate,
715                                                   "Caught #DB at syscall cpu artifact\n");
716                                         goto out2;
717                               }
718                               if (frame->tf_rip == (register_t)IDTVEC(bpt)) {
719                                         krateprintf(&sscpubugrate,
720                                                   "Caught #DB at int $N cpu artifact\n");
721                                         goto out2;
722                               }
723 
724                               /*
725                                * Ignore debug register trace traps due to
726                                * accesses in the user's address space, which
727                                * can happen under several conditions such as
728                                * if a user sets a watchpoint on a buffer and
729                                * then passes that buffer to a system call.
730                                * We still want to get TRCTRAPS for addresses
731                                * in kernel space because that is useful when
732                                * debugging the kernel.
733                                */
734                               if (user_dbreg_trap()) {
735                                         /*
736                                          * Reset breakpoint bits because the
737                                          * processor doesn't
738                                          */
739                                         load_dr6(rdr6() & ~0xf);
740                                         goto out2;
741                               }
742                               /*
743                                * FALLTHROUGH (TRCTRAP kernel mode, kernel address)
744                                */
745                     case T_BPTFLT:
746                               /*
747                                * If DDB is enabled, let it handle the debugger trap.
748                                * Otherwise, debugger traps "can't happen".
749                                */
750                               ucode = TRAP_BRKPT;
751 #ifdef DDB
752                               if (kdb_trap(type, 0, frame))
753                                         goto out2;
754 #endif
755                               break;
756 
757 #if NISA > 0
758                     case T_NMI:
759                               /* machine/parity/power fail/"kitchen sink" faults */
760                               if (isa_nmi(code) == 0) {
761 #ifdef DDB
762                                         /*
763                                          * NMI can be hooked up to a pushbutton
764                                          * for debugging.
765                                          */
766                                         if (ddb_on_nmi) {
767                                                   kprintf ("NMI ... going to debugger\n");
768                                                   kdb_trap(type, 0, frame);
769                                         }
770 #endif /* DDB */
771                                         goto out2;
772                               } else if (panic_on_nmi == 0)
773                                         goto out2;
774 #endif /* NISA > 0 */
775                               break;
776                     default:
777                               if (type >= T_RESERVED && type < T_RESERVED + 256) {
778                                         kprintf("Ignoring spurious unknown "
779                                                   "cpu trap T_RESERVED+%d\n",
780                                                   type - T_RESERVED);
781                                         gd->gd_cnt.v_trap++;
782                                         goto out2;
783                               }
784                               break;
785                     }
786                     trap_fatal(frame, 0);
787                     goto out2;
788           }
789 
790           /*
791            * Fault from user mode, virtual kernel intercept.
792            *
793            * If the fault is directly related to a VM context managed by a
794            * virtual kernel then let the virtual kernel handle it.
795            */
796           if (lp->lwp_vkernel && lp->lwp_vkernel->ve) {
797                     vkernel_trap(lp, frame);
798                     goto out;
799           }
800 
801           /* Translate fault for emulators (e.g. Linux) */
802           if (*p->p_sysent->sv_transtrap)
803                     i = (*p->p_sysent->sv_transtrap)(i, type);
804 
805           gd->gd_cnt.v_trap++;
806           trapsignal(lp, i, ucode);
807 
808 #ifdef DEBUG
809           if (type <= MAX_TRAP_MSG) {
810                     uprintf("fatal process exception: %s",
811                               trap_msg[type]);
812                     if ((type == T_PAGEFLT) || (type == T_PROTFLT))
813                               uprintf(", fault VA = 0x%lx", frame->tf_addr);
814                     uprintf("\n");
815           }
816 #endif
817 
818 out:
819           userret(lp, frame, sticks);
820           userexit(lp);
821 out2:     ;
822           if (p != NULL && lp != NULL)
823                     KTR_LOG(kernentry_trap_ret, p->p_pid, lp->lwp_tid);
824 #ifdef INVARIANTS
825           KASSERT(crit_count == td->td_critcount,
826                     ("trap: critical section count mismatch! %d/%d",
827                     crit_count, td->td_critcount));
828           KASSERT(curstop == td->td_toks_stop,
829                     ("trap: extra tokens held after trap! %ld/%ld (%s)",
830                     curstop - &td->td_toks_base,
831                     td->td_toks_stop - &td->td_toks_base,
832                     td->td_toks_stop[-1].tr_tok->t_desc));
833 #endif
834 }
835 
836 void
trap_handle_userenter(struct thread * td)837 trap_handle_userenter(struct thread *td)
838 {
839           userenter(td, td->td_proc);
840 }
841 
842 void
trap_handle_userexit(struct trapframe * frame,int sticks)843 trap_handle_userexit(struct trapframe *frame, int sticks)
844 {
845           struct lwp *lp = curthread->td_lwp;
846 
847           if (lp) {
848                     userret(lp, frame, sticks);
849                     userexit(lp);
850           }
851 }
852 
853 static int
trap_pfault(struct trapframe * frame,int usermode)854 trap_pfault(struct trapframe *frame, int usermode)
855 {
856           vm_offset_t va;
857           struct vmspace *vm = NULL;
858           vm_map_t map;
859           int rv = 0;
860           int fault_flags;
861           vm_prot_t ftype;
862           thread_t td = curthread;
863           struct lwp *lp = td->td_lwp;
864           struct proc *p;
865 
866           va = trunc_page(frame->tf_addr);
867           if (va >= VM_MIN_KERNEL_ADDRESS) {
868                     /*
869                      * Don't allow user-mode faults in kernel address space.
870                      */
871                     if (usermode) {
872                               fault_flags = -1;
873                               ftype = -1;
874                               goto nogo;
875                     }
876 
877                     map = kernel_map;
878           } else {
879                     /*
880                      * This is a fault on non-kernel virtual memory.
881                      * vm is initialized above to NULL. If curproc is NULL
882                      * or curproc->p_vmspace is NULL the fault is fatal.
883                      */
884                     if (lp != NULL)
885                               vm = lp->lwp_vmspace;
886 
887                     if (vm == NULL) {
888                               fault_flags = -1;
889                               ftype = -1;
890                               goto nogo;
891                     }
892 
893                     if (usermode == 0) {
894 #ifdef DDB
895                               /*
896                                * Debugging, catch kernel faults on the user address
897                                * space when not inside on onfault (e.g. copyin/
898                                * copyout) routine.
899                                */
900                               if (td->td_pcb == NULL ||
901                                   td->td_pcb->pcb_onfault == NULL) {
902                                         if (freeze_on_seg_fault) {
903                                                   kprintf("trap_pfault: user address "
904                                                             "fault from kernel mode "
905                                                             "%016lx\n",
906                                                             (long)frame->tf_addr);
907                                                   while (freeze_on_seg_fault) {
908                                                                 tsleep(&freeze_on_seg_fault,
909                                                                          0,
910                                                                          "frzseg",
911                                                                          hz * 20);
912                                                   }
913                                         }
914                               }
915 #endif
916                               if (td->td_gd->gd_intr_nesting_level ||
917                                   trap_is_smap(frame) ||
918                                   td->td_pcb == NULL ||
919                                   td->td_pcb->pcb_onfault == NULL) {
920                                         kprintf("Fatal user address access "
921                                                   "from kernel mode from %s at %016jx\n",
922                                                   td->td_comm, frame->tf_rip);
923                                         trap_fatal(frame, frame->tf_addr);
924                                         return (-1);
925                               }
926                     }
927                     map = &vm->vm_map;
928           }
929 
930           /*
931            * PGEX_I is defined only if the execute disable bit capability is
932            * supported and enabled.
933            */
934           if (frame->tf_err & PGEX_W)
935                     ftype = VM_PROT_WRITE;
936           else if (frame->tf_err & PGEX_I)
937                     ftype = VM_PROT_EXECUTE;
938           else
939                     ftype = VM_PROT_READ;
940 
941           lwkt_tokref_t stop = td->td_toks_stop;
942 
943           if (map != kernel_map) {
944                     /*
945                      * Keep swapout from messing with us during this
946                      *        critical time.
947                      */
948                     PHOLD(lp->lwp_proc);
949 
950                     /*
951                      * Issue fault
952                      */
953                     fault_flags = 0;
954                     if (usermode)
955                               fault_flags |= VM_FAULT_BURST | VM_FAULT_USERMODE;
956                     if (ftype & VM_PROT_WRITE)
957                               fault_flags |= VM_FAULT_DIRTY;
958                     else
959                               fault_flags |= VM_FAULT_NORMAL;
960                     rv = vm_fault(map, va, ftype, fault_flags);
961                     if (td->td_toks_stop != stop) {
962                               stop = td->td_toks_stop - 1;
963                               kprintf("A-HELD TOKENS DURING PFAULT td=%p(%s) map=%p va=%p ftype=%d fault_flags=%d\n", td, td->td_comm, map, (void *)va, ftype, fault_flags);
964                               panic("held tokens");
965                     }
966 
967                     PRELE(lp->lwp_proc);
968           } else {
969                     /*
970                      * Don't have to worry about process locking or stacks in the
971                      * kernel.
972                      */
973                     fault_flags = VM_FAULT_NORMAL;
974                     rv = vm_fault(map, va, ftype, VM_FAULT_NORMAL);
975                     if (td->td_toks_stop != stop) {
976                               stop = td->td_toks_stop - 1;
977                               kprintf("B-HELD TOKENS DURING PFAULT td=%p(%s) map=%p va=%p ftype=%d fault_flags=%d\n", td, td->td_comm, map, (void *)va, ftype, VM_FAULT_NORMAL);
978                               panic("held tokens");
979                     }
980           }
981           if (rv == KERN_SUCCESS)
982                     return (0);
983 nogo:
984           if (!usermode) {
985                     /*
986                      * NOTE: in 64-bit mode traps push rsp/ss
987                      *         even if no ring change occurs.
988                      */
989                     if (td->td_pcb->pcb_onfault &&
990                         td->td_pcb->pcb_onfault_sp == frame->tf_rsp &&
991                         td->td_gd->gd_intr_nesting_level == 0) {
992                               frame->tf_rip = (register_t)td->td_pcb->pcb_onfault;
993                               return (0);
994                     }
995                     trap_fatal(frame, frame->tf_addr);
996                     return (-1);
997           }
998 
999           /*
1000            * NOTE: on x86_64 we have a tf_addr field in the trapframe, no
1001            * kludge is needed to pass the fault address to signal handlers.
1002            */
1003           p = td->td_proc;
1004 #ifdef DDB
1005           if (td->td_lwp->lwp_vkernel == NULL) {
1006                     while (freeze_on_seg_fault) {
1007                               tsleep(p, 0, "freeze", hz * 20);
1008                     }
1009                     if (ddb_on_seg_fault)
1010                               Debugger("ddb_on_seg_fault");
1011           }
1012 #endif
1013 
1014           return((rv == KERN_PROTECTION_FAILURE) ? SIGBUS : SIGSEGV);
1015 }
1016 
1017 static void
trap_fatal(struct trapframe * frame,vm_offset_t eva)1018 trap_fatal(struct trapframe *frame, vm_offset_t eva)
1019 {
1020           int code, ss;
1021           u_int type;
1022           long rsp;
1023           struct soft_segment_descriptor softseg;
1024 
1025           code = frame->tf_err;
1026           type = frame->tf_trapno;
1027           sdtossd(&mdcpu->gd_gdt[IDXSEL(frame->tf_cs & 0xffff)], &softseg);
1028 
1029           kprintf("\n\nFatal trap %d: ", type);
1030           if (type <= MAX_TRAP_MSG)
1031                     kprintf("%s ", trap_msg[type]);
1032           else
1033                     kprintf("rsvd(%d) ", type - T_RESERVED);
1034 
1035           kprintf("while in %s mode\n",
1036                     ISPL(frame->tf_cs) == SEL_UPL ? "user" : "kernel");
1037 
1038           /* three separate prints in case of a trap on an unmapped page */
1039           kprintf("cpuid = %d; ", mycpu->gd_cpuid);
1040           if (lapic_usable)
1041                     kprintf("lapic id = %u\n", LAPIC_READID);
1042           if (type == T_PAGEFLT) {
1043                     kprintf("fault virtual address          = 0x%lx\n", eva);
1044                     kprintf("fault code           = %s %s %s, %s\n",
1045                               code & PGEX_U ? "user" : "supervisor",
1046                               code & PGEX_W ? "write" : "read",
1047                               code & PGEX_I ? "instruction" : "data",
1048                               code & PGEX_P ? "protection violation" : "page not present");
1049           }
1050           kprintf("instruction pointer  = 0x%lx:0x%lx\n",
1051                  frame->tf_cs & 0xffff, frame->tf_rip);
1052         if (ISPL(frame->tf_cs) == SEL_UPL) {
1053                     ss = frame->tf_ss & 0xffff;
1054                     rsp = frame->tf_rsp;
1055           } else {
1056                     /*
1057                      * NOTE: in 64-bit mode traps push rsp/ss even if no ring
1058                      *         change occurs.
1059                      */
1060                     ss = GSEL(GDATA_SEL, SEL_KPL);
1061                     rsp = frame->tf_rsp;
1062           }
1063           kprintf("stack pointer                = 0x%x:0x%lx\n", ss, rsp);
1064           kprintf("frame pointer                = 0x%x:0x%lx\n", ss, frame->tf_rbp);
1065           kprintf("code segment                   = base 0x%lx, limit 0x%lx, type 0x%x\n",
1066                  softseg.ssd_base, softseg.ssd_limit, softseg.ssd_type);
1067           kprintf("                     = DPL %d, pres %d, long %d, def32 %d, gran %d\n",
1068                  softseg.ssd_dpl, softseg.ssd_p, softseg.ssd_long, softseg.ssd_def32,
1069                  softseg.ssd_gran);
1070           kprintf("processor eflags     = ");
1071           if (frame->tf_rflags & PSL_T)
1072                     kprintf("trace trap, ");
1073           if (frame->tf_rflags & PSL_I)
1074                     kprintf("interrupt enabled, ");
1075           if (frame->tf_rflags & PSL_NT)
1076                     kprintf("nested task, ");
1077           if (frame->tf_rflags & PSL_RF)
1078                     kprintf("resume, ");
1079           if (frame->tf_rflags & PSL_AC)
1080                     kprintf("smap_open, ");
1081           kprintf("IOPL = %ld\n", (frame->tf_rflags & PSL_IOPL) >> 12);
1082           kprintf("current process                = ");
1083           if (curproc) {
1084                     kprintf("%lu\n",
1085                         (u_long)curproc->p_pid);
1086           } else {
1087                     kprintf("Idle\n");
1088           }
1089           kprintf("current thread          = pri %d ", curthread->td_pri);
1090           if (curthread->td_critcount)
1091                     kprintf("(CRIT)");
1092           kprintf("\n");
1093 
1094 #ifdef DDB
1095           if ((debugger_on_panic || db_active) && kdb_trap(type, code, frame))
1096                     return;
1097 #endif
1098           kprintf("trap number                    = %d\n", type);
1099           if (type <= MAX_TRAP_MSG)
1100                     panic("%s", trap_msg[type]);
1101           else
1102                     panic("unknown/reserved trap");
1103 }
1104 
1105 /*
1106  * Double fault handler. Called when a fault occurs while writing
1107  * a frame for a trap/exception onto the stack. This usually occurs
1108  * when the stack overflows (such is the case with infinite recursion,
1109  * for example).
1110  */
1111 static __inline
1112 int
in_kstack_guard(register_t rptr)1113 in_kstack_guard(register_t rptr)
1114 {
1115           thread_t td = curthread;
1116 
1117           if ((char *)rptr >= td->td_kstack &&
1118               (char *)rptr < td->td_kstack + PAGE_SIZE) {
1119                     return 1;
1120           }
1121           return 0;
1122 }
1123 
1124 void
dblfault_handler(struct trapframe * frame)1125 dblfault_handler(struct trapframe *frame)
1126 {
1127           thread_t td = curthread;
1128 
1129           if (in_kstack_guard(frame->tf_rsp) || in_kstack_guard(frame->tf_rbp)) {
1130                     kprintf("DOUBLE FAULT - KERNEL STACK GUARD HIT!\n");
1131                     if (in_kstack_guard(frame->tf_rsp))
1132                               frame->tf_rsp = (register_t)(td->td_kstack + PAGE_SIZE);
1133                     if (in_kstack_guard(frame->tf_rbp))
1134                               frame->tf_rbp = (register_t)(td->td_kstack + PAGE_SIZE);
1135           } else {
1136                     kprintf("DOUBLE FAULT\n");
1137           }
1138           kprintf("\nFatal double fault\n");
1139           kprintf("rip = 0x%lx\n", frame->tf_rip);
1140           kprintf("rsp = 0x%lx\n", frame->tf_rsp);
1141           kprintf("rbp = 0x%lx\n", frame->tf_rbp);
1142           /* three separate prints in case of a trap on an unmapped page */
1143           kprintf("cpuid = %d; ", mycpu->gd_cpuid);
1144           if (lapic_usable)
1145                     kprintf("lapic id = %u\n", LAPIC_READID);
1146           panic("double fault");
1147 }
1148 
1149 /*
1150  * syscall2 -       MP aware system call request C handler
1151  *
1152  * A system call is essentially treated as a trap except that the
1153  * MP lock is not held on entry or return.  We are responsible for
1154  * obtaining the MP lock if necessary and for handling ASTs
1155  * (e.g. a task switch) prior to return.
1156  */
1157 void
syscall2(struct trapframe * frame)1158 syscall2(struct trapframe *frame)
1159 {
1160           struct thread *td = curthread;
1161           struct proc *p = td->td_proc;
1162           struct lwp *lp = td->td_lwp;
1163           struct sysent *callp;
1164           register_t orig_tf_rflags;
1165           int sticks;
1166           int error;
1167           int narg;
1168 #ifdef INVARIANTS
1169           int crit_count = td->td_critcount;
1170 #endif
1171           struct sysmsg sysmsg;
1172           union sysunion *argp;
1173           u_int code;
1174           const int regcnt = 6;         /* number of args passed in registers */
1175 
1176           mycpu->gd_cnt.v_syscall++;
1177 
1178 #ifdef DIAGNOSTIC
1179           if (__predict_false(ISPL(frame->tf_cs) != SEL_UPL)) {
1180                     panic("syscall");
1181                     /* NOT REACHED */
1182           }
1183 #endif
1184 
1185           KTR_LOG(kernentry_syscall, p->p_pid, lp->lwp_tid,
1186                     frame->tf_rax);
1187 
1188           userenter(td, p);   /* lazy raise our priority */
1189 
1190           /*
1191            * Misc
1192            */
1193           sticks = (int)td->td_sticks;
1194           orig_tf_rflags = frame->tf_rflags;
1195 
1196           /*
1197            * Virtual kernel intercept - if a VM context managed by a virtual
1198            * kernel issues a system call the virtual kernel handles it, not us.
1199            * Restore the virtual kernel context and return from its system
1200            * call.  The current frame is copied out to the virtual kernel.
1201            */
1202           if (__predict_false(lp->lwp_vkernel && lp->lwp_vkernel->ve)) {
1203                     vkernel_trap(lp, frame);
1204                     error = EJUSTRETURN;
1205                     callp = NULL;
1206                     code = 0;
1207                     goto out;
1208           }
1209 
1210           /*
1211            * Get the system call parameters and account for time
1212            */
1213 #ifdef DIAGNOSTIC
1214           KASSERT(lp->lwp_md.md_regs == frame,
1215                     ("Frame mismatch %p %p", lp->lwp_md.md_regs, frame));
1216 #endif
1217 
1218           code = (u_int)frame->tf_rax;
1219           if (code >= p->p_sysent->sv_size)
1220                     code = SYS___nosys;
1221 
1222           argp = (union sysunion *)&frame->tf_rdi;
1223           callp = &p->p_sysent->sv_table[code];
1224 
1225           /*
1226            * On x86_64 we get up to six arguments in registers. The rest are
1227            * on the stack. The first six members of 'struct trapframe' happen
1228            * to be the registers used to pass arguments, in exactly the right
1229            * order.
1230            *
1231            * Any arguments beyond available argument-passing registers must
1232            * be copyin()'d from the user stack.
1233            */
1234           narg = callp->sy_narg;
1235           if (__predict_false(narg > regcnt)) {
1236                     register_t *argsdst;
1237                     caddr_t params;
1238 
1239                     argsdst = (register_t *)&sysmsg.extargs;
1240                     bcopy(argp, argsdst, sizeof(register_t) * regcnt);
1241                     params = (caddr_t)frame->tf_rsp + sizeof(register_t);
1242                     error = copyin(params, &argsdst[regcnt],
1243                                      (narg - regcnt) * sizeof(register_t));
1244                     argp = (void *)argsdst;
1245                     if (error) {
1246 #ifdef KTRACE
1247                               if (KTRPOINTP(p, td, KTR_SYSCALL)) {
1248                                         ktrsyscall(lp, code, narg, argp);
1249                               }
1250 #endif
1251                               goto bad;
1252                     }
1253           }
1254 
1255 #ifdef KTRACE
1256           if (KTRPOINTP(p, td, KTR_SYSCALL)) {
1257                     ktrsyscall(lp, code, narg, argp);
1258           }
1259 #endif
1260 
1261           /*
1262            * Default return value is 0 (will be copied to %rax).  Double-value
1263            * returns use %rax and %rdx.  %rdx is left unchanged for system
1264            * calls which return only one result.
1265            */
1266           sysmsg.sysmsg_fds[0] = 0;
1267           sysmsg.sysmsg_fds[1] = frame->tf_rdx;
1268 
1269           /*
1270            * The syscall might manipulate the trap frame. If it does it
1271            * will probably return EJUSTRETURN.
1272            */
1273           sysmsg.sysmsg_frame = frame;
1274 
1275           STOPEVENT(p, S_SCE, narg);    /* MP aware */
1276 
1277           /*
1278            * NOTE: All system calls run MPSAFE now.  The system call itself
1279            *         is responsible for getting the MP lock.
1280            */
1281 #ifdef SYSCALL_DEBUG
1282           tsc_uclock_t tscval = rdtsc();
1283 #endif
1284           error = (*callp->sy_call)(&sysmsg, argp);
1285 #ifdef SYSCALL_DEBUG
1286           tscval = rdtsc() - tscval;
1287           tscval = tscval * 1000000 / (tsc_frequency / 1000);         /* ns */
1288           {
1289                     struct syscallwc *scwc = &SysCallsWorstCase[mycpu->gd_cpuid];
1290                     int idx = scwc->idx++ % SCWC_MAXT;
1291 
1292                     scwc->tot[code] += tscval - scwc->timings[code][idx];
1293                     scwc->timings[code][idx] = tscval;
1294           }
1295 #endif
1296 
1297 out:
1298           /*
1299            * MP SAFE (we may or may not have the MP lock at this point)
1300            */
1301           //kprintf("SYSMSG %d ", error);
1302           if (__predict_true(error == 0)) {
1303                     /*
1304                      * Reinitialize proc pointer `p' as it may be different
1305                      * if this is a child returning from fork syscall.
1306                      */
1307                     p = curproc;
1308                     lp = curthread->td_lwp;
1309                     frame->tf_rax = sysmsg.sysmsg_fds[0];
1310                     frame->tf_rdx = sysmsg.sysmsg_fds[1];
1311                     frame->tf_rflags &= ~PSL_C;
1312           } else if (error == ERESTART) {
1313                     /*
1314                      * Reconstruct pc, we know that 'syscall' is 2 bytes.
1315                      * We have to do a full context restore so that %r10
1316                      * (which was holding the value of %rcx) is restored for
1317                      * the next iteration.
1318                      */
1319                     if (frame->tf_err != 0 && frame->tf_err != 2)
1320                               kprintf("lp %s:%d frame->tf_err is weird %ld\n",
1321                                         td->td_comm, lp->lwp_proc->p_pid, frame->tf_err);
1322                     frame->tf_rip -= frame->tf_err;
1323                     frame->tf_r10 = frame->tf_rcx;
1324           } else if (error == EJUSTRETURN) {
1325                     /* do nothing */
1326           } else if (error == EASYNC) {
1327                     panic("Unexpected EASYNC return value (for now)");
1328           } else {
1329 bad:
1330                     if (p->p_sysent->sv_errsize) {
1331                               if (error >= p->p_sysent->sv_errsize)
1332                                         error = -1;         /* XXX */
1333                               else
1334                                         error = p->p_sysent->sv_errtbl[error];
1335                     }
1336                     frame->tf_rax = error;
1337                     frame->tf_rflags |= PSL_C;
1338           }
1339 
1340           /*
1341            * Traced syscall.  trapsignal() should now be MP aware
1342            */
1343           if (__predict_false(orig_tf_rflags & PSL_T)) {
1344                     frame->tf_rflags &= ~PSL_T;
1345                     trapsignal(lp, SIGTRAP, TRAP_TRACE);
1346           }
1347 
1348           /*
1349            * Handle reschedule and other end-of-syscall issues
1350            */
1351           userret(lp, frame, sticks);
1352 
1353 #ifdef KTRACE
1354           if (KTRPOINTP(p, td, KTR_SYSRET)) {
1355                     ktrsysret(lp, code, error, sysmsg.sysmsg_result);
1356           }
1357 #endif
1358 
1359           /*
1360            * This works because errno is findable through the
1361            * register set.  If we ever support an emulation where this
1362            * is not the case, this code will need to be revisited.
1363            */
1364           STOPEVENT(p, S_SCX, code);
1365 
1366           userexit(lp);
1367           KTR_LOG(kernentry_syscall_ret, p->p_pid, lp->lwp_tid, error);
1368 #ifdef INVARIANTS
1369           KASSERT(crit_count == td->td_critcount,
1370                     ("syscall: critical section count mismatch! "
1371                      "%d/%d in %s sysno=%d",
1372                     crit_count, td->td_critcount, td->td_comm, code));
1373           KASSERT(&td->td_toks_base == td->td_toks_stop,
1374                     ("syscall: %ld extra tokens held after trap! syscall %p",
1375                     td->td_toks_stop - &td->td_toks_base,
1376                     callp->sy_call));
1377 #endif
1378 }
1379 
1380 /*
1381  * Handles the syscall() and __syscall() API
1382  */
1383 void xsyscall(struct sysmsg *sysmsg, struct nosys_args *uap);
1384 
1385 int
sys_xsyscall(struct sysmsg * sysmsg,const struct nosys_args * uap)1386 sys_xsyscall(struct sysmsg *sysmsg, const struct nosys_args *uap)
1387 {
1388           struct trapframe *frame;
1389           struct sysent *callp;
1390           union sysunion *argp;
1391           struct thread *td;
1392           struct proc *p;
1393           const int regcnt = 5;         /* number of args passed in registers */
1394           u_int code;
1395           int error;
1396           int narg;
1397 
1398           td = curthread;
1399           p = td->td_proc;
1400           frame = sysmsg->sysmsg_frame;
1401           code = (u_int)frame->tf_rdi;
1402           if (code >= p->p_sysent->sv_size)
1403                     code = SYS___nosys;
1404           argp = (union sysunion *)(&frame->tf_rdi + 1);
1405           callp = &p->p_sysent->sv_table[code];
1406           narg = callp->sy_narg;
1407 
1408           /*
1409            * On x86_64 we get up to six arguments in registers.  The rest are
1410            * on the stack.  However, for syscall() and __syscall() the syscall
1411            * number is inserted as the first argument, so the limit is reduced
1412            * by one to five.
1413            */
1414           if (__predict_false(narg > regcnt)) {
1415                     register_t *argsdst;
1416                     caddr_t params;
1417 
1418                     argsdst = (register_t *)&sysmsg->extargs;
1419                     bcopy(argp, argsdst, sizeof(register_t) * regcnt);
1420                     params = (caddr_t)frame->tf_rsp + sizeof(register_t);
1421                     error = copyin(params, &argsdst[regcnt],
1422                                      (narg - regcnt) * sizeof(register_t));
1423                     argp = (void *)argsdst;
1424                     if (error) {
1425 #ifdef KTRACE
1426                               if (KTRPOINTP(p, td, KTR_SYSCALL)) {
1427                                         ktrsyscall(td->td_lwp, code, narg, argp);
1428                               }
1429                               if (KTRPOINTP(p, td, KTR_SYSRET)) {
1430                                         ktrsysret(td->td_lwp, code, error,
1431                                                     sysmsg->sysmsg_result);
1432                               }
1433 #endif
1434                               return error;
1435                     }
1436           }
1437 
1438 #ifdef KTRACE
1439           if (KTRPOINTP(p, td, KTR_SYSCALL)) {
1440                     ktrsyscall(td->td_lwp, code, narg, argp);
1441           }
1442 #endif
1443 
1444           error = (*callp->sy_call)(sysmsg, argp);
1445 
1446 #ifdef KTRACE
1447           if (KTRPOINTP(p, td, KTR_SYSRET)) {
1448                     register_t rval;
1449 
1450                     rval = (callp->sy_rsize <= 4) ? sysmsg->sysmsg_result :
1451                                                             sysmsg->sysmsg_lresult;
1452                     ktrsysret(td->td_lwp, code, error, rval);
1453           }
1454 #endif
1455 
1456           return error;
1457 }
1458 
1459 void
fork_return(struct lwp * lp,struct trapframe * frame)1460 fork_return(struct lwp *lp, struct trapframe *frame)
1461 {
1462           frame->tf_rax = 0;            /* Child returns zero */
1463           frame->tf_rflags &= ~PSL_C;   /* success */
1464           frame->tf_rdx = 1;
1465 
1466           generic_lwp_return(lp, frame);
1467           KTR_LOG(kernentry_fork_ret, lp->lwp_proc->p_pid, lp->lwp_tid);
1468 }
1469 
1470 /*
1471  * Simplified back end of syscall(), used when returning from fork()
1472  * directly into user mode.
1473  *
1474  * This code will return back into the fork trampoline code which then
1475  * runs doreti.
1476  */
1477 void
generic_lwp_return(struct lwp * lp,struct trapframe * frame)1478 generic_lwp_return(struct lwp *lp, struct trapframe *frame)
1479 {
1480           struct proc *p = lp->lwp_proc;
1481 
1482           /*
1483            * Check for exit-race.  If one lwp exits the process concurrent with
1484            * another lwp creating a new thread, the two operations may cross
1485            * each other resulting in the newly-created lwp not receiving a
1486            * KILL signal.
1487            */
1488           if (p->p_flags & P_WEXIT) {
1489                     lwpsignal(p, lp, SIGKILL);
1490           }
1491 
1492           /*
1493            * Newly forked processes are given a kernel priority.  We have to
1494            * adjust the priority to a normal user priority and fake entry
1495            * into the kernel (call userenter()) to install a passive release
1496            * function just in case userret() decides to stop the process.  This
1497            * can occur when ^Z races a fork.  If we do not install the passive
1498            * release function the current process designation will not be
1499            * released when the thread goes to sleep.
1500            */
1501           lwkt_setpri_self(TDPRI_USER_NORM);
1502           userenter(lp->lwp_thread, p);
1503           userret(lp, frame, 0);
1504 #ifdef KTRACE
1505           if (KTRPOINTP(p, lp->lwp_thread, KTR_SYSRET))
1506                     ktrsysret(lp, SYS_fork, 0, 0);
1507 #endif
1508           lp->lwp_flags |= LWP_PASSIVE_ACQ;
1509           userexit(lp);
1510           lp->lwp_flags &= ~LWP_PASSIVE_ACQ;
1511 }
1512 
1513 /*
1514  * If PGEX_FPFAULT is set then set FP_VIRTFP in the PCB to force a T_DNA
1515  * fault (which is then passed back to the virtual kernel) if an attempt is
1516  * made to use the FP unit.
1517  *
1518  * XXX this is a fairly big hack.
1519  */
1520 void
set_vkernel_fp(struct trapframe * frame)1521 set_vkernel_fp(struct trapframe *frame)
1522 {
1523           struct thread *td = curthread;
1524 
1525           if (frame->tf_xflags & PGEX_FPFAULT) {
1526                     td->td_pcb->pcb_flags |= FP_VIRTFP;
1527                     if (mdcpu->gd_npxthread == td)
1528                               npxexit();
1529           } else {
1530                     td->td_pcb->pcb_flags &= ~FP_VIRTFP;
1531           }
1532 }
1533 
1534 /*
1535  * Called from vkernel_trap() to fixup the vkernel's syscall
1536  * frame for vmspace_ctl() return.
1537  */
1538 void
cpu_vkernel_trap(struct trapframe * frame,int error)1539 cpu_vkernel_trap(struct trapframe *frame, int error)
1540 {
1541           frame->tf_rax = error;
1542           if (error)
1543                     frame->tf_rflags |= PSL_C;
1544           else
1545                     frame->tf_rflags &= ~PSL_C;
1546 }
1547