xref: /f-stack/freebsd/i386/i386/trap.c (revision 22ce4aff)
1 /*-
2  * SPDX-License-Identifier: BSD-4-Clause
3  *
4  * Copyright (C) 1994, David Greenman
5  * Copyright (c) 1990, 1993
6  *	The Regents of the University of California.  All rights reserved.
7  *
8  * This code is derived from software contributed to Berkeley by
9  * the University of Utah, and William Jolitz.
10  *
11  * Redistribution and use in source and binary forms, with or without
12  * modification, are permitted provided that the following conditions
13  * are met:
14  * 1. Redistributions of source code must retain the above copyright
15  *    notice, this list of conditions and the following disclaimer.
16  * 2. Redistributions in binary form must reproduce the above copyright
17  *    notice, this list of conditions and the following disclaimer in the
18  *    documentation and/or other materials provided with the distribution.
19  * 3. All advertising materials mentioning features or use of this software
20  *    must display the following acknowledgement:
21  *	This product includes software developed by the University of
22  *	California, Berkeley and its contributors.
23  * 4. Neither the name of the University nor the names of its contributors
24  *    may be used to endorse or promote products derived from this software
25  *    without specific prior written permission.
26  *
27  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
28  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
30  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
31  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
32  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
33  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
34  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
35  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
36  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
37  * SUCH DAMAGE.
38  *
39  *	from: @(#)trap.c	7.4 (Berkeley) 5/13/91
40  */
41 
42 #include <sys/cdefs.h>
43 __FBSDID("$FreeBSD$");
44 
45 /*
46  * 386 Trap and System call handling
47  */
48 
49 #include "opt_clock.h"
50 #include "opt_compat.h"
51 #include "opt_cpu.h"
52 #include "opt_hwpmc_hooks.h"
53 #include "opt_isa.h"
54 #include "opt_kdb.h"
55 #include "opt_trap.h"
56 
57 #include <sys/param.h>
58 #include <sys/bus.h>
59 #include <sys/systm.h>
60 #include <sys/proc.h>
61 #include <sys/ptrace.h>
62 #include <sys/kdb.h>
63 #include <sys/kernel.h>
64 #include <sys/ktr.h>
65 #include <sys/lock.h>
66 #include <sys/mutex.h>
67 #include <sys/resourcevar.h>
68 #include <sys/signalvar.h>
69 #include <sys/syscall.h>
70 #include <sys/sysctl.h>
71 #include <sys/sysent.h>
72 #include <sys/uio.h>
73 #include <sys/vmmeter.h>
74 #ifdef HWPMC_HOOKS
75 #include <sys/pmckern.h>
76 PMC_SOFT_DEFINE( , , page_fault, all);
77 PMC_SOFT_DEFINE( , , page_fault, read);
78 PMC_SOFT_DEFINE( , , page_fault, write);
79 #endif
80 #include <security/audit/audit.h>
81 
82 #include <vm/vm.h>
83 #include <vm/vm_param.h>
84 #include <vm/pmap.h>
85 #include <vm/vm_kern.h>
86 #include <vm/vm_map.h>
87 #include <vm/vm_page.h>
88 #include <vm/vm_extern.h>
89 
90 #include <machine/cpu.h>
91 #include <machine/intr_machdep.h>
92 #include <x86/mca.h>
93 #include <machine/md_var.h>
94 #include <machine/pcb.h>
95 #ifdef SMP
96 #include <machine/smp.h>
97 #endif
98 #include <machine/stack.h>
99 #include <machine/trap.h>
100 #include <machine/tss.h>
101 #include <machine/vm86.h>
102 
103 #ifdef POWERFAIL_NMI
104 #include <sys/syslog.h>
105 #include <machine/clock.h>
106 #endif
107 
108 #ifdef KDTRACE_HOOKS
109 #include <sys/dtrace_bsd.h>
110 #endif
111 
112 void trap(struct trapframe *frame);
113 void syscall(struct trapframe *frame);
114 
115 static int trap_pfault(struct trapframe *, bool, vm_offset_t, int *, int *);
116 static void trap_fatal(struct trapframe *, vm_offset_t);
117 #ifdef KDTRACE_HOOKS
118 static bool trap_user_dtrace(struct trapframe *,
119     int (**hook)(struct trapframe *));
120 #endif
121 void dblfault_handler(void);
122 
123 extern inthand_t IDTVEC(bpt), IDTVEC(dbg), IDTVEC(int0x80_syscall);
124 extern uint64_t pg_nx;
125 
126 struct trap_data {
127 	bool		ei;
128 	const char	*msg;
129 };
130 
131 static const struct trap_data trap_data[] = {
132 	[T_PRIVINFLT] =	{ .ei = true,	.msg = "privileged instruction fault" },
133 	[T_BPTFLT] =	{ .ei = false,	.msg = "breakpoint instruction fault" },
134 	[T_ARITHTRAP] =	{ .ei = true,	.msg = "arithmetic trap" },
135 	[T_PROTFLT] =	{ .ei = true,	.msg = "general protection fault" },
136 	[T_TRCTRAP] =	{ .ei = false,	.msg = "debug exception" },
137 	[T_PAGEFLT] =	{ .ei = true,	.msg = "page fault" },
138 	[T_ALIGNFLT] = 	{ .ei = true,	.msg = "alignment fault" },
139 	[T_DIVIDE] =	{ .ei = true,	.msg = "integer divide fault" },
140 	[T_NMI] =	{ .ei = false,	.msg = "non-maskable interrupt trap" },
141 	[T_OFLOW] =	{ .ei = true,	.msg = "overflow trap" },
142 	[T_BOUND] =	{ .ei = true,	.msg = "FPU bounds check fault" },
143 	[T_DNA] =	{ .ei = true,	.msg = "FPU device not available" },
144 	[T_DOUBLEFLT] =	{ .ei = false,	.msg = "double fault" },
145 	[T_FPOPFLT] =	{ .ei = true,	.msg = "FPU operand fetch fault" },
146 	[T_TSSFLT] =	{ .ei = true,	.msg = "invalid TSS fault" },
147 	[T_SEGNPFLT] =	{ .ei = true,	.msg = "segment not present fault" },
148 	[T_STKFLT] =	{ .ei = true,	.msg = "stack fault" },
149 	[T_MCHK] =	{ .ei = true,	.msg = "machine check trap" },
150 	[T_XMMFLT] =	{ .ei = true,	.msg = "SIMD floating-point exception" },
151 	[T_DTRACE_RET] ={ .ei = true,	.msg = "DTrace pid return trap" },
152 };
153 
154 static bool
trap_enable_intr(int trapno)155 trap_enable_intr(int trapno)
156 {
157 
158 	MPASS(trapno > 0);
159 	if (trapno < nitems(trap_data) && trap_data[trapno].msg != NULL)
160 		return (trap_data[trapno].ei);
161 	return (false);
162 }
163 
164 static const char *
trap_msg(int trapno)165 trap_msg(int trapno)
166 {
167 	const char *res;
168 	static const char unkn[] = "UNKNOWN";
169 
170 	res = NULL;
171 	if (trapno < nitems(trap_data))
172 		res = trap_data[trapno].msg;
173 	if (res == NULL)
174 		res = unkn;
175 	return (res);
176 }
177 
#if defined(I586_CPU) && !defined(NO_F00F_HACK)
/*
 * Non-zero when the Intel Pentium F00F workaround is in effect; consulted
 * by trap_pfault().  Explicitly initialized so that it can be patched.
 */
int has_f00f_bug = 0;
#endif
181 
182 static int uprintf_signal;
183 SYSCTL_INT(_machdep, OID_AUTO, uprintf_signal, CTLFLAG_RW,
184     &uprintf_signal, 0,
185     "Print debugging information on trap signal to ctty");
186 
187 /*
188  * Exception, fault, and trap interface to the FreeBSD kernel.
189  * This common code is called from assembly language IDT gate entry
190  * routines that prepare a suitable stack frame, and restore this
191  * frame after the exception has been processed.
192  */
193 
194 void
trap(struct trapframe * frame)195 trap(struct trapframe *frame)
196 {
197 	ksiginfo_t ksi;
198 	struct thread *td;
199 	struct proc *p;
200 	int pf, signo, ucode;
201 	u_int type;
202 	register_t addr, dr6;
203 	vm_offset_t eva;
204 #ifdef POWERFAIL_NMI
205 	static int lastalert = 0;
206 #endif
207 
208 	td = curthread;
209 	p = td->td_proc;
210 	dr6 = 0;
211 
212 	VM_CNT_INC(v_trap);
213 	type = frame->tf_trapno;
214 
215 	KASSERT((read_eflags() & PSL_I) == 0,
216 	    ("trap: interrupts enabled, type %d frame %p", type, frame));
217 
218 #ifdef SMP
219 	/* Handler for NMI IPIs used for stopping CPUs. */
220 	if (type == T_NMI && ipi_nmi_handler() == 0)
221 		return;
222 #endif /* SMP */
223 
224 #ifdef KDB
225 	if (kdb_active) {
226 		kdb_reenter();
227 		return;
228 	}
229 #endif
230 
231 	if (type == T_RESERVED) {
232 		trap_fatal(frame, 0);
233 		return;
234 	}
235 
236 	if (type == T_NMI) {
237 #ifdef HWPMC_HOOKS
238 		/*
239 		 * CPU PMCs interrupt using an NMI so we check for that first.
240 		 * If the HWPMC module is active, 'pmc_hook' will point to
241 		 * the function to be called.  A non-zero return value from the
242 		 * hook means that the NMI was consumed by it and that we can
243 		 * return immediately.
244 		 */
245 		if (pmc_intr != NULL &&
246 		    (*pmc_intr)(frame) != 0)
247 			return;
248 #endif
249 	}
250 
251 	if (type == T_MCHK) {
252 		mca_intr();
253 		return;
254 	}
255 
256 #ifdef KDTRACE_HOOKS
257 	/*
258 	 * A trap can occur while DTrace executes a probe. Before
259 	 * executing the probe, DTrace blocks re-scheduling and sets
260 	 * a flag in its per-cpu flags to indicate that it doesn't
261 	 * want to fault. On returning from the probe, the no-fault
262 	 * flag is cleared and finally re-scheduling is enabled.
263 	 */
264 	if ((type == T_PROTFLT || type == T_PAGEFLT) &&
265 	    dtrace_trap_func != NULL && (*dtrace_trap_func)(frame, type))
266 		return;
267 #endif
268 
269 	/*
270 	 * We must not allow context switches until %cr2 is read.
271 	 * Also, for some Cyrix CPUs, %cr2 is clobbered by interrupts.
272 	 * All faults use interrupt gates, so %cr2 can be safely read
273 	 * now, before optional enable of the interrupts below.
274 	 */
275 	if (type == T_PAGEFLT)
276 		eva = rcr2();
277 
278 	/*
279 	 * Buggy application or kernel code has disabled interrupts
280 	 * and then trapped.  Enabling interrupts now is wrong, but it
281 	 * is better than running with interrupts disabled until they
282 	 * are accidentally enabled later.
283 	 */
284 	if ((frame->tf_eflags & PSL_I) == 0 && TRAPF_USERMODE(frame) &&
285 	    (curpcb->pcb_flags & PCB_VM86CALL) == 0)
286 		uprintf("pid %ld (%s): trap %d with interrupts disabled\n",
287 		    (long)curproc->p_pid, curthread->td_name, type);
288 
289 	/*
290 	 * Conditionally reenable interrupts.  If we hold a spin lock,
291 	 * then we must not reenable interrupts.  This might be a
292 	 * spurious page fault.
293 	 */
294 	if (trap_enable_intr(type) && td->td_md.md_spinlock_count == 0 &&
295 	    frame->tf_eip != (int)cpu_switch_load_gs)
296 		enable_intr();
297 
298         if (TRAPF_USERMODE(frame) && (curpcb->pcb_flags & PCB_VM86CALL) == 0) {
299 		/* user trap */
300 
301 		td->td_pticks = 0;
302 		td->td_frame = frame;
303 		addr = frame->tf_eip;
304 		if (td->td_cowgen != p->p_cowgen)
305 			thread_cow_update(td);
306 
307 		switch (type) {
308 		case T_PRIVINFLT:	/* privileged instruction fault */
309 			signo = SIGILL;
310 			ucode = ILL_PRVOPC;
311 			break;
312 
313 		case T_BPTFLT:		/* bpt instruction fault */
314 #ifdef KDTRACE_HOOKS
315 			if (trap_user_dtrace(frame, &dtrace_pid_probe_ptr))
316 				return;
317 #else
318 			enable_intr();
319 #endif
320 			signo = SIGTRAP;
321 			ucode = TRAP_BRKPT;
322 			break;
323 
324 		case T_TRCTRAP:		/* debug exception */
325 			enable_intr();
326 user_trctrap_out:
327 			signo = SIGTRAP;
328 			ucode = TRAP_TRACE;
329 			dr6 = rdr6();
330 			if ((dr6 & DBREG_DR6_BS) != 0) {
331 				PROC_LOCK(td->td_proc);
332 				if ((td->td_dbgflags & TDB_STEP) != 0) {
333 					td->td_frame->tf_eflags &= ~PSL_T;
334 					td->td_dbgflags &= ~TDB_STEP;
335 				}
336 				PROC_UNLOCK(td->td_proc);
337 			}
338 			break;
339 
340 		case T_ARITHTRAP:	/* arithmetic trap */
341 			ucode = npxtrap_x87();
342 			if (ucode == -1)
343 				return;
344 			signo = SIGFPE;
345 			break;
346 
347 		/*
348 		 * The following two traps can happen in vm86 mode,
349 		 * and, if so, we want to handle them specially.
350 		 */
351 		case T_PROTFLT:		/* general protection fault */
352 		case T_STKFLT:		/* stack fault */
353 			if (frame->tf_eflags & PSL_VM) {
354 				signo = vm86_emulate((struct vm86frame *)frame);
355 				ucode = 0;	/* XXXKIB: better code ? */
356 				if (signo == SIGTRAP) {
357 					load_dr6(rdr6() | 0x4000);
358 					goto user_trctrap_out;
359 				}
360 				if (signo == 0)
361 					goto user;
362 				break;
363 			}
364 			signo = SIGBUS;
365 			ucode = (type == T_PROTFLT) ? BUS_OBJERR : BUS_ADRERR;
366 			break;
367 		case T_SEGNPFLT:	/* segment not present fault */
368 			signo = SIGBUS;
369 			ucode = BUS_ADRERR;
370 			break;
371 		case T_TSSFLT:		/* invalid TSS fault */
372 			signo = SIGBUS;
373 			ucode = BUS_OBJERR;
374 			break;
375 		case T_ALIGNFLT:
376 			signo = SIGBUS;
377 			ucode = BUS_ADRALN;
378 			break;
379 		case T_DOUBLEFLT:	/* double fault */
380 		default:
381 			signo = SIGBUS;
382 			ucode = BUS_OBJERR;
383 			break;
384 
385 		case T_PAGEFLT:		/* page fault */
386 			addr = eva;
387 			pf = trap_pfault(frame, true, eva, &signo, &ucode);
388 #if defined(I586_CPU) && !defined(NO_F00F_HACK)
389 			if (pf == -2) {
390 				/*
391 				 * The f00f hack workaround has triggered, so
392 				 * treat the fault as an illegal instruction
393 				 * (T_PRIVINFLT) instead of a page fault.
394 				 */
395 				type = frame->tf_trapno = T_PRIVINFLT;
396 				break;
397 			}
398 #endif
399 			if (pf == -1)
400 				return;
401 			if (pf == 0)
402 				goto user;
403 			break;
404 
405 		case T_DIVIDE:		/* integer divide fault */
406 			ucode = FPE_INTDIV;
407 			signo = SIGFPE;
408 			break;
409 
410 		case T_NMI:
411 #ifdef POWERFAIL_NMI
412 #ifndef TIMER_FREQ
413 #  define TIMER_FREQ 1193182
414 #endif
415 			if (time_second - lastalert > 10) {
416 				log(LOG_WARNING, "NMI: power fail\n");
417 				sysbeep(880, hz);
418 				lastalert = time_second;
419 			}
420 			return;
421 #else /* !POWERFAIL_NMI */
422 			nmi_handle_intr(type, frame);
423 			return;
424 #endif /* POWERFAIL_NMI */
425 
426 		case T_OFLOW:		/* integer overflow fault */
427 			ucode = FPE_INTOVF;
428 			signo = SIGFPE;
429 			break;
430 
431 		case T_BOUND:		/* bounds check fault */
432 			ucode = FPE_FLTSUB;
433 			signo = SIGFPE;
434 			break;
435 
436 		case T_DNA:
437 			KASSERT(PCB_USER_FPU(td->td_pcb),
438 			    ("kernel FPU ctx has leaked"));
439 			/* transparent fault (due to context switch "late") */
440 			if (npxdna())
441 				return;
442 			uprintf("pid %d killed due to lack of floating point\n",
443 				p->p_pid);
444 			signo = SIGKILL;
445 			ucode = 0;
446 			break;
447 
448 		case T_FPOPFLT:		/* FPU operand fetch fault */
449 			ucode = ILL_COPROC;
450 			signo = SIGILL;
451 			break;
452 
453 		case T_XMMFLT:		/* SIMD floating-point exception */
454 			ucode = npxtrap_sse();
455 			if (ucode == -1)
456 				return;
457 			signo = SIGFPE;
458 			break;
459 #ifdef KDTRACE_HOOKS
460 		case T_DTRACE_RET:
461 			(void)trap_user_dtrace(frame, &dtrace_return_probe_ptr);
462 			return;
463 #endif
464 		}
465 	} else {
466 		/* kernel trap */
467 
468 		KASSERT(cold || td->td_ucred != NULL,
469 		    ("kernel trap doesn't have ucred"));
470 		switch (type) {
471 		case T_PAGEFLT:			/* page fault */
472 			(void)trap_pfault(frame, false, eva, NULL, NULL);
473 			return;
474 
475 		case T_DNA:
476 			if (PCB_USER_FPU(td->td_pcb))
477 				panic("Unregistered use of FPU in kernel");
478 			if (npxdna())
479 				return;
480 			break;
481 
482 		case T_ARITHTRAP:	/* arithmetic trap */
483 		case T_XMMFLT:		/* SIMD floating-point exception */
484 		case T_FPOPFLT:		/* FPU operand fetch fault */
485 			/*
486 			 * XXXKIB for now disable any FPU traps in kernel
487 			 * handler registration seems to be overkill
488 			 */
489 			trap_fatal(frame, 0);
490 			return;
491 
492 			/*
493 			 * The following two traps can happen in
494 			 * vm86 mode, and, if so, we want to handle
495 			 * them specially.
496 			 */
497 		case T_PROTFLT:		/* general protection fault */
498 		case T_STKFLT:		/* stack fault */
499 			if (frame->tf_eflags & PSL_VM) {
500 				signo = vm86_emulate((struct vm86frame *)frame);
501 				if (signo == SIGTRAP) {
502 					type = T_TRCTRAP;
503 					load_dr6(rdr6() | 0x4000);
504 					goto kernel_trctrap;
505 				}
506 				if (signo != 0)
507 					/*
508 					 * returns to original process
509 					 */
510 					vm86_trap((struct vm86frame *)frame);
511 				return;
512 			}
513 			/* FALL THROUGH */
514 		case T_SEGNPFLT:	/* segment not present fault */
515 			if (curpcb->pcb_flags & PCB_VM86CALL)
516 				break;
517 
518 			/*
519 			 * Invalid %fs's and %gs's can be created using
520 			 * procfs or PT_SETREGS or by invalidating the
521 			 * underlying LDT entry.  This causes a fault
522 			 * in kernel mode when the kernel attempts to
523 			 * switch contexts.  Lose the bad context
524 			 * (XXX) so that we can continue, and generate
525 			 * a signal.
526 			 */
527 			if (frame->tf_eip == (int)cpu_switch_load_gs) {
528 				curpcb->pcb_gs = 0;
529 #if 0
530 				PROC_LOCK(p);
531 				kern_psignal(p, SIGBUS);
532 				PROC_UNLOCK(p);
533 #endif
534 				return;
535 			}
536 
537 			if (td->td_intr_nesting_level != 0)
538 				break;
539 
540 			/*
541 			 * Invalid segment selectors and out of bounds
542 			 * %eip's and %esp's can be set up in user mode.
543 			 * This causes a fault in kernel mode when the
544 			 * kernel tries to return to user mode.  We want
545 			 * to get this fault so that we can fix the
546 			 * problem here and not have to check all the
547 			 * selectors and pointers when the user changes
548 			 * them.
549 			 *
550 			 * N.B. Comparing to long mode, 32-bit mode
551 			 * does not push %esp on the trap frame,
552 			 * because iretl faulted while in ring 0.  As
553 			 * the consequence, there is no need to fixup
554 			 * the stack pointer for doreti_iret_fault,
555 			 * the fixup and the complimentary trap() call
556 			 * are executed on the main thread stack, not
557 			 * on the trampoline stack.
558 			 */
559 			if (frame->tf_eip == (int)doreti_iret + setidt_disp) {
560 				frame->tf_eip = (int)doreti_iret_fault +
561 				    setidt_disp;
562 				return;
563 			}
564 			if (type == T_STKFLT)
565 				break;
566 
567 			if (frame->tf_eip == (int)doreti_popl_ds +
568 			    setidt_disp) {
569 				frame->tf_eip = (int)doreti_popl_ds_fault +
570 				    setidt_disp;
571 				return;
572 			}
573 			if (frame->tf_eip == (int)doreti_popl_es +
574 			    setidt_disp) {
575 				frame->tf_eip = (int)doreti_popl_es_fault +
576 				    setidt_disp;
577 				return;
578 			}
579 			if (frame->tf_eip == (int)doreti_popl_fs +
580 			    setidt_disp) {
581 				frame->tf_eip = (int)doreti_popl_fs_fault +
582 				    setidt_disp;
583 				return;
584 			}
585 			if (curpcb->pcb_onfault != NULL) {
586 				frame->tf_eip = (int)curpcb->pcb_onfault;
587 				return;
588 			}
589 			break;
590 
591 		case T_TSSFLT:
592 			/*
593 			 * PSL_NT can be set in user mode and isn't cleared
594 			 * automatically when the kernel is entered.  This
595 			 * causes a TSS fault when the kernel attempts to
596 			 * `iret' because the TSS link is uninitialized.  We
597 			 * want to get this fault so that we can fix the
598 			 * problem here and not every time the kernel is
599 			 * entered.
600 			 */
601 			if (frame->tf_eflags & PSL_NT) {
602 				frame->tf_eflags &= ~PSL_NT;
603 				return;
604 			}
605 			break;
606 
607 		case T_TRCTRAP:	 /* debug exception */
608 kernel_trctrap:
609 			/* Clear any pending debug events. */
610 			dr6 = rdr6();
611 			load_dr6(0);
612 
613 			/*
614 			 * Ignore debug register exceptions due to
615 			 * accesses in the user's address space, which
616 			 * can happen under several conditions such as
617 			 * if a user sets a watchpoint on a buffer and
618 			 * then passes that buffer to a system call.
619 			 * We still want to get TRCTRAPS for addresses
620 			 * in kernel space because that is useful when
621 			 * debugging the kernel.
622 			 */
623 			if (user_dbreg_trap(dr6) &&
624 			   !(curpcb->pcb_flags & PCB_VM86CALL))
625 				return;
626 
627 			/*
628 			 * Malicious user code can configure a debug
629 			 * register watchpoint to trap on data access
630 			 * to the top of stack and then execute 'pop
631 			 * %ss; int 3'.  Due to exception deferral for
632 			 * 'pop %ss', the CPU will not interrupt 'int
633 			 * 3' to raise the DB# exception for the debug
634 			 * register but will postpone the DB# until
635 			 * execution of the first instruction of the
636 			 * BP# handler (in kernel mode).  Normally the
637 			 * previous check would ignore DB# exceptions
638 			 * for watchpoints on user addresses raised in
639 			 * kernel mode.  However, some CPU errata
640 			 * include cases where DB# exceptions do not
641 			 * properly set bits in %dr6, e.g. Haswell
642 			 * HSD23 and Skylake-X SKZ24.
643 			 *
644 			 * A deferred DB# can also be raised on the
645 			 * first instructions of system call entry
646 			 * points or single-step traps via similar use
647 			 * of 'pop %ss' or 'mov xxx, %ss'.
648 			 */
649 			if (frame->tf_eip ==
650 			    (uintptr_t)IDTVEC(int0x80_syscall) + setidt_disp ||
651 			    frame->tf_eip == (uintptr_t)IDTVEC(bpt) +
652 			    setidt_disp ||
653 			    frame->tf_eip == (uintptr_t)IDTVEC(dbg) +
654 			    setidt_disp)
655 				return;
656 			/*
657 			 * FALLTHROUGH (TRCTRAP kernel mode, kernel address)
658 			 */
659 		case T_BPTFLT:
660 			/*
661 			 * If KDB is enabled, let it handle the debugger trap.
662 			 * Otherwise, debugger traps "can't happen".
663 			 */
664 #ifdef KDB
665 			if (kdb_trap(type, dr6, frame))
666 				return;
667 #endif
668 			break;
669 
670 		case T_NMI:
671 #ifdef POWERFAIL_NMI
672 			if (time_second - lastalert > 10) {
673 				log(LOG_WARNING, "NMI: power fail\n");
674 				sysbeep(880, hz);
675 				lastalert = time_second;
676 			}
677 			return;
678 #else /* !POWERFAIL_NMI */
679 			nmi_handle_intr(type, frame);
680 			return;
681 #endif /* POWERFAIL_NMI */
682 		}
683 
684 		trap_fatal(frame, eva);
685 		return;
686 	}
687 
688 	/* Translate fault for emulators (e.g. Linux) */
689 	if (*p->p_sysent->sv_transtrap != NULL)
690 		signo = (*p->p_sysent->sv_transtrap)(signo, type);
691 
692 	ksiginfo_init_trap(&ksi);
693 	ksi.ksi_signo = signo;
694 	ksi.ksi_code = ucode;
695 	ksi.ksi_addr = (void *)addr;
696 	ksi.ksi_trapno = type;
697 	if (uprintf_signal) {
698 		uprintf("pid %d comm %s: signal %d err %x code %d type %d "
699 		    "addr 0x%x ss 0x%04x esp 0x%08x cs 0x%04x eip 0x%08x "
700 		    "<%02x %02x %02x %02x %02x %02x %02x %02x>\n",
701 		    p->p_pid, p->p_comm, signo, frame->tf_err, ucode, type,
702 		    addr, frame->tf_ss, frame->tf_esp, frame->tf_cs,
703 		    frame->tf_eip,
704 		    fubyte((void *)(frame->tf_eip + 0)),
705 		    fubyte((void *)(frame->tf_eip + 1)),
706 		    fubyte((void *)(frame->tf_eip + 2)),
707 		    fubyte((void *)(frame->tf_eip + 3)),
708 		    fubyte((void *)(frame->tf_eip + 4)),
709 		    fubyte((void *)(frame->tf_eip + 5)),
710 		    fubyte((void *)(frame->tf_eip + 6)),
711 		    fubyte((void *)(frame->tf_eip + 7)));
712 	}
713 	KASSERT((read_eflags() & PSL_I) != 0, ("interrupts disabled"));
714 	trapsignal(td, &ksi);
715 
716 user:
717 	userret(td, frame);
718 	KASSERT(PCB_USER_FPU(td->td_pcb),
719 	    ("Return from trap with kernel FPU ctx leaked"));
720 }
721 
722 /*
723  * Handle all details of a page fault.
724  * Returns:
725  * -2 if the fault was caused by triggered workaround for Intel Pentium
726  *    0xf00f bug.
727  * -1 if this fault was fatal, typically from kernel mode
728  *    (cannot happen, but we need to return something).
729  * 0  if this fault was handled by updating either the user or kernel
730  *    page table, execution can continue.
731  * 1  if this fault was from usermode and it was not handled, a synchronous
732  *    signal should be delivered to the thread.  *signo returns the signal
733  *    number, *ucode gives si_code.
734  */
735 static int
trap_pfault(struct trapframe * frame,bool usermode,vm_offset_t eva,int * signo,int * ucode)736 trap_pfault(struct trapframe *frame, bool usermode, vm_offset_t eva,
737     int *signo, int *ucode)
738 {
739 	struct thread *td;
740 	struct proc *p;
741 	vm_map_t map;
742 	int rv;
743 	vm_prot_t ftype;
744 
745 	MPASS(!usermode || (signo != NULL && ucode != NULL));
746 
747 	td = curthread;
748 	p = td->td_proc;
749 
750 	if (__predict_false((td->td_pflags & TDP_NOFAULTING) != 0)) {
751 		/*
752 		 * Due to both processor errata and lazy TLB invalidation when
753 		 * access restrictions are removed from virtual pages, memory
754 		 * accesses that are allowed by the physical mapping layer may
755 		 * nonetheless cause one spurious page fault per virtual page.
756 		 * When the thread is executing a "no faulting" section that
757 		 * is bracketed by vm_fault_{disable,enable}_pagefaults(),
758 		 * every page fault is treated as a spurious page fault,
759 		 * unless it accesses the same virtual address as the most
760 		 * recent page fault within the same "no faulting" section.
761 		 */
762 		if (td->td_md.md_spurflt_addr != eva ||
763 		    (td->td_pflags & TDP_RESETSPUR) != 0) {
764 			/*
765 			 * Do nothing to the TLB.  A stale TLB entry is
766 			 * flushed automatically by a page fault.
767 			 */
768 			td->td_md.md_spurflt_addr = eva;
769 			td->td_pflags &= ~TDP_RESETSPUR;
770 			return (0);
771 		}
772 	} else {
773 		/*
774 		 * If we get a page fault while in a critical section, then
775 		 * it is most likely a fatal kernel page fault.  The kernel
776 		 * is already going to panic trying to get a sleep lock to
777 		 * do the VM lookup, so just consider it a fatal trap so the
778 		 * kernel can print out a useful trap message and even get
779 		 * to the debugger.
780 		 *
781 		 * If we get a page fault while holding a non-sleepable
782 		 * lock, then it is most likely a fatal kernel page fault.
783 		 * If WITNESS is enabled, then it's going to whine about
784 		 * bogus LORs with various VM locks, so just skip to the
785 		 * fatal trap handling directly.
786 		 */
787 		if (td->td_critnest != 0 ||
788 		    WITNESS_CHECK(WARN_SLEEPOK | WARN_GIANTOK, NULL,
789 		    "Kernel page fault") != 0) {
790 			trap_fatal(frame, eva);
791 			return (-1);
792 		}
793 	}
794 	if (eva >= PMAP_TRM_MIN_ADDRESS) {
795 		/*
796 		 * Don't allow user-mode faults in kernel address space.
797 		 * An exception:  if the faulting address is the invalid
798 		 * instruction entry in the IDT, then the Intel Pentium
799 		 * F00F bug workaround was triggered, and we need to
800 		 * treat it is as an illegal instruction, and not a page
801 		 * fault.
802 		 */
803 #if defined(I586_CPU) && !defined(NO_F00F_HACK)
804 		if ((eva == (unsigned int)&idt[6]) && has_f00f_bug) {
805 			*ucode = ILL_PRVOPC;
806 			*signo = SIGILL;
807 			return (-2);
808 		}
809 #endif
810 		if (usermode) {
811 			*signo = SIGSEGV;
812 			*ucode = SEGV_MAPERR;
813 			return (1);
814 		}
815 		trap_fatal(frame, eva);
816 		return (-1);
817 	} else {
818 		map = usermode ? &p->p_vmspace->vm_map : kernel_map;
819 
820 		/*
821 		 * Kernel cannot access a user-space address directly
822 		 * because user pages are not mapped.  Also, page
823 		 * faults must not be caused during the interrupts.
824 		 */
825 		if (!usermode && td->td_intr_nesting_level != 0) {
826 			trap_fatal(frame, eva);
827 			return (-1);
828 		}
829 	}
830 
831 	/*
832 	 * If the trap was caused by errant bits in the PTE then panic.
833 	 */
834 	if (frame->tf_err & PGEX_RSV) {
835 		trap_fatal(frame, eva);
836 		return (-1);
837 	}
838 
839 	/*
840 	 * PGEX_I is defined only if the execute disable bit capability is
841 	 * supported and enabled.
842 	 */
843 	if (frame->tf_err & PGEX_W)
844 		ftype = VM_PROT_WRITE;
845 	else if ((frame->tf_err & PGEX_I) && pg_nx != 0)
846 		ftype = VM_PROT_EXECUTE;
847 	else
848 		ftype = VM_PROT_READ;
849 
850 	/* Fault in the page. */
851 	rv = vm_fault_trap(map, eva, ftype, VM_FAULT_NORMAL, signo, ucode);
852 	if (rv == KERN_SUCCESS) {
853 #ifdef HWPMC_HOOKS
854 		if (ftype == VM_PROT_READ || ftype == VM_PROT_WRITE) {
855 			PMC_SOFT_CALL_TF( , , page_fault, all, frame);
856 			if (ftype == VM_PROT_READ)
857 				PMC_SOFT_CALL_TF( , , page_fault, read,
858 				    frame);
859 			else
860 				PMC_SOFT_CALL_TF( , , page_fault, write,
861 				    frame);
862 		}
863 #endif
864 		return (0);
865 	}
866 	if (usermode)
867 		return (1);
868 	if (td->td_intr_nesting_level == 0 &&
869 	    curpcb->pcb_onfault != NULL) {
870 		frame->tf_eip = (int)curpcb->pcb_onfault;
871 		return (0);
872 	}
873 	trap_fatal(frame, eva);
874 	return (-1);
875 }
876 
877 static void
trap_fatal(frame,eva)878 trap_fatal(frame, eva)
879 	struct trapframe *frame;
880 	vm_offset_t eva;
881 {
882 	int code, ss, esp;
883 	u_int type;
884 	struct soft_segment_descriptor softseg;
885 #ifdef KDB
886 	bool handled;
887 #endif
888 
889 	code = frame->tf_err;
890 	type = frame->tf_trapno;
891 	sdtossd(&gdt[IDXSEL(frame->tf_cs & 0xffff)].sd, &softseg);
892 
893 	printf("\n\nFatal trap %d: %s while in %s mode\n", type, trap_msg(type),
894 	    frame->tf_eflags & PSL_VM ? "vm86" :
895 	    ISPL(frame->tf_cs) == SEL_UPL ? "user" : "kernel");
896 #ifdef SMP
897 	/* two separate prints in case of a trap on an unmapped page */
898 	printf("cpuid = %d; ", PCPU_GET(cpuid));
899 	printf("apic id = %02x\n", PCPU_GET(apic_id));
900 #endif
901 	if (type == T_PAGEFLT) {
902 		printf("fault virtual address	= 0x%x\n", eva);
903 		printf("fault code		= %s %s%s, %s\n",
904 			code & PGEX_U ? "user" : "supervisor",
905 			code & PGEX_W ? "write" : "read",
906 			pg_nx != 0 ?
907 			(code & PGEX_I ? " instruction" : " data") :
908 			"",
909 			code & PGEX_RSV ? "reserved bits in PTE" :
910 			code & PGEX_P ? "protection violation" : "page not present");
911 	} else {
912 		printf("error code		= %#x\n", code);
913 	}
914 	printf("instruction pointer	= 0x%x:0x%x\n",
915 	       frame->tf_cs & 0xffff, frame->tf_eip);
916         if (TF_HAS_STACKREGS(frame)) {
917 		ss = frame->tf_ss & 0xffff;
918 		esp = frame->tf_esp;
919 	} else {
920 		ss = GSEL(GDATA_SEL, SEL_KPL);
921 		esp = (int)&frame->tf_esp;
922 	}
923 	printf("stack pointer	        = 0x%x:0x%x\n", ss, esp);
924 	printf("frame pointer	        = 0x%x:0x%x\n", ss, frame->tf_ebp);
925 	printf("code segment		= base 0x%x, limit 0x%x, type 0x%x\n",
926 	       softseg.ssd_base, softseg.ssd_limit, softseg.ssd_type);
927 	printf("			= DPL %d, pres %d, def32 %d, gran %d\n",
928 	       softseg.ssd_dpl, softseg.ssd_p, softseg.ssd_def32,
929 	       softseg.ssd_gran);
930 	printf("processor eflags	= ");
931 	if (frame->tf_eflags & PSL_T)
932 		printf("trace trap, ");
933 	if (frame->tf_eflags & PSL_I)
934 		printf("interrupt enabled, ");
935 	if (frame->tf_eflags & PSL_NT)
936 		printf("nested task, ");
937 	if (frame->tf_eflags & PSL_RF)
938 		printf("resume, ");
939 	if (frame->tf_eflags & PSL_VM)
940 		printf("vm86, ");
941 	printf("IOPL = %d\n", (frame->tf_eflags & PSL_IOPL) >> 12);
942 	printf("current process		= %d (%s)\n",
943 	    curproc->p_pid, curthread->td_name);
944 
945 #ifdef KDB
946 	if (debugger_on_trap) {
947 		kdb_why = KDB_WHY_TRAP;
948 		frame->tf_err = eva;	/* smuggle fault address to ddb */
949 		handled = kdb_trap(type, 0, frame);
950 		frame->tf_err = code;	/* restore error code */
951 		kdb_why = KDB_WHY_UNSET;
952 		if (handled)
953 			return;
954 	}
955 #endif
956 	printf("trap number		= %d\n", type);
957 	if (trap_msg(type) != NULL)
958 		panic("%s", trap_msg(type));
959 	else
960 		panic("unknown/reserved trap");
961 }
962 
963 #ifdef KDTRACE_HOOKS
/*
 * Run a userspace DTrace hook on behalf of a trapping thread.
 *
 * DTrace clears the hook pointer when no userspace probes are enabled, so
 * we must synchronize with DTrace to ensure that a trapping thread is able
 * to call the hook before it is cleared.  The pointer is therefore loaded
 * once, before interrupts are enabled, and only that snapshot is used.
 *
 * Returns true when a hook was installed and it returned 0; false when no
 * hook was installed or the hook returned non-zero.  Interrupts are
 * enabled in either case.
 */
static bool
trap_user_dtrace(struct trapframe *frame, int (**hookp)(struct trapframe *))
{
	int (*probe)(struct trapframe *);

	probe = atomic_load_ptr(hookp);
	enable_intr();
	return (probe != NULL && (*probe)(frame) == 0);
}
980 #endif
981 
982 /*
983  * Double fault handler. Called when a fault occurs while writing
984  * a frame for a trap/exception onto the stack. This usually occurs
985  * when the stack overflows (such is the case with infinite recursion,
986  * for example).
987  *
988  * XXX Note that the current PTD gets replaced by IdlePTD when the
989  * task switch occurs. This means that the stack that was active at
990  * the time of the double fault is not available at <kstack> unless
991  * the machine was idle when the double fault occurred. The downside
992  * of this is that "trace <ebp>" in ddb won't work.
 *
 * Sequence of events in the handler:
 *  1. On KDTRACE_HOOKS kernels, call dtrace_doubletrap_func if it is set.
 *  2. Print the eip, esp and ebp values read through
 *     PCPU_GET(common_tssp) (presumably the state saved by the task
 *     switch mentioned above; confirm against the TSS setup code).
 *  3. On SMP kernels, print this CPU's cpuid and APIC id.
 *  4. Call panic("double fault").
 *
 * Takes no arguments and returns nothing; the last statement is the
 * panic() call.
993  */
994 void
dblfault_handler(void)995 dblfault_handler(void)
996 {
997 #ifdef KDTRACE_HOOKS
	/* Call the DTrace double-trap hook, if one is set, before reporting. */
998 	if (dtrace_doubletrap_func != NULL)
999 		(*dtrace_doubletrap_func)();
1000 #endif
	/* Each register is fetched and printed by its own printf() call. */
1001 	printf("\nFatal double fault:\n");
1002 	printf("eip = 0x%x\n", PCPU_GET(common_tssp)->tss_eip);
1003 	printf("esp = 0x%x\n", PCPU_GET(common_tssp)->tss_esp);
1004 	printf("ebp = 0x%x\n", PCPU_GET(common_tssp)->tss_ebp);
1005 #ifdef SMP
1006 	/* two separate prints in case of a trap on an unmapped page */
1007 	printf("cpuid = %d; ", PCPU_GET(cpuid));
1008 	printf("apic id = %02x\n", PCPU_GET(apic_id));
1009 #endif
	/* Nothing is recovered here: the handler ends by panicking. */
1010 	panic("double fault");
1011 }
1012 
1013 int
cpu_fetch_syscall_args(struct thread * td)1014 cpu_fetch_syscall_args(struct thread *td)
1015 {
1016 	struct proc *p;
1017 	struct trapframe *frame;
1018 	struct syscall_args *sa;
1019 	caddr_t params;
1020 	long tmp;
1021 	int error;
1022 #ifdef COMPAT_43
1023 	u_int32_t eip;
1024 	int cs;
1025 #endif
1026 
1027 	p = td->td_proc;
1028 	frame = td->td_frame;
1029 	sa = &td->td_sa;
1030 
1031 #ifdef COMPAT_43
1032 	if (__predict_false(frame->tf_cs == 7 && frame->tf_eip == 2)) {
1033 		/*
1034 		 * In lcall $7,$0 after int $0x80.  Convert the user
1035 		 * frame to what it would be for a direct int 0x80 instead
1036 		 * of lcall $7,$0, by popping the lcall return address.
1037 		 */
1038 		error = fueword32((void *)frame->tf_esp, &eip);
1039 		if (error == -1)
1040 			return (EFAULT);
1041 		cs = fuword16((void *)(frame->tf_esp + sizeof(u_int32_t)));
1042 		if (cs == -1)
1043 			return (EFAULT);
1044 
1045 		/*
1046 		 * Unwind in-kernel frame after all stack frame pieces
1047 		 * were successfully read.
1048 		 */
1049 		frame->tf_eip = eip;
1050 		frame->tf_cs = cs;
1051 		frame->tf_esp += 2 * sizeof(u_int32_t);
1052 		frame->tf_err = 7;	/* size of lcall $7,$0 */
1053 	}
1054 #endif
1055 
1056 	sa->code = frame->tf_eax;
1057 	params = (caddr_t)frame->tf_esp + sizeof(uint32_t);
1058 
1059 	/*
1060 	 * Need to check if this is a 32 bit or 64 bit syscall.
1061 	 */
1062 	if (sa->code == SYS_syscall) {
1063 		/*
1064 		 * Code is first argument, followed by actual args.
1065 		 */
1066 		error = fueword(params, &tmp);
1067 		if (error == -1)
1068 			return (EFAULT);
1069 		sa->code = tmp;
1070 		params += sizeof(uint32_t);
1071 	} else if (sa->code == SYS___syscall) {
1072 		/*
1073 		 * Like syscall, but code is a quad, so as to maintain
1074 		 * quad alignment for the rest of the arguments.
1075 		 */
1076 		error = fueword(params, &tmp);
1077 		if (error == -1)
1078 			return (EFAULT);
1079 		sa->code = tmp;
1080 		params += sizeof(quad_t);
1081 	}
1082 
1083  	if (sa->code >= p->p_sysent->sv_size)
1084  		sa->callp = &p->p_sysent->sv_table[0];
1085   	else
1086  		sa->callp = &p->p_sysent->sv_table[sa->code];
1087 
1088 	if (params != NULL && sa->callp->sy_narg != 0)
1089 		error = copyin(params, (caddr_t)sa->args,
1090 		    (u_int)(sa->callp->sy_narg * sizeof(uint32_t)));
1091 	else
1092 		error = 0;
1093 
1094 	if (error == 0) {
1095 		td->td_retval[0] = 0;
1096 		td->td_retval[1] = frame->tf_edx;
1097 	}
1098 
1099 	return (error);
1100 }
1101 
1102 #include "../../kern/subr_syscall.c"
1103 
1104 /*
1105  * syscall - system call request C handler.  A system call is
1106  * essentially treated as a trap by reusing the frame layout.
1107  */
1108 void
syscall(struct trapframe * frame)1109 syscall(struct trapframe *frame)
1110 {
1111 	struct thread *td;
1112 	register_t orig_tf_eflags;
1113 	ksiginfo_t ksi;
1114 
1115 #ifdef DIAGNOSTIC
1116 	if (!(TRAPF_USERMODE(frame) &&
1117 	    (curpcb->pcb_flags & PCB_VM86CALL) == 0)) {
1118 		panic("syscall");
1119 		/* NOT REACHED */
1120 	}
1121 #endif
1122 	orig_tf_eflags = frame->tf_eflags;
1123 
1124 	td = curthread;
1125 	td->td_frame = frame;
1126 
1127 	syscallenter(td);
1128 
1129 	/*
1130 	 * Traced syscall.
1131 	 */
1132 	if ((orig_tf_eflags & PSL_T) && !(orig_tf_eflags & PSL_VM)) {
1133 		frame->tf_eflags &= ~PSL_T;
1134 		ksiginfo_init_trap(&ksi);
1135 		ksi.ksi_signo = SIGTRAP;
1136 		ksi.ksi_code = TRAP_TRACE;
1137 		ksi.ksi_addr = (void *)frame->tf_eip;
1138 		trapsignal(td, &ksi);
1139 	}
1140 
1141 	KASSERT(PCB_USER_FPU(td->td_pcb),
1142 	    ("System call %s returning with kernel FPU ctx leaked",
1143 	     syscallname(td->td_proc, td->td_sa.code)));
1144 	KASSERT(td->td_pcb->pcb_save == get_pcb_user_save_td(td),
1145 	    ("System call %s returning with mangled pcb_save",
1146 	     syscallname(td->td_proc, td->td_sa.code)));
1147 
1148 	syscallret(td);
1149 }
1150