/*-
 * SPDX-License-Identifier: BSD-4-Clause
 *
 * Copyright (c) 2003 Peter Wemm.
 * Copyright (c) 1992 Terrence R. Lambert.
 * Copyright (c) 1982, 1987, 1990 The Regents of the University of California.
 * All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * William Jolitz.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	from: @(#)machdep.c	7.4 (Berkeley) 6/3/91
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_atpic.h"
#include "opt_cpu.h"
#include "opt_ddb.h"
#include "opt_inet.h"
#include "opt_isa.h"
#include "opt_kstack_pages.h"
#include "opt_maxmem.h"
#include "opt_mp_watchdog.h"
#include "opt_pci.h"
#include "opt_platform.h"
#include "opt_sched.h"

#include <sys/param.h>
#include <sys/proc.h>
#include <sys/systm.h>
#include <sys/bio.h>
#include <sys/buf.h>
#include <sys/bus.h>
#include <sys/callout.h>
#include <sys/cons.h>
#include <sys/cpu.h>
#include <sys/csan.h>
#include <sys/efi.h>
#include <sys/eventhandler.h>
#include <sys/exec.h>
#include <sys/imgact.h>
#include <sys/kdb.h>
#include <sys/kernel.h>
#include <sys/ktr.h>
#include <sys/linker.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/memrange.h>
#include <sys/msgbuf.h>
#include <sys/mutex.h>
#include <sys/pcpu.h>
#include <sys/ptrace.h>
#include <sys/reboot.h>
#include <sys/rwlock.h>
#include <sys/sched.h>
#include <sys/signalvar.h>
#ifdef SMP
#include <sys/smp.h>
#endif
#include <sys/syscallsubr.h>
#include <sys/sysctl.h>
#include <sys/sysent.h>
#include <sys/sysproto.h>
#include <sys/ucontext.h>
#include <sys/vmmeter.h>

#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/vm_extern.h>
#include <vm/vm_kern.h>
#include <vm/vm_page.h>
#include <vm/vm_map.h>
#include <vm/vm_object.h>
#include <vm/vm_pager.h>
#include <vm/vm_phys.h>
#include <vm/vm_dumpset.h>

#ifdef DDB
#ifndef KDB
#error KDB must be enabled in order for DDB to work!
#endif
#include <ddb/ddb.h>
#include <ddb/db_sym.h>
#endif

#include <net/netisr.h>

#include <machine/clock.h>
#include <machine/cpu.h>
#include <machine/cputypes.h>
#include <machine/frame.h>
#include <machine/intr_machdep.h>
#include <x86/mca.h>
#include <machine/md_var.h>
#include <machine/metadata.h>
#include <machine/mp_watchdog.h>
#include <machine/pc/bios.h>
#include <machine/pcb.h>
#include <machine/proc.h>
#include <machine/reg.h>
#include <machine/sigframe.h>
#include <machine/specialreg.h>
#include <machine/trap.h>
#include <machine/tss.h>
#include <x86/ucode.h>
#include <x86/ifunc.h>
#ifdef SMP
#include <machine/smp.h>
#endif
#ifdef FDT
#include <x86/fdt.h>
#endif

#ifdef DEV_ATPIC
#include <x86/isa/icu.h>
#else
#include <x86/apicvar.h>
#endif

#include <isa/isareg.h>
#include <isa/rtc.h>
#include <x86/init.h>

/* Sanity check for __curthread() */
CTASSERT(offsetof(struct pcpu, pc_curthread) == 0);

/*
 * The PTI trampoline stack needs enough space for a hardware trapframe and a
 * couple of scratch registers, as well as the trapframe left behind after an
 * iret fault.
 */
CTASSERT(PC_PTI_STACK_SZ * sizeof(register_t) >= 2 * sizeof(struct pti_frame) -
    offsetof(struct pti_frame, pti_rip));

extern u_int64_t hammer_time(u_int64_t, u_int64_t);

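/*
 * sigreturn() accepts a user-supplied trapframe only if %cs stays at user
 * privilege and rflags changes nothing outside the PSL_USERCHANGE bits.
 */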
#define	CS_SECURE(cs)		(ISPL(cs) == SEL_UPL)
#define	EFL_SECURE(ef, oef)	((((ef) ^ (oef)) & ~PSL_USERCHANGE) == 0)

static void cpu_startup(void *);
static void get_fpcontext(struct thread *td, mcontext_t *mcp,
    char *xfpusave, size_t xfpusave_len);
static int  set_fpcontext(struct thread *td, mcontext_t *mcp,
    char *xfpustate, size_t xfpustate_len);
SYSINIT(cpu, SI_SUB_CPU, SI_ORDER_FIRST, cpu_startup, NULL);

/* Preload data parse function */
static caddr_t native_parse_preload_data(u_int64_t);

/* Native function to fetch and parse the e820 map */
static void native_parse_memmap(caddr_t, vm_paddr_t *, int *);

/* Default init_ops implementation. */
struct init_ops init_ops = {
	.parse_preload_data =	native_parse_preload_data,
	.early_clock_source_init =	i8254_init,
	.early_delay =			i8254_delay,
	.parse_memmap =			native_parse_memmap,
#ifdef SMP
	.mp_bootaddress =		mp_bootaddress,
	.start_all_aps =		native_start_all_aps,
#endif
#ifdef DEV_PCI
	.msi_init =			msi_init,
#endif
};

/*
 * Physical address of the EFI System Table. Stashed from the metadata hints
 * passed into the kernel and used by the EFI code to call runtime services.
 */
vm_paddr_t efi_systbl_phys;

/* Intel ICH registers */
#define ICH_PMBASE	0x400
#define ICH_SMI_EN	(ICH_PMBASE + 0x30)
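/* Bit 3 of SMI_EN is LEGACY_USB_EN; cpu_startup() clears it on MacBooks. */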

int	_udatasel, _ucodesel, _ucode32sel, _ufssel, _ugssel;

int cold = 1;

long Maxmem = 0;
long realmem = 0;

struct kva_md_info kmi;

static struct trapframe proc0_tf;
struct region_descriptor r_idt;

struct pcpu *__pcpu;
struct pcpu temp_bsp_pcpu;

struct mtx icu_lock;

struct mem_range_softc mem_range_softc;

struct mtx dt_lock;	/* lock for GDT and LDT */

void (*vmm_resume_p)(void);

static void
cpu_startup(dummy)
	void *dummy;
{
	uintmax_t memsize;
	char *sysenv;

	/*
	 * On MacBooks, we need to disallow the legacy USB circuit to
	 * generate an SMI# because this can cause several problems,
	 * namely: incorrect CPU frequency detection and failure to
	 * start the APs.
	 * We do this by disabling a bit in the SMI_EN (SMI Control and
	 * Enable register) of the Intel ICH LPC Interface Bridge.
	 */
	sysenv = kern_getenv("smbios.system.product");
	if (sysenv != NULL) {
		if (strncmp(sysenv, "MacBook1,1", 10) == 0 ||
		    strncmp(sysenv, "MacBook3,1", 10) == 0 ||
		    strncmp(sysenv, "MacBook4,1", 10) == 0 ||
		    strncmp(sysenv, "MacBookPro1,1", 13) == 0 ||
		    strncmp(sysenv, "MacBookPro1,2", 13) == 0 ||
		    strncmp(sysenv, "MacBookPro3,1", 13) == 0 ||
		    strncmp(sysenv, "MacBookPro4,1", 13) == 0 ||
		    strncmp(sysenv, "Macmini1,1", 10) == 0) {
			if (bootverbose)
				printf("Disabling LEGACY_USB_EN bit on "
				    "Intel ICH.\n");
			outl(ICH_SMI_EN, inl(ICH_SMI_EN) & ~0x8);
		}
		freeenv(sysenv);
	}

	/*
	 * Good {morning,afternoon,evening,night}.
	 */
	startrtclock();
	printcpuinfo();

	/*
	 * Display physical memory if SMBIOS reports reasonable amount.
	 */
	memsize = 0;
	sysenv = kern_getenv("smbios.memory.enabled");
	if (sysenv != NULL) {
		memsize = (uintmax_t)strtoul(sysenv, (char **)NULL, 10) << 10;
		freeenv(sysenv);
	}
	if (memsize < ptoa((uintmax_t)vm_free_count()))
		memsize = ptoa((uintmax_t)Maxmem);
	printf("real memory  = %ju (%ju MB)\n", memsize, memsize >> 20);
	realmem = atop(memsize);

	/*
	 * Display any holes after the first chunk of extended memory.
	 */
	if (bootverbose) {
		int indx;

		printf("Physical memory chunk(s):\n");
		for (indx = 0; phys_avail[indx + 1] != 0; indx += 2) {
			vm_paddr_t size;

			size = phys_avail[indx + 1] - phys_avail[indx];
			printf(
			    "0x%016jx - 0x%016jx, %ju bytes (%ju pages)\n",
			    (uintmax_t)phys_avail[indx],
			    (uintmax_t)phys_avail[indx + 1] - 1,
			    (uintmax_t)size, (uintmax_t)size / PAGE_SIZE);
		}
	}

	vm_ksubmap_init(&kmi);

	printf("avail memory = %ju (%ju MB)\n",
	    ptoa((uintmax_t)vm_free_count()),
	    ptoa((uintmax_t)vm_free_count()) / 1048576);
#ifdef DEV_PCI
	if (bootverbose && intel_graphics_stolen_base != 0)
		printf("intel stolen mem: base %#jx size %ju MB\n",
		    (uintmax_t)intel_graphics_stolen_base,
		    (uintmax_t)intel_graphics_stolen_size / 1024 / 1024);
#endif

	/*
	 * Set up buffers, so they can be used to read disk labels.
	 */
	bufinit();
	vm_pager_bufferinit();

	cpu_setregs();
}

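/* Perform the late ifunc relocation pass once early CPU setup is complete. */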
static void
late_ifunc_resolve(void *dummy __unused)
{
	link_elf_late_ireloc();
}
SYSINIT(late_ifunc_resolve, SI_SUB_CPU, SI_ORDER_ANY, late_ifunc_resolve, NULL);

/*
 * Send an interrupt to process.
 *
 * Stack is set up to allow sigcode stored
 * at top to call routine, followed by call
 * to sigreturn routine below.  After sigreturn
 * resets the signal mask, the stack, and the
 * frame pointer, it returns to the user
 * specified pc, psl.
 */
void
sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask)
{
	struct sigframe sf, *sfp;
	struct pcb *pcb;
	struct proc *p;
	struct thread *td;
	struct sigacts *psp;
	char *sp;
	struct trapframe *regs;
	char *xfpusave;
	size_t xfpusave_len;
	int sig;
	int oonstack;

	td = curthread;
	pcb = td->td_pcb;
	p = td->td_proc;
	PROC_LOCK_ASSERT(p, MA_OWNED);
	sig = ksi->ksi_signo;
	psp = p->p_sigacts;
	mtx_assert(&psp->ps_mtx, MA_OWNED);
	regs = td->td_frame;
	oonstack = sigonstack(regs->tf_rsp);

	if (cpu_max_ext_state_size > sizeof(struct savefpu) && use_xsave) {
		xfpusave_len = cpu_max_ext_state_size - sizeof(struct savefpu);
		xfpusave = __builtin_alloca(xfpusave_len);
	} else {
		xfpusave_len = 0;
		xfpusave = NULL;
	}

	/* Save user context. */
	bzero(&sf, sizeof(sf));
	sf.sf_uc.uc_sigmask = *mask;
	sf.sf_uc.uc_stack = td->td_sigstk;
	sf.sf_uc.uc_stack.ss_flags = (td->td_pflags & TDP_ALTSTACK)
	    ? ((oonstack) ? SS_ONSTACK : 0) : SS_DISABLE;
	sf.sf_uc.uc_mcontext.mc_onstack = (oonstack) ? 1 : 0;
	bcopy(regs, &sf.sf_uc.uc_mcontext.mc_rdi, sizeof(*regs));
	sf.sf_uc.uc_mcontext.mc_len = sizeof(sf.sf_uc.uc_mcontext); /* magic */
	get_fpcontext(td, &sf.sf_uc.uc_mcontext, xfpusave, xfpusave_len);
	fpstate_drop(td);
	update_pcb_bases(pcb);
	sf.sf_uc.uc_mcontext.mc_fsbase = pcb->pcb_fsbase;
	sf.sf_uc.uc_mcontext.mc_gsbase = pcb->pcb_gsbase;
	bzero(sf.sf_uc.uc_mcontext.mc_spare,
	    sizeof(sf.sf_uc.uc_mcontext.mc_spare));

	/* Allocate space for the signal handler context. */
	if ((td->td_pflags & TDP_ALTSTACK) != 0 && !oonstack &&
	    SIGISMEMBER(psp->ps_sigonstack, sig)) {
		sp = (char *)td->td_sigstk.ss_sp + td->td_sigstk.ss_size;
#if defined(COMPAT_43)
		td->td_sigstk.ss_flags |= SS_ONSTACK;
#endif
	} else
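		/* Skip the 128-byte red zone the amd64 ABI keeps below %rsp. */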
		sp = (char *)regs->tf_rsp - 128;
	if (xfpusave != NULL) {
		sp -= xfpusave_len;
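		/* The XSAVE extended state area must be 64-byte aligned. */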
		sp = (char *)((unsigned long)sp & ~0x3Ful);
		sf.sf_uc.uc_mcontext.mc_xfpustate = (register_t)sp;
	}
	sp -= sizeof(struct sigframe);
	/* Align to 16 bytes. */
	sfp = (struct sigframe *)((unsigned long)sp & ~0xFul);

	/* Build the argument list for the signal handler. */
	regs->tf_rdi = sig;			/* arg 1 in %rdi */
	regs->tf_rdx = (register_t)&sfp->sf_uc;	/* arg 3 in %rdx */
	bzero(&sf.sf_si, sizeof(sf.sf_si));
	if (SIGISMEMBER(psp->ps_siginfo, sig)) {
		/* Signal handler installed with SA_SIGINFO. */
		regs->tf_rsi = (register_t)&sfp->sf_si;	/* arg 2 in %rsi */
		sf.sf_ahu.sf_action = (__siginfohandler_t *)catcher;

		/* Fill in POSIX parts */
		sf.sf_si = ksi->ksi_info;
		sf.sf_si.si_signo = sig; /* maybe a translated signal */
		regs->tf_rcx = (register_t)ksi->ksi_addr; /* arg 4 in %rcx */
	} else {
		/* Old FreeBSD-style arguments. */
		regs->tf_rsi = ksi->ksi_code;	/* arg 2 in %rsi */
		regs->tf_rcx = (register_t)ksi->ksi_addr; /* arg 4 in %rcx */
		sf.sf_ahu.sf_handler = catcher;
	}
	mtx_unlock(&psp->ps_mtx);
	PROC_UNLOCK(p);

	/*
	 * Copy the sigframe out to the user's stack.
	 */
	if (copyout(&sf, sfp, sizeof(*sfp)) != 0 ||
	    (xfpusave != NULL && copyout(xfpusave,
	    (void *)sf.sf_uc.uc_mcontext.mc_xfpustate, xfpusave_len)
	    != 0)) {
#ifdef DEBUG
		printf("process %ld has trashed its stack\n", (long)p->p_pid);
#endif
		PROC_LOCK(p);
		sigexit(td, SIGILL);
	}

	regs->tf_rsp = (long)sfp;
	regs->tf_rip = p->p_sysent->sv_sigcode_base;
	regs->tf_rflags &= ~(PSL_T | PSL_D);
	regs->tf_cs = _ucodesel;
	regs->tf_ds = _udatasel;
	regs->tf_ss = _udatasel;
	regs->tf_es = _udatasel;
	regs->tf_fs = _ufssel;
	regs->tf_gs = _ugssel;
	regs->tf_flags = TF_HASSEGS;
	PROC_LOCK(p);
	mtx_lock(&psp->ps_mtx);
}

/*
 * System call to cleanup state after a signal
 * has been taken.  Reset signal mask and
 * stack state from context left by sendsig (above).
 * Return to previous pc and psl as specified by
 * context left by sendsig. Check carefully to
 * make sure that the user has not modified the
 * state to gain improper privileges.
 *
 * MPSAFE
 */
int
sys_sigreturn(td, uap)
	struct thread *td;
	struct sigreturn_args /* {
		const struct __ucontext *sigcntxp;
	} */ *uap;
{
	ucontext_t uc;
	struct pcb *pcb;
	struct proc *p;
	struct trapframe *regs;
	ucontext_t *ucp;
	char *xfpustate;
	size_t xfpustate_len;
	long rflags;
	int cs, error, ret;
	ksiginfo_t ksi;

	pcb = td->td_pcb;
	p = td->td_proc;

	error = copyin(uap->sigcntxp, &uc, sizeof(uc));
	if (error != 0) {
		uprintf("pid %d (%s): sigreturn copyin failed\n",
		    p->p_pid, td->td_name);
		return (error);
	}
	ucp = &uc;
	if ((ucp->uc_mcontext.mc_flags & ~_MC_FLAG_MASK) != 0) {
		uprintf("pid %d (%s): sigreturn mc_flags %x\n", p->p_pid,
		    td->td_name, ucp->uc_mcontext.mc_flags);
		return (EINVAL);
	}
	regs = td->td_frame;
	rflags = ucp->uc_mcontext.mc_rflags;
	/*
	 * Don't allow users to change privileged or reserved flags.
	 */
	if (!EFL_SECURE(rflags, regs->tf_rflags)) {
		uprintf("pid %d (%s): sigreturn rflags = 0x%lx\n", p->p_pid,
		    td->td_name, rflags);
		return (EINVAL);
	}

	/*
	 * Don't allow users to load a valid privileged %cs.  Let the
	 * hardware check for invalid selectors, excess privilege in
	 * other selectors, invalid %eip's and invalid %esp's.
	 */
	cs = ucp->uc_mcontext.mc_cs;
	if (!CS_SECURE(cs)) {
		uprintf("pid %d (%s): sigreturn cs = 0x%x\n", p->p_pid,
		    td->td_name, cs);
		ksiginfo_init_trap(&ksi);
		ksi.ksi_signo = SIGBUS;
		ksi.ksi_code = BUS_OBJERR;
		ksi.ksi_trapno = T_PROTFLT;
		ksi.ksi_addr = (void *)regs->tf_rip;
		trapsignal(td, &ksi);
		return (EINVAL);
	}

	if ((uc.uc_mcontext.mc_flags & _MC_HASFPXSTATE) != 0) {
		xfpustate_len = uc.uc_mcontext.mc_xfpustate_len;
		if (xfpustate_len > cpu_max_ext_state_size -
		    sizeof(struct savefpu)) {
			uprintf("pid %d (%s): sigreturn xfpustate_len = 0x%zx\n",
			    p->p_pid, td->td_name, xfpustate_len);
			return (EINVAL);
		}
		xfpustate = __builtin_alloca(xfpustate_len);
		error = copyin((const void *)uc.uc_mcontext.mc_xfpustate,
		    xfpustate, xfpustate_len);
		if (error != 0) {
			uprintf(
	"pid %d (%s): sigreturn copying xfpustate failed\n",
			    p->p_pid, td->td_name);
			return (error);
		}
	} else {
		xfpustate = NULL;
		xfpustate_len = 0;
	}
	ret = set_fpcontext(td, &ucp->uc_mcontext, xfpustate, xfpustate_len);
	if (ret != 0) {
		uprintf("pid %d (%s): sigreturn set_fpcontext err %d\n",
		    p->p_pid, td->td_name, ret);
		return (ret);
	}
	bcopy(&ucp->uc_mcontext.mc_rdi, regs, sizeof(*regs));
	update_pcb_bases(pcb);
	pcb->pcb_fsbase = ucp->uc_mcontext.mc_fsbase;
	pcb->pcb_gsbase = ucp->uc_mcontext.mc_gsbase;

#if defined(COMPAT_43)
	if (ucp->uc_mcontext.mc_onstack & 1)
		td->td_sigstk.ss_flags |= SS_ONSTACK;
	else
		td->td_sigstk.ss_flags &= ~SS_ONSTACK;
#endif

	kern_sigprocmask(td, SIG_SETMASK, &ucp->uc_sigmask, NULL, 0);
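	/* EJUSTRETURN: the syscall path must not touch the restored frame. */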
	return (EJUSTRETURN);
}

#ifdef COMPAT_FREEBSD4
int
freebsd4_sigreturn(struct thread *td, struct freebsd4_sigreturn_args *uap)
{

	return sys_sigreturn(td, (struct sigreturn_args *)uap);
}
#endif

/*
 * Reset registers to default values on exec.
 */
void
exec_setregs(struct thread *td, struct image_params *imgp, uintptr_t stack)
{
	struct trapframe *regs;
	struct pcb *pcb;
	register_t saved_rflags;

	regs = td->td_frame;
	pcb = td->td_pcb;

	if (td->td_proc->p_md.md_ldt != NULL)
		user_ldt_free(td);

	update_pcb_bases(pcb);
	pcb->pcb_fsbase = 0;
	pcb->pcb_gsbase = 0;
	clear_pcb_flags(pcb, PCB_32BIT);
	pcb->pcb_initial_fpucw = __INITIAL_FPUCW__;

	saved_rflags = regs->tf_rflags & PSL_T;
	bzero((char *)regs, sizeof(struct trapframe));
	regs->tf_rip = imgp->entry_addr;
	regs->tf_rsp = ((stack - 8) & ~0xFul) + 8;
	regs->tf_rdi = stack;		/* argv */
	regs->tf_rflags = PSL_USER | saved_rflags;
	regs->tf_ss = _udatasel;
	regs->tf_cs = _ucodesel;
	regs->tf_ds = _udatasel;
	regs->tf_es = _udatasel;
	regs->tf_fs = _ufssel;
	regs->tf_gs = _ugssel;
	regs->tf_flags = TF_HASSEGS;

	/*
	 * Reset the hardware debug registers if they were in use.
	 * They won't have any meaning for the newly exec'd process.
	 */
	if (pcb->pcb_flags & PCB_DBREGS) {
		pcb->pcb_dr0 = 0;
		pcb->pcb_dr1 = 0;
		pcb->pcb_dr2 = 0;
		pcb->pcb_dr3 = 0;
		pcb->pcb_dr6 = 0;
		pcb->pcb_dr7 = 0;
		if (pcb == curpcb) {
			/*
			 * Clear the debug registers on the running
			 * CPU, otherwise they will end up affecting
			 * the next process we switch to.
			 */
			reset_dbregs();
		}
		clear_pcb_flags(pcb, PCB_DBREGS);
	}

	/*
	 * Drop the FP state if we hold it, so that the process gets a
	 * clean FP state if it uses the FPU again.
	 */
	fpstate_drop(td);
}

void
cpu_setregs(void)
{
	register_t cr0;

	cr0 = rcr0();
	/*
	 * CR0_MP, CR0_NE and CR0_TS are also set by npx_probe() for the
	 * BSP.  See the comments there about why we set them.
	 */
	cr0 |= CR0_MP | CR0_NE | CR0_TS | CR0_WP | CR0_AM;
	load_cr0(cr0);
}

/*
 * Initialize amd64 and configure to run kernel
 */

/*
 * Initialize segments & interrupt table
 */
static struct gate_descriptor idt0[NIDT];
struct gate_descriptor *idt = &idt0[0];	/* interrupt descriptor table */

static char dblfault_stack[DBLFAULT_STACK_SIZE] __aligned(16);
static char mce0_stack[MCE_STACK_SIZE] __aligned(16);
static char nmi0_stack[NMI_STACK_SIZE] __aligned(16);
static char dbg0_stack[DBG_STACK_SIZE] __aligned(16);
CTASSERT(sizeof(struct nmi_pcpu) == 16);

/*
 * Software prototypes -- in more palatable form.
 *
 * Keep GUFS32, GUGS32, GUCODE32 and GUDATA at the same
 * slots as corresponding segments for i386 kernel.
 */
struct soft_segment_descriptor gdt_segs[] = {
/* GNULL_SEL	0 Null Descriptor */
{	.ssd_base = 0x0,
	.ssd_limit = 0x0,
	.ssd_type = 0,
	.ssd_dpl = 0,
	.ssd_p = 0,
	.ssd_long = 0,
	.ssd_def32 = 0,
	.ssd_gran = 0		},
/* GNULL2_SEL	1 Null Descriptor */
{	.ssd_base = 0x0,
	.ssd_limit = 0x0,
	.ssd_type = 0,
	.ssd_dpl = 0,
	.ssd_p = 0,
	.ssd_long = 0,
	.ssd_def32 = 0,
	.ssd_gran = 0		},
/* GUFS32_SEL	2 32 bit %gs Descriptor for user */
{	.ssd_base = 0x0,
	.ssd_limit = 0xfffff,
	.ssd_type = SDT_MEMRWA,
	.ssd_dpl = SEL_UPL,
	.ssd_p = 1,
	.ssd_long = 0,
	.ssd_def32 = 1,
	.ssd_gran = 1		},
/* GUGS32_SEL	3 32 bit %fs Descriptor for user */
{	.ssd_base = 0x0,
	.ssd_limit = 0xfffff,
	.ssd_type = SDT_MEMRWA,
	.ssd_dpl = SEL_UPL,
	.ssd_p = 1,
	.ssd_long = 0,
	.ssd_def32 = 1,
	.ssd_gran = 1		},
/* GCODE_SEL	4 Code Descriptor for kernel */
{	.ssd_base = 0x0,
	.ssd_limit = 0xfffff,
	.ssd_type = SDT_MEMERA,
	.ssd_dpl = SEL_KPL,
	.ssd_p = 1,
	.ssd_long = 1,
	.ssd_def32 = 0,
	.ssd_gran = 1		},
/* GDATA_SEL	5 Data Descriptor for kernel */
{	.ssd_base = 0x0,
	.ssd_limit = 0xfffff,
	.ssd_type = SDT_MEMRWA,
	.ssd_dpl = SEL_KPL,
	.ssd_p = 1,
	.ssd_long = 1,
	.ssd_def32 = 0,
	.ssd_gran = 1		},
/* GUCODE32_SEL	6 32 bit Code Descriptor for user */
{	.ssd_base = 0x0,
	.ssd_limit = 0xfffff,
	.ssd_type = SDT_MEMERA,
	.ssd_dpl = SEL_UPL,
	.ssd_p = 1,
	.ssd_long = 0,
	.ssd_def32 = 1,
	.ssd_gran = 1		},
/* GUDATA_SEL	7 32/64 bit Data Descriptor for user */
{	.ssd_base = 0x0,
	.ssd_limit = 0xfffff,
	.ssd_type = SDT_MEMRWA,
	.ssd_dpl = SEL_UPL,
	.ssd_p = 1,
	.ssd_long = 0,
	.ssd_def32 = 1,
	.ssd_gran = 1		},
/* GUCODE_SEL	8 64 bit Code Descriptor for user */
{	.ssd_base = 0x0,
	.ssd_limit = 0xfffff,
	.ssd_type = SDT_MEMERA,
	.ssd_dpl = SEL_UPL,
	.ssd_p = 1,
	.ssd_long = 1,
	.ssd_def32 = 0,
	.ssd_gran = 1		},
/* GPROC0_SEL	9 Proc 0 Tss Descriptor */
{	.ssd_base = 0x0,
	.ssd_limit = sizeof(struct amd64tss) + IOPERM_BITMAP_SIZE - 1,
	.ssd_type = SDT_SYSTSS,
	.ssd_dpl = SEL_KPL,
	.ssd_p = 1,
	.ssd_long = 0,
	.ssd_def32 = 0,
	.ssd_gran = 0		},
/* Actually, the TSS is a system descriptor which is double size */
{	.ssd_base = 0x0,
	.ssd_limit = 0x0,
	.ssd_type = 0,
	.ssd_dpl = 0,
	.ssd_p = 0,
	.ssd_long = 0,
	.ssd_def32 = 0,
	.ssd_gran = 0		},
/* GUSERLDT_SEL	11 LDT Descriptor */
{	.ssd_base = 0x0,
	.ssd_limit = 0x0,
	.ssd_type = 0,
	.ssd_dpl = 0,
	.ssd_p = 0,
	.ssd_long = 0,
	.ssd_def32 = 0,
	.ssd_gran = 0		},
/* GUSERLDT_SEL	12 LDT Descriptor, double size */
{	.ssd_base = 0x0,
	.ssd_limit = 0x0,
	.ssd_type = 0,
	.ssd_dpl = 0,
	.ssd_p = 0,
	.ssd_long = 0,
	.ssd_def32 = 0,
	.ssd_gran = 0		},
};
_Static_assert(nitems(gdt_segs) == NGDT, "Stale NGDT");

void
setidt(int idx, inthand_t *func, int typ, int dpl, int ist)
{
	struct gate_descriptor *ip;

	ip = idt + idx;
	ip->gd_looffset = (uintptr_t)func;
	ip->gd_selector = GSEL(GCODE_SEL, SEL_KPL);
	ip->gd_ist = ist;
	ip->gd_xx = 0;
	ip->gd_type = typ;
	ip->gd_dpl = dpl;
	ip->gd_p = 1;
	ip->gd_hioffset = ((uintptr_t)func) >> 16;
}

extern inthand_t
	IDTVEC(div), IDTVEC(dbg), IDTVEC(nmi), IDTVEC(bpt), IDTVEC(ofl),
	IDTVEC(bnd), IDTVEC(ill), IDTVEC(dna), IDTVEC(fpusegm),
	IDTVEC(tss), IDTVEC(missing), IDTVEC(stk), IDTVEC(prot),
	IDTVEC(page), IDTVEC(mchk), IDTVEC(rsvd), IDTVEC(fpu), IDTVEC(align),
	IDTVEC(xmm), IDTVEC(dblfault),
	IDTVEC(div_pti), IDTVEC(bpt_pti),
	IDTVEC(ofl_pti), IDTVEC(bnd_pti), IDTVEC(ill_pti), IDTVEC(dna_pti),
	IDTVEC(fpusegm_pti), IDTVEC(tss_pti), IDTVEC(missing_pti),
	IDTVEC(stk_pti), IDTVEC(prot_pti), IDTVEC(page_pti),
	IDTVEC(rsvd_pti), IDTVEC(fpu_pti), IDTVEC(align_pti),
	IDTVEC(xmm_pti),
#ifdef KDTRACE_HOOKS
	IDTVEC(dtrace_ret), IDTVEC(dtrace_ret_pti),
#endif
#ifdef XENHVM
	IDTVEC(xen_intr_upcall), IDTVEC(xen_intr_upcall_pti),
#endif
	IDTVEC(fast_syscall), IDTVEC(fast_syscall32),
	IDTVEC(fast_syscall_pti);

#ifdef DDB
/*
 * Display the index and function name of any IDT entries that don't use
 * the default 'rsvd' entry point.
 */
DB_SHOW_COMMAND(idt, db_show_idt)
{
	struct gate_descriptor *ip;
	int idx;
	uintptr_t func;

	ip = idt;
	for (idx = 0; idx < NIDT && !db_pager_quit; idx++) {
		func = ((long)ip->gd_hioffset << 16 | ip->gd_looffset);
		if (func != (uintptr_t)&IDTVEC(rsvd)) {
			db_printf("%3d\t", idx);
			db_printsym(func, DB_STGY_PROC);
			db_printf("\n");
		}
		ip++;
	}
}

/* Show privileged registers. */
DB_SHOW_COMMAND(sysregs, db_show_sysregs)
{
	struct {
		uint16_t limit;
		uint64_t base;
	} __packed idtr, gdtr;
	uint16_t ldt, tr;

	__asm __volatile("sidt %0" : "=m" (idtr));
	db_printf("idtr\t0x%016lx/%04x\n",
	    (u_long)idtr.base, (u_int)idtr.limit);
	__asm __volatile("sgdt %0" : "=m" (gdtr));
	db_printf("gdtr\t0x%016lx/%04x\n",
	    (u_long)gdtr.base, (u_int)gdtr.limit);
	__asm __volatile("sldt %0" : "=r" (ldt));
	db_printf("ldtr\t0x%04x\n", ldt);
	__asm __volatile("str %0" : "=r" (tr));
	db_printf("tr\t0x%04x\n", tr);
	db_printf("cr0\t0x%016lx\n", rcr0());
	db_printf("cr2\t0x%016lx\n", rcr2());
	db_printf("cr3\t0x%016lx\n", rcr3());
	db_printf("cr4\t0x%016lx\n", rcr4());
	if (rcr4() & CR4_XSAVE)
		db_printf("xcr0\t0x%016lx\n", rxcr(0));
	db_printf("EFER\t0x%016lx\n", rdmsr(MSR_EFER));
	if (cpu_feature2 & (CPUID2_VMX | CPUID2_SMX))
		db_printf("FEATURES_CTL\t%016lx\n",
		    rdmsr(MSR_IA32_FEATURE_CONTROL));
	db_printf("DEBUG_CTL\t0x%016lx\n", rdmsr(MSR_DEBUGCTLMSR));
	db_printf("PAT\t0x%016lx\n", rdmsr(MSR_PAT));
	db_printf("GSBASE\t0x%016lx\n", rdmsr(MSR_GSBASE));
}

DB_SHOW_COMMAND(dbregs, db_show_dbregs)
{

	db_printf("dr0\t0x%016lx\n", rdr0());
	db_printf("dr1\t0x%016lx\n", rdr1());
	db_printf("dr2\t0x%016lx\n", rdr2());
	db_printf("dr3\t0x%016lx\n", rdr3());
	db_printf("dr6\t0x%016lx\n", rdr6());
	db_printf("dr7\t0x%016lx\n", rdr7());
}
#endif

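/* Unpack a hardware user segment descriptor into its software prototype. */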
void
sdtossd(sd, ssd)
	struct user_segment_descriptor *sd;
	struct soft_segment_descriptor *ssd;
{

	ssd->ssd_base  = (sd->sd_hibase << 24) | sd->sd_lobase;
	ssd->ssd_limit = (sd->sd_hilimit << 16) | sd->sd_lolimit;
	ssd->ssd_type  = sd->sd_type;
	ssd->ssd_dpl   = sd->sd_dpl;
	ssd->ssd_p     = sd->sd_p;
	ssd->ssd_long  = sd->sd_long;
	ssd->ssd_def32 = sd->sd_def32;
	ssd->ssd_gran  = sd->sd_gran;
}

void
ssdtosd(ssd, sd)
	struct soft_segment_descriptor *ssd;
	struct user_segment_descriptor *sd;
{

	sd->sd_lobase = (ssd->ssd_base) & 0xffffff;
	sd->sd_hibase = (ssd->ssd_base >> 24) & 0xff;
	sd->sd_lolimit = (ssd->ssd_limit) & 0xffff;
	sd->sd_hilimit = (ssd->ssd_limit >> 16) & 0xf;
	sd->sd_type  = ssd->ssd_type;
	sd->sd_dpl   = ssd->ssd_dpl;
	sd->sd_p     = ssd->ssd_p;
	sd->sd_long  = ssd->ssd_long;
	sd->sd_def32 = ssd->ssd_def32;
	sd->sd_gran  = ssd->ssd_gran;
}

void
ssdtosyssd(ssd, sd)
	struct soft_segment_descriptor *ssd;
	struct system_segment_descriptor *sd;
{

	sd->sd_lobase = (ssd->ssd_base) & 0xffffff;
	sd->sd_hibase = (ssd->ssd_base >> 24) & 0xfffffffffful;
	sd->sd_lolimit = (ssd->ssd_limit) & 0xffff;
	sd->sd_hilimit = (ssd->ssd_limit >> 16) & 0xf;
	sd->sd_type  = ssd->ssd_type;
	sd->sd_dpl   = ssd->ssd_dpl;
	sd->sd_p     = ssd->ssd_p;
	sd->sd_gran  = ssd->ssd_gran;
}

#if !defined(DEV_ATPIC) && defined(DEV_ISA)
#include <isa/isavar.h>
#include <isa/isareg.h>
/*
 * Return a bitmap of the current interrupt requests.  This is 8259-specific
 * and is only suitable for use at probe time.
 * This is only here to pacify sio.  It is NOT FATAL if this doesn't work.
 * It shouldn't be here.  There should probably be an APIC centric
 * implementation in the apic driver code, if at all.
 */
intrmask_t
isa_irq_pending(void)
{
	u_char irr1;
	u_char irr2;

	irr1 = inb(IO_ICU1);
	irr2 = inb(IO_ICU2);
	return ((irr2 << 8) | irr1);
}
#endif

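/* Amount of conventional ("base") memory, in kilobytes. */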
u_int basemem;

static int
add_physmap_entry(uint64_t base, uint64_t length, vm_paddr_t *physmap,
    int *physmap_idxp)
{
	int i, insert_idx, physmap_idx;

	physmap_idx = *physmap_idxp;

	if (length == 0)
		return (1);

	/*
	 * Find insertion point while checking for overlap.  Start off by
	 * assuming the new entry will be added to the end.
	 *
	 * NB: physmap_idx points to the next free slot.
	 */
	insert_idx = physmap_idx;
	for (i = 0; i <= physmap_idx; i += 2) {
		if (base < physmap[i + 1]) {
			if (base + length <= physmap[i]) {
				insert_idx = i;
				break;
			}
			if (boothowto & RB_VERBOSE)
				printf(
		    "Overlapping memory regions, ignoring second region\n");
			return (1);
		}
	}

	/* See if we can prepend to the next entry. */
	if (insert_idx <= physmap_idx && base + length == physmap[insert_idx]) {
		physmap[insert_idx] = base;
		return (1);
	}

	/* See if we can append to the previous entry. */
	if (insert_idx > 0 && base == physmap[insert_idx - 1]) {
		physmap[insert_idx - 1] += length;
		return (1);
	}

	physmap_idx += 2;
	*physmap_idxp = physmap_idx;
	if (physmap_idx == PHYS_AVAIL_ENTRIES) {
		printf(
		"Too many segments in the physical address map, giving up\n");
		return (0);
	}

	/*
	 * Move the last 'N' entries down to make room for the new
	 * entry if needed.
	 */
	for (i = (physmap_idx - 2); i > insert_idx; i -= 2) {
		physmap[i] = physmap[i - 2];
		physmap[i + 1] = physmap[i - 1];
	}

	/* Insert the new entry. */
	physmap[insert_idx] = base;
	physmap[insert_idx + 1] = base + length;
	return (1);
}

void
bios_add_smap_entries(struct bios_smap *smapbase, u_int32_t smapsize,
                      vm_paddr_t *physmap, int *physmap_idx)
{
	struct bios_smap *smap, *smapend;

	smapend = (struct bios_smap *)((uintptr_t)smapbase + smapsize);

	for (smap = smapbase; smap < smapend; smap++) {
		if (boothowto & RB_VERBOSE)
			printf("SMAP type=%02x base=%016lx len=%016lx\n",
			    smap->type, smap->base, smap->length);

		if (smap->type != SMAP_TYPE_MEMORY)
			continue;

		if (!add_physmap_entry(smap->base, smap->length, physmap,
		    physmap_idx))
			break;
	}
}

static void
add_efi_map_entries(struct efi_map_header *efihdr, vm_paddr_t *physmap,
    int *physmap_idx)
{
	struct efi_md *map, *p;
	const char *type;
	size_t efisz;
	int ndesc, i;

	static const char *types[] = {
		"Reserved",
		"LoaderCode",
		"LoaderData",
		"BootServicesCode",
		"BootServicesData",
		"RuntimeServicesCode",
		"RuntimeServicesData",
		"ConventionalMemory",
		"UnusableMemory",
		"ACPIReclaimMemory",
		"ACPIMemoryNVS",
		"MemoryMappedIO",
		"MemoryMappedIOPortSpace",
		"PalCode",
		"PersistentMemory"
	};

	/*
	 * Memory map data provided by UEFI via the GetMemoryMap
	 * Boot Services API.
	 */
	efisz = (sizeof(struct efi_map_header) + 0xf) & ~0xf;
	map = (struct efi_md *)((uint8_t *)efihdr + efisz);

	if (efihdr->descriptor_size == 0)
		return;
	ndesc = efihdr->memory_size / efihdr->descriptor_size;

	if (boothowto & RB_VERBOSE)
		printf("%23s %12s %12s %8s %4s\n",
		    "Type", "Physical", "Virtual", "#Pages", "Attr");

	for (i = 0, p = map; i < ndesc; i++,
	    p = efi_next_descriptor(p, efihdr->descriptor_size)) {
		if (boothowto & RB_VERBOSE) {
			if (p->md_type < nitems(types))
				type = types[p->md_type];
			else
				type = "<INVALID>";
			printf("%23s %012lx %12p %08lx ", type, p->md_phys,
			    p->md_virt, p->md_pages);
			if (p->md_attr & EFI_MD_ATTR_UC)
				printf("UC ");
			if (p->md_attr & EFI_MD_ATTR_WC)
				printf("WC ");
			if (p->md_attr & EFI_MD_ATTR_WT)
				printf("WT ");
			if (p->md_attr & EFI_MD_ATTR_WB)
				printf("WB ");
			if (p->md_attr & EFI_MD_ATTR_UCE)
				printf("UCE ");
			if (p->md_attr & EFI_MD_ATTR_WP)
				printf("WP ");
			if (p->md_attr & EFI_MD_ATTR_RP)
				printf("RP ");
			if (p->md_attr & EFI_MD_ATTR_XP)
				printf("XP ");
			if (p->md_attr & EFI_MD_ATTR_NV)
				printf("NV ");
			if (p->md_attr & EFI_MD_ATTR_MORE_RELIABLE)
				printf("MORE_RELIABLE ");
			if (p->md_attr & EFI_MD_ATTR_RO)
				printf("RO ");
			if (p->md_attr & EFI_MD_ATTR_RT)
				printf("RUNTIME");
			printf("\n");
		}

		switch (p->md_type) {
		case EFI_MD_TYPE_CODE:
		case EFI_MD_TYPE_DATA:
		case EFI_MD_TYPE_BS_CODE:
		case EFI_MD_TYPE_BS_DATA:
		case EFI_MD_TYPE_FREE:
			/*
			 * We're allowed to use any entry with these types.
			 */
			break;
		default:
			continue;
		}

		if (!add_physmap_entry(p->md_phys, (p->md_pages * PAGE_SIZE),
		    physmap, physmap_idx))
			break;
	}
}

static char bootmethod[16] = "";
SYSCTL_STRING(_machdep, OID_AUTO, bootmethod, CTLFLAG_RD, bootmethod, 0,
    "System firmware boot method");

static void
native_parse_memmap(caddr_t kmdp, vm_paddr_t *physmap, int *physmap_idx)
{
	struct bios_smap *smap;
	struct efi_map_header *efihdr;
	u_int32_t size;

	/*
	 * Memory map from INT 15:E820.
	 *
	 * subr_module.c says:
	 * "Consumer may safely assume that size value precedes data."
	 * ie: an int32_t immediately precedes smap.
	 */

	efihdr = (struct efi_map_header *)preload_search_info(kmdp,
	    MODINFO_METADATA | MODINFOMD_EFI_MAP);
	smap = (struct bios_smap *)preload_search_info(kmdp,
	    MODINFO_METADATA | MODINFOMD_SMAP);
	if (efihdr == NULL && smap == NULL)
		panic("No BIOS smap or EFI map info from loader!");

	if (efihdr != NULL) {
		add_efi_map_entries(efihdr, physmap, physmap_idx);
		strlcpy(bootmethod, "UEFI", sizeof(bootmethod));
	} else {
		size = *((u_int32_t *)smap - 1);
		bios_add_smap_entries(smap, size, physmap, physmap_idx);
		strlcpy(bootmethod, "BIOS", sizeof(bootmethod));
	}
}

#define	PAGES_PER_GB	(1024 * 1024 * 1024 / PAGE_SIZE)

/*
 * Populate the (physmap) array with base/bound pairs describing the
 * available physical memory in the system, then test this memory and
 * build the phys_avail array describing the actually-available memory.
 *
 * Total memory size may be set by the kernel environment variable
 * hw.physmem or the compile-time define MAXMEM.
 *
 * XXX first should be vm_paddr_t.
 */
static void
getmemsize(caddr_t kmdp, u_int64_t first)
{
	int i, physmap_idx, pa_indx, da_indx;
	vm_paddr_t pa, physmap[PHYS_AVAIL_ENTRIES];
	u_long physmem_start, physmem_tunable, memtest;
	pt_entry_t *pte;
	quad_t dcons_addr, dcons_size;
	int page_counter;

	/*
	 * Tell the physical memory allocator about pages used to store
	 * the kernel and preloaded data.  See kmem_bootstrap_free().
	 */
	vm_phys_early_add_seg((vm_paddr_t)kernphys, trunc_page(first));

	bzero(physmap, sizeof(physmap));
	physmap_idx = 0;

	init_ops.parse_memmap(kmdp, physmap, &physmap_idx);
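	/*
	 * parse_memmap() leaves physmap_idx at the next free slot; step
	 * back so it indexes the last base/bound pair.
	 */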
	physmap_idx -= 2;

	/*
	 * Find the 'base memory' segment for SMP
	 */
	basemem = 0;
	for (i = 0; i <= physmap_idx; i += 2) {
		if (physmap[i] <= 0xA0000) {
			basemem = physmap[i + 1] / 1024;
			break;
		}
	}
	if (basemem == 0 || basemem > 640) {
		if (bootverbose)
			printf(
		"Memory map doesn't contain a basemem segment, faking it\n");
		basemem = 640;
	}

	/*
	 * Maxmem isn't the "maximum memory", it's one larger than the
	 * highest page of the physical address space.  It should be
	 * called something like "Maxphyspage".  We may adjust this
	 * based on ``hw.physmem'' and the results of the memory test.
	 */
	Maxmem = atop(physmap[physmap_idx + 1]);

#ifdef MAXMEM
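	/* MAXMEM is specified in kilobytes; Maxmem counts 4 KB pages. */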
	Maxmem = MAXMEM / 4;
#endif

	if (TUNABLE_ULONG_FETCH("hw.physmem", &physmem_tunable))
		Maxmem = atop(physmem_tunable);

	/*
	 * The boot memory test is disabled by default, as it takes a
	 * significant amount of time on large-memory systems, and is
	 * unfriendly to virtual machines as it unnecessarily touches all
	 * pages.
	 *
	 * A general name is used as the code may be extended to support
	 * additional tests beyond the current "page present" test.
	 */
	memtest = 0;
	TUNABLE_ULONG_FETCH("hw.memtest.tests", &memtest);

	/*
	 * Don't allow MAXMEM or hw.physmem to extend the amount of memory
	 * in the system.
	 */
	if (Maxmem > atop(physmap[physmap_idx + 1]))
		Maxmem = atop(physmap[physmap_idx + 1]);

	if (atop(physmap[physmap_idx + 1]) != Maxmem &&
	    (boothowto & RB_VERBOSE))
		printf("Physical memory use set to %ldK\n", Maxmem * 4);

	/*
	 * Make hole for "AP -> long mode" bootstrap code.  The
	 * mp_bootaddress vector is only available when the kernel
	 * is configured to support APs and APs for the system start
	 * in real mode (e.g. SMP bare metal).
	 */
	if (init_ops.mp_bootaddress)
		init_ops.mp_bootaddress(physmap, &physmap_idx);

	/* call pmap initialization to make new kernel address space */
	pmap_bootstrap(&first);

	/*
	 * Size up each available chunk of physical memory.
	 *
	 * XXX Some BIOSes corrupt low 64KB between suspend and resume.
	 * By default, mask off the first 16 pages unless we appear to be
	 * running in a VM.
	 */
	physmem_start = (vm_guest > VM_GUEST_NO ? 1 : 16) << PAGE_SHIFT;
	TUNABLE_ULONG_FETCH("hw.physmem.start", &physmem_start);
	if (physmap[0] < physmem_start) {
		if (physmem_start < PAGE_SIZE)
			physmap[0] = PAGE_SIZE;
		else if (physmem_start >= physmap[1])
			physmap[0] = round_page(physmap[1] - PAGE_SIZE);
		else
			physmap[0] = round_page(physmem_start);
	}
	pa_indx = 0;
	da_indx = 1;
	phys_avail[pa_indx++] = physmap[0];
	phys_avail[pa_indx] = physmap[0];
	dump_avail[da_indx] = physmap[0];
	pte = CMAP1;

	/*
	 * Get dcons buffer address
	 */
	if (getenv_quad("dcons.addr", &dcons_addr) == 0 ||
	    getenv_quad("dcons.size", &dcons_size) == 0)
		dcons_addr = 0;

	/*
	 * physmap is in bytes, so when converting to page boundaries,
	 * round up the start address and round down the end address.
	 */
	page_counter = 0;
	if (memtest != 0)
		printf("Testing system memory");
	for (i = 0; i <= physmap_idx; i += 2) {
		vm_paddr_t end;

		end = ptoa((vm_paddr_t)Maxmem);
		if (physmap[i + 1] < end)
			end = trunc_page(physmap[i + 1]);
		for (pa = round_page(physmap[i]); pa < end; pa += PAGE_SIZE) {
			int tmp, page_bad, full;
			int *ptr = (int *)CADDR1;

			full = FALSE;
			/*
			 * block out kernel memory as not available.
			 */
			if (pa >= (vm_paddr_t)kernphys && pa < first)
				goto do_dump_avail;

			/*
			 * block out dcons buffer
			 */
			if (dcons_addr > 0
			    && pa >= trunc_page(dcons_addr)
			    && pa < dcons_addr + dcons_size)
				goto do_dump_avail;

			page_bad = FALSE;
			if (memtest == 0)
				goto skip_memtest;

			/*
			 * Print a "." every GB to show we're making
			 * progress.
			 */
			page_counter++;
			if ((page_counter % PAGES_PER_GB) == 0)
				printf(".");

			/*
			 * map page into kernel: valid, read/write,non-cacheable
			 */
			*pte = pa | PG_V | PG_RW | PG_NC_PWT | PG_NC_PCD;
			invltlb();

			tmp = *(int *)ptr;
			/*
			 * Test for alternating 1's and 0's
			 */
			*(volatile int *)ptr = 0xaaaaaaaa;
			if (*(volatile int *)ptr != 0xaaaaaaaa)
				page_bad = TRUE;
			/*
			 * Test for alternating 0's and 1's
			 */
			*(volatile int *)ptr = 0x55555555;
			if (*(volatile int *)ptr != 0x55555555)
				page_bad = TRUE;
			/*
			 * Test for all 1's
			 */
			*(volatile int *)ptr = 0xffffffff;
			if (*(volatile int *)ptr != 0xffffffff)
				page_bad = TRUE;
			/*
			 * Test for all 0's
			 */
			*(volatile int *)ptr = 0x0;
			if (*(volatile int *)ptr != 0x0)
				page_bad = TRUE;
			/*
			 * Restore original value.
			 */
			*(int *)ptr = tmp;

skip_memtest:
			/*
			 * Adjust array of valid/good pages.
			 */
			if (page_bad == TRUE)
				continue;
			/*
			 * If this good page is a continuation of the
			 * previous set of good pages, then just increase
			 * the end pointer. Otherwise start a new chunk.
			 * Note that "end" points one page past the last
			 * valid address, making the range >= start and
			 * < end.
			 * If we're also doing a speculative memory
			 * test and we're at or past the end, bump up
			 * Maxmem so that we keep going. The first bad
			 * page will terminate the loop.
			 */
			if (phys_avail[pa_indx] == pa) {
				phys_avail[pa_indx] += PAGE_SIZE;
			} else {
				pa_indx++;
				if (pa_indx == PHYS_AVAIL_ENTRIES) {
					printf(
		"Too many holes in the physical address space, giving up\n");
					pa_indx--;
					full = TRUE;
					goto do_dump_avail;
				}
				phys_avail[pa_indx++] = pa;	/* start */
				phys_avail[pa_indx] = pa + PAGE_SIZE; /* end */
			}
			physmem++;
do_dump_avail:
			if (dump_avail[da_indx] == pa) {
				dump_avail[da_indx] += PAGE_SIZE;
			} else {
				da_indx++;
				if (da_indx == PHYS_AVAIL_ENTRIES) {
					da_indx--;
					goto do_next;
				}
				dump_avail[da_indx++] = pa; /* start */
				dump_avail[da_indx] = pa + PAGE_SIZE; /* end */
			}
do_next:
			if (full)
				break;
		}
	}
	*pte = 0;
	invltlb();
	if (memtest != 0)
		printf("\n");

	/*
	 * XXX
	 * The last chunk must contain at least one page plus the message
	 * buffer to avoid complicating other code (message buffer address
	 * calculation, etc.).
	 */
	while (phys_avail[pa_indx - 1] + PAGE_SIZE +
	    round_page(msgbufsize) >= phys_avail[pa_indx]) {
		physmem -= atop(phys_avail[pa_indx] - phys_avail[pa_indx - 1]);
		phys_avail[pa_indx--] = 0;
		phys_avail[pa_indx--] = 0;
	}

	Maxmem = atop(phys_avail[pa_indx]);

	/* Trim off space for the message buffer. */
	phys_avail[pa_indx] -= round_page(msgbufsize);

	/* Map the message buffer. */
	msgbufp = (struct msgbuf *)PHYS_TO_DMAP(phys_avail[pa_indx]);
}

static caddr_t
native_parse_preload_data(u_int64_t modulep)
{
	caddr_t kmdp;
	char *envp;
#ifdef DDB
	vm_offset_t ksym_start;
	vm_offset_t ksym_end;
#endif

	preload_metadata = (caddr_t)(uintptr_t)(modulep + KERNBASE);
	preload_bootstrap_relocate(KERNBASE);
	kmdp = preload_search_by_type("elf kernel");
	if (kmdp == NULL)
		kmdp = preload_search_by_type("elf64 kernel");
	boothowto = MD_FETCH(kmdp, MODINFOMD_HOWTO, int);
	envp = MD_FETCH(kmdp, MODINFOMD_ENVP, char *);
	if (envp != NULL)
		envp += KERNBASE;
	init_static_kenv(envp, 0);
#ifdef DDB
	ksym_start = MD_FETCH(kmdp, MODINFOMD_SSYM, uintptr_t);
	ksym_end = MD_FETCH(kmdp, MODINFOMD_ESYM, uintptr_t);
	db_fetch_ksymtab(ksym_start, ksym_end, 0);
#endif
	efi_systbl_phys = MD_FETCH(kmdp, MODINFOMD_FW_HANDLE, vm_paddr_t);

	return (kmdp);
}

static void
amd64_kdb_init(void)
{
	kdb_init();
#ifdef KDB
	if (boothowto & RB_KDB)
		kdb_enter(KDB_WHY_BOOTFLAGS, "Boot flags requested debugger");
#endif
}

/* Set up the fast syscall stuff */
void
amd64_conf_fast_syscall(void)
{
	uint64_t msr;

	msr = rdmsr(MSR_EFER) | EFER_SCE;
	wrmsr(MSR_EFER, msr);
	wrmsr(MSR_LSTAR, pti ? (u_int64_t)IDTVEC(fast_syscall_pti) :
	    (u_int64_t)IDTVEC(fast_syscall));
	wrmsr(MSR_CSTAR, (u_int64_t)IDTVEC(fast_syscall32));
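	/*
	 * STAR[47:32] seeds the kernel %cs/%ss loaded by SYSCALL;
	 * STAR[63:48] seeds the 32-bit user selectors used by SYSRET.
	 */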
	msr = ((u_int64_t)GSEL(GCODE_SEL, SEL_KPL) << 32) |
	    ((u_int64_t)GSEL(GUCODE32_SEL, SEL_UPL) << 48);
	wrmsr(MSR_STAR, msr);
	wrmsr(MSR_SF_MASK, PSL_NT | PSL_T | PSL_I | PSL_C | PSL_D | PSL_AC);
}

void
amd64_bsp_pcpu_init1(struct pcpu *pc)
{
	struct user_segment_descriptor *gdt;

	PCPU_SET(prvspace, pc);
	gdt = *PCPU_PTR(gdt);
	PCPU_SET(curthread, &thread0);
	PCPU_SET(tssp, PCPU_PTR(common_tss));
	PCPU_SET(tss, (struct system_segment_descriptor *)&gdt[GPROC0_SEL]);
	PCPU_SET(ldt, (struct system_segment_descriptor *)&gdt[GUSERLDT_SEL]);
	PCPU_SET(fs32p, &gdt[GUFS32_SEL]);
	PCPU_SET(gs32p, &gdt[GUGS32_SEL]);
	PCPU_SET(ucr3_load_mask, PMAP_UCR3_NOMASK);
	PCPU_SET(smp_tlb_gen, 1);
}

void
amd64_bsp_pcpu_init2(uint64_t rsp0)
{

	PCPU_SET(rsp0, rsp0);
	PCPU_SET(pti_rsp0, ((vm_offset_t)PCPU_PTR(pti_stack) +
	    PC_PTI_STACK_SZ * sizeof(uint64_t)) & ~0xful);
	PCPU_SET(curpcb, thread0.td_pcb);
}

void
amd64_bsp_ist_init(struct pcpu *pc)
{
	struct nmi_pcpu *np;
	struct amd64tss *tssp;

	tssp = &pc->pc_common_tss;

	/* doublefault stack space, runs on ist1 */
	np = ((struct nmi_pcpu *)&dblfault_stack[sizeof(dblfault_stack)]) - 1;
	np->np_pcpu = (register_t)pc;
	tssp->tss_ist1 = (long)np;

	/*
	 * NMI stack, runs on ist2.  The pcpu pointer is stored just
	 * above the start of the ist2 stack.
	 */
	np = ((struct nmi_pcpu *)&nmi0_stack[sizeof(nmi0_stack)]) - 1;
	np->np_pcpu = (register_t)pc;
	tssp->tss_ist2 = (long)np;

	/*
	 * MC# stack, runs on ist3.  The pcpu pointer is stored just
	 * above the start of the ist3 stack.
	 */
	np = ((struct nmi_pcpu *)&mce0_stack[sizeof(mce0_stack)]) - 1;
	np->np_pcpu = (register_t)pc;
	tssp->tss_ist3 = (long)np;

	/*
	 * DB# stack, runs on ist4.
	 */
	np = ((struct nmi_pcpu *)&dbg0_stack[sizeof(dbg0_stack)]) - 1;
	np->np_pcpu = (register_t)pc;
	tssp->tss_ist4 = (long)np;
}

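/*
 * hammer_time() is the first C code run on the BSP: the locore bootstrap
 * calls it with the preloaded-module pointer and the first free physical
 * address, and uses the returned value as the initial kernel stack pointer.
 */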
1617 u_int64_t
hammer_time(u_int64_t modulep,u_int64_t physfree)1618 hammer_time(u_int64_t modulep, u_int64_t physfree)
1619 {
1620 	caddr_t kmdp;
1621 	int gsel_tss, x;
1622 	struct pcpu *pc;
1623 	struct xstate_hdr *xhdr;
1624 	u_int64_t rsp0;
1625 	char *env;
1626 	struct user_segment_descriptor *gdt;
1627 	struct region_descriptor r_gdt;
1628 	size_t kstack0_sz;
1629 	int late_console;
1630 
1631 	TSRAW(&thread0, TS_ENTER, __func__, NULL);
1632 
1633 	kmdp = init_ops.parse_preload_data(modulep);
1634 
1635 	physfree += ucode_load_bsp(physfree + KERNBASE);
1636 	physfree = roundup2(physfree, PAGE_SIZE);
1637 
1638 	identify_cpu1();
1639 	identify_hypervisor();
1640 	identify_cpu_fixup_bsp();
1641 	identify_cpu2();
1642 	initializecpucache();
1643 
1644 	/*
1645 	 * Check for pti, pcid, and invpcid before ifuncs are
1646 	 * resolved, to correctly select the implementation for
1647 	 * pmap_activate_sw_mode().
1648 	 */
1649 	pti = pti_get_default();
1650 	TUNABLE_INT_FETCH("vm.pmap.pti", &pti);
1651 	TUNABLE_INT_FETCH("vm.pmap.pcid_enabled", &pmap_pcid_enabled);
1652 	if ((cpu_feature2 & CPUID2_PCID) != 0 && pmap_pcid_enabled) {
1653 		invpcid_works = (cpu_stdext_feature &
1654 		    CPUID_STDEXT_INVPCID) != 0;
1655 	} else {
1656 		pmap_pcid_enabled = 0;
1657 	}
1658 
1659 	link_elf_ireloc(kmdp);
1660 
1661 	/*
1662 	 * This may be done better later if it gets more high level
1663 	 * components in it. If so just link td->td_proc here.
1664 	 */
1665 	proc_linkup0(&proc0, &thread0);
1666 
1667 	/* Init basic tunables, hz etc */
1668 	init_param1();
1669 
1670 	thread0.td_kstack = physfree + KERNBASE;
1671 	thread0.td_kstack_pages = kstack_pages;
1672 	kstack0_sz = thread0.td_kstack_pages * PAGE_SIZE;
1673 	bzero((void *)thread0.td_kstack, kstack0_sz);
1674 	physfree += kstack0_sz;
1675 
1676 	/*
1677 	 * Initialize enough of thread0 for delayed invalidation to
1678 	 * work very early.  Rely on thread0.td_base_pri
1679 	 * zero-initialization, it is reset to PVM at proc0_init().
1680 	 */
1681 	pmap_thread_init_invl_gen(&thread0);
1682 
1683 	pc = &temp_bsp_pcpu;
1684 	pcpu_init(pc, 0, sizeof(struct pcpu));
1685 	gdt = &temp_bsp_pcpu.pc_gdt[0];
1686 
1687 	/*
1688 	 * make gdt memory segments
1689 	 */
1690 	for (x = 0; x < NGDT; x++) {
1691 		if (x != GPROC0_SEL && x != (GPROC0_SEL + 1) &&
1692 		    x != GUSERLDT_SEL && x != (GUSERLDT_SEL) + 1)
1693 			ssdtosd(&gdt_segs[x], &gdt[x]);
1694 	}
1695 	gdt_segs[GPROC0_SEL].ssd_base = (uintptr_t)&pc->pc_common_tss;
1696 	ssdtosyssd(&gdt_segs[GPROC0_SEL],
1697 	    (struct system_segment_descriptor *)&gdt[GPROC0_SEL]);
1698 
1699 	r_gdt.rd_limit = NGDT * sizeof(gdt[0]) - 1;
1700 	r_gdt.rd_base = (long)gdt;
1701 	lgdt(&r_gdt);
1702 
1703 	wrmsr(MSR_FSBASE, 0);		/* User value */
1704 	wrmsr(MSR_GSBASE, (u_int64_t)pc);
1705 	wrmsr(MSR_KGSBASE, 0);		/* User value while in the kernel */
1706 
1707 	dpcpu_init((void *)(physfree + KERNBASE), 0);
1708 	physfree += DPCPU_SIZE;
1709 	amd64_bsp_pcpu_init1(pc);
1710 	/* Non-late cninit() and printf() can be moved up to here. */
1711 
1712 	/*
1713 	 * Initialize mutexes.
1714 	 *
1715 	 * icu_lock: in order to allow an interrupt to occur in a critical
1716 	 * 	     section, to set pcpu->ipending (etc...) properly, we
1717 	 *	     must be able to get the icu lock, so it can't be
1718 	 *	     under witness.
1719 	 */
1720 	mutex_init();
1721 	mtx_init(&icu_lock, "icu", NULL, MTX_SPIN | MTX_NOWITNESS);
1722 	mtx_init(&dt_lock, "descriptor tables", NULL, MTX_DEF);
1723 
1724 	/* exceptions */
1725 	for (x = 0; x < NIDT; x++)
1726 		setidt(x, pti ? &IDTVEC(rsvd_pti) : &IDTVEC(rsvd), SDT_SYSIGT,
1727 		    SEL_KPL, 0);
1728 	setidt(IDT_DE, pti ? &IDTVEC(div_pti) : &IDTVEC(div), SDT_SYSIGT,
1729 	    SEL_KPL, 0);
1730 	setidt(IDT_DB, &IDTVEC(dbg), SDT_SYSIGT, SEL_KPL, 4);
1731 	setidt(IDT_NMI, &IDTVEC(nmi),  SDT_SYSIGT, SEL_KPL, 2);
1732 	setidt(IDT_BP, pti ? &IDTVEC(bpt_pti) : &IDTVEC(bpt), SDT_SYSIGT,
1733 	    SEL_UPL, 0);
1734 	setidt(IDT_OF, pti ? &IDTVEC(ofl_pti) : &IDTVEC(ofl), SDT_SYSIGT,
1735 	    SEL_UPL, 0);
1736 	setidt(IDT_BR, pti ? &IDTVEC(bnd_pti) : &IDTVEC(bnd), SDT_SYSIGT,
1737 	    SEL_KPL, 0);
1738 	setidt(IDT_UD, pti ? &IDTVEC(ill_pti) : &IDTVEC(ill), SDT_SYSIGT,
1739 	    SEL_KPL, 0);
1740 	setidt(IDT_NM, pti ? &IDTVEC(dna_pti) : &IDTVEC(dna), SDT_SYSIGT,
1741 	    SEL_KPL, 0);
1742 	setidt(IDT_DF, &IDTVEC(dblfault), SDT_SYSIGT, SEL_KPL, 1);
1743 	setidt(IDT_FPUGP, pti ? &IDTVEC(fpusegm_pti) : &IDTVEC(fpusegm),
1744 	    SDT_SYSIGT, SEL_KPL, 0);
1745 	setidt(IDT_TS, pti ? &IDTVEC(tss_pti) : &IDTVEC(tss), SDT_SYSIGT,
1746 	    SEL_KPL, 0);
1747 	setidt(IDT_NP, pti ? &IDTVEC(missing_pti) : &IDTVEC(missing),
1748 	    SDT_SYSIGT, SEL_KPL, 0);
1749 	setidt(IDT_SS, pti ? &IDTVEC(stk_pti) : &IDTVEC(stk), SDT_SYSIGT,
1750 	    SEL_KPL, 0);
1751 	setidt(IDT_GP, pti ? &IDTVEC(prot_pti) : &IDTVEC(prot), SDT_SYSIGT,
1752 	    SEL_KPL, 0);
1753 	setidt(IDT_PF, pti ? &IDTVEC(page_pti) : &IDTVEC(page), SDT_SYSIGT,
1754 	    SEL_KPL, 0);
1755 	setidt(IDT_MF, pti ? &IDTVEC(fpu_pti) : &IDTVEC(fpu), SDT_SYSIGT,
1756 	    SEL_KPL, 0);
1757 	setidt(IDT_AC, pti ? &IDTVEC(align_pti) : &IDTVEC(align), SDT_SYSIGT,
1758 	    SEL_KPL, 0);
1759 	setidt(IDT_MC, &IDTVEC(mchk), SDT_SYSIGT, SEL_KPL, 3);
1760 	setidt(IDT_XF, pti ? &IDTVEC(xmm_pti) : &IDTVEC(xmm), SDT_SYSIGT,
1761 	    SEL_KPL, 0);
1762 #ifdef KDTRACE_HOOKS
1763 	setidt(IDT_DTRACE_RET, pti ? &IDTVEC(dtrace_ret_pti) :
1764 	    &IDTVEC(dtrace_ret), SDT_SYSIGT, SEL_UPL, 0);
1765 #endif
1766 #ifdef XENHVM
1767 	setidt(IDT_EVTCHN, pti ? &IDTVEC(xen_intr_upcall_pti) :
1768 	    &IDTVEC(xen_intr_upcall), SDT_SYSIGT, SEL_KPL, 0);
1769 #endif
1770 	r_idt.rd_limit = sizeof(idt0) - 1;
1771 	r_idt.rd_base = (long) idt;
1772 	lidt(&r_idt);
1773 
	/*
	 * Initialize the clock before the console so that console
	 * initialization can use DELAY().
	 */
	clock_init();

	/*
	 * Use vt(4) by default for UEFI boot (during the sc(4)/vt(4)
	 * transition).
	 * Once the bootblocks have been updated, we can test directly for
	 * efi_systbl != NULL here...
	 */
	if (preload_search_info(kmdp, MODINFO_METADATA | MODINFOMD_EFI_MAP)
	    != NULL)
		vty_set_preferred(VTY_VT);

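	/*
	 * Each mitigation below fetches its legacy "hw.*" tunable first
	 * and its newer "machdep.mitigations.*" name second, so the newer
	 * name takes precedence when both are set.
	 */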
	TUNABLE_INT_FETCH("hw.ibrs_disable", &hw_ibrs_disable);
	TUNABLE_INT_FETCH("machdep.mitigations.ibrs.disable", &hw_ibrs_disable);

	TUNABLE_INT_FETCH("hw.spec_store_bypass_disable", &hw_ssb_disable);
	TUNABLE_INT_FETCH("machdep.mitigations.ssb.disable", &hw_ssb_disable);

	TUNABLE_INT_FETCH("machdep.syscall_ret_l1d_flush",
	    &syscall_ret_l1d_flush_mode);

	TUNABLE_INT_FETCH("hw.mds_disable", &hw_mds_disable);
	TUNABLE_INT_FETCH("machdep.mitigations.mds.disable", &hw_mds_disable);

	TUNABLE_INT_FETCH("machdep.mitigations.taa.enable", &x86_taa_enable);

	TUNABLE_INT_FETCH("machdep.mitigations.rngds.enable",
	    &x86_rngds_mitg_enable);

	finishidentcpu();	/* Final stage of CPU initialization */
	initializecpu();	/* Initialize CPU registers */

	amd64_bsp_ist_init(pc);

	/* Set the IO permission bitmap (empty due to tss seg limit) */
	pc->pc_common_tss.tss_iobase = sizeof(struct amd64tss) +
	    IOPERM_BITMAP_SIZE;

	gsel_tss = GSEL(GPROC0_SEL, SEL_KPL);
	ltr(gsel_tss);

	amd64_conf_fast_syscall();

	/*
	 * We initialize the PCB pointer early so that exception
	 * handlers will work.  Also set up td_critnest to short-cut
	 * the page fault handler.
	 */
	cpu_max_ext_state_size = sizeof(struct savefpu);
	set_top_of_stack_td(&thread0);
	thread0.td_pcb = get_pcb_td(&thread0);
	thread0.td_critnest = 1;

	/*
	 * The console and kdb should be initialized even earlier than here,
	 * but some console drivers don't work until after getmemsize().
	 * Default to late console initialization to support these drivers.
	 * This loses mainly printf()s in getmemsize() and early debugging.
	 */
	late_console = 1;
	TUNABLE_INT_FETCH("debug.late_console", &late_console);
	if (!late_console) {
		cninit();
		amd64_kdb_init();
	}

	getmemsize(kmdp, physfree);
	init_param2(physmem);

	/* Now running on new page tables, configured, and u/iom is accessible. */

#ifdef DEV_PCI
	/* This call might adjust phys_avail[]. */
	pci_early_quirks();
#endif


	if (late_console)
		cninit();

	/*
	 * Dump the boot metadata. We have to wait for cninit() since console
	 * output is required. If it's grossly incorrect the kernel will never
	 * make it this far.
	 */
	if (getenv_is_true("debug.dump_modinfo_at_boot"))
		preload_dump();

#ifdef DEV_ISA
#ifdef DEV_ATPIC
	elcr_probe();
	atpic_startup();
#else
	/* Reset and mask the atpics and leave them shut down. */
	atpic_reset();

	/*
	 * Point the ICU spurious interrupt vectors at the APIC spurious
	 * interrupt handler.
	 */
	setidt(IDT_IO_INTS + 7, IDTVEC(spuriousint), SDT_SYSIGT, SEL_KPL, 0);
	setidt(IDT_IO_INTS + 15, IDTVEC(spuriousint), SDT_SYSIGT, SEL_KPL, 0);
#endif
#else
#error "have you forgotten the isa device?"
#endif

	if (late_console)
		amd64_kdb_init();

	msgbufinit(msgbufp, msgbufsize);
	fpuinit();

	/*
	 * Reinitialize thread0's stack base now that the xsave area size is
	 * known.  Set up thread0's pcb save area after fpuinit() has
	 * calculated the FPU save area size.  Zero out the extended state
	 * header in the fpu save area.
	 */
	set_top_of_stack_td(&thread0);
	thread0.td_pcb->pcb_save = get_pcb_user_save_td(&thread0);
	bzero(thread0.td_pcb->pcb_save, cpu_max_ext_state_size);
	if (use_xsave) {
		xhdr = (struct xstate_hdr *)(get_pcb_user_save_td(&thread0) +
		    1);
		xhdr->xstate_bv = xsave_mask;
	}
	/* make an initial tss so cpu can get interrupt stack on syscall! */
	rsp0 = thread0.td_md.md_stack_base;
	/* Ensure the stack is aligned to 16 bytes */
	rsp0 &= ~0xFul;
	PCPU_PTR(common_tss)->tss_rsp0 = rsp0;
	amd64_bsp_pcpu_init2(rsp0);

	/* transfer to user mode */

	_ucodesel = GSEL(GUCODE_SEL, SEL_UPL);
	_udatasel = GSEL(GUDATA_SEL, SEL_UPL);
	_ucode32sel = GSEL(GUCODE32_SEL, SEL_UPL);
	_ufssel = GSEL(GUFS32_SEL, SEL_UPL);
	_ugssel = GSEL(GUGS32_SEL, SEL_UPL);

	load_ds(_udatasel);
	load_es(_udatasel);
	load_fs(_ufssel);

	/* setup proc 0's pcb */
	thread0.td_pcb->pcb_flags = 0;
	thread0.td_frame = &proc0_tf;

	env = kern_getenv("kernelname");
	if (env != NULL)
		strlcpy(kernelname, env, sizeof(kernelname));

	kcsan_cpu_init(0);

#ifdef FDT
	x86_init_fdt();
#endif
	thread0.td_critnest = 0;

	TSEXIT();

	/* Location of kernel stack for locore */
	return (thread0.td_md.md_stack_base);
}

void
cpu_pcpu_init(struct pcpu *pcpu, int cpuid, size_t size)
{

	pcpu->pc_acpi_id = 0xffffffff;
}

static int
smap_sysctl_handler(SYSCTL_HANDLER_ARGS)
{
	struct bios_smap *smapbase;
	struct bios_smap_xattr smap;
	caddr_t kmdp;
	uint32_t *smapattr;
	int count, error, i;

	/* Retrieve the system memory map from the loader. */
	kmdp = preload_search_by_type("elf kernel");
	if (kmdp == NULL)
		kmdp = preload_search_by_type("elf64 kernel");
	smapbase = (struct bios_smap *)preload_search_info(kmdp,
	    MODINFO_METADATA | MODINFOMD_SMAP);
	if (smapbase == NULL)
		return (0);
	smapattr = (uint32_t *)preload_search_info(kmdp,
	    MODINFO_METADATA | MODINFOMD_SMAP_XATTR);
	count = *((uint32_t *)smapbase - 1) / sizeof(*smapbase);
	error = 0;
	for (i = 0; i < count; i++) {
		smap.base = smapbase[i].base;
		smap.length = smapbase[i].length;
		smap.type = smapbase[i].type;
		if (smapattr != NULL)
			smap.xattr = smapattr[i];
		else
			smap.xattr = 0;
		error = SYSCTL_OUT(req, &smap, sizeof(smap));
	}
	return (error);
}
SYSCTL_PROC(_machdep, OID_AUTO, smap,
    CTLTYPE_OPAQUE | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
    smap_sysctl_handler, "S,bios_smap_xattr",
    "Raw BIOS SMAP data");

static int
efi_map_sysctl_handler(SYSCTL_HANDLER_ARGS)
{
	struct efi_map_header *efihdr;
	caddr_t kmdp;
	uint32_t efisize;

	kmdp = preload_search_by_type("elf kernel");
	if (kmdp == NULL)
		kmdp = preload_search_by_type("elf64 kernel");
	efihdr = (struct efi_map_header *)preload_search_info(kmdp,
	    MODINFO_METADATA | MODINFOMD_EFI_MAP);
	if (efihdr == NULL)
		return (0);
	efisize = *((uint32_t *)efihdr - 1);
	return (SYSCTL_OUT(req, efihdr, efisize));
}
SYSCTL_PROC(_machdep, OID_AUTO, efi_map,
    CTLTYPE_OPAQUE | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
    efi_map_sysctl_handler, "S,efi_map_header",
    "Raw EFI Memory Map");

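/*
 * Spinlock sections nest per thread: the first spinlock_enter() disables
 * interrupts and records the previous interrupt state, nested calls only
 * bump md_spinlock_count, and the matching outermost spinlock_exit()
 * restores the saved state.
 */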
void
spinlock_enter(void)
{
	struct thread *td;
	register_t flags;

	td = curthread;
	if (td->td_md.md_spinlock_count == 0) {
		flags = intr_disable();
		td->td_md.md_spinlock_count = 1;
		td->td_md.md_saved_flags = flags;
		critical_enter();
	} else
		td->td_md.md_spinlock_count++;
}

void
spinlock_exit(void)
{
	struct thread *td;
	register_t flags;

	td = curthread;
	flags = td->td_md.md_saved_flags;
	td->td_md.md_spinlock_count--;
	if (td->td_md.md_spinlock_count == 0) {
		critical_exit();
		intr_restore(flags);
	}
}

/*
 * Construct a PCB from a trapframe. This is called from kdb_trap() where
 * we want to start a backtrace from the function that caused us to enter
 * the debugger. We have the context in the trapframe, but base the trace
 * on the PCB. The PCB doesn't have to be perfect, as long as it contains
 * enough for a backtrace.
 */
void
makectx(struct trapframe *tf, struct pcb *pcb)
{

	pcb->pcb_r12 = tf->tf_r12;
	pcb->pcb_r13 = tf->tf_r13;
	pcb->pcb_r14 = tf->tf_r14;
	pcb->pcb_r15 = tf->tf_r15;
	pcb->pcb_rbp = tf->tf_rbp;
	pcb->pcb_rbx = tf->tf_rbx;
	pcb->pcb_rip = tf->tf_rip;
	pcb->pcb_rsp = tf->tf_rsp;
}

int
ptrace_set_pc(struct thread *td, unsigned long addr)
{

	td->td_frame->tf_rip = addr;
	set_pcb_flags(td->td_pcb, PCB_FULL_IRET);
	return (0);
}

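/*
 * Single-stepping works by setting PSL_T, the x86 trap flag, in the saved
 * user %rflags; the CPU then raises a debug exception after the next user
 * instruction completes.  TDB_STEP records that the flag was set on behalf
 * of the debugger so it can be cleared again afterwards.
 */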
int
ptrace_single_step(struct thread *td)
{

	PROC_LOCK_ASSERT(td->td_proc, MA_OWNED);
	if ((td->td_frame->tf_rflags & PSL_T) == 0) {
		td->td_frame->tf_rflags |= PSL_T;
		td->td_dbgflags |= TDB_STEP;
	}
	return (0);
}

int
ptrace_clear_single_step(struct thread *td)
{

	PROC_LOCK_ASSERT(td->td_proc, MA_OWNED);
	td->td_frame->tf_rflags &= ~PSL_T;
	td->td_dbgflags &= ~TDB_STEP;
	return (0);
}

int
fill_regs(struct thread *td, struct reg *regs)
{
	struct trapframe *tp;

	tp = td->td_frame;
	return (fill_frame_regs(tp, regs));
}

int
fill_frame_regs(struct trapframe *tp, struct reg *regs)
{

	regs->r_r15 = tp->tf_r15;
	regs->r_r14 = tp->tf_r14;
	regs->r_r13 = tp->tf_r13;
	regs->r_r12 = tp->tf_r12;
	regs->r_r11 = tp->tf_r11;
	regs->r_r10 = tp->tf_r10;
	regs->r_r9  = tp->tf_r9;
	regs->r_r8  = tp->tf_r8;
	regs->r_rdi = tp->tf_rdi;
	regs->r_rsi = tp->tf_rsi;
	regs->r_rbp = tp->tf_rbp;
	regs->r_rbx = tp->tf_rbx;
	regs->r_rdx = tp->tf_rdx;
	regs->r_rcx = tp->tf_rcx;
	regs->r_rax = tp->tf_rax;
	regs->r_rip = tp->tf_rip;
	regs->r_cs = tp->tf_cs;
	regs->r_rflags = tp->tf_rflags;
	regs->r_rsp = tp->tf_rsp;
	regs->r_ss = tp->tf_ss;
	if (tp->tf_flags & TF_HASSEGS) {
		regs->r_ds = tp->tf_ds;
		regs->r_es = tp->tf_es;
		regs->r_fs = tp->tf_fs;
		regs->r_gs = tp->tf_gs;
	} else {
		regs->r_ds = 0;
		regs->r_es = 0;
		regs->r_fs = 0;
		regs->r_gs = 0;
	}
	regs->r_err = 0;
	regs->r_trapno = 0;
	return (0);
}

int
set_regs(struct thread *td, struct reg *regs)
{
	struct trapframe *tp;
	register_t rflags;

	tp = td->td_frame;
	rflags = regs->r_rflags & 0xffffffff;
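	/*
	 * Refuse register sets that would grant extra privilege:
	 * EFL_SECURE() confines rflags changes to user-modifiable bits and
	 * CS_SECURE() requires a user-privilege code selector.
	 */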
	if (!EFL_SECURE(rflags, tp->tf_rflags) || !CS_SECURE(regs->r_cs))
		return (EINVAL);
	tp->tf_r15 = regs->r_r15;
	tp->tf_r14 = regs->r_r14;
	tp->tf_r13 = regs->r_r13;
	tp->tf_r12 = regs->r_r12;
	tp->tf_r11 = regs->r_r11;
	tp->tf_r10 = regs->r_r10;
	tp->tf_r9  = regs->r_r9;
	tp->tf_r8  = regs->r_r8;
	tp->tf_rdi = regs->r_rdi;
	tp->tf_rsi = regs->r_rsi;
	tp->tf_rbp = regs->r_rbp;
	tp->tf_rbx = regs->r_rbx;
	tp->tf_rdx = regs->r_rdx;
	tp->tf_rcx = regs->r_rcx;
	tp->tf_rax = regs->r_rax;
	tp->tf_rip = regs->r_rip;
	tp->tf_cs = regs->r_cs;
	tp->tf_rflags = rflags;
	tp->tf_rsp = regs->r_rsp;
	tp->tf_ss = regs->r_ss;
	if (0) {	/* XXXKIB */
		tp->tf_ds = regs->r_ds;
		tp->tf_es = regs->r_es;
		tp->tf_fs = regs->r_fs;
		tp->tf_gs = regs->r_gs;
		tp->tf_flags = TF_HASSEGS;
	}
	set_pcb_flags(td->td_pcb, PCB_FULL_IRET);
	return (0);
}

/* XXX check all this stuff! */
/* externalize from sv_xmm */
static void
fill_fpregs_xmm(struct savefpu *sv_xmm, struct fpreg *fpregs)
{
	struct envxmm *penv_fpreg = (struct envxmm *)&fpregs->fpr_env;
	struct envxmm *penv_xmm = &sv_xmm->sv_env;
	int i;

	/* pcb -> fpregs */
	bzero(fpregs, sizeof(*fpregs));

	/* FPU control/status */
	penv_fpreg->en_cw = penv_xmm->en_cw;
	penv_fpreg->en_sw = penv_xmm->en_sw;
	penv_fpreg->en_tw = penv_xmm->en_tw;
	penv_fpreg->en_opcode = penv_xmm->en_opcode;
	penv_fpreg->en_rip = penv_xmm->en_rip;
	penv_fpreg->en_rdp = penv_xmm->en_rdp;
	penv_fpreg->en_mxcsr = penv_xmm->en_mxcsr;
	penv_fpreg->en_mxcsr_mask = penv_xmm->en_mxcsr_mask;

	/* FPU registers */
	for (i = 0; i < 8; ++i)
		bcopy(sv_xmm->sv_fp[i].fp_acc.fp_bytes, fpregs->fpr_acc[i], 10);

	/* SSE registers */
	for (i = 0; i < 16; ++i)
		bcopy(sv_xmm->sv_xmm[i].xmm_bytes, fpregs->fpr_xacc[i], 16);
}

/* internalize from fpregs into sv_xmm */
static void
set_fpregs_xmm(struct fpreg *fpregs, struct savefpu *sv_xmm)
{
	struct envxmm *penv_xmm = &sv_xmm->sv_env;
	struct envxmm *penv_fpreg = (struct envxmm *)&fpregs->fpr_env;
	int i;

	/* fpregs -> pcb */
	/* FPU control/status */
	penv_xmm->en_cw = penv_fpreg->en_cw;
	penv_xmm->en_sw = penv_fpreg->en_sw;
	penv_xmm->en_tw = penv_fpreg->en_tw;
	penv_xmm->en_opcode = penv_fpreg->en_opcode;
	penv_xmm->en_rip = penv_fpreg->en_rip;
	penv_xmm->en_rdp = penv_fpreg->en_rdp;
	penv_xmm->en_mxcsr = penv_fpreg->en_mxcsr;
	penv_xmm->en_mxcsr_mask = penv_fpreg->en_mxcsr_mask & cpu_mxcsr_mask;

	/* FPU registers */
	for (i = 0; i < 8; ++i)
		bcopy(fpregs->fpr_acc[i], sv_xmm->sv_fp[i].fp_acc.fp_bytes, 10);

	/* SSE registers */
	for (i = 0; i < 16; ++i)
		bcopy(fpregs->fpr_xacc[i], sv_xmm->sv_xmm[i].xmm_bytes, 16);
}

/* externalize from td->pcb */
int
fill_fpregs(struct thread *td, struct fpreg *fpregs)
{

	KASSERT(td == curthread || TD_IS_SUSPENDED(td) ||
	    P_SHOULDSTOP(td->td_proc),
	    ("not suspended thread %p", td));
	fpugetregs(td);
	fill_fpregs_xmm(get_pcb_user_save_td(td), fpregs);
	return (0);
}

/* internalize to td->pcb */
int
set_fpregs(struct thread *td, struct fpreg *fpregs)
{

	critical_enter();
	set_fpregs_xmm(fpregs, get_pcb_user_save_td(td));
	fpuuserinited(td);
	critical_exit();
	return (0);
}

/*
 * Get machine context.
 */
int
get_mcontext(struct thread *td, mcontext_t *mcp, int flags)
{
	struct pcb *pcb;
	struct trapframe *tp;

	pcb = td->td_pcb;
	tp = td->td_frame;
	PROC_LOCK(curthread->td_proc);
	mcp->mc_onstack = sigonstack(tp->tf_rsp);
	PROC_UNLOCK(curthread->td_proc);
	mcp->mc_r15 = tp->tf_r15;
	mcp->mc_r14 = tp->tf_r14;
	mcp->mc_r13 = tp->tf_r13;
	mcp->mc_r12 = tp->tf_r12;
	mcp->mc_r11 = tp->tf_r11;
	mcp->mc_r10 = tp->tf_r10;
	mcp->mc_r9  = tp->tf_r9;
	mcp->mc_r8  = tp->tf_r8;
	mcp->mc_rdi = tp->tf_rdi;
	mcp->mc_rsi = tp->tf_rsi;
	mcp->mc_rbp = tp->tf_rbp;
	mcp->mc_rbx = tp->tf_rbx;
	mcp->mc_rcx = tp->tf_rcx;
	mcp->mc_rflags = tp->tf_rflags;
	if (flags & GET_MC_CLEAR_RET) {
		mcp->mc_rax = 0;
		mcp->mc_rdx = 0;
		mcp->mc_rflags &= ~PSL_C;
	} else {
		mcp->mc_rax = tp->tf_rax;
		mcp->mc_rdx = tp->tf_rdx;
	}
	mcp->mc_rip = tp->tf_rip;
	mcp->mc_cs = tp->tf_cs;
	mcp->mc_rsp = tp->tf_rsp;
	mcp->mc_ss = tp->tf_ss;
	mcp->mc_ds = tp->tf_ds;
	mcp->mc_es = tp->tf_es;
	mcp->mc_fs = tp->tf_fs;
	mcp->mc_gs = tp->tf_gs;
	mcp->mc_flags = tp->tf_flags;
	mcp->mc_len = sizeof(*mcp);
	get_fpcontext(td, mcp, NULL, 0);
	update_pcb_bases(pcb);
	mcp->mc_fsbase = pcb->pcb_fsbase;
	mcp->mc_gsbase = pcb->pcb_gsbase;
	mcp->mc_xfpustate = 0;
	mcp->mc_xfpustate_len = 0;
	bzero(mcp->mc_spare, sizeof(mcp->mc_spare));
	return (0);
}

/*
 * Set machine context.
 *
 * Note that we only set the user-modifiable flags, and we never touch
 * the cs selector.
 */
int
set_mcontext(struct thread *td, mcontext_t *mcp)
{
	struct pcb *pcb;
	struct trapframe *tp;
	char *xfpustate;
	long rflags;
	int ret;

	pcb = td->td_pcb;
	tp = td->td_frame;
	if (mcp->mc_len != sizeof(*mcp) ||
	    (mcp->mc_flags & ~_MC_FLAG_MASK) != 0)
		return (EINVAL);
	rflags = (mcp->mc_rflags & PSL_USERCHANGE) |
	    (tp->tf_rflags & ~PSL_USERCHANGE);
	if (mcp->mc_flags & _MC_HASFPXSTATE) {
		if (mcp->mc_xfpustate_len > cpu_max_ext_state_size -
		    sizeof(struct savefpu))
			return (EINVAL);
		xfpustate = __builtin_alloca(mcp->mc_xfpustate_len);
		ret = copyin((void *)mcp->mc_xfpustate, xfpustate,
		    mcp->mc_xfpustate_len);
		if (ret != 0)
			return (ret);
	} else
		xfpustate = NULL;
	ret = set_fpcontext(td, mcp, xfpustate, mcp->mc_xfpustate_len);
	if (ret != 0)
		return (ret);
	tp->tf_r15 = mcp->mc_r15;
	tp->tf_r14 = mcp->mc_r14;
	tp->tf_r13 = mcp->mc_r13;
	tp->tf_r12 = mcp->mc_r12;
	tp->tf_r11 = mcp->mc_r11;
	tp->tf_r10 = mcp->mc_r10;
	tp->tf_r9  = mcp->mc_r9;
	tp->tf_r8  = mcp->mc_r8;
	tp->tf_rdi = mcp->mc_rdi;
	tp->tf_rsi = mcp->mc_rsi;
	tp->tf_rbp = mcp->mc_rbp;
	tp->tf_rbx = mcp->mc_rbx;
	tp->tf_rdx = mcp->mc_rdx;
	tp->tf_rcx = mcp->mc_rcx;
	tp->tf_rax = mcp->mc_rax;
	tp->tf_rip = mcp->mc_rip;
	tp->tf_rflags = rflags;
	tp->tf_rsp = mcp->mc_rsp;
	tp->tf_ss = mcp->mc_ss;
	tp->tf_flags = mcp->mc_flags;
	if (tp->tf_flags & TF_HASSEGS) {
		tp->tf_ds = mcp->mc_ds;
		tp->tf_es = mcp->mc_es;
		tp->tf_fs = mcp->mc_fs;
		tp->tf_gs = mcp->mc_gs;
	}
	set_pcb_flags(pcb, PCB_FULL_IRET);
	if (mcp->mc_flags & _MC_HASBASES) {
		pcb->pcb_fsbase = mcp->mc_fsbase;
		pcb->pcb_gsbase = mcp->mc_gsbase;
	}
	return (0);
}

static void
get_fpcontext(struct thread *td, mcontext_t *mcp, char *xfpusave,
    size_t xfpusave_len)
{
	size_t max_len, len;

	mcp->mc_ownedfp = fpugetregs(td);
	bcopy(get_pcb_user_save_td(td), &mcp->mc_fpstate[0],
	    sizeof(mcp->mc_fpstate));
	mcp->mc_fpformat = fpuformat();
	if (!use_xsave || xfpusave_len == 0)
		return;
	max_len = cpu_max_ext_state_size - sizeof(struct savefpu);
	len = xfpusave_len;
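	/*
	 * If the caller's buffer is larger than the extended state that
	 * will be copied out, zero the tail so no stale bytes leak out.
	 */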
	if (len > max_len) {
		bzero(xfpusave + max_len, len - max_len);
		len = max_len;
	}
	mcp->mc_flags |= _MC_HASFPXSTATE;
	mcp->mc_xfpustate_len = len;
	bcopy(get_pcb_user_save_td(td) + 1, xfpusave, len);
}

static int
set_fpcontext(struct thread *td, mcontext_t *mcp, char *xfpustate,
    size_t xfpustate_len)
{
	int error;

	if (mcp->mc_fpformat == _MC_FPFMT_NODEV)
		return (0);
	else if (mcp->mc_fpformat != _MC_FPFMT_XMM)
		return (EINVAL);
	else if (mcp->mc_ownedfp == _MC_FPOWNED_NONE) {
		/* We don't care what state is left in the FPU or PCB. */
		fpstate_drop(td);
		error = 0;
	} else if (mcp->mc_ownedfp == _MC_FPOWNED_FPU ||
	    mcp->mc_ownedfp == _MC_FPOWNED_PCB) {
		error = fpusetregs(td, (struct savefpu *)&mcp->mc_fpstate,
		    xfpustate, xfpustate_len);
	} else
		return (EINVAL);
	return (error);
}

void
fpstate_drop(struct thread *td)
{

	KASSERT(PCB_USER_FPU(td->td_pcb), ("fpstate_drop: kernel-owned fpu"));
	critical_enter();
	if (PCPU_GET(fpcurthread) == td)
		fpudrop();
	/*
	 * XXX force a full drop of the fpu.  The above only drops it if we
	 * owned it.
	 *
	 * XXX I don't much like fpugetuserregs()'s semantics of doing a full
	 * drop.  Dropping only to the pcb matches fnsave's behaviour.
	 * We only need to drop to !PCB_INITDONE in sendsig().  But
	 * sendsig() is the only caller of fpugetuserregs()... perhaps we just
	 * have too many layers.
	 */
	clear_pcb_flags(curthread->td_pcb,
	    PCB_FPUINITDONE | PCB_USERFPUINITDONE);
	critical_exit();
}

int
fill_dbregs(struct thread *td, struct dbreg *dbregs)
{
	struct pcb *pcb;

	if (td == NULL) {
		dbregs->dr[0] = rdr0();
		dbregs->dr[1] = rdr1();
		dbregs->dr[2] = rdr2();
		dbregs->dr[3] = rdr3();
		dbregs->dr[6] = rdr6();
		dbregs->dr[7] = rdr7();
	} else {
		pcb = td->td_pcb;
		dbregs->dr[0] = pcb->pcb_dr0;
		dbregs->dr[1] = pcb->pcb_dr1;
		dbregs->dr[2] = pcb->pcb_dr2;
		dbregs->dr[3] = pcb->pcb_dr3;
		dbregs->dr[6] = pcb->pcb_dr6;
		dbregs->dr[7] = pcb->pcb_dr7;
	}
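	/*
	 * DR4 and DR5 are reserved (they alias DR6 and DR7 when CR4.DE is
	 * clear) and DR8-DR15 do not exist, so report them as zero.
	 */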
	dbregs->dr[4] = 0;
	dbregs->dr[5] = 0;
	dbregs->dr[8] = 0;
	dbregs->dr[9] = 0;
	dbregs->dr[10] = 0;
	dbregs->dr[11] = 0;
	dbregs->dr[12] = 0;
	dbregs->dr[13] = 0;
	dbregs->dr[14] = 0;
	dbregs->dr[15] = 0;
	return (0);
}

int
set_dbregs(struct thread *td, struct dbreg *dbregs)
{
	struct pcb *pcb;
	int i;

	if (td == NULL) {
		load_dr0(dbregs->dr[0]);
		load_dr1(dbregs->dr[1]);
		load_dr2(dbregs->dr[2]);
		load_dr3(dbregs->dr[3]);
		load_dr6(dbregs->dr[6]);
		load_dr7(dbregs->dr[7]);
	} else {
		/*
		 * Don't let an illegal value for dr7 get set.  Specifically,
		 * check for undefined settings.  Setting these bit patterns
		 * results in undefined behaviour and can lead to an unexpected
		 * TRCTRAP or a general protection fault right here.
		 * The upper bits of dr6 and dr7 must not be set.
		 */
		for (i = 0; i < 4; i++) {
			if (DBREG_DR7_ACCESS(dbregs->dr[7], i) == 0x02)
				return (EINVAL);
			if (td->td_frame->tf_cs == _ucode32sel &&
			    DBREG_DR7_LEN(dbregs->dr[7], i) == DBREG_DR7_LEN_8)
				return (EINVAL);
		}
		if ((dbregs->dr[6] & 0xffffffff00000000ul) != 0 ||
		    (dbregs->dr[7] & 0xffffffff00000000ul) != 0)
			return (EINVAL);

		pcb = td->td_pcb;

		/*
		 * Don't let a process set a breakpoint that is not within the
		 * process's address space.  If a process could do this, it
		 * could halt the system by setting a breakpoint in the kernel
		 * (if ddb was enabled).  Thus, we need to check to make sure
		 * that no breakpoints are being enabled for addresses outside
		 * the process's address space.
		 *
		 * XXX - what about when the watched area of the user's
		 * address space is written into from within the kernel
		 * ... wouldn't that still cause a breakpoint to be generated
		 * from within kernel mode?
		 */

		if (DBREG_DR7_ENABLED(dbregs->dr[7], 0)) {
			/* dr0 is enabled */
			if (dbregs->dr[0] >= VM_MAXUSER_ADDRESS)
				return (EINVAL);
		}
		if (DBREG_DR7_ENABLED(dbregs->dr[7], 1)) {
			/* dr1 is enabled */
			if (dbregs->dr[1] >= VM_MAXUSER_ADDRESS)
				return (EINVAL);
		}
		if (DBREG_DR7_ENABLED(dbregs->dr[7], 2)) {
			/* dr2 is enabled */
			if (dbregs->dr[2] >= VM_MAXUSER_ADDRESS)
				return (EINVAL);
		}
		if (DBREG_DR7_ENABLED(dbregs->dr[7], 3)) {
			/* dr3 is enabled */
			if (dbregs->dr[3] >= VM_MAXUSER_ADDRESS)
				return (EINVAL);
		}

		pcb->pcb_dr0 = dbregs->dr[0];
		pcb->pcb_dr1 = dbregs->dr[1];
		pcb->pcb_dr2 = dbregs->dr[2];
		pcb->pcb_dr3 = dbregs->dr[3];
		pcb->pcb_dr6 = dbregs->dr[6];
		pcb->pcb_dr7 = dbregs->dr[7];

		set_pcb_flags(pcb, PCB_DBREGS);
	}

	return (0);
}
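
/*
 * A debugger arms a hardware breakpoint by writing an address into one of
 * dr0-dr3 and the matching enable/length/access bits into dr7, typically
 * via ptrace(PT_SETDBREGS, ...), which reaches set_dbregs() above; the
 * checks there reject encodings the CPU documents as undefined as well as
 * addresses outside user space.
 */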

void
reset_dbregs(void)
{

	load_dr7(0);	/* Turn off the control bits first */
	load_dr0(0);
	load_dr1(0);
	load_dr2(0);
	load_dr3(0);
	load_dr6(0);
}

/*
 * Return > 0 if a hardware breakpoint has been hit, and the
 * breakpoint was in user space.  Return 0 otherwise.
 */
int
user_dbreg_trap(register_t dr6)
{
	u_int64_t dr7;
	u_int64_t bp;		/* breakpoint bits extracted from dr6 */
	int nbp;		/* number of breakpoints that triggered */
	caddr_t addr[4];	/* breakpoint addresses */
	int i;

	bp = dr6 & DBREG_DR6_BMASK;
	if (bp == 0) {
		/*
		 * None of the breakpoint bits are set, meaning this
		 * trap was not caused by any of the debug registers.
		 */
		return (0);
	}

	dr7 = rdr7();
	if ((dr7 & 0x000000ff) == 0) {
		/*
		 * All of the local and global enable bits in the dr7
		 * register are zero, thus the trap couldn't have been
		 * caused by the hardware debug registers.
		 */
		return (0);
	}

	nbp = 0;

	/*
	 * At least one of the breakpoints was hit; check which ones and
	 * whether any of them are user space addresses.
	 */

	if (bp & 0x01) {
		addr[nbp++] = (caddr_t)rdr0();
	}
	if (bp & 0x02) {
		addr[nbp++] = (caddr_t)rdr1();
	}
	if (bp & 0x04) {
		addr[nbp++] = (caddr_t)rdr2();
	}
	if (bp & 0x08) {
		addr[nbp++] = (caddr_t)rdr3();
	}

	for (i = 0; i < nbp; i++) {
		if (addr[i] < (caddr_t)VM_MAXUSER_ADDRESS) {
			/*
			 * addr[i] is in user space.
			 */
			return (nbp);
		}
	}

	/*
	 * None of the breakpoints are in user space.
	 */
	return (0);
}

/*
 * The pcb_flags is only modified by the current thread, or by other threads
 * when the current thread is stopped.  However, the current thread may change
 * it from the interrupt context in cpu_switch(), or in the trap handler.
 * When we read-modify-write pcb_flags from C sources, the compiler may
 * generate code that is not atomic with respect to the interrupt handler.
 * If a trap or interrupt happens and any flag is modified from the handler,
 * it can be clobbered with the cached value later.  Therefore, we implement
 * setting and clearing flags with single-instruction functions, which do not
 * race with possible modification of the flags from the trap or interrupt
 * context, because traps and interrupts are executed only on instruction
 * boundaries.
 */
void
set_pcb_flags_raw(struct pcb *pcb, const u_int flags)
{

	__asm __volatile("orl %1,%0"
	    : "=m" (pcb->pcb_flags) : "ir" (flags), "m" (pcb->pcb_flags)
	    : "cc", "memory");
}
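
/*
 * By contrast, a plain C "pcb->pcb_flags |= flags" may compile into a
 * separate load, or, and store; an interrupt arriving between the load
 * and the store could modify pcb_flags, and the store would then
 * overwrite that modification.  The single "orl" above cannot be torn
 * that way.
 */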

/*
 * The support for RDFSBASE, WRFSBASE and similar instructions for the %gs
 * base requires that the kernel save MSR_FSBASE and MSR_{K,}GSBASE into the
 * pcb if user space modified the bases.  We must save on the context
 * switch or if the return to usermode happens through doreti.
 *
 * Tracking of both events is performed by the pcb flag PCB_FULL_IRET,
 * which has the consequence that the base MSRs must be saved each time
 * the PCB_FULL_IRET flag is set.  We disable interrupts to sync with
 * context switches.
 */
static void
set_pcb_flags_fsgsbase(struct pcb *pcb, const u_int flags)
{
	register_t r;

	if (curpcb == pcb &&
	    (flags & PCB_FULL_IRET) != 0 &&
	    (pcb->pcb_flags & PCB_FULL_IRET) == 0) {
		r = intr_disable();
		if ((pcb->pcb_flags & PCB_FULL_IRET) == 0) {
			if (rfs() == _ufssel)
				pcb->pcb_fsbase = rdfsbase();
			if (rgs() == _ugssel)
				pcb->pcb_gsbase = rdmsr(MSR_KGSBASE);
		}
		set_pcb_flags_raw(pcb, flags);
		intr_restore(r);
	} else {
		set_pcb_flags_raw(pcb, flags);
	}
}

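/*
 * set_pcb_flags is an ifunc: its resolver below runs once, early in boot,
 * and binds the symbol to the FSGSBASE-aware variant only when the CPU
 * advertises that feature.
 */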
DEFINE_IFUNC(, void, set_pcb_flags, (struct pcb *, const u_int))
{

	return ((cpu_stdext_feature & CPUID_STDEXT_FSGSBASE) != 0 ?
	    set_pcb_flags_fsgsbase : set_pcb_flags_raw);
}

void
clear_pcb_flags(struct pcb *pcb, const u_int flags)
{

	__asm __volatile("andl %1,%0"
	    : "=m" (pcb->pcb_flags) : "ir" (~flags), "m" (pcb->pcb_flags)
	    : "cc", "memory");
}

#ifdef KDB

/*
 * Provide inb() and outb() as functions.  They are normally only available as
 * inline functions, thus cannot be called from the debugger.
 */

/* silence compiler warnings */
u_char inb_(u_short);
void outb_(u_short, u_char);

u_char
inb_(u_short port)
{
	return inb(port);
}

void
outb_(u_short port, u_char data)
{
	outb(port, data);
}

#endif /* KDB */

#undef memset
#undef memmove
#undef memcpy

void	*memset_std(void *buf, int c, size_t len);
void	*memset_erms(void *buf, int c, size_t len);
void	*memmove_std(void * _Nonnull dst, const void * _Nonnull src,
	    size_t len);
void	*memmove_erms(void * _Nonnull dst, const void * _Nonnull src,
	    size_t len);
void	*memcpy_std(void * _Nonnull dst, const void * _Nonnull src,
	    size_t len);
void	*memcpy_erms(void * _Nonnull dst, const void * _Nonnull src,
	    size_t len);

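/*
 * The _erms variants rely on the Enhanced REP MOVSB/STOSB feature
 * (CPUID_STDEXT_ERMS), under which plain "rep movsb"/"rep stosb" is at
 * least as fast as more elaborate copy loops; the _std variants are the
 * fallback for CPUs without it.
 */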
#ifdef KCSAN
/*
 * These fail to build as ifuncs when used with KCSAN.
 */
void *
memset(void *buf, int c, size_t len)
{

	return (memset_std(buf, c, len));
}

void *
memmove(void * _Nonnull dst, const void * _Nonnull src, size_t len)
{

	return (memmove_std(dst, src, len));
}

void *
memcpy(void * _Nonnull dst, const void * _Nonnull src, size_t len)
{

	return (memcpy_std(dst, src, len));
}
#else
DEFINE_IFUNC(, void *, memset, (void *, int, size_t))
{

	return ((cpu_stdext_feature & CPUID_STDEXT_ERMS) != 0 ?
	    memset_erms : memset_std);
}

DEFINE_IFUNC(, void *, memmove, (void * _Nonnull, const void * _Nonnull,
    size_t))
{

	return ((cpu_stdext_feature & CPUID_STDEXT_ERMS) != 0 ?
	    memmove_erms : memmove_std);
}

DEFINE_IFUNC(, void *, memcpy, (void * _Nonnull, const void * _Nonnull,
    size_t))
{

	return ((cpu_stdext_feature & CPUID_STDEXT_ERMS) != 0 ?
	    memcpy_erms : memcpy_std);
}
#endif

void	pagezero_std(void *addr);
void	pagezero_erms(void *addr);
DEFINE_IFUNC(, void, pagezero, (void *))
{

	return ((cpu_stdext_feature & CPUID_STDEXT_ERMS) != 0 ?
	    pagezero_erms : pagezero_std);
}