/*-
 * SPDX-License-Identifier: BSD-4-Clause
 *
 * Copyright (c) 2003 Peter Wemm.
 * Copyright (c) 1992 Terrence R. Lambert.
 * Copyright (c) 1982, 1987, 1990 The Regents of the University of California.
 * All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * William Jolitz.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	from: @(#)machdep.c	7.4 (Berkeley) 6/3/91
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_atpic.h"
#include "opt_cpu.h"
#include "opt_ddb.h"
#include "opt_inet.h"
#include "opt_isa.h"
#include "opt_kstack_pages.h"
#include "opt_maxmem.h"
#include "opt_mp_watchdog.h"
#include "opt_pci.h"
#include "opt_platform.h"
#include "opt_sched.h"

#include <sys/param.h>
#include <sys/proc.h>
#include <sys/systm.h>
#include <sys/bio.h>
#include <sys/buf.h>
#include <sys/bus.h>
#include <sys/callout.h>
#include <sys/cons.h>
#include <sys/cpu.h>
#include <sys/csan.h>
#include <sys/efi.h>
#include <sys/eventhandler.h>
#include <sys/exec.h>
#include <sys/imgact.h>
#include <sys/kdb.h>
#include <sys/kernel.h>
#include <sys/ktr.h>
#include <sys/linker.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/memrange.h>
#include <sys/msgbuf.h>
#include <sys/mutex.h>
#include <sys/pcpu.h>
#include <sys/ptrace.h>
#include <sys/reboot.h>
#include <sys/rwlock.h>
#include <sys/sched.h>
#include <sys/signalvar.h>
#ifdef SMP
#include <sys/smp.h>
#endif
#include <sys/syscallsubr.h>
#include <sys/sysctl.h>
#include <sys/sysent.h>
#include <sys/sysproto.h>
#include <sys/ucontext.h>
#include <sys/vmmeter.h>

#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/vm_extern.h>
#include <vm/vm_kern.h>
#include <vm/vm_page.h>
#include <vm/vm_map.h>
#include <vm/vm_object.h>
#include <vm/vm_pager.h>
#include <vm/vm_phys.h>
#include <vm/vm_dumpset.h>

#ifdef DDB
#ifndef KDB
#error KDB must be enabled in order for DDB to work!
#endif
#include <ddb/ddb.h>
#include <ddb/db_sym.h>
#endif

#include <net/netisr.h>

#include <machine/clock.h>
#include <machine/cpu.h>
#include <machine/cputypes.h>
#include <machine/frame.h>
#include <machine/intr_machdep.h>
#include <x86/mca.h>
#include <machine/md_var.h>
#include <machine/metadata.h>
#include <machine/mp_watchdog.h>
#include <machine/pc/bios.h>
#include <machine/pcb.h>
#include <machine/proc.h>
#include <machine/reg.h>
#include <machine/sigframe.h>
#include <machine/specialreg.h>
#include <machine/trap.h>
#include <machine/tss.h>
#include <x86/ucode.h>
#include <x86/ifunc.h>
#ifdef SMP
#include <machine/smp.h>
#endif
#ifdef FDT
#include <x86/fdt.h>
#endif

#ifdef DEV_ATPIC
#include <x86/isa/icu.h>
#else
#include <x86/apicvar.h>
#endif

#include <isa/isareg.h>
#include <isa/rtc.h>
#include <x86/init.h>

/* Sanity check for __curthread() */
CTASSERT(offsetof(struct pcpu, pc_curthread) == 0);

/*
 * The PTI trampoline stack needs enough space for a hardware trapframe and a
 * couple of scratch registers, as well as the trapframe left behind after an
 * iret fault.
 */
CTASSERT(PC_PTI_STACK_SZ * sizeof(register_t) >= 2 * sizeof(struct pti_frame) -
    offsetof(struct pti_frame, pti_rip));
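
/*
 * A sketch of the sizing above: the trapframe left behind by an iret fault
 * consists only of the hardware-pushed part, which starts at pti_rip, so it
 * needs sizeof(struct pti_frame) - offsetof(struct pti_frame, pti_rip) bytes
 * in addition to one complete pti_frame.
 */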

extern u_int64_t hammer_time(u_int64_t, u_int64_t);

#define	CS_SECURE(cs)		(ISPL(cs) == SEL_UPL)
#define	EFL_SECURE(ef, oef)	((((ef) ^ (oef)) & ~PSL_USERCHANGE) == 0)
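
/*
 * These are used by sys_sigreturn() below: CS_SECURE() accepts only code
 * selectors with user privilege (RPL == SEL_UPL), and EFL_SECURE() accepts
 * a new rflags value only if every bit that differs from the old value is
 * in the user-changeable set (PSL_USERCHANGE).
 */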

static void cpu_startup(void *);
static void get_fpcontext(struct thread *td, mcontext_t *mcp,
    char *xfpusave, size_t xfpusave_len);
static int set_fpcontext(struct thread *td, mcontext_t *mcp,
    char *xfpustate, size_t xfpustate_len);
SYSINIT(cpu, SI_SUB_CPU, SI_ORDER_FIRST, cpu_startup, NULL);

/* Preload data parse function */
static caddr_t native_parse_preload_data(u_int64_t);

/* Native function to fetch and parse the e820 map */
static void native_parse_memmap(caddr_t, vm_paddr_t *, int *);

/* Default init_ops implementation. */
struct init_ops init_ops = {
	.parse_preload_data = native_parse_preload_data,
	.early_clock_source_init = i8254_init,
	.early_delay = i8254_delay,
	.parse_memmap = native_parse_memmap,
#ifdef SMP
	.mp_bootaddress = mp_bootaddress,
	.start_all_aps = native_start_all_aps,
#endif
#ifdef DEV_PCI
	.msi_init = msi_init,
#endif
};

/*
 * Physical address of the EFI System Table. Stashed from the metadata hints
 * passed into the kernel and used by the EFI code to call runtime services.
 */
vm_paddr_t efi_systbl_phys;

/* Intel ICH registers */
#define ICH_PMBASE	0x400
#define ICH_SMI_EN	(ICH_PMBASE + 0x30)
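/*
 * SMI_EN lives at PMBASE + 0x30; bit 3 is the LEGACY_USB_EN bit that
 * cpu_startup() clears below.
 */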

int _udatasel, _ucodesel, _ucode32sel, _ufssel, _ugssel;

int cold = 1;

long Maxmem = 0;
long realmem = 0;

struct kva_md_info kmi;

static struct trapframe proc0_tf;
struct region_descriptor r_idt;

struct pcpu *__pcpu;
struct pcpu temp_bsp_pcpu;

struct mtx icu_lock;

struct mem_range_softc mem_range_softc;

struct mtx dt_lock;	/* lock for GDT and LDT */

void (*vmm_resume_p)(void);

static void
cpu_startup(void *dummy)
{
	uintmax_t memsize;
	char *sysenv;

	/*
	 * On MacBooks, we need to disallow the legacy USB circuit to
	 * generate an SMI# because this can cause several problems,
	 * namely: incorrect CPU frequency detection and failure to
	 * start the APs.
	 * We do this by disabling a bit in the SMI_EN (SMI Control and
	 * Enable register) of the Intel ICH LPC Interface Bridge.
	 */
	sysenv = kern_getenv("smbios.system.product");
	if (sysenv != NULL) {
		if (strncmp(sysenv, "MacBook1,1", 10) == 0 ||
		    strncmp(sysenv, "MacBook3,1", 10) == 0 ||
		    strncmp(sysenv, "MacBook4,1", 10) == 0 ||
		    strncmp(sysenv, "MacBookPro1,1", 13) == 0 ||
		    strncmp(sysenv, "MacBookPro1,2", 13) == 0 ||
		    strncmp(sysenv, "MacBookPro3,1", 13) == 0 ||
		    strncmp(sysenv, "MacBookPro4,1", 13) == 0 ||
		    strncmp(sysenv, "Macmini1,1", 10) == 0) {
			if (bootverbose)
				printf("Disabling LEGACY_USB_EN bit on "
				    "Intel ICH.\n");
			outl(ICH_SMI_EN, inl(ICH_SMI_EN) & ~0x8);
		}
		freeenv(sysenv);
	}

	/*
	 * Good {morning,afternoon,evening,night}.
	 */
	startrtclock();
	printcpuinfo();

	/*
	 * Display physical memory if SMBIOS reports reasonable amount.
	 */
	memsize = 0;
	sysenv = kern_getenv("smbios.memory.enabled");
	if (sysenv != NULL) {
		memsize = (uintmax_t)strtoul(sysenv, (char **)NULL, 10) << 10;
		freeenv(sysenv);
	}
	if (memsize < ptoa((uintmax_t)vm_free_count()))
		memsize = ptoa((uintmax_t)Maxmem);
	printf("real memory = %ju (%ju MB)\n", memsize, memsize >> 20);
	realmem = atop(memsize);

	/*
	 * Display any holes after the first chunk of extended memory.
	 */
	if (bootverbose) {
		int indx;

		printf("Physical memory chunk(s):\n");
		for (indx = 0; phys_avail[indx + 1] != 0; indx += 2) {
			vm_paddr_t size;

			size = phys_avail[indx + 1] - phys_avail[indx];
			printf(
			    "0x%016jx - 0x%016jx, %ju bytes (%ju pages)\n",
			    (uintmax_t)phys_avail[indx],
			    (uintmax_t)phys_avail[indx + 1] - 1,
			    (uintmax_t)size, (uintmax_t)size / PAGE_SIZE);
		}
	}

	vm_ksubmap_init(&kmi);

	printf("avail memory = %ju (%ju MB)\n",
	    ptoa((uintmax_t)vm_free_count()),
	    ptoa((uintmax_t)vm_free_count()) / 1048576);
#ifdef DEV_PCI
	if (bootverbose && intel_graphics_stolen_base != 0)
		printf("intel stolen mem: base %#jx size %ju MB\n",
		    (uintmax_t)intel_graphics_stolen_base,
		    (uintmax_t)intel_graphics_stolen_size / 1024 / 1024);
#endif

	/*
	 * Set up buffers, so they can be used to read disk labels.
	 */
	bufinit();
	vm_pager_bufferinit();

	cpu_setregs();
}

static void
late_ifunc_resolve(void *dummy __unused)
{
	link_elf_late_ireloc();
}
SYSINIT(late_ifunc_resolve, SI_SUB_CPU, SI_ORDER_ANY, late_ifunc_resolve, NULL);

/*
 * Send an interrupt to process.
 *
 * Stack is set up to allow sigcode stored
 * at top to call routine, followed by call
 * to sigreturn routine below.  After sigreturn
 * resets the signal mask, the stack, and the
 * frame pointer, it returns to the user
 * specified pc, psl.
 */
void
sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask)
{
	struct sigframe sf, *sfp;
	struct pcb *pcb;
	struct proc *p;
	struct thread *td;
	struct sigacts *psp;
	char *sp;
	struct trapframe *regs;
	char *xfpusave;
	size_t xfpusave_len;
	int sig;
	int oonstack;

	td = curthread;
	pcb = td->td_pcb;
	p = td->td_proc;
	PROC_LOCK_ASSERT(p, MA_OWNED);
	sig = ksi->ksi_signo;
	psp = p->p_sigacts;
	mtx_assert(&psp->ps_mtx, MA_OWNED);
	regs = td->td_frame;
	oonstack = sigonstack(regs->tf_rsp);

	if (cpu_max_ext_state_size > sizeof(struct savefpu) && use_xsave) {
		xfpusave_len = cpu_max_ext_state_size - sizeof(struct savefpu);
		xfpusave = __builtin_alloca(xfpusave_len);
	} else {
		xfpusave_len = 0;
		xfpusave = NULL;
	}

	/* Save user context. */
	bzero(&sf, sizeof(sf));
	sf.sf_uc.uc_sigmask = *mask;
	sf.sf_uc.uc_stack = td->td_sigstk;
	sf.sf_uc.uc_stack.ss_flags = (td->td_pflags & TDP_ALTSTACK)
	    ? ((oonstack) ? SS_ONSTACK : 0) : SS_DISABLE;
	sf.sf_uc.uc_mcontext.mc_onstack = (oonstack) ? 1 : 0;
	bcopy(regs, &sf.sf_uc.uc_mcontext.mc_rdi, sizeof(*regs));
	sf.sf_uc.uc_mcontext.mc_len = sizeof(sf.sf_uc.uc_mcontext); /* magic */
	get_fpcontext(td, &sf.sf_uc.uc_mcontext, xfpusave, xfpusave_len);
	fpstate_drop(td);
	update_pcb_bases(pcb);
	sf.sf_uc.uc_mcontext.mc_fsbase = pcb->pcb_fsbase;
	sf.sf_uc.uc_mcontext.mc_gsbase = pcb->pcb_gsbase;
	bzero(sf.sf_uc.uc_mcontext.mc_spare,
	    sizeof(sf.sf_uc.uc_mcontext.mc_spare));

	/* Allocate space for the signal handler context. */
	if ((td->td_pflags & TDP_ALTSTACK) != 0 && !oonstack &&
	    SIGISMEMBER(psp->ps_sigonstack, sig)) {
		sp = (char *)td->td_sigstk.ss_sp + td->td_sigstk.ss_size;
#if defined(COMPAT_43)
		td->td_sigstk.ss_flags |= SS_ONSTACK;
#endif
	} else
		sp = (char *)regs->tf_rsp - 128;
	if (xfpusave != NULL) {
		sp -= xfpusave_len;
		sp = (char *)((unsigned long)sp & ~0x3Ful);
		sf.sf_uc.uc_mcontext.mc_xfpustate = (register_t)sp;
	}
	sp -= sizeof(struct sigframe);
	/* Align to 16 bytes. */
	sfp = (struct sigframe *)((unsigned long)sp & ~0xFul);

	/* Build the argument list for the signal handler. */
	regs->tf_rdi = sig; /* arg 1 in %rdi */
	regs->tf_rdx = (register_t)&sfp->sf_uc; /* arg 3 in %rdx */
	bzero(&sf.sf_si, sizeof(sf.sf_si));
	if (SIGISMEMBER(psp->ps_siginfo, sig)) {
		/* Signal handler installed with SA_SIGINFO. */
		regs->tf_rsi = (register_t)&sfp->sf_si; /* arg 2 in %rsi */
		sf.sf_ahu.sf_action = (__siginfohandler_t *)catcher;

		/* Fill in POSIX parts */
		sf.sf_si = ksi->ksi_info;
		sf.sf_si.si_signo = sig; /* maybe a translated signal */
		regs->tf_rcx = (register_t)ksi->ksi_addr; /* arg 4 in %rcx */
	} else {
		/* Old FreeBSD-style arguments. */
		regs->tf_rsi = ksi->ksi_code; /* arg 2 in %rsi */
		regs->tf_rcx = (register_t)ksi->ksi_addr; /* arg 4 in %rcx */
		sf.sf_ahu.sf_handler = catcher;
	}
	mtx_unlock(&psp->ps_mtx);
	PROC_UNLOCK(p);

	/*
	 * Copy the sigframe out to the user's stack.
	 */
	if (copyout(&sf, sfp, sizeof(*sfp)) != 0 ||
	    (xfpusave != NULL && copyout(xfpusave,
	    (void *)sf.sf_uc.uc_mcontext.mc_xfpustate, xfpusave_len)
	    != 0)) {
#ifdef DEBUG
		printf("process %ld has trashed its stack\n", (long)p->p_pid);
#endif
		PROC_LOCK(p);
		sigexit(td, SIGILL);
	}

	regs->tf_rsp = (long)sfp;
	regs->tf_rip = p->p_sysent->sv_sigcode_base;
	regs->tf_rflags &= ~(PSL_T | PSL_D);
	regs->tf_cs = _ucodesel;
	regs->tf_ds = _udatasel;
	regs->tf_ss = _udatasel;
	regs->tf_es = _udatasel;
	regs->tf_fs = _ufssel;
	regs->tf_gs = _ugssel;
	regs->tf_flags = TF_HASSEGS;
	PROC_LOCK(p);
	mtx_lock(&psp->ps_mtx);
}
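
/*
 * A sketch of the user stack constructed above, for a handler not using the
 * alternate signal stack (addresses decreasing downward):
 *
 *	old %rsp
 *	128-byte red zone, skipped
 *	extended FPU state, if any, aligned down to 64 bytes (mc_xfpustate)
 *	struct sigframe, aligned down to 16 bytes  <- sfp, new %rsp
 *
 * The handler arguments travel in %rdi, %rsi, %rdx and %rcx as set above.
 */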

/*
 * System call to cleanup state after a signal
 * has been taken.  Reset signal mask and
 * stack state from context left by sendsig (above).
 * Return to previous pc and psl as specified by
 * context left by sendsig.  Check carefully to
 * make sure that the user has not modified the
 * state to gain improper privileges.
 *
 * MPSAFE
 */
int
sys_sigreturn(struct thread *td, struct sigreturn_args /* {
	const struct __ucontext *sigcntxp;
} */ *uap)
{
	ucontext_t uc;
	struct pcb *pcb;
	struct proc *p;
	struct trapframe *regs;
	ucontext_t *ucp;
	char *xfpustate;
	size_t xfpustate_len;
	long rflags;
	int cs, error, ret;
	ksiginfo_t ksi;

	pcb = td->td_pcb;
	p = td->td_proc;

	error = copyin(uap->sigcntxp, &uc, sizeof(uc));
	if (error != 0) {
		uprintf("pid %d (%s): sigreturn copyin failed\n",
		    p->p_pid, td->td_name);
		return (error);
	}
	ucp = &uc;
	if ((ucp->uc_mcontext.mc_flags & ~_MC_FLAG_MASK) != 0) {
		uprintf("pid %d (%s): sigreturn mc_flags %x\n", p->p_pid,
		    td->td_name, ucp->uc_mcontext.mc_flags);
		return (EINVAL);
	}
	regs = td->td_frame;
	rflags = ucp->uc_mcontext.mc_rflags;
	/*
	 * Don't allow users to change privileged or reserved flags.
	 */
	if (!EFL_SECURE(rflags, regs->tf_rflags)) {
		uprintf("pid %d (%s): sigreturn rflags = 0x%lx\n", p->p_pid,
		    td->td_name, rflags);
		return (EINVAL);
	}

	/*
	 * Don't allow users to load a valid privileged %cs.  Let the
	 * hardware check for invalid selectors, excess privilege in
	 * other selectors, invalid %eip's and invalid %esp's.
	 */
	cs = ucp->uc_mcontext.mc_cs;
	if (!CS_SECURE(cs)) {
		uprintf("pid %d (%s): sigreturn cs = 0x%x\n", p->p_pid,
		    td->td_name, cs);
		ksiginfo_init_trap(&ksi);
		ksi.ksi_signo = SIGBUS;
		ksi.ksi_code = BUS_OBJERR;
		ksi.ksi_trapno = T_PROTFLT;
		ksi.ksi_addr = (void *)regs->tf_rip;
		trapsignal(td, &ksi);
		return (EINVAL);
	}

	if ((uc.uc_mcontext.mc_flags & _MC_HASFPXSTATE) != 0) {
		xfpustate_len = uc.uc_mcontext.mc_xfpustate_len;
		if (xfpustate_len > cpu_max_ext_state_size -
		    sizeof(struct savefpu)) {
			uprintf("pid %d (%s): sigreturn xfpusave_len = 0x%zx\n",
			    p->p_pid, td->td_name, xfpustate_len);
			return (EINVAL);
		}
		xfpustate = __builtin_alloca(xfpustate_len);
		error = copyin((const void *)uc.uc_mcontext.mc_xfpustate,
		    xfpustate, xfpustate_len);
		if (error != 0) {
			uprintf(
	"pid %d (%s): sigreturn copying xfpustate failed\n",
			    p->p_pid, td->td_name);
			return (error);
		}
	} else {
		xfpustate = NULL;
		xfpustate_len = 0;
	}
	ret = set_fpcontext(td, &ucp->uc_mcontext, xfpustate, xfpustate_len);
	if (ret != 0) {
		uprintf("pid %d (%s): sigreturn set_fpcontext err %d\n",
		    p->p_pid, td->td_name, ret);
		return (ret);
	}
	bcopy(&ucp->uc_mcontext.mc_rdi, regs, sizeof(*regs));
	update_pcb_bases(pcb);
	pcb->pcb_fsbase = ucp->uc_mcontext.mc_fsbase;
	pcb->pcb_gsbase = ucp->uc_mcontext.mc_gsbase;

#if defined(COMPAT_43)
	if (ucp->uc_mcontext.mc_onstack & 1)
		td->td_sigstk.ss_flags |= SS_ONSTACK;
	else
		td->td_sigstk.ss_flags &= ~SS_ONSTACK;
#endif

	kern_sigprocmask(td, SIG_SETMASK, &ucp->uc_sigmask, NULL, 0);
	return (EJUSTRETURN);
}

#ifdef COMPAT_FREEBSD4
int
freebsd4_sigreturn(struct thread *td, struct freebsd4_sigreturn_args *uap)
{

	return sys_sigreturn(td, (struct sigreturn_args *)uap);
}
#endif

/*
 * Reset registers to default values on exec.
 */
void
exec_setregs(struct thread *td, struct image_params *imgp, uintptr_t stack)
{
	struct trapframe *regs;
	struct pcb *pcb;
	register_t saved_rflags;

	regs = td->td_frame;
	pcb = td->td_pcb;

	if (td->td_proc->p_md.md_ldt != NULL)
		user_ldt_free(td);

	update_pcb_bases(pcb);
	pcb->pcb_fsbase = 0;
	pcb->pcb_gsbase = 0;
	clear_pcb_flags(pcb, PCB_32BIT);
	pcb->pcb_initial_fpucw = __INITIAL_FPUCW__;

	saved_rflags = regs->tf_rflags & PSL_T;
	bzero((char *)regs, sizeof(struct trapframe));
	regs->tf_rip = imgp->entry_addr;
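	/*
	 * Align the stack as at a function entry: after a call instruction,
	 * %rsp + 8 is a multiple of 16, so hand the new image a stack
	 * pointer with (%rsp & 0xf) == 8.
	 */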
	regs->tf_rsp = ((stack - 8) & ~0xFul) + 8;
	regs->tf_rdi = stack; /* argv */
	regs->tf_rflags = PSL_USER | saved_rflags;
	regs->tf_ss = _udatasel;
	regs->tf_cs = _ucodesel;
	regs->tf_ds = _udatasel;
	regs->tf_es = _udatasel;
	regs->tf_fs = _ufssel;
	regs->tf_gs = _ugssel;
	regs->tf_flags = TF_HASSEGS;

	/*
	 * Reset the hardware debug registers if they were in use.
	 * They won't have any meaning for the newly exec'd process.
	 */
	if (pcb->pcb_flags & PCB_DBREGS) {
		pcb->pcb_dr0 = 0;
		pcb->pcb_dr1 = 0;
		pcb->pcb_dr2 = 0;
		pcb->pcb_dr3 = 0;
		pcb->pcb_dr6 = 0;
		pcb->pcb_dr7 = 0;
		if (pcb == curpcb) {
			/*
			 * Clear the debug registers on the running
			 * CPU, otherwise they will end up affecting
			 * the next process we switch to.
			 */
			reset_dbregs();
		}
		clear_pcb_flags(pcb, PCB_DBREGS);
	}

	/*
	 * Drop the FP state if we hold it, so that the process gets a
	 * clean FP state if it uses the FPU again.
	 */
	fpstate_drop(td);
}

void
cpu_setregs(void)
{
	register_t cr0;

	cr0 = rcr0();
	/*
	 * CR0_MP, CR0_NE and CR0_TS are also set by npx_probe() for the
	 * BSP.  See the comments there about why we set them.
	 */
	cr0 |= CR0_MP | CR0_NE | CR0_TS | CR0_WP | CR0_AM;
	load_cr0(cr0);
}

/*
 * Initialize amd64 and configure to run kernel
 */

/*
 * Initialize segments & interrupt table
 */
static struct gate_descriptor idt0[NIDT];
struct gate_descriptor *idt = &idt0[0];	/* interrupt descriptor table */

static char dblfault_stack[DBLFAULT_STACK_SIZE] __aligned(16);
static char mce0_stack[MCE_STACK_SIZE] __aligned(16);
static char nmi0_stack[NMI_STACK_SIZE] __aligned(16);
static char dbg0_stack[DBG_STACK_SIZE] __aligned(16);
CTASSERT(sizeof(struct nmi_pcpu) == 16);

/*
 * Software prototypes -- in more palatable form.
 *
 * Keep GUFS32, GUGS32, GUCODE32 and GUDATA at the same
 * slots as corresponding segments for i386 kernel.
 */
struct soft_segment_descriptor gdt_segs[] = {
	/* GNULL_SEL 0 Null Descriptor */
	{ .ssd_base = 0x0,
	  .ssd_limit = 0x0,
	  .ssd_type = 0,
	  .ssd_dpl = 0,
	  .ssd_p = 0,
	  .ssd_long = 0,
	  .ssd_def32 = 0,
	  .ssd_gran = 0 },
	/* GNULL2_SEL 1 Null Descriptor */
	{ .ssd_base = 0x0,
	  .ssd_limit = 0x0,
	  .ssd_type = 0,
	  .ssd_dpl = 0,
	  .ssd_p = 0,
	  .ssd_long = 0,
	  .ssd_def32 = 0,
	  .ssd_gran = 0 },
	/* GUFS32_SEL 2 32 bit %fs Descriptor for user */
	{ .ssd_base = 0x0,
	  .ssd_limit = 0xfffff,
	  .ssd_type = SDT_MEMRWA,
	  .ssd_dpl = SEL_UPL,
	  .ssd_p = 1,
	  .ssd_long = 0,
	  .ssd_def32 = 1,
	  .ssd_gran = 1 },
	/* GUGS32_SEL 3 32 bit %gs Descriptor for user */
	{ .ssd_base = 0x0,
	  .ssd_limit = 0xfffff,
	  .ssd_type = SDT_MEMRWA,
	  .ssd_dpl = SEL_UPL,
	  .ssd_p = 1,
	  .ssd_long = 0,
	  .ssd_def32 = 1,
	  .ssd_gran = 1 },
	/* GCODE_SEL 4 Code Descriptor for kernel */
	{ .ssd_base = 0x0,
	  .ssd_limit = 0xfffff,
	  .ssd_type = SDT_MEMERA,
	  .ssd_dpl = SEL_KPL,
	  .ssd_p = 1,
	  .ssd_long = 1,
	  .ssd_def32 = 0,
	  .ssd_gran = 1 },
	/* GDATA_SEL 5 Data Descriptor for kernel */
	{ .ssd_base = 0x0,
	  .ssd_limit = 0xfffff,
	  .ssd_type = SDT_MEMRWA,
	  .ssd_dpl = SEL_KPL,
	  .ssd_p = 1,
	  .ssd_long = 1,
	  .ssd_def32 = 0,
	  .ssd_gran = 1 },
	/* GUCODE32_SEL 6 32 bit Code Descriptor for user */
	{ .ssd_base = 0x0,
	  .ssd_limit = 0xfffff,
	  .ssd_type = SDT_MEMERA,
	  .ssd_dpl = SEL_UPL,
	  .ssd_p = 1,
	  .ssd_long = 0,
	  .ssd_def32 = 1,
	  .ssd_gran = 1 },
	/* GUDATA_SEL 7 32/64 bit Data Descriptor for user */
	{ .ssd_base = 0x0,
	  .ssd_limit = 0xfffff,
	  .ssd_type = SDT_MEMRWA,
	  .ssd_dpl = SEL_UPL,
	  .ssd_p = 1,
	  .ssd_long = 0,
	  .ssd_def32 = 1,
	  .ssd_gran = 1 },
	/* GUCODE_SEL 8 64 bit Code Descriptor for user */
	{ .ssd_base = 0x0,
	  .ssd_limit = 0xfffff,
	  .ssd_type = SDT_MEMERA,
	  .ssd_dpl = SEL_UPL,
	  .ssd_p = 1,
	  .ssd_long = 1,
	  .ssd_def32 = 0,
	  .ssd_gran = 1 },
	/* GPROC0_SEL 9 Proc 0 Tss Descriptor */
	{ .ssd_base = 0x0,
	  .ssd_limit = sizeof(struct amd64tss) + IOPERM_BITMAP_SIZE - 1,
	  .ssd_type = SDT_SYSTSS,
	  .ssd_dpl = SEL_KPL,
	  .ssd_p = 1,
	  .ssd_long = 0,
	  .ssd_def32 = 0,
	  .ssd_gran = 0 },
	/* Actually, the TSS is a system descriptor which is double size */
	{ .ssd_base = 0x0,
	  .ssd_limit = 0x0,
	  .ssd_type = 0,
	  .ssd_dpl = 0,
	  .ssd_p = 0,
	  .ssd_long = 0,
	  .ssd_def32 = 0,
	  .ssd_gran = 0 },
	/* GUSERLDT_SEL 11 LDT Descriptor */
	{ .ssd_base = 0x0,
	  .ssd_limit = 0x0,
	  .ssd_type = 0,
	  .ssd_dpl = 0,
	  .ssd_p = 0,
	  .ssd_long = 0,
	  .ssd_def32 = 0,
	  .ssd_gran = 0 },
	/* GUSERLDT_SEL 12 LDT Descriptor, double size */
	{ .ssd_base = 0x0,
	  .ssd_limit = 0x0,
	  .ssd_type = 0,
	  .ssd_dpl = 0,
	  .ssd_p = 0,
	  .ssd_long = 0,
	  .ssd_def32 = 0,
	  .ssd_gran = 0 },
};
_Static_assert(nitems(gdt_segs) == NGDT, "Stale NGDT");

void
setidt(int idx, inthand_t *func, int typ, int dpl, int ist)
{
	struct gate_descriptor *ip;

	ip = idt + idx;
	ip->gd_looffset = (uintptr_t)func;
	ip->gd_selector = GSEL(GCODE_SEL, SEL_KPL);
	ip->gd_ist = ist;
	ip->gd_xx = 0;
	ip->gd_type = typ;
	ip->gd_dpl = dpl;
	ip->gd_p = 1;
	ip->gd_hioffset = ((uintptr_t)func) >> 16;
}

extern inthand_t
	IDTVEC(div), IDTVEC(dbg), IDTVEC(nmi), IDTVEC(bpt), IDTVEC(ofl),
	IDTVEC(bnd), IDTVEC(ill), IDTVEC(dna), IDTVEC(fpusegm),
	IDTVEC(tss), IDTVEC(missing), IDTVEC(stk), IDTVEC(prot),
	IDTVEC(page), IDTVEC(mchk), IDTVEC(rsvd), IDTVEC(fpu), IDTVEC(align),
	IDTVEC(xmm), IDTVEC(dblfault),
	IDTVEC(div_pti), IDTVEC(bpt_pti),
	IDTVEC(ofl_pti), IDTVEC(bnd_pti), IDTVEC(ill_pti), IDTVEC(dna_pti),
	IDTVEC(fpusegm_pti), IDTVEC(tss_pti), IDTVEC(missing_pti),
	IDTVEC(stk_pti), IDTVEC(prot_pti), IDTVEC(page_pti),
	IDTVEC(rsvd_pti), IDTVEC(fpu_pti), IDTVEC(align_pti),
	IDTVEC(xmm_pti),
#ifdef KDTRACE_HOOKS
	IDTVEC(dtrace_ret), IDTVEC(dtrace_ret_pti),
#endif
#ifdef XENHVM
	IDTVEC(xen_intr_upcall), IDTVEC(xen_intr_upcall_pti),
#endif
	IDTVEC(fast_syscall), IDTVEC(fast_syscall32),
	IDTVEC(fast_syscall_pti);

#ifdef DDB
/*
 * Display the index and function name of any IDT entries that don't use
 * the default 'rsvd' entry point.
 */
DB_SHOW_COMMAND(idt, db_show_idt)
{
	struct gate_descriptor *ip;
	int idx;
	uintptr_t func;

	ip = idt;
	for (idx = 0; idx < NIDT && !db_pager_quit; idx++) {
		func = ((long)ip->gd_hioffset << 16 | ip->gd_looffset);
		if (func != (uintptr_t)&IDTVEC(rsvd)) {
			db_printf("%3d\t", idx);
			db_printsym(func, DB_STGY_PROC);
			db_printf("\n");
		}
		ip++;
	}
}

/* Show privileged registers. */
DB_SHOW_COMMAND(sysregs, db_show_sysregs)
{
	struct {
		uint16_t limit;
		uint64_t base;
	} __packed idtr, gdtr;
	uint16_t ldt, tr;

	__asm __volatile("sidt %0" : "=m" (idtr));
	db_printf("idtr\t0x%016lx/%04x\n",
	    (u_long)idtr.base, (u_int)idtr.limit);
	__asm __volatile("sgdt %0" : "=m" (gdtr));
	db_printf("gdtr\t0x%016lx/%04x\n",
	    (u_long)gdtr.base, (u_int)gdtr.limit);
	__asm __volatile("sldt %0" : "=r" (ldt));
	db_printf("ldtr\t0x%04x\n", ldt);
	__asm __volatile("str %0" : "=r" (tr));
	db_printf("tr\t0x%04x\n", tr);
	db_printf("cr0\t0x%016lx\n", rcr0());
	db_printf("cr2\t0x%016lx\n", rcr2());
	db_printf("cr3\t0x%016lx\n", rcr3());
	db_printf("cr4\t0x%016lx\n", rcr4());
	if (rcr4() & CR4_XSAVE)
		db_printf("xcr0\t0x%016lx\n", rxcr(0));
	db_printf("EFER\t0x%016lx\n", rdmsr(MSR_EFER));
	if (cpu_feature2 & (CPUID2_VMX | CPUID2_SMX))
		db_printf("FEATURES_CTL\t%016lx\n",
		    rdmsr(MSR_IA32_FEATURE_CONTROL));
	db_printf("DEBUG_CTL\t0x%016lx\n", rdmsr(MSR_DEBUGCTLMSR));
	db_printf("PAT\t0x%016lx\n", rdmsr(MSR_PAT));
	db_printf("GSBASE\t0x%016lx\n", rdmsr(MSR_GSBASE));
}

DB_SHOW_COMMAND(dbregs, db_show_dbregs)
{

	db_printf("dr0\t0x%016lx\n", rdr0());
	db_printf("dr1\t0x%016lx\n", rdr1());
	db_printf("dr2\t0x%016lx\n", rdr2());
	db_printf("dr3\t0x%016lx\n", rdr3());
	db_printf("dr6\t0x%016lx\n", rdr6());
	db_printf("dr7\t0x%016lx\n", rdr7());
}
#endif

void
sdtossd(struct user_segment_descriptor *sd,
    struct soft_segment_descriptor *ssd)
{

	ssd->ssd_base = (sd->sd_hibase << 24) | sd->sd_lobase;
	ssd->ssd_limit = (sd->sd_hilimit << 16) | sd->sd_lolimit;
	ssd->ssd_type = sd->sd_type;
	ssd->ssd_dpl = sd->sd_dpl;
	ssd->ssd_p = sd->sd_p;
	ssd->ssd_long = sd->sd_long;
	ssd->ssd_def32 = sd->sd_def32;
	ssd->ssd_gran = sd->sd_gran;
}

void
ssdtosd(struct soft_segment_descriptor *ssd,
    struct user_segment_descriptor *sd)
{

	sd->sd_lobase = (ssd->ssd_base) & 0xffffff;
	sd->sd_hibase = (ssd->ssd_base >> 24) & 0xff;
	sd->sd_lolimit = (ssd->ssd_limit) & 0xffff;
	sd->sd_hilimit = (ssd->ssd_limit >> 16) & 0xf;
	sd->sd_type = ssd->ssd_type;
	sd->sd_dpl = ssd->ssd_dpl;
	sd->sd_p = ssd->ssd_p;
	sd->sd_long = ssd->ssd_long;
	sd->sd_def32 = ssd->ssd_def32;
	sd->sd_gran = ssd->ssd_gran;
}

void
ssdtosyssd(struct soft_segment_descriptor *ssd,
    struct system_segment_descriptor *sd)
{

	sd->sd_lobase = (ssd->ssd_base) & 0xffffff;
	sd->sd_hibase = (ssd->ssd_base >> 24) & 0xfffffffffful;
	sd->sd_lolimit = (ssd->ssd_limit) & 0xffff;
	sd->sd_hilimit = (ssd->ssd_limit >> 16) & 0xf;
	sd->sd_type = ssd->ssd_type;
	sd->sd_dpl = ssd->ssd_dpl;
	sd->sd_p = ssd->ssd_p;
	sd->sd_gran = ssd->ssd_gran;
}

#if !defined(DEV_ATPIC) && defined(DEV_ISA)
#include <isa/isavar.h>
#include <isa/isareg.h>
/*
 * Return a bitmap of the current interrupt requests.  This is 8259-specific
 * and is only suitable for use at probe time.
 * This is only here to pacify sio.  It is NOT FATAL if this doesn't work.
 * It shouldn't be here.  There should probably be an APIC centric
 * implementation in the apic driver code, if at all.
 */
intrmask_t
isa_irq_pending(void)
{
	u_char irr1;
	u_char irr2;

	irr1 = inb(IO_ICU1);
	irr2 = inb(IO_ICU2);
	return ((irr2 << 8) | irr1);
}
#endif

u_int basemem;

static int
add_physmap_entry(uint64_t base, uint64_t length, vm_paddr_t *physmap,
    int *physmap_idxp)
{
	int i, insert_idx, physmap_idx;

	physmap_idx = *physmap_idxp;

	if (length == 0)
		return (1);

	/*
	 * Find insertion point while checking for overlap.  Start off by
	 * assuming the new entry will be added to the end.
	 *
	 * NB: physmap_idx points to the next free slot.
	 */
	insert_idx = physmap_idx;
	for (i = 0; i <= physmap_idx; i += 2) {
		if (base < physmap[i + 1]) {
			if (base + length <= physmap[i]) {
				insert_idx = i;
				break;
			}
			if (boothowto & RB_VERBOSE)
				printf(
		    "Overlapping memory regions, ignoring second region\n");
			return (1);
		}
	}

	/* See if we can prepend to the next entry. */
	if (insert_idx <= physmap_idx && base + length == physmap[insert_idx]) {
		physmap[insert_idx] = base;
		return (1);
	}

	/* See if we can append to the previous entry. */
	if (insert_idx > 0 && base == physmap[insert_idx - 1]) {
		physmap[insert_idx - 1] += length;
		return (1);
	}

	physmap_idx += 2;
	*physmap_idxp = physmap_idx;
	if (physmap_idx == PHYS_AVAIL_ENTRIES) {
		printf(
		"Too many segments in the physical address map, giving up\n");
		return (0);
	}

	/*
	 * Move the last 'N' entries down to make room for the new
	 * entry if needed.
	 */
	for (i = (physmap_idx - 2); i > insert_idx; i -= 2) {
		physmap[i] = physmap[i - 2];
		physmap[i + 1] = physmap[i - 1];
	}

	/* Insert the new entry. */
	physmap[insert_idx] = base;
	physmap[insert_idx + 1] = base + length;
	return (1);
}
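
/*
 * A worked example with hypothetical values: starting from
 * physmap = { 0x1000, 0x9f000 } (physmap_idx = 2), adding base 0x100000
 * length 0x100000 appends a new pair, and a following add of base 0x200000
 * length 0x100000 hits the "append to the previous entry" case, leaving
 * physmap = { 0x1000, 0x9f000, 0x100000, 0x300000 }.
 */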

void
bios_add_smap_entries(struct bios_smap *smapbase, u_int32_t smapsize,
    vm_paddr_t *physmap, int *physmap_idx)
{
	struct bios_smap *smap, *smapend;

	smapend = (struct bios_smap *)((uintptr_t)smapbase + smapsize);

	for (smap = smapbase; smap < smapend; smap++) {
		if (boothowto & RB_VERBOSE)
			printf("SMAP type=%02x base=%016lx len=%016lx\n",
			    smap->type, smap->base, smap->length);

		if (smap->type != SMAP_TYPE_MEMORY)
			continue;

		if (!add_physmap_entry(smap->base, smap->length, physmap,
		    physmap_idx))
			break;
	}
}

static void
add_efi_map_entries(struct efi_map_header *efihdr, vm_paddr_t *physmap,
    int *physmap_idx)
{
	struct efi_md *map, *p;
	const char *type;
	size_t efisz;
	int ndesc, i;

	static const char *types[] = {
		"Reserved",
		"LoaderCode",
		"LoaderData",
		"BootServicesCode",
		"BootServicesData",
		"RuntimeServicesCode",
		"RuntimeServicesData",
		"ConventionalMemory",
		"UnusableMemory",
		"ACPIReclaimMemory",
		"ACPIMemoryNVS",
		"MemoryMappedIO",
		"MemoryMappedIOPortSpace",
		"PalCode",
		"PersistentMemory"
	};

	/*
	 * Memory map data provided by UEFI via the GetMemoryMap
	 * Boot Services API.
	 */
	efisz = (sizeof(struct efi_map_header) + 0xf) & ~0xf;
	map = (struct efi_md *)((uint8_t *)efihdr + efisz);
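
	/*
	 * The header is padded to a 16-byte boundary (efisz) and the
	 * descriptor array follows it.  descriptor_size comes from the
	 * firmware and may be larger than sizeof(struct efi_md), which is
	 * why the loop below advances with efi_next_descriptor() rather
	 * than plain pointer arithmetic.
	 */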

	if (efihdr->descriptor_size == 0)
		return;
	ndesc = efihdr->memory_size / efihdr->descriptor_size;

	if (boothowto & RB_VERBOSE)
		printf("%23s %12s %12s %8s %4s\n",
		    "Type", "Physical", "Virtual", "#Pages", "Attr");

	for (i = 0, p = map; i < ndesc; i++,
	    p = efi_next_descriptor(p, efihdr->descriptor_size)) {
		if (boothowto & RB_VERBOSE) {
			if (p->md_type < nitems(types))
				type = types[p->md_type];
			else
				type = "<INVALID>";
			printf("%23s %012lx %12p %08lx ", type, p->md_phys,
			    p->md_virt, p->md_pages);
			if (p->md_attr & EFI_MD_ATTR_UC)
				printf("UC ");
			if (p->md_attr & EFI_MD_ATTR_WC)
				printf("WC ");
			if (p->md_attr & EFI_MD_ATTR_WT)
				printf("WT ");
			if (p->md_attr & EFI_MD_ATTR_WB)
				printf("WB ");
			if (p->md_attr & EFI_MD_ATTR_UCE)
				printf("UCE ");
			if (p->md_attr & EFI_MD_ATTR_WP)
				printf("WP ");
			if (p->md_attr & EFI_MD_ATTR_RP)
				printf("RP ");
			if (p->md_attr & EFI_MD_ATTR_XP)
				printf("XP ");
			if (p->md_attr & EFI_MD_ATTR_NV)
				printf("NV ");
			if (p->md_attr & EFI_MD_ATTR_MORE_RELIABLE)
				printf("MORE_RELIABLE ");
			if (p->md_attr & EFI_MD_ATTR_RO)
				printf("RO ");
			if (p->md_attr & EFI_MD_ATTR_RT)
				printf("RUNTIME");
			printf("\n");
		}

		switch (p->md_type) {
		case EFI_MD_TYPE_CODE:
		case EFI_MD_TYPE_DATA:
		case EFI_MD_TYPE_BS_CODE:
		case EFI_MD_TYPE_BS_DATA:
		case EFI_MD_TYPE_FREE:
			/*
			 * We're allowed to use any entry with these types.
			 */
			break;
		default:
			continue;
		}

		if (!add_physmap_entry(p->md_phys, (p->md_pages * PAGE_SIZE),
		    physmap, physmap_idx))
			break;
	}
}

static char bootmethod[16] = "";
SYSCTL_STRING(_machdep, OID_AUTO, bootmethod, CTLFLAG_RD, bootmethod, 0,
    "System firmware boot method");

static void
native_parse_memmap(caddr_t kmdp, vm_paddr_t *physmap, int *physmap_idx)
{
	struct bios_smap *smap;
	struct efi_map_header *efihdr;
	u_int32_t size;

	/*
	 * Memory map from INT 15:E820.
	 *
	 * subr_module.c says:
	 * "Consumer may safely assume that size value precedes data."
	 * ie: an int32_t immediately precedes smap.
	 */

	efihdr = (struct efi_map_header *)preload_search_info(kmdp,
	    MODINFO_METADATA | MODINFOMD_EFI_MAP);
	smap = (struct bios_smap *)preload_search_info(kmdp,
	    MODINFO_METADATA | MODINFOMD_SMAP);
	if (efihdr == NULL && smap == NULL)
		panic("No BIOS smap or EFI map info from loader!");

	if (efihdr != NULL) {
		add_efi_map_entries(efihdr, physmap, physmap_idx);
		strlcpy(bootmethod, "UEFI", sizeof(bootmethod));
	} else {
		size = *((u_int32_t *)smap - 1);
		bios_add_smap_entries(smap, size, physmap, physmap_idx);
		strlcpy(bootmethod, "BIOS", sizeof(bootmethod));
	}
}

#define	PAGES_PER_GB	(1024 * 1024 * 1024 / PAGE_SIZE)

/*
 * Populate the (physmap) array with base/bound pairs describing the
 * available physical memory in the system, then test this memory and
 * build the phys_avail array describing the actually-available memory.
 *
 * Total memory size may be set by the kernel environment variable
 * hw.physmem or the compile-time define MAXMEM.
 *
 * XXX first should be vm_paddr_t.
 */
static void
getmemsize(caddr_t kmdp, u_int64_t first)
{
	int i, physmap_idx, pa_indx, da_indx;
	vm_paddr_t pa, physmap[PHYS_AVAIL_ENTRIES];
	u_long physmem_start, physmem_tunable, memtest;
	pt_entry_t *pte;
	quad_t dcons_addr, dcons_size;
	int page_counter;

	/*
	 * Tell the physical memory allocator about pages used to store
	 * the kernel and preloaded data.  See kmem_bootstrap_free().
	 */
	vm_phys_early_add_seg((vm_paddr_t)kernphys, trunc_page(first));

	bzero(physmap, sizeof(physmap));
	physmap_idx = 0;

	init_ops.parse_memmap(kmdp, physmap, &physmap_idx);
	physmap_idx -= 2;

	/*
	 * Find the 'base memory' segment for SMP
	 */
	basemem = 0;
	for (i = 0; i <= physmap_idx; i += 2) {
		if (physmap[i] <= 0xA0000) {
			basemem = physmap[i + 1] / 1024;
			break;
		}
	}
	if (basemem == 0 || basemem > 640) {
		if (bootverbose)
			printf(
		"Memory map doesn't contain a basemem segment, faking it");
		basemem = 640;
	}

	/*
	 * Maxmem isn't the "maximum memory", it's one larger than the
	 * highest page of the physical address space.  It should be
	 * called something like "Maxphyspage".  We may adjust this
	 * based on ``hw.physmem'' and the results of the memory test.
	 */
	Maxmem = atop(physmap[physmap_idx + 1]);

#ifdef MAXMEM
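	/*
	 * The MAXMEM kernel option is specified in kilobytes; dividing by
	 * four converts it to the 4 kB page count that Maxmem stores.
	 */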
	Maxmem = MAXMEM / 4;
#endif

	if (TUNABLE_ULONG_FETCH("hw.physmem", &physmem_tunable))
		Maxmem = atop(physmem_tunable);

	/*
	 * The boot memory test is disabled by default, as it takes a
	 * significant amount of time on large-memory systems, and is
	 * unfriendly to virtual machines as it unnecessarily touches all
	 * pages.
	 *
	 * A general name is used as the code may be extended to support
	 * additional tests beyond the current "page present" test.
	 */
	memtest = 0;
	TUNABLE_ULONG_FETCH("hw.memtest.tests", &memtest);

	/*
	 * Don't allow MAXMEM or hw.physmem to extend the amount of memory
	 * in the system.
	 */
	if (Maxmem > atop(physmap[physmap_idx + 1]))
		Maxmem = atop(physmap[physmap_idx + 1]);

	if (atop(physmap[physmap_idx + 1]) != Maxmem &&
	    (boothowto & RB_VERBOSE))
		printf("Physical memory use set to %ldK\n", Maxmem * 4);

	/*
	 * Make hole for "AP -> long mode" bootstrap code.  The
	 * mp_bootaddress vector is only available when the kernel
	 * is configured to support APs and APs for the system start
	 * in real mode (e.g. SMP bare metal).
	 */
	if (init_ops.mp_bootaddress)
		init_ops.mp_bootaddress(physmap, &physmap_idx);

	/* call pmap initialization to make new kernel address space */
	pmap_bootstrap(&first);

	/*
	 * Size up each available chunk of physical memory.
	 *
	 * XXX Some BIOSes corrupt low 64KB between suspend and resume.
	 * By default, mask off the first 16 pages unless we appear to be
	 * running in a VM.
	 */
	physmem_start = (vm_guest > VM_GUEST_NO ? 1 : 16) << PAGE_SHIFT;
	TUNABLE_ULONG_FETCH("hw.physmem.start", &physmem_start);
	if (physmap[0] < physmem_start) {
		if (physmem_start < PAGE_SIZE)
			physmap[0] = PAGE_SIZE;
		else if (physmem_start >= physmap[1])
			physmap[0] = round_page(physmap[1] - PAGE_SIZE);
		else
			physmap[0] = round_page(physmem_start);
	}
	pa_indx = 0;
	da_indx = 1;
	phys_avail[pa_indx++] = physmap[0];
	phys_avail[pa_indx] = physmap[0];
	dump_avail[da_indx] = physmap[0];
	pte = CMAP1;

	/*
	 * Get dcons buffer address
	 */
	if (getenv_quad("dcons.addr", &dcons_addr) == 0 ||
	    getenv_quad("dcons.size", &dcons_size) == 0)
		dcons_addr = 0;

	/*
	 * physmap is in bytes, so when converting to page boundaries,
	 * round up the start address and round down the end address.
	 */
	page_counter = 0;
	if (memtest != 0)
		printf("Testing system memory");
	for (i = 0; i <= physmap_idx; i += 2) {
		vm_paddr_t end;

		end = ptoa((vm_paddr_t)Maxmem);
		if (physmap[i + 1] < end)
			end = trunc_page(physmap[i + 1]);
		for (pa = round_page(physmap[i]); pa < end; pa += PAGE_SIZE) {
			int tmp, page_bad, full;
			int *ptr = (int *)CADDR1;

			full = FALSE;
			/*
			 * block out kernel memory as not available.
			 */
			if (pa >= (vm_paddr_t)kernphys && pa < first)
				goto do_dump_avail;

			/*
			 * block out dcons buffer
			 */
			if (dcons_addr > 0
			    && pa >= trunc_page(dcons_addr)
			    && pa < dcons_addr + dcons_size)
				goto do_dump_avail;

			page_bad = FALSE;
			if (memtest == 0)
				goto skip_memtest;

			/*
			 * Print a "." every GB to show we're making
			 * progress.
			 */
			page_counter++;
			if ((page_counter % PAGES_PER_GB) == 0)
				printf(".");

			/*
			 * map page into kernel: valid, read/write, non-cacheable
			 */
			*pte = pa | PG_V | PG_RW | PG_NC_PWT | PG_NC_PCD;
			invltlb();

			tmp = *(int *)ptr;
			/*
			 * Test for alternating 1's and 0's
			 */
			*(volatile int *)ptr = 0xaaaaaaaa;
			if (*(volatile int *)ptr != 0xaaaaaaaa)
				page_bad = TRUE;
			/*
			 * Test for alternating 0's and 1's
			 */
			*(volatile int *)ptr = 0x55555555;
			if (*(volatile int *)ptr != 0x55555555)
				page_bad = TRUE;
			/*
			 * Test for all 1's
			 */
			*(volatile int *)ptr = 0xffffffff;
			if (*(volatile int *)ptr != 0xffffffff)
				page_bad = TRUE;
			/*
			 * Test for all 0's
			 */
			*(volatile int *)ptr = 0x0;
			if (*(volatile int *)ptr != 0x0)
				page_bad = TRUE;
			/*
			 * Restore original value.
			 */
			*(int *)ptr = tmp;

skip_memtest:
			/*
			 * Adjust array of valid/good pages.
			 */
			if (page_bad == TRUE)
				continue;
			/*
			 * If this good page is a continuation of the
			 * previous set of good pages, then just increase
			 * the end pointer.  Otherwise start a new chunk.
			 * Note that "end" is exclusive: the range is
			 * >= start and < end.
			 * If we're also doing a speculative memory
			 * test and we are at or past the end, bump up Maxmem
			 * so that we keep going.  The first bad page
			 * will terminate the loop.
			 */
			if (phys_avail[pa_indx] == pa) {
				phys_avail[pa_indx] += PAGE_SIZE;
			} else {
				pa_indx++;
				if (pa_indx == PHYS_AVAIL_ENTRIES) {
					printf(
		"Too many holes in the physical address space, giving up\n");
					pa_indx--;
					full = TRUE;
					goto do_dump_avail;
				}
				phys_avail[pa_indx++] = pa; /* start */
				phys_avail[pa_indx] = pa + PAGE_SIZE; /* end */
			}
			physmem++;
do_dump_avail:
			if (dump_avail[da_indx] == pa) {
				dump_avail[da_indx] += PAGE_SIZE;
			} else {
				da_indx++;
				if (da_indx == PHYS_AVAIL_ENTRIES) {
					da_indx--;
					goto do_next;
				}
				dump_avail[da_indx++] = pa; /* start */
				dump_avail[da_indx] = pa + PAGE_SIZE; /* end */
			}
do_next:
			if (full)
				break;
		}
	}
	*pte = 0;
	invltlb();
	if (memtest != 0)
		printf("\n");

	/*
	 * XXX
	 * The last chunk must contain at least one page plus the message
	 * buffer to avoid complicating other code (message buffer address
	 * calculation, etc.).
	 */
	while (phys_avail[pa_indx - 1] + PAGE_SIZE +
	    round_page(msgbufsize) >= phys_avail[pa_indx]) {
		physmem -= atop(phys_avail[pa_indx] - phys_avail[pa_indx - 1]);
		phys_avail[pa_indx--] = 0;
		phys_avail[pa_indx--] = 0;
	}

	Maxmem = atop(phys_avail[pa_indx]);

	/* Trim off space for the message buffer. */
	phys_avail[pa_indx] -= round_page(msgbufsize);

	/* Map the message buffer. */
	msgbufp = (struct msgbuf *)PHYS_TO_DMAP(phys_avail[pa_indx]);
}

static caddr_t
native_parse_preload_data(u_int64_t modulep)
{
	caddr_t kmdp;
	char *envp;
#ifdef DDB
	vm_offset_t ksym_start;
	vm_offset_t ksym_end;
#endif

	preload_metadata = (caddr_t)(uintptr_t)(modulep + KERNBASE);
	preload_bootstrap_relocate(KERNBASE);
	kmdp = preload_search_by_type("elf kernel");
	if (kmdp == NULL)
		kmdp = preload_search_by_type("elf64 kernel");
	boothowto = MD_FETCH(kmdp, MODINFOMD_HOWTO, int);
	envp = MD_FETCH(kmdp, MODINFOMD_ENVP, char *);
	if (envp != NULL)
		envp += KERNBASE;
	init_static_kenv(envp, 0);
#ifdef DDB
	ksym_start = MD_FETCH(kmdp, MODINFOMD_SSYM, uintptr_t);
	ksym_end = MD_FETCH(kmdp, MODINFOMD_ESYM, uintptr_t);
	db_fetch_ksymtab(ksym_start, ksym_end, 0);
#endif
	efi_systbl_phys = MD_FETCH(kmdp, MODINFOMD_FW_HANDLE, vm_paddr_t);

	return (kmdp);
}

static void
amd64_kdb_init(void)
{
	kdb_init();
#ifdef KDB
	if (boothowto & RB_KDB)
		kdb_enter(KDB_WHY_BOOTFLAGS, "Boot flags requested debugger");
#endif
}

/* Set up the fast syscall stuff */
void
amd64_conf_fast_syscall(void)
{
	uint64_t msr;

	msr = rdmsr(MSR_EFER) | EFER_SCE;
	wrmsr(MSR_EFER, msr);
	wrmsr(MSR_LSTAR, pti ? (u_int64_t)IDTVEC(fast_syscall_pti) :
	    (u_int64_t)IDTVEC(fast_syscall));
	wrmsr(MSR_CSTAR, (u_int64_t)IDTVEC(fast_syscall32));
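	/*
	 * MSR_STAR layout: bits 47:32 hold the selector loaded into %cs on
	 * SYSCALL (with %ss = that value + 8), and bits 63:48 the base
	 * selector for SYSRET, which loads %cs = base + 16 and %ss = base + 8
	 * on return to 64-bit mode.  That is why the 32-bit user code
	 * selector goes into the high word.
	 */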
	msr = ((u_int64_t)GSEL(GCODE_SEL, SEL_KPL) << 32) |
	    ((u_int64_t)GSEL(GUCODE32_SEL, SEL_UPL) << 48);
	wrmsr(MSR_STAR, msr);
	wrmsr(MSR_SF_MASK, PSL_NT | PSL_T | PSL_I | PSL_C | PSL_D | PSL_AC);
}

void
amd64_bsp_pcpu_init1(struct pcpu *pc)
{
	struct user_segment_descriptor *gdt;

	PCPU_SET(prvspace, pc);
	gdt = *PCPU_PTR(gdt);
	PCPU_SET(curthread, &thread0);
	PCPU_SET(tssp, PCPU_PTR(common_tss));
	PCPU_SET(tss, (struct system_segment_descriptor *)&gdt[GPROC0_SEL]);
	PCPU_SET(ldt, (struct system_segment_descriptor *)&gdt[GUSERLDT_SEL]);
	PCPU_SET(fs32p, &gdt[GUFS32_SEL]);
	PCPU_SET(gs32p, &gdt[GUGS32_SEL]);
	PCPU_SET(ucr3_load_mask, PMAP_UCR3_NOMASK);
	PCPU_SET(smp_tlb_gen, 1);
}

void
amd64_bsp_pcpu_init2(uint64_t rsp0)
{

	PCPU_SET(rsp0, rsp0);
	PCPU_SET(pti_rsp0, ((vm_offset_t)PCPU_PTR(pti_stack) +
	    PC_PTI_STACK_SZ * sizeof(uint64_t)) & ~0xful);
	PCPU_SET(curpcb, thread0.td_pcb);
}

void
amd64_bsp_ist_init(struct pcpu *pc)
{
	struct nmi_pcpu *np;
	struct amd64tss *tssp;

	tssp = &pc->pc_common_tss;

	/* doublefault stack space, runs on ist1 */
	np = ((struct nmi_pcpu *)&dblfault_stack[sizeof(dblfault_stack)]) - 1;
	np->np_pcpu = (register_t)pc;
	tssp->tss_ist1 = (long)np;

	/*
	 * NMI stack, runs on ist2.  The pcpu pointer is stored just
	 * above the start of the ist2 stack.
	 */
	np = ((struct nmi_pcpu *)&nmi0_stack[sizeof(nmi0_stack)]) - 1;
	np->np_pcpu = (register_t)pc;
	tssp->tss_ist2 = (long)np;

	/*
	 * MC# stack, runs on ist3.  The pcpu pointer is stored just
	 * above the start of the ist3 stack.
	 */
	np = ((struct nmi_pcpu *)&mce0_stack[sizeof(mce0_stack)]) - 1;
	np->np_pcpu = (register_t)pc;
	tssp->tss_ist3 = (long)np;

	/*
	 * DB# stack, runs on ist4.
	 */
	np = ((struct nmi_pcpu *)&dbg0_stack[sizeof(dbg0_stack)]) - 1;
	np->np_pcpu = (register_t)pc;
	tssp->tss_ist4 = (long)np;
}

u_int64_t
hammer_time(u_int64_t modulep, u_int64_t physfree)
{
	caddr_t kmdp;
	int gsel_tss, x;
	struct pcpu *pc;
	struct xstate_hdr *xhdr;
	u_int64_t rsp0;
	char *env;
	struct user_segment_descriptor *gdt;
	struct region_descriptor r_gdt;
	size_t kstack0_sz;
	int late_console;

	TSRAW(&thread0, TS_ENTER, __func__, NULL);

	kmdp = init_ops.parse_preload_data(modulep);

	physfree += ucode_load_bsp(physfree + KERNBASE);
	physfree = roundup2(physfree, PAGE_SIZE);

	identify_cpu1();
	identify_hypervisor();
	identify_cpu_fixup_bsp();
	identify_cpu2();
	initializecpucache();

	/*
	 * Check for pti, pcid, and invpcid before ifuncs are
	 * resolved, to correctly select the implementation for
	 * pmap_activate_sw_mode().
	 */
	pti = pti_get_default();
	TUNABLE_INT_FETCH("vm.pmap.pti", &pti);
	TUNABLE_INT_FETCH("vm.pmap.pcid_enabled", &pmap_pcid_enabled);
	if ((cpu_feature2 & CPUID2_PCID) != 0 && pmap_pcid_enabled) {
		invpcid_works = (cpu_stdext_feature &
		    CPUID_STDEXT_INVPCID) != 0;
	} else {
		pmap_pcid_enabled = 0;
	}

	link_elf_ireloc(kmdp);

	/*
	 * This may be done better later if it gets more high level
	 * components in it. If so just link td->td_proc here.
	 */
	proc_linkup0(&proc0, &thread0);

	/* Init basic tunables, hz etc */
	init_param1();

	thread0.td_kstack = physfree + KERNBASE;
	thread0.td_kstack_pages = kstack_pages;
	kstack0_sz = thread0.td_kstack_pages * PAGE_SIZE;
	bzero((void *)thread0.td_kstack, kstack0_sz);
	physfree += kstack0_sz;

	/*
	 * Initialize enough of thread0 for delayed invalidation to
	 * work very early.  Rely on thread0.td_base_pri
	 * zero-initialization, it is reset to PVM at proc0_init().
	 */
	pmap_thread_init_invl_gen(&thread0);

	pc = &temp_bsp_pcpu;
	pcpu_init(pc, 0, sizeof(struct pcpu));
	gdt = &temp_bsp_pcpu.pc_gdt[0];

	/*
	 * make gdt memory segments
	 */
	for (x = 0; x < NGDT; x++) {
		if (x != GPROC0_SEL && x != (GPROC0_SEL + 1) &&
		    x != GUSERLDT_SEL && x != (GUSERLDT_SEL + 1))
			ssdtosd(&gdt_segs[x], &gdt[x]);
	}
	gdt_segs[GPROC0_SEL].ssd_base = (uintptr_t)&pc->pc_common_tss;
	ssdtosyssd(&gdt_segs[GPROC0_SEL],
	    (struct system_segment_descriptor *)&gdt[GPROC0_SEL]);

	r_gdt.rd_limit = NGDT * sizeof(gdt[0]) - 1;
	r_gdt.rd_base = (long)gdt;
	lgdt(&r_gdt);

	wrmsr(MSR_FSBASE, 0); /* User value */
	wrmsr(MSR_GSBASE, (u_int64_t)pc);
	wrmsr(MSR_KGSBASE, 0); /* User value while in the kernel */

	dpcpu_init((void *)(physfree + KERNBASE), 0);
	physfree += DPCPU_SIZE;
	amd64_bsp_pcpu_init1(pc);
	/* Non-late cninit() and printf() can be moved up to here. */

	/*
	 * Initialize mutexes.
	 *
	 * icu_lock: in order to allow an interrupt to occur in a critical
	 * section, to set pcpu->ipending (etc...) properly, we
	 * must be able to get the icu lock, so it can't be
	 * under witness.
	 */
	mutex_init();
	mtx_init(&icu_lock, "icu", NULL, MTX_SPIN | MTX_NOWITNESS);
	mtx_init(&dt_lock, "descriptor tables", NULL, MTX_DEF);

	/* exceptions */
	for (x = 0; x < NIDT; x++)
		setidt(x, pti ? &IDTVEC(rsvd_pti) : &IDTVEC(rsvd), SDT_SYSIGT,
		    SEL_KPL, 0);
	setidt(IDT_DE, pti ? &IDTVEC(div_pti) : &IDTVEC(div), SDT_SYSIGT,
	    SEL_KPL, 0);
	setidt(IDT_DB, &IDTVEC(dbg), SDT_SYSIGT, SEL_KPL, 4);
	setidt(IDT_NMI, &IDTVEC(nmi), SDT_SYSIGT, SEL_KPL, 2);
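	/*
	 * The final setidt() argument selects an IST stack wired up in
	 * amd64_bsp_ist_init(): ist1 is the double fault stack, ist2 NMI,
	 * ist3 MC# and ist4 DB#.
	 */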
	setidt(IDT_BP, pti ? &IDTVEC(bpt_pti) : &IDTVEC(bpt), SDT_SYSIGT,
	    SEL_UPL, 0);
	setidt(IDT_OF, pti ? &IDTVEC(ofl_pti) : &IDTVEC(ofl), SDT_SYSIGT,
	    SEL_UPL, 0);
	setidt(IDT_BR, pti ? &IDTVEC(bnd_pti) : &IDTVEC(bnd), SDT_SYSIGT,
	    SEL_KPL, 0);
	setidt(IDT_UD, pti ? &IDTVEC(ill_pti) : &IDTVEC(ill), SDT_SYSIGT,
	    SEL_KPL, 0);
	setidt(IDT_NM, pti ? &IDTVEC(dna_pti) : &IDTVEC(dna), SDT_SYSIGT,
	    SEL_KPL, 0);
	setidt(IDT_DF, &IDTVEC(dblfault), SDT_SYSIGT, SEL_KPL, 1);
	setidt(IDT_FPUGP, pti ? &IDTVEC(fpusegm_pti) : &IDTVEC(fpusegm),
	    SDT_SYSIGT, SEL_KPL, 0);
	setidt(IDT_TS, pti ? &IDTVEC(tss_pti) : &IDTVEC(tss), SDT_SYSIGT,
	    SEL_KPL, 0);
	setidt(IDT_NP, pti ? &IDTVEC(missing_pti) : &IDTVEC(missing),
	    SDT_SYSIGT, SEL_KPL, 0);
	setidt(IDT_SS, pti ? &IDTVEC(stk_pti) : &IDTVEC(stk), SDT_SYSIGT,
	    SEL_KPL, 0);
	setidt(IDT_GP, pti ? &IDTVEC(prot_pti) : &IDTVEC(prot), SDT_SYSIGT,
	    SEL_KPL, 0);
	setidt(IDT_PF, pti ? &IDTVEC(page_pti) : &IDTVEC(page), SDT_SYSIGT,
	    SEL_KPL, 0);
	setidt(IDT_MF, pti ? &IDTVEC(fpu_pti) : &IDTVEC(fpu), SDT_SYSIGT,
	    SEL_KPL, 0);
	setidt(IDT_AC, pti ? &IDTVEC(align_pti) : &IDTVEC(align), SDT_SYSIGT,
	    SEL_KPL, 0);
	setidt(IDT_MC, &IDTVEC(mchk), SDT_SYSIGT, SEL_KPL, 3);
	setidt(IDT_XF, pti ? &IDTVEC(xmm_pti) : &IDTVEC(xmm), SDT_SYSIGT,
	    SEL_KPL, 0);
#ifdef KDTRACE_HOOKS
	setidt(IDT_DTRACE_RET, pti ? &IDTVEC(dtrace_ret_pti) :
	    &IDTVEC(dtrace_ret), SDT_SYSIGT, SEL_UPL, 0);
#endif
#ifdef XENHVM
	setidt(IDT_EVTCHN, pti ? &IDTVEC(xen_intr_upcall_pti) :
	    &IDTVEC(xen_intr_upcall), SDT_SYSIGT, SEL_KPL, 0);
#endif
	r_idt.rd_limit = sizeof(idt0) - 1;
	r_idt.rd_base = (long)idt;
	lidt(&r_idt);

	/*
	 * Initialize the clock before the console so that console
	 * initialization can use DELAY().
	 */
	clock_init();

	/*
	 * Use vt(4) by default for UEFI boot (during the sc(4)/vt(4)
	 * transition).
	 * Once bootblocks have updated, we can test directly for
	 * efi_systbl != NULL here...
	 */
	if (preload_search_info(kmdp, MODINFO_METADATA | MODINFOMD_EFI_MAP)
	    != NULL)
		vty_set_preferred(VTY_VT);

	TUNABLE_INT_FETCH("hw.ibrs_disable", &hw_ibrs_disable);
	TUNABLE_INT_FETCH("machdep.mitigations.ibrs.disable", &hw_ibrs_disable);

	TUNABLE_INT_FETCH("hw.spec_store_bypass_disable", &hw_ssb_disable);
	TUNABLE_INT_FETCH("machdep.mitigations.ssb.disable", &hw_ssb_disable);

	TUNABLE_INT_FETCH("machdep.syscall_ret_l1d_flush",
	    &syscall_ret_l1d_flush_mode);

	TUNABLE_INT_FETCH("hw.mds_disable", &hw_mds_disable);
	TUNABLE_INT_FETCH("machdep.mitigations.mds.disable", &hw_mds_disable);

	TUNABLE_INT_FETCH("machdep.mitigations.taa.enable", &x86_taa_enable);

	TUNABLE_INT_FETCH("machdep.mitigations.rndgs.enable",
	    &x86_rngds_mitg_enable);

	finishidentcpu(); /* Final stage of CPU initialization */
	initializecpu(); /* Initialize CPU registers */

	amd64_bsp_ist_init(pc);

	/* Set the IO permission bitmap (empty due to tss seg limit) */
	pc->pc_common_tss.tss_iobase = sizeof(struct amd64tss) +
	    IOPERM_BITMAP_SIZE;

	gsel_tss = GSEL(GPROC0_SEL, SEL_KPL);
	ltr(gsel_tss);

	amd64_conf_fast_syscall();

	/*
	 * We initialize the PCB pointer early so that exception
	 * handlers will work.  Also set up td_critnest to short-cut
	 * the page fault handler.
	 */
	cpu_max_ext_state_size = sizeof(struct savefpu);
	set_top_of_stack_td(&thread0);
	thread0.td_pcb = get_pcb_td(&thread0);
	thread0.td_critnest = 1;

	/*
	 * The console and kdb should be initialized even earlier than here,
	 * but some console drivers don't work until after getmemsize().
	 * Default to late console initialization to support these drivers.
	 * This loses mainly printf()s in getmemsize() and early debugging.
	 */
	late_console = 1;
	TUNABLE_INT_FETCH("debug.late_console", &late_console);
	if (!late_console) {
		cninit();
		amd64_kdb_init();
	}

	getmemsize(kmdp, physfree);
	init_param2(physmem);

	/* now running on new page tables, configured, and u/iom is accessible */

#ifdef DEV_PCI
	/* This call might adjust phys_avail[]. */
	pci_early_quirks();
#endif

	if (late_console)
		cninit();

	/*
	 * Dump the boot metadata.  We have to wait for cninit() since console
	 * output is required.  If it's grossly incorrect the kernel will never
	 * make it this far.
	 */
	if (getenv_is_true("debug.dump_modinfo_at_boot"))
		preload_dump();

#ifdef DEV_ISA
#ifdef DEV_ATPIC
	elcr_probe();
	atpic_startup();
#else
	/* Reset and mask the atpics and leave them shut down. */
	atpic_reset();

	/*
	 * Point the ICU spurious interrupt vectors at the APIC spurious
	 * interrupt handler.
	 */
	setidt(IDT_IO_INTS + 7, IDTVEC(spuriousint), SDT_SYSIGT, SEL_KPL, 0);
	setidt(IDT_IO_INTS + 15, IDTVEC(spuriousint), SDT_SYSIGT, SEL_KPL, 0);
#endif
#else
#error "have you forgotten the isa device?"
#endif

	if (late_console)
		amd64_kdb_init();

	msgbufinit(msgbufp, msgbufsize);
	fpuinit();

	/*
	 * Reinitialize thread0's stack base now that the xsave area size is
	 * known.  Set up thread0's pcb save area after fpuinit calculated fpu
	 * save area size.  Zero out the extended state header in fpu save
	 * area.
	 */
	set_top_of_stack_td(&thread0);
	thread0.td_pcb->pcb_save = get_pcb_user_save_td(&thread0);
	bzero(thread0.td_pcb->pcb_save, cpu_max_ext_state_size);
	if (use_xsave) {
		xhdr = (struct xstate_hdr *)(get_pcb_user_save_td(&thread0) +
		    1);
		xhdr->xstate_bv = xsave_mask;
	}
	/* make an initial tss so cpu can get interrupt stack on syscall! */
	rsp0 = thread0.td_md.md_stack_base;
	/* Ensure the stack is aligned to 16 bytes */
	rsp0 &= ~0xFul;
	PCPU_PTR(common_tss)->tss_rsp0 = rsp0;
	amd64_bsp_pcpu_init2(rsp0);

	/* transfer to user mode */

	_ucodesel = GSEL(GUCODE_SEL, SEL_UPL);
	_udatasel = GSEL(GUDATA_SEL, SEL_UPL);
	_ucode32sel = GSEL(GUCODE32_SEL, SEL_UPL);
	_ufssel = GSEL(GUFS32_SEL, SEL_UPL);
	_ugssel = GSEL(GUGS32_SEL, SEL_UPL);

	load_ds(_udatasel);
	load_es(_udatasel);
	load_fs(_ufssel);

	/* set up proc 0's pcb */
	thread0.td_pcb->pcb_flags = 0;
	thread0.td_frame = &proc0_tf;

	env = kern_getenv("kernelname");
	if (env != NULL)
		strlcpy(kernelname, env, sizeof(kernelname));

	kcsan_cpu_init(0);

#ifdef FDT
	x86_init_fdt();
#endif
	thread0.td_critnest = 0;

	TSEXIT();

	/* Location of kernel stack for locore */
	return (thread0.td_md.md_stack_base);
}

void
cpu_pcpu_init(struct pcpu *pcpu, int cpuid, size_t size)
{

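	/* Use an out-of-range value until ACPI (if present) assigns the real ID. */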
	pcpu->pc_acpi_id = 0xffffffff;
}

static int
smap_sysctl_handler(SYSCTL_HANDLER_ARGS)
{
	struct bios_smap *smapbase;
	struct bios_smap_xattr smap;
	caddr_t kmdp;
	uint32_t *smapattr;
	int count, error, i;

	/* Retrieve the system memory map from the loader. */
	kmdp = preload_search_by_type("elf kernel");
	if (kmdp == NULL)
		kmdp = preload_search_by_type("elf64 kernel");
	smapbase = (struct bios_smap *)preload_search_info(kmdp,
	    MODINFO_METADATA | MODINFOMD_SMAP);
	if (smapbase == NULL)
		return (0);
	smapattr = (uint32_t *)preload_search_info(kmdp,
	    MODINFO_METADATA | MODINFOMD_SMAP_XATTR);
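	/*
	 * The loader stores the metadata payload's byte count in the
	 * uint32_t immediately preceding the payload, so back up one
	 * word to convert the size into an entry count.
	 */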
	count = *((uint32_t *)smapbase - 1) / sizeof(*smapbase);
	error = 0;
	for (i = 0; i < count; i++) {
		smap.base = smapbase[i].base;
		smap.length = smapbase[i].length;
		smap.type = smapbase[i].type;
		if (smapattr != NULL)
			smap.xattr = smapattr[i];
		else
			smap.xattr = 0;
		error = SYSCTL_OUT(req, &smap, sizeof(smap));
	}
	return (error);
}
SYSCTL_PROC(_machdep, OID_AUTO, smap,
    CTLTYPE_OPAQUE | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
    smap_sysctl_handler, "S,bios_smap_xattr",
    "Raw BIOS SMAP data");

static int
efi_map_sysctl_handler(SYSCTL_HANDLER_ARGS)
{
	struct efi_map_header *efihdr;
	caddr_t kmdp;
	uint32_t efisize;

	kmdp = preload_search_by_type("elf kernel");
	if (kmdp == NULL)
		kmdp = preload_search_by_type("elf64 kernel");
	efihdr = (struct efi_map_header *)preload_search_info(kmdp,
	    MODINFO_METADATA | MODINFOMD_EFI_MAP);
	if (efihdr == NULL)
		return (0);
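	/* As above, the record's size lives just before the payload. */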
	efisize = *((uint32_t *)efihdr - 1);
	return (SYSCTL_OUT(req, efihdr, efisize));
}
SYSCTL_PROC(_machdep, OID_AUTO, efi_map,
    CTLTYPE_OPAQUE | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
    efi_map_sysctl_handler, "S,efi_map_header",
    "Raw EFI Memory Map");

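/*
 * Spinlocks disable interrupts on first acquisition and track recursion
 * with a per-thread count, so nested spinlock_enter()/spinlock_exit()
 * pairs only toggle the interrupt state at the outermost level.
 */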
void
spinlock_enter(void)
{
	struct thread *td;
	register_t flags;

	td = curthread;
	if (td->td_md.md_spinlock_count == 0) {
		flags = intr_disable();
		td->td_md.md_spinlock_count = 1;
		td->td_md.md_saved_flags = flags;
		critical_enter();
	} else
		td->td_md.md_spinlock_count++;
}

void
spinlock_exit(void)
{
	struct thread *td;
	register_t flags;

	td = curthread;
	flags = td->td_md.md_saved_flags;
	td->td_md.md_spinlock_count--;
	if (td->td_md.md_spinlock_count == 0) {
		critical_exit();
		intr_restore(flags);
	}
}

/*
 * Construct a PCB from a trapframe.  This is called from kdb_trap() where
 * we want to start a backtrace from the function that caused us to enter
 * the debugger.  We have the context in the trapframe, but base the trace
 * on the PCB.  The PCB doesn't have to be perfect, as long as it contains
 * enough for a backtrace.
 */
void
makectx(struct trapframe *tf, struct pcb *pcb)
{

	pcb->pcb_r12 = tf->tf_r12;
	pcb->pcb_r13 = tf->tf_r13;
	pcb->pcb_r14 = tf->tf_r14;
	pcb->pcb_r15 = tf->tf_r15;
	pcb->pcb_rbp = tf->tf_rbp;
	pcb->pcb_rbx = tf->tf_rbx;
	pcb->pcb_rip = tf->tf_rip;
	pcb->pcb_rsp = tf->tf_rsp;
}

int
ptrace_set_pc(struct thread *td, unsigned long addr)
{

	td->td_frame->tf_rip = addr;
	set_pcb_flags(td->td_pcb, PCB_FULL_IRET);
	return (0);
}

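/*
 * Single-stepping is requested by setting PSL_T (the trace flag) in the
 * thread's saved rflags; the resulting debug trap is flagged with
 * TDB_STEP so it can be told apart from other traps.
 */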
int
ptrace_single_step(struct thread *td)
{

	PROC_LOCK_ASSERT(td->td_proc, MA_OWNED);
	if ((td->td_frame->tf_rflags & PSL_T) == 0) {
		td->td_frame->tf_rflags |= PSL_T;
		td->td_dbgflags |= TDB_STEP;
	}
	return (0);
}

int
ptrace_clear_single_step(struct thread *td)
{

	PROC_LOCK_ASSERT(td->td_proc, MA_OWNED);
	td->td_frame->tf_rflags &= ~PSL_T;
	td->td_dbgflags &= ~TDB_STEP;
	return (0);
}

int
fill_regs(struct thread *td, struct reg *regs)
{
	struct trapframe *tp;

	tp = td->td_frame;
	return (fill_frame_regs(tp, regs));
}

int
fill_frame_regs(struct trapframe *tp, struct reg *regs)
{

	regs->r_r15 = tp->tf_r15;
	regs->r_r14 = tp->tf_r14;
	regs->r_r13 = tp->tf_r13;
	regs->r_r12 = tp->tf_r12;
	regs->r_r11 = tp->tf_r11;
	regs->r_r10 = tp->tf_r10;
	regs->r_r9 = tp->tf_r9;
	regs->r_r8 = tp->tf_r8;
	regs->r_rdi = tp->tf_rdi;
	regs->r_rsi = tp->tf_rsi;
	regs->r_rbp = tp->tf_rbp;
	regs->r_rbx = tp->tf_rbx;
	regs->r_rdx = tp->tf_rdx;
	regs->r_rcx = tp->tf_rcx;
	regs->r_rax = tp->tf_rax;
	regs->r_rip = tp->tf_rip;
	regs->r_cs = tp->tf_cs;
	regs->r_rflags = tp->tf_rflags;
	regs->r_rsp = tp->tf_rsp;
	regs->r_ss = tp->tf_ss;
	if (tp->tf_flags & TF_HASSEGS) {
		regs->r_ds = tp->tf_ds;
		regs->r_es = tp->tf_es;
		regs->r_fs = tp->tf_fs;
		regs->r_gs = tp->tf_gs;
	} else {
		regs->r_ds = 0;
		regs->r_es = 0;
		regs->r_fs = 0;
		regs->r_gs = 0;
	}
	regs->r_err = 0;
	regs->r_trapno = 0;
	return (0);
}

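/*
 * Writing registers back is only allowed when the new rflags and %cs
 * values pass the EFL_SECURE()/CS_SECURE() checks, which keep userland
 * from setting privileged flag bits or a kernel code selector.
 */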
int
set_regs(struct thread *td, struct reg *regs)
{
	struct trapframe *tp;
	register_t rflags;

	tp = td->td_frame;
	rflags = regs->r_rflags & 0xffffffff;
	if (!EFL_SECURE(rflags, tp->tf_rflags) || !CS_SECURE(regs->r_cs))
		return (EINVAL);
	tp->tf_r15 = regs->r_r15;
	tp->tf_r14 = regs->r_r14;
	tp->tf_r13 = regs->r_r13;
	tp->tf_r12 = regs->r_r12;
	tp->tf_r11 = regs->r_r11;
	tp->tf_r10 = regs->r_r10;
	tp->tf_r9 = regs->r_r9;
	tp->tf_r8 = regs->r_r8;
	tp->tf_rdi = regs->r_rdi;
	tp->tf_rsi = regs->r_rsi;
	tp->tf_rbp = regs->r_rbp;
	tp->tf_rbx = regs->r_rbx;
	tp->tf_rdx = regs->r_rdx;
	tp->tf_rcx = regs->r_rcx;
	tp->tf_rax = regs->r_rax;
	tp->tf_rip = regs->r_rip;
	tp->tf_cs = regs->r_cs;
	tp->tf_rflags = rflags;
	tp->tf_rsp = regs->r_rsp;
	tp->tf_ss = regs->r_ss;
	if (0) {	/* XXXKIB */
		tp->tf_ds = regs->r_ds;
		tp->tf_es = regs->r_es;
		tp->tf_fs = regs->r_fs;
		tp->tf_gs = regs->r_gs;
		tp->tf_flags = TF_HASSEGS;
	}
	set_pcb_flags(td->td_pcb, PCB_FULL_IRET);
	return (0);
}

/* XXX check all this stuff! */
/* externalize from sv_xmm */
static void
fill_fpregs_xmm(struct savefpu *sv_xmm, struct fpreg *fpregs)
{
	struct envxmm *penv_fpreg = (struct envxmm *)&fpregs->fpr_env;
	struct envxmm *penv_xmm = &sv_xmm->sv_env;
	int i;

	/* pcb -> fpregs */
	bzero(fpregs, sizeof(*fpregs));

	/* FPU control/status */
	penv_fpreg->en_cw = penv_xmm->en_cw;
	penv_fpreg->en_sw = penv_xmm->en_sw;
	penv_fpreg->en_tw = penv_xmm->en_tw;
	penv_fpreg->en_opcode = penv_xmm->en_opcode;
	penv_fpreg->en_rip = penv_xmm->en_rip;
	penv_fpreg->en_rdp = penv_xmm->en_rdp;
	penv_fpreg->en_mxcsr = penv_xmm->en_mxcsr;
	penv_fpreg->en_mxcsr_mask = penv_xmm->en_mxcsr_mask;

	/* FPU registers (eight 80-bit x87 accumulators, 10 bytes each) */
	for (i = 0; i < 8; ++i)
		bcopy(sv_xmm->sv_fp[i].fp_acc.fp_bytes, fpregs->fpr_acc[i], 10);

	/* SSE registers */
	for (i = 0; i < 16; ++i)
		bcopy(sv_xmm->sv_xmm[i].xmm_bytes, fpregs->fpr_xacc[i], 16);
}

/* internalize from fpregs into sv_xmm */
static void
set_fpregs_xmm(struct fpreg *fpregs, struct savefpu *sv_xmm)
{
	struct envxmm *penv_xmm = &sv_xmm->sv_env;
	struct envxmm *penv_fpreg = (struct envxmm *)&fpregs->fpr_env;
	int i;

	/* fpregs -> pcb */
	/* FPU control/status */
	penv_xmm->en_cw = penv_fpreg->en_cw;
	penv_xmm->en_sw = penv_fpreg->en_sw;
	penv_xmm->en_tw = penv_fpreg->en_tw;
	penv_xmm->en_opcode = penv_fpreg->en_opcode;
	penv_xmm->en_rip = penv_fpreg->en_rip;
	penv_xmm->en_rdp = penv_fpreg->en_rdp;
	penv_xmm->en_mxcsr = penv_fpreg->en_mxcsr;
	penv_xmm->en_mxcsr_mask = penv_fpreg->en_mxcsr_mask & cpu_mxcsr_mask;

	/* FPU registers (eight 80-bit x87 accumulators, 10 bytes each) */
	for (i = 0; i < 8; ++i)
		bcopy(fpregs->fpr_acc[i], sv_xmm->sv_fp[i].fp_acc.fp_bytes, 10);

	/* SSE registers */
	for (i = 0; i < 16; ++i)
		bcopy(fpregs->fpr_xacc[i], sv_xmm->sv_xmm[i].xmm_bytes, 16);
}

/* externalize from td->pcb */
int
fill_fpregs(struct thread *td, struct fpreg *fpregs)
{

	KASSERT(td == curthread || TD_IS_SUSPENDED(td) ||
	    P_SHOULDSTOP(td->td_proc),
	    ("not suspended thread %p", td));
	fpugetregs(td);
	fill_fpregs_xmm(get_pcb_user_save_td(td), fpregs);
	return (0);
}

/* internalize to td->pcb */
int
set_fpregs(struct thread *td, struct fpreg *fpregs)
{

	critical_enter();
	set_fpregs_xmm(fpregs, get_pcb_user_save_td(td));
	fpuuserinited(td);
	critical_exit();
	return (0);
}

/*
 * Get machine context.
 */
int
get_mcontext(struct thread *td, mcontext_t *mcp, int flags)
{
	struct pcb *pcb;
	struct trapframe *tp;

	pcb = td->td_pcb;
	tp = td->td_frame;
	PROC_LOCK(curthread->td_proc);
	mcp->mc_onstack = sigonstack(tp->tf_rsp);
	PROC_UNLOCK(curthread->td_proc);
	mcp->mc_r15 = tp->tf_r15;
	mcp->mc_r14 = tp->tf_r14;
	mcp->mc_r13 = tp->tf_r13;
	mcp->mc_r12 = tp->tf_r12;
	mcp->mc_r11 = tp->tf_r11;
	mcp->mc_r10 = tp->tf_r10;
	mcp->mc_r9 = tp->tf_r9;
	mcp->mc_r8 = tp->tf_r8;
	mcp->mc_rdi = tp->tf_rdi;
	mcp->mc_rsi = tp->tf_rsi;
	mcp->mc_rbp = tp->tf_rbp;
	mcp->mc_rbx = tp->tf_rbx;
	mcp->mc_rcx = tp->tf_rcx;
	mcp->mc_rflags = tp->tf_rflags;
	if (flags & GET_MC_CLEAR_RET) {
		mcp->mc_rax = 0;
		mcp->mc_rdx = 0;
		mcp->mc_rflags &= ~PSL_C;
	} else {
		mcp->mc_rax = tp->tf_rax;
		mcp->mc_rdx = tp->tf_rdx;
	}
	mcp->mc_rip = tp->tf_rip;
	mcp->mc_cs = tp->tf_cs;
	mcp->mc_rsp = tp->tf_rsp;
	mcp->mc_ss = tp->tf_ss;
	mcp->mc_ds = tp->tf_ds;
	mcp->mc_es = tp->tf_es;
	mcp->mc_fs = tp->tf_fs;
	mcp->mc_gs = tp->tf_gs;
	mcp->mc_flags = tp->tf_flags;
	mcp->mc_len = sizeof(*mcp);
	get_fpcontext(td, mcp, NULL, 0);
	update_pcb_bases(pcb);
	mcp->mc_fsbase = pcb->pcb_fsbase;
	mcp->mc_gsbase = pcb->pcb_gsbase;
	mcp->mc_xfpustate = 0;
	mcp->mc_xfpustate_len = 0;
	bzero(mcp->mc_spare, sizeof(mcp->mc_spare));
	return (0);
}

/*
 * Set machine context.
 *
 * We set only the user-modifiable flags, and we don't touch the %cs
 * selector.
 */
int
set_mcontext(struct thread *td, mcontext_t *mcp)
{
	struct pcb *pcb;
	struct trapframe *tp;
	char *xfpustate;
	long rflags;
	int ret;

	pcb = td->td_pcb;
	tp = td->td_frame;
	if (mcp->mc_len != sizeof(*mcp) ||
	    (mcp->mc_flags & ~_MC_FLAG_MASK) != 0)
		return (EINVAL);
	rflags = (mcp->mc_rflags & PSL_USERCHANGE) |
	    (tp->tf_rflags & ~PSL_USERCHANGE);
	if (mcp->mc_flags & _MC_HASFPXSTATE) {
		if (mcp->mc_xfpustate_len > cpu_max_ext_state_size -
		    sizeof(struct savefpu))
			return (EINVAL);
		xfpustate = __builtin_alloca(mcp->mc_xfpustate_len);
		ret = copyin((void *)mcp->mc_xfpustate, xfpustate,
		    mcp->mc_xfpustate_len);
		if (ret != 0)
			return (ret);
	} else
		xfpustate = NULL;
	ret = set_fpcontext(td, mcp, xfpustate, mcp->mc_xfpustate_len);
	if (ret != 0)
		return (ret);
	tp->tf_r15 = mcp->mc_r15;
	tp->tf_r14 = mcp->mc_r14;
	tp->tf_r13 = mcp->mc_r13;
	tp->tf_r12 = mcp->mc_r12;
	tp->tf_r11 = mcp->mc_r11;
	tp->tf_r10 = mcp->mc_r10;
	tp->tf_r9 = mcp->mc_r9;
	tp->tf_r8 = mcp->mc_r8;
	tp->tf_rdi = mcp->mc_rdi;
	tp->tf_rsi = mcp->mc_rsi;
	tp->tf_rbp = mcp->mc_rbp;
	tp->tf_rbx = mcp->mc_rbx;
	tp->tf_rdx = mcp->mc_rdx;
	tp->tf_rcx = mcp->mc_rcx;
	tp->tf_rax = mcp->mc_rax;
	tp->tf_rip = mcp->mc_rip;
	tp->tf_rflags = rflags;
	tp->tf_rsp = mcp->mc_rsp;
	tp->tf_ss = mcp->mc_ss;
	tp->tf_flags = mcp->mc_flags;
	if (tp->tf_flags & TF_HASSEGS) {
		tp->tf_ds = mcp->mc_ds;
		tp->tf_es = mcp->mc_es;
		tp->tf_fs = mcp->mc_fs;
		tp->tf_gs = mcp->mc_gs;
	}
	set_pcb_flags(pcb, PCB_FULL_IRET);
	if (mcp->mc_flags & _MC_HASBASES) {
		pcb->pcb_fsbase = mcp->mc_fsbase;
		pcb->pcb_gsbase = mcp->mc_gsbase;
	}
	return (0);
}

static void
get_fpcontext(struct thread *td, mcontext_t *mcp, char *xfpusave,
    size_t xfpusave_len)
{
	size_t max_len, len;

	mcp->mc_ownedfp = fpugetregs(td);
	bcopy(get_pcb_user_save_td(td), &mcp->mc_fpstate[0],
	    sizeof(mcp->mc_fpstate));
	mcp->mc_fpformat = fpuformat();
	if (!use_xsave || xfpusave_len == 0)
		return;
	max_len = cpu_max_ext_state_size - sizeof(struct savefpu);
	len = xfpusave_len;
	if (len > max_len) {
		/* Zero the tail of the buffer that we will not fill. */
		bzero(xfpusave + max_len, len - max_len);
		len = max_len;
	}
	mcp->mc_flags |= _MC_HASFPXSTATE;
	mcp->mc_xfpustate_len = len;
	bcopy(get_pcb_user_save_td(td) + 1, xfpusave, len);
}

static int
set_fpcontext(struct thread *td, mcontext_t *mcp, char *xfpustate,
    size_t xfpustate_len)
{
	int error;

	if (mcp->mc_fpformat == _MC_FPFMT_NODEV)
		return (0);
	else if (mcp->mc_fpformat != _MC_FPFMT_XMM)
		return (EINVAL);
	else if (mcp->mc_ownedfp == _MC_FPOWNED_NONE) {
		/* We don't care what state is left in the FPU or PCB. */
		fpstate_drop(td);
		error = 0;
	} else if (mcp->mc_ownedfp == _MC_FPOWNED_FPU ||
	    mcp->mc_ownedfp == _MC_FPOWNED_PCB) {
		error = fpusetregs(td, (struct savefpu *)&mcp->mc_fpstate,
		    xfpustate, xfpustate_len);
	} else
		return (EINVAL);
	return (error);
}

void
fpstate_drop(struct thread *td)
{

	KASSERT(PCB_USER_FPU(td->td_pcb), ("fpstate_drop: kernel-owned fpu"));
	critical_enter();
	if (PCPU_GET(fpcurthread) == td)
		fpudrop();
	/*
	 * XXX force a full drop of the fpu.  The above only drops it if we
	 * owned it.
	 *
	 * XXX I don't much like fpugetuserregs()'s semantics of doing a full
	 * drop.  Dropping only to the pcb matches fnsave's behaviour.
	 * We only need to drop to !PCB_INITDONE in sendsig().  But
	 * sendsig() is the only caller of fpugetuserregs()... perhaps we
	 * just have too many layers.
	 */
	clear_pcb_flags(curthread->td_pcb,
	    PCB_FPUINITDONE | PCB_USERFPUINITDONE);
	critical_exit();
}

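/*
 * With td == NULL, read the live debug registers; otherwise report the
 * copies saved in the thread's PCB.
 */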
int
fill_dbregs(struct thread *td, struct dbreg *dbregs)
{
	struct pcb *pcb;

	if (td == NULL) {
		dbregs->dr[0] = rdr0();
		dbregs->dr[1] = rdr1();
		dbregs->dr[2] = rdr2();
		dbregs->dr[3] = rdr3();
		dbregs->dr[6] = rdr6();
		dbregs->dr[7] = rdr7();
	} else {
		pcb = td->td_pcb;
		dbregs->dr[0] = pcb->pcb_dr0;
		dbregs->dr[1] = pcb->pcb_dr1;
		dbregs->dr[2] = pcb->pcb_dr2;
		dbregs->dr[3] = pcb->pcb_dr3;
		dbregs->dr[6] = pcb->pcb_dr6;
		dbregs->dr[7] = pcb->pcb_dr7;
	}
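	/*
	 * %dr4 and %dr5 are legacy aliases of %dr6/%dr7, and %dr8-%dr15
	 * do not exist on x86, so report them all as zero.
	 */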
	dbregs->dr[4] = 0;
	dbregs->dr[5] = 0;
	dbregs->dr[8] = 0;
	dbregs->dr[9] = 0;
	dbregs->dr[10] = 0;
	dbregs->dr[11] = 0;
	dbregs->dr[12] = 0;
	dbregs->dr[13] = 0;
	dbregs->dr[14] = 0;
	dbregs->dr[15] = 0;
	return (0);
}

int
set_dbregs(struct thread *td, struct dbreg *dbregs)
{
	struct pcb *pcb;
	int i;

	if (td == NULL) {
		load_dr0(dbregs->dr[0]);
		load_dr1(dbregs->dr[1]);
		load_dr2(dbregs->dr[2]);
		load_dr3(dbregs->dr[3]);
		load_dr6(dbregs->dr[6]);
		load_dr7(dbregs->dr[7]);
	} else {
		/*
		 * Don't let an illegal value for dr7 get set.  Specifically,
		 * check for undefined settings.  Setting these bit patterns
		 * results in undefined behaviour and can lead to an
		 * unexpected TRCTRAP or a general protection fault right
		 * here.  Upper bits of dr6 and dr7 must not be set.
		 */
		for (i = 0; i < 4; i++) {
			if (DBREG_DR7_ACCESS(dbregs->dr[7], i) == 0x02)
				return (EINVAL);
			if (td->td_frame->tf_cs == _ucode32sel &&
			    DBREG_DR7_LEN(dbregs->dr[7], i) == DBREG_DR7_LEN_8)
				return (EINVAL);
		}
		if ((dbregs->dr[6] & 0xffffffff00000000ul) != 0 ||
		    (dbregs->dr[7] & 0xffffffff00000000ul) != 0)
			return (EINVAL);

		pcb = td->td_pcb;

		/*
		 * Don't let a process set a breakpoint that is not within
		 * the process's address space.  If a process could do this,
		 * it could halt the system by setting a breakpoint in the
		 * kernel (if ddb was enabled).  Thus, we need to check to
		 * make sure that no breakpoints are being enabled for
		 * addresses outside the process's address space.
		 *
		 * XXX - what about when the watched area of the user's
		 * address space is written into from within the kernel
		 * ... wouldn't that still cause a breakpoint to be generated
		 * from within kernel mode?
		 */

		if (DBREG_DR7_ENABLED(dbregs->dr[7], 0)) {
			/* dr0 is enabled */
			if (dbregs->dr[0] >= VM_MAXUSER_ADDRESS)
				return (EINVAL);
		}
		if (DBREG_DR7_ENABLED(dbregs->dr[7], 1)) {
			/* dr1 is enabled */
			if (dbregs->dr[1] >= VM_MAXUSER_ADDRESS)
				return (EINVAL);
		}
		if (DBREG_DR7_ENABLED(dbregs->dr[7], 2)) {
			/* dr2 is enabled */
			if (dbregs->dr[2] >= VM_MAXUSER_ADDRESS)
				return (EINVAL);
		}
		if (DBREG_DR7_ENABLED(dbregs->dr[7], 3)) {
			/* dr3 is enabled */
			if (dbregs->dr[3] >= VM_MAXUSER_ADDRESS)
				return (EINVAL);
		}

		pcb->pcb_dr0 = dbregs->dr[0];
		pcb->pcb_dr1 = dbregs->dr[1];
		pcb->pcb_dr2 = dbregs->dr[2];
		pcb->pcb_dr3 = dbregs->dr[3];
		pcb->pcb_dr6 = dbregs->dr[6];
		pcb->pcb_dr7 = dbregs->dr[7];

		set_pcb_flags(pcb, PCB_DBREGS);
	}

	return (0);
}

void
reset_dbregs(void)
{

	load_dr7(0);	/* Turn off the control bits first */
	load_dr0(0);
	load_dr1(0);
	load_dr2(0);
	load_dr3(0);
	load_dr6(0);
}

/*
 * Return > 0 if a hardware breakpoint has been hit, and the
 * breakpoint was in user space.  Return 0, otherwise.
 */
int
user_dbreg_trap(register_t dr6)
{
	u_int64_t dr7;
	u_int64_t bp;	/* breakpoint bits extracted from dr6 */
	int nbp;	/* number of breakpoints that triggered */
	caddr_t addr[4];	/* breakpoint addresses */
	int i;

	bp = dr6 & DBREG_DR6_BMASK;
	if (bp == 0) {
		/*
		 * None of the breakpoint bits are set, meaning this
		 * trap was not caused by any of the debug registers.
		 */
		return (0);
	}

	dr7 = rdr7();
	if ((dr7 & 0x000000ff) == 0) {
		/*
		 * All GE and LE bits in the dr7 register are zero,
		 * thus the trap couldn't have been caused by the
		 * hardware debug registers.
		 */
		return (0);
	}

	nbp = 0;

	/*
	 * At least one of the breakpoints was hit; check to see
	 * which ones and whether any of them are user space addresses.
	 */

	if (bp & 0x01) {
		addr[nbp++] = (caddr_t)rdr0();
	}
	if (bp & 0x02) {
		addr[nbp++] = (caddr_t)rdr1();
	}
	if (bp & 0x04) {
		addr[nbp++] = (caddr_t)rdr2();
	}
	if (bp & 0x08) {
		addr[nbp++] = (caddr_t)rdr3();
	}

	for (i = 0; i < nbp; i++) {
		if (addr[i] < (caddr_t)VM_MAXUSER_ADDRESS) {
			/*
			 * addr[i] is in user space.
			 */
			return (nbp);
		}
	}

	/*
	 * None of the breakpoints are in user space.
	 */
	return (0);
}

/*
 * pcb_flags is only modified by the current thread, or by other threads
 * when the current thread is stopped.  However, the current thread may
 * change it from the interrupt context in cpu_switch(), or in the trap
 * handler.  When we read-modify-write pcb_flags from C sources, the
 * compiler may generate code that is not atomic with respect to the
 * interrupt handler.  If a trap or interrupt happens and any flag is
 * modified from the handler, it can be clobbered with the cached value
 * later.  Therefore, we implement setting and clearing flags with
 * single-instruction functions, which do not race with possible
 * modification of the flags from the trap or interrupt context, because
 * traps and interrupts are executed only on instruction boundaries.
 */
void
set_pcb_flags_raw(struct pcb *pcb, const u_int flags)
{

	__asm __volatile("orl %1,%0"
	    : "=m" (pcb->pcb_flags) : "ir" (flags), "m" (pcb->pcb_flags)
	    : "cc", "memory");
}

/*
 * The support for RDFSBASE, WRFSBASE and similar instructions for the
 * %gs base requires that the kernel save MSR_FSBASE and MSR_{K,}GSBASE
 * into the pcb if user space modified the bases.  We must save them on
 * the context switch or if the return to usermode happens through
 * doreti.
 *
 * Tracking of both events is performed by the pcb flag PCB_FULL_IRET,
 * which has the consequence that the base MSRs must be saved each time
 * the PCB_FULL_IRET flag is set.  We disable interrupts to sync with
 * context switches.
 */
static void
set_pcb_flags_fsgsbase(struct pcb *pcb, const u_int flags)
{
	register_t r;

	if (curpcb == pcb &&
	    (flags & PCB_FULL_IRET) != 0 &&
	    (pcb->pcb_flags & PCB_FULL_IRET) == 0) {
		r = intr_disable();
		if ((pcb->pcb_flags & PCB_FULL_IRET) == 0) {
			if (rfs() == _ufssel)
				pcb->pcb_fsbase = rdfsbase();
			if (rgs() == _ugssel)
				pcb->pcb_gsbase = rdmsr(MSR_KGSBASE);
		}
		set_pcb_flags_raw(pcb, flags);
		intr_restore(r);
	} else {
		set_pcb_flags_raw(pcb, flags);
	}
}

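/*
 * set_pcb_flags is resolved once at boot via an ifunc: CPUs with the
 * FSGSBASE extension get the variant that snapshots the user %fs/%gs
 * bases, everything else gets the raw or-into-pcb_flags version.
 */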
DEFINE_IFUNC(, void, set_pcb_flags, (struct pcb *, const u_int))
{

	return ((cpu_stdext_feature & CPUID_STDEXT_FSGSBASE) != 0 ?
	    set_pcb_flags_fsgsbase : set_pcb_flags_raw);
}

void
clear_pcb_flags(struct pcb *pcb, const u_int flags)
{

	__asm __volatile("andl %1,%0"
	    : "=m" (pcb->pcb_flags) : "ir" (~flags), "m" (pcb->pcb_flags)
	    : "cc", "memory");
}

#ifdef KDB

/*
 * Provide inb() and outb() as functions.  They are normally only available
 * as inline functions, thus cannot be called from the debugger.
 */

/* silence compiler warnings */
u_char inb_(u_short);
void outb_(u_short, u_char);

u_char
inb_(u_short port)
{
	return inb(port);
}

void
outb_(u_short port, u_char data)
{
	outb(port, data);
}

#endif /* KDB */


#undef memset
#undef memmove
#undef memcpy

void *memset_std(void *buf, int c, size_t len);
void *memset_erms(void *buf, int c, size_t len);
void *memmove_std(void * _Nonnull dst, const void * _Nonnull src,
    size_t len);
void *memmove_erms(void * _Nonnull dst, const void * _Nonnull src,
    size_t len);
void *memcpy_std(void * _Nonnull dst, const void * _Nonnull src,
    size_t len);
void *memcpy_erms(void * _Nonnull dst, const void * _Nonnull src,
    size_t len);

#ifdef KCSAN
/*
 * These fail to build as ifuncs when used with KCSAN.
 */
void *
memset(void *buf, int c, size_t len)
{

	return (memset_std(buf, c, len));
}

void *
memmove(void * _Nonnull dst, const void * _Nonnull src, size_t len)
{

	return (memmove_std(dst, src, len));
}

void *
memcpy(void * _Nonnull dst, const void * _Nonnull src, size_t len)
{

	return (memcpy_std(dst, src, len));
}
#else
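/*
 * The ifunc resolvers below select the rep-string (_erms) variants when
 * CPUID reports ERMS (Enhanced REP MOVSB/STOSB) support, where those
 * instructions are the faster implementation.
 */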
DEFINE_IFUNC(, void *, memset, (void *, int, size_t))
{

	return ((cpu_stdext_feature & CPUID_STDEXT_ERMS) != 0 ?
	    memset_erms : memset_std);
}

DEFINE_IFUNC(, void *, memmove, (void * _Nonnull, const void * _Nonnull,
    size_t))
{

	return ((cpu_stdext_feature & CPUID_STDEXT_ERMS) != 0 ?
	    memmove_erms : memmove_std);
}

DEFINE_IFUNC(, void *, memcpy, (void * _Nonnull, const void * _Nonnull,
    size_t))
{

	return ((cpu_stdext_feature & CPUID_STDEXT_ERMS) != 0 ?
	    memcpy_erms : memcpy_std);
}
#endif

void pagezero_std(void *addr);
void pagezero_erms(void *addr);
DEFINE_IFUNC(, void, pagezero, (void *))
{

	return ((cpu_stdext_feature & CPUID_STDEXT_ERMS) != 0 ?
	    pagezero_erms : pagezero_std);
}