1*a9643ea8Slogwang /*- 2*a9643ea8Slogwang * Copyright (c) 2013 Dmitry Chagin 3*a9643ea8Slogwang * Copyright (c) 2004 Tim J. Robbins 4*a9643ea8Slogwang * Copyright (c) 2003 Peter Wemm 5*a9643ea8Slogwang * Copyright (c) 2002 Doug Rabson 6*a9643ea8Slogwang * Copyright (c) 1998-1999 Andrew Gallatin 7*a9643ea8Slogwang * Copyright (c) 1994-1996 Søren Schmidt 8*a9643ea8Slogwang * All rights reserved. 9*a9643ea8Slogwang * 10*a9643ea8Slogwang * Redistribution and use in source and binary forms, with or without 11*a9643ea8Slogwang * modification, are permitted provided that the following conditions 12*a9643ea8Slogwang * are met: 13*a9643ea8Slogwang * 1. Redistributions of source code must retain the above copyright 14*a9643ea8Slogwang * notice, this list of conditions and the following disclaimer 15*a9643ea8Slogwang * in this position and unchanged. 16*a9643ea8Slogwang * 2. Redistributions in binary form must reproduce the above copyright 17*a9643ea8Slogwang * notice, this list of conditions and the following disclaimer in the 18*a9643ea8Slogwang * documentation and/or other materials provided with the distribution. 19*a9643ea8Slogwang * 3. The name of the author may not be used to endorse or promote products 20*a9643ea8Slogwang * derived from this software without specific prior written permission 21*a9643ea8Slogwang * 22*a9643ea8Slogwang * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR 23*a9643ea8Slogwang * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 24*a9643ea8Slogwang * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
25*a9643ea8Slogwang * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, 26*a9643ea8Slogwang * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT 27*a9643ea8Slogwang * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 28*a9643ea8Slogwang * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 29*a9643ea8Slogwang * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 30*a9643ea8Slogwang * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF 31*a9643ea8Slogwang * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 32*a9643ea8Slogwang */ 33*a9643ea8Slogwang 34*a9643ea8Slogwang #include <sys/cdefs.h> 35*a9643ea8Slogwang __FBSDID("$FreeBSD$"); 36*a9643ea8Slogwang 37*a9643ea8Slogwang #include "opt_compat.h" 38*a9643ea8Slogwang 39*a9643ea8Slogwang #define __ELF_WORD_SIZE 64 40*a9643ea8Slogwang 41*a9643ea8Slogwang #include <sys/param.h> 42*a9643ea8Slogwang #include <sys/systm.h> 43*a9643ea8Slogwang #include <sys/exec.h> 44*a9643ea8Slogwang #include <sys/fcntl.h> 45*a9643ea8Slogwang #include <sys/imgact.h> 46*a9643ea8Slogwang #include <sys/imgact_elf.h> 47*a9643ea8Slogwang #include <sys/kernel.h> 48*a9643ea8Slogwang #include <sys/ktr.h> 49*a9643ea8Slogwang #include <sys/lock.h> 50*a9643ea8Slogwang #include <sys/malloc.h> 51*a9643ea8Slogwang #include <sys/module.h> 52*a9643ea8Slogwang #include <sys/mutex.h> 53*a9643ea8Slogwang #include <sys/proc.h> 54*a9643ea8Slogwang #include <sys/resourcevar.h> 55*a9643ea8Slogwang #include <sys/signalvar.h> 56*a9643ea8Slogwang #include <sys/sysctl.h> 57*a9643ea8Slogwang #include <sys/syscallsubr.h> 58*a9643ea8Slogwang #include <sys/sysent.h> 59*a9643ea8Slogwang #include <sys/sysproto.h> 60*a9643ea8Slogwang #include <sys/vnode.h> 61*a9643ea8Slogwang #include <sys/eventhandler.h> 62*a9643ea8Slogwang 63*a9643ea8Slogwang #include <vm/vm.h> 64*a9643ea8Slogwang #include <vm/pmap.h> 65*a9643ea8Slogwang #include 
<vm/vm_extern.h> 66*a9643ea8Slogwang #include <vm/vm_map.h> 67*a9643ea8Slogwang #include <vm/vm_object.h> 68*a9643ea8Slogwang #include <vm/vm_page.h> 69*a9643ea8Slogwang #include <vm/vm_param.h> 70*a9643ea8Slogwang 71*a9643ea8Slogwang #include <machine/cpu.h> 72*a9643ea8Slogwang #include <machine/md_var.h> 73*a9643ea8Slogwang #include <machine/pcb.h> 74*a9643ea8Slogwang #include <machine/specialreg.h> 75*a9643ea8Slogwang 76*a9643ea8Slogwang #include <amd64/linux/linux.h> 77*a9643ea8Slogwang #include <amd64/linux/linux_proto.h> 78*a9643ea8Slogwang #include <compat/linux/linux_emul.h> 79*a9643ea8Slogwang #include <compat/linux/linux_futex.h> 80*a9643ea8Slogwang #include <compat/linux/linux_ioctl.h> 81*a9643ea8Slogwang #include <compat/linux/linux_mib.h> 82*a9643ea8Slogwang #include <compat/linux/linux_misc.h> 83*a9643ea8Slogwang #include <compat/linux/linux_signal.h> 84*a9643ea8Slogwang #include <compat/linux/linux_sysproto.h> 85*a9643ea8Slogwang #include <compat/linux/linux_util.h> 86*a9643ea8Slogwang #include <compat/linux/linux_vdso.h> 87*a9643ea8Slogwang 88*a9643ea8Slogwang MODULE_VERSION(linux64, 1); 89*a9643ea8Slogwang 90*a9643ea8Slogwang #if BYTE_ORDER == LITTLE_ENDIAN 91*a9643ea8Slogwang #define SHELLMAGIC 0x2123 /* #! */ 92*a9643ea8Slogwang #else 93*a9643ea8Slogwang #define SHELLMAGIC 0x2321 94*a9643ea8Slogwang #endif 95*a9643ea8Slogwang 96*a9643ea8Slogwang #if defined(DEBUG) 97*a9643ea8Slogwang SYSCTL_PROC(_compat_linux, OID_AUTO, debug, 98*a9643ea8Slogwang CTLTYPE_STRING | CTLFLAG_RW, 99*a9643ea8Slogwang 0, 0, linux_sysctl_debug, "A", 100*a9643ea8Slogwang "Linux 64 debugging control"); 101*a9643ea8Slogwang #endif 102*a9643ea8Slogwang 103*a9643ea8Slogwang /* 104*a9643ea8Slogwang * Allow the this functions to use the ldebug() facility 105*a9643ea8Slogwang * even though they are not syscalls themselves. Map them 106*a9643ea8Slogwang * to syscall 0. This is slightly less bogus than using 107*a9643ea8Slogwang * ldebug(sigreturn). 
108*a9643ea8Slogwang */ 109*a9643ea8Slogwang #define LINUX_SYS_linux_rt_sendsig 0 110*a9643ea8Slogwang 111*a9643ea8Slogwang const char *linux_kplatform; 112*a9643ea8Slogwang static int linux_szsigcode; 113*a9643ea8Slogwang static vm_object_t linux_shared_page_obj; 114*a9643ea8Slogwang static char *linux_shared_page_mapping; 115*a9643ea8Slogwang extern char _binary_linux_locore_o_start; 116*a9643ea8Slogwang extern char _binary_linux_locore_o_end; 117*a9643ea8Slogwang 118*a9643ea8Slogwang extern struct sysent linux_sysent[LINUX_SYS_MAXSYSCALL]; 119*a9643ea8Slogwang 120*a9643ea8Slogwang SET_DECLARE(linux_ioctl_handler_set, struct linux_ioctl_handler); 121*a9643ea8Slogwang 122*a9643ea8Slogwang static register_t * linux_copyout_strings(struct image_params *imgp); 123*a9643ea8Slogwang static int elf_linux_fixup(register_t **stack_base, 124*a9643ea8Slogwang struct image_params *iparams); 125*a9643ea8Slogwang static boolean_t linux_trans_osrel(const Elf_Note *note, int32_t *osrel); 126*a9643ea8Slogwang static void linux_vdso_install(void *param); 127*a9643ea8Slogwang static void linux_vdso_deinstall(void *param); 128*a9643ea8Slogwang static void linux_set_syscall_retval(struct thread *td, int error); 129*a9643ea8Slogwang static int linux_fetch_syscall_args(struct thread *td, struct syscall_args *sa); 130*a9643ea8Slogwang static void linux_exec_setregs(struct thread *td, struct image_params *imgp, 131*a9643ea8Slogwang u_long stack); 132*a9643ea8Slogwang static int linux_vsyscall(struct thread *td); 133*a9643ea8Slogwang 134*a9643ea8Slogwang /* 135*a9643ea8Slogwang * Linux syscalls return negative errno's, we do positive and map them 136*a9643ea8Slogwang * Reference: 137*a9643ea8Slogwang * FreeBSD: src/sys/sys/errno.h 138*a9643ea8Slogwang * Linux: linux-2.6.17.8/include/asm-generic/errno-base.h 139*a9643ea8Slogwang * linux-2.6.17.8/include/asm-generic/errno.h 140*a9643ea8Slogwang */ 141*a9643ea8Slogwang static int bsd_to_linux_errno[ELAST + 1] = { 142*a9643ea8Slogwang -0, -1, 
-2, -3, -4, -5, -6, -7, -8, -9, 143*a9643ea8Slogwang -10, -35, -12, -13, -14, -15, -16, -17, -18, -19, 144*a9643ea8Slogwang -20, -21, -22, -23, -24, -25, -26, -27, -28, -29, 145*a9643ea8Slogwang -30, -31, -32, -33, -34, -11,-115,-114, -88, -89, 146*a9643ea8Slogwang -90, -91, -92, -93, -94, -95, -96, -97, -98, -99, 147*a9643ea8Slogwang -100,-101,-102,-103,-104,-105,-106,-107,-108,-109, 148*a9643ea8Slogwang -110,-111, -40, -36,-112,-113, -39, -11, -87,-122, 149*a9643ea8Slogwang -116, -66, -6, -6, -6, -6, -6, -37, -38, -9, 150*a9643ea8Slogwang -6, -6, -43, -42, -75,-125, -84, -95, -16, -74, 151*a9643ea8Slogwang -72, -67, -71 152*a9643ea8Slogwang }; 153*a9643ea8Slogwang 154*a9643ea8Slogwang #define LINUX_T_UNKNOWN 255 155*a9643ea8Slogwang static int _bsd_to_linux_trapcode[] = { 156*a9643ea8Slogwang LINUX_T_UNKNOWN, /* 0 */ 157*a9643ea8Slogwang 6, /* 1 T_PRIVINFLT */ 158*a9643ea8Slogwang LINUX_T_UNKNOWN, /* 2 */ 159*a9643ea8Slogwang 3, /* 3 T_BPTFLT */ 160*a9643ea8Slogwang LINUX_T_UNKNOWN, /* 4 */ 161*a9643ea8Slogwang LINUX_T_UNKNOWN, /* 5 */ 162*a9643ea8Slogwang 16, /* 6 T_ARITHTRAP */ 163*a9643ea8Slogwang 254, /* 7 T_ASTFLT */ 164*a9643ea8Slogwang LINUX_T_UNKNOWN, /* 8 */ 165*a9643ea8Slogwang 13, /* 9 T_PROTFLT */ 166*a9643ea8Slogwang 1, /* 10 T_TRCTRAP */ 167*a9643ea8Slogwang LINUX_T_UNKNOWN, /* 11 */ 168*a9643ea8Slogwang 14, /* 12 T_PAGEFLT */ 169*a9643ea8Slogwang LINUX_T_UNKNOWN, /* 13 */ 170*a9643ea8Slogwang 17, /* 14 T_ALIGNFLT */ 171*a9643ea8Slogwang LINUX_T_UNKNOWN, /* 15 */ 172*a9643ea8Slogwang LINUX_T_UNKNOWN, /* 16 */ 173*a9643ea8Slogwang LINUX_T_UNKNOWN, /* 17 */ 174*a9643ea8Slogwang 0, /* 18 T_DIVIDE */ 175*a9643ea8Slogwang 2, /* 19 T_NMI */ 176*a9643ea8Slogwang 4, /* 20 T_OFLOW */ 177*a9643ea8Slogwang 5, /* 21 T_BOUND */ 178*a9643ea8Slogwang 7, /* 22 T_DNA */ 179*a9643ea8Slogwang 8, /* 23 T_DOUBLEFLT */ 180*a9643ea8Slogwang 9, /* 24 T_FPOPFLT */ 181*a9643ea8Slogwang 10, /* 25 T_TSSFLT */ 182*a9643ea8Slogwang 11, /* 26 T_SEGNPFLT */ 183*a9643ea8Slogwang 
12, /* 27 T_STKFLT */ 184*a9643ea8Slogwang 18, /* 28 T_MCHK */ 185*a9643ea8Slogwang 19, /* 29 T_XMMFLT */ 186*a9643ea8Slogwang 15 /* 30 T_RESERVED */ 187*a9643ea8Slogwang }; 188*a9643ea8Slogwang #define bsd_to_linux_trapcode(code) \ 189*a9643ea8Slogwang ((code)<nitems(_bsd_to_linux_trapcode)? \ 190*a9643ea8Slogwang _bsd_to_linux_trapcode[(code)]: \ 191*a9643ea8Slogwang LINUX_T_UNKNOWN) 192*a9643ea8Slogwang 193*a9643ea8Slogwang LINUX_VDSO_SYM_INTPTR(linux_rt_sigcode); 194*a9643ea8Slogwang LINUX_VDSO_SYM_CHAR(linux_platform); 195*a9643ea8Slogwang 196*a9643ea8Slogwang /* 197*a9643ea8Slogwang * If FreeBSD & Linux have a difference of opinion about what a trap 198*a9643ea8Slogwang * means, deal with it here. 199*a9643ea8Slogwang * 200*a9643ea8Slogwang * MPSAFE 201*a9643ea8Slogwang */ 202*a9643ea8Slogwang static int 203*a9643ea8Slogwang translate_traps(int signal, int trap_code) 204*a9643ea8Slogwang { 205*a9643ea8Slogwang 206*a9643ea8Slogwang if (signal != SIGBUS) 207*a9643ea8Slogwang return signal; 208*a9643ea8Slogwang switch (trap_code) { 209*a9643ea8Slogwang case T_PROTFLT: 210*a9643ea8Slogwang case T_TSSFLT: 211*a9643ea8Slogwang case T_DOUBLEFLT: 212*a9643ea8Slogwang case T_PAGEFLT: 213*a9643ea8Slogwang return SIGSEGV; 214*a9643ea8Slogwang default: 215*a9643ea8Slogwang return signal; 216*a9643ea8Slogwang } 217*a9643ea8Slogwang } 218*a9643ea8Slogwang 219*a9643ea8Slogwang static int 220*a9643ea8Slogwang linux_fetch_syscall_args(struct thread *td, struct syscall_args *sa) 221*a9643ea8Slogwang { 222*a9643ea8Slogwang struct proc *p; 223*a9643ea8Slogwang struct trapframe *frame; 224*a9643ea8Slogwang 225*a9643ea8Slogwang p = td->td_proc; 226*a9643ea8Slogwang frame = td->td_frame; 227*a9643ea8Slogwang 228*a9643ea8Slogwang sa->args[0] = frame->tf_rdi; 229*a9643ea8Slogwang sa->args[1] = frame->tf_rsi; 230*a9643ea8Slogwang sa->args[2] = frame->tf_rdx; 231*a9643ea8Slogwang sa->args[3] = frame->tf_rcx; 232*a9643ea8Slogwang sa->args[4] = frame->tf_r8; 233*a9643ea8Slogwang 
sa->args[5] = frame->tf_r9; 234*a9643ea8Slogwang sa->code = frame->tf_rax; 235*a9643ea8Slogwang 236*a9643ea8Slogwang if (sa->code >= p->p_sysent->sv_size) 237*a9643ea8Slogwang /* nosys */ 238*a9643ea8Slogwang sa->callp = &p->p_sysent->sv_table[p->p_sysent->sv_size - 1]; 239*a9643ea8Slogwang else 240*a9643ea8Slogwang sa->callp = &p->p_sysent->sv_table[sa->code]; 241*a9643ea8Slogwang sa->narg = sa->callp->sy_narg; 242*a9643ea8Slogwang 243*a9643ea8Slogwang td->td_retval[0] = 0; 244*a9643ea8Slogwang return (0); 245*a9643ea8Slogwang } 246*a9643ea8Slogwang 247*a9643ea8Slogwang static void 248*a9643ea8Slogwang linux_set_syscall_retval(struct thread *td, int error) 249*a9643ea8Slogwang { 250*a9643ea8Slogwang struct trapframe *frame = td->td_frame; 251*a9643ea8Slogwang 252*a9643ea8Slogwang /* 253*a9643ea8Slogwang * On Linux only %rcx and %r11 values are not preserved across 254*a9643ea8Slogwang * the syscall. 255*a9643ea8Slogwang * So, do not clobber %rdx and %r10 256*a9643ea8Slogwang */ 257*a9643ea8Slogwang td->td_retval[1] = frame->tf_rdx; 258*a9643ea8Slogwang frame->tf_r10 = frame->tf_rcx; 259*a9643ea8Slogwang 260*a9643ea8Slogwang cpu_set_syscall_retval(td, error); 261*a9643ea8Slogwang 262*a9643ea8Slogwang /* Restore all registers. 
*/ 263*a9643ea8Slogwang set_pcb_flags(td->td_pcb, PCB_FULL_IRET); 264*a9643ea8Slogwang } 265*a9643ea8Slogwang 266*a9643ea8Slogwang static int 267*a9643ea8Slogwang elf_linux_fixup(register_t **stack_base, struct image_params *imgp) 268*a9643ea8Slogwang { 269*a9643ea8Slogwang Elf_Auxargs *args; 270*a9643ea8Slogwang Elf_Addr *base; 271*a9643ea8Slogwang Elf_Addr *pos; 272*a9643ea8Slogwang struct ps_strings *arginfo; 273*a9643ea8Slogwang struct proc *p; 274*a9643ea8Slogwang int issetugid; 275*a9643ea8Slogwang 276*a9643ea8Slogwang p = imgp->proc; 277*a9643ea8Slogwang arginfo = (struct ps_strings *)p->p_sysent->sv_psstrings; 278*a9643ea8Slogwang 279*a9643ea8Slogwang KASSERT(curthread->td_proc == imgp->proc, 280*a9643ea8Slogwang ("unsafe elf_linux_fixup(), should be curproc")); 281*a9643ea8Slogwang base = (Elf64_Addr *)*stack_base; 282*a9643ea8Slogwang args = (Elf64_Auxargs *)imgp->auxargs; 283*a9643ea8Slogwang pos = base + (imgp->args->argc + imgp->args->envc + 2); 284*a9643ea8Slogwang 285*a9643ea8Slogwang issetugid = p->p_flag & P_SUGID ? 
1 : 0; 286*a9643ea8Slogwang AUXARGS_ENTRY(pos, LINUX_AT_SYSINFO_EHDR, 287*a9643ea8Slogwang imgp->proc->p_sysent->sv_shared_page_base); 288*a9643ea8Slogwang AUXARGS_ENTRY(pos, LINUX_AT_HWCAP, cpu_feature); 289*a9643ea8Slogwang AUXARGS_ENTRY(pos, LINUX_AT_CLKTCK, stclohz); 290*a9643ea8Slogwang AUXARGS_ENTRY(pos, AT_PHDR, args->phdr); 291*a9643ea8Slogwang AUXARGS_ENTRY(pos, AT_PHENT, args->phent); 292*a9643ea8Slogwang AUXARGS_ENTRY(pos, AT_PHNUM, args->phnum); 293*a9643ea8Slogwang AUXARGS_ENTRY(pos, AT_PAGESZ, args->pagesz); 294*a9643ea8Slogwang AUXARGS_ENTRY(pos, AT_BASE, args->base); 295*a9643ea8Slogwang AUXARGS_ENTRY(pos, AT_FLAGS, args->flags); 296*a9643ea8Slogwang AUXARGS_ENTRY(pos, AT_ENTRY, args->entry); 297*a9643ea8Slogwang AUXARGS_ENTRY(pos, AT_UID, imgp->proc->p_ucred->cr_ruid); 298*a9643ea8Slogwang AUXARGS_ENTRY(pos, AT_EUID, imgp->proc->p_ucred->cr_svuid); 299*a9643ea8Slogwang AUXARGS_ENTRY(pos, AT_GID, imgp->proc->p_ucred->cr_rgid); 300*a9643ea8Slogwang AUXARGS_ENTRY(pos, AT_EGID, imgp->proc->p_ucred->cr_svgid); 301*a9643ea8Slogwang AUXARGS_ENTRY(pos, LINUX_AT_SECURE, issetugid); 302*a9643ea8Slogwang AUXARGS_ENTRY(pos, LINUX_AT_PLATFORM, PTROUT(linux_platform)); 303*a9643ea8Slogwang AUXARGS_ENTRY(pos, LINUX_AT_RANDOM, imgp->canary); 304*a9643ea8Slogwang if (imgp->execpathp != 0) 305*a9643ea8Slogwang AUXARGS_ENTRY(pos, LINUX_AT_EXECFN, imgp->execpathp); 306*a9643ea8Slogwang if (args->execfd != -1) 307*a9643ea8Slogwang AUXARGS_ENTRY(pos, AT_EXECFD, args->execfd); 308*a9643ea8Slogwang AUXARGS_ENTRY(pos, AT_NULL, 0); 309*a9643ea8Slogwang free(imgp->auxargs, M_TEMP); 310*a9643ea8Slogwang imgp->auxargs = NULL; 311*a9643ea8Slogwang 312*a9643ea8Slogwang base--; 313*a9643ea8Slogwang suword(base, (uint64_t)imgp->args->argc); 314*a9643ea8Slogwang 315*a9643ea8Slogwang *stack_base = (register_t *)base; 316*a9643ea8Slogwang return (0); 317*a9643ea8Slogwang } 318*a9643ea8Slogwang 319*a9643ea8Slogwang /* 320*a9643ea8Slogwang * Copy strings out to the new process address 
space, constructing new arg
 * and env vector tables. Return a pointer to the base so that it can be used
 * as the initial stack pointer.
 *
 * Besides the strings themselves this also copies out the executable
 * path (when auxargs are present), a random canary, and fills in the
 * ps_strings structure.
 */
static register_t *
linux_copyout_strings(struct image_params *imgp)
{
	struct ps_strings *arginfo;
	struct proc *p;
	register_t *stack_base;
	char canary[LINUX_AT_RANDOM_LEN];
	char **vec;
	char *src, *dst;
	size_t execpath_len;
	int argc, envc;

	/*
	 * Calculate string base and vector table pointers.
	 */
	execpath_len = 0;
	if (imgp->execpath != NULL && imgp->auxargs != NULL)
		execpath_len = strlen(imgp->execpath) + 1;

	p = imgp->proc;
	arginfo = (struct ps_strings *)p->p_sysent->sv_psstrings;
	dst = (caddr_t)arginfo - SPARE_USRSPACE -
	    roundup(sizeof(canary), sizeof(char *)) -
	    roundup(execpath_len, sizeof(char *)) -
	    roundup(ARG_MAX - imgp->args->stringspace, sizeof(char *));

	/* The executable path sits directly below ps_strings. */
	if (execpath_len != 0) {
		imgp->execpathp = (uintptr_t)arginfo - execpath_len;
		copyout(imgp->execpath, (void *)imgp->execpathp, execpath_len);
	}

	/*
	 * Prepare the canary for SSP.
	 */
	arc4rand(canary, sizeof(canary), 0);
	imgp->canary = (uintptr_t)arginfo -
	    roundup(execpath_len, sizeof(char *)) -
	    roundup(sizeof(canary), sizeof(char *));
	copyout(canary, (void *)imgp->canary, sizeof(canary));

	/*
	 * The '+ 2' below is for the null pointers at the end of each of
	 * the arg and env vector sets.
	 */
	if (imgp->auxargs) {
		/*
		 * A valid auxargs pointer means extra room is needed for
		 * the runtime loader's arguments.  'LINUX_AT_COUNT * 2'
		 * is the fallback size for the ELF Auxargs data when the
		 * image did not set one.
		 */
		if (imgp->auxarg_size == 0)
			imgp->auxarg_size = LINUX_AT_COUNT * 2;

		vec = (char **)(dst - (imgp->args->argc +
		    imgp->args->envc + 2 + imgp->auxarg_size) * sizeof(char *));
	} else {
		vec = (char **)(dst - (imgp->args->argc +
		    imgp->args->envc + 2) * sizeof(char *));
	}

	/*
	 * The vector table start also becomes our initial stack base.
	 */
	stack_base = (register_t *)vec;

	src = imgp->args->begin_argv;
	argc = imgp->args->argc;
	envc = imgp->args->envc;

	/*
	 * Copy out strings - arguments and environment.
	 */
	copyout(src, dst, ARG_MAX - imgp->args->stringspace);

	/*
	 * Fill in "ps_strings" struct for ps, w, etc.
	 */
	suword(&arginfo->ps_argvstr, (long)(intptr_t)vec);
	suword(&arginfo->ps_nargvstr, argc);

	/*
	 * Fill in argument portion of vector table.  The kernel copy of
	 * the strings is walked in step with the user-space address so
	 * that each pointer lands on the start of its string.
	 */
	while (argc > 0) {
		suword(vec++, (long)(intptr_t)dst);
		while (*src++ != 0)
			dst++;
		dst++;
		argc--;
	}

	/* a null vector table pointer separates the argp's from the envp's */
	suword(vec++, 0);

	suword(&arginfo->ps_envstr, (long)(intptr_t)vec);
	suword(&arginfo->ps_nenvstr, envc);

	/*
	 * Fill in environment portion of vector table.
	 */
	while (envc > 0) {
		suword(vec++, (long)(intptr_t)dst);
		while (*src++ != 0)
			dst++;
		dst++;
		envc--;
	}

	/* end of vector table is a null pointer */
	suword(vec, 0);
	return (stack_base);
}

/*
 * Reset registers to default values on exec.
447*a9643ea8Slogwang */ 448*a9643ea8Slogwang static void 449*a9643ea8Slogwang linux_exec_setregs(struct thread *td, struct image_params *imgp, u_long stack) 450*a9643ea8Slogwang { 451*a9643ea8Slogwang struct trapframe *regs = td->td_frame; 452*a9643ea8Slogwang struct pcb *pcb = td->td_pcb; 453*a9643ea8Slogwang 454*a9643ea8Slogwang mtx_lock(&dt_lock); 455*a9643ea8Slogwang if (td->td_proc->p_md.md_ldt != NULL) 456*a9643ea8Slogwang user_ldt_free(td); 457*a9643ea8Slogwang else 458*a9643ea8Slogwang mtx_unlock(&dt_lock); 459*a9643ea8Slogwang 460*a9643ea8Slogwang pcb->pcb_fsbase = 0; 461*a9643ea8Slogwang pcb->pcb_gsbase = 0; 462*a9643ea8Slogwang clear_pcb_flags(pcb, PCB_32BIT); 463*a9643ea8Slogwang pcb->pcb_initial_fpucw = __LINUX_NPXCW__; 464*a9643ea8Slogwang set_pcb_flags(pcb, PCB_FULL_IRET); 465*a9643ea8Slogwang 466*a9643ea8Slogwang bzero((char *)regs, sizeof(struct trapframe)); 467*a9643ea8Slogwang regs->tf_rip = imgp->entry_addr; 468*a9643ea8Slogwang regs->tf_rsp = stack; 469*a9643ea8Slogwang regs->tf_rflags = PSL_USER | (regs->tf_rflags & PSL_T); 470*a9643ea8Slogwang regs->tf_ss = _udatasel; 471*a9643ea8Slogwang regs->tf_cs = _ucodesel; 472*a9643ea8Slogwang regs->tf_ds = _udatasel; 473*a9643ea8Slogwang regs->tf_es = _udatasel; 474*a9643ea8Slogwang regs->tf_fs = _ufssel; 475*a9643ea8Slogwang regs->tf_gs = _ugssel; 476*a9643ea8Slogwang regs->tf_flags = TF_HASSEGS; 477*a9643ea8Slogwang 478*a9643ea8Slogwang /* 479*a9643ea8Slogwang * Reset the hardware debug registers if they were in use. 480*a9643ea8Slogwang * They won't have any meaning for the newly exec'd process. 
481*a9643ea8Slogwang */ 482*a9643ea8Slogwang if (pcb->pcb_flags & PCB_DBREGS) { 483*a9643ea8Slogwang pcb->pcb_dr0 = 0; 484*a9643ea8Slogwang pcb->pcb_dr1 = 0; 485*a9643ea8Slogwang pcb->pcb_dr2 = 0; 486*a9643ea8Slogwang pcb->pcb_dr3 = 0; 487*a9643ea8Slogwang pcb->pcb_dr6 = 0; 488*a9643ea8Slogwang pcb->pcb_dr7 = 0; 489*a9643ea8Slogwang if (pcb == curpcb) { 490*a9643ea8Slogwang /* 491*a9643ea8Slogwang * Clear the debug registers on the running 492*a9643ea8Slogwang * CPU, otherwise they will end up affecting 493*a9643ea8Slogwang * the next process we switch to. 494*a9643ea8Slogwang */ 495*a9643ea8Slogwang reset_dbregs(); 496*a9643ea8Slogwang } 497*a9643ea8Slogwang clear_pcb_flags(pcb, PCB_DBREGS); 498*a9643ea8Slogwang } 499*a9643ea8Slogwang 500*a9643ea8Slogwang /* 501*a9643ea8Slogwang * Drop the FP state if we hold it, so that the process gets a 502*a9643ea8Slogwang * clean FP state if it uses the FPU again. 503*a9643ea8Slogwang */ 504*a9643ea8Slogwang fpstate_drop(td); 505*a9643ea8Slogwang } 506*a9643ea8Slogwang 507*a9643ea8Slogwang /* 508*a9643ea8Slogwang * Copied from amd64/amd64/machdep.c 509*a9643ea8Slogwang * 510*a9643ea8Slogwang * XXX fpu state need? 
don't think so 511*a9643ea8Slogwang */ 512*a9643ea8Slogwang int 513*a9643ea8Slogwang linux_rt_sigreturn(struct thread *td, struct linux_rt_sigreturn_args *args) 514*a9643ea8Slogwang { 515*a9643ea8Slogwang struct proc *p; 516*a9643ea8Slogwang struct l_ucontext uc; 517*a9643ea8Slogwang struct l_sigcontext *context; 518*a9643ea8Slogwang struct trapframe *regs; 519*a9643ea8Slogwang unsigned long rflags; 520*a9643ea8Slogwang int error; 521*a9643ea8Slogwang ksiginfo_t ksi; 522*a9643ea8Slogwang 523*a9643ea8Slogwang regs = td->td_frame; 524*a9643ea8Slogwang error = copyin((void *)regs->tf_rbx, &uc, sizeof(uc)); 525*a9643ea8Slogwang if (error != 0) 526*a9643ea8Slogwang return (error); 527*a9643ea8Slogwang 528*a9643ea8Slogwang p = td->td_proc; 529*a9643ea8Slogwang context = &uc.uc_mcontext; 530*a9643ea8Slogwang rflags = context->sc_rflags; 531*a9643ea8Slogwang 532*a9643ea8Slogwang /* 533*a9643ea8Slogwang * Don't allow users to change privileged or reserved flags. 534*a9643ea8Slogwang */ 535*a9643ea8Slogwang /* 536*a9643ea8Slogwang * XXX do allow users to change the privileged flag PSL_RF. 537*a9643ea8Slogwang * The cpu sets PSL_RF in tf_rflags for faults. Debuggers 538*a9643ea8Slogwang * should sometimes set it there too. tf_rflags is kept in 539*a9643ea8Slogwang * the signal context during signal handling and there is no 540*a9643ea8Slogwang * other place to remember it, so the PSL_RF bit may be 541*a9643ea8Slogwang * corrupted by the signal handler without us knowing. 542*a9643ea8Slogwang * Corruption of the PSL_RF bit at worst causes one more or 543*a9643ea8Slogwang * one less debugger trap, so allowing it is fairly harmless. 
544*a9643ea8Slogwang */ 545*a9643ea8Slogwang 546*a9643ea8Slogwang #define RFLAG_SECURE(ef, oef) ((((ef) ^ (oef)) & ~PSL_USERCHANGE) == 0) 547*a9643ea8Slogwang if (!RFLAG_SECURE(rflags & ~PSL_RF, regs->tf_rflags & ~PSL_RF)) { 548*a9643ea8Slogwang printf("linux_rt_sigreturn: rflags = 0x%lx\n", rflags); 549*a9643ea8Slogwang return (EINVAL); 550*a9643ea8Slogwang } 551*a9643ea8Slogwang 552*a9643ea8Slogwang /* 553*a9643ea8Slogwang * Don't allow users to load a valid privileged %cs. Let the 554*a9643ea8Slogwang * hardware check for invalid selectors, excess privilege in 555*a9643ea8Slogwang * other selectors, invalid %eip's and invalid %esp's. 556*a9643ea8Slogwang */ 557*a9643ea8Slogwang #define CS_SECURE(cs) (ISPL(cs) == SEL_UPL) 558*a9643ea8Slogwang if (!CS_SECURE(context->sc_cs)) { 559*a9643ea8Slogwang printf("linux_rt_sigreturn: cs = 0x%x\n", context->sc_cs); 560*a9643ea8Slogwang ksiginfo_init_trap(&ksi); 561*a9643ea8Slogwang ksi.ksi_signo = SIGBUS; 562*a9643ea8Slogwang ksi.ksi_code = BUS_OBJERR; 563*a9643ea8Slogwang ksi.ksi_trapno = T_PROTFLT; 564*a9643ea8Slogwang ksi.ksi_addr = (void *)regs->tf_rip; 565*a9643ea8Slogwang trapsignal(td, &ksi); 566*a9643ea8Slogwang return (EINVAL); 567*a9643ea8Slogwang } 568*a9643ea8Slogwang 569*a9643ea8Slogwang PROC_LOCK(p); 570*a9643ea8Slogwang linux_to_bsd_sigset(&uc.uc_sigmask, &td->td_sigmask); 571*a9643ea8Slogwang SIG_CANTMASK(td->td_sigmask); 572*a9643ea8Slogwang signotify(td); 573*a9643ea8Slogwang PROC_UNLOCK(p); 574*a9643ea8Slogwang 575*a9643ea8Slogwang regs->tf_rdi = context->sc_rdi; 576*a9643ea8Slogwang regs->tf_rsi = context->sc_rsi; 577*a9643ea8Slogwang regs->tf_rdx = context->sc_rdx; 578*a9643ea8Slogwang regs->tf_rbp = context->sc_rbp; 579*a9643ea8Slogwang regs->tf_rbx = context->sc_rbx; 580*a9643ea8Slogwang regs->tf_rcx = context->sc_rcx; 581*a9643ea8Slogwang regs->tf_rax = context->sc_rax; 582*a9643ea8Slogwang regs->tf_rip = context->sc_rip; 583*a9643ea8Slogwang regs->tf_rsp = context->sc_rsp; 584*a9643ea8Slogwang 
regs->tf_r8 = context->sc_r8; 585*a9643ea8Slogwang regs->tf_r9 = context->sc_r9; 586*a9643ea8Slogwang regs->tf_r10 = context->sc_r10; 587*a9643ea8Slogwang regs->tf_r11 = context->sc_r11; 588*a9643ea8Slogwang regs->tf_r12 = context->sc_r12; 589*a9643ea8Slogwang regs->tf_r13 = context->sc_r13; 590*a9643ea8Slogwang regs->tf_r14 = context->sc_r14; 591*a9643ea8Slogwang regs->tf_r15 = context->sc_r15; 592*a9643ea8Slogwang regs->tf_cs = context->sc_cs; 593*a9643ea8Slogwang regs->tf_err = context->sc_err; 594*a9643ea8Slogwang regs->tf_rflags = rflags; 595*a9643ea8Slogwang 596*a9643ea8Slogwang set_pcb_flags(td->td_pcb, PCB_FULL_IRET); 597*a9643ea8Slogwang return (EJUSTRETURN); 598*a9643ea8Slogwang } 599*a9643ea8Slogwang 600*a9643ea8Slogwang /* 601*a9643ea8Slogwang * copied from amd64/amd64/machdep.c 602*a9643ea8Slogwang * 603*a9643ea8Slogwang * Send an interrupt to process. 604*a9643ea8Slogwang */ 605*a9643ea8Slogwang static void 606*a9643ea8Slogwang linux_rt_sendsig(sig_t catcher, ksiginfo_t *ksi, sigset_t *mask) 607*a9643ea8Slogwang { 608*a9643ea8Slogwang struct l_rt_sigframe sf, *sfp; 609*a9643ea8Slogwang struct proc *p; 610*a9643ea8Slogwang struct thread *td; 611*a9643ea8Slogwang struct sigacts *psp; 612*a9643ea8Slogwang caddr_t sp; 613*a9643ea8Slogwang struct trapframe *regs; 614*a9643ea8Slogwang int sig, code; 615*a9643ea8Slogwang int oonstack; 616*a9643ea8Slogwang 617*a9643ea8Slogwang td = curthread; 618*a9643ea8Slogwang p = td->td_proc; 619*a9643ea8Slogwang PROC_LOCK_ASSERT(p, MA_OWNED); 620*a9643ea8Slogwang sig = ksi->ksi_signo; 621*a9643ea8Slogwang psp = p->p_sigacts; 622*a9643ea8Slogwang code = ksi->ksi_code; 623*a9643ea8Slogwang mtx_assert(&psp->ps_mtx, MA_OWNED); 624*a9643ea8Slogwang regs = td->td_frame; 625*a9643ea8Slogwang oonstack = sigonstack(regs->tf_rsp); 626*a9643ea8Slogwang 627*a9643ea8Slogwang LINUX_CTR4(rt_sendsig, "%p, %d, %p, %u", 628*a9643ea8Slogwang catcher, sig, mask, code); 629*a9643ea8Slogwang 630*a9643ea8Slogwang /* Allocate space for the 
signal handler context. */ 631*a9643ea8Slogwang if ((td->td_pflags & TDP_ALTSTACK) != 0 && !oonstack && 632*a9643ea8Slogwang SIGISMEMBER(psp->ps_sigonstack, sig)) { 633*a9643ea8Slogwang sp = (caddr_t)td->td_sigstk.ss_sp + td->td_sigstk.ss_size - 634*a9643ea8Slogwang sizeof(struct l_rt_sigframe); 635*a9643ea8Slogwang } else 636*a9643ea8Slogwang sp = (caddr_t)regs->tf_rsp - sizeof(struct l_rt_sigframe) - 128; 637*a9643ea8Slogwang /* Align to 16 bytes. */ 638*a9643ea8Slogwang sfp = (struct l_rt_sigframe *)((unsigned long)sp & ~0xFul); 639*a9643ea8Slogwang mtx_unlock(&psp->ps_mtx); 640*a9643ea8Slogwang 641*a9643ea8Slogwang /* Translate the signal. */ 642*a9643ea8Slogwang sig = bsd_to_linux_signal(sig); 643*a9643ea8Slogwang 644*a9643ea8Slogwang /* Save user context. */ 645*a9643ea8Slogwang bzero(&sf, sizeof(sf)); 646*a9643ea8Slogwang bsd_to_linux_sigset(mask, &sf.sf_sc.uc_sigmask); 647*a9643ea8Slogwang bsd_to_linux_sigset(mask, &sf.sf_sc.uc_mcontext.sc_mask); 648*a9643ea8Slogwang 649*a9643ea8Slogwang sf.sf_sc.uc_stack.ss_sp = PTROUT(td->td_sigstk.ss_sp); 650*a9643ea8Slogwang sf.sf_sc.uc_stack.ss_size = td->td_sigstk.ss_size; 651*a9643ea8Slogwang sf.sf_sc.uc_stack.ss_flags = (td->td_pflags & TDP_ALTSTACK) 652*a9643ea8Slogwang ? ((oonstack) ? 
LINUX_SS_ONSTACK : 0) : LINUX_SS_DISABLE; 653*a9643ea8Slogwang PROC_UNLOCK(p); 654*a9643ea8Slogwang 655*a9643ea8Slogwang sf.sf_sc.uc_mcontext.sc_rdi = regs->tf_rdi; 656*a9643ea8Slogwang sf.sf_sc.uc_mcontext.sc_rsi = regs->tf_rsi; 657*a9643ea8Slogwang sf.sf_sc.uc_mcontext.sc_rdx = regs->tf_rdx; 658*a9643ea8Slogwang sf.sf_sc.uc_mcontext.sc_rbp = regs->tf_rbp; 659*a9643ea8Slogwang sf.sf_sc.uc_mcontext.sc_rbx = regs->tf_rbx; 660*a9643ea8Slogwang sf.sf_sc.uc_mcontext.sc_rcx = regs->tf_rcx; 661*a9643ea8Slogwang sf.sf_sc.uc_mcontext.sc_rax = regs->tf_rax; 662*a9643ea8Slogwang sf.sf_sc.uc_mcontext.sc_rip = regs->tf_rip; 663*a9643ea8Slogwang sf.sf_sc.uc_mcontext.sc_rsp = regs->tf_rsp; 664*a9643ea8Slogwang sf.sf_sc.uc_mcontext.sc_r8 = regs->tf_r8; 665*a9643ea8Slogwang sf.sf_sc.uc_mcontext.sc_r9 = regs->tf_r9; 666*a9643ea8Slogwang sf.sf_sc.uc_mcontext.sc_r10 = regs->tf_r10; 667*a9643ea8Slogwang sf.sf_sc.uc_mcontext.sc_r11 = regs->tf_r11; 668*a9643ea8Slogwang sf.sf_sc.uc_mcontext.sc_r12 = regs->tf_r12; 669*a9643ea8Slogwang sf.sf_sc.uc_mcontext.sc_r13 = regs->tf_r13; 670*a9643ea8Slogwang sf.sf_sc.uc_mcontext.sc_r14 = regs->tf_r14; 671*a9643ea8Slogwang sf.sf_sc.uc_mcontext.sc_r15 = regs->tf_r15; 672*a9643ea8Slogwang sf.sf_sc.uc_mcontext.sc_cs = regs->tf_cs; 673*a9643ea8Slogwang sf.sf_sc.uc_mcontext.sc_rflags = regs->tf_rflags; 674*a9643ea8Slogwang sf.sf_sc.uc_mcontext.sc_err = regs->tf_err; 675*a9643ea8Slogwang sf.sf_sc.uc_mcontext.sc_trapno = bsd_to_linux_trapcode(code); 676*a9643ea8Slogwang sf.sf_sc.uc_mcontext.sc_cr2 = (register_t)ksi->ksi_addr; 677*a9643ea8Slogwang 678*a9643ea8Slogwang /* Build the argument list for the signal handler. 
*/ 679*a9643ea8Slogwang regs->tf_rdi = sig; /* arg 1 in %rdi */ 680*a9643ea8Slogwang regs->tf_rax = 0; 681*a9643ea8Slogwang regs->tf_rsi = (register_t)&sfp->sf_si; /* arg 2 in %rsi */ 682*a9643ea8Slogwang regs->tf_rdx = (register_t)&sfp->sf_sc; /* arg 3 in %rdx */ 683*a9643ea8Slogwang 684*a9643ea8Slogwang sf.sf_handler = catcher; 685*a9643ea8Slogwang /* Fill in POSIX parts */ 686*a9643ea8Slogwang ksiginfo_to_lsiginfo(ksi, &sf.sf_si, sig); 687*a9643ea8Slogwang 688*a9643ea8Slogwang /* 689*a9643ea8Slogwang * Copy the sigframe out to the user's stack. 690*a9643ea8Slogwang */ 691*a9643ea8Slogwang if (copyout(&sf, sfp, sizeof(*sfp)) != 0) { 692*a9643ea8Slogwang #ifdef DEBUG 693*a9643ea8Slogwang printf("process %ld has trashed its stack\n", (long)p->p_pid); 694*a9643ea8Slogwang #endif 695*a9643ea8Slogwang PROC_LOCK(p); 696*a9643ea8Slogwang sigexit(td, SIGILL); 697*a9643ea8Slogwang } 698*a9643ea8Slogwang 699*a9643ea8Slogwang regs->tf_rsp = (long)sfp; 700*a9643ea8Slogwang regs->tf_rip = linux_rt_sigcode; 701*a9643ea8Slogwang regs->tf_rflags &= ~(PSL_T | PSL_D); 702*a9643ea8Slogwang regs->tf_cs = _ucodesel; 703*a9643ea8Slogwang set_pcb_flags(td->td_pcb, PCB_FULL_IRET); 704*a9643ea8Slogwang PROC_LOCK(p); 705*a9643ea8Slogwang mtx_lock(&psp->ps_mtx); 706*a9643ea8Slogwang } 707*a9643ea8Slogwang 708*a9643ea8Slogwang /* 709*a9643ea8Slogwang * If a linux binary is exec'ing something, try this image activator 710*a9643ea8Slogwang * first. We override standard shell script execution in order to 711*a9643ea8Slogwang * be able to modify the interpreter path. We only do this if a linux 712*a9643ea8Slogwang * binary is doing the exec, so we do not create an EXEC module for it. 
713*a9643ea8Slogwang */ 714*a9643ea8Slogwang static int exec_linux_imgact_try(struct image_params *iparams); 715*a9643ea8Slogwang 716*a9643ea8Slogwang static int 717*a9643ea8Slogwang exec_linux_imgact_try(struct image_params *imgp) 718*a9643ea8Slogwang { 719*a9643ea8Slogwang const char *head = (const char *)imgp->image_header; 720*a9643ea8Slogwang char *rpath; 721*a9643ea8Slogwang int error = -1, len; 722*a9643ea8Slogwang 723*a9643ea8Slogwang /* 724*a9643ea8Slogwang * The interpreter for shell scripts run from a linux binary needs 725*a9643ea8Slogwang * to be located in /compat/linux if possible in order to recursively 726*a9643ea8Slogwang * maintain linux path emulation. 727*a9643ea8Slogwang */ 728*a9643ea8Slogwang if (((const short *)head)[0] == SHELLMAGIC) { 729*a9643ea8Slogwang /* 730*a9643ea8Slogwang * Run our normal shell image activator. If it succeeds 731*a9643ea8Slogwang * attempt to use the alternate path for the interpreter. 732*a9643ea8Slogwang * If an alternate path is found, use our stringspace 733*a9643ea8Slogwang * to store it. 
734*a9643ea8Slogwang */ 735*a9643ea8Slogwang if ((error = exec_shell_imgact(imgp)) == 0) { 736*a9643ea8Slogwang linux_emul_convpath(FIRST_THREAD_IN_PROC(imgp->proc), 737*a9643ea8Slogwang imgp->interpreter_name, UIO_SYSSPACE, 738*a9643ea8Slogwang &rpath, 0, AT_FDCWD); 739*a9643ea8Slogwang if (rpath != NULL) { 740*a9643ea8Slogwang len = strlen(rpath) + 1; 741*a9643ea8Slogwang 742*a9643ea8Slogwang if (len <= MAXSHELLCMDLEN) 743*a9643ea8Slogwang memcpy(imgp->interpreter_name, 744*a9643ea8Slogwang rpath, len); 745*a9643ea8Slogwang free(rpath, M_TEMP); 746*a9643ea8Slogwang } 747*a9643ea8Slogwang } 748*a9643ea8Slogwang } 749*a9643ea8Slogwang return(error); 750*a9643ea8Slogwang } 751*a9643ea8Slogwang 752*a9643ea8Slogwang #define LINUX_VSYSCALL_START (-10UL << 20) 753*a9643ea8Slogwang #define LINUX_VSYSCALL_SZ 1024 754*a9643ea8Slogwang 755*a9643ea8Slogwang const unsigned long linux_vsyscall_vector[] = { 756*a9643ea8Slogwang LINUX_SYS_gettimeofday, 757*a9643ea8Slogwang LINUX_SYS_linux_time, 758*a9643ea8Slogwang /* getcpu not implemented */ 759*a9643ea8Slogwang }; 760*a9643ea8Slogwang 761*a9643ea8Slogwang static int 762*a9643ea8Slogwang linux_vsyscall(struct thread *td) 763*a9643ea8Slogwang { 764*a9643ea8Slogwang struct trapframe *frame; 765*a9643ea8Slogwang uint64_t retqaddr; 766*a9643ea8Slogwang int code, traced; 767*a9643ea8Slogwang int error; 768*a9643ea8Slogwang 769*a9643ea8Slogwang frame = td->td_frame; 770*a9643ea8Slogwang 771*a9643ea8Slogwang /* Check %rip for vsyscall area */ 772*a9643ea8Slogwang if (__predict_true(frame->tf_rip < LINUX_VSYSCALL_START)) 773*a9643ea8Slogwang return (EINVAL); 774*a9643ea8Slogwang if ((frame->tf_rip & (LINUX_VSYSCALL_SZ - 1)) != 0) 775*a9643ea8Slogwang return (EINVAL); 776*a9643ea8Slogwang code = (frame->tf_rip - LINUX_VSYSCALL_START) / LINUX_VSYSCALL_SZ; 777*a9643ea8Slogwang if (code >= nitems(linux_vsyscall_vector)) 778*a9643ea8Slogwang return (EINVAL); 779*a9643ea8Slogwang 780*a9643ea8Slogwang /* 781*a9643ea8Slogwang * vsyscall 
called as callq *(%rax), so we must 782*a9643ea8Slogwang * use return address from %rsp and also fixup %rsp 783*a9643ea8Slogwang */ 784*a9643ea8Slogwang error = copyin((void *)frame->tf_rsp, &retqaddr, sizeof(retqaddr)); 785*a9643ea8Slogwang if (error) 786*a9643ea8Slogwang return (error); 787*a9643ea8Slogwang 788*a9643ea8Slogwang frame->tf_rip = retqaddr; 789*a9643ea8Slogwang frame->tf_rax = linux_vsyscall_vector[code]; 790*a9643ea8Slogwang frame->tf_rsp += 8; 791*a9643ea8Slogwang 792*a9643ea8Slogwang traced = (frame->tf_flags & PSL_T); 793*a9643ea8Slogwang 794*a9643ea8Slogwang amd64_syscall(td, traced); 795*a9643ea8Slogwang 796*a9643ea8Slogwang return (0); 797*a9643ea8Slogwang } 798*a9643ea8Slogwang 799*a9643ea8Slogwang struct sysentvec elf_linux_sysvec = { 800*a9643ea8Slogwang .sv_size = LINUX_SYS_MAXSYSCALL, 801*a9643ea8Slogwang .sv_table = linux_sysent, 802*a9643ea8Slogwang .sv_mask = 0, 803*a9643ea8Slogwang .sv_errsize = ELAST + 1, 804*a9643ea8Slogwang .sv_errtbl = bsd_to_linux_errno, 805*a9643ea8Slogwang .sv_transtrap = translate_traps, 806*a9643ea8Slogwang .sv_fixup = elf_linux_fixup, 807*a9643ea8Slogwang .sv_sendsig = linux_rt_sendsig, 808*a9643ea8Slogwang .sv_sigcode = &_binary_linux_locore_o_start, 809*a9643ea8Slogwang .sv_szsigcode = &linux_szsigcode, 810*a9643ea8Slogwang .sv_name = "Linux ELF64", 811*a9643ea8Slogwang .sv_coredump = elf64_coredump, 812*a9643ea8Slogwang .sv_imgact_try = exec_linux_imgact_try, 813*a9643ea8Slogwang .sv_minsigstksz = LINUX_MINSIGSTKSZ, 814*a9643ea8Slogwang .sv_pagesize = PAGE_SIZE, 815*a9643ea8Slogwang .sv_minuser = VM_MIN_ADDRESS, 816*a9643ea8Slogwang .sv_maxuser = VM_MAXUSER_ADDRESS, 817*a9643ea8Slogwang .sv_usrstack = USRSTACK, 818*a9643ea8Slogwang .sv_psstrings = PS_STRINGS, 819*a9643ea8Slogwang .sv_stackprot = VM_PROT_ALL, 820*a9643ea8Slogwang .sv_copyout_strings = linux_copyout_strings, 821*a9643ea8Slogwang .sv_setregs = linux_exec_setregs, 822*a9643ea8Slogwang .sv_fixlimit = NULL, 823*a9643ea8Slogwang .sv_maxssiz = 
NULL, 824*a9643ea8Slogwang .sv_flags = SV_ABI_LINUX | SV_LP64 | SV_SHP, 825*a9643ea8Slogwang .sv_set_syscall_retval = linux_set_syscall_retval, 826*a9643ea8Slogwang .sv_fetch_syscall_args = linux_fetch_syscall_args, 827*a9643ea8Slogwang .sv_syscallnames = NULL, 828*a9643ea8Slogwang .sv_shared_page_base = SHAREDPAGE, 829*a9643ea8Slogwang .sv_shared_page_len = PAGE_SIZE, 830*a9643ea8Slogwang .sv_schedtail = linux_schedtail, 831*a9643ea8Slogwang .sv_thread_detach = linux_thread_detach, 832*a9643ea8Slogwang .sv_trap = linux_vsyscall, 833*a9643ea8Slogwang }; 834*a9643ea8Slogwang 835*a9643ea8Slogwang static void 836*a9643ea8Slogwang linux_vdso_install(void *param) 837*a9643ea8Slogwang { 838*a9643ea8Slogwang 839*a9643ea8Slogwang linux_szsigcode = (&_binary_linux_locore_o_end - 840*a9643ea8Slogwang &_binary_linux_locore_o_start); 841*a9643ea8Slogwang 842*a9643ea8Slogwang if (linux_szsigcode > elf_linux_sysvec.sv_shared_page_len) 843*a9643ea8Slogwang panic("Linux invalid vdso size\n"); 844*a9643ea8Slogwang 845*a9643ea8Slogwang __elfN(linux_vdso_fixup)(&elf_linux_sysvec); 846*a9643ea8Slogwang 847*a9643ea8Slogwang linux_shared_page_obj = __elfN(linux_shared_page_init) 848*a9643ea8Slogwang (&linux_shared_page_mapping); 849*a9643ea8Slogwang 850*a9643ea8Slogwang __elfN(linux_vdso_reloc)(&elf_linux_sysvec, SHAREDPAGE); 851*a9643ea8Slogwang 852*a9643ea8Slogwang bcopy(elf_linux_sysvec.sv_sigcode, linux_shared_page_mapping, 853*a9643ea8Slogwang linux_szsigcode); 854*a9643ea8Slogwang elf_linux_sysvec.sv_shared_page_obj = linux_shared_page_obj; 855*a9643ea8Slogwang 856*a9643ea8Slogwang linux_kplatform = linux_shared_page_mapping + 857*a9643ea8Slogwang (linux_platform - (caddr_t)SHAREDPAGE); 858*a9643ea8Slogwang } 859*a9643ea8Slogwang SYSINIT(elf_linux_vdso_init, SI_SUB_EXEC, SI_ORDER_ANY, 860*a9643ea8Slogwang (sysinit_cfunc_t)linux_vdso_install, NULL); 861*a9643ea8Slogwang 862*a9643ea8Slogwang static void 863*a9643ea8Slogwang linux_vdso_deinstall(void *param) 864*a9643ea8Slogwang { 
865*a9643ea8Slogwang 866*a9643ea8Slogwang __elfN(linux_shared_page_fini)(linux_shared_page_obj); 867*a9643ea8Slogwang }; 868*a9643ea8Slogwang SYSUNINIT(elf_linux_vdso_uninit, SI_SUB_EXEC, SI_ORDER_FIRST, 869*a9643ea8Slogwang (sysinit_cfunc_t)linux_vdso_deinstall, NULL); 870*a9643ea8Slogwang 871*a9643ea8Slogwang static char GNULINUX_ABI_VENDOR[] = "GNU"; 872*a9643ea8Slogwang static int GNULINUX_ABI_DESC = 0; 873*a9643ea8Slogwang 874*a9643ea8Slogwang static boolean_t 875*a9643ea8Slogwang linux_trans_osrel(const Elf_Note *note, int32_t *osrel) 876*a9643ea8Slogwang { 877*a9643ea8Slogwang const Elf32_Word *desc; 878*a9643ea8Slogwang uintptr_t p; 879*a9643ea8Slogwang 880*a9643ea8Slogwang p = (uintptr_t)(note + 1); 881*a9643ea8Slogwang p += roundup2(note->n_namesz, sizeof(Elf32_Addr)); 882*a9643ea8Slogwang 883*a9643ea8Slogwang desc = (const Elf32_Word *)p; 884*a9643ea8Slogwang if (desc[0] != GNULINUX_ABI_DESC) 885*a9643ea8Slogwang return (FALSE); 886*a9643ea8Slogwang 887*a9643ea8Slogwang /* 888*a9643ea8Slogwang * For linux we encode osrel as follows (see linux_mib.c): 889*a9643ea8Slogwang * VVVMMMIII (version, major, minor), see linux_mib.c. 
890*a9643ea8Slogwang */ 891*a9643ea8Slogwang *osrel = desc[1] * 1000000 + desc[2] * 1000 + desc[3]; 892*a9643ea8Slogwang 893*a9643ea8Slogwang return (TRUE); 894*a9643ea8Slogwang } 895*a9643ea8Slogwang 896*a9643ea8Slogwang static Elf_Brandnote linux64_brandnote = { 897*a9643ea8Slogwang .hdr.n_namesz = sizeof(GNULINUX_ABI_VENDOR), 898*a9643ea8Slogwang .hdr.n_descsz = 16, 899*a9643ea8Slogwang .hdr.n_type = 1, 900*a9643ea8Slogwang .vendor = GNULINUX_ABI_VENDOR, 901*a9643ea8Slogwang .flags = BN_TRANSLATE_OSREL, 902*a9643ea8Slogwang .trans_osrel = linux_trans_osrel 903*a9643ea8Slogwang }; 904*a9643ea8Slogwang 905*a9643ea8Slogwang static Elf64_Brandinfo linux_glibc2brand = { 906*a9643ea8Slogwang .brand = ELFOSABI_LINUX, 907*a9643ea8Slogwang .machine = EM_X86_64, 908*a9643ea8Slogwang .compat_3_brand = "Linux", 909*a9643ea8Slogwang .emul_path = "/compat/linux", 910*a9643ea8Slogwang .interp_path = "/lib64/ld-linux-x86-64.so.2", 911*a9643ea8Slogwang .sysvec = &elf_linux_sysvec, 912*a9643ea8Slogwang .interp_newpath = NULL, 913*a9643ea8Slogwang .brand_note = &linux64_brandnote, 914*a9643ea8Slogwang .flags = BI_CAN_EXEC_DYN | BI_BRAND_NOTE 915*a9643ea8Slogwang }; 916*a9643ea8Slogwang 917*a9643ea8Slogwang static Elf64_Brandinfo linux_glibc2brandshort = { 918*a9643ea8Slogwang .brand = ELFOSABI_LINUX, 919*a9643ea8Slogwang .machine = EM_X86_64, 920*a9643ea8Slogwang .compat_3_brand = "Linux", 921*a9643ea8Slogwang .emul_path = "/compat/linux", 922*a9643ea8Slogwang .interp_path = "/lib64/ld-linux.so.2", 923*a9643ea8Slogwang .sysvec = &elf_linux_sysvec, 924*a9643ea8Slogwang .interp_newpath = NULL, 925*a9643ea8Slogwang .brand_note = &linux64_brandnote, 926*a9643ea8Slogwang .flags = BI_CAN_EXEC_DYN | BI_BRAND_NOTE 927*a9643ea8Slogwang }; 928*a9643ea8Slogwang 929*a9643ea8Slogwang Elf64_Brandinfo *linux_brandlist[] = { 930*a9643ea8Slogwang &linux_glibc2brand, 931*a9643ea8Slogwang &linux_glibc2brandshort, 932*a9643ea8Slogwang NULL 933*a9643ea8Slogwang }; 934*a9643ea8Slogwang 
935*a9643ea8Slogwang static int 936*a9643ea8Slogwang linux64_elf_modevent(module_t mod, int type, void *data) 937*a9643ea8Slogwang { 938*a9643ea8Slogwang Elf64_Brandinfo **brandinfo; 939*a9643ea8Slogwang int error; 940*a9643ea8Slogwang struct linux_ioctl_handler **lihp; 941*a9643ea8Slogwang 942*a9643ea8Slogwang error = 0; 943*a9643ea8Slogwang 944*a9643ea8Slogwang switch(type) { 945*a9643ea8Slogwang case MOD_LOAD: 946*a9643ea8Slogwang for (brandinfo = &linux_brandlist[0]; *brandinfo != NULL; 947*a9643ea8Slogwang ++brandinfo) 948*a9643ea8Slogwang if (elf64_insert_brand_entry(*brandinfo) < 0) 949*a9643ea8Slogwang error = EINVAL; 950*a9643ea8Slogwang if (error == 0) { 951*a9643ea8Slogwang SET_FOREACH(lihp, linux_ioctl_handler_set) 952*a9643ea8Slogwang linux_ioctl_register_handler(*lihp); 953*a9643ea8Slogwang LIST_INIT(&futex_list); 954*a9643ea8Slogwang mtx_init(&futex_mtx, "ftllk64", NULL, MTX_DEF); 955*a9643ea8Slogwang stclohz = (stathz ? stathz : hz); 956*a9643ea8Slogwang if (bootverbose) 957*a9643ea8Slogwang printf("Linux x86-64 ELF exec handler installed\n"); 958*a9643ea8Slogwang } else 959*a9643ea8Slogwang printf("cannot insert Linux x86-64 ELF brand handler\n"); 960*a9643ea8Slogwang break; 961*a9643ea8Slogwang case MOD_UNLOAD: 962*a9643ea8Slogwang for (brandinfo = &linux_brandlist[0]; *brandinfo != NULL; 963*a9643ea8Slogwang ++brandinfo) 964*a9643ea8Slogwang if (elf64_brand_inuse(*brandinfo)) 965*a9643ea8Slogwang error = EBUSY; 966*a9643ea8Slogwang if (error == 0) { 967*a9643ea8Slogwang for (brandinfo = &linux_brandlist[0]; 968*a9643ea8Slogwang *brandinfo != NULL; ++brandinfo) 969*a9643ea8Slogwang if (elf64_remove_brand_entry(*brandinfo) < 0) 970*a9643ea8Slogwang error = EINVAL; 971*a9643ea8Slogwang } 972*a9643ea8Slogwang if (error == 0) { 973*a9643ea8Slogwang SET_FOREACH(lihp, linux_ioctl_handler_set) 974*a9643ea8Slogwang linux_ioctl_unregister_handler(*lihp); 975*a9643ea8Slogwang mtx_destroy(&futex_mtx); 976*a9643ea8Slogwang if (bootverbose) 
977*a9643ea8Slogwang printf("Linux ELF exec handler removed\n"); 978*a9643ea8Slogwang } else 979*a9643ea8Slogwang printf("Could not deinstall ELF interpreter entry\n"); 980*a9643ea8Slogwang break; 981*a9643ea8Slogwang default: 982*a9643ea8Slogwang return (EOPNOTSUPP); 983*a9643ea8Slogwang } 984*a9643ea8Slogwang return (error); 985*a9643ea8Slogwang } 986*a9643ea8Slogwang 987*a9643ea8Slogwang static moduledata_t linux64_elf_mod = { 988*a9643ea8Slogwang "linux64elf", 989*a9643ea8Slogwang linux64_elf_modevent, 990*a9643ea8Slogwang 0 991*a9643ea8Slogwang }; 992*a9643ea8Slogwang 993*a9643ea8Slogwang DECLARE_MODULE_TIED(linux64elf, linux64_elf_mod, SI_SUB_EXEC, SI_ORDER_ANY); 994*a9643ea8Slogwang MODULE_DEPEND(linux64elf, linux_common, 1, 1, 1); 995*a9643ea8Slogwang FEATURE(linux64, "Linux 64bit support"); 996