/*
 * Copyright (C) 1995  Linus Torvalds
 *
 * Pentium III FXSR, SSE support
 *	Gareth Hughes <[email protected]>, May 2000
 *
 * X86-64 port
 *	Andi Kleen.
 *
 *	CPU hotplug support - [email protected]
 */

/*
 * This file handles the architecture-dependent parts of process handling.
 */

#include <linux/stackprotector.h>
#include <linux/cpu.h>
#include <linux/errno.h>
#include <linux/sched.h>
#include <linux/fs.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/elfcore.h>
#include <linux/smp.h>
#include <linux/slab.h>
#include <linux/user.h>
#include <linux/interrupt.h>
#include <linux/utsname.h>
#include <linux/delay.h>
#include <linux/module.h>
#include <linux/ptrace.h>
#include <linux/notifier.h>
#include <linux/kprobes.h>
#include <linux/kdebug.h>
#include <linux/tick.h>
#include <linux/prctl.h>
#include <linux/uaccess.h>
#include <linux/io.h>
#include <linux/ftrace.h>
#include <linux/dmi.h>

#include <asm/pgtable.h>
#include <asm/system.h>
#include <asm/processor.h>
#include <asm/i387.h>
#include <asm/mmu_context.h>
#include <asm/prctl.h>
#include <asm/desc.h>
#include <asm/proto.h>
#include <asm/ia32.h>
#include <asm/idle.h>
#include <asm/syscalls.h>
#include <asm/ds.h>
#include <asm/debugreg.h>

asmlinkage extern void ret_from_fork(void);

DEFINE_PER_CPU(unsigned long, old_rsp);
static DEFINE_PER_CPU(unsigned char, is_idle);

unsigned long kernel_thread_flags = CLONE_VM | CLONE_UNTRACED;

static ATOMIC_NOTIFIER_HEAD(idle_notifier);

void idle_notifier_register(struct notifier_block *n)
{
	atomic_notifier_chain_register(&idle_notifier, n);
}
EXPORT_SYMBOL_GPL(idle_notifier_register);

void idle_notifier_unregister(struct notifier_block *n)
{
	atomic_notifier_chain_unregister(&idle_notifier, n);
}
EXPORT_SYMBOL_GPL(idle_notifier_unregister);

void enter_idle(void)
{
	percpu_write(is_idle, 1);
	atomic_notifier_call_chain(&idle_notifier, IDLE_START, NULL);
}

static void __exit_idle(void)
{
	if (x86_test_and_clear_bit_percpu(0, is_idle) == 0)
		return;
	atomic_notifier_call_chain(&idle_notifier, IDLE_END, NULL);
}

/* Called from interrupts to signify idle end */
void exit_idle(void)
{
	/* idle loop has pid 0 */
	if (current->pid)
		return;
	__exit_idle();
}

#ifndef CONFIG_SMP
static inline void play_dead(void)
{
	BUG();
}
#endif

/*
 * The idle thread. There's no useful work to be
 * done, so just try to conserve power and have a
 * low exit latency (i.e. sit in a loop waiting for
 * somebody to say that they'd like to reschedule).
 */
void cpu_idle(void)
{
	current_thread_info()->status |= TS_POLLING;

	/*
	 * If we're the non-boot CPU, nothing set the stack canary up
	 * for us. CPU0 already has it initialized, but there is no harm
	 * in doing it again. This is a good place for updating it, as
	 * we won't ever return from this function (so the invalid
	 * canaries already on the stack won't ever trigger).
	 */
	boot_init_stack_canary();

	/* endless idle loop with no priority at all */
	while (1) {
		tick_nohz_stop_sched_tick(1);
		while (!need_resched()) {

			rmb();

			if (cpu_is_offline(smp_processor_id()))
				play_dead();
			/*
			 * Idle routines should keep interrupts disabled
			 * from here on, until they go to idle.
			 * Otherwise, idle callbacks can misfire.
			 */
			local_irq_disable();
			enter_idle();
			/* Don't trace irqs off for idle */
			stop_critical_timings();
			pm_idle();
			start_critical_timings();
			/*
			 * In many cases the interrupt that ended idle
			 * has already called exit_idle. But some idle
			 * loops can be woken up without interrupt.
			 */
			__exit_idle();
		}

		tick_nohz_restart_sched_tick();
		preempt_enable_no_resched();
		schedule();
		preempt_disable();
	}
}

/* Also prints some state that isn't saved in the pt_regs */
void __show_regs(struct pt_regs *regs, int all)
{
	unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L, fs, gs, shadowgs;
	unsigned long d0, d1, d2, d3, d6, d7;
	unsigned int fsindex, gsindex;
	unsigned int ds, cs, es;
	const char *board;

	printk("\n");
	print_modules();
	board = dmi_get_system_info(DMI_PRODUCT_NAME);
	if (!board)
		board = "";
	printk(KERN_INFO "Pid: %d, comm: %.20s %s %s %.*s %s\n",
		current->pid, current->comm, print_tainted(),
		init_utsname()->release,
		(int)strcspn(init_utsname()->version, " "),
		init_utsname()->version, board);
	printk(KERN_INFO "RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->ip);
	printk_address(regs->ip, 1);
	printk(KERN_INFO "RSP: %04lx:%016lx EFLAGS: %08lx\n", regs->ss,
			regs->sp, regs->flags);
	printk(KERN_INFO "RAX: %016lx RBX: %016lx RCX: %016lx\n",
	       regs->ax, regs->bx, regs->cx);
	printk(KERN_INFO "RDX: %016lx RSI: %016lx RDI: %016lx\n",
	       regs->dx, regs->si, regs->di);
	printk(KERN_INFO "RBP: %016lx R08: %016lx R09: %016lx\n",
	       regs->bp, regs->r8, regs->r9);
	printk(KERN_INFO "R10: %016lx R11: %016lx R12: %016lx\n",
	       regs->r10, regs->r11, regs->r12);
	printk(KERN_INFO "R13: %016lx R14: %016lx R15: %016lx\n",
	       regs->r13, regs->r14, regs->r15);

	asm("movl %%ds,%0" : "=r" (ds));
	asm("movl %%cs,%0" : "=r" (cs));
	asm("movl %%es,%0" : "=r" (es));
	asm("movl %%fs,%0" : "=r" (fsindex));
	asm("movl %%gs,%0" : "=r" (gsindex));

	rdmsrl(MSR_FS_BASE, fs);
	rdmsrl(MSR_GS_BASE, gs);
	rdmsrl(MSR_KERNEL_GS_BASE, shadowgs);

	if (!all)
		return;

	cr0 = read_cr0();
	cr2 = read_cr2();
	cr3 = read_cr3();
	cr4 = read_cr4();

	printk(KERN_INFO "FS: %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n",
	       fs, fsindex, gs, gsindex, shadowgs);
	printk(KERN_INFO "CS: %04x DS: %04x ES: %04x CR0: %016lx\n", cs, ds,
			es, cr0);
	printk(KERN_INFO "CR2: %016lx CR3: %016lx CR4: %016lx\n", cr2, cr3,
			cr4);

	get_debugreg(d0, 0);
	get_debugreg(d1, 1);
	get_debugreg(d2, 2);
	printk(KERN_INFO "DR0: %016lx DR1: %016lx DR2: %016lx\n", d0, d1, d2);
	get_debugreg(d3, 3);
	get_debugreg(d6, 6);
	get_debugreg(d7, 7);
	printk(KERN_INFO "DR3: %016lx DR6: %016lx DR7: %016lx\n", d3, d6, d7);
}

void show_regs(struct pt_regs *regs)
{
	printk(KERN_INFO "CPU %d:", smp_processor_id());
	__show_regs(regs, 1);
	show_trace(NULL, regs, (void *)(regs + 1), regs->bp);
}

void release_thread(struct task_struct *dead_task)
{
	if (dead_task->mm) {
		if (dead_task->mm->context.size) {
			printk("WARNING: dead process %8s still has LDT? <%p/%d>\n",
					dead_task->comm,
					dead_task->mm->context.ldt,
					dead_task->mm->context.size);
			BUG();
		}
	}
}

static inline void set_32bit_tls(struct task_struct *t, int tls, u32 addr)
{
	struct user_desc ud = {
		.base_addr = addr,
		.limit = 0xfffff,
		.seg_32bit = 1,
		.limit_in_pages = 1,
		.useable = 1,
	};
	struct desc_struct *desc = t->thread.tls_array;
	desc += tls;
	fill_ldt(desc, &ud);
}

static inline u32 read_32bit_tls(struct task_struct *t, int tls)
{
	return get_desc_base(&t->thread.tls_array[tls]);
}

/*
 * This gets called before we allocate a new thread and copy
 * the current task into it.
 */
void prepare_to_copy(struct task_struct *tsk)
{
	unlazy_fpu(tsk);
}

int copy_thread(unsigned long clone_flags, unsigned long sp,
		unsigned long unused,
		struct task_struct *p, struct pt_regs *regs)
{
	int err;
	struct pt_regs *childregs;
	struct task_struct *me = current;

	childregs = ((struct pt_regs *)
			(THREAD_SIZE + task_stack_page(p))) - 1;
	*childregs = *regs;

	childregs->ax = 0;
	childregs->sp = sp;
	if (sp == ~0UL)
		childregs->sp = (unsigned long)childregs;

	p->thread.sp = (unsigned long) childregs;
	p->thread.sp0 = (unsigned long) (childregs + 1);
	p->thread.usersp = me->thread.usersp;

	set_tsk_thread_flag(p, TIF_FORK);

	p->thread.fs = me->thread.fs;
	p->thread.gs = me->thread.gs;
	p->thread.io_bitmap_ptr = NULL;

	savesegment(gs, p->thread.gsindex);
	savesegment(fs, p->thread.fsindex);
	savesegment(es, p->thread.es);
	savesegment(ds, p->thread.ds);

	err = -ENOMEM;
	memset(p->thread.ptrace_bps, 0, sizeof(p->thread.ptrace_bps));

	if (unlikely(test_tsk_thread_flag(me, TIF_IO_BITMAP))) {
		p->thread.io_bitmap_ptr = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL);
		if (!p->thread.io_bitmap_ptr) {
			p->thread.io_bitmap_max = 0;
			return -ENOMEM;
		}
		memcpy(p->thread.io_bitmap_ptr, me->thread.io_bitmap_ptr,
				IO_BITMAP_BYTES);
		set_tsk_thread_flag(p, TIF_IO_BITMAP);
	}

	/*
	 * Set a new TLS for the child thread?
	 */
	if (clone_flags & CLONE_SETTLS) {
#ifdef CONFIG_IA32_EMULATION
		if (test_thread_flag(TIF_IA32))
			err = do_set_thread_area(p, -1,
				(struct user_desc __user *)childregs->si, 0);
		else
#endif
			err = do_arch_prctl(p, ARCH_SET_FS, childregs->r8);
		if (err)
			goto out;
	}

	clear_tsk_thread_flag(p, TIF_DS_AREA_MSR);
	p->thread.ds_ctx = NULL;

	clear_tsk_thread_flag(p, TIF_DEBUGCTLMSR);
	p->thread.debugctlmsr = 0;

	err = 0;
out:
	if (err && p->thread.io_bitmap_ptr) {
		kfree(p->thread.io_bitmap_ptr);
		p->thread.io_bitmap_max = 0;
	}

	return err;
}

void
start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp)
{
	loadsegment(fs, 0);
	loadsegment(es, 0);
	loadsegment(ds, 0);
	load_gs_index(0);
	regs->ip = new_ip;
	regs->sp = new_sp;
	percpu_write(old_rsp, new_sp);
	regs->cs = __USER_CS;
	regs->ss = __USER_DS;
	regs->flags = 0x200;
	set_fs(USER_DS);
	/*
	 * Free the old FP and other extended state
	 */
	free_thread_xstate(current);
}
EXPORT_SYMBOL_GPL(start_thread);

/*
 * switch_to(x,y) should switch tasks from x to y.
 *
 * This could still be optimized:
 * - fold all the options into a flag word and test it with a single test.
 * - could test fs/gs bitsliced
 *
 * Kprobes are not supported here. Set the probe on schedule() instead.
 * The function graph tracer is not supported here either.
 */
__notrace_funcgraph struct task_struct *
__switch_to(struct task_struct *prev_p, struct task_struct *next_p)
{
	struct thread_struct *prev = &prev_p->thread;
	struct thread_struct *next = &next_p->thread;
	int cpu = smp_processor_id();
	struct tss_struct *tss = &per_cpu(init_tss, cpu);
	unsigned fsindex, gsindex;
	bool preload_fpu;

	/*
	 * If the task has used the FPU during the last 5 timeslices, just
	 * do a full restore of the math state immediately to avoid the
	 * trap; the chances of needing the FPU soon are obviously high now.
	 */
	preload_fpu = tsk_used_math(next_p) && next_p->fpu_counter > 5;

	/* we're going to use this soon, after a few expensive things */
	if (preload_fpu)
		prefetch(next->xstate);

	/*
	 * Reload esp0, LDT and the page table pointer:
	 */
	load_sp0(tss, next);

	/*
	 * Switch DS and ES.
	 * This won't pick up thread selector changes, but I guess that is ok.
	 */
	savesegment(es, prev->es);
	if (unlikely(next->es | prev->es))
		loadsegment(es, next->es);

	savesegment(ds, prev->ds);
	if (unlikely(next->ds | prev->ds))
		loadsegment(ds, next->ds);

	/*
	 * We must save %fs and %gs before load_TLS() because
	 * %fs and %gs may be cleared by load_TLS().
	 *
	 * (e.g. xen_load_tls())
	 */
	savesegment(fs, fsindex);
	savesegment(gs, gsindex);

	load_TLS(next, cpu);

	/* Must be after DS reload */
	unlazy_fpu(prev_p);

	/* Make sure the CPU is ready for the new context */
	if (preload_fpu)
		clts();

	/*
	 * Leave lazy mode, flushing any hypercalls made here.
	 * This must be done before restoring TLS segments so
	 * the GDT and LDT are properly updated, and must be
	 * done before math_state_restore, so the TS bit is up
	 * to date.
	 */
	arch_end_context_switch(next_p);

	/*
	 * Switch FS and GS.
	 *
	 * A segment register != 0 always requires a reload. Also
	 * reload when it has changed. When the previous process used
	 * a 64bit base, always reload to avoid an information leak.
	 */
	if (unlikely(fsindex | next->fsindex | prev->fs)) {
		loadsegment(fs, next->fsindex);
		/*
		 * Check if the user used a selector != 0; if yes, clear
		 * the 64bit base, since an overloaded base is always
		 * mapped to the NULL selector.
		 */
		if (fsindex)
			prev->fs = 0;
	}
	/* When the next process has a 64bit base, use it */
	if (next->fs)
		wrmsrl(MSR_FS_BASE, next->fs);
	prev->fsindex = fsindex;

	if (unlikely(gsindex | next->gsindex | prev->gs)) {
		load_gs_index(next->gsindex);
		if (gsindex)
			prev->gs = 0;
	}
	if (next->gs)
		wrmsrl(MSR_KERNEL_GS_BASE, next->gs);
	prev->gsindex = gsindex;

	/*
	 * Switch the PDA and FPU contexts.
	 */
	prev->usersp = percpu_read(old_rsp);
	percpu_write(old_rsp, next->usersp);
	percpu_write(current_task, next_p);

	percpu_write(kernel_stack,
		(unsigned long)task_stack_page(next_p) +
		THREAD_SIZE - KERNEL_STACK_OFFSET);

	/*
	 * Now maybe reload the debug registers and handle I/O bitmaps
	 */
	if (unlikely(task_thread_info(next_p)->flags & _TIF_WORK_CTXSW_NEXT ||
		     task_thread_info(prev_p)->flags & _TIF_WORK_CTXSW_PREV))
		__switch_to_xtra(prev_p, next_p, tss);

	/*
	 * Preload the FPU context, now that we've determined that the
	 * task is likely to be using it.
	 */
	if (preload_fpu)
		__math_state_restore();

	return prev_p;
}

/*
 * sys_execve() executes a new program.
 */
asmlinkage
long sys_execve(char __user *name, char __user * __user *argv,
		char __user * __user *envp, struct pt_regs *regs)
{
	long error;
	char *filename;

	filename = getname(name);
	error = PTR_ERR(filename);
	if (IS_ERR(filename))
		return error;
	error = do_execve(filename, argv, envp, regs);
	putname(filename);
	return error;
}

void set_personality_64bit(void)
{
	/* inherit personality from parent */

	/* Make sure to be in 64bit mode */
	clear_thread_flag(TIF_IA32);

	/*
	 * TBD: overwrites user setup. Should have two bits.
	 * But 64bit processes have always behaved this way,
	 * so it's not too bad. The main problem is just that
	 * 32bit children are affected again.
	 */
	current->personality &= ~READ_IMPLIES_EXEC;
}

asmlinkage long
sys_clone(unsigned long clone_flags, unsigned long newsp,
	  void __user *parent_tid, void __user *child_tid, struct pt_regs *regs)
{
	if (!newsp)
		newsp = regs->sp;
	return do_fork(clone_flags, newsp, regs, 0, parent_tid, child_tid);
}

unsigned long get_wchan(struct task_struct *p)
{
	unsigned long stack;
	u64 fp, ip;
	int count = 0;

	if (!p || p == current || p->state == TASK_RUNNING)
		return 0;
	stack = (unsigned long)task_stack_page(p);
	if (p->thread.sp < stack || p->thread.sp >= stack + THREAD_SIZE)
		return 0;
	fp = *(u64 *)(p->thread.sp);
	do {
		if (fp < (unsigned long)stack ||
		    fp >= (unsigned long)stack + THREAD_SIZE)
			return 0;
		ip = *(u64 *)(fp + 8);
		if (!in_sched_functions(ip))
			return ip;
		fp = *(u64 *)fp;
	} while (count++ < 16);
	return 0;
}

long do_arch_prctl(struct task_struct *task, int code, unsigned long addr)
{
	int ret = 0;
	int doit = task == current;
	int cpu;

	switch (code) {
	case ARCH_SET_GS:
		if (addr >= TASK_SIZE_OF(task))
			return -EPERM;
		cpu = get_cpu();
		/* handle small bases via the GDT because that's faster to
		   switch. */
		if (addr <= 0xffffffff) {
			set_32bit_tls(task, GS_TLS, addr);
			if (doit) {
				load_TLS(&task->thread, cpu);
				load_gs_index(GS_TLS_SEL);
			}
			task->thread.gsindex = GS_TLS_SEL;
			task->thread.gs = 0;
		} else {
			task->thread.gsindex = 0;
			task->thread.gs = addr;
			if (doit) {
				load_gs_index(0);
				ret = checking_wrmsrl(MSR_KERNEL_GS_BASE, addr);
			}
		}
		put_cpu();
		break;
	case ARCH_SET_FS:
		/* Not strictly needed for fs, but do it for symmetry
		   with gs */
		if (addr >= TASK_SIZE_OF(task))
			return -EPERM;
		cpu = get_cpu();
		/* handle small bases via the GDT because that's faster to
		   switch. */
		if (addr <= 0xffffffff) {
			set_32bit_tls(task, FS_TLS, addr);
			if (doit) {
				load_TLS(&task->thread, cpu);
				loadsegment(fs, FS_TLS_SEL);
			}
			task->thread.fsindex = FS_TLS_SEL;
			task->thread.fs = 0;
		} else {
			task->thread.fsindex = 0;
			task->thread.fs = addr;
			if (doit) {
				/* set the selector to 0 to not confuse
				   __switch_to */
				loadsegment(fs, 0);
				ret = checking_wrmsrl(MSR_FS_BASE, addr);
			}
		}
		put_cpu();
		break;
	case ARCH_GET_FS: {
		unsigned long base;
		if (task->thread.fsindex == FS_TLS_SEL)
			base = read_32bit_tls(task, FS_TLS);
		else if (doit)
			rdmsrl(MSR_FS_BASE, base);
		else
			base = task->thread.fs;
		ret = put_user(base, (unsigned long __user *)addr);
		break;
	}
	case ARCH_GET_GS: {
		unsigned long base;
		unsigned gsindex;
		if (task->thread.gsindex == GS_TLS_SEL)
			base = read_32bit_tls(task, GS_TLS);
		else if (doit) {
			savesegment(gs, gsindex);
			if (gsindex)
				rdmsrl(MSR_KERNEL_GS_BASE, base);
			else
				base = task->thread.gs;
		} else
			base = task->thread.gs;
		ret = put_user(base, (unsigned long __user *)addr);
		break;
	}

	default:
		ret = -EINVAL;
		break;
	}

	return ret;
}

long sys_arch_prctl(int code, unsigned long addr)
{
	return do_arch_prctl(current, code, addr);
}

unsigned long KSTK_ESP(struct task_struct *task)
{
	return (test_tsk_thread_flag(task, TIF_IA32)) ?
			(task_pt_regs(task)->sp) : ((task)->thread.usersp);
}
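
/*
 * For reference, a minimal sketch (illustrative only, not part of this
 * file) of how user space can exercise the do_arch_prctl() interface
 * above.  A libc wrapper for arch_prctl is not guaranteed, so the
 * sketch assumes the raw syscall is used:
 *
 *	#include <asm/prctl.h>
 *	#include <sys/syscall.h>
 *	#include <unistd.h>
 *
 *	unsigned long base;
 *
 *	// Read the current FS base into 'base'.
 *	syscall(SYS_arch_prctl, ARCH_GET_FS, &base);
 *
 *	// Install a GS base.  Addresses >= TASK_SIZE are rejected with
 *	// -EPERM; bases that fit in 32 bits are served from a GDT TLS
 *	// slot (GS_TLS_SEL) rather than MSR_KERNEL_GS_BASE.
 *	syscall(SYS_arch_prctl, ARCH_SET_GS, base);
 */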