/*
 *  Copyright (C) 1995  Linus Torvalds
 *
 *  Pentium III FXSR, SSE support
 *	Gareth Hughes <gareth@valinux.com>, May 2000
 *
 *	CPU hotplug support - ashok.raj@intel.com
 */

/*
 * This file handles the architecture-dependent parts of process handling..
 */
19 #include <linux/stackprotector.h>
20 #include <linux/cpu.h>
21 #include <linux/errno.h>
22 #include <linux/sched.h>
24 #include <linux/kernel.h>
26 #include <linux/elfcore.h>
27 #include <linux/smp.h>
28 #include <linux/slab.h>
29 #include <linux/user.h>
30 #include <linux/interrupt.h>
31 #include <linux/utsname.h>
32 #include <linux/delay.h>
33 #include <linux/module.h>
34 #include <linux/ptrace.h>
35 #include <linux/random.h>
36 #include <linux/notifier.h>
37 #include <linux/kprobes.h>
38 #include <linux/kdebug.h>
39 #include <linux/tick.h>
40 #include <linux/prctl.h>
41 #include <linux/uaccess.h>
43 #include <linux/ftrace.h>
45 #include <asm/pgtable.h>
46 #include <asm/system.h>
47 #include <asm/processor.h>
49 #include <asm/mmu_context.h>
51 #include <asm/prctl.h>
53 #include <asm/proto.h>
56 #include <asm/syscalls.h>
59 asmlinkage extern void ret_from_fork(void);
61 DEFINE_PER_CPU(struct task_struct *, current_task) = &init_task;
62 EXPORT_PER_CPU_SYMBOL(current_task);
64 DEFINE_PER_CPU(unsigned long, old_rsp);
65 static DEFINE_PER_CPU(unsigned char, is_idle);
67 unsigned long kernel_thread_flags = CLONE_VM | CLONE_UNTRACED;
69 static ATOMIC_NOTIFIER_HEAD(idle_notifier);
71 void idle_notifier_register(struct notifier_block *n)
73 atomic_notifier_chain_register(&idle_notifier, n);
75 EXPORT_SYMBOL_GPL(idle_notifier_register);
77 void idle_notifier_unregister(struct notifier_block *n)
79 atomic_notifier_chain_unregister(&idle_notifier, n);
81 EXPORT_SYMBOL_GPL(idle_notifier_unregister);
85 percpu_write(is_idle, 1);
86 atomic_notifier_call_chain(&idle_notifier, IDLE_START, NULL);
89 static void __exit_idle(void)
91 if (x86_test_and_clear_bit_percpu(0, is_idle) == 0)
93 atomic_notifier_call_chain(&idle_notifier, IDLE_END, NULL);
96 /* Called from interrupts to signify idle end */
99 /* idle loop has pid 0 */
#ifndef CONFIG_SMP
/*
 * On UP a CPU can never be hot-unplugged, so reaching this from the idle
 * loop (cpu_is_offline()) is a bug.  The SMP implementation lives in the
 * hotplug code — NOTE(review): reconstructed stub; confirm against tree.
 */
static inline void play_dead(void)
{
	BUG();
}
#endif
113 * The idle thread. There's no useful work to be
114 * done, so just try to conserve power and have a
115 * low exit latency (ie sit in a loop waiting for
116 * somebody to say that they'd like to reschedule)
120 current_thread_info()->status |= TS_POLLING;
123 * If we're the non-boot CPU, nothing set the PDA stack
124 * canary up for us - and if we are the boot CPU we have
125 * a 0 stack canary. This is a good place for updating
126 * it, as we wont ever return from this function (so the
127 * invalid canaries already on the stack wont ever
130 boot_init_stack_canary();
132 /* endless idle loop with no priority at all */
134 tick_nohz_stop_sched_tick(1);
135 while (!need_resched()) {
139 if (cpu_is_offline(smp_processor_id()))
142 * Idle routines should keep interrupts disabled
143 * from here on, until they go to idle.
144 * Otherwise, idle callbacks can misfire.
148 /* Don't trace irqs off for idle */
149 stop_critical_timings();
151 start_critical_timings();
152 /* In many cases the interrupt that ended idle
153 has already called exit_idle. But some idle
154 loops can be woken up without interrupt. */
158 tick_nohz_restart_sched_tick();
159 preempt_enable_no_resched();
165 /* Prints also some state that isn't saved in the pt_regs */
166 void __show_regs(struct pt_regs *regs, int all)
168 unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L, fs, gs, shadowgs;
169 unsigned long d0, d1, d2, d3, d6, d7;
170 unsigned int fsindex, gsindex;
171 unsigned int ds, cs, es;
175 printk(KERN_INFO "Pid: %d, comm: %.20s %s %s %.*s\n",
176 current->pid, current->comm, print_tainted(),
177 init_utsname()->release,
178 (int)strcspn(init_utsname()->version, " "),
179 init_utsname()->version);
180 printk(KERN_INFO "RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->ip);
181 printk_address(regs->ip, 1);
182 printk(KERN_INFO "RSP: %04lx:%016lx EFLAGS: %08lx\n", regs->ss,
183 regs->sp, regs->flags);
184 printk(KERN_INFO "RAX: %016lx RBX: %016lx RCX: %016lx\n",
185 regs->ax, regs->bx, regs->cx);
186 printk(KERN_INFO "RDX: %016lx RSI: %016lx RDI: %016lx\n",
187 regs->dx, regs->si, regs->di);
188 printk(KERN_INFO "RBP: %016lx R08: %016lx R09: %016lx\n",
189 regs->bp, regs->r8, regs->r9);
190 printk(KERN_INFO "R10: %016lx R11: %016lx R12: %016lx\n",
191 regs->r10, regs->r11, regs->r12);
192 printk(KERN_INFO "R13: %016lx R14: %016lx R15: %016lx\n",
193 regs->r13, regs->r14, regs->r15);
195 asm("movl %%ds,%0" : "=r" (ds));
196 asm("movl %%cs,%0" : "=r" (cs));
197 asm("movl %%es,%0" : "=r" (es));
198 asm("movl %%fs,%0" : "=r" (fsindex));
199 asm("movl %%gs,%0" : "=r" (gsindex));
201 rdmsrl(MSR_FS_BASE, fs);
202 rdmsrl(MSR_GS_BASE, gs);
203 rdmsrl(MSR_KERNEL_GS_BASE, shadowgs);
213 printk(KERN_INFO "FS: %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n",
214 fs, fsindex, gs, gsindex, shadowgs);
215 printk(KERN_INFO "CS: %04x DS: %04x ES: %04x CR0: %016lx\n", cs, ds,
217 printk(KERN_INFO "CR2: %016lx CR3: %016lx CR4: %016lx\n", cr2, cr3,
223 printk(KERN_INFO "DR0: %016lx DR1: %016lx DR2: %016lx\n", d0, d1, d2);
227 printk(KERN_INFO "DR3: %016lx DR6: %016lx DR7: %016lx\n", d3, d6, d7);
230 void show_regs(struct pt_regs *regs)
232 printk(KERN_INFO "CPU %d:", smp_processor_id());
233 __show_regs(regs, 1);
234 show_trace(NULL, regs, (void *)(regs + 1), regs->bp);
238 * Free current thread data structures etc..
240 void exit_thread(void)
242 struct task_struct *me = current;
243 struct thread_struct *t = &me->thread;
245 if (me->thread.io_bitmap_ptr) {
246 struct tss_struct *tss = &per_cpu(init_tss, get_cpu());
248 kfree(t->io_bitmap_ptr);
249 t->io_bitmap_ptr = NULL;
250 clear_thread_flag(TIF_IO_BITMAP);
252 * Careful, clear this in the TSS too:
254 memset(tss->io_bitmap, 0xff, t->io_bitmap_max);
255 t->io_bitmap_max = 0;
259 ds_exit_thread(current);
262 void flush_thread(void)
264 struct task_struct *tsk = current;
266 if (test_tsk_thread_flag(tsk, TIF_ABI_PENDING)) {
267 clear_tsk_thread_flag(tsk, TIF_ABI_PENDING);
268 if (test_tsk_thread_flag(tsk, TIF_IA32)) {
269 clear_tsk_thread_flag(tsk, TIF_IA32);
271 set_tsk_thread_flag(tsk, TIF_IA32);
272 current_thread_info()->status |= TS_COMPAT;
275 clear_tsk_thread_flag(tsk, TIF_DEBUG);
277 tsk->thread.debugreg0 = 0;
278 tsk->thread.debugreg1 = 0;
279 tsk->thread.debugreg2 = 0;
280 tsk->thread.debugreg3 = 0;
281 tsk->thread.debugreg6 = 0;
282 tsk->thread.debugreg7 = 0;
283 memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array));
285 * Forget coprocessor state..
287 tsk->fpu_counter = 0;
292 void release_thread(struct task_struct *dead_task)
295 if (dead_task->mm->context.size) {
296 printk("WARNING: dead process %8s still has LDT? <%p/%d>\n",
298 dead_task->mm->context.ldt,
299 dead_task->mm->context.size);
305 static inline void set_32bit_tls(struct task_struct *t, int tls, u32 addr)
307 struct user_desc ud = {
314 struct desc_struct *desc = t->thread.tls_array;
319 static inline u32 read_32bit_tls(struct task_struct *t, int tls)
321 return get_desc_base(&t->thread.tls_array[tls]);
/*
 * This gets called before we allocate a new thread and copy
 * the current task into it.
 */
void prepare_to_copy(struct task_struct *tsk)
{
	/* Flush lazy FPU state to memory so the child copies a coherent image. */
	unlazy_fpu(tsk);
}
333 int copy_thread(int nr, unsigned long clone_flags, unsigned long sp,
334 unsigned long unused,
335 struct task_struct *p, struct pt_regs *regs)
338 struct pt_regs *childregs;
339 struct task_struct *me = current;
341 childregs = ((struct pt_regs *)
342 (THREAD_SIZE + task_stack_page(p))) - 1;
348 childregs->sp = (unsigned long)childregs;
350 p->thread.sp = (unsigned long) childregs;
351 p->thread.sp0 = (unsigned long) (childregs+1);
352 p->thread.usersp = me->thread.usersp;
354 set_tsk_thread_flag(p, TIF_FORK);
356 p->thread.fs = me->thread.fs;
357 p->thread.gs = me->thread.gs;
359 savesegment(gs, p->thread.gsindex);
360 savesegment(fs, p->thread.fsindex);
361 savesegment(es, p->thread.es);
362 savesegment(ds, p->thread.ds);
364 if (unlikely(test_tsk_thread_flag(me, TIF_IO_BITMAP))) {
365 p->thread.io_bitmap_ptr = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL);
366 if (!p->thread.io_bitmap_ptr) {
367 p->thread.io_bitmap_max = 0;
370 memcpy(p->thread.io_bitmap_ptr, me->thread.io_bitmap_ptr,
372 set_tsk_thread_flag(p, TIF_IO_BITMAP);
376 * Set a new TLS for the child thread?
378 if (clone_flags & CLONE_SETTLS) {
379 #ifdef CONFIG_IA32_EMULATION
380 if (test_thread_flag(TIF_IA32))
381 err = do_set_thread_area(p, -1,
382 (struct user_desc __user *)childregs->si, 0);
385 err = do_arch_prctl(p, ARCH_SET_FS, childregs->r8);
390 ds_copy_thread(p, me);
392 clear_tsk_thread_flag(p, TIF_DEBUGCTLMSR);
393 p->thread.debugctlmsr = 0;
397 if (err && p->thread.io_bitmap_ptr) {
398 kfree(p->thread.io_bitmap_ptr);
399 p->thread.io_bitmap_max = 0;
405 start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp)
413 percpu_write(old_rsp, new_sp);
414 regs->cs = __USER_CS;
415 regs->ss = __USER_DS;
419 * Free the old FP and other extended state
421 free_thread_xstate(current);
423 EXPORT_SYMBOL_GPL(start_thread);
425 static void hard_disable_TSC(void)
427 write_cr4(read_cr4() | X86_CR4_TSD);
430 void disable_TSC(void)
433 if (!test_and_set_thread_flag(TIF_NOTSC))
435 * Must flip the CPU state synchronously with
436 * TIF_NOTSC in the current running context.
442 static void hard_enable_TSC(void)
444 write_cr4(read_cr4() & ~X86_CR4_TSD);
447 static void enable_TSC(void)
450 if (test_and_clear_thread_flag(TIF_NOTSC))
452 * Must flip the CPU state synchronously with
453 * TIF_NOTSC in the current running context.
459 int get_tsc_mode(unsigned long adr)
463 if (test_thread_flag(TIF_NOTSC))
464 val = PR_TSC_SIGSEGV;
468 return put_user(val, (unsigned int __user *)adr);
471 int set_tsc_mode(unsigned int val)
473 if (val == PR_TSC_SIGSEGV)
475 else if (val == PR_TSC_ENABLE)
/*
 * This special macro can be used to load a debugging register
 */
#define loaddebug(thread, r) set_debugreg(thread->debugreg ## r, r)
488 static inline void __switch_to_xtra(struct task_struct *prev_p,
489 struct task_struct *next_p,
490 struct tss_struct *tss)
492 struct thread_struct *prev, *next;
494 prev = &prev_p->thread,
495 next = &next_p->thread;
497 if (test_tsk_thread_flag(next_p, TIF_DS_AREA_MSR) ||
498 test_tsk_thread_flag(prev_p, TIF_DS_AREA_MSR))
499 ds_switch_to(prev_p, next_p);
500 else if (next->debugctlmsr != prev->debugctlmsr)
501 update_debugctlmsr(next->debugctlmsr);
503 if (test_tsk_thread_flag(next_p, TIF_DEBUG)) {
513 if (test_tsk_thread_flag(prev_p, TIF_NOTSC) ^
514 test_tsk_thread_flag(next_p, TIF_NOTSC)) {
515 /* prev and next are different */
516 if (test_tsk_thread_flag(next_p, TIF_NOTSC))
522 if (test_tsk_thread_flag(next_p, TIF_IO_BITMAP)) {
524 * Copy the relevant range of the IO bitmap.
525 * Normally this is 128 bytes or less:
527 memcpy(tss->io_bitmap, next->io_bitmap_ptr,
528 max(prev->io_bitmap_max, next->io_bitmap_max));
529 } else if (test_tsk_thread_flag(prev_p, TIF_IO_BITMAP)) {
531 * Clear any possible leftover bits:
533 memset(tss->io_bitmap, 0xff, prev->io_bitmap_max);
538 * switch_to(x,y) should switch tasks from x to y.
540 * This could still be optimized:
541 * - fold all the options into a flag word and test it with a single test.
542 * - could test fs/gs bitsliced
544 * Kprobes not supported here. Set the probe on schedule instead.
545 * Function graph tracer not supported too.
547 __notrace_funcgraph struct task_struct *
548 __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
550 struct thread_struct *prev = &prev_p->thread;
551 struct thread_struct *next = &next_p->thread;
552 int cpu = smp_processor_id();
553 struct tss_struct *tss = &per_cpu(init_tss, cpu);
554 unsigned fsindex, gsindex;
556 /* we're going to use this soon, after a few expensive things */
557 if (next_p->fpu_counter > 5)
558 prefetch(next->xstate);
561 * Reload esp0, LDT and the page table pointer:
567 * This won't pick up thread selector changes, but I guess that is ok.
569 savesegment(es, prev->es);
570 if (unlikely(next->es | prev->es))
571 loadsegment(es, next->es);
573 savesegment(ds, prev->ds);
574 if (unlikely(next->ds | prev->ds))
575 loadsegment(ds, next->ds);
578 /* We must save %fs and %gs before load_TLS() because
579 * %fs and %gs may be cleared by load_TLS().
581 * (e.g. xen_load_tls())
583 savesegment(fs, fsindex);
584 savesegment(gs, gsindex);
589 * Leave lazy mode, flushing any hypercalls made here.
590 * This must be done before restoring TLS segments so
591 * the GDT and LDT are properly updated, and must be
592 * done before math_state_restore, so the TS bit is up
595 arch_leave_lazy_cpu_mode();
600 * Segment register != 0 always requires a reload. Also
601 * reload when it has changed. When prev process used 64bit
602 * base always reload to avoid an information leak.
604 if (unlikely(fsindex | next->fsindex | prev->fs)) {
605 loadsegment(fs, next->fsindex);
607 * Check if the user used a selector != 0; if yes
608 * clear 64bit base, since overloaded base is always
609 * mapped to the Null selector
614 /* when next process has a 64bit base use it */
616 wrmsrl(MSR_FS_BASE, next->fs);
617 prev->fsindex = fsindex;
619 if (unlikely(gsindex | next->gsindex | prev->gs)) {
620 load_gs_index(next->gsindex);
625 wrmsrl(MSR_KERNEL_GS_BASE, next->gs);
626 prev->gsindex = gsindex;
628 /* Must be after DS reload */
632 * Switch the PDA and FPU contexts.
634 prev->usersp = percpu_read(old_rsp);
635 percpu_write(old_rsp, next->usersp);
636 percpu_write(current_task, next_p);
638 percpu_write(kernel_stack,
639 (unsigned long)task_stack_page(next_p) +
640 THREAD_SIZE - KERNEL_STACK_OFFSET);
641 #ifdef CONFIG_CC_STACKPROTECTOR
643 * Build time only check to make sure the stack_canary is at
644 * offset 40 in the pda; this is a gcc ABI requirement
646 BUILD_BUG_ON(offsetof(struct x8664_pda, stack_canary) != 40);
650 * Now maybe reload the debug registers and handle I/O bitmaps
652 if (unlikely(task_thread_info(next_p)->flags & _TIF_WORK_CTXSW_NEXT ||
653 task_thread_info(prev_p)->flags & _TIF_WORK_CTXSW_PREV))
654 __switch_to_xtra(prev_p, next_p, tss);
656 /* If the task has used fpu the last 5 timeslices, just do a full
657 * restore of the math state immediately to avoid the trap; the
658 * chances of needing FPU soon are obviously high now
660 * tsk_used_math() checks prevent calling math_state_restore(),
661 * which can sleep in the case of !tsk_used_math()
663 if (tsk_used_math(next_p) && next_p->fpu_counter > 5)
664 math_state_restore();
669 * sys_execve() executes a new program.
672 long sys_execve(char __user *name, char __user * __user *argv,
673 char __user * __user *envp, struct pt_regs *regs)
678 filename = getname(name);
679 error = PTR_ERR(filename);
680 if (IS_ERR(filename))
682 error = do_execve(filename, argv, envp, regs);
687 void set_personality_64bit(void)
689 /* inherit personality from parent */
691 /* Make sure to be in 64bit mode */
692 clear_thread_flag(TIF_IA32);
694 /* TBD: overwrites user setup. Should have two bits.
695 But 64bit processes have always behaved this way,
696 so it's not too bad. The main problem is just that
697 32bit childs are affected again. */
698 current->personality &= ~READ_IMPLIES_EXEC;
701 asmlinkage long sys_fork(struct pt_regs *regs)
703 return do_fork(SIGCHLD, regs->sp, regs, 0, NULL, NULL);
707 sys_clone(unsigned long clone_flags, unsigned long newsp,
708 void __user *parent_tid, void __user *child_tid, struct pt_regs *regs)
712 return do_fork(clone_flags, newsp, regs, 0, parent_tid, child_tid);
716 * This is trivial, and on the face of it looks like it
717 * could equally well be done in user mode.
719 * Not so, for quite unobvious reasons - register pressure.
720 * In user mode vfork() cannot have a stack frame, and if
721 * done by calling the "clone()" system call directly, you
722 * do not have enough call-clobbered registers to hold all
723 * the information you need.
725 asmlinkage long sys_vfork(struct pt_regs *regs)
727 return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, regs->sp, regs, 0,
731 unsigned long get_wchan(struct task_struct *p)
737 if (!p || p == current || p->state == TASK_RUNNING)
739 stack = (unsigned long)task_stack_page(p);
740 if (p->thread.sp < stack || p->thread.sp >= stack+THREAD_SIZE)
742 fp = *(u64 *)(p->thread.sp);
744 if (fp < (unsigned long)stack ||
745 fp >= (unsigned long)stack+THREAD_SIZE)
748 if (!in_sched_functions(ip))
751 } while (count++ < 16);
755 long do_arch_prctl(struct task_struct *task, int code, unsigned long addr)
758 int doit = task == current;
763 if (addr >= TASK_SIZE_OF(task))
766 /* handle small bases via the GDT because that's faster to
768 if (addr <= 0xffffffff) {
769 set_32bit_tls(task, GS_TLS, addr);
771 load_TLS(&task->thread, cpu);
772 load_gs_index(GS_TLS_SEL);
774 task->thread.gsindex = GS_TLS_SEL;
777 task->thread.gsindex = 0;
778 task->thread.gs = addr;
781 ret = checking_wrmsrl(MSR_KERNEL_GS_BASE, addr);
787 /* Not strictly needed for fs, but do it for symmetry
789 if (addr >= TASK_SIZE_OF(task))
792 /* handle small bases via the GDT because that's faster to
794 if (addr <= 0xffffffff) {
795 set_32bit_tls(task, FS_TLS, addr);
797 load_TLS(&task->thread, cpu);
798 loadsegment(fs, FS_TLS_SEL);
800 task->thread.fsindex = FS_TLS_SEL;
803 task->thread.fsindex = 0;
804 task->thread.fs = addr;
806 /* set the selector to 0 to not confuse
809 ret = checking_wrmsrl(MSR_FS_BASE, addr);
816 if (task->thread.fsindex == FS_TLS_SEL)
817 base = read_32bit_tls(task, FS_TLS);
819 rdmsrl(MSR_FS_BASE, base);
821 base = task->thread.fs;
822 ret = put_user(base, (unsigned long __user *)addr);
828 if (task->thread.gsindex == GS_TLS_SEL)
829 base = read_32bit_tls(task, GS_TLS);
831 savesegment(gs, gsindex);
833 rdmsrl(MSR_KERNEL_GS_BASE, base);
835 base = task->thread.gs;
837 base = task->thread.gs;
838 ret = put_user(base, (unsigned long __user *)addr);
850 long sys_arch_prctl(int code, unsigned long addr)
852 return do_arch_prctl(current, code, addr);
855 unsigned long arch_align_stack(unsigned long sp)
857 if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space)
858 sp -= get_random_int() % 8192;
862 unsigned long arch_randomize_brk(struct mm_struct *mm)
864 unsigned long range_end = mm->brk + 0x02000000;
865 return randomize_range(mm->brk, range_end, 0) ? : mm->brk;