2 * Copyright (C) 1995 Linus Torvalds
4 * Pentium III FXSR, SSE support
5 * Gareth Hughes <gareth@valinux.com>, May 2000
10 * CPU hotplug support - ashok.raj@intel.com
14 * This file handles the architecture-dependent parts of process handling..
17 #include <linux/cpu.h>
18 #include <linux/errno.h>
19 #include <linux/sched.h>
21 #include <linux/kernel.h>
23 #include <linux/elfcore.h>
24 #include <linux/smp.h>
25 #include <linux/slab.h>
26 #include <linux/user.h>
27 #include <linux/interrupt.h>
28 #include <linux/delay.h>
29 #include <linux/module.h>
30 #include <linux/ptrace.h>
31 #include <linux/notifier.h>
32 #include <linux/kprobes.h>
33 #include <linux/kdebug.h>
34 #include <linux/prctl.h>
35 #include <linux/uaccess.h>
37 #include <linux/ftrace.h>
39 #include <asm/pgtable.h>
40 #include <asm/processor.h>
42 #include <asm/fpu-internal.h>
43 #include <asm/mmu_context.h>
44 #include <asm/prctl.h>
46 #include <asm/proto.h>
49 #include <asm/syscalls.h>
50 #include <asm/debugreg.h>
51 #include <asm/switch_to.h>
53 asmlinkage extern void ret_from_fork(void);
55 DEFINE_PER_CPU(unsigned long, old_rsp);
57 /* Prints also some state that isn't saved in the pt_regs */
/*
 * Dump the full register state for debugging (oops/show_regs output):
 * everything in pt_regs plus state that is not saved there — segment
 * selectors (read directly with inline asm), the FS/GS base MSRs, and
 * the control/debug registers.
 * NOTE(review): this excerpt has gaps in its original line numbering;
 * reads of cr0–cr4 and d0–d7 are not visible here but are consumed below.
 */
58 void __show_regs(struct pt_regs *regs, int all)
60 unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L, fs, gs, shadowgs;
61 unsigned long d0, d1, d2, d3, d6, d7;
62 unsigned int fsindex, gsindex;
63 unsigned int ds, cs, es;
/* Instruction pointer, with symbolic decoding via printk_address(). */
65 printk(KERN_DEFAULT "RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->ip);
66 printk_address(regs->ip, 1);
67 printk(KERN_DEFAULT "RSP: %04lx:%016lx EFLAGS: %08lx\n", regs->ss,
68 regs->sp, regs->flags);
/* General-purpose registers, three per line. */
69 printk(KERN_DEFAULT "RAX: %016lx RBX: %016lx RCX: %016lx\n",
70 regs->ax, regs->bx, regs->cx);
71 printk(KERN_DEFAULT "RDX: %016lx RSI: %016lx RDI: %016lx\n",
72 regs->dx, regs->si, regs->di);
73 printk(KERN_DEFAULT "RBP: %016lx R08: %016lx R09: %016lx\n",
74 regs->bp, regs->r8, regs->r9);
75 printk(KERN_DEFAULT "R10: %016lx R11: %016lx R12: %016lx\n",
76 regs->r10, regs->r11, regs->r12);
77 printk(KERN_DEFAULT "R13: %016lx R14: %016lx R15: %016lx\n",
78 regs->r13, regs->r14, regs->r15);
/* Segment selectors are not in pt_regs (except cs/ss); read them live. */
80 asm("movl %%ds,%0" : "=r" (ds));
81 asm("movl %%cs,%0" : "=r" (cs));
82 asm("movl %%es,%0" : "=r" (es));
83 asm("movl %%fs,%0" : "=r" (fsindex));
84 asm("movl %%gs,%0" : "=r" (gsindex));
/* The 64-bit FS/GS bases live in MSRs, not in the selectors. */
86 rdmsrl(MSR_FS_BASE, fs);
87 rdmsrl(MSR_GS_BASE, gs);
88 rdmsrl(MSR_KERNEL_GS_BASE, shadowgs);
98 printk(KERN_DEFAULT "FS: %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n",
99 fs, fsindex, gs, gsindex, shadowgs);
100 printk(KERN_DEFAULT "CS: %04x DS: %04x ES: %04x CR0: %016lx\n", cs, ds,
102 printk(KERN_DEFAULT "CR2: %016lx CR3: %016lx CR4: %016lx\n", cr2, cr3,
/* Hardware debug registers DR0-DR3 (breakpoints), DR6 (status), DR7 (control). */
108 printk(KERN_DEFAULT "DR0: %016lx DR1: %016lx DR2: %016lx\n", d0, d1, d2);
112 printk(KERN_DEFAULT "DR3: %016lx DR6: %016lx DR7: %016lx\n", d3, d6, d7);
/*
 * Called when a task is being released; warn if the dead process still
 * owns a per-mm LDT, which should have been torn down by this point.
 * NOTE(review): the excerpt is gapped — the format argument supplying
 * the "%s" (task comm) and the cleanup after the warning are not visible.
 */
115 void release_thread(struct task_struct *dead_task)
118 if (dead_task->mm->context.size) {
119 pr_warn("WARNING: dead process %s still has LDT? <%p/%d>\n",
121 dead_task->mm->context.ldt,
122 dead_task->mm->context.size);
/*
 * Install a 32-bit TLS entry (base = addr) into slot 'tls' of the
 * task's GDT TLS array, expressed as a user_desc and written into
 * t->thread.tls_array. Used by do_arch_prctl() for small (<4GB) bases.
 * NOTE(review): the user_desc initializer body is not visible in this excerpt.
 */
128 static inline void set_32bit_tls(struct task_struct *t, int tls, u32 addr)
130 struct user_desc ud = {
137 struct desc_struct *desc = t->thread.tls_array;
/* Return the 32-bit base address stored in TLS slot 'tls' of the task's GDT TLS array. */
142 static inline u32 read_32bit_tls(struct task_struct *t, int tls)
144 return get_desc_base(&t->thread.tls_array[tls]);
/*
 * Set up the architecture-specific thread state of a newly forked/cloned
 * task 'p': kernel stack pointers, saved segment state, an optional copy
 * of the parent's I/O permission bitmap, and (for CLONE_SETTLS) the new
 * thread's TLS. Returns 0 on success or a negative errno (cleanup of the
 * io_bitmap copy on error is visible at the bottom).
 */
147 int copy_thread(unsigned long clone_flags, unsigned long sp,
148 unsigned long arg, struct task_struct *p)
151 struct pt_regs *childregs;
152 struct task_struct *me = current;
/* Kernel stack top and the child's saved pt_regs location. */
154 p->thread.sp0 = (unsigned long)task_stack_page(p) + THREAD_SIZE;
155 childregs = task_pt_regs(p);
156 p->thread.sp = (unsigned long) childregs;
157 p->thread.usersp = me->thread.usersp;
158 set_tsk_thread_flag(p, TIF_FORK);
160 p->thread.io_bitmap_ptr = NULL;
/*
 * Snapshot the parent's segment state. A nonzero selector means the
 * base comes from the GDT/LDT, so the cached base is cleared; a zero
 * selector means the MSR base (thread.gs/fs) is inherited instead.
 */
162 savesegment(gs, p->thread.gsindex);
163 p->thread.gs = p->thread.gsindex ? 0 : me->thread.gs;
164 savesegment(fs, p->thread.fsindex);
165 p->thread.fs = p->thread.fsindex ? 0 : me->thread.fs;
166 savesegment(es, p->thread.es);
167 savesegment(ds, p->thread.ds);
168 memset(p->thread.ptrace_bps, 0, sizeof(p->thread.ptrace_bps));
/* Kernel thread: fabricate a minimal frame; bx carries the thread function. */
170 if (unlikely(p->flags & PF_KTHREAD)) {
172 memset(childregs, 0, sizeof(struct pt_regs));
173 childregs->sp = (unsigned long)childregs;
174 childregs->ss = __KERNEL_DS;
175 childregs->bx = sp; /* function */
177 childregs->orig_ax = -1;
178 childregs->cs = __KERNEL_CS | get_kernel_rpl();
179 childregs->flags = X86_EFLAGS_IF | X86_EFLAGS_FIXED;
/* User thread: the child starts with a copy of the parent's registers. */
182 *childregs = *current_pt_regs();
189 memset(p->thread.ptrace_bps, 0, sizeof(p->thread.ptrace_bps));
/* Duplicate the parent's I/O permission bitmap if it has one. */
191 if (unlikely(test_tsk_thread_flag(me, TIF_IO_BITMAP))) {
192 p->thread.io_bitmap_ptr = kmemdup(me->thread.io_bitmap_ptr,
193 IO_BITMAP_BYTES, GFP_KERNEL);
194 if (!p->thread.io_bitmap_ptr) {
195 p->thread.io_bitmap_max = 0;
198 set_tsk_thread_flag(p, TIF_IO_BITMAP);
202 * Set a new TLS for the child thread?
204 if (clone_flags & CLONE_SETTLS) {
205 #ifdef CONFIG_IA32_EMULATION
/* IA32 tasks pass a user_desc pointer in %esi; 64-bit tasks pass the FS base in %r8. */
206 if (test_thread_flag(TIF_IA32))
207 err = do_set_thread_area(p, -1,
208 (struct user_desc __user *)childregs->si, 0);
211 err = do_arch_prctl(p, ARCH_SET_FS, childregs->r8);
/* On error, free the duplicated I/O bitmap before returning. */
217 if (err && p->thread.io_bitmap_ptr) {
218 kfree(p->thread.io_bitmap_ptr);
219 p->thread.io_bitmap_max = 0;
/*
 * Common helper for starting a new user thread after exec: load the data
 * segments, record the new user stack pointer (both in thread.usersp and
 * the per-cpu old_rsp cache), and initialize the register frame so the
 * task enters user mode at new_ip with interrupts enabled.
 */
226 start_thread_common(struct pt_regs *regs, unsigned long new_ip,
227 unsigned long new_sp,
228 unsigned int _cs, unsigned int _ss, unsigned int _ds)
231 loadsegment(es, _ds);
232 loadsegment(ds, _ds);
234 current->thread.usersp = new_sp;
237 this_cpu_write(old_rsp, new_sp);
240 regs->flags = X86_EFLAGS_IF;
/* Start a native 64-bit user thread: 64-bit code segment, DS left at 0. */
244 start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp)
246 start_thread_common(regs, new_ip, new_sp,
247 __USER_CS, __USER_DS, 0);
250 #ifdef CONFIG_IA32_EMULATION
/*
 * Start a compat user thread. x32 tasks use the 64-bit code segment
 * (__USER_CS); plain ia32 tasks use the 32-bit one (__USER32_CS).
 * Both get __USER_DS for stack and data segments.
 */
251 void start_thread_ia32(struct pt_regs *regs, u32 new_ip, u32 new_sp)
253 start_thread_common(regs, new_ip, new_sp,
254 test_thread_flag(TIF_X32)
255 ? __USER_CS : __USER32_CS,
256 __USER_DS, __USER_DS);
261 * switch_to(x,y) should switch tasks from x to y.
263 * This could still be optimized:
264 * - fold all the options into a flag word and test it with a single test.
265 * - could test fs/gs bitsliced
267 * Kprobes not supported here. Set the probe on schedule instead.
268 * Function graph tracer not supported too.
270 __notrace_funcgraph struct task_struct *
271 __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
273 struct thread_struct *prev = &prev_p->thread;
274 struct thread_struct *next = &next_p->thread;
275 int cpu = smp_processor_id();
276 struct tss_struct *tss = &per_cpu(init_tss, cpu);
277 unsigned fsindex, gsindex;
/* Decide whether/what FPU state must be restored for next_p. */
280 fpu = switch_fpu_prepare(prev_p, next_p, cpu);
282 /* Reload esp0 and ss1. */
285 /* We must save %fs and %gs before load_TLS() because
286 * %fs and %gs may be cleared by load_TLS().
288 * (e.g. xen_load_tls())
290 savesegment(fs, fsindex);
291 savesegment(gs, gsindex);
294 * Load TLS before restoring any segments so that segment loads
295 * reference the correct GDT entries.
300 * Leave lazy mode, flushing any hypercalls made here. This
301 * must be done after loading TLS entries in the GDT but before
302 * loading segments that might reference them, and it must
303 * be done before math_state_restore, so the TS bit is up to
306 arch_end_context_switch(next_p);
310 * Reading them only returns the selectors, but writing them (if
311 * nonzero) loads the full descriptor from the GDT or LDT. The
312 * LDT for next is loaded in switch_mm, and the GDT is loaded
315 * We therefore need to write new values to the segment
316 * registers on every context switch unless both the new and old
319 * Note that we don't need to do anything for CS and SS, as
320 * those are saved and restored as part of pt_regs.
322 savesegment(es, prev->es);
323 if (unlikely(next->es | prev->es))
324 loadsegment(es, next->es);
326 savesegment(ds, prev->ds);
327 if (unlikely(next->ds | prev->ds))
328 loadsegment(ds, next->ds);
333 * These are even more complicated than FS and GS: they have
334 * 64-bit bases that are controlled by arch_prctl. Those bases
335 * only differ from the values in the GDT or LDT if the selector
338 * Loading the segment register resets the hidden base part of
339 * the register to 0 or the value from the GDT / LDT. If the
340 * next base address zero, writing 0 to the segment register is
341 * much faster than using wrmsr to explicitly zero the base.
343 * The thread_struct.fs and thread_struct.gs values are 0
344 * if the fs and gs bases respectively are not overridden
345 * from the values implied by fsindex and gsindex. They
346 * are nonzero, and store the nonzero base addresses, if
347 * the bases are overridden.
349 * (fs != 0 && fsindex != 0) || (gs != 0 && gsindex != 0) should
352 * Therefore we need to reload the segment registers if either
353 * the old or new selector is nonzero, and we need to override
354 * the base address if next thread expects it to be overridden.
356 * This code is unnecessarily slow in the case where the old and
357 * new indexes are zero and the new base is nonzero -- it will
358 * unnecessarily write 0 to the selector before writing the new
361 * Note: This all depends on arch_prctl being the only way that
362 * user code can override the segment base. Once wrfsbase and
363 * wrgsbase are enabled, most of this code will need to change.
365 if (unlikely(fsindex | next->fsindex | prev->fs)) {
366 loadsegment(fs, next->fsindex);
369 * If user code wrote a nonzero value to FS, then it also
370 * cleared the overridden base address.
372 * XXX: if user code wrote 0 to FS and cleared the base
373 * address itself, we won't notice and we'll incorrectly
374 * restore the prior base address next time we reschedule
381 wrmsrl(MSR_FS_BASE, next->fs);
382 prev->fsindex = fsindex;
384 if (unlikely(gsindex | next->gsindex | prev->gs)) {
385 load_gs_index(next->gsindex);
387 /* This works (and fails) the same way as fsindex above. */
392 wrmsrl(MSR_KERNEL_GS_BASE, next->gs);
393 prev->gsindex = gsindex;
/* Complete the lazy FPU switch started by switch_fpu_prepare(). */
395 switch_fpu_finish(next_p, fpu);
398 * Switch the PDA and FPU contexts.
/* Swap the cached user stack pointer and make next_p the per-cpu current task. */
400 prev->usersp = this_cpu_read(old_rsp);
401 this_cpu_write(old_rsp, next->usersp);
402 this_cpu_write(current_task, next_p);
404 this_cpu_write(kernel_stack,
405 (unsigned long)task_stack_page(next_p) +
406 THREAD_SIZE - KERNEL_STACK_OFFSET);
409 * Now maybe reload the debug registers and handle I/O bitmaps
/* Slow path only when either task has debug regs, I/O bitmaps, etc. pending. */
411 if (unlikely(task_thread_info(next_p)->flags & _TIF_WORK_CTXSW_NEXT ||
412 task_thread_info(prev_p)->flags & _TIF_WORK_CTXSW_PREV))
413 __switch_to_xtra(prev_p, next_p, tss);
/*
 * Configure the current task as a native 64-bit process at exec time:
 * clear all compat thread flags and the mm's ia32_compat marker.
 */
418 void set_personality_64bit(void)
420 /* inherit personality from parent */
422 /* Make sure to be in 64bit mode */
423 clear_thread_flag(TIF_IA32);
424 clear_thread_flag(TIF_ADDR32);
425 clear_thread_flag(TIF_X32);
427 /* Ensure the corresponding mm is not marked. */
429 current->mm->context.ia32_compat = 0;
431 /* TBD: overwrites user setup. Should have two bits.
432 But 64bit processes have always behaved this way,
433 so it's not too bad. The main problem is just that
434 32bit children are affected again. */
435 current->personality &= ~READ_IMPLIES_EXEC;
/*
 * Configure the current task as a compat process at exec time.
 * x32 = true selects the x32 ABI (64-bit mode, 32-bit pointers);
 * x32 = false selects classic ia32 emulation.
 * NOTE(review): the if/else branching on x32 is not visible in this
 * excerpt — lines 450-455 are the x32 branch, 457-461 the ia32 branch.
 */
438 void set_personality_ia32(bool x32)
440 /* inherit personality from parent */
442 /* Make sure to be in 32bit mode */
443 set_thread_flag(TIF_ADDR32);
445 /* Mark the associated mm as containing 32-bit tasks. */
447 current->mm->context.ia32_compat = 1;
450 clear_thread_flag(TIF_IA32);
451 set_thread_flag(TIF_X32);
452 current->personality &= ~READ_IMPLIES_EXEC;
453 /* is_compat_task() uses the presence of the x32
454 syscall bit flag to determine compat status */
455 current_thread_info()->status &= ~TS_COMPAT;
457 set_thread_flag(TIF_IA32);
458 clear_thread_flag(TIF_X32);
459 current->personality |= force_personality32;
460 /* Prepare the first "return" to user space */
461 current_thread_info()->status |= TS_COMPAT;
/*
 * Return the address where a sleeping task 'p' is blocked, by walking
 * its saved frame-pointer chain (up to 16 frames) until a return address
 * outside the scheduler is found. Returns 0 for running/invalid tasks.
 * NOTE(review): declarations of stack/fp/ip/count and the loop header are
 * not visible in this excerpt.
 */
466 unsigned long get_wchan(struct task_struct *p)
472 if (!p || p == current || p->state == TASK_RUNNING)
474 stack = (unsigned long)task_stack_page(p);
/* Sanity-check that the saved sp actually lies within the task's stack. */
475 if (p->thread.sp < stack || p->thread.sp >= stack+THREAD_SIZE)
477 fp = *(u64 *)(p->thread.sp);
479 if (fp < (unsigned long)stack ||
480 fp >= (unsigned long)stack+THREAD_SIZE)
483 if (!in_sched_functions(ip))
486 } while (count++ < 16);
/*
 * Implement arch_prctl(): set or get the FS/GS base of 'task'.
 * ARCH_SET_GS / ARCH_SET_FS use the GDT TLS slot for bases below 4GB
 * (faster to switch) and the MSR for full 64-bit bases; ARCH_GET_FS /
 * ARCH_GET_GS write the current base to the user pointer in 'addr'.
 * Only takes effect immediately (doit) when task == current.
 * NOTE(review): the switch statement and its case labels are not visible
 * in this excerpt; the sections below are, in order, SET_GS, SET_FS,
 * GET_FS, GET_GS.
 */
490 long do_arch_prctl(struct task_struct *task, int code, unsigned long addr)
493 int doit = task == current;
/* ARCH_SET_GS: reject kernel addresses. */
498 if (addr >= TASK_SIZE_OF(task))
501 /* handle small bases via the GDT because that's faster to
503 if (addr <= 0xffffffff) {
504 set_32bit_tls(task, GS_TLS, addr);
506 load_TLS(&task->thread, cpu);
507 load_gs_index(GS_TLS_SEL);
509 task->thread.gsindex = GS_TLS_SEL;
/* Large base: selector 0, base held in thread.gs / the MSR. */
512 task->thread.gsindex = 0;
513 task->thread.gs = addr;
516 ret = wrmsrl_safe(MSR_KERNEL_GS_BASE, addr);
522 /* Not strictly needed for fs, but do it for symmetry
524 if (addr >= TASK_SIZE_OF(task))
527 /* handle small bases via the GDT because that's faster to
529 if (addr <= 0xffffffff) {
530 set_32bit_tls(task, FS_TLS, addr);
532 load_TLS(&task->thread, cpu);
533 loadsegment(fs, FS_TLS_SEL);
535 task->thread.fsindex = FS_TLS_SEL;
538 task->thread.fsindex = 0;
539 task->thread.fs = addr;
541 /* set the selector to 0 to not confuse
544 ret = wrmsrl_safe(MSR_FS_BASE, addr);
/* ARCH_GET_FS: report the base from the TLS slot, the MSR, or the cache. */
551 if (task->thread.fsindex == FS_TLS_SEL)
552 base = read_32bit_tls(task, FS_TLS);
554 rdmsrl(MSR_FS_BASE, base);
556 base = task->thread.fs;
557 ret = put_user(base, (unsigned long __user *)addr);
/* ARCH_GET_GS: same idea, but GS lives in MSR_KERNEL_GS_BASE while in-kernel. */
563 if (task->thread.gsindex == GS_TLS_SEL)
564 base = read_32bit_tls(task, GS_TLS);
566 savesegment(gs, gsindex);
568 rdmsrl(MSR_KERNEL_GS_BASE, base);
570 base = task->thread.gs;
572 base = task->thread.gs;
573 ret = put_user(base, (unsigned long __user *)addr);
/* arch_prctl syscall entry point: operate on the calling task. */
585 long sys_arch_prctl(int code, unsigned long addr)
587 return do_arch_prctl(current, code, addr);
/*
 * Return the task's user stack pointer: for IA32 tasks it is in the
 * saved pt_regs; for 64-bit tasks it is cached in thread.usersp.
 */
590 unsigned long KSTK_ESP(struct task_struct *task)
592 return (test_tsk_thread_flag(task, TIF_IA32)) ?
593 (task_pt_regs(task)->sp) : ((task)->thread.usersp);