Merge branch 'core/percpu' into perfcounters/core
author Ingo Molnar <mingo@elte.hu>
Sun, 18 Jan 2009 17:15:49 +0000 (18:15 +0100)
committer Ingo Molnar <mingo@elte.hu>
Sun, 18 Jan 2009 17:15:49 +0000 (18:15 +0100)
Conflicts:
arch/x86/include/asm/pda.h

We merge tip/core/percpu into tip/perfcounters/core because of a
semantic and contextual conflict: the former eliminates the PDA,
while the latter extends it with the apic_perf_irqs field.

Resolve the conflict by moving the new field to the irq_cpustat
structure on 64-bit too.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
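
The relevant hunks are in hardirq_64.h and irq.c below. For quick orientation,
here is a condensed sketch of the resolved 64-bit state, pieced together from
those hunks (field list abbreviated); it illustrates the resolution rather than
reproducing either file verbatim:

    /*
     * After the merge, the perf counter IRQ count lives in the per-cpu
     * irq_cpustat_t on 64-bit as well, instead of in the (now removed) PDA,
     * and is updated through the new percpu accessors.
     */
    typedef struct {
            unsigned int __softirq_pending;
            unsigned int __nmi_count;       /* arch dependent */
            unsigned int apic_timer_irqs;   /* arch dependent */
            unsigned int apic_perf_irqs;    /* arch dependent */
            /* ... remaining irq_*_count fields ... */
    } ____cacheline_aligned irq_cpustat_t;

    DECLARE_PER_CPU(irq_cpustat_t, irq_stat);

    /* 64-bit now bumps the counters via the per-cpu accessors: */
    #define inc_irq_stat(member)    percpu_add(irq_stat.member, 1)

With irq_stat being an irq_cpustat_t on both bitnesses, arch/x86/kernel/irq.c
can use a single irq_stats() definition and sum apic_perf_irqs into the
/proc/interrupts output (see the irq.c hunk below).
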
12 files changed:
arch/powerpc/kernel/irq.c
arch/x86/ia32/ia32entry.S
arch/x86/include/asm/hardirq_32.h
arch/x86/include/asm/hardirq_64.h
arch/x86/include/asm/irq_vectors.h
arch/x86/include/asm/thread_info.h
arch/x86/kernel/apic.c
arch/x86/kernel/cpu/common.c
arch/x86/kernel/entry_64.S
arch/x86/kernel/irq.c
kernel/Makefile
kernel/sched.c

diff --combined arch/powerpc/kernel/irq.c
index f5ae4878ccef0169bd910b889600a7faa6807258,ad1e5ac721d86f557bac20079b407c8929459273..7f8e6a92c5a1b3470c234c069c1291bd13e5e731
@@@ -104,13 -104,6 +104,13 @@@ static inline notrace void set_soft_ena
        : : "r" (enable), "i" (offsetof(struct paca_struct, soft_enabled)));
  }
  
 +#ifdef CONFIG_PERF_COUNTERS
 +notrace void __weak perf_counter_do_pending(void)
 +{
 +      set_perf_counter_pending(0);
 +}
 +#endif
 +
  notrace void raw_local_irq_restore(unsigned long en)
  {
        /*
                        iseries_handle_interrupts();
        }
  
 +      if (get_perf_counter_pending())
 +              perf_counter_do_pending();
 +
        /*
         * if (get_paca()->hard_enabled) return;
         * But again we need to take care that gcc gets hard_enabled directly
@@@ -241,7 -231,7 +241,7 @@@ void fixup_irqs(cpumask_t map
                if (irq_desc[irq].status & IRQ_PER_CPU)
                        continue;
  
-               cpus_and(mask, irq_desc[irq].affinity, map);
+               cpumask_and(&mask, irq_desc[irq].affinity, &map);
                if (any_online_cpu(mask) == NR_CPUS) {
                        printk("Breaking affinity for irq %i\n", irq);
                        mask = map;
diff --combined arch/x86/ia32/ia32entry.S
index 3c14ed07dc4e75f20dfc9fc86677a62551bf82c1,9c79b247700801e6467b5629e7ee757f09a44b7e..01e7c4c5c7fe3b8d9405543cb981d4df2c0d26f6
@@@ -112,8 -112,8 +112,8 @@@ ENTRY(ia32_sysenter_target
        CFI_DEF_CFA     rsp,0
        CFI_REGISTER    rsp,rbp
        SWAPGS_UNSAFE_STACK
-       movq    %gs:pda_kernelstack, %rsp
-       addq    $(PDA_STACKOFFSET),%rsp 
+       movq    PER_CPU_VAR(kernel_stack), %rsp
+       addq    $(KERNEL_STACK_OFFSET),%rsp
        /*
         * No need to follow this irqs on/off section: the syscall
         * disabled irqs, here we enable it straight after entry:
@@@ -273,13 -273,13 +273,13 @@@ ENDPROC(ia32_sysenter_target
  ENTRY(ia32_cstar_target)
        CFI_STARTPROC32 simple
        CFI_SIGNAL_FRAME
-       CFI_DEF_CFA     rsp,PDA_STACKOFFSET
+       CFI_DEF_CFA     rsp,KERNEL_STACK_OFFSET
        CFI_REGISTER    rip,rcx
        /*CFI_REGISTER  rflags,r11*/
        SWAPGS_UNSAFE_STACK
        movl    %esp,%r8d
        CFI_REGISTER    rsp,r8
-       movq    %gs:pda_kernelstack,%rsp
+       movq    PER_CPU_VAR(kernel_stack),%rsp
        /*
         * No need to follow this irqs on/off section: the syscall
         * disabled irqs and here we enable it straight after entry:
@@@ -823,8 -823,7 +823,8 @@@ ia32_sys_call_table
        .quad compat_sys_signalfd4
        .quad sys_eventfd2
        .quad sys_epoll_create1
 -      .quad sys_dup3                  /* 330 */
 +      .quad sys_dup3                          /* 330 */
        .quad sys_pipe2
        .quad sys_inotify_init1
 +      .quad sys_perf_counter_open
  ia32_syscall_end:
diff --combined arch/x86/include/asm/hardirq_32.h
index 7a07897a78887f59c6bbccbbdbe1f82669f37bba,d4b5d731073fb194f75de05f3878ad157b048b21..7838276bfe512f1d2d8a937d93f9fa05e6fca25b
@@@ -9,7 -9,6 +9,7 @@@ typedef struct 
        unsigned long idle_timestamp;
        unsigned int __nmi_count;       /* arch dependent */
        unsigned int apic_timer_irqs;   /* arch dependent */
 +      unsigned int apic_perf_irqs;    /* arch dependent */
        unsigned int irq0_irqs;
        unsigned int irq_resched_count;
        unsigned int irq_call_count;
@@@ -20,6 -19,9 +20,9 @@@
  
  DECLARE_PER_CPU(irq_cpustat_t, irq_stat);
  
+ /* We can have at most NR_VECTORS irqs routed to a cpu at a time */
+ #define MAX_HARDIRQS_PER_CPU NR_VECTORS
  #define __ARCH_IRQ_STAT
  #define __IRQ_STAT(cpu, member) (per_cpu(irq_stat, cpu).member)
  
diff --combined arch/x86/include/asm/hardirq_64.h
index b5a6b5d56704c89330d2f88f515c5d25281e105f,a65bab20f6ce5cbe737dd365407ddaff518e559c..42930b2792155014162ad82768bc9789d410dab6
@@@ -3,22 -3,36 +3,37 @@@
  
  #include <linux/threads.h>
  #include <linux/irq.h>
- #include <asm/pda.h>
  #include <asm/apic.h>
  
+ typedef struct {
+       unsigned int __softirq_pending;
+       unsigned int __nmi_count;       /* arch dependent */
+       unsigned int apic_timer_irqs;   /* arch dependent */
++      unsigned int apic_perf_irqs;    /* arch dependent */
+       unsigned int irq0_irqs;
+       unsigned int irq_resched_count;
+       unsigned int irq_call_count;
+       unsigned int irq_tlb_count;
+       unsigned int irq_thermal_count;
+       unsigned int irq_spurious_count;
+       unsigned int irq_threshold_count;
+ } ____cacheline_aligned irq_cpustat_t;
+ DECLARE_PER_CPU(irq_cpustat_t, irq_stat);
  /* We can have at most NR_VECTORS irqs routed to a cpu at a time */
  #define MAX_HARDIRQS_PER_CPU NR_VECTORS
  
  #define __ARCH_IRQ_STAT 1
  
- #define inc_irq_stat(member)  add_pda(member, 1)
+ #define inc_irq_stat(member)  percpu_add(irq_stat.member, 1)
  
- #define local_softirq_pending() read_pda(__softirq_pending)
+ #define local_softirq_pending() percpu_read(irq_stat.__softirq_pending)
  
  #define __ARCH_SET_SOFTIRQ_PENDING 1
  
- #define set_softirq_pending(x) write_pda(__softirq_pending, (x))
- #define or_softirq_pending(x)  or_pda(__softirq_pending, (x))
+ #define set_softirq_pending(x) percpu_write(irq_stat.__softirq_pending, (x))
+ #define or_softirq_pending(x)  percpu_or(irq_stat.__softirq_pending, (x))
  
  extern void ack_bad_irq(unsigned int irq);
  
diff --combined arch/x86/include/asm/irq_vectors.h
index 21a0b92027f5850455b45f467300aad42b887a2f,a16a2ab2b42998a964d20c990ba88365d07bdfe4..1554d0236e03c77cb4004f66dfdf92e5c0b76191
   */
  #define LOCAL_TIMER_VECTOR    0xef
  
 +/*
 + * Performance monitoring interrupt vector:
 + */
 +#define LOCAL_PERF_VECTOR     0xee
 +
  /*
   * First APIC vector available to drivers: (vectors 0x30-0xee) we
   * start at 0x31(0x41) to spread out vectors evenly between priority
  
  #if defined(CONFIG_X86_IO_APIC) && !defined(CONFIG_X86_VOYAGER)
  
+ #include <asm/apicnum.h>      /* need MAX_IO_APICS */
  #ifndef CONFIG_SPARSE_IRQ
  # if NR_CPUS < MAX_IO_APICS
  #  define NR_IRQS (NR_VECTORS + (32 * NR_CPUS))
  #  define NR_IRQS (NR_VECTORS + (32 * MAX_IO_APICS))
  # endif
  #else
- # if (8 * NR_CPUS) > (32 * MAX_IO_APICS)
- #  define NR_IRQS (NR_VECTORS + (8 * NR_CPUS))
- # else
- #  define NR_IRQS (NR_VECTORS + (32 * MAX_IO_APICS))
- # endif
+ # define NR_IRQS                                      \
+       ((8 * NR_CPUS) > (32 * MAX_IO_APICS) ?          \
+               (NR_VECTORS + (8 * NR_CPUS)) :          \
+       (NR_VECTORS + (32 * MAX_IO_APICS)))
  #endif
  
  #elif defined(CONFIG_X86_VOYAGER)
diff --combined arch/x86/include/asm/thread_info.h
index efdf93820aedda3abe54b82f697740f363fd5ffc,b46f8ca007b5754ce1282809d6a7e8669104ed81..f38488989db7c2f473b531bc05b1fc46922ce51f
@@@ -82,7 -82,6 +82,7 @@@ struct thread_info 
  #define TIF_SYSCALL_AUDIT     7       /* syscall auditing active */
  #define TIF_SECCOMP           8       /* secure computing */
  #define TIF_MCE_NOTIFY                10      /* notify userspace of an MCE */
 +#define TIF_PERF_COUNTERS     11      /* notify perf counter work */
  #define TIF_NOTSC             16      /* TSC is not accessible in userland */
  #define TIF_IA32              17      /* 32bit process */
  #define TIF_FORK              18      /* ret_from_fork */
  #define _TIF_SYSCALL_AUDIT    (1 << TIF_SYSCALL_AUDIT)
  #define _TIF_SECCOMP          (1 << TIF_SECCOMP)
  #define _TIF_MCE_NOTIFY               (1 << TIF_MCE_NOTIFY)
 +#define _TIF_PERF_COUNTERS    (1 << TIF_PERF_COUNTERS)
  #define _TIF_NOTSC            (1 << TIF_NOTSC)
  #define _TIF_IA32             (1 << TIF_IA32)
  #define _TIF_FORK             (1 << TIF_FORK)
  
  /* Only used for 64 bit */
  #define _TIF_DO_NOTIFY_MASK                                           \
 -      (_TIF_SIGPENDING|_TIF_MCE_NOTIFY|_TIF_NOTIFY_RESUME)
 +      (_TIF_SIGPENDING|_TIF_MCE_NOTIFY|_TIF_PERF_COUNTERS|_TIF_NOTIFY_RESUME)
  
  /* flags to check in __switch_to() */
  #define _TIF_WORK_CTXSW                                                       \
@@@ -196,25 -194,21 +196,21 @@@ static inline struct thread_info *curre
  
  #else /* X86_32 */
  
- #include <asm/pda.h>
+ #include <asm/percpu.h>
+ #define KERNEL_STACK_OFFSET (5*8)
  
  /*
   * macros/functions for gaining access to the thread information structure
   * preempt_count needs to be 1 initially, until the scheduler is functional.
   */
  #ifndef __ASSEMBLY__
- static inline struct thread_info *current_thread_info(void)
- {
-       struct thread_info *ti;
-       ti = (void *)(read_pda(kernelstack) + PDA_STACKOFFSET - THREAD_SIZE);
-       return ti;
- }
+ DECLARE_PER_CPU(unsigned long, kernel_stack);
  
- /* do not use in interrupt context */
- static inline struct thread_info *stack_thread_info(void)
+ static inline struct thread_info *current_thread_info(void)
  {
        struct thread_info *ti;
-       asm("andq %%rsp,%0; " : "=r" (ti) : "0" (~(THREAD_SIZE - 1)));
+       ti = (void *)(percpu_read(kernel_stack) +
+                     KERNEL_STACK_OFFSET - THREAD_SIZE);
        return ti;
  }
  
  
  /* how to get the thread information struct from ASM */
  #define GET_THREAD_INFO(reg) \
-       movq %gs:pda_kernelstack,reg ; \
-       subq $(THREAD_SIZE-PDA_STACKOFFSET),reg
+       movq PER_CPU_VAR(kernel_stack),reg ; \
+       subq $(THREAD_SIZE-KERNEL_STACK_OFFSET),reg
  
  #endif
  
diff --combined arch/x86/kernel/apic.c
index d2d17b8d10f863f78defdef3f42aeb7f073fa4f8,4857879558346af6a31e519a07a88a0e97abd511..e9af14f748ea95a9ae2f7d374370ade92fd22b96
@@@ -35,7 -35,6 +35,7 @@@
  #include <linux/nmi.h>
  #include <linux/timex.h>
  
 +#include <asm/perf_counter.h>
  #include <asm/atomic.h>
  #include <asm/mtrr.h>
  #include <asm/mpspec.h>
@@@ -48,6 -47,7 +48,7 @@@
  #include <asm/proto.h>
  #include <asm/apic.h>
  #include <asm/i8259.h>
+ #include <asm/smp.h>
  
  #include <mach_apic.h>
  #include <mach_apicdef.h>
@@@ -895,6 -895,10 +896,10 @@@ void disable_local_APIC(void
  {
        unsigned int value;
  
+       /* APIC hasn't been mapped yet */
+       if (!apic_phys)
+               return;
        clear_local_APIC();
  
        /*
@@@ -1126,6 -1130,11 +1131,11 @@@ void __cpuinit setup_local_APIC(void
        unsigned int value;
        int i, j;
  
+       if (disable_apic) {
+               disable_ioapic_setup();
+               return;
+       }
  #ifdef CONFIG_X86_32
        /* Pound the ESR really hard over the head with a big hammer - mbligh */
        if (lapic_is_integrated() && esr_disable) {
                apic_write(APIC_ESR, 0);
        }
  #endif
 +      perf_counters_lapic_init(0);
  
        preempt_disable();
  
@@@ -1567,11 -1575,11 +1577,11 @@@ int apic_version[MAX_APICS]
  
  int __init APIC_init_uniprocessor(void)
  {
- #ifdef CONFIG_X86_64
        if (disable_apic) {
                pr_info("Apic disabled\n");
                return -1;
        }
+ #ifdef CONFIG_X86_64
        if (!cpu_has_apic) {
                disable_apic = 1;
                pr_info("Apic disabled by BIOS\n");
@@@ -1869,17 -1877,8 +1879,8 @@@ void __cpuinit generic_processor_info(i
  #endif
  
  #if defined(CONFIG_X86_SMP) || defined(CONFIG_X86_64)
-       /* are we being called early in kernel startup? */
-       if (early_per_cpu_ptr(x86_cpu_to_apicid)) {
-               u16 *cpu_to_apicid = early_per_cpu_ptr(x86_cpu_to_apicid);
-               u16 *bios_cpu_apicid = early_per_cpu_ptr(x86_bios_cpu_apicid);
-               cpu_to_apicid[cpu] = apicid;
-               bios_cpu_apicid[cpu] = apicid;
-       } else {
-               per_cpu(x86_cpu_to_apicid, cpu) = apicid;
-               per_cpu(x86_bios_cpu_apicid, cpu) = apicid;
-       }
+       early_per_cpu(x86_cpu_to_apicid, cpu) = apicid;
+       early_per_cpu(x86_bios_cpu_apicid, cpu) = apicid;
  #endif
  
        set_cpu_possible(cpu, true);
diff --combined arch/x86/kernel/cpu/common.c
index 667e5d561ed77f39fadbc547421bef1f2832910b,7976a6a0f65c1c626c4fb983e7c45f7f6d84198d..95eb30e1e677d467c7185d7992789870469d2196
  #include <asm/mmu_context.h>
  #include <asm/mtrr.h>
  #include <asm/mce.h>
 +#include <asm/perf_counter.h>
  #include <asm/pat.h>
  #include <asm/asm.h>
  #include <asm/numa.h>
  #include <asm/smp.h>
+ #include <asm/cpu.h>
+ #include <asm/cpumask.h>
  #ifdef CONFIG_X86_LOCAL_APIC
  #include <asm/mpspec.h>
  #include <asm/apic.h>
@@@ -773,7 -774,6 +775,7 @@@ void __init identify_boot_cpu(void
  #else
        vgetcpu_set_mode();
  #endif
 +      init_hw_perf_counters();
  }
  
  void __cpuinit identify_secondary_cpu(struct cpuinfo_x86 *c)
@@@ -879,54 -879,34 +881,34 @@@ static __init int setup_disablecpuid(ch
  __setup("clearcpuid=", setup_disablecpuid);
  
  #ifdef CONFIG_X86_64
- struct x8664_pda **_cpu_pda __read_mostly;
- EXPORT_SYMBOL(_cpu_pda);
  struct desc_ptr idt_descr = { 256 * 16 - 1, (unsigned long) idt_table };
  
- static char boot_cpu_stack[IRQSTACKSIZE] __page_aligned_bss;
+ DEFINE_PER_CPU_PAGE_ALIGNED(char[IRQ_STACK_SIZE], irq_stack);
+ #ifdef CONFIG_SMP
+ DEFINE_PER_CPU(char *, irq_stack_ptr);        /* will be set during per cpu init */
+ #else
+ DEFINE_PER_CPU(char *, irq_stack_ptr) =
+       per_cpu_var(irq_stack) + IRQ_STACK_SIZE - 64;
+ #endif
+ DEFINE_PER_CPU(unsigned long, kernel_stack) =
+       (unsigned long)&init_thread_union - KERNEL_STACK_OFFSET + THREAD_SIZE;
+ EXPORT_PER_CPU_SYMBOL(kernel_stack);
+ DEFINE_PER_CPU(unsigned int, irq_count) = -1;
  
  void __cpuinit pda_init(int cpu)
  {
-       struct x8664_pda *pda = cpu_pda(cpu);
        /* Setup up data that may be needed in __get_free_pages early */
        loadsegment(fs, 0);
        loadsegment(gs, 0);
-       /* Memory clobbers used to order PDA accessed */
-       mb();
-       wrmsrl(MSR_GS_BASE, pda);
-       mb();
-       pda->cpunumber = cpu;
-       pda->irqcount = -1;
-       pda->kernelstack = (unsigned long)stack_thread_info() -
-                                PDA_STACKOFFSET + THREAD_SIZE;
-       pda->active_mm = &init_mm;
-       pda->mmu_state = 0;
-       if (cpu == 0) {
-               /* others are initialized in smpboot.c */
-               pda->pcurrent = &init_task;
-               pda->irqstackptr = boot_cpu_stack;
-               pda->irqstackptr += IRQSTACKSIZE - 64;
-       } else {
-               if (!pda->irqstackptr) {
-                       pda->irqstackptr = (char *)
-                               __get_free_pages(GFP_ATOMIC, IRQSTACK_ORDER);
-                       if (!pda->irqstackptr)
-                               panic("cannot allocate irqstack for cpu %d",
-                                     cpu);
-                       pda->irqstackptr += IRQSTACKSIZE - 64;
-               }
  
-               if (pda->nodenumber == 0 && cpu_to_node(cpu) != NUMA_NO_NODE)
-                       pda->nodenumber = cpu_to_node(cpu);
-       }
+       load_pda_offset(cpu);
  }
  
- static char boot_exception_stacks[(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ +
-                                 DEBUG_STKSZ] __page_aligned_bss;
+ static DEFINE_PER_CPU_PAGE_ALIGNED(char, exception_stacks
+       [(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ])
+       __aligned(PAGE_SIZE);
  
  extern asmlinkage void ignore_sysret(void);
  
@@@ -984,15 -964,18 +966,18 @@@ void __cpuinit cpu_init(void
        struct tss_struct *t = &per_cpu(init_tss, cpu);
        struct orig_ist *orig_ist = &per_cpu(orig_ist, cpu);
        unsigned long v;
-       char *estacks = NULL;
        struct task_struct *me;
        int i;
  
        /* CPU 0 is initialised in head64.c */
        if (cpu != 0)
                pda_init(cpu);
-       else
-               estacks = boot_exception_stacks;
+ #ifdef CONFIG_NUMA
+       if (cpu != 0 && percpu_read(node_number) == 0 &&
+           cpu_to_node(cpu) != NUMA_NO_NODE)
+               percpu_write(node_number, cpu_to_node(cpu));
+ #endif
  
        me = current;
  
         * set up and load the per-CPU TSS
         */
        if (!orig_ist->ist[0]) {
-               static const unsigned int order[N_EXCEPTION_STACKS] = {
-                 [0 ... N_EXCEPTION_STACKS - 1] = EXCEPTION_STACK_ORDER,
-                 [DEBUG_STACK - 1] = DEBUG_STACK_ORDER
+               static const unsigned int sizes[N_EXCEPTION_STACKS] = {
+                 [0 ... N_EXCEPTION_STACKS - 1] = EXCEPTION_STKSZ,
+                 [DEBUG_STACK - 1] = DEBUG_STKSZ
                };
+               char *estacks = per_cpu(exception_stacks, cpu);
                for (v = 0; v < N_EXCEPTION_STACKS; v++) {
-                       if (cpu) {
-                               estacks = (char *)__get_free_pages(GFP_ATOMIC, order[v]);
-                               if (!estacks)
-                                       panic("Cannot allocate exception "
-                                             "stack %ld %d\n", v, cpu);
-                       }
-                       estacks += PAGE_SIZE << order[v];
+                       estacks += sizes[v];
                        orig_ist->ist[v] = t->x86_tss.ist[v] =
                                        (unsigned long)estacks;
                }
diff --combined arch/x86/kernel/entry_64.S
index 1954a96622036f3409d5381c4679d5260a111eaa,c52b60919163af41af58a7068fcba266fb0c657e..c092e7d2686d3015d655cf08beb06796e928ab01
@@@ -52,6 -52,7 +52,7 @@@
  #include <asm/irqflags.h>
  #include <asm/paravirt.h>
  #include <asm/ftrace.h>
+ #include <asm/percpu.h>
  
  /* Avoid __ASSEMBLER__'ifying <linux/audit.h> just for this.  */
  #include <linux/elf-em.h>
@@@ -209,7 -210,7 +210,7 @@@ ENTRY(native_usergs_sysret64
  
        /* %rsp:at FRAMEEND */
        .macro FIXUP_TOP_OF_STACK tmp offset=0
-       movq %gs:pda_oldrsp,\tmp
+       movq PER_CPU_VAR(old_rsp),\tmp
        movq \tmp,RSP+\offset(%rsp)
        movq $__USER_DS,SS+\offset(%rsp)
        movq $__USER_CS,CS+\offset(%rsp)
  
        .macro RESTORE_TOP_OF_STACK tmp offset=0
        movq RSP+\offset(%rsp),\tmp
-       movq \tmp,%gs:pda_oldrsp
+       movq \tmp,PER_CPU_VAR(old_rsp)
        movq EFLAGS+\offset(%rsp),\tmp
        movq \tmp,R11+\offset(%rsp)
        .endm
@@@ -336,15 -337,15 +337,15 @@@ ENTRY(save_args
        je 1f
        SWAPGS
        /*
-        * irqcount is used to check if a CPU is already on an interrupt stack
+        * irq_count is used to check if a CPU is already on an interrupt stack
         * or not. While this is essentially redundant with preempt_count it is
         * a little cheaper to use a separate counter in the PDA (short of
         * moving irq_enter into assembly, which would be too much work)
         */
- 1:    incl %gs:pda_irqcount
+ 1:    incl PER_CPU_VAR(irq_count)
        jne 2f
        popq_cfi %rax                   /* move return address... */
-       mov %gs:pda_irqstackptr,%rsp
+       mov PER_CPU_VAR(irq_stack_ptr),%rsp
        EMPTY_FRAME 0
        pushq_cfi %rax                  /* ... to the new stack */
        /*
@@@ -467,7 -468,7 +468,7 @@@ END(ret_from_fork
  ENTRY(system_call)
        CFI_STARTPROC   simple
        CFI_SIGNAL_FRAME
-       CFI_DEF_CFA     rsp,PDA_STACKOFFSET
+       CFI_DEF_CFA     rsp,KERNEL_STACK_OFFSET
        CFI_REGISTER    rip,rcx
        /*CFI_REGISTER  rflags,r11*/
        SWAPGS_UNSAFE_STACK
         */
  ENTRY(system_call_after_swapgs)
  
-       movq    %rsp,%gs:pda_oldrsp
-       movq    %gs:pda_kernelstack,%rsp
+       movq    %rsp,PER_CPU_VAR(old_rsp)
+       movq    PER_CPU_VAR(kernel_stack),%rsp
        /*
         * No need to follow this irqs off/on section - it's straight
         * and short:
@@@ -522,7 -523,7 +523,7 @@@ sysret_check
        CFI_REGISTER    rip,rcx
        RESTORE_ARGS 0,-ARG_SKIP,1
        /*CFI_REGISTER  rflags,r11*/
-       movq    %gs:pda_oldrsp, %rsp
+       movq    PER_CPU_VAR(old_rsp), %rsp
        USERGS_SYSRET64
  
        CFI_RESTORE_STATE
@@@ -832,11 -833,11 +833,11 @@@ common_interrupt
        XCPT_FRAME
        addq $-0x80,(%rsp)              /* Adjust vector to [-256,-1] range */
        interrupt do_IRQ
-       /* 0(%rsp): oldrsp-ARGOFFSET */
+       /* 0(%rsp): old_rsp-ARGOFFSET */
  ret_from_intr:
        DISABLE_INTERRUPTS(CLBR_NONE)
        TRACE_IRQS_OFF
-       decl %gs:pda_irqcount
+       decl PER_CPU_VAR(irq_count)
        leaveq
        CFI_DEF_CFA_REGISTER    rsp
        CFI_ADJUST_CFA_OFFSET   -8
@@@ -1024,11 -1025,6 +1025,11 @@@ apicinterrupt ERROR_APIC_VECTOR 
  apicinterrupt SPURIOUS_APIC_VECTOR \
        spurious_interrupt smp_spurious_interrupt
  
 +#ifdef CONFIG_PERF_COUNTERS
 +apicinterrupt LOCAL_PERF_VECTOR \
 +      perf_counter_interrupt smp_perf_counter_interrupt
 +#endif
 +
  /*
   * Exception entry points.
   */
@@@ -1077,10 -1073,10 +1078,10 @@@ ENTRY(\sym
        TRACE_IRQS_OFF
        movq %rsp,%rdi          /* pt_regs pointer */
        xorl %esi,%esi          /* no error code */
-       movq %gs:pda_data_offset, %rbp
-       subq $EXCEPTION_STKSZ, per_cpu__init_tss + TSS_ist + (\ist - 1) * 8(%rbp)
+       PER_CPU(init_tss, %rbp)
+       subq $EXCEPTION_STKSZ, TSS_ist + (\ist - 1) * 8(%rbp)
        call \do_sym
-       addq $EXCEPTION_STKSZ, per_cpu__init_tss + TSS_ist + (\ist - 1) * 8(%rbp)
+       addq $EXCEPTION_STKSZ, TSS_ist + (\ist - 1) * 8(%rbp)
        jmp paranoid_exit       /* %ebx: no swapgs flag */
        CFI_ENDPROC
  END(\sym)
@@@ -1264,14 -1260,14 +1265,14 @@@ ENTRY(call_softirq
        CFI_REL_OFFSET rbp,0
        mov  %rsp,%rbp
        CFI_DEF_CFA_REGISTER rbp
-       incl %gs:pda_irqcount
-       cmove %gs:pda_irqstackptr,%rsp
+       incl PER_CPU_VAR(irq_count)
+       cmove PER_CPU_VAR(irq_stack_ptr),%rsp
        push  %rbp                      # backlink for old unwinder
        call __do_softirq
        leaveq
        CFI_DEF_CFA_REGISTER    rsp
        CFI_ADJUST_CFA_OFFSET   -8
-       decl %gs:pda_irqcount
+       decl PER_CPU_VAR(irq_count)
        ret
        CFI_ENDPROC
  END(call_softirq)
@@@ -1301,15 -1297,15 +1302,15 @@@ ENTRY(xen_do_hypervisor_callback)   # d
        movq %rdi, %rsp            # we don't return, adjust the stack frame
        CFI_ENDPROC
        DEFAULT_FRAME
- 11:   incl %gs:pda_irqcount
+ 11:   incl PER_CPU_VAR(irq_count)
        movq %rsp,%rbp
        CFI_DEF_CFA_REGISTER rbp
-       cmovzq %gs:pda_irqstackptr,%rsp
+       cmovzq PER_CPU_VAR(irq_stack_ptr),%rsp
        pushq %rbp                      # backlink for old unwinder
        call xen_evtchn_do_upcall
        popq %rsp
        CFI_DEF_CFA_REGISTER rsp
-       decl %gs:pda_irqcount
+       decl PER_CPU_VAR(irq_count)
        jmp  error_exit
        CFI_ENDPROC
  END(do_hypervisor_callback)
diff --combined arch/x86/kernel/irq.c
index 22f650db917fc2615f34741b11035bb74a6d5656,8b30d0c2512cefa30ac7492039d3b014a8817699..a6bca1d33a8aca4e701dc8b9ad7e4d6ffda1d2ef
@@@ -36,11 -36,7 +36,7 @@@ void ack_bad_irq(unsigned int irq
  #endif
  }
  
- #ifdef CONFIG_X86_32
- # define irq_stats(x)         (&per_cpu(irq_stat, x))
- #else
- # define irq_stats(x)         cpu_pda(x)
- #endif
+ #define irq_stats(x)          (&per_cpu(irq_stat, x))
  /*
   * /proc/interrupts printing:
   */
@@@ -57,10 -53,6 +53,10 @@@ static int show_other_interrupts(struc
        for_each_online_cpu(j)
                seq_printf(p, "%10u ", irq_stats(j)->apic_timer_irqs);
        seq_printf(p, "  Local timer interrupts\n");
 +      seq_printf(p, "CNT: ");
 +      for_each_online_cpu(j)
 +              seq_printf(p, "%10u ", irq_stats(j)->apic_perf_irqs);
 +      seq_printf(p, "  Performance counter interrupts\n");
  #endif
  #ifdef CONFIG_SMP
        seq_printf(p, "RES: ");
@@@ -168,7 -160,6 +164,7 @@@ u64 arch_irq_stat_cpu(unsigned int cpu
  
  #ifdef CONFIG_X86_LOCAL_APIC
        sum += irq_stats(cpu)->apic_timer_irqs;
 +      sum += irq_stats(cpu)->apic_perf_irqs;
  #endif
  #ifdef CONFIG_SMP
        sum += irq_stats(cpu)->irq_resched_count;
diff --combined kernel/Makefile
index 8b2628c7914b039a90be2647b1b8926751d2dc7a,2aebc4cd787810a5e71ca4ea4aa35301220a6355..e4115926c536af5548e2e9bc6a6c30f945776b71
@@@ -40,7 -40,11 +40,11 @@@ obj-$(CONFIG_RT_MUTEXES) += rtmutex.
  obj-$(CONFIG_DEBUG_RT_MUTEXES) += rtmutex-debug.o
  obj-$(CONFIG_RT_MUTEX_TESTER) += rtmutex-tester.o
  obj-$(CONFIG_GENERIC_ISA_DMA) += dma.o
- obj-$(CONFIG_USE_GENERIC_SMP_HELPERS) += smp.o
+ ifeq ($(CONFIG_USE_GENERIC_SMP_HELPERS),y)
+ obj-y += smp.o
+ else
+ obj-y += up.o
+ endif
  obj-$(CONFIG_SMP) += spinlock.o
  obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock.o
  obj-$(CONFIG_PROVE_LOCKING) += spinlock.o
@@@ -89,7 -93,6 +93,7 @@@ obj-$(CONFIG_HAVE_GENERIC_DMA_COHERENT
  obj-$(CONFIG_FUNCTION_TRACER) += trace/
  obj-$(CONFIG_TRACING) += trace/
  obj-$(CONFIG_SMP) += sched_cpupri.o
 +obj-$(CONFIG_PERF_COUNTERS) += perf_counter.o
  
  ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y)
  # According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is
diff --combined kernel/sched.c
index 43fd21233b93bb1c350673621f64c1508217fb20,8be2c13b50d018cc69280829e14e31612d4deddd..ce9fecab5f0201c054ed3a6b477fc0745025f1db
@@@ -125,6 -125,9 +125,9 @@@ DEFINE_TRACE(sched_switch)
  DEFINE_TRACE(sched_migrate_task);
  
  #ifdef CONFIG_SMP
+ static void double_rq_lock(struct rq *rq1, struct rq *rq2);
  /*
   * Divide a load by a sched group cpu_power : (load / sg->__cpu_power)
   * Since cpu_power is a 'constant', we can use a reciprocal divide.
@@@ -665,7 -668,7 +668,7 @@@ static inline int cpu_of(struct rq *rq
  #define task_rq(p)            cpu_rq(task_cpu(p))
  #define cpu_curr(cpu)         (cpu_rq(cpu)->curr)
  
 -static inline void update_rq_clock(struct rq *rq)
 +inline void update_rq_clock(struct rq *rq)
  {
        rq->clock = sched_clock_cpu(cpu_of(rq));
  }
@@@ -976,26 -979,6 +979,26 @@@ static struct rq *task_rq_lock(struct t
        }
  }
  
 +void curr_rq_lock_irq_save(unsigned long *flags)
 +      __acquires(rq->lock)
 +{
 +      struct rq *rq;
 +
 +      local_irq_save(*flags);
 +      rq = cpu_rq(smp_processor_id());
 +      spin_lock(&rq->lock);
 +}
 +
 +void curr_rq_unlock_irq_restore(unsigned long *flags)
 +      __releases(rq->lock)
 +{
 +      struct rq *rq;
 +
 +      rq = cpu_rq(smp_processor_id());
 +      spin_unlock(&rq->lock);
 +      local_irq_restore(*flags);
 +}
 +
  void task_rq_unlock_wait(struct task_struct *p)
  {
        struct rq *rq = task_rq(p);
@@@ -1902,14 -1885,12 +1905,14 @@@ void set_task_cpu(struct task_struct *p
                p->se.sleep_start -= clock_offset;
        if (p->se.block_start)
                p->se.block_start -= clock_offset;
 +#endif
        if (old_cpu != new_cpu) {
 -              schedstat_inc(p, se.nr_migrations);
 +              p->se.nr_migrations++;
 +#ifdef CONFIG_SCHEDSTATS
                if (task_hot(p, old_rq->clock, NULL))
                        schedstat_inc(p, se.nr_forced2_migrations);
 -      }
  #endif
 +      }
        p->se.vruntime -= old_cfsrq->min_vruntime -
                                         new_cfsrq->min_vruntime;
  
@@@ -2261,27 -2242,6 +2264,27 @@@ static int sched_balance_self(int cpu, 
  
  #endif /* CONFIG_SMP */
  
 +/**
 + * task_oncpu_function_call - call a function on the cpu on which a task runs
 + * @p:                the task to evaluate
 + * @func:     the function to be called
 + * @info:     the function call argument
 + *
 + * Calls the function @func when the task is currently running. This might
 + * be on the current CPU, which just calls the function directly
 + */
 +void task_oncpu_function_call(struct task_struct *p,
 +                            void (*func) (void *info), void *info)
 +{
 +      int cpu;
 +
 +      preempt_disable();
 +      cpu = task_cpu(p);
 +      if (task_curr(p))
 +              smp_call_function_single(cpu, func, info, 1);
 +      preempt_enable();
 +}
 +
  /***
   * try_to_wake_up - wake up a thread
   * @p: the to-be-woken-up thread
@@@ -2424,7 -2384,6 +2427,7 @@@ static void __sched_fork(struct task_st
        p->se.exec_start                = 0;
        p->se.sum_exec_runtime          = 0;
        p->se.prev_sum_exec_runtime     = 0;
 +      p->se.nr_migrations             = 0;
        p->se.last_wakeup               = 0;
        p->se.avg_overlap               = 0;
  
@@@ -2645,7 -2604,6 +2648,7 @@@ static void finish_task_switch(struct r
         */
        prev_state = prev->state;
        finish_arch_switch(prev);
 +      perf_counter_task_sched_in(current, cpu_of(rq));
        finish_lock_switch(rq, prev);
  #ifdef CONFIG_SMP
        if (current->sched_class->post_schedule)
@@@ -4170,29 -4128,6 +4173,29 @@@ DEFINE_PER_CPU(struct kernel_stat, ksta
  
  EXPORT_PER_CPU_SYMBOL(kstat);
  
 +/*
 + * Return any ns on the sched_clock that have not yet been banked in
 + * @p in case that task is currently running.
 + */
 +unsigned long long __task_delta_exec(struct task_struct *p, int update)
 +{
 +      s64 delta_exec;
 +      struct rq *rq;
 +
 +      rq = task_rq(p);
 +      WARN_ON_ONCE(!runqueue_is_locked());
 +      WARN_ON_ONCE(!task_current(rq, p));
 +
 +      if (update)
 +              update_rq_clock(rq);
 +
 +      delta_exec = rq->clock - p->se.exec_start;
 +
 +      WARN_ON_ONCE(delta_exec < 0);
 +
 +      return delta_exec;
 +}
 +
  /*
   * Return any ns on the sched_clock that have not yet been banked in
   * @p in case that task is currently running.
@@@ -4456,7 -4391,6 +4459,7 @@@ void scheduler_tick(void
        update_rq_clock(rq);
        update_cpu_load(rq);
        curr->sched_class->task_tick(rq, curr, 0);
 +      perf_counter_task_tick(curr, cpu);
        spin_unlock(&rq->lock);
  
  #ifdef CONFIG_SMP
@@@ -4652,7 -4586,6 +4655,7 @@@ need_resched_nonpreemptible
  
        if (likely(prev != next)) {
                sched_info_switch(prev, next);
 +              perf_counter_task_sched_out(prev, cpu);
  
                rq->nr_switches++;
                rq->curr = next;
@@@ -7352,10 -7285,10 +7355,10 @@@ cpu_to_phys_group(int cpu, const struc
   * groups, so roll our own. Now each node has its own list of groups which
   * gets dynamically allocated.
   */
- static DEFINE_PER_CPU(struct sched_domain, node_domains);
+ static DEFINE_PER_CPU(struct static_sched_domain, node_domains);
  static struct sched_group ***sched_group_nodes_bycpu;
  
- static DEFINE_PER_CPU(struct sched_domain, allnodes_domains);
+ static DEFINE_PER_CPU(struct static_sched_domain, allnodes_domains);
  static DEFINE_PER_CPU(struct static_sched_group, sched_group_allnodes);
  
  static int cpu_to_allnodes_group(int cpu, const struct cpumask *cpu_map,
@@@ -7630,7 -7563,7 +7633,7 @@@ static int __build_sched_domains(const 
  #ifdef CONFIG_NUMA
                if (cpumask_weight(cpu_map) >
                                SD_NODES_PER_DOMAIN*cpumask_weight(nodemask)) {
-                       sd = &per_cpu(allnodes_domains, i);
+                       sd = &per_cpu(allnodes_domains, i).sd;
                        SD_INIT(sd, ALLNODES);
                        set_domain_attribute(sd, attr);
                        cpumask_copy(sched_domain_span(sd), cpu_map);
                } else
                        p = NULL;
  
-               sd = &per_cpu(node_domains, i);
+               sd = &per_cpu(node_domains, i).sd;
                SD_INIT(sd, NODE);
                set_domain_attribute(sd, attr);
                sched_domain_node_span(cpu_to_node(i), sched_domain_span(sd));
                for_each_cpu(j, nodemask) {
                        struct sched_domain *sd;
  
-                       sd = &per_cpu(node_domains, j);
+                       sd = &per_cpu(node_domains, j).sd;
                        sd->groups = sg;
                }
                sg->__cpu_power = 0;