Merge commit 'v2.6.29-rc1' into perfcounters/core

author Ingo Molnar <mingo@elte.hu>

Sun, 11 Jan 2009 01:42:53 +0000 (02:42 +0100)

committer Ingo Molnar <mingo@elte.hu>

Sun, 11 Jan 2009 01:42:53 +0000 (02:42 +0100)
author Ingo Molnar <mingo@elte.hu>
Sun, 11 Jan 2009 01:42:53 +0000 (02:42 +0100)
committer Ingo Molnar <mingo@elte.hu>
Sun, 11 Jan 2009 01:42:53 +0000 (02:42 +0100)
diff --combined arch/x86/Kconfig

index f3921028038c84d7e910fddc38fff36b71ee861d,73f7fe8fd4d1c52d0e6851745067035fc008821e..1f4844505765cb8d22506bc5aa22bdb4f40965ae
--- 1/arch/x86/Kconfig
--- 2/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@@ -27,6 -27,7 +27,7 @@@ config X8
         select HAVE_IOREMAP_PROT
         select HAVE_KPROBES
         select ARCH_WANT_OPTIONAL_GPIOLIB
+       select ARCH_WANT_FRAME_POINTERS
         select HAVE_KRETPROBES
         select HAVE_FTRACE_MCOUNT_RECORD
         select HAVE_DYNAMIC_FTRACE
@@@ -247,6 -248,28 +248,28 @@@ config X86_HAS_BOOT_CPU_I
         def_bool y
         depends on X86_VOYAGER
   
+ config SPARSE_IRQ
+       bool "Support sparse irq numbering"
+       depends on PCI_MSI || HT_IRQ
+       help
+         This enables support for sparse irqs. This is useful for distro
+         kernels that want to define a high CONFIG_NR_CPUS value but still
+         want to have low kernel memory footprint on smaller machines.
+ 
+         ( Sparse IRQs can also be beneficial on NUMA boxes, as they spread
+           out the irq_desc[] array in a more NUMA-friendly way. )
+ 
+         If you don't know what to do here, say N.
+ 
+ config NUMA_MIGRATE_IRQ_DESC
+       bool "Move irq desc when changing irq smp_affinity"
+       depends on SPARSE_IRQ && NUMA
+       default n
+       help
+         This moves the irq_desc to the cpu/node that will handle the irq.
+ 
+         If you don't know what to do here, say N.
+ 
   config X86_FIND_SMP_CONFIG
         def_bool y
         depends on X86_MPPARSE || X86_VOYAGER
@@@ -479,7 -502,7 +502,7 @@@ config HPET_TIME
            The HPET provides a stable time base on SMP
            systems, unlike the TSC, but it is more expensive to access,
            as it is off-chip.  You can find the HPET spec at
-          <http://www.intel.com/hardwaredesign/hpetspec.htm>.
+          <http://www.intel.com/hardwaredesign/hpetspec_1.pdf>.
   
            You can safely choose Y here.  However, HPET will only be
            activated if the platform and the BIOS support this feature.
@@@ -564,6 -587,16 +587,16 @@@ config AMD_IOMM
           your BIOS for an option to enable it or if you have an IVRS ACPI
           table.
   
+ config AMD_IOMMU_STATS
+       bool "Export AMD IOMMU statistics to debugfs"
+       depends on AMD_IOMMU
+       select DEBUG_FS
+       help
+         This option enables code in the AMD IOMMU driver to collect various
+         statistics about what's happening in the driver and exports that
+         information to userspace via debugfs.
+         If unsure, say N.
+ 
   # need this always selected by IOMMU for the VIA workaround
   config SWIOTLB
         def_bool y if X86_64
@@@ -577,21 -610,25 +610,25 @@@
   config IOMMU_HELPER
         def_bool (CALGARY_IOMMU || GART_IOMMU || SWIOTLB || AMD_IOMMU)
   
+ config IOMMU_API
+       def_bool (AMD_IOMMU || DMAR)
+ 
   config MAXSMP
         bool "Configure Maximum number of SMP Processors and NUMA Nodes"
-       depends on X86_64 && SMP && BROKEN
+       depends on X86_64 && SMP && DEBUG_KERNEL && EXPERIMENTAL
+       select CPUMASK_OFFSTACK
         default n
         help
           Configure maximum number of CPUS and NUMA Nodes for this architecture.
           If unsure, say N.
   
   config NR_CPUS
-       int "Maximum number of CPUs (2-512)" if !MAXSMP
-       range 2 512
-       depends on SMP
+       int "Maximum number of CPUs" if SMP && !MAXSMP
+       range 2 512 if SMP && !MAXSMP
+       default "1" if !SMP
         default "4096" if MAXSMP
-       default "32" if X86_NUMAQ || X86_SUMMIT || X86_BIGSMP || X86_ES7000
-       default "8"
+       default "32" if SMP && (X86_NUMAQ || X86_SUMMIT || X86_BIGSMP || X86_ES7000)
+       default "8" if SMP
         help
           This allows you to specify the maximum number of CPUs which this
           kernel will support.  The maximum supported value is 512 and the
@@@ -648,7 -685,6 +685,7 @@@ config X86_UP_IOAPI
   config X86_LOCAL_APIC
         def_bool y
         depends on X86_64 || (X86_32 && (X86_UP_APIC || (SMP && !X86_VOYAGER) || X86_GENERICARCH))
+ +      select HAVE_PERF_COUNTERS if (!M386 && !M486)
   
   config X86_IO_APIC
         def_bool y
diff --combined arch/x86/include/asm/atomic_32.h

index 9927e01b03c2c281cff96bad0c36ab3ab500da42,85b46fba4229cc0334e05d5d9c5e66deff3fa69f..977250ed8b898c17a869e759ec431b71b55177f8
--- 1/arch/x86/include/asm/atomic_32.h
--- 2/arch/x86/include/asm/atomic_32.h
+++ b/arch/x86/include/asm/atomic_32.h
@@@ -2,6 -2,7 +2,7 @@@
   #define _ASM_X86_ATOMIC_32_H
   
   #include <linux/compiler.h>
+ #include <linux/types.h>
   #include <asm/processor.h>
   #include <asm/cmpxchg.h>
   
@@@ -10,15 -11,6 +11,6 @@@
    * resource counting etc..
    */
   
- /*
-  * Make sure gcc doesn't try to be clever and move things around
-  * on us. We need to use _exactly_ the address the user gave us,
-  * not some alias that contains the same information.
-  */
- typedef struct {
-       int counter;
- } atomic_t;
- 
   #define ATOMIC_INIT(i)        { (i) }
   
   /**
@@@ -255,223 -247,5 +247,223 @@@ static inline int atomic_add_unless(ato
   #define smp_mb__before_atomic_inc()   barrier()
   #define smp_mb__after_atomic_inc()    barrier()
   
+ +/* A 64-bit atomic type */
+ +
+ +typedef struct {
+ +      unsigned long long counter;
+ +} atomic64_t;
+ +
+ +#define ATOMIC64_INIT(val)    { (val) }
+ +
+ +/**
+ + * __atomic64_read - raw (non-atomic) read of an atomic64 variable
+ + * @ptr: pointer of type atomic64_t
+ + *
+ + * Plain load of @ptr->counter; use atomic64_read() for an atomic read.
+ + * Doesn't imply a read memory barrier.
+ + */
+ +#define __atomic64_read(ptr)          ((ptr)->counter)
+ +
+ +static inline unsigned long long
+ +cmpxchg8b(unsigned long long *ptr, unsigned long long old, unsigned long long new)
+ +{
+ +      asm volatile(
+ +
+ +              LOCK_PREFIX "cmpxchg8b (%[ptr])\n"
+ +
+ +                   :          "=A" (old)
+ +
+ +                   : [ptr]    "D" (ptr),
+ +                              "A" (old),
+ +                              "b" (ll_low(new)),
+ +                              "c" (ll_high(new))
+ +
+ +                   : "memory");
+ +
+ +      return old;
+ +}
+ +
+ +static inline unsigned long long
+ +atomic64_cmpxchg(atomic64_t *ptr, unsigned long long old_val,
+ +               unsigned long long new_val)
+ +{
+ +      return cmpxchg8b(&ptr->counter, old_val, new_val);
+ +}
+ +
+ +/**
+ + * atomic64_set - set atomic64 variable
+ + * @ptr:      pointer to type atomic64_t
+ + * @new_val:  value to assign
+ + *
+ + * Atomically sets @ptr to @new_val, retrying atomic64_cmpxchg() until it wins.
+ + */
+ +static inline void atomic64_set(atomic64_t *ptr, unsigned long long new_val)
+ +{
+ +      unsigned long long old_val;
+ +
+ +      do {
+ +              old_val = __atomic64_read(ptr); /* 64-bit accessor, not atomic_read() */
+ +      } while (atomic64_cmpxchg(ptr, old_val, new_val) != old_val);
+ +}
+ +
+ +/**
+ + * atomic64_read - read atomic64 variable
+ + * @ptr:      pointer to type atomic64_t
+ + *
+ + * Atomically reads the value of @ptr and returns it.
+ + */
+ +static inline unsigned long long atomic64_read(atomic64_t *ptr)
+ +{
+ +      unsigned long long curr_val;
+ +
+ +      do {
+ +              curr_val = __atomic64_read(ptr);
+ +      } while (atomic64_cmpxchg(ptr, curr_val, curr_val) != curr_val);
+ +
+ +      return curr_val;
+ +}
+ +
+ +/**
+ + * atomic64_add_return - add and return
+ + * @delta: integer value to add
+ + * @ptr:   pointer to type atomic64_t
+ + *
+ + * Atomically adds @delta to @ptr and returns the new value (@delta + *@ptr).
+ + */
+ +static inline unsigned long long
+ +atomic64_add_return(unsigned long long delta, atomic64_t *ptr)
+ +{
+ +      unsigned long long old_val, new_val;
+ +
+ +      do {
+ +              old_val = __atomic64_read(ptr); /* 64-bit accessor, not atomic_read() */
+ +              new_val = old_val + delta;
+ +              /* cmpxchg fails, and we retry, if *ptr changed since the snapshot */
+ +      } while (atomic64_cmpxchg(ptr, old_val, new_val) != old_val);
+ +
+ +      return new_val;
+ +}
+ +
+ +static inline long long atomic64_sub_return(unsigned long long delta, atomic64_t *ptr)
+ +{
+ +      return atomic64_add_return(-delta, ptr); /* 64-bit result; 'long' truncated it */
+ +}
+ +
+ +static inline long long atomic64_inc_return(atomic64_t *ptr)
+ +{
+ +      return atomic64_add_return(1, ptr); /* new value, full 64 bits */
+ +}
+ +
+ +static inline long long atomic64_dec_return(atomic64_t *ptr)
+ +{
+ +      return atomic64_sub_return(1, ptr); /* new value, full 64 bits */
+ +}
+ +
+ +/**
+ + * atomic64_add - add integer to atomic64 variable
+ + * @delta: integer value to add
+ + * @ptr:   pointer to type atomic64_t
+ + *
+ + * Atomically adds @delta to @ptr.
+ + */
+ +static inline void atomic64_add(unsigned long long delta, atomic64_t *ptr)
+ +{
+ +      atomic64_add_return(delta, ptr);
+ +}
+ +
+ +/**
+ + * atomic64_sub - subtract integer from atomic64 variable
+ + * @delta: integer value to subtract
+ + * @ptr:   pointer to type atomic64_t
+ + *
+ + * Atomically subtracts @delta from @ptr.
+ + */
+ +static inline void atomic64_sub(unsigned long long delta, atomic64_t *ptr)
+ +{
+ +      atomic64_add(-delta, ptr);
+ +}
+ +
+ +/**
+ + * atomic64_sub_and_test - subtract value from variable and test result
+ + * @delta: integer value to subtract
+ + * @ptr:   pointer to type atomic64_t
+ + *
+ + * Atomically subtracts @delta from @ptr and returns
+ + * true if the full 64-bit result is zero, or false
+ + * for all other cases.
+ + */
+ +static inline int
+ +atomic64_sub_and_test(unsigned long long delta, atomic64_t *ptr)
+ +{
+ +      unsigned long long new_val = atomic64_add_return(-delta, ptr);
+ +
+ +      return new_val == 0;
+ +}
+ +
+ +/**
+ + * atomic64_inc - increment atomic64 variable
+ + * @ptr: pointer to type atomic64_t
+ + *
+ + * Atomically increments @ptr by 1.
+ + */
+ +static inline void atomic64_inc(atomic64_t *ptr)
+ +{
+ +      atomic64_add(1, ptr);
+ +}
+ +
+ +/**
+ + * atomic64_dec - decrement atomic64 variable
+ + * @ptr: pointer to type atomic64_t
+ + *
+ + * Atomically decrements @ptr by 1.
+ + */
+ +static inline void atomic64_dec(atomic64_t *ptr)
+ +{
+ +      atomic64_sub(1, ptr);
+ +}
+ +
+ +/**
+ + * atomic64_dec_and_test - decrement and test
+ + * @ptr: pointer to type atomic64_t
+ + *
+ + * Atomically decrements @ptr by 1 and
+ + * returns true if the result is 0, or false for all other
+ + * cases.
+ + */
+ +static inline int atomic64_dec_and_test(atomic64_t *ptr)
+ +{
+ +      return atomic64_sub_and_test(1, ptr);
+ +}
+ +
+ +/**
+ + * atomic64_inc_and_test - increment and test
+ + * @ptr: pointer to type atomic64_t
+ + *
+ + * Atomically increments @ptr by 1
+ + * and returns true if the result is zero, or false for all
+ + * other cases.
+ + */
+ +static inline int atomic64_inc_and_test(atomic64_t *ptr)
+ +{
+ +      return atomic64_sub_and_test(-1, ptr);
+ +}
+ +
+ +/**
+ + * atomic64_add_negative - add and test if negative
+ + * @delta: integer value to add
+ + * @ptr:   pointer to type atomic64_t
+ + *
+ + * Atomically adds @delta to @ptr and returns true
+ + * if the result is negative, or false when
+ + * result is greater than or equal to zero.
+ + */
+ +static inline int
+ +atomic64_add_negative(unsigned long long delta, atomic64_t *ptr)
+ +{
+ +      long long old_val = atomic64_add_return(delta, ptr);
+ +
+ +      return old_val < 0;
+ +}
+ +
   #include <asm-generic/atomic.h>
   #endif /* _ASM_X86_ATOMIC_32_H */
diff --combined arch/x86/include/asm/irq_vectors.h

index b8d277f1252f935f8354cbb71680c20f4b90d6cc,f7ff65032b9d66aee077fcc3ddbcddd40bd4f75d..21a0b92027f5850455b45f467300aad42b887a2f
--- 1/arch/x86/include/asm/irq_vectors.h
--- 2/arch/x86/include/asm/irq_vectors.h
+++ b/arch/x86/include/asm/irq_vectors.h
@@@ -86,11 -86,6 +86,11 @@@
    */
   #define LOCAL_TIMER_VECTOR    0xef
   
+ +/*
+ + * Performance monitoring interrupt vector:
+ + */
+ +#define LOCAL_PERF_VECTOR     0xee
+ +
   /*
    * First APIC vector available to drivers: (vectors 0x30-0xee) we
    * start at 0x31(0x41) to spread out vectors evenly between priority
@@@ -106,12 -101,23 +106,23 @@@
   #define LAST_VM86_IRQ         15
   #define invalid_vm86_irq(irq) ((irq) < 3 || (irq) > 15)
   
+ #define NR_IRQS_LEGACY                16
+ 
   #if defined(CONFIG_X86_IO_APIC) && !defined(CONFIG_X86_VOYAGER)
+ 
+ #ifndef CONFIG_SPARSE_IRQ
   # if NR_CPUS < MAX_IO_APICS
   #  define NR_IRQS (NR_VECTORS + (32 * NR_CPUS))
   # else
   #  define NR_IRQS (NR_VECTORS + (32 * MAX_IO_APICS))
   # endif
+ #else
+ # if (8 * NR_CPUS) > (32 * MAX_IO_APICS)
+ #  define NR_IRQS (NR_VECTORS + (8 * NR_CPUS))
+ # else
+ #  define NR_IRQS (NR_VECTORS + (32 * MAX_IO_APICS))
+ # endif
+ #endif
   
   #elif defined(CONFIG_X86_VOYAGER)
   
diff --combined arch/x86/kernel/apic.c

index 6c83ac10e6d3458365e852978584a00be9545a3c,566a08466b191dd2ac2097b3df676bd072d1ccdf..d2d17b8d10f863f78defdef3f42aeb7f073fa4f8
--- 1/arch/x86/kernel/apic.c
--- 2/arch/x86/kernel/apic.c
+++ b/arch/x86/kernel/apic.c
@@@ -31,10 -31,11 +31,12 @@@
   #include <linux/dmi.h>
   #include <linux/dmar.h>
   #include <linux/ftrace.h>
+ #include <linux/smp.h>
+ #include <linux/nmi.h>
+ #include <linux/timex.h>
   
+ +#include <asm/perf_counter.h>
   #include <asm/atomic.h>
- #include <asm/smp.h>
   #include <asm/mtrr.h>
   #include <asm/mpspec.h>
   #include <asm/desc.h>
@@@ -42,10 -43,8 +44,8 @@@
   #include <asm/hpet.h>
   #include <asm/pgalloc.h>
   #include <asm/i8253.h>
- #include <asm/nmi.h>
   #include <asm/idle.h>
   #include <asm/proto.h>
- #include <asm/timex.h>
   #include <asm/apic.h>
   #include <asm/i8259.h>
   
@@@ -99,8 -98,8 +99,8 @@@ __setup("apicpmtimer", setup_apicpmtime
   #ifdef HAVE_X2APIC
   int x2apic;
   /* x2apic enabled before OS handover */
- int x2apic_preenabled;
- int disable_x2apic;
+ static int x2apic_preenabled;
+ static int disable_x2apic;
   static __init int setup_nox2apic(char *str)
   {
         disable_x2apic = 1;
@@@ -120,8 -119,6 +120,6 @@@ EXPORT_SYMBOL_GPL(local_apic_timer_c2_o
   
   int first_system_vector = 0xfe;
   
- char system_vectors[NR_VECTORS] = { [0 ... NR_VECTORS-1] = SYS_VECTOR_FREE};
- 
   /*
    * Debug level, exported for io_apic.c
    */
@@@ -143,7 -140,7 +141,7 @@@ static int lapic_next_event(unsigned lo
                             struct clock_event_device *evt);
   static void lapic_timer_setup(enum clock_event_mode mode,
                               struct clock_event_device *evt);
- static void lapic_timer_broadcast(cpumask_t mask);
+ static void lapic_timer_broadcast(const struct cpumask *mask);
   static void apic_pm_activate(void);
   
   /*
@@@ -229,7 -226,7 +227,7 @@@ void xapic_icr_write(u32 low, u32 id
         apic_write(APIC_ICR, low);
   }
   
- u64 xapic_icr_read(void)
+ static u64 xapic_icr_read(void)
   {
         u32 icr1, icr2;
   
@@@ -269,7 -266,7 +267,7 @@@ void x2apic_icr_write(u32 low, u32 id
         wrmsrl(APIC_BASE_MSR + (APIC_ICR >> 4), ((__u64) id) << 32 | low);
   }
   
- u64 x2apic_icr_read(void)
+ static u64 x2apic_icr_read(void)
   {
         unsigned long val;
   
@@@ -456,7 -453,7 +454,7 @@@ static void lapic_timer_setup(enum cloc
   /*
    * Local APIC timer broadcast function
    */
- static void lapic_timer_broadcast(cpumask_t mask)
+ static void lapic_timer_broadcast(const struct cpumask *mask)
   {
   #ifdef CONFIG_SMP
         send_IPI_mask(mask, LOCAL_TIMER_VECTOR);
@@@ -472,7 -469,7 +470,7 @@@ static void __cpuinit setup_APIC_timer(
         struct clock_event_device *levt = &__get_cpu_var(lapic_events);
   
         memcpy(levt, &lapic_clockevent, sizeof(*levt));
-       levt->cpumask = cpumask_of_cpu(smp_processor_id());
+       levt->cpumask = cpumask_of(smp_processor_id());
   
         clockevents_register_device(levt);
   }
@@@ -690,7 -687,7 +688,7 @@@ static int __init calibrate_APIC_clock(
                 local_irq_enable();
   
         if (levt->features & CLOCK_EVT_FEAT_DUMMY) {
-               pr_warning("APIC timer disabled due to verification failure.\n");
+               pr_warning("APIC timer disabled due to verification failure\n");
                         return -1;
         }
   
@@@ -1137,7 -1134,6 +1135,7 @@@ void __cpuinit setup_local_APIC(void
                 apic_write(APIC_ESR, 0);
         }
   #endif
+ +      perf_counters_lapic_init(0);
   
         preempt_disable();
   
@@@ -1809,28 -1805,32 +1807,32 @@@ void disconnect_bsp_APIC(int virt_wire_
   void __cpuinit generic_processor_info(int apicid, int version)
   {
         int cpu;
-       cpumask_t tmp_map;
   
         /*
          * Validate version
          */
         if (version == 0x0) {
                 pr_warning("BIOS bug, APIC version is 0 for CPU#%d! "
-                       "fixing up to 0x10. (tell your hw vendor)\n",
-                       version);
+                          "fixing up to 0x10. (tell your hw vendor)\n",
+                               version);
                 version = 0x10;
         }
         apic_version[apicid] = version;
   
-       if (num_processors >= NR_CPUS) {
-               pr_warning("WARNING: NR_CPUS limit of %i reached."
-                       "  Processor ignored.\n", NR_CPUS);
+       if (num_processors >= nr_cpu_ids) {
+               int max = nr_cpu_ids;
+               int thiscpu = max + disabled_cpus;
+ 
+               pr_warning(
+                       "ACPI: NR_CPUS/possible_cpus limit of %i reached."
+                       "  Processor %d/0x%x ignored.\n", max, thiscpu, apicid);
+ 
+               disabled_cpus++;
                 return;
         }
   
         num_processors++;
-       cpus_complement(tmp_map, cpu_present_map);
-       cpu = first_cpu(tmp_map);
+       cpu = cpumask_next_zero(-1, cpu_present_mask);
   
         physid_set(apicid, phys_cpu_present_map);
         if (apicid == boot_cpu_physical_apicid) {
@@@ -1880,8 -1880,8 +1882,8 @@@
         }
   #endif
   
-       cpu_set(cpu, cpu_possible_map);
-       cpu_set(cpu, cpu_present_map);
+       set_cpu_possible(cpu, true);
+       set_cpu_present(cpu, true);
   }
   
   #ifdef CONFIG_X86_64
@@@ -2083,18 -2083,16 +2085,16 @@@ __cpuinit int apic_is_clustered_box(voi
         bios_cpu_apicid = early_per_cpu_ptr(x86_bios_cpu_apicid);
         bitmap_zero(clustermap, NUM_APIC_CLUSTERS);
   
-       for (i = 0; i < NR_CPUS; i++) {
+       for (i = 0; i < nr_cpu_ids; i++) {
                 /* are we being called early in kernel startup? */
                 if (bios_cpu_apicid) {
                         id = bios_cpu_apicid[i];
-               }
-               else if (i < nr_cpu_ids) {
+               } else if (i < nr_cpu_ids) {
                         if (cpu_present(i))
                                 id = per_cpu(x86_bios_cpu_apicid, i);
                         else
                                 continue;
-               }
-               else
+               } else
                         break;
   
                 if (id != BAD_APICID)
diff --combined arch/x86/kernel/cpu/common.c

index 376b9f9d8d2396b4c2f97a2e012600a4452af22f,83492b1f93b11c5e0b851300ffdb3e314eaacc9e..667e5d561ed77f39fadbc547421bef1f2832910b
--- 1/arch/x86/kernel/cpu/common.c
--- 2/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@@ -17,7 -17,6 +17,7 @@@
   #include <asm/mmu_context.h>
   #include <asm/mtrr.h>
   #include <asm/mce.h>
+ +#include <asm/perf_counter.h>
   #include <asm/pat.h>
   #include <asm/asm.h>
   #include <asm/numa.h>
@@@ -41,6 -40,26 +41,26 @@@
   
   #include "cpu.h"
   
+ #ifdef CONFIG_X86_64
+ 
+ /* all of these masks are initialized in setup_cpu_local_masks() */
+ cpumask_var_t cpu_callin_mask;
+ cpumask_var_t cpu_callout_mask;
+ cpumask_var_t cpu_initialized_mask;
+ 
+ /* representing cpus for which sibling maps can be computed */
+ cpumask_var_t cpu_sibling_setup_mask;
+ 
+ #else /* CONFIG_X86_32 */
+ 
+ cpumask_t cpu_callin_map;
+ cpumask_t cpu_callout_map;
+ cpumask_t cpu_initialized;
+ cpumask_t cpu_sibling_setup_map;
+ 
+ #endif /* CONFIG_X86_32 */
+ 
+ 
   static struct cpu_dev *this_cpu __cpuinitdata;
   
   #ifdef CONFIG_X86_64
@@@ -356,7 -375,7 +376,7 @@@ void __cpuinit detect_ht(struct cpuinfo
                 printk(KERN_INFO  "CPU: Hyper-Threading is disabled\n");
         } else if (smp_num_siblings > 1) {
   
-               if (smp_num_siblings > NR_CPUS) {
+               if (smp_num_siblings > nr_cpu_ids) {
                         printk(KERN_WARNING "CPU: Unsupported number of siblings %d",
                                         smp_num_siblings);
                         smp_num_siblings = 1;
@@@ -753,7 -772,6 +773,7 @@@ void __init identify_boot_cpu(void
   #else
         vgetcpu_set_mode();
   #endif
+ +      init_hw_perf_counters();
   }
   
   void __cpuinit identify_secondary_cpu(struct cpuinfo_x86 *c)
@@@ -858,8 -876,6 +878,6 @@@ static __init int setup_disablecpuid(ch
   }
   __setup("clearcpuid=", setup_disablecpuid);
   
- cpumask_t cpu_initialized __cpuinitdata = CPU_MASK_NONE;
- 
   #ifdef CONFIG_X86_64
   struct x8664_pda **_cpu_pda __read_mostly;
   EXPORT_SYMBOL(_cpu_pda);
@@@ -978,7 -994,7 +996,7 @@@ void __cpuinit cpu_init(void
   
         me = current;
   
-       if (cpu_test_and_set(cpu, cpu_initialized))
+       if (cpumask_test_and_set_cpu(cpu, cpu_initialized_mask))
                 panic("CPU#%d already initialized!\n", cpu);
   
         printk(KERN_INFO "Initializing CPU#%d\n", cpu);
@@@ -1087,7 -1103,7 +1105,7 @@@ void __cpuinit cpu_init(void
         struct tss_struct *t = &per_cpu(init_tss, cpu);
         struct thread_struct *thread = &curr->thread;
   
-       if (cpu_test_and_set(cpu, cpu_initialized)) {
+       if (cpumask_test_and_set_cpu(cpu, cpu_initialized_mask)) {
                 printk(KERN_WARNING "CPU#%d already initialized!\n", cpu);
                 for (;;) local_irq_enable();
         }
diff --combined arch/x86/kernel/irq.c

index d92bc71e41a7c594442087a25a63b35a474d1774,3973e2df7f877c3a2fd7868691e3fd55214a178e..22f650db917fc2615f34741b11035bb74a6d5656
--- 1/arch/x86/kernel/irq.c
--- 2/arch/x86/kernel/irq.c
+++ b/arch/x86/kernel/irq.c
@@@ -5,10 -5,11 +5,11 @@@
   #include <linux/interrupt.h>
   #include <linux/kernel_stat.h>
   #include <linux/seq_file.h>
+ #include <linux/smp.h>
   
   #include <asm/apic.h>
   #include <asm/io_apic.h>
- #include <asm/smp.h>
+ #include <asm/irq.h>
   
   atomic_t irq_err_count;
   
@@@ -56,10 -57,6 +57,10 @@@ static int show_other_interrupts(struc
         for_each_online_cpu(j)
                 seq_printf(p, "%10u ", irq_stats(j)->apic_timer_irqs);
         seq_printf(p, "  Local timer interrupts\n");
+ +      seq_printf(p, "CNT: ");
+ +      for_each_online_cpu(j)
+ +              seq_printf(p, "%10u ", irq_stats(j)->apic_perf_irqs);
+ +      seq_printf(p, "  Performance counter interrupts\n");
   #endif
   #ifdef CONFIG_SMP
         seq_printf(p, "RES: ");
@@@ -122,6 -119,9 +123,9 @@@ int show_interrupts(struct seq_file *p
         }
   
         desc = irq_to_desc(i);
+       if (!desc)
+               return 0;
+ 
         spin_lock_irqsave(&desc->lock, flags);
   #ifndef CONFIG_SMP
         any_count = kstat_irqs(i);
@@@ -164,7 -164,6 +168,7 @@@ u64 arch_irq_stat_cpu(unsigned int cpu
   
   #ifdef CONFIG_X86_LOCAL_APIC
         sum += irq_stats(cpu)->apic_timer_irqs;
+ +      sum += irq_stats(cpu)->apic_perf_irqs;
   #endif
   #ifdef CONFIG_SMP
         sum += irq_stats(cpu)->irq_resched_count;
@@@ -192,3 -191,5 +196,5 @@@ u64 arch_irq_stat(void
   #endif
         return sum;
   }
+ 
+ EXPORT_SYMBOL_GPL(vector_used_by_percpu_irq);
diff --combined arch/x86/kernel/irqinit_32.c

index 6a33b5e30161cc4dff4bb5f87616a3371c174619,1507ad4e674d253f93b60e8df9188bd17354a987..0bef6280f30ccebbd59b1b1f5c0e2fdf9cf6c7ae
--- 1/arch/x86/kernel/irqinit_32.c
--- 2/arch/x86/kernel/irqinit_32.c
+++ b/arch/x86/kernel/irqinit_32.c
@@@ -9,18 -9,18 +9,18 @@@
   #include <linux/kernel_stat.h>
   #include <linux/sysdev.h>
   #include <linux/bitops.h>
+ #include <linux/io.h>
+ #include <linux/delay.h>
   
   #include <asm/atomic.h>
   #include <asm/system.h>
- #include <asm/io.h>
   #include <asm/timer.h>
   #include <asm/pgtable.h>
- #include <asm/delay.h>
   #include <asm/desc.h>
   #include <asm/apic.h>
   #include <asm/arch_hooks.h>
   #include <asm/i8259.h>
- 
+ #include <asm/traps.h>
   
   
   /*
@@@ -34,12 -34,10 +34,10 @@@
    * leads to races. IBM designers who came up with it should
    * be shot.
    */
-  
   
   static irqreturn_t math_error_irq(int cpl, void *dev_id)
   {
-       extern void math_error(void __user *);
-       outb(0,0xF0);
+       outb(0, 0xF0);
         if (ignore_fpu_irq || !boot_cpu_data.hard_math)
                 return IRQ_NONE;
         math_error((void __user *)get_irq_regs()->ip);
@@@ -56,7 -54,7 +54,7 @@@ static struct irqaction fpu_irq = 
         .name = "fpu",
   };
   
- void __init init_ISA_irqs (void)
+ void __init init_ISA_irqs(void)
   {
         int i;
   
@@@ -68,8 -66,7 +66,7 @@@
         /*
          * 16 old-style INTA-cycle interrupts:
          */
-       for (i = 0; i < 16; i++) {
-               /* first time call this irq_desc */
+       for (i = 0; i < NR_IRQS_LEGACY; i++) {
                 struct irq_desc *desc = irq_to_desc(i);
   
                 desc->status = IRQ_DISABLED;
@@@ -111,6 -108,18 +108,18 @@@ DEFINE_PER_CPU(vector_irq_t, vector_irq
         [IRQ15_VECTOR + 1 ... NR_VECTORS - 1] = -1
   };
   
+ int vector_used_by_percpu_irq(unsigned int vector)
+ {
+       int cpu;
+ 
+       for_each_online_cpu(cpu) {
+               if (per_cpu(vector_irq, cpu)[vector] != -1)
+                       return 1;
+       }
+ 
+       return 0;
+ }
+ 
   /* Overridden in paravirt.c */
   void init_IRQ(void) __attribute__((weak, alias("native_init_IRQ")));
   
@@@ -147,10 -156,12 +156,12 @@@ void __init native_init_IRQ(void
         alloc_intr_gate(CALL_FUNCTION_VECTOR, call_function_interrupt);
   
         /* IPI for single call function */
-       set_intr_gate(CALL_FUNCTION_SINGLE_VECTOR, call_function_single_interrupt);
+       alloc_intr_gate(CALL_FUNCTION_SINGLE_VECTOR,
+                                call_function_single_interrupt);
   
         /* Low priority IPI to cleanup after moving an irq */
         set_intr_gate(IRQ_MOVE_CLEANUP_VECTOR, irq_move_cleanup_interrupt);
+       set_bit(IRQ_MOVE_CLEANUP_VECTOR, used_vectors);
   #endif
   
   #ifdef CONFIG_X86_LOCAL_APIC
@@@ -160,9 -171,6 +171,9 @@@
         /* IPI vectors for APIC spurious and error interrupts */
         alloc_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt);
         alloc_intr_gate(ERROR_APIC_VECTOR, error_interrupt);
+ +# ifdef CONFIG_PERF_COUNTERS
+ +      alloc_intr_gate(LOCAL_PERF_VECTOR, perf_counter_interrupt);
+ +# endif
   #endif
   
   #if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86_MCE_P4THERMAL)
diff --combined arch/x86/kernel/irqinit_64.c

index 91d785c25ad9e9bc9c4bd8462de5bb12b446d0db,da481a1e3f303f8fe3bf10995e8e218cc556f2d4..6a71bfc51e51affdbc0a0a963e4959530c92f78a
--- 1/arch/x86/kernel/irqinit_64.c
--- 2/arch/x86/kernel/irqinit_64.c
+++ b/arch/x86/kernel/irqinit_64.c
@@@ -11,14 -11,14 +11,14 @@@
   #include <linux/kernel_stat.h>
   #include <linux/sysdev.h>
   #include <linux/bitops.h>
+ #include <linux/acpi.h>
+ #include <linux/io.h>
+ #include <linux/delay.h>
   
- #include <asm/acpi.h>
   #include <asm/atomic.h>
   #include <asm/system.h>
- #include <asm/io.h>
   #include <asm/hw_irq.h>
   #include <asm/pgtable.h>
- #include <asm/delay.h>
   #include <asm/desc.h>
   #include <asm/apic.h>
   #include <asm/i8259.h>
@@@ -69,15 -69,26 +69,26 @@@ DEFINE_PER_CPU(vector_irq_t, vector_irq
         [IRQ15_VECTOR + 1 ... NR_VECTORS - 1] = -1
   };
   
- void __init init_ISA_irqs(void)
+ int vector_used_by_percpu_irq(unsigned int vector)
+ {
+       int cpu;
+ 
+       for_each_online_cpu(cpu) {
+               if (per_cpu(vector_irq, cpu)[vector] != -1)
+                       return 1;
+       }
+ 
+       return 0;
+ }
+ 
+ static void __init init_ISA_irqs(void)
   {
         int i;
   
         init_bsp_APIC();
         init_8259A(0);
   
-       for (i = 0; i < 16; i++) {
-               /* first time call this irq_desc */
+       for (i = 0; i < NR_IRQS_LEGACY; i++) {
                 struct irq_desc *desc = irq_to_desc(i);
   
                 desc->status = IRQ_DISABLED;
@@@ -122,6 -133,7 +133,7 @@@ static void __init smp_intr_init(void
   
         /* Low priority IPI to cleanup after moving an irq */
         set_intr_gate(IRQ_MOVE_CLEANUP_VECTOR, irq_move_cleanup_interrupt);
+       set_bit(IRQ_MOVE_CLEANUP_VECTOR, used_vectors);
   #endif
   }
   
@@@ -138,11 -150,6 +150,11 @@@ static void __init apic_intr_init(void
         /* IPI vectors for APIC spurious and error interrupts */
         alloc_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt);
         alloc_intr_gate(ERROR_APIC_VECTOR, error_interrupt);
+ +
+ +      /* Performance monitoring interrupt: */
+ +#ifdef CONFIG_PERF_COUNTERS
+ +      alloc_intr_gate(LOCAL_PERF_VECTOR, perf_counter_interrupt);
+ +#endif
   }
   
   void __init native_init_IRQ(void)
diff --combined drivers/acpi/processor_idle.c

index f2a043131727d20d7ecd3f409e1cc59095747fde,66a9d81455628454f628496415f4bbee495732b4..7acb23f830ceba79e0b490da2af74d69fcbfef8a
--- 1/drivers/acpi/processor_idle.c
--- 2/drivers/acpi/processor_idle.c
+++ b/drivers/acpi/processor_idle.c
@@@ -41,6 -41,7 +41,7 @@@
   #include <linux/pm_qos_params.h>
   #include <linux/clockchips.h>
   #include <linux/cpuidle.h>
+ #include <linux/irqflags.h>
   
   /*
    * Include the apic definitions for x86 to have the APIC timer related defines
@@@ -270,11 -271,8 +271,11 @@@ static atomic_t c3_cpu_count
   /* Common C-state entry for C2, C3, .. */
   static void acpi_cstate_enter(struct acpi_processor_cx *cstate)
   {
+ +      u64 perf_flags;
+ +
         /* Don't trace irqs off for idle */
         stop_critical_timings();
+ +      perf_flags = hw_perf_save_disable();
         if (cstate->entry_method == ACPI_CSTATE_FFH) {
                 /* Call into architectural FFH based C-state */
                 acpi_processor_ffh_cstate_enter(cstate);
@@@ -287,7 -285,6 +288,7 @@@
                    gets asserted in time to freeze execution properly. */
                 unused = inl(acpi_gbl_FADT.xpm_timer_block.address);
         }
+ +      hw_perf_restore(perf_flags);
         start_critical_timings();
   }
   #endif /* !CONFIG_CPU_IDLE */
@@@ -1429,11 -1426,8 +1430,11 @@@ static inline void acpi_idle_update_bm_
    */
   static inline void acpi_idle_do_entry(struct acpi_processor_cx *cx)
   {
+ +      u64 pctrl;
+ +
         /* Don't trace irqs off for idle */
         stop_critical_timings();
+ +      pctrl = hw_perf_save_disable();
         if (cx->entry_method == ACPI_CSTATE_FFH) {
                 /* Call into architectural FFH based C-state */
                 acpi_processor_ffh_cstate_enter(cx);
@@@ -1448,7 -1442,6 +1449,7 @@@
                    gets asserted in time to freeze execution properly. */
                 unused = inl(acpi_gbl_FADT.xpm_timer_block.address);
         }
+ +      hw_perf_restore(pctrl);
         start_critical_timings();
   }
   
diff --combined drivers/char/sysrq.c

index 9bcf0c9848b18a5c64de420fabfd3245d168c35e,d41b9f6f7903d297976114553527ce51f7e44e3e..5a3eab0882a024f3fab11a2b01d63b3a10ef9b6c
--- 1/drivers/char/sysrq.c
--- 2/drivers/char/sysrq.c
+++ b/drivers/char/sysrq.c
@@@ -25,7 -25,6 +25,7 @@@
   #include <linux/kbd_kern.h>
   #include <linux/proc_fs.h>
   #include <linux/quotaops.h>
+ +#include <linux/perf_counter.h>
   #include <linux/kernel.h>
   #include <linux/module.h>
   #include <linux/suspend.h>
@@@ -83,7 -82,7 +83,7 @@@ static void sysrq_handle_loglevel(int k
   }
   static struct sysrq_key_op sysrq_loglevel_op = {
         .handler        = sysrq_handle_loglevel,
-       .help_msg       = "loglevel0-8",
+       .help_msg       = "loglevel(0-9)",
         .action_msg     = "Changing Loglevel",
         .enable_mask    = SYSRQ_ENABLE_LOG,
   };
@@@ -234,7 -233,7 +234,7 @@@ static void sysrq_handle_showallcpus(in
   
   static struct sysrq_key_op sysrq_showallcpus_op = {
         .handler        = sysrq_handle_showallcpus,
-       .help_msg       = "aLlcpus",
+       .help_msg       = "show-backtrace-all-active-cpus(L)",
         .action_msg     = "Show backtrace of all active CPUs",
         .enable_mask    = SYSRQ_ENABLE_DUMP,
   };
@@@ -245,11 -244,10 +245,11 @@@ static void sysrq_handle_showregs(int k
         struct pt_regs *regs = get_irq_regs();
         if (regs)
                 show_regs(regs);
+ +      perf_counter_print_debug();
   }
   static struct sysrq_key_op sysrq_showregs_op = {
         .handler        = sysrq_handle_showregs,
-       .help_msg       = "showPc",
+       .help_msg       = "show-registers(P)",
         .action_msg     = "Show Regs",
         .enable_mask    = SYSRQ_ENABLE_DUMP,
   };
@@@ -260,7 -258,7 +260,7 @@@ static void sysrq_handle_showstate(int 
   }
   static struct sysrq_key_op sysrq_showstate_op = {
         .handler        = sysrq_handle_showstate,
-       .help_msg       = "showTasks",
+       .help_msg       = "show-task-states(T)",
         .action_msg     = "Show State",
         .enable_mask    = SYSRQ_ENABLE_DUMP,
   };
@@@ -271,7 -269,7 +271,7 @@@ static void sysrq_handle_showstate_bloc
   }
   static struct sysrq_key_op sysrq_showstate_blocked_op = {
         .handler        = sysrq_handle_showstate_blocked,
-       .help_msg       = "shoW-blocked-tasks",
+       .help_msg       = "show-blocked-tasks(W)",
         .action_msg     = "Show Blocked State",
         .enable_mask    = SYSRQ_ENABLE_DUMP,
   };
@@@ -299,7 -297,7 +299,7 @@@ static void sysrq_handle_showmem(int ke
   }
   static struct sysrq_key_op sysrq_showmem_op = {
         .handler        = sysrq_handle_showmem,
-       .help_msg       = "showMem",
+       .help_msg       = "show-memory-usage(M)",
         .action_msg     = "Show Memory",
         .enable_mask    = SYSRQ_ENABLE_DUMP,
   };
@@@ -325,7 -323,7 +325,7 @@@ static void sysrq_handle_term(int key, 
   }
   static struct sysrq_key_op sysrq_term_op = {
         .handler        = sysrq_handle_term,
-       .help_msg       = "tErm",
+       .help_msg       = "terminate-all-tasks(E)",
         .action_msg     = "Terminate All Tasks",
         .enable_mask    = SYSRQ_ENABLE_SIGNAL,
   };
@@@ -343,7 -341,7 +343,7 @@@ static void sysrq_handle_moom(int key, 
   }
   static struct sysrq_key_op sysrq_moom_op = {
         .handler        = sysrq_handle_moom,
-       .help_msg       = "Full",
+       .help_msg       = "memory-full-oom-kill(F)",
         .action_msg     = "Manual OOM execution",
         .enable_mask    = SYSRQ_ENABLE_SIGNAL,
   };
@@@ -355,7 -353,7 +355,7 @@@ static void sysrq_handle_kill(int key, 
   }
   static struct sysrq_key_op sysrq_kill_op = {
         .handler        = sysrq_handle_kill,
-       .help_msg       = "kIll",
+       .help_msg       = "kill-all-tasks(I)",
         .action_msg     = "Kill All Tasks",
         .enable_mask    = SYSRQ_ENABLE_SIGNAL,
   };
@@@ -366,7 -364,7 +366,7 @@@ static void sysrq_handle_unrt(int key, 
   }
   static struct sysrq_key_op sysrq_unrt_op = {
         .handler        = sysrq_handle_unrt,
-       .help_msg       = "Nice",
+       .help_msg       = "nice-all-RT-tasks(N)",
         .action_msg     = "Nice All RT Tasks",
         .enable_mask    = SYSRQ_ENABLE_RTNICE,
   };
diff --combined fs/exec.c

index 911dd0fd7e096b9a12b9cf472f52a0b7e8e61ce9,71a6efe5d8bd572455c1305a07734af70194d581..605be573fe873b5b327b45835a337c3c868d0aab
--- 1/fs/exec.c
--- 2/fs/exec.c
+++ b/fs/exec.c
@@@ -33,7 -33,6 +33,7 @@@
   #include <linux/string.h>
   #include <linux/init.h>
   #include <linux/pagemap.h>
+ +#include <linux/perf_counter.h>
   #include <linux/highmem.h>
   #include <linux/spinlock.h>
   #include <linux/key.h>
@@@ -52,17 -51,13 +52,13 @@@
   #include <linux/audit.h>
   #include <linux/tracehook.h>
   #include <linux/kmod.h>
+ #include <linux/fsnotify.h>
   
   #include <asm/uaccess.h>
   #include <asm/mmu_context.h>
   #include <asm/tlb.h>
   #include "internal.h"
   
- #ifdef __alpha__
- /* for /sbin/loader handling in search_binary_handler() */
- #include <linux/a.out.h>
- #endif
- 
   int core_uses_pid;
   char core_pattern[CORENAME_MAX_SIZE] = "core";
   int suid_dumpable = 0;
@@@ -128,7 -123,8 +124,8 @@@ asmlinkage long sys_uselib(const char _
         if (nd.path.mnt->mnt_flags & MNT_NOEXEC)
                 goto exit;
   
-       error = vfs_permission(&nd, MAY_READ | MAY_EXEC | MAY_OPEN);
+       error = inode_permission(nd.path.dentry->d_inode,
+                                MAY_READ | MAY_EXEC | MAY_OPEN);
         if (error)
                 goto exit;
   
@@@ -137,6 -133,8 +134,8 @@@
         if (IS_ERR(file))
                 goto out;
   
+       fsnotify_open(file->f_path.dentry);
+ 
         error = -ENOEXEC;
         if(file->f_op) {
                 struct linux_binfmt * fmt;
@@@ -234,13 -232,13 +233,13 @@@ static void flush_arg_page(struct linux
   
   static int __bprm_mm_init(struct linux_binprm *bprm)
   {
-       int err = -ENOMEM;
+       int err;
         struct vm_area_struct *vma = NULL;
         struct mm_struct *mm = bprm->mm;
   
         bprm->vma = vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL);
         if (!vma)
-               goto err;
+               return -ENOMEM;
   
         down_write(&mm->mmap_sem);
         vma->vm_mm = mm;
@@@ -253,28 -251,20 +252,20 @@@
          */
         vma->vm_end = STACK_TOP_MAX;
         vma->vm_start = vma->vm_end - PAGE_SIZE;
- 
         vma->vm_flags = VM_STACK_FLAGS;
         vma->vm_page_prot = vm_get_page_prot(vma->vm_flags);
         err = insert_vm_struct(mm, vma);
-       if (err) {
-               up_write(&mm->mmap_sem);
+       if (err)
                 goto err;
-       }
   
         mm->stack_vm = mm->total_vm = 1;
         up_write(&mm->mmap_sem);
- 
         bprm->p = vma->vm_end - sizeof(void *);
- 
         return 0;
- 
   err:
-       if (vma) {
-               bprm->vma = NULL;
-               kmem_cache_free(vm_area_cachep, vma);
-       }
- 
+       up_write(&mm->mmap_sem);
+       bprm->vma = NULL;
+       kmem_cache_free(vm_area_cachep, vma);
         return err;
   }
   
@@@ -681,7 -671,7 +672,7 @@@ struct file *open_exec(const char *name
         if (nd.path.mnt->mnt_flags & MNT_NOEXEC)
                 goto out_path_put;
   
-       err = vfs_permission(&nd, MAY_EXEC | MAY_OPEN);
+       err = inode_permission(nd.path.dentry->d_inode, MAY_EXEC | MAY_OPEN);
         if (err)
                 goto out_path_put;
   
@@@ -689,6 -679,8 +680,8 @@@
         if (IS_ERR(file))
                 return file;
   
+       fsnotify_open(file->f_path.dentry);
+ 
         err = deny_write_access(file);
         if (err) {
                 fput(file);
@@@ -774,7 -766,6 +767,6 @@@ static int de_thread(struct task_struc
         struct signal_struct *sig = tsk->signal;
         struct sighand_struct *oldsighand = tsk->sighand;
         spinlock_t *lock = &oldsighand->siglock;
-       struct task_struct *leader = NULL;
         int count;
   
         if (thread_group_empty(tsk))
@@@ -812,7 -803,7 +804,7 @@@
          * and to assume its PID:
          */
         if (!thread_group_leader(tsk)) {
-               leader = tsk->group_leader;
+               struct task_struct *leader = tsk->group_leader;
   
                 sig->notify_count = -1; /* for exit_notify() */
                 for (;;) {
@@@ -864,8 -855,9 +856,9 @@@
   
                 BUG_ON(leader->exit_state != EXIT_ZOMBIE);
                 leader->exit_state = EXIT_DEAD;
- 
                 write_unlock_irq(&tasklist_lock);
+ 
+               release_task(leader);
         }
   
         sig->group_exit_task = NULL;
@@@ -874,8 -866,6 +867,6 @@@
   no_thread_group:
         exit_itimers(sig);
         flush_itimer_signals();
-       if (leader)
-               release_task(leader);
   
         if (atomic_read(&oldsighand->count) != 1) {
                 struct sighand_struct *newsighand;
@@@ -1020,13 -1010,6 +1011,13 @@@ int flush_old_exec(struct linux_binprm 
   
         current->personality &= ~bprm->per_clear;
   
+ +      /*
+ +       * Flush performance counters when crossing a
+ +       * security domain:
+ +       */
+ +      if (!get_dumpable(current->mm))
+ +              perf_counter_exit_task(current);
+ +
         /* An exec changes our domain. We are no longer part of the thread
            group */
   
@@@ -1181,41 -1164,7 +1172,7 @@@ int search_binary_handler(struct linux_
         unsigned int depth = bprm->recursion_depth;
         int try,retval;
         struct linux_binfmt *fmt;
- #ifdef __alpha__
-       /* handle /sbin/loader.. */
-       {
-           struct exec * eh = (struct exec *) bprm->buf;
- 
-           if (!bprm->loader && eh->fh.f_magic == 0x183 &&
-               (eh->fh.f_flags & 0x3000) == 0x3000)
-           {
-               struct file * file;
-               unsigned long loader;
   
-               allow_write_access(bprm->file);
-               fput(bprm->file);
-               bprm->file = NULL;
- 
-               loader = bprm->vma->vm_end - sizeof(void *);
- 
-               file = open_exec("/sbin/loader");
-               retval = PTR_ERR(file);
-               if (IS_ERR(file))
-                       return retval;
- 
-               /* Remember if the application is TASO.  */
-               bprm->taso = eh->ah.entry < 0x100000000UL;
- 
-               bprm->file = file;
-               bprm->loader = loader;
-               retval = prepare_binprm(bprm);
-               if (retval<0)
-                       return retval;
-               /* should call search_binary_handler recursively here,
-                  but it does not matter */
-           }
-       }
- #endif
         retval = security_bprm_check(bprm);
         if (retval)
                 return retval;
@@@ -1737,7 -1686,7 +1694,7 @@@ int get_dumpable(struct mm_struct *mm
         return (ret >= 2) ? 2 : ret;
   }
   
- int do_coredump(long signr, int exit_code, struct pt_regs * regs)
+ void do_coredump(long signr, int exit_code, struct pt_regs *regs)
   {
         struct core_state core_state;
         char corename[CORENAME_MAX_SIZE + 1];
@@@ -1821,6 -1770,11 +1778,11 @@@
   
         if (ispipe) {
                 helper_argv = argv_split(GFP_KERNEL, corename+1, &helper_argc);
+               if (!helper_argv) {
+                       printk(KERN_WARNING "%s failed to allocate memory\n",
+                              __func__);
+                       goto fail_unlock;
+               }
                 /* Terminate the string before the first option */
                 delimit = strchr(corename, ' ');
                 if (delimit)
@@@ -1888,5 -1842,5 +1850,5 @@@ fail_unlock
         put_cred(cred);
         coredump_finish(mm);
   fail:
-       return retval;
+       return;
   }
diff --combined include/linux/init_task.h

index d0e6cf3b201cf7bbcd7033dd48dacde96d4aa260,2f3c2d4ef73b1b0df8b4e96998a9021480d5706e..49a40fbc806b3c0c140ad3e9e02677333dd50bb0
--- 1/include/linux/init_task.h
--- 2/include/linux/init_task.h
+++ b/include/linux/init_task.h
@@@ -12,6 -12,7 +12,7 @@@
   #include <net/net_namespace.h>
   
   extern struct files_struct init_files;
+ extern struct fs_struct init_fs;
   
   #define INIT_KIOCTX(name, which_mm) \
   {                                                     \
@@@ -114,16 -115,6 +115,16 @@@ extern struct group_info init_groups
   
   extern struct cred init_cred;
   
+ +#ifdef CONFIG_PERF_COUNTERS
+ +# define INIT_PERF_COUNTERS(tsk)                                      \
+ +      .perf_counter_ctx.counter_list =                                \
+ +              LIST_HEAD_INIT(tsk.perf_counter_ctx.counter_list),      \
+ +      .perf_counter_ctx.lock =                                        \
+ +              __SPIN_LOCK_UNLOCKED(tsk.perf_counter_ctx.lock),
+ +#else
+ +# define INIT_PERF_COUNTERS(tsk)
+ +#endif
+ +
   /*
    *  INIT_TASK is used to set up the first task table, touch at
    * your own risk!. Base=0, limit=0x1fffff (=2MB)
@@@ -188,7 -179,6 +189,7 @@@
         INIT_IDS                                                        \
         INIT_TRACE_IRQFLAGS                                             \
         INIT_LOCKDEP                                                    \
+ +      INIT_PERF_COUNTERS(tsk)                                         \
   }
   
   
diff --combined include/linux/kernel_stat.h

index 1b2e3242497cc4e769bd561b5ac41ec630a10816,570d2041311911352da76c830073f7c9aa314457..ecfa668176341b2fc75e72fe69c42f97022ffc64
--- 1/include/linux/kernel_stat.h
--- 2/include/linux/kernel_stat.h
+++ b/include/linux/kernel_stat.h
@@@ -28,7 -28,9 +28,9 @@@ struct cpu_usage_stat 
   
   struct kernel_stat {
         struct cpu_usage_stat   cpustat;
-       unsigned int irqs[NR_IRQS];
+ #ifndef CONFIG_SPARSE_IRQ
+        unsigned int irqs[NR_IRQS];
+ #endif
   };
   
   DECLARE_PER_CPU(struct kernel_stat, kstat);
@@@ -39,6 -41,10 +41,10 @@@
   
   extern unsigned long long nr_context_switches(void);
   
+ #ifndef CONFIG_SPARSE_IRQ
+ #define kstat_irqs_this_cpu(irq) \
+       (kstat_this_cpu.irqs[irq])
+ 
   struct irq_desc;
   
   static inline void kstat_incr_irqs_this_cpu(unsigned int irq,
@@@ -46,11 -52,17 +52,17 @@@
   {
         kstat_this_cpu.irqs[irq]++;
   }
+ #endif
+ 
   
+ #ifndef CONFIG_SPARSE_IRQ
   static inline unsigned int kstat_irqs_cpu(unsigned int irq, int cpu)
   {
          return kstat_cpu(cpu).irqs[irq];
   }
+ #else
+ extern unsigned int kstat_irqs_cpu(unsigned int irq, int cpu);
+ #endif
   
   /*
    * Number of interrupts per specific IRQ source, since bootup
@@@ -66,19 -78,14 +78,22 @@@ static inline unsigned int kstat_irqs(u
         return sum;
   }
   
+ +
+ +/*
+ + * Lock/unlock the current runqueue - to extract task statistics:
+ + */
+ +extern void curr_rq_lock_irq_save(unsigned long *flags);
+ +extern void curr_rq_unlock_irq_restore(unsigned long *flags);
+ +extern unsigned long long __task_delta_exec(struct task_struct *tsk, int update);
   extern unsigned long long task_delta_exec(struct task_struct *);
- extern void account_user_time(struct task_struct *, cputime_t);
- extern void account_user_time_scaled(struct task_struct *, cputime_t);
- extern void account_system_time(struct task_struct *, int, cputime_t);
- extern void account_system_time_scaled(struct task_struct *, cputime_t);
- extern void account_steal_time(struct task_struct *, cputime_t);
+ +
+ extern void account_user_time(struct task_struct *, cputime_t, cputime_t);
+ extern void account_system_time(struct task_struct *, int, cputime_t, cputime_t);
+ extern void account_steal_time(cputime_t);
+ extern void account_idle_time(cputime_t);
+ 
+ extern void account_process_tick(struct task_struct *, int user);
+ extern void account_steal_ticks(unsigned long ticks);
+ extern void account_idle_ticks(unsigned long ticks);
   
   #endif /* _LINUX_KERNEL_STAT_H */
diff --combined include/linux/sched.h

index fc2c6f3477e7617cb6f135f19a5cd0bde53d3bb1,4cae9b81a1f8851d51a5380d8d37fa7ba3ceb529..f134a0f7080ad6c348eeec19febd8fd3fd35c5b8
--- 1/include/linux/sched.h
--- 2/include/linux/sched.h
+++ b/include/linux/sched.h
@@@ -71,7 -71,6 +71,7 @@@ struct sched_param 
   #include <linux/fs_struct.h>
   #include <linux/compiler.h>
   #include <linux/completion.h>
+ +#include <linux/perf_counter.h>
   #include <linux/pid.h>
   #include <linux/percpu.h>
   #include <linux/topology.h>
@@@ -251,7 -250,7 +251,7 @@@ extern void init_idle_bootup_task(struc
   extern int runqueue_is_locked(void);
   extern void task_rq_unlock_wait(struct task_struct *p);
   
- extern cpumask_t nohz_cpu_mask;
+ extern cpumask_var_t nohz_cpu_mask;
   #if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ)
   extern int select_nohz_load_balancer(int cpu);
   #else
@@@ -285,7 -284,6 +285,6 @@@ long io_schedule_timeout(long timeout)
   
   extern void cpu_init (void);
   extern void trap_init(void);
- extern void account_process_tick(struct task_struct *task, int user);
   extern void update_process_times(int user);
   extern void scheduler_tick(void);
   
@@@ -388,6 -386,9 +387,9 @@@ extern void arch_unmap_area_topdown(str
                 (mm)->hiwater_vm = (mm)->total_vm;      \
   } while (0)
   
+ #define get_mm_hiwater_rss(mm)        max((mm)->hiwater_rss, get_mm_rss(mm))
+ #define get_mm_hiwater_vm(mm) max((mm)->hiwater_vm, (mm)->total_vm)
+ 
   extern void set_dumpable(struct mm_struct *mm, int value);
   extern int get_dumpable(struct mm_struct *mm);
   
@@@ -759,20 -760,51 +761,51 @@@ enum cpu_idle_type 
   #define SD_SERIALIZE          1024    /* Only a single load balancing instance */
   #define SD_WAKE_IDLE_FAR      2048    /* Gain latency sacrificing cache hit */
   
- #define BALANCE_FOR_MC_POWER  \
-       (sched_smt_power_savings ? SD_POWERSAVINGS_BALANCE : 0)
+ enum powersavings_balance_level {
+       POWERSAVINGS_BALANCE_NONE = 0,  /* No power saving load balance */
+       POWERSAVINGS_BALANCE_BASIC,     /* Fill one thread/core/package
+                                        * first for long running threads
+                                        */
+       POWERSAVINGS_BALANCE_WAKEUP,    /* Also bias task wakeups to semi-idle
+                                        * cpu package for power savings
+                                        */
+       MAX_POWERSAVINGS_BALANCE_LEVELS
+ };
   
- #define BALANCE_FOR_PKG_POWER \
-       ((sched_mc_power_savings || sched_smt_power_savings) ?  \
-        SD_POWERSAVINGS_BALANCE : 0)
+ extern int sched_mc_power_savings, sched_smt_power_savings;
   
- #define test_sd_parent(sd, flag)      ((sd->parent &&         \
-                                        (sd->parent->flags & flag)) ? 1 : 0)
+ static inline int sd_balance_for_mc_power(void)
+ {
+       if (sched_smt_power_savings)
+               return SD_POWERSAVINGS_BALANCE;
   
+       return 0;
+ }
+ 
+ static inline int sd_balance_for_package_power(void)
+ {
+       if (sched_mc_power_savings | sched_smt_power_savings)
+               return SD_POWERSAVINGS_BALANCE;
+ 
+       return 0;
+ }
+ 
+ /*
+  * Optimise SD flags for power savings:
+  * SD_BALANCE_NEWIDLE helps aggressive task consolidation and power savings.
+  * Keep default SD flags if sched_{smt,mc}_power_savings=0
+  */
+ 
+ static inline int sd_power_saving_flags(void)
+ {
+       if (sched_mc_power_savings | sched_smt_power_savings)
+               return SD_BALANCE_NEWIDLE;
+ 
+       return 0;
+ }
   
   struct sched_group {
         struct sched_group *next;       /* Must be a circular list */
-       cpumask_t cpumask;
   
         /*
          * CPU power of this group, SCHED_LOAD_SCALE being max power for a
@@@ -785,8 -817,15 +818,15 @@@
          * (see include/linux/reciprocal_div.h)
          */
         u32 reciprocal_cpu_power;
+ 
+       unsigned long cpumask[];
   };
   
+ static inline struct cpumask *sched_group_cpus(struct sched_group *sg)
+ {
+       return to_cpumask(sg->cpumask);
+ }
+ 
   enum sched_domain_level {
         SD_LV_NONE = 0,
         SD_LV_SIBLING,
@@@ -810,7 -849,6 +850,6 @@@ struct sched_domain 
         struct sched_domain *parent;    /* top domain must be null terminated */
         struct sched_domain *child;     /* bottom domain must be null terminated */
         struct sched_group *groups;     /* the balancing groups of the domain */
-       cpumask_t span;                 /* span of all CPUs in this domain */
         unsigned long min_interval;     /* Minimum balance interval ms */
         unsigned long max_interval;     /* Maximum balance interval ms */
         unsigned int busy_factor;       /* less balancing by factor if busy */
@@@ -865,18 -903,34 +904,34 @@@
   #ifdef CONFIG_SCHED_DEBUG
         char *name;
   #endif
+ 
+       /* span of all CPUs in this domain */
+       unsigned long span[];
   };
   
- extern void partition_sched_domains(int ndoms_new, cpumask_t *doms_new,
+ static inline struct cpumask *sched_domain_span(struct sched_domain *sd)
+ {
+       return to_cpumask(sd->span);
+ }
+ 
+ extern void partition_sched_domains(int ndoms_new, struct cpumask *doms_new,
                                     struct sched_domain_attr *dattr_new);
- extern int arch_reinit_sched_domains(void);
+ 
+ /* Test a flag in parent sched domain */
+ static inline int test_sd_parent(struct sched_domain *sd, int flag)
+ {
+       if (sd->parent && (sd->parent->flags & flag))
+               return 1;
+ 
+       return 0;
+ }
   
   #else /* CONFIG_SMP */
   
   struct sched_domain_attr;
   
   static inline void
- partition_sched_domains(int ndoms_new, cpumask_t *doms_new,
+ partition_sched_domains(int ndoms_new, struct cpumask *doms_new,
                         struct sched_domain_attr *dattr_new)
   {
   }
@@@ -927,7 -981,7 +982,7 @@@ struct sched_class 
         void (*task_wake_up) (struct rq *this_rq, struct task_struct *task);
   
         void (*set_cpus_allowed)(struct task_struct *p,
-                                const cpumask_t *newmask);
+                                const struct cpumask *newmask);
   
         void (*rq_online)(struct rq *rq);
         void (*rq_offline)(struct rq *rq);
@@@ -977,8 -1031,6 +1032,8 @@@ struct sched_entity 
         u64                     last_wakeup;
         u64                     avg_overlap;
   
+ +      u64                     nr_migrations;
+ +
   #ifdef CONFIG_SCHEDSTATS
         u64                     wait_start;
         u64                     wait_max;
@@@ -994,6 -1046,7 +1049,6 @@@
         u64                     exec_max;
         u64                     slice_max;
   
- -      u64                     nr_migrations;
         u64                     nr_migrations_cold;
         u64                     nr_failed_migrations_affine;
         u64                     nr_failed_migrations_running;
@@@ -1296,7 -1349,6 +1351,7 @@@ struct task_struct 
         struct list_head pi_state_list;
         struct futex_pi_state *pi_state_cache;
   #endif
+ +      struct perf_counter_context perf_counter_ctx;
   #ifdef CONFIG_NUMA
         struct mempolicy *mempolicy;
         short il_next;
@@@ -1582,12 -1634,12 +1637,12 @@@ extern cputime_t task_gtime(struct task
   
   #ifdef CONFIG_SMP
   extern int set_cpus_allowed_ptr(struct task_struct *p,
-                               const cpumask_t *new_mask);
+                               const struct cpumask *new_mask);
   #else
   static inline int set_cpus_allowed_ptr(struct task_struct *p,
-                                      const cpumask_t *new_mask)
+                                      const struct cpumask *new_mask)
   {
-       if (!cpu_isset(0, *new_mask))
+       if (!cpumask_test_cpu(0, new_mask))
                 return -EINVAL;
         return 0;
   }
@@@ -1654,16 -1706,16 +1709,16 @@@ extern void wake_up_idle_cpu(int cpu)
   static inline void wake_up_idle_cpu(int cpu) { }
   #endif
   
- #ifdef CONFIG_SCHED_DEBUG
   extern unsigned int sysctl_sched_latency;
   extern unsigned int sysctl_sched_min_granularity;
   extern unsigned int sysctl_sched_wakeup_granularity;
+ extern unsigned int sysctl_sched_shares_ratelimit;
+ extern unsigned int sysctl_sched_shares_thresh;
+ #ifdef CONFIG_SCHED_DEBUG
   extern unsigned int sysctl_sched_child_runs_first;
   extern unsigned int sysctl_sched_features;
   extern unsigned int sysctl_sched_migration_cost;
   extern unsigned int sysctl_sched_nr_migrate;
- extern unsigned int sysctl_sched_shares_ratelimit;
- extern unsigned int sysctl_sched_shares_thresh;
   
   int sched_nr_latency_handler(struct ctl_table *table, int write,
                 struct file *file, void __user *buffer, size_t *length,
@@@ -2198,10 -2250,8 +2253,8 @@@ __trace_special(void *__tr, void *__dat
   }
   #endif
   
- extern long sched_setaffinity(pid_t pid, const cpumask_t *new_mask);
- extern long sched_getaffinity(pid_t pid, cpumask_t *mask);
- 
- extern int sched_mc_power_savings, sched_smt_power_savings;
+ extern long sched_setaffinity(pid_t pid, const struct cpumask *new_mask);
+ extern long sched_getaffinity(pid_t pid, struct cpumask *mask);
   
   extern void normalize_rt_tasks(void);
   
@@@ -2272,13 -2322,6 +2325,13 @@@ static inline void inc_syscw(struct tas
   #define TASK_SIZE_OF(tsk)     TASK_SIZE
   #endif
   
+ +/*
+ + * Call the function if the target task is executing on a CPU right now:
+ + */
+ +extern void task_oncpu_function_call(struct task_struct *p,
+ +                                   void (*func) (void *info), void *info);
+ +
+ +
   #ifdef CONFIG_MM_OWNER
   extern void mm_update_next_owner(struct mm_struct *mm);
   extern void mm_init_owner(struct mm_struct *mm, struct task_struct *p);
diff --combined include/linux/syscalls.h

index a549678b7c3c51697d340ebc7bd5689fdac4f62d,18d0a243a7b3a8cd98e05386c10ac697258bbb56..a1d177ce0a08edf101bc26e27416b6102cec43ef
--- 1/include/linux/syscalls.h
--- 2/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@@ -54,7 -54,6 +54,7 @@@ struct compat_stat
   struct compat_timeval;
   struct robust_list_head;
   struct getcpu_cache;
+ +struct perf_counter_hw_event;
   
   #include <linux/types.h>
   #include <linux/aio_abi.h>
@@@ -550,7 -549,7 +550,7 @@@ asmlinkage long sys_inotify_init(void)
   asmlinkage long sys_inotify_init1(int flags);
   asmlinkage long sys_inotify_add_watch(int fd, const char __user *path,
                                         u32 mask);
- asmlinkage long sys_inotify_rm_watch(int fd, u32 wd);
+ asmlinkage long sys_inotify_rm_watch(int fd, __s32 wd);
   
   asmlinkage long sys_spu_run(int fd, __u32 __user *unpc,
                                  __u32 __user *ustatus);
@@@ -625,11 -624,4 +625,11 @@@ asmlinkage long sys_fallocate(int fd, i
   
   int kernel_execve(const char *filename, char *const argv[], char *const envp[]);
   
+ +
+ +asmlinkage int sys_perf_counter_open(
+ +
+ +      struct perf_counter_hw_event    *hw_event_uptr          __user,
+ +      pid_t                           pid,
+ +      int                             cpu,
+ +      int                             group_fd);
   #endif
diff --combined init/Kconfig

index c38ae71a5e195c5c58d657bb5f5ec00cb3fd905b,a724a149bf3f232aba760c6b584db3f52dfd7e32..a588cdc274bcae501f736f0cbc06b4bf94d37669
--- 1/init/Kconfig
--- 2/init/Kconfig
+++ b/init/Kconfig
@@@ -271,59 -271,6 +271,6 @@@ config LOG_BUF_SHIF
                      13 =>  8 KB
                      12 =>  4 KB
   
- config CGROUPS
-       bool "Control Group support"
-       help
-         This option will let you use process cgroup subsystems
-         such as Cpusets
- 
-         Say N if unsure.
- 
- config CGROUP_DEBUG
-       bool "Example debug cgroup subsystem"
-       depends on CGROUPS
-       default n
-       help
-         This option enables a simple cgroup subsystem that
-         exports useful debugging information about the cgroups
-         framework
- 
-         Say N if unsure
- 
- config CGROUP_NS
-         bool "Namespace cgroup subsystem"
-         depends on CGROUPS
-         help
-           Provides a simple namespace cgroup subsystem to
-           provide hierarchical naming of sets of namespaces,
-           for instance virtual servers and checkpoint/restart
-           jobs.
- 
- config CGROUP_FREEZER
-         bool "control group freezer subsystem"
-         depends on CGROUPS
-         help
-           Provides a way to freeze and unfreeze all tasks in a
-         cgroup.
- 
- config CGROUP_DEVICE
-       bool "Device controller for cgroups"
-       depends on CGROUPS && EXPERIMENTAL
-       help
-         Provides a cgroup implementing whitelists for devices which
-         a process in the cgroup can mknod or open.
- 
- config CPUSETS
-       bool "Cpuset support"
-       depends on SMP && CGROUPS
-       help
-         This option will let you create and manage CPUSETs which
-         allow dynamically partitioning a system into sets of CPUs and
-         Memory Nodes and assigning tasks to run only within those sets.
-         This is primarily useful on large SMP or NUMA systems.
- 
-         Say N if unsure.
- 
   #
   # Architectures with an unreliable sched_clock() should select this:
   #
@@@ -337,6 -284,8 +284,8 @@@ config GROUP_SCHE
         help
           This feature lets CPU scheduler recognize task groups and control CPU
           bandwidth allocation to such task groups.
+         In order to create a group from arbitrary set of processes, use
+         CONFIG_CGROUPS. (See Control Group support.)
   
   config FAIR_GROUP_SCHED
         bool "Group scheduling for SCHED_OTHER"
@@@ -379,6 -328,66 +328,66 @@@ config CGROUP_SCHE
   
   endchoice
   
+ menu "Control Group support"
+ config CGROUPS
+       bool "Control Group support"
+       help
+         This option adds support for grouping sets of processes together, for
+         use with process control subsystems such as Cpusets, CFS, memory
+         controls or device isolation.
+         See
+               - Documentation/cpusets.txt     (Cpusets)
+               - Documentation/scheduler/sched-design-CFS.txt  (CFS)
+               - Documentation/cgroups/ (features for grouping, isolation)
+               - Documentation/controllers/ (features for resource control)
+ 
+         Say N if unsure.
+ 
+ config CGROUP_DEBUG
+       bool "Example debug cgroup subsystem"
+       depends on CGROUPS
+       default n
+       help
+         This option enables a simple cgroup subsystem that
+         exports useful debugging information about the cgroups
+         framework
+ 
+         Say N if unsure
+ 
+ config CGROUP_NS
+         bool "Namespace cgroup subsystem"
+         depends on CGROUPS
+         help
+           Provides a simple namespace cgroup subsystem to
+           provide hierarchical naming of sets of namespaces,
+           for instance virtual servers and checkpoint/restart
+           jobs.
+ 
+ config CGROUP_FREEZER
+         bool "control group freezer subsystem"
+         depends on CGROUPS
+         help
+           Provides a way to freeze and unfreeze all tasks in a
+         cgroup.
+ 
+ config CGROUP_DEVICE
+       bool "Device controller for cgroups"
+       depends on CGROUPS && EXPERIMENTAL
+       help
+         Provides a cgroup implementing whitelists for devices which
+         a process in the cgroup can mknod or open.
+ 
+ config CPUSETS
+       bool "Cpuset support"
+       depends on SMP && CGROUPS
+       help
+         This option will let you create and manage CPUSETs which
+         allow dynamically partitioning a system into sets of CPUs and
+         Memory Nodes and assigning tasks to run only within those sets.
+         This is primarily useful on large SMP or NUMA systems.
+ 
+         Say N if unsure.
+ 
   config CGROUP_CPUACCT
         bool "Simple CPU accounting cgroup subsystem"
         depends on CGROUPS
@@@ -393,9 -402,6 +402,6 @@@ config RESOURCE_COUNTER
             infrastructure that works with cgroups
         depends on CGROUPS
   
- config MM_OWNER
-       bool
- 
   config CGROUP_MEM_RES_CTLR
         bool "Memory Resource Controller for Control Groups"
         depends on CGROUPS && RESOURCE_COUNTERS
@@@ -414,36 -420,68 +420,68 @@@
           sure you need the memory resource controller. Even when you enable
           this, you can set "cgroup_disable=memory" at your boot option to
           disable memory resource controller and you can avoid overheads.
-         (and lose benefits of memory resource contoller)
+         (and lose benefits of memory resource controller)
   
           This config option also selects MM_OWNER config option, which
           could in turn add some fork/exit overhead.
   
+ config MM_OWNER
+       bool
+ 
+ config CGROUP_MEM_RES_CTLR_SWAP
+       bool "Memory Resource Controller Swap Extension(EXPERIMENTAL)"
+       depends on CGROUP_MEM_RES_CTLR && SWAP && EXPERIMENTAL
+       help
+         Add swap management feature to memory resource controller. When you
+         enable this, you can limit mem+swap usage per cgroup. In other words,
+         when you disable this, the memory resource controller does not
+         track swap usage, so a process can exhaust all of the swap. This
+         extension is useful when you want to avoid swap exhaustion, but it
+         adds overhead and consumes memory for remembering information.
+         Especially if you use 32bit system or small memory system, please
+         be careful about enabling this. When memory resource controller
+         is disabled by boot option, this will be automatically disabled and
+         there will be no overhead from this. Even when you set this config=y,
+         if boot option "noswapaccount" is set, swap will not be accounted.
+ 
+ 
+ endmenu
+ 
   config SYSFS_DEPRECATED
         bool
   
   config SYSFS_DEPRECATED_V2
-       bool "Create deprecated sysfs files"
+       bool "Create deprecated sysfs layout for older userspace tools"
         depends on SYSFS
         default y
         select SYSFS_DEPRECATED
         help
-         This option creates deprecated symlinks such as the
-         "device"-link, the <subsystem>:<name>-link, and the
-         "bus"-link. It may also add deprecated key in the
-         uevent environment.
-         None of these features or values should be used today, as
-         they export driver core implementation details to userspace
-         or export properties which can't be kept stable across kernel
-         releases.
- 
-         If enabled, this option will also move any device structures
-         that belong to a class, back into the /sys/class hierarchy, in
-         order to support older versions of udev and some userspace
-         programs.
- 
-         If you are using a distro with the most recent userspace
-         packages, it should be safe to say N here.
+         This option switches the layout of sysfs to the deprecated
+         version.
+ 
+         The current sysfs layout features a unified device tree at
+         /sys/devices/, which is able to express a hierarchy between
+         class devices. If the deprecated option is set to Y, the
+         unified device tree is split into a bus device tree at
+         /sys/devices/ and several individual class device trees at
+         /sys/class/. The class and bus devices will be connected by
+         "<subsystem>:<name>" and the "device" links. The "block"
+         class devices, will not show up in /sys/class/block/. Some
+         subsystems will suppress the creation of some devices which
+         depend on the unified device tree.
+ 
+         This option is not a pure compatibility option that can
+         be safely enabled on newer distributions. It will change the
+         layout of sysfs to the non-extensible deprecated version,
+         and disable some features, which can not be exported without
+         confusing older userspace tools. Since 2007/2008 all major
+         distributions do not enable this option, and ship no tools which
+         depend on the deprecated layout or this option.
+ 
+         If you are using a new kernel on an older distribution, or use
+         older userspace tools, you might need to say Y here. Do not say Y,
+         if the original kernel, that came with your distribution, has
+         this option set to N.
   
   config PROC_PID_CPUSET
         bool "Include legacy /proc/<pid>/cpuset file"
@@@ -739,36 -777,6 +777,36 @@@ config AI
             by some high performance threaded applications. Disabling
             this option saves about 7k.
   
+ +config HAVE_PERF_COUNTERS
+ +      bool
+ +
+ +menu "Performance Counters"
+ +
+ +config PERF_COUNTERS
+ +      bool "Kernel Performance Counters"
+ +      depends on HAVE_PERF_COUNTERS
+ +      default y
+ +      select ANON_INODES
+ +      help
+ +        Enable kernel support for performance counter hardware.
+ +
+ +        Performance counters are special hardware registers available
+ +        on most modern CPUs. These registers count the number of certain
+ +        types of hw events, such as instructions executed, cache misses
+ +        suffered, or branches mis-predicted - without slowing down the
+ +        kernel or applications. These registers can also trigger interrupts
+ +        when a threshold number of events have passed - and can thus be
+ +        used to profile the code that runs on that CPU.
+ +
+ +        The Linux Performance Counter subsystem provides an abstraction of
+ +        these hardware capabilities, available via a system call. It
+ +        provides per task and per CPU counters, and it provides event
+ +        capabilities on top of those.
+ +
+ +        Say Y if unsure.
+ +
+ +endmenu
+ +
   config VM_EVENT_COUNTERS
         default y
         bool "Enable VM event counters for /proc/vmstat" if EMBEDDED
@@@ -868,10 -876,6 +906,6 @@@ config RT_MUTEXE
         boolean
         select PLIST
   
- config TINY_SHMEM
-       default !SHMEM
-       bool
- 
   config BASE_SMALL
         int
         default 0 if BASE_FULL
@@@ -946,14 -950,17 +980,17 @@@ config MODULE_SRCVERSION_AL
           the version).  With this option, such a "srcversion" field
           will be created for all modules.  If unsure, say N.
   
- config KMOD
-       def_bool y
-       help
-         This is being removed soon.  These days, CONFIG_MODULES
-         implies CONFIG_KMOD, so use that instead.
- 
   endif # MODULES
   
+ config INIT_ALL_POSSIBLE
+       bool
+       help
+         Back when each arch used to define their own cpu_online_map and
+         cpu_possible_map, some of them chose to initialize cpu_possible_map
+         with all 1s, and others with all 0s.  When they were centralised,
+         it was better to provide this option than to break all the archs
+         and have several arch maintainers pursuing me down dark alleys.
+ 
   config STOP_MACHINE
         bool
         default y
@@@ -966,10 -973,90 +1003,90 @@@ source "block/Kconfig
   config PREEMPT_NOTIFIERS
         bool
   
+ choice
+       prompt "RCU Implementation"
+       default CLASSIC_RCU
+ 
   config CLASSIC_RCU
-       def_bool !PREEMPT_RCU
+       bool "Classic RCU"
         help
           This option selects the classic RCU implementation that is
           designed for best read-side performance on non-realtime
-         systems.  Classic RCU is the default.  Note that the
-         PREEMPT_RCU symbol is used to select/deselect this option.
+         systems.
+ 
+         Select this option if you are unsure.
+ 
+ config TREE_RCU
+       bool "Tree-based hierarchical RCU"
+       help
+         This option selects the RCU implementation that is
+         designed for very large SMP systems with hundreds or
+         thousands of CPUs.
+ 
+ config PREEMPT_RCU
+       bool "Preemptible RCU"
+       depends on PREEMPT
+       help
+         This option reduces the latency of the kernel by making certain
+         RCU sections preemptible. Normally RCU code is non-preemptible; if
+         this option is selected, read-only RCU sections become
+         preemptible. This helps latency, but may expose bugs due to
+         now-naive assumptions about each RCU read-side critical section
+         remaining on a given CPU through its execution.
+ 
+ endchoice
+ 
+ config RCU_TRACE
+       bool "Enable tracing for RCU"
+       depends on TREE_RCU || PREEMPT_RCU
+       help
+         This option provides tracing in RCU which presents stats
+         in debugfs for debugging RCU implementation.
+ 
+         Say Y here if you want to enable RCU tracing.
+         Say N if you are unsure.
+ 
+ config RCU_FANOUT
+       int "Tree-based hierarchical RCU fanout value"
+       range 2 64 if 64BIT
+       range 2 32 if !64BIT
+       depends on TREE_RCU
+       default 64 if 64BIT
+       default 32 if !64BIT
+       help
+         This option controls the fanout of hierarchical implementations
+         of RCU, allowing RCU to work efficiently on machines with
+         large numbers of CPUs.  This value must be at least the cube
+         root of NR_CPUS, which allows NR_CPUS up to 32,768 for 32-bit
+         systems and up to 262,144 for 64-bit systems.
+ 
+         Select a specific number if testing RCU itself.
+         Take the default if unsure.
+ 
+ config RCU_FANOUT_EXACT
+       bool "Disable tree-based hierarchical RCU auto-balancing"
+       depends on TREE_RCU
+       default n
+       help
+         This option forces use of the exact RCU_FANOUT value specified,
+         regardless of imbalances in the hierarchy.  This is useful for
+         testing RCU itself, and might one day be useful on systems with
+         strong NUMA behavior.
+ 
+         Without RCU_FANOUT_EXACT, the code will balance the hierarchy.
+ 
+         Say N if unsure.
+ 
+ config TREE_RCU_TRACE
+       def_bool RCU_TRACE && TREE_RCU
+       select DEBUG_FS
+       help
+         This option provides tracing for the TREE_RCU implementation,
+         permitting Makefile to trivially select kernel/rcutree_trace.c.
+ 
+ config PREEMPT_RCU_TRACE
+       def_bool RCU_TRACE && PREEMPT_RCU
+       select DEBUG_FS
+       help
+         This option provides tracing for the PREEMPT_RCU implementation,
+         permitting Makefile to trivially select kernel/rcupreempt_trace.c.
diff --combined kernel/Makefile

index 4476da868f86a8b52b0d1c1c87fbf43bf85b69be,2921d90ce32fd760000622032f6569b5ab413d71..8b2628c7914b039a90be2647b1b8926751d2dc7a
--- 1/kernel/Makefile
--- 2/kernel/Makefile
+++ b/kernel/Makefile
@@@ -9,7 -9,8 +9,8 @@@ obj-y     = sched.o fork.o exec_domain.
             rcupdate.o extable.o params.o posix-timers.o \
             kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \
             hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \
-           notifier.o ksysfs.o pm_qos_params.o sched_clock.o cred.o
+           notifier.o ksysfs.o pm_qos_params.o sched_clock.o cred.o \
+           async.o
   
   ifdef CONFIG_FUNCTION_TRACER
   # Do not trace debug files and internal ftrace files
@@@ -73,10 -74,10 +74,10 @@@ obj-$(CONFIG_GENERIC_HARDIRQS) += irq
   obj-$(CONFIG_SECCOMP) += seccomp.o
   obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o
   obj-$(CONFIG_CLASSIC_RCU) += rcuclassic.o
+ obj-$(CONFIG_TREE_RCU) += rcutree.o
   obj-$(CONFIG_PREEMPT_RCU) += rcupreempt.o
- ifeq ($(CONFIG_PREEMPT_RCU),y)
- obj-$(CONFIG_RCU_TRACE) += rcupreempt_trace.o
- endif
+ obj-$(CONFIG_TREE_RCU_TRACE) += rcutree_trace.o
+ obj-$(CONFIG_PREEMPT_RCU_TRACE) += rcupreempt_trace.o
   obj-$(CONFIG_RELAY) += relay.o
   obj-$(CONFIG_SYSCTL) += utsname_sysctl.o
   obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o
@@@ -88,7 -89,6 +89,7 @@@ obj-$(CONFIG_HAVE_GENERIC_DMA_COHERENT
   obj-$(CONFIG_FUNCTION_TRACER) += trace/
   obj-$(CONFIG_TRACING) += trace/
   obj-$(CONFIG_SMP) += sched_cpupri.o
+ +obj-$(CONFIG_PERF_COUNTERS) += perf_counter.o
   
   ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y)
   # According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is
diff --combined kernel/exit.c

index ad8d04d83a2e1a0c55e53c6cb0814bfb241f839c,c7740fa3252cb0ab24d8f07bd89acf47006c3d60..cbdb39a498ebc3e60c4b81fc9925487826fb1a45
--- 1/kernel/exit.c
--- 2/kernel/exit.c
+++ b/kernel/exit.c
@@@ -159,9 -159,6 +159,9 @@@ static void delayed_put_task_struct(str
   {
         struct task_struct *tsk = container_of(rhp, struct task_struct, rcu);
   
+ +#ifdef CONFIG_PERF_COUNTERS
+ +      WARN_ON_ONCE(!list_empty(&tsk->perf_counter_ctx.counter_list));
+ +#endif
         trace_sched_process_free(tsk);
         put_task_struct(tsk);
   }
@@@ -645,35 -642,31 +645,31 @@@ retry
         /*
          * We found no owner yet mm_users > 1: this implies that we are
          * most likely racing with swapoff (try_to_unuse()) or /proc or
-        * ptrace or page migration (get_task_mm()).  Mark owner as NULL,
-        * so that subsystems can understand the callback and take action.
+        * ptrace or page migration (get_task_mm()).  Mark owner as NULL.
          */
-       down_write(&mm->mmap_sem);
-       cgroup_mm_owner_callbacks(mm->owner, NULL);
         mm->owner = NULL;
-       up_write(&mm->mmap_sem);
         return;
   
   assign_new_owner:
         BUG_ON(c == p);
         get_task_struct(c);
-       read_unlock(&tasklist_lock);
-       down_write(&mm->mmap_sem);
         /*
          * The task_lock protects c->mm from changing.
          * We always want mm->owner->mm == mm
          */
         task_lock(c);
+       /*
+        * Delay read_unlock() till we have the task_lock()
+        * to ensure that c does not slip away underneath us
+        */
+       read_unlock(&tasklist_lock);
         if (c->mm != mm) {
                 task_unlock(c);
-               up_write(&mm->mmap_sem);
                 put_task_struct(c);
                 goto retry;
         }
-       cgroup_mm_owner_callbacks(mm->owner, c);
         mm->owner = c;
         task_unlock(c);
-       up_write(&mm->mmap_sem);
         put_task_struct(c);
   }
   #endif /* CONFIG_MM_OWNER */
@@@ -1040,8 -1033,6 +1036,6 @@@ NORET_TYPE void do_exit(long code
                  * task into the wait for ever nirwana as well.
                  */
                 tsk->flags |= PF_EXITPIDONE;
-               if (tsk->io_context)
-                       exit_io_context();
                 set_current_state(TASK_UNINTERRUPTIBLE);
                 schedule();
         }
@@@ -1060,10 -1051,7 +1054,7 @@@
                                 preempt_count());
   
         acct_update_integrals(tsk);
-       if (tsk->mm) {
-               update_hiwater_rss(tsk->mm);
-               update_hiwater_vm(tsk->mm);
-       }
+ 
         group_dead = atomic_dec_and_test(&tsk->signal->live);
         if (group_dead) {
                 hrtimer_cancel(&tsk->signal->real_timer);
@@@ -1105,6 -1093,10 +1096,6 @@@
         tsk->mempolicy = NULL;
   #endif
   #ifdef CONFIG_FUTEX
- -      /*
- -       * This must happen late, after the PID is not
- -       * hashed anymore:
- -       */
         if (unlikely(!list_empty(&tsk->pi_state_list)))
                 exit_pi_state_list(tsk);
         if (unlikely(current->pi_state_cache))
@@@ -1327,10 -1319,10 +1318,10 @@@ static int wait_task_zombie(struct task
                  * group, which consolidates times for all threads in the
                  * group including the group leader.
                  */
+               thread_group_cputime(p, &cputime);
                 spin_lock_irq(&p->parent->sighand->siglock);
                 psig = p->parent->signal;
                 sig = p->signal;
-               thread_group_cputime(p, &cputime);
                 psig->cutime =
                         cputime_add(psig->cutime,
                         cputime_add(cputime.utime,
@@@ -1369,12 -1361,6 +1360,12 @@@
          */
         read_unlock(&tasklist_lock);
   
+ +      /*
+ +       * Flush inherited counters to the parent - before the parent
+ +       * gets woken up by child-exit notifications.
+ +       */
+ +      perf_counter_exit_task(p);
+ +
         retval = ru ? getrusage(p, RUSAGE_BOTH, ru) : 0;
         status = (p->signal->flags & SIGNAL_GROUP_EXIT)
                 ? p->signal->group_exit_code : p->exit_code;
diff --combined kernel/fork.c

index cb706599057f1f462b9b3714c94b9bf5e96795b9,1d68f1255dd824cf2fd1c06a1e6464bac718d26f..b1f8609287ebfa6d22bb1da979473ad923ae1f9e
--- 1/kernel/fork.c
--- 2/kernel/fork.c
+++ b/kernel/fork.c
@@@ -400,6 -400,18 +400,18 @@@ __cacheline_aligned_in_smp DEFINE_SPINL
   #define allocate_mm() (kmem_cache_alloc(mm_cachep, GFP_KERNEL))
   #define free_mm(mm)   (kmem_cache_free(mm_cachep, (mm)))
   
+ static unsigned long default_dump_filter = MMF_DUMP_FILTER_DEFAULT;
+ 
+ static int __init coredump_filter_setup(char *s)
+ {
+       default_dump_filter =
+               (simple_strtoul(s, NULL, 0) << MMF_DUMP_FILTER_SHIFT) &
+               MMF_DUMP_FILTER_MASK;
+       return 1;
+ }
+ 
+ __setup("coredump_filter=", coredump_filter_setup);
+ 
   #include <linux/init_task.h>
   
   static struct mm_struct * mm_init(struct mm_struct * mm, struct task_struct *p)
@@@ -408,15 -420,14 +420,14 @@@
         atomic_set(&mm->mm_count, 1);
         init_rwsem(&mm->mmap_sem);
         INIT_LIST_HEAD(&mm->mmlist);
-       mm->flags = (current->mm) ? current->mm->flags
-                                 : MMF_DUMP_FILTER_DEFAULT;
+       mm->flags = (current->mm) ? current->mm->flags : default_dump_filter;
         mm->core_state = NULL;
         mm->nr_ptes = 0;
         set_mm_counter(mm, file_rss, 0);
         set_mm_counter(mm, anon_rss, 0);
         spin_lock_init(&mm->page_table_lock);
-       rwlock_init(&mm->ioctx_list_lock);
-       mm->ioctx_list = NULL;
+       spin_lock_init(&mm->ioctx_lock);
+       INIT_HLIST_HEAD(&mm->ioctx_list);
         mm->free_area_cache = TASK_UNMAPPED_BASE;
         mm->cached_hole_size = ~0UL;
         mm_init_owner(mm, p);
@@@ -758,7 -769,7 +769,7 @@@ static int copy_sighand(unsigned long c
   {
         struct sighand_struct *sig;
   
-       if (clone_flags & (CLONE_SIGHAND | CLONE_THREAD)) {
+       if (clone_flags & CLONE_SIGHAND) {
                 atomic_inc(&current->sighand->count);
                 return 0;
         }
@@@ -974,7 -985,6 +985,7 @@@ static struct task_struct *copy_process
                 goto fork_out;
   
         rt_mutex_init_task(p);
+ +      perf_counter_init_task(p);
   
   #ifdef CONFIG_PROVE_LOCKING
         DEBUG_LOCKS_WARN_ON(!p->hardirqs_enabled);
@@@ -1116,12 -1126,12 +1127,12 @@@
   
         if (pid != &init_struct_pid) {
                 retval = -ENOMEM;
-               pid = alloc_pid(task_active_pid_ns(p));
+               pid = alloc_pid(p->nsproxy->pid_ns);
                 if (!pid)
                         goto bad_fork_cleanup_io;
   
                 if (clone_flags & CLONE_NEWPID) {
-                       retval = pid_ns_prepare_proc(task_active_pid_ns(p));
+                       retval = pid_ns_prepare_proc(p->nsproxy->pid_ns);
                         if (retval < 0)
                                 goto bad_fork_free_pid;
                 }
@@@ -1471,12 -1481,10 +1482,10 @@@ void __init proc_caches_init(void
         fs_cachep = kmem_cache_create("fs_cache",
                         sizeof(struct fs_struct), 0,
                         SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
-       vm_area_cachep = kmem_cache_create("vm_area_struct",
-                       sizeof(struct vm_area_struct), 0,
-                       SLAB_PANIC, NULL);
         mm_cachep = kmem_cache_create("mm_struct",
                         sizeof(struct mm_struct), ARCH_MIN_MMSTRUCT_ALIGN,
                         SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
+       mmap_init();
   }
   
   /*
diff --combined kernel/sched.c

index 3dfbff5fb1ac8456a49bcd85d768157020c70a96,deb5ac8c12f37c44e71dcc46484149d073430948..43fd21233b93bb1c350673621f64c1508217fb20
--- 1/kernel/sched.c
--- 2/kernel/sched.c
+++ b/kernel/sched.c
@@@ -209,7 -209,6 +209,6 @@@ void init_rt_bandwidth(struct rt_bandwi
         hrtimer_init(&rt_b->rt_period_timer,
                         CLOCK_MONOTONIC, HRTIMER_MODE_REL);
         rt_b->rt_period_timer.function = sched_rt_period_timer;
-       rt_b->rt_period_timer.cb_mode = HRTIMER_CB_IRQSAFE_UNLOCKED;
   }
   
   static inline int rt_bandwidth_enabled(void)
@@@ -499,18 -498,26 +498,26 @@@ struct rt_rq 
    */
   struct root_domain {
         atomic_t refcount;
-       cpumask_t span;
-       cpumask_t online;
+       cpumask_var_t span;
+       cpumask_var_t online;
   
         /*
          * The "RT overload" flag: it gets set if a CPU has more than
          * one runnable RT task.
          */
-       cpumask_t rto_mask;
+       cpumask_var_t rto_mask;
         atomic_t rto_count;
   #ifdef CONFIG_SMP
         struct cpupri cpupri;
   #endif
+ #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
+       /*
+        * Preferred wake up cpu nominated by sched_mc balance that will be
+        * used when most cpus are idle in the system indicating overall very
+        * low system utilisation. Triggered at POWERSAVINGS_BALANCE_WAKEUP(2)
+        */
+       unsigned int sched_mc_preferred_wakeup_cpu;
+ #endif
   };
   
   /*
@@@ -658,7 -665,7 +665,7 @@@ static inline int cpu_of(struct rq *rq
   #define task_rq(p)            cpu_rq(task_cpu(p))
   #define cpu_curr(cpu)         (cpu_rq(cpu)->curr)
   
- -static inline void update_rq_clock(struct rq *rq)
+ +inline void update_rq_clock(struct rq *rq)
   {
         rq->clock = sched_clock_cpu(cpu_of(rq));
   }
@@@ -969,26 -976,6 +976,26 @@@ static struct rq *task_rq_lock(struct t
         }
   }
   
+ +void curr_rq_lock_irq_save(unsigned long *flags)
+ +      __acquires(rq->lock)
+ +{
+ +      struct rq *rq;
+ +
+ +      local_irq_save(*flags);
+ +      rq = cpu_rq(smp_processor_id());
+ +      spin_lock(&rq->lock);
+ +}
+ +
+ +void curr_rq_unlock_irq_restore(unsigned long *flags)
+ +      __releases(rq->lock)
+ +{
+ +      struct rq *rq;
+ +
+ +      rq = cpu_rq(smp_processor_id());
+ +      spin_unlock(&rq->lock);
+ +      local_irq_restore(*flags);
+ +}
+ +
   void task_rq_unlock_wait(struct task_struct *p)
   {
         struct rq *rq = task_rq(p);
@@@ -1159,7 -1146,6 +1166,6 @@@ static void init_rq_hrtick(struct rq *r
   
         hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
         rq->hrtick_timer.function = hrtick;
-       rq->hrtick_timer.cb_mode = HRTIMER_CB_IRQSAFE_PERCPU;
   }
   #else /* CONFIG_SCHED_HRTICK */
   static inline void hrtick_clear(struct rq *rq)
@@@ -1536,7 -1522,7 +1542,7 @@@ static int tg_shares_up(struct task_gro
         struct sched_domain *sd = data;
         int i;
   
-       for_each_cpu_mask(i, sd->span) {
+       for_each_cpu(i, sched_domain_span(sd)) {
                 /*
                  * If there are currently no tasks on the cpu pretend there
                  * is one of average load so that when a new task gets to
@@@ -1557,7 -1543,7 +1563,7 @@@
         if (!sd->parent || !(sd->parent->flags & SD_LOAD_BALANCE))
                 shares = tg->shares;
   
-       for_each_cpu_mask(i, sd->span)
+       for_each_cpu(i, sched_domain_span(sd))
                 update_group_shares_cpu(tg, i, shares, rq_weight);
   
         return 0;
@@@ -1896,14 -1882,12 +1902,14 @@@ void set_task_cpu(struct task_struct *p
                 p->se.sleep_start -= clock_offset;
         if (p->se.block_start)
                 p->se.block_start -= clock_offset;
+ +#endif
         if (old_cpu != new_cpu) {
- -              schedstat_inc(p, se.nr_migrations);
+ +              p->se.nr_migrations++;
+ +#ifdef CONFIG_SCHEDSTATS
                 if (task_hot(p, old_rq->clock, NULL))
                         schedstat_inc(p, se.nr_forced2_migrations);
- -      }
   #endif
+ +      }
         p->se.vruntime -= old_cfsrq->min_vruntime -
                                          new_cfsrq->min_vruntime;
   
@@@ -2125,15 -2109,17 +2131,17 @@@ find_idlest_group(struct sched_domain *
                 int i;
   
                 /* Skip over this group if it has no CPUs allowed */
-               if (!cpus_intersects(group->cpumask, p->cpus_allowed))
+               if (!cpumask_intersects(sched_group_cpus(group),
+                                       &p->cpus_allowed))
                         continue;
   
-               local_group = cpu_isset(this_cpu, group->cpumask);
+               local_group = cpumask_test_cpu(this_cpu,
+                                              sched_group_cpus(group));
   
                 /* Tally up the load of all CPUs in the group */
                 avg_load = 0;
   
-               for_each_cpu_mask_nr(i, group->cpumask) {
+               for_each_cpu(i, sched_group_cpus(group)) {
                         /* Bias balancing toward cpus of our domain */
                         if (local_group)
                                 load = source_load(i, load_idx);
@@@ -2165,17 -2151,14 +2173,14 @@@
    * find_idlest_cpu - find the idlest cpu among the cpus in group.
    */
   static int
- find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu,
-               cpumask_t *tmp)
+ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
   {
         unsigned long load, min_load = ULONG_MAX;
         int idlest = -1;
         int i;
   
         /* Traverse only the allowed CPUs */
-       cpus_and(*tmp, group->cpumask, p->cpus_allowed);
- 
-       for_each_cpu_mask_nr(i, *tmp) {
+       for_each_cpu_and(i, sched_group_cpus(group), &p->cpus_allowed) {
                 load = weighted_cpuload(i);
   
                 if (load < min_load || (load == min_load && i == this_cpu)) {
@@@ -2217,7 -2200,6 +2222,6 @@@ static int sched_balance_self(int cpu, 
                 update_shares(sd);
   
         while (sd) {
-               cpumask_t span, tmpmask;
                 struct sched_group *group;
                 int new_cpu, weight;
   
@@@ -2226,14 -2208,13 +2230,13 @@@
                         continue;
                 }
   
-               span = sd->span;
                 group = find_idlest_group(sd, t, cpu);
                 if (!group) {
                         sd = sd->child;
                         continue;
                 }
   
-               new_cpu = find_idlest_cpu(group, t, cpu, &tmpmask);
+               new_cpu = find_idlest_cpu(group, t, cpu);
                 if (new_cpu == -1 || new_cpu == cpu) {
                         /* Now try balancing at a lower domain level of cpu */
                         sd = sd->child;
@@@ -2242,10 -2223,10 +2245,10 @@@
   
                 /* Now try balancing at a lower domain level of new_cpu */
                 cpu = new_cpu;
+               weight = cpumask_weight(sched_domain_span(sd));
                 sd = NULL;
-               weight = cpus_weight(span);
                 for_each_domain(cpu, tmp) {
-                       if (weight <= cpus_weight(tmp->span))
+                       if (weight <= cpumask_weight(sched_domain_span(tmp)))
                                 break;
                         if (tmp->flags & flag)
                                 sd = tmp;
@@@ -2258,27 -2239,6 +2261,27 @@@
   
   #endif /* CONFIG_SMP */
   
+ +/**
+ + * task_oncpu_function_call - call a function on the cpu on which a task runs
+ + * @p:                the task to evaluate
+ + * @func:     the function to be called
+ + * @info:     the function call argument
+ + *
+ + * Calls the function @func if the task is currently running. This might
+ + * be on the current CPU, in which case the function is called directly.
+ + */
+ +void task_oncpu_function_call(struct task_struct *p,
+ +                            void (*func) (void *info), void *info)
+ +{
+ +      int cpu;
+ +
+ +      preempt_disable();
+ +      cpu = task_cpu(p);
+ +      if (task_curr(p))
+ +              smp_call_function_single(cpu, func, info, 1);
+ +      preempt_enable();
+ +}
+ +
   /***
    * try_to_wake_up - wake up a thread
    * @p: the to-be-woken-up thread
@@@ -2311,7 -2271,7 +2314,7 @@@ static int try_to_wake_up(struct task_s
                 cpu = task_cpu(p);
   
                 for_each_domain(this_cpu, sd) {
-                       if (cpu_isset(cpu, sd->span)) {
+                       if (cpumask_test_cpu(cpu, sched_domain_span(sd))) {
                                 update_shares(sd);
                                 break;
                         }
@@@ -2360,7 -2320,7 +2363,7 @@@
         else {
                 struct sched_domain *sd;
                 for_each_domain(this_cpu, sd) {
-                       if (cpu_isset(cpu, sd->span)) {
+                       if (cpumask_test_cpu(cpu, sched_domain_span(sd))) {
                                 schedstat_inc(sd, ttwu_wake_remote);
                                 break;
                         }
@@@ -2421,7 -2381,6 +2424,7 @@@ static void __sched_fork(struct task_st
         p->se.exec_start                = 0;
         p->se.sum_exec_runtime          = 0;
         p->se.prev_sum_exec_runtime     = 0;
+ +      p->se.nr_migrations             = 0;
         p->se.last_wakeup               = 0;
         p->se.avg_overlap               = 0;
   
@@@ -2642,7 -2601,6 +2645,7 @@@ static void finish_task_switch(struct r
          */
         prev_state = prev->state;
         finish_arch_switch(prev);
+ +      perf_counter_task_sched_in(current, cpu_of(rq));
         finish_lock_switch(rq, prev);
   #ifdef CONFIG_SMP
         if (current->sched_class->post_schedule)
@@@ -2893,7 -2851,7 +2896,7 @@@ static void sched_migrate_task(struct t
         struct rq *rq;
   
         rq = task_rq_lock(p, &flags);
-       if (!cpu_isset(dest_cpu, p->cpus_allowed)
+       if (!cpumask_test_cpu(dest_cpu, &p->cpus_allowed)
             || unlikely(!cpu_active(dest_cpu)))
                 goto out;
   
@@@ -2958,7 -2916,7 +2961,7 @@@ int can_migrate_task(struct task_struc
          * 2) cannot be migrated to this CPU due to cpus_allowed, or
          * 3) are cache-hot on their current CPU.
          */
-       if (!cpu_isset(this_cpu, p->cpus_allowed)) {
+       if (!cpumask_test_cpu(this_cpu, &p->cpus_allowed)) {
                 schedstat_inc(p, se.nr_failed_migrations_affine);
                 return 0;
         }
@@@ -3133,7 -3091,7 +3136,7 @@@ static int move_one_task(struct rq *thi
   static struct sched_group *
   find_busiest_group(struct sched_domain *sd, int this_cpu,
                    unsigned long *imbalance, enum cpu_idle_type idle,
-                  int *sd_idle, const cpumask_t *cpus, int *balance)
+                  int *sd_idle, const struct cpumask *cpus, int *balance)
   {
         struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups;
         unsigned long max_load, avg_load, total_load, this_load, total_pwr;
@@@ -3169,10 -3127,11 +3172,11 @@@
                 unsigned long sum_avg_load_per_task;
                 unsigned long avg_load_per_task;
   
-               local_group = cpu_isset(this_cpu, group->cpumask);
+               local_group = cpumask_test_cpu(this_cpu,
+                                              sched_group_cpus(group));
   
                 if (local_group)
-                       balance_cpu = first_cpu(group->cpumask);
+                       balance_cpu = cpumask_first(sched_group_cpus(group));
   
                 /* Tally up the load of all CPUs in the group */
                 sum_weighted_load = sum_nr_running = avg_load = 0;
@@@ -3181,13 -3140,8 +3185,8 @@@
                 max_cpu_load = 0;
                 min_cpu_load = ~0UL;
   
-               for_each_cpu_mask_nr(i, group->cpumask) {
-                       struct rq *rq;
- 
-                       if (!cpu_isset(i, *cpus))
-                               continue;
- 
-                       rq = cpu_rq(i);
+               for_each_cpu_and(i, sched_group_cpus(group), cpus) {
+                       struct rq *rq = cpu_rq(i);
   
                         if (*sd_idle && rq->nr_running)
                                 *sd_idle = 0;
@@@ -3298,8 -3252,8 +3297,8 @@@
                  */
                 if ((sum_nr_running < min_nr_running) ||
                     (sum_nr_running == min_nr_running &&
-                    first_cpu(group->cpumask) <
-                    first_cpu(group_min->cpumask))) {
+                    cpumask_first(sched_group_cpus(group)) >
+                    cpumask_first(sched_group_cpus(group_min)))) {
                         group_min = group;
                         min_nr_running = sum_nr_running;
                         min_load_per_task = sum_weighted_load /
@@@ -3314,8 -3268,8 +3313,8 @@@
                 if (sum_nr_running <= group_capacity - 1) {
                         if (sum_nr_running > leader_nr_running ||
                             (sum_nr_running == leader_nr_running &&
-                            first_cpu(group->cpumask) >
-                             first_cpu(group_leader->cpumask))) {
+                            cpumask_first(sched_group_cpus(group)) <
+                            cpumask_first(sched_group_cpus(group_leader)))) {
                                 group_leader = group;
                                 leader_nr_running = sum_nr_running;
                         }
@@@ -3441,6 -3395,10 +3440,10 @@@ out_balanced
   
         if (this == group_leader && group_leader != group_min) {
                 *imbalance = min_load_per_task;
+               if (sched_mc_power_savings >= POWERSAVINGS_BALANCE_WAKEUP) {
+                       cpu_rq(this_cpu)->rd->sched_mc_preferred_wakeup_cpu =
+                               cpumask_first(sched_group_cpus(group_leader));
+               }
                 return group_min;
         }
   #endif
@@@ -3454,16 -3412,16 +3457,16 @@@ ret
    */
   static struct rq *
   find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle,
-                  unsigned long imbalance, const cpumask_t *cpus)
+                  unsigned long imbalance, const struct cpumask *cpus)
   {
         struct rq *busiest = NULL, *rq;
         unsigned long max_load = 0;
         int i;
   
-       for_each_cpu_mask_nr(i, group->cpumask) {
+       for_each_cpu(i, sched_group_cpus(group)) {
                 unsigned long wl;
   
-               if (!cpu_isset(i, *cpus))
+               if (!cpumask_test_cpu(i, cpus))
                         continue;
   
                 rq = cpu_rq(i);
@@@ -3493,7 -3451,7 +3496,7 @@@
    */
   static int load_balance(int this_cpu, struct rq *this_rq,
                         struct sched_domain *sd, enum cpu_idle_type idle,
-                       int *balance, cpumask_t *cpus)
+                       int *balance, struct cpumask *cpus)
   {
         int ld_moved, all_pinned = 0, active_balance = 0, sd_idle = 0;
         struct sched_group *group;
@@@ -3501,7 -3459,7 +3504,7 @@@
         struct rq *busiest;
         unsigned long flags;
   
-       cpus_setall(*cpus);
+       cpumask_setall(cpus);
   
         /*
          * When power savings policy is enabled for the parent domain, idle
@@@ -3561,8 -3519,8 +3564,8 @@@ redo
   
                 /* All tasks on this runqueue were pinned by CPU affinity */
                 if (unlikely(all_pinned)) {
-                       cpu_clear(cpu_of(busiest), *cpus);
-                       if (!cpus_empty(*cpus))
+                       cpumask_clear_cpu(cpu_of(busiest), cpus);
+                       if (!cpumask_empty(cpus))
                                 goto redo;
                         goto out_balanced;
                 }
@@@ -3579,7 -3537,8 +3582,8 @@@
                         /* don't kick the migration_thread, if the curr
                          * task on busiest cpu can't be moved to this_cpu
                          */
-                       if (!cpu_isset(this_cpu, busiest->curr->cpus_allowed)) {
+                       if (!cpumask_test_cpu(this_cpu,
+                                             &busiest->curr->cpus_allowed)) {
                                 spin_unlock_irqrestore(&busiest->lock, flags);
                                 all_pinned = 1;
                                 goto out_one_pinned;
@@@ -3654,7 -3613,7 +3658,7 @@@ out
    */
   static int
   load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd,
-                       cpumask_t *cpus)
+                       struct cpumask *cpus)
   {
         struct sched_group *group;
         struct rq *busiest = NULL;
@@@ -3663,7 -3622,7 +3667,7 @@@
         int sd_idle = 0;
         int all_pinned = 0;
   
-       cpus_setall(*cpus);
+       cpumask_setall(cpus);
   
         /*
          * When power savings policy is enabled for the parent domain, idle
@@@ -3707,17 -3666,76 +3711,76 @@@ redo
                 double_unlock_balance(this_rq, busiest);
   
                 if (unlikely(all_pinned)) {
-                       cpu_clear(cpu_of(busiest), *cpus);
-                       if (!cpus_empty(*cpus))
+                       cpumask_clear_cpu(cpu_of(busiest), cpus);
+                       if (!cpumask_empty(cpus))
                                 goto redo;
                 }
         }
   
         if (!ld_moved) {
+               int active_balance = 0;
+ 
                 schedstat_inc(sd, lb_failed[CPU_NEWLY_IDLE]);
                 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
                     !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
                         return -1;
+ 
+               if (sched_mc_power_savings < POWERSAVINGS_BALANCE_WAKEUP)
+                       return -1;
+ 
+               if (sd->nr_balance_failed++ < 2)
+                       return -1;
+ 
+               /*
+                * The only task running on a non-idle cpu can be moved to this
+                * cpu in an attempt to completely free up the other CPU
+                * package. The same method used to move tasks in load_balance()
+                * has been extended to load_balance_newidle() to speed up
+                * consolidation at sched_mc=POWERSAVINGS_BALANCE_WAKEUP (2).
+                *
+                * The package power saving logic comes from
+                * find_busiest_group().  If there is no imbalance, then
+                * f_b_g() will return NULL.  However when sched_mc={1,2} then
+                * f_b_g() will select a group from which a running task may be
+                * pulled to this cpu in order to make the other package idle.
+                * If there is no opportunity to make a package idle and if
+                * there is no imbalance, then f_b_g() will return NULL and no
+                * action will be taken in load_balance_newidle().
+                *
+                * Under normal task pull operation due to imbalance, there
+                * will be more than one task in the source run queue and
+                * move_tasks() will succeed.  ld_moved will be true and this
+                * active balance code will not be triggered.
+                */
+ 
+               /* Lock busiest in correct order while this_rq is held */
+               double_lock_balance(this_rq, busiest);
+ 
+               /*
+                * don't kick the migration_thread, if the curr
+                * task on busiest cpu can't be moved to this_cpu
+                */
+               if (!cpumask_test_cpu(this_cpu, &busiest->curr->cpus_allowed)) {
+                       double_unlock_balance(this_rq, busiest);
+                       all_pinned = 1;
+                       return ld_moved;
+               }
+ 
+               if (!busiest->active_balance) {
+                       busiest->active_balance = 1;
+                       busiest->push_cpu = this_cpu;
+                       active_balance = 1;
+               }
+ 
+               double_unlock_balance(this_rq, busiest);
+               /*
+                * Should not call ttwu while holding a rq->lock
+                */
+               spin_unlock(&this_rq->lock);
+               if (active_balance)
+                       wake_up_process(busiest->migration_thread);
+               spin_lock(&this_rq->lock);
+ 
         } else
                 sd->nr_balance_failed = 0;
   
@@@ -3743,7 -3761,10 +3806,10 @@@ static void idle_balance(int this_cpu, 
         struct sched_domain *sd;
         int pulled_task = 0;
         unsigned long next_balance = jiffies + HZ;
-       cpumask_t tmpmask;
+       cpumask_var_t tmpmask;
+ 
+       if (!alloc_cpumask_var(&tmpmask, GFP_ATOMIC))
+               return;
   
         for_each_domain(this_cpu, sd) {
                 unsigned long interval;
@@@ -3754,7 -3775,7 +3820,7 @@@
                 if (sd->flags & SD_BALANCE_NEWIDLE)
                         /* If we've pulled tasks over stop searching: */
                         pulled_task = load_balance_newidle(this_cpu, this_rq,
-                                                          sd, &tmpmask);
+                                                          sd, tmpmask);
   
                 interval = msecs_to_jiffies(sd->balance_interval);
                 if (time_after(next_balance, sd->last_balance + interval))
@@@ -3769,6 -3790,7 +3835,7 @@@
                  */
                 this_rq->next_balance = next_balance;
         }
+       free_cpumask_var(tmpmask);
   }
   
   /*
@@@ -3806,7 -3828,7 +3873,7 @@@ static void active_load_balance(struct 
         /* Search for an sd spanning us and the target CPU. */
         for_each_domain(target_cpu, sd) {
                 if ((sd->flags & SD_LOAD_BALANCE) &&
-                   cpu_isset(busiest_cpu, sd->span))
+                   cpumask_test_cpu(busiest_cpu, sched_domain_span(sd)))
                                 break;
         }
   
@@@ -3825,10 -3847,9 +3892,9 @@@
   #ifdef CONFIG_NO_HZ
   static struct {
         atomic_t load_balancer;
-       cpumask_t cpu_mask;
+       cpumask_var_t cpu_mask;
   } nohz ____cacheline_aligned = {
         .load_balancer = ATOMIC_INIT(-1),
-       .cpu_mask = CPU_MASK_NONE,
   };
   
   /*
@@@ -3856,7 -3877,7 +3922,7 @@@ int select_nohz_load_balancer(int stop_
         int cpu = smp_processor_id();
   
         if (stop_tick) {
-               cpu_set(cpu, nohz.cpu_mask);
+               cpumask_set_cpu(cpu, nohz.cpu_mask);
                 cpu_rq(cpu)->in_nohz_recently = 1;
   
                 /*
@@@ -3870,7 -3891,7 +3936,7 @@@
                 }
   
                 /* time for ilb owner also to sleep */
-               if (cpus_weight(nohz.cpu_mask) == num_online_cpus()) {
+               if (cpumask_weight(nohz.cpu_mask) == num_online_cpus()) {
                         if (atomic_read(&nohz.load_balancer) == cpu)
                                 atomic_set(&nohz.load_balancer, -1);
                         return 0;
@@@ -3883,10 -3904,10 +3949,10 @@@
                 } else if (atomic_read(&nohz.load_balancer) == cpu)
                         return 1;
         } else {
-               if (!cpu_isset(cpu, nohz.cpu_mask))
+               if (!cpumask_test_cpu(cpu, nohz.cpu_mask))
                         return 0;
   
-               cpu_clear(cpu, nohz.cpu_mask);
+               cpumask_clear_cpu(cpu, nohz.cpu_mask);
   
                 if (atomic_read(&nohz.load_balancer) == cpu)
                         if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu)
@@@ -3914,7 -3935,11 +3980,11 @@@ static void rebalance_domains(int cpu, 
         unsigned long next_balance = jiffies + 60*HZ;
         int update_next_balance = 0;
         int need_serialize;
-       cpumask_t tmp;
+       cpumask_var_t tmp;
+ 
+       /* Fails alloc?  Rebalancing probably not a priority right now. */
+       if (!alloc_cpumask_var(&tmp, GFP_ATOMIC))
+               return;
   
         for_each_domain(cpu, sd) {
                 if (!(sd->flags & SD_LOAD_BALANCE))
@@@ -3939,7 -3964,7 +4009,7 @@@
                 }
   
                 if (time_after_eq(jiffies, sd->last_balance + interval)) {
-                       if (load_balance(cpu, rq, sd, idle, &balance, &tmp)) {
+                       if (load_balance(cpu, rq, sd, idle, &balance, tmp)) {
                                 /*
                                  * We've pulled tasks over so either we're no
                                  * longer idle, or one of our SMT siblings is
@@@ -3973,6 -3998,8 +4043,8 @@@ out
          */
         if (likely(update_next_balance))
                 rq->next_balance = next_balance;
+ 
+       free_cpumask_var(tmp);
   }
   
   /*
@@@ -3997,12 -4024,13 +4069,13 @@@ static void run_rebalance_domains(struc
          */
         if (this_rq->idle_at_tick &&
             atomic_read(&nohz.load_balancer) == this_cpu) {
-               cpumask_t cpus = nohz.cpu_mask;
                 struct rq *rq;
                 int balance_cpu;
   
-               cpu_clear(this_cpu, cpus);
-               for_each_cpu_mask_nr(balance_cpu, cpus) {
+               for_each_cpu(balance_cpu, nohz.cpu_mask) {
+                       if (balance_cpu == this_cpu)
+                               continue;
+ 
                         /*
                          * If this cpu gets work to do, stop the load balancing
                          * work being done for other cpus. Next load
@@@ -4040,7 -4068,7 +4113,7 @@@ static inline void trigger_load_balance
                 rq->in_nohz_recently = 0;
   
                 if (atomic_read(&nohz.load_balancer) == cpu) {
-                       cpu_clear(cpu, nohz.cpu_mask);
+                       cpumask_clear_cpu(cpu, nohz.cpu_mask);
                         atomic_set(&nohz.load_balancer, -1);
                 }
   
@@@ -4053,7 -4081,7 +4126,7 @@@
                          * TBD: Traverse the sched domains and nominate
                          * the nearest cpu in the nohz.cpu_mask.
                          */
-                       int ilb = first_cpu(nohz.cpu_mask);
+                       int ilb = cpumask_first(nohz.cpu_mask);
   
                         if (ilb < nr_cpu_ids)
                                 resched_cpu(ilb);
@@@ -4065,7 -4093,7 +4138,7 @@@
          * cpus with ticks stopped, is it time for that to stop?
          */
         if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) == cpu &&
-           cpus_weight(nohz.cpu_mask) == num_online_cpus()) {
+           cpumask_weight(nohz.cpu_mask) == num_online_cpus()) {
                 resched_cpu(cpu);
                 return;
         }
@@@ -4075,7 -4103,7 +4148,7 @@@
          * someone else, then no need raise the SCHED_SOFTIRQ
          */
         if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) != cpu &&
-           cpu_isset(cpu, nohz.cpu_mask))
+           cpumask_test_cpu(cpu, nohz.cpu_mask))
                 return;
   #endif
         if (time_after_eq(jiffies, rq->next_balance))
@@@ -4097,29 -4125,6 +4170,29 @@@ DEFINE_PER_CPU(struct kernel_stat, ksta
   
   EXPORT_PER_CPU_SYMBOL(kstat);
   
+ +/*
+ + * Return any ns on the sched_clock that have not yet been banked in
+ + * @p in case that task is currently running.
+ + */
+ +unsigned long long __task_delta_exec(struct task_struct *p, int update)
+ +{
+ +      s64 delta_exec;
+ +      struct rq *rq;
+ +
+ +      rq = task_rq(p);
+ +      WARN_ON_ONCE(!runqueue_is_locked());
+ +      WARN_ON_ONCE(!task_current(rq, p));
+ +
+ +      if (update)
+ +              update_rq_clock(rq);
+ +
+ +      delta_exec = rq->clock - p->se.exec_start;
+ +
+ +      WARN_ON_ONCE(delta_exec < 0);
+ +
+ +      return delta_exec;
+ +}
+ +
   /*
    * Return any ns on the sched_clock that have not yet been banked in
    * @p in case that task is currently running.
@@@ -4150,13 -4155,17 +4223,17 @@@ unsigned long long task_delta_exec(stru
    * Account user cpu time to a process.
    * @p: the process that the cpu time gets accounted to
    * @cputime: the cpu time spent in user space since the last update
+  * @cputime_scaled: cputime scaled by cpu frequency
    */
- void account_user_time(struct task_struct *p, cputime_t cputime)
+ void account_user_time(struct task_struct *p, cputime_t cputime,
+                      cputime_t cputime_scaled)
   {
         struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
         cputime64_t tmp;
   
+       /* Add user time to process. */
         p->utime = cputime_add(p->utime, cputime);
+       p->utimescaled = cputime_add(p->utimescaled, cputime_scaled);
         account_group_user_time(p, cputime);
   
         /* Add user time to cpustat. */
@@@ -4173,51 -4182,48 +4250,48 @@@
    * Account guest cpu time to a process.
    * @p: the process that the cpu time gets accounted to
    * @cputime: the cpu time spent in virtual machine since the last update
+  * @cputime_scaled: cputime scaled by cpu frequency
    */
- static void account_guest_time(struct task_struct *p, cputime_t cputime)
+ static void account_guest_time(struct task_struct *p, cputime_t cputime,
+                              cputime_t cputime_scaled)
   {
         cputime64_t tmp;
         struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
   
         tmp = cputime_to_cputime64(cputime);
   
+       /* Add guest time to process. */
         p->utime = cputime_add(p->utime, cputime);
+       p->utimescaled = cputime_add(p->utimescaled, cputime_scaled);
         account_group_user_time(p, cputime);
         p->gtime = cputime_add(p->gtime, cputime);
   
+       /* Add guest time to cpustat. */
         cpustat->user = cputime64_add(cpustat->user, tmp);
         cpustat->guest = cputime64_add(cpustat->guest, tmp);
   }
   
- /*
-  * Account scaled user cpu time to a process.
-  * @p: the process that the cpu time gets accounted to
-  * @cputime: the cpu time spent in user space since the last update
-  */
- void account_user_time_scaled(struct task_struct *p, cputime_t cputime)
- {
-       p->utimescaled = cputime_add(p->utimescaled, cputime);
- }
- 
   /*
    * Account system cpu time to a process.
    * @p: the process that the cpu time gets accounted to
    * @hardirq_offset: the offset to subtract from hardirq_count()
    * @cputime: the cpu time spent in kernel space since the last update
+  * @cputime_scaled: cputime scaled by cpu frequency
    */
   void account_system_time(struct task_struct *p, int hardirq_offset,
-                        cputime_t cputime)
+                        cputime_t cputime, cputime_t cputime_scaled)
   {
         struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
-       struct rq *rq = this_rq();
         cputime64_t tmp;
   
         if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0)) {
-               account_guest_time(p, cputime);
+               account_guest_time(p, cputime, cputime_scaled);
                 return;
         }
   
+       /* Add system time to process. */
         p->stime = cputime_add(p->stime, cputime);
+       p->stimescaled = cputime_add(p->stimescaled, cputime_scaled);
         account_group_system_time(p, cputime);
   
         /* Add system time to cpustat. */
@@@ -4226,49 -4232,84 +4300,84 @@@
                 cpustat->irq = cputime64_add(cpustat->irq, tmp);
         else if (softirq_count())
                 cpustat->softirq = cputime64_add(cpustat->softirq, tmp);
-       else if (p != rq->idle)
-               cpustat->system = cputime64_add(cpustat->system, tmp);
-       else if (atomic_read(&rq->nr_iowait) > 0)
-               cpustat->iowait = cputime64_add(cpustat->iowait, tmp);
         else
-               cpustat->idle = cputime64_add(cpustat->idle, tmp);
+               cpustat->system = cputime64_add(cpustat->system, tmp);
+ 
         /* Account for system time used */
         acct_update_integrals(p);
   }
   
   /*
-  * Account scaled system cpu time to a process.
-  * @p: the process that the cpu time gets accounted to
-  * @hardirq_offset: the offset to subtract from hardirq_count()
-  * @cputime: the cpu time spent in kernel space since the last update
+  * Account for involuntary wait time.
+  * @cputime: the cpu time spent in involuntary wait
    */
- void account_system_time_scaled(struct task_struct *p, cputime_t cputime)
+ void account_steal_time(cputime_t cputime)
   {
-       p->stimescaled = cputime_add(p->stimescaled, cputime);
+       struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
+       cputime64_t cputime64 = cputime_to_cputime64(cputime);
+ 
+       cpustat->steal = cputime64_add(cpustat->steal, cputime64);
   }
   
   /*
-  * Account for involuntary wait time.
-  * @p: the process from which the cpu time has been stolen
-  * @steal: the cpu time spent in involuntary wait
+  * Account for idle time.
+  * @cputime: the cpu time spent in idle wait
    */
- void account_steal_time(struct task_struct *p, cputime_t steal)
+ void account_idle_time(cputime_t cputime)
   {
         struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
-       cputime64_t tmp = cputime_to_cputime64(steal);
+       cputime64_t cputime64 = cputime_to_cputime64(cputime);
         struct rq *rq = this_rq();
   
-       if (p == rq->idle) {
-               p->stime = cputime_add(p->stime, steal);
-               account_group_system_time(p, steal);
-               if (atomic_read(&rq->nr_iowait) > 0)
-                       cpustat->iowait = cputime64_add(cpustat->iowait, tmp);
-               else
-                       cpustat->idle = cputime64_add(cpustat->idle, tmp);
-       } else
-               cpustat->steal = cputime64_add(cpustat->steal, tmp);
+       if (atomic_read(&rq->nr_iowait) > 0)
+               cpustat->iowait = cputime64_add(cpustat->iowait, cputime64);
+       else
+               cpustat->idle = cputime64_add(cpustat->idle, cputime64);
   }
   
+ #ifndef CONFIG_VIRT_CPU_ACCOUNTING
+ 
+ /*
+  * Account a single tick of cpu time.
+  * @p: the process that the cpu time gets accounted to
+  * @user_tick: indicates if the tick is a user or a system tick
+  */
+ void account_process_tick(struct task_struct *p, int user_tick)
+ {
+       cputime_t one_jiffy = jiffies_to_cputime(1);
+       cputime_t one_jiffy_scaled = cputime_to_scaled(one_jiffy);
+       struct rq *rq = this_rq();
+ 
+       if (user_tick)
+               account_user_time(p, one_jiffy, one_jiffy_scaled);
+       else if (p != rq->idle)
+               account_system_time(p, HARDIRQ_OFFSET, one_jiffy,
+                                   one_jiffy_scaled);
+       else
+               account_idle_time(one_jiffy);
+ }
+ 
+ /*
+  * Account multiple ticks of steal time.
+  * Charged to kstat_this_cpu's steal counter; no task is involved.
+  * @ticks: number of stolen ticks
+  */
+ void account_steal_ticks(unsigned long ticks)
+ {
+       account_steal_time(jiffies_to_cputime(ticks));
+ }
+ 
+ /*
+  * Account multiple ticks of idle time.
+  * @ticks: number of idle ticks
+  */
+ void account_idle_ticks(unsigned long ticks)
+ {
+       account_idle_time(jiffies_to_cputime(ticks));
+ }
+ 
+ #endif
+ 
   /*
    * Use precise platform statistics if available:
    */
@@@ -4347,7 -4388,6 +4456,7 @@@ void scheduler_tick(void
         update_rq_clock(rq);
         update_cpu_load(rq);
         curr->sched_class->task_tick(rq, curr, 0);
+ +      perf_counter_task_tick(curr, cpu);
         spin_unlock(&rq->lock);
   
   #ifdef CONFIG_SMP
@@@ -4397,7 -4437,7 +4506,7 @@@ void __kprobes sub_preempt_count(int va
         /*
          * Underflow?
          */
-       if (DEBUG_LOCKS_WARN_ON(val > preempt_count()))
+        if (DEBUG_LOCKS_WARN_ON(val > preempt_count() - (!!kernel_locked())))
                 return;
         /*
          * Is the spinlock portion underflowing?
@@@ -4543,7 -4583,6 +4652,7 @@@ need_resched_nonpreemptible
   
         if (likely(prev != next)) {
                 sched_info_switch(prev, next);
+ +              perf_counter_task_sched_out(prev, cpu);
   
                 rq->nr_switches++;
                 rq->curr = next;
@@@ -5474,10 -5513,9 +5583,9 @@@ out_unlock
         return retval;
   }
   
- long sched_setaffinity(pid_t pid, const cpumask_t *in_mask)
+ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
   {
-       cpumask_t cpus_allowed;
-       cpumask_t new_mask = *in_mask;
+       cpumask_var_t cpus_allowed, new_mask;
         struct task_struct *p;
         int retval;
   
@@@ -5499,6 -5537,14 +5607,14 @@@
         get_task_struct(p);
         read_unlock(&tasklist_lock);
   
+       if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL)) {
+               retval = -ENOMEM;
+               goto out_put_task;
+       }
+       if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) {
+               retval = -ENOMEM;
+               goto out_free_cpus_allowed;
+       }
         retval = -EPERM;
         if (!check_same_owner(p) && !capable(CAP_SYS_NICE))
                 goto out_unlock;
@@@ -5507,37 -5553,41 +5623,41 @@@
         if (retval)
                 goto out_unlock;
   
-       cpuset_cpus_allowed(p, &cpus_allowed);
-       cpus_and(new_mask, new_mask, cpus_allowed);
+       cpuset_cpus_allowed(p, cpus_allowed);
+       cpumask_and(new_mask, in_mask, cpus_allowed);
    again:
-       retval = set_cpus_allowed_ptr(p, &new_mask);
+       retval = set_cpus_allowed_ptr(p, new_mask);
   
         if (!retval) {
-               cpuset_cpus_allowed(p, &cpus_allowed);
-               if (!cpus_subset(new_mask, cpus_allowed)) {
+               cpuset_cpus_allowed(p, cpus_allowed);
+               if (!cpumask_subset(new_mask, cpus_allowed)) {
                         /*
                          * We must have raced with a concurrent cpuset
                          * update. Just reset the cpus_allowed to the
                          * cpuset's cpus_allowed
                          */
-                       new_mask = cpus_allowed;
+                       cpumask_copy(new_mask, cpus_allowed);
                         goto again;
                 }
         }
   out_unlock:
+       free_cpumask_var(new_mask);
+ out_free_cpus_allowed:
+       free_cpumask_var(cpus_allowed);
+ out_put_task:
         put_task_struct(p);
         put_online_cpus();
         return retval;
   }
   
   static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len,
-                            cpumask_t *new_mask)
+                            struct cpumask *new_mask)
   {
-       if (len < sizeof(cpumask_t)) {
-               memset(new_mask, 0, sizeof(cpumask_t));
-       } else if (len > sizeof(cpumask_t)) {
-               len = sizeof(cpumask_t);
-       }
+       if (len < cpumask_size())
+               cpumask_clear(new_mask);
+       else if (len > cpumask_size())
+               len = cpumask_size();
+ 
         return copy_from_user(new_mask, user_mask_ptr, len) ? -EFAULT : 0;
   }
   
@@@ -5550,17 -5600,20 +5670,20 @@@
   asmlinkage long sys_sched_setaffinity(pid_t pid, unsigned int len,
                                       unsigned long __user *user_mask_ptr)
   {
-       cpumask_t new_mask;
+       cpumask_var_t new_mask;
         int retval;
   
-       retval = get_user_cpu_mask(user_mask_ptr, len, &new_mask);
-       if (retval)
-               return retval;
+       if (!alloc_cpumask_var(&new_mask, GFP_KERNEL))
+               return -ENOMEM;
   
-       return sched_setaffinity(pid, &new_mask);
+       retval = get_user_cpu_mask(user_mask_ptr, len, new_mask);
+       if (retval == 0)
+               retval = sched_setaffinity(pid, new_mask);
+       free_cpumask_var(new_mask);
+       return retval;
   }
   
- long sched_getaffinity(pid_t pid, cpumask_t *mask)
+ long sched_getaffinity(pid_t pid, struct cpumask *mask)
   {
         struct task_struct *p;
         int retval;
@@@ -5577,7 -5630,7 +5700,7 @@@
         if (retval)
                 goto out_unlock;
   
-       cpus_and(*mask, p->cpus_allowed, cpu_online_map);
+       cpumask_and(mask, &p->cpus_allowed, cpu_online_mask);
   
   out_unlock:
         read_unlock(&tasklist_lock);
@@@ -5596,19 -5649,24 +5719,24 @@@ asmlinkage long sys_sched_getaffinity(p
                                       unsigned long __user *user_mask_ptr)
   {
         int ret;
-       cpumask_t mask;
+       cpumask_var_t mask;
   
-       if (len < sizeof(cpumask_t))
+       if (len < cpumask_size())
                 return -EINVAL;
   
-       ret = sched_getaffinity(pid, &mask);
-       if (ret < 0)
-               return ret;
+       if (!alloc_cpumask_var(&mask, GFP_KERNEL))
+               return -ENOMEM;
   
-       if (copy_to_user(user_mask_ptr, &mask, sizeof(cpumask_t)))
-               return -EFAULT;
+       ret = sched_getaffinity(pid, mask);
+       if (ret == 0) {
+               if (copy_to_user(user_mask_ptr, mask, cpumask_size()))
+                       ret = -EFAULT;
+               else
+                       ret = cpumask_size();
+       }
+       free_cpumask_var(mask);
   
-       return sizeof(cpumask_t);
+       return ret;
   }
   
   /**
@@@ -5950,7 -6008,7 +6078,7 @@@ void __cpuinit init_idle(struct task_st
         idle->se.exec_start = sched_clock();
   
         idle->prio = idle->normal_prio = MAX_PRIO;
-       idle->cpus_allowed = cpumask_of_cpu(cpu);
+       cpumask_copy(&idle->cpus_allowed, cpumask_of(cpu));
         __set_task_cpu(idle, cpu);
   
         rq->curr = rq->idle = idle;
@@@ -5977,9 -6035,9 +6105,9 @@@
    * indicates which cpus entered this state. This is used
    * in the rcu update to wait only for active cpus. For system
    * which do not switch off the HZ timer nohz_cpu_mask should
-  * always be CPU_MASK_NONE.
+  * always be CPU_BITS_NONE.
    */
- cpumask_t nohz_cpu_mask = CPU_MASK_NONE;
+ cpumask_var_t nohz_cpu_mask;
   
   /*
    * Increase the granularity value when there are more CPUs,
@@@ -6034,7 -6092,7 +6162,7 @@@ static inline void sched_init_granulari
    * task must not exit() & deallocate itself prematurely. The
    * call is not atomic; no spinlocks may be held.
    */
- int set_cpus_allowed_ptr(struct task_struct *p, const cpumask_t *new_mask)
+ int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
   {
         struct migration_req req;
         unsigned long flags;
@@@ -6042,13 -6100,13 +6170,13 @@@
         int ret = 0;
   
         rq = task_rq_lock(p, &flags);
-       if (!cpus_intersects(*new_mask, cpu_online_map)) {
+       if (!cpumask_intersects(new_mask, cpu_online_mask)) {
                 ret = -EINVAL;
                 goto out;
         }
   
         if (unlikely((p->flags & PF_THREAD_BOUND) && p != current &&
-                    !cpus_equal(p->cpus_allowed, *new_mask))) {
+                    !cpumask_equal(&p->cpus_allowed, new_mask))) {
                 ret = -EINVAL;
                 goto out;
         }
@@@ -6056,15 -6114,15 +6184,15 @@@
         if (p->sched_class->set_cpus_allowed)
                 p->sched_class->set_cpus_allowed(p, new_mask);
         else {
-               p->cpus_allowed = *new_mask;
-               p->rt.nr_cpus_allowed = cpus_weight(*new_mask);
+               cpumask_copy(&p->cpus_allowed, new_mask);
+               p->rt.nr_cpus_allowed = cpumask_weight(new_mask);
         }
   
         /* Can the task run on the task's current CPU? If so, we're done */
-       if (cpu_isset(task_cpu(p), *new_mask))
+       if (cpumask_test_cpu(task_cpu(p), new_mask))
                 goto out;
   
-       if (migrate_task(p, any_online_cpu(*new_mask), &req)) {
+       if (migrate_task(p, cpumask_any_and(cpu_online_mask, new_mask), &req)) {
                 /* Need help from migration thread: drop lock and wait. */
                 task_rq_unlock(rq, &flags);
                 wake_up_process(rq->migration_thread);
@@@ -6106,7 -6164,7 +6234,7 @@@ static int __migrate_task(struct task_s
         if (task_cpu(p) != src_cpu)
                 goto done;
         /* Affinity changed (again). */
-       if (!cpu_isset(dest_cpu, p->cpus_allowed))
+       if (!cpumask_test_cpu(dest_cpu, &p->cpus_allowed))
                 goto fail;
   
         on_rq = p->se.on_rq;
@@@ -6203,50 -6261,41 +6331,41 @@@ static int __migrate_task_irq(struct ta
    */
   static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p)
   {
-       unsigned long flags;
-       cpumask_t mask;
-       struct rq *rq;
         int dest_cpu;
+       const struct cpumask *nodemask = cpumask_of_node(cpu_to_node(dead_cpu));
   
-       do {
-               /* On same node? */
-               mask = node_to_cpumask(cpu_to_node(dead_cpu));
-               cpus_and(mask, mask, p->cpus_allowed);
-               dest_cpu = any_online_cpu(mask);
+       /*
+        * Retry point: the whole search is redone whenever the migration
+        * attempt at "move" below reports failure.
+        */
+ again:
+       /* Look for allowed, online CPU in same node. */
+       for_each_cpu_and(dest_cpu, nodemask, cpu_online_mask)
+               if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed))
+                       goto move;
   
-               /* On any allowed CPU? */
-               if (dest_cpu >= nr_cpu_ids)
-                       dest_cpu = any_online_cpu(p->cpus_allowed);
+       /* Any allowed, online CPU? */
+       dest_cpu = cpumask_any_and(&p->cpus_allowed, cpu_online_mask);
+       if (dest_cpu < nr_cpu_ids)
+               goto move;
   
-               /* No more Mr. Nice Guy. */
-               if (dest_cpu >= nr_cpu_ids) {
-                       cpumask_t cpus_allowed;
- 
-                       cpuset_cpus_allowed_locked(p, &cpus_allowed);
-                       /*
-                        * Try to stay on the same cpuset, where the
-                        * current cpuset may be a subset of all cpus.
-                        * The cpuset_cpus_allowed_locked() variant of
-                        * cpuset_cpus_allowed() will not block. It must be
-                        * called within calls to cpuset_lock/cpuset_unlock.
-                        */
-                       rq = task_rq_lock(p, &flags);
-                       p->cpus_allowed = cpus_allowed;
-                       dest_cpu = any_online_cpu(p->cpus_allowed);
-                       task_rq_unlock(rq, &flags);
+       /* No more Mr. Nice Guy. */
+       /*
+        * NOTE(review): both searches above jump to "move" on success, so
+        * dest_cpu >= nr_cpu_ids always holds when this test is reached.
+        */
+       if (dest_cpu >= nr_cpu_ids) {
+               /*
+                * Overwrite the task's affinity with the mask supplied by
+                * cpuset_cpus_allowed_locked() and pick any online CPU from
+                * it. Per the earlier comment on this call, this variant
+                * does not block and must be called within
+                * cpuset_lock/cpuset_unlock.
+                *
+                * NOTE(review): the previous code updated p->cpus_allowed
+                * under task_rq_lock(); this version takes no rq lock here -
+                * confirm that is intended.
+                */
+               cpuset_cpus_allowed_locked(p, &p->cpus_allowed);
+               dest_cpu = cpumask_any_and(cpu_online_mask, &p->cpus_allowed);
   
-                       /*
-                        * Don't tell them about moving exiting tasks or
-                        * kernel threads (both mm NULL), since they never
-                        * leave kernel.
-                        */
-                       if (p->mm && printk_ratelimit()) {
-                               printk(KERN_INFO "process %d (%s) no "
-                                      "longer affine to cpu%d\n",
-                                       task_pid_nr(p), p->comm, dead_cpu);
-                       }
+               /*
+                * Don't tell them about moving exiting tasks or
+                * kernel threads (both mm NULL), since they never
+                * leave kernel.
+                */
+               if (p->mm && printk_ratelimit()) {
+                       printk(KERN_INFO "process %d (%s) no "
+                              "longer affine to cpu%d\n",
+                              task_pid_nr(p), p->comm, dead_cpu);
                 }
-       } while (!__migrate_task_irq(p, dead_cpu, dest_cpu));
+       }
+ 
+ move:
+       /* It can have affinity changed while we were choosing. */
+       if (unlikely(!__migrate_task_irq(p, dead_cpu, dest_cpu)))
+               goto again;
   }
   
   /*
@@@ -6258,7 -6307,7 +6377,7 @@@
    */
   static void migrate_nr_uninterruptible(struct rq *rq_src)
   {
-       struct rq *rq_dest = cpu_rq(any_online_cpu(*CPU_MASK_ALL_PTR));
+       struct rq *rq_dest = cpu_rq(cpumask_any(cpu_online_mask));
         unsigned long flags;
   
         local_irq_save(flags);
@@@ -6548,7 -6597,7 +6667,7 @@@ static void set_rq_online(struct rq *rq
         if (!rq->online) {
                 const struct sched_class *class;
   
-               cpu_set(rq->cpu, rq->rd->online);
+               cpumask_set_cpu(rq->cpu, rq->rd->online);
                 rq->online = 1;
   
                 for_each_class(class) {
@@@ -6568,7 -6617,7 +6687,7 @@@ static void set_rq_offline(struct rq *r
                                 class->rq_offline(rq);
                 }
   
-               cpu_clear(rq->cpu, rq->rd->online);
+               cpumask_clear_cpu(rq->cpu, rq->rd->online);
                 rq->online = 0;
         }
   }
@@@ -6609,7 -6658,7 +6728,7 @@@ migration_call(struct notifier_block *n
                 rq = cpu_rq(cpu);
                 spin_lock_irqsave(&rq->lock, flags);
                 if (rq->rd) {
-                       BUG_ON(!cpu_isset(cpu, rq->rd->span));
+                       BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
   
                         set_rq_online(rq);
                 }
@@@ -6623,7 -6672,7 +6742,7 @@@
                         break;
                 /* Unbind it from offline cpu so it can run. Fall thru. */
                 kthread_bind(cpu_rq(cpu)->migration_thread,
-                            any_online_cpu(cpu_online_map));
+                            cpumask_any(cpu_online_mask));
                 kthread_stop(cpu_rq(cpu)->migration_thread);
                 cpu_rq(cpu)->migration_thread = NULL;
                 break;
@@@ -6673,7 -6722,7 +6792,7 @@@
                 rq = cpu_rq(cpu);
                 spin_lock_irqsave(&rq->lock, flags);
                 if (rq->rd) {
-                       BUG_ON(!cpu_isset(cpu, rq->rd->span));
+                       BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
                         set_rq_offline(rq);
                 }
                 spin_unlock_irqrestore(&rq->lock, flags);
@@@ -6712,13 -6761,13 +6831,13 @@@ early_initcall(migration_init)
   #ifdef CONFIG_SCHED_DEBUG
   
   static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
-                                 cpumask_t *groupmask)
+                                 struct cpumask *groupmask)
   {
         struct sched_group *group = sd->groups;
         char str[256];
   
-       cpulist_scnprintf(str, sizeof(str), sd->span);
-       cpus_clear(*groupmask);
+       cpulist_scnprintf(str, sizeof(str), sched_domain_span(sd));
+       cpumask_clear(groupmask);
   
         printk(KERN_DEBUG "%*s domain %d: ", level, "", level);
   
@@@ -6732,11 -6781,11 +6851,11 @@@
   
         printk(KERN_CONT "span %s level %s\n", str, sd->name);
   
-       if (!cpu_isset(cpu, sd->span)) {
+       if (!cpumask_test_cpu(cpu, sched_domain_span(sd))) {
                 printk(KERN_ERR "ERROR: domain->span does not contain "
                                 "CPU%d\n", cpu);
         }
-       if (!cpu_isset(cpu, group->cpumask)) {
+       if (!cpumask_test_cpu(cpu, sched_group_cpus(group))) {
                 printk(KERN_ERR "ERROR: domain->groups does not contain"
                                 " CPU%d\n", cpu);
         }
@@@ -6756,31 -6805,32 +6875,32 @@@
                         break;
                 }
   
-               if (!cpus_weight(group->cpumask)) {
+               if (!cpumask_weight(sched_group_cpus(group))) {
                         printk(KERN_CONT "\n");
                         printk(KERN_ERR "ERROR: empty group\n");
                         break;
                 }
   
-               if (cpus_intersects(*groupmask, group->cpumask)) {
+               if (cpumask_intersects(groupmask, sched_group_cpus(group))) {
                         printk(KERN_CONT "\n");
                         printk(KERN_ERR "ERROR: repeated CPUs\n");
                         break;
                 }
   
-               cpus_or(*groupmask, *groupmask, group->cpumask);
+               cpumask_or(groupmask, groupmask, sched_group_cpus(group));
   
-               cpulist_scnprintf(str, sizeof(str), group->cpumask);
+               cpulist_scnprintf(str, sizeof(str), sched_group_cpus(group));
                 printk(KERN_CONT " %s", str);
   
                 group = group->next;
         } while (group != sd->groups);
         printk(KERN_CONT "\n");
   
-       if (!cpus_equal(sd->span, *groupmask))
+       if (!cpumask_equal(sched_domain_span(sd), groupmask))
                 printk(KERN_ERR "ERROR: groups don't span domain->span\n");
   
-       if (sd->parent && !cpus_subset(*groupmask, sd->parent->span))
+       if (sd->parent &&
+           !cpumask_subset(groupmask, sched_domain_span(sd->parent)))
                 printk(KERN_ERR "ERROR: parent span is not a superset "
                         "of domain->span\n");
         return 0;
@@@ -6788,7 -6838,7 +6908,7 @@@
   
   static void sched_domain_debug(struct sched_domain *sd, int cpu)
   {
-       cpumask_t *groupmask;
+       cpumask_var_t groupmask;
         int level = 0;
   
         if (!sd) {
@@@ -6798,8 -6848,7 +6918,7 @@@
   
         printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu);
   
-       groupmask = kmalloc(sizeof(cpumask_t), GFP_KERNEL);
-       if (!groupmask) {
+       if (!alloc_cpumask_var(&groupmask, GFP_KERNEL)) {
                 printk(KERN_DEBUG "Cannot load-balance (out of memory)\n");
                 return;
         }
@@@ -6812,7 -6861,7 +6931,7 @@@
                 if (!sd)
                         break;
         }
-       kfree(groupmask);
+       free_cpumask_var(groupmask);
   }
   #else /* !CONFIG_SCHED_DEBUG */
   # define sched_domain_debug(sd, cpu) do { } while (0)
@@@ -6820,7 -6869,7 +6939,7 @@@
   
   static int sd_degenerate(struct sched_domain *sd)
   {
-       if (cpus_weight(sd->span) == 1)
+       if (cpumask_weight(sched_domain_span(sd)) == 1)
                 return 1;
   
         /* Following flags need at least 2 groups */
@@@ -6851,7 -6900,7 +6970,7 @@@ sd_parent_degenerate(struct sched_domai
         if (sd_degenerate(parent))
                 return 1;
   
-       if (!cpus_equal(sd->span, parent->span))
+       if (!cpumask_equal(sched_domain_span(sd), sched_domain_span(parent)))
                 return 0;
   
         /* Does parent contain flags not in child? */
@@@ -6875,6 -6924,16 +6994,16 @@@
         return 1;
   }
   
+ /*
+  * Release a root_domain: clean up its cpupri state, free its three
+  * cpumasks (rto_mask, online, span) and finally free @rd itself.
+  */
+ static void free_rootdomain(struct root_domain *rd)
+ {
+       cpupri_cleanup(&rd->cpupri);
+ 
+       free_cpumask_var(rd->rto_mask);
+       free_cpumask_var(rd->online);
+       free_cpumask_var(rd->span);
+       kfree(rd);
+ }
+ 
   static void rq_attach_root(struct rq *rq, struct root_domain *rd)
   {
         unsigned long flags;
@@@ -6884,38 -6943,62 +7013,62 @@@
         if (rq->rd) {
                 struct root_domain *old_rd = rq->rd;
   
-               if (cpu_isset(rq->cpu, old_rd->online))
+               if (cpumask_test_cpu(rq->cpu, old_rd->online))
                         set_rq_offline(rq);
   
-               cpu_clear(rq->cpu, old_rd->span);
+               cpumask_clear_cpu(rq->cpu, old_rd->span);
   
                 if (atomic_dec_and_test(&old_rd->refcount))
-                       kfree(old_rd);
+                       free_rootdomain(old_rd);
         }
   
         atomic_inc(&rd->refcount);
         rq->rd = rd;
   
-       cpu_set(rq->cpu, rd->span);
-       if (cpu_isset(rq->cpu, cpu_online_map))
+       cpumask_set_cpu(rq->cpu, rd->span);
+       if (cpumask_test_cpu(rq->cpu, cpu_online_mask))
                 set_rq_online(rq);
   
         spin_unlock_irqrestore(&rq->lock, flags);
   }
   
+ /*
+  * Zero @rd, then set up its span/online/rto_mask cpumasks and its cpupri
+  * state.
+  *
+  * @bootmem: when true, the masks come from the bootmem allocator and
+  *           cpupri_init() is called in bootmem mode; this path always
+  *           returns 0. When false, the masks are allocated with GFP_KERNEL
+  *           and everything already allocated is freed again on failure.
+  *
+  * Returns 0 on success, -ENOMEM if a mask allocation or cpupri_init() fails.
+  */
- static void init_rootdomain(struct root_domain *rd)
+ static int __init_refok init_rootdomain(struct root_domain *rd, bool bootmem)
   {
         memset(rd, 0, sizeof(*rd));
   
-       cpus_clear(rd->span);
-       cpus_clear(rd->online);
+       if (bootmem) {
+               /*
+                * Allocate into @rd itself rather than a hard-coded global,
+                * so the masks always belong to the domain being initialised.
+                */
+               alloc_bootmem_cpumask_var(&rd->span);
+               alloc_bootmem_cpumask_var(&rd->online);
+               alloc_bootmem_cpumask_var(&rd->rto_mask);
+               cpupri_init(&rd->cpupri, true);
+               return 0;
+       }
+ 
+       if (!alloc_cpumask_var(&rd->span, GFP_KERNEL))
+               goto out;
+       if (!alloc_cpumask_var(&rd->online, GFP_KERNEL))
+               goto free_span;
+       if (!alloc_cpumask_var(&rd->rto_mask, GFP_KERNEL))
+               goto free_online;
   
-       cpupri_init(&rd->cpupri);
+       if (cpupri_init(&rd->cpupri, false) != 0)
+               goto free_rto_mask;
+       return 0;
+ 
+       /* Unwind in reverse order of allocation. */
+ free_rto_mask:
+       free_cpumask_var(rd->rto_mask);
+ free_online:
+       free_cpumask_var(rd->online);
+ free_span:
+       free_cpumask_var(rd->span);
+ out:
+       return -ENOMEM;
   }
   
   /*
    * Set up def_root_domain through the bootmem path of init_rootdomain()
    * and give it an initial reference count of 1. The return value is not
    * checked; the bootmem path of init_rootdomain() always returns 0.
    */
   static void init_defrootdomain(void)
   {
-       init_rootdomain(&def_root_domain);
+       init_rootdomain(&def_root_domain, true);
+ 
         atomic_set(&def_root_domain.refcount, 1);
   }
   
@@@ -6927,7 -7010,10 +7080,10 @@@ static struct root_domain *alloc_rootdo
         if (!rd)
                 return NULL;
   
-       init_rootdomain(rd);
+       if (init_rootdomain(rd, false) != 0) {
+               kfree(rd);
+               return NULL;
+       }
   
         return rd;
   }
@@@ -6969,19 -7055,12 +7125,12 @@@ cpu_attach_domain(struct sched_domain *
   }
   
   /* cpus with isolated domains */
- static cpumask_t cpu_isolated_map = CPU_MASK_NONE;
+ static cpumask_var_t cpu_isolated_map;
   
   /*
    * Setup the mask of cpus configured for isolated domains.
    *
    * Handler for the "isolcpus=" boot parameter: parses the CPU list in @str
    * into cpu_isolated_map. Always returns 1.
    *
    * NOTE(review): any status returned by cpulist_parse() is ignored, so a
    * malformed list is not reported from here.
    * NOTE(review): cpu_isolated_map is now a cpumask_var_t; confirm it has
    * been allocated before this handler can run.
    */
   static int __init isolated_cpu_setup(char *str)
   {
-       static int __initdata ints[NR_CPUS];
-       int i;
- 
-       str = get_options(str, ARRAY_SIZE(ints), ints);
-       cpus_clear(cpu_isolated_map);
-       for (i = 1; i <= ints[0]; i++)
-               if (ints[i] < NR_CPUS)
-                       cpu_set(ints[i], cpu_isolated_map);
+       cpulist_parse(str, cpu_isolated_map);
         return 1;
   }
   
@@@ -6990,42 -7069,43 +7139,43 @@@ __setup("isolcpus=", isolated_cpu_setup
   /*
    * init_sched_build_groups takes the cpumask we wish to span, and a pointer
    * to a function which identifies what group(along with sched group) a CPU
-  * belongs to. The return value of group_fn must be a >= 0 and < NR_CPUS
-  * (due to the fact that we keep track of groups covered with a cpumask_t).
+  * belongs to. The return value of group_fn must be >= 0 and < nr_cpu_ids
+  * (due to the fact that we keep track of groups covered with a struct cpumask).
    *
    * init_sched_build_groups will build a circular linked list of the groups
    * covered by the given span, and will set each group's ->cpumask correctly,
    * and ->cpu_power to 0.
    */
   static void
- init_sched_build_groups(const cpumask_t *span, const cpumask_t *cpu_map,
-                       int (*group_fn)(int cpu, const cpumask_t *cpu_map,
+ init_sched_build_groups(const struct cpumask *span,
+                       const struct cpumask *cpu_map,
+                       int (*group_fn)(int cpu, const struct cpumask *cpu_map,
                                         struct sched_group **sg,
-                                       cpumask_t *tmpmask),
-                       cpumask_t *covered, cpumask_t *tmpmask)
+                                       struct cpumask *tmpmask),
+                       struct cpumask *covered, struct cpumask *tmpmask)
   {
         struct sched_group *first = NULL, *last = NULL;
         int i;
   
-       cpus_clear(*covered);
+       cpumask_clear(covered);
   
-       for_each_cpu_mask_nr(i, *span) {
+       for_each_cpu(i, span) {
                 struct sched_group *sg;
                 int group = group_fn(i, cpu_map, &sg, tmpmask);
                 int j;
   
-               if (cpu_isset(i, *covered))
+               if (cpumask_test_cpu(i, covered))
                         continue;
   
-               cpus_clear(sg->cpumask);
+               cpumask_clear(sched_group_cpus(sg));
                 sg->__cpu_power = 0;
   
-               for_each_cpu_mask_nr(j, *span) {
+               for_each_cpu(j, span) {
                         if (group_fn(j, cpu_map, NULL, tmpmask) != group)
                                 continue;
   
-                       cpu_set(j, *covered);
-                       cpu_set(j, sg->cpumask);
+                       cpumask_set_cpu(j, covered);
+                       cpumask_set_cpu(j, sched_group_cpus(sg));
                 }
                 if (!first)
                         first = sg;
@@@ -7089,42 -7169,55 +7239,55 @@@ static int find_next_best_node(int node
    * should be one that prevents unnecessary balancing, but also spreads tasks
    * out optimally.
    */
- static void sched_domain_node_span(int node, cpumask_t *span)
+ static void sched_domain_node_span(int node, struct cpumask *span)
   {
         nodemask_t used_nodes;
-       node_to_cpumask_ptr(nodemask, node);
         int i;
   
-       cpus_clear(*span);
+       cpumask_clear(span);
         nodes_clear(used_nodes);
   
+       /* Start from the CPUs of @node itself and mark the node as used. */
-       cpus_or(*span, *span, *nodemask);
+       cpumask_or(span, span, cpumask_of_node(node));
         node_set(node, used_nodes);
   
+       /*
+        * OR in the CPUs of SD_NODES_PER_DOMAIN - 1 further nodes, each one
+        * chosen by find_next_best_node() (presumably it also records its
+        * pick in used_nodes so it is not returned twice - confirm).
+        */
         for (i = 1; i < SD_NODES_PER_DOMAIN; i++) {
                 int next_node = find_next_best_node(node, &used_nodes);
   
-               node_to_cpumask_ptr_next(nodemask, next_node);
-               cpus_or(*span, *span, *nodemask);
+               cpumask_or(span, span, cpumask_of_node(next_node));
         }
   }
   #endif /* CONFIG_NUMA */
   
   int sched_smt_power_savings = 0, sched_mc_power_savings = 0;
   
+ /*
+  * The cpus mask in sched_group and sched_domain hangs off the end.
+  * FIXME: use cpumask_var_t or dynamic percpu alloc to avoid wasting space
+  * for nr_cpu_ids < CONFIG_NR_CPUS.
+  *
+  * These wrappers pair the base structure with a CONFIG_NR_CPUS-bit bitmap
+  * declared as the next member, for instances that are defined statically
+  * (e.g. with DEFINE_PER_CPU). The base structure must stay the first
+  * member so that the bitmap follows it.
+  */
+ struct static_sched_group {
+       struct sched_group sg;
+       DECLARE_BITMAP(cpus, CONFIG_NR_CPUS);   /* storage for sg's cpus mask */
+ };
+ 
+ struct static_sched_domain {
+       struct sched_domain sd;
+       DECLARE_BITMAP(span, CONFIG_NR_CPUS);   /* storage for sd's span mask */
+ };
+ 
   /*
    * SMT sched-domains:
    */
   #ifdef CONFIG_SCHED_SMT
- static DEFINE_PER_CPU(struct sched_domain, cpu_domains);
- static DEFINE_PER_CPU(struct sched_group, sched_group_cpus);
+ static DEFINE_PER_CPU(struct static_sched_domain, cpu_domains);
+ static DEFINE_PER_CPU(struct static_sched_group, sched_group_cpus);
   
   /*
    * SMT level: every CPU forms its own group, so the group id is @cpu.
    * When @sg is non-NULL, *@sg is set to @cpu's per-cpu sched_group_cpus
    * entry. @cpu_map and @unused are not read.
    */
   static int
- cpu_to_cpu_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg,
-                cpumask_t *unused)
+ cpu_to_cpu_group(int cpu, const struct cpumask *cpu_map,
+                struct sched_group **sg, struct cpumask *unused)
   {
         if (sg)
-               *sg = &per_cpu(sched_group_cpus, cpu);
+               *sg = &per_cpu(sched_group_cpus, cpu).sg;
         return cpu;
   }
   #endif /* CONFIG_SCHED_SMT */
@@@ -7133,56 -7226,53 +7296,53 @@@
    * multi-core sched-domains:
    */
   #ifdef CONFIG_SCHED_MC
- static DEFINE_PER_CPU(struct sched_domain, core_domains);
- static DEFINE_PER_CPU(struct sched_group, sched_group_core);
+ static DEFINE_PER_CPU(struct static_sched_domain, core_domains);
+ static DEFINE_PER_CPU(struct static_sched_group, sched_group_core);
   #endif /* CONFIG_SCHED_MC */
   
   #if defined(CONFIG_SCHED_MC) && defined(CONFIG_SCHED_SMT)
   /*
    * Core-group lookup when both CONFIG_SCHED_MC and CONFIG_SCHED_SMT are set.
    * The group id is the first CPU of @cpu's cpu_sibling_map entry restricted
    * to @cpu_map. @mask is scratch space and is overwritten. When @sg is
    * non-NULL, *@sg is set to that group's per-cpu sched_group_core entry.
    * Returns the group id.
    */
   static int
- cpu_to_core_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg,
-                 cpumask_t *mask)
+ cpu_to_core_group(int cpu, const struct cpumask *cpu_map,
+                 struct sched_group **sg, struct cpumask *mask)
   {
         int group;
   
-       *mask = per_cpu(cpu_sibling_map, cpu);
-       cpus_and(*mask, *mask, *cpu_map);
-       group = first_cpu(*mask);
+       cpumask_and(mask, &per_cpu(cpu_sibling_map, cpu), cpu_map);
+       group = cpumask_first(mask);
         if (sg)
-               *sg = &per_cpu(sched_group_core, group);
+               *sg = &per_cpu(sched_group_core, group).sg;
         return group;
   }
   #elif defined(CONFIG_SCHED_MC)
   /*
    * Core-group lookup when only CONFIG_SCHED_MC is set: each CPU is its own
    * group, so the group id is @cpu. When @sg is non-NULL, *@sg is set to
    * @cpu's per-cpu sched_group_core entry. @cpu_map and @unused are not read.
    */
   static int
- cpu_to_core_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg,
-                 cpumask_t *unused)
+ cpu_to_core_group(int cpu, const struct cpumask *cpu_map,
+                 struct sched_group **sg, struct cpumask *unused)
   {
         if (sg)
-               *sg = &per_cpu(sched_group_core, cpu);
+               *sg = &per_cpu(sched_group_core, cpu).sg;
         return cpu;
   }
   #endif
   
- static DEFINE_PER_CPU(struct sched_domain, phys_domains);
- static DEFINE_PER_CPU(struct sched_group, sched_group_phys);
+ static DEFINE_PER_CPU(struct static_sched_domain, phys_domains);
+ static DEFINE_PER_CPU(struct static_sched_group, sched_group_phys);
   
   /*
    * Map @cpu to its physical-level group and return the group id:
    *  - CONFIG_SCHED_MC:  first CPU of cpu_coregroup_mask(@cpu) & @cpu_map;
    *  - else CONFIG_SCHED_SMT: first CPU of @cpu's cpu_sibling_map entry
    *    & @cpu_map;
    *  - otherwise: @cpu itself.
    * @mask is scratch space, overwritten in the first two cases. When @sg is
    * non-NULL, *@sg is set to that group's per-cpu sched_group_phys entry.
    */
   static int
- cpu_to_phys_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg,
-                 cpumask_t *mask)
+ cpu_to_phys_group(int cpu, const struct cpumask *cpu_map,
+                 struct sched_group **sg, struct cpumask *mask)
   {
         int group;
   #ifdef CONFIG_SCHED_MC
-       *mask = cpu_coregroup_map(cpu);
-       cpus_and(*mask, *mask, *cpu_map);
-       group = first_cpu(*mask);
+       cpumask_and(mask, cpu_coregroup_mask(cpu), cpu_map);
+       group = cpumask_first(mask);
   #elif defined(CONFIG_SCHED_SMT)
-       *mask = per_cpu(cpu_sibling_map, cpu);
-       cpus_and(*mask, *mask, *cpu_map);
-       group = first_cpu(*mask);
+       cpumask_and(mask, &per_cpu(cpu_sibling_map, cpu), cpu_map);
+       group = cpumask_first(mask);
   #else
         group = cpu;
   #endif
         if (sg)
-               *sg = &per_cpu(sched_group_phys, group);
+               *sg = &per_cpu(sched_group_phys, group).sg;
         return group;
   }
   
@@@ -7196,19 -7286,19 +7356,19 @@@ static DEFINE_PER_CPU(struct sched_doma
   static struct sched_group ***sched_group_nodes_bycpu;
   
   static DEFINE_PER_CPU(struct sched_domain, allnodes_domains);
- static DEFINE_PER_CPU(struct sched_group, sched_group_allnodes);
+ static DEFINE_PER_CPU(struct static_sched_group, sched_group_allnodes);
   
+ /*
+  * Map @cpu to its all-nodes group: the group id is the first CPU of
+  * cpumask_of_node(cpu_to_node(@cpu)) restricted to @cpu_map. @nodemask is
+  * scratch space and is overwritten with that intersection. When @sg is
+  * non-NULL, *@sg is set to that group's per-cpu sched_group_allnodes entry.
+  * Returns the group id.
+  */
- static int cpu_to_allnodes_group(int cpu, const cpumask_t *cpu_map,
-                                struct sched_group **sg, cpumask_t *nodemask)
+ static int cpu_to_allnodes_group(int cpu, const struct cpumask *cpu_map,
+                                struct sched_group **sg,
+                                struct cpumask *nodemask)
   {
         int group;
   
-       *nodemask = node_to_cpumask(cpu_to_node(cpu));
-       cpus_and(*nodemask, *nodemask, *cpu_map);
-       group = first_cpu(*nodemask);
+       cpumask_and(nodemask, cpumask_of_node(cpu_to_node(cpu)), cpu_map);
+       group = cpumask_first(nodemask);
   
         if (sg)
-               *sg = &per_cpu(sched_group_allnodes, group);
+               *sg = &per_cpu(sched_group_allnodes, group).sg;
         return group;
   }
   
@@@ -7220,11 -7310,11 +7380,11 @@@ static void init_numa_sched_groups_powe
         if (!sg)
                 return;
         do {
-               for_each_cpu_mask_nr(j, sg->cpumask) {
+               for_each_cpu(j, sched_group_cpus(sg)) {
                         struct sched_domain *sd;
   
-                       sd = &per_cpu(phys_domains, j);
-                       if (j != first_cpu(sd->groups->cpumask)) {
+                       sd = &per_cpu(phys_domains, j).sd;
+                       if (j != cpumask_first(sched_group_cpus(sd->groups))) {
                                 /*
                                  * Only add "power" once for each
                                  * physical package.
@@@ -7241,11 -7331,12 +7401,12 @@@
   
   #ifdef CONFIG_NUMA
   /* Free memory allocated for various sched_group structures */
- static void free_sched_groups(const cpumask_t *cpu_map, cpumask_t *nodemask)
+ static void free_sched_groups(const struct cpumask *cpu_map,
+                             struct cpumask *nodemask)
   {
         int cpu, i;
   
-       for_each_cpu_mask_nr(cpu, *cpu_map) {
+       for_each_cpu(cpu, cpu_map) {
                 struct sched_group **sched_group_nodes
                         = sched_group_nodes_bycpu[cpu];
   
@@@ -7255,9 -7346,8 +7416,8 @@@
                 for (i = 0; i < nr_node_ids; i++) {
                         struct sched_group *oldsg, *sg = sched_group_nodes[i];
   
-                       *nodemask = node_to_cpumask(i);
-                       cpus_and(*nodemask, *nodemask, *cpu_map);
-                       if (cpus_empty(*nodemask))
+                       cpumask_and(nodemask, cpumask_of_node(i), cpu_map);
+                       if (cpumask_empty(nodemask))
                                 continue;
   
                         if (sg == NULL)
@@@ -7275,7 -7365,8 +7435,8 @@@ next_sg
         }
   }
   #else /* !CONFIG_NUMA */
+ /* !CONFIG_NUMA build: empty stub, both arguments are ignored. */
- static void free_sched_groups(const cpumask_t *cpu_map, cpumask_t *nodemask)
+ static void free_sched_groups(const struct cpumask *cpu_map,
+                             struct cpumask *nodemask)
   {
   }
   #endif /* CONFIG_NUMA */
@@@ -7301,7 -7392,7 +7462,7 @@@ static void init_sched_groups_power(in
   
         WARN_ON(!sd || !sd->groups);
   
-       if (cpu != first_cpu(sd->groups->cpumask))
+       if (cpu != cpumask_first(sched_group_cpus(sd->groups)))
                 return;
   
         child = sd->child;
@@@ -7366,48 -7457,6 +7527,6 @@@ SD_INIT_FUNC(CPU
    SD_INIT_FUNC(MC)
   #endif
   
- /*
-  * To minimize stack usage kmalloc room for cpumasks and share the
-  * space as the usage in build_sched_domains() dictates.  Used only
-  * if the amount of space is significant.
-  */
- struct allmasks {
-       cpumask_t tmpmask;                      /* make this one first */
-       union {
-               cpumask_t nodemask;
-               cpumask_t this_sibling_map;
-               cpumask_t this_core_map;
-       };
-       cpumask_t send_covered;
- 
- #ifdef CONFIG_NUMA
-       cpumask_t domainspan;
-       cpumask_t covered;
-       cpumask_t notcovered;
- #endif
- };
- 
- #if   NR_CPUS > 128
- #define SCHED_CPUMASK_DECLARE(v)      struct allmasks *v
- static inline void sched_cpumask_alloc(struct allmasks **masks)
- {
-       *masks = kmalloc(sizeof(**masks), GFP_KERNEL);
- }
- static inline void sched_cpumask_free(struct allmasks *masks)
- {
-       kfree(masks);
- }
- #else
- #define SCHED_CPUMASK_DECLARE(v)      struct allmasks _v, *v = &_v
- static inline void sched_cpumask_alloc(struct allmasks **masks)
- { }
- static inline void sched_cpumask_free(struct allmasks *masks)
- { }
- #endif
- 
- #define       SCHED_CPUMASK_VAR(v, a)         cpumask_t *v = (cpumask_t *) \
-                       ((unsigned long)(a) + offsetof(struct allmasks, v))
- 
   static int default_relax_domain_level = -1;
   
   static int __init setup_relax_domain_level(char *str)
@@@ -7447,17 -7496,38 +7566,38 @@@ static void set_domain_attribute(struc
    * Build sched domains for a given set of cpus and attach the sched domains
    * to the individual cpus
    */
- static int __build_sched_domains(const cpumask_t *cpu_map,
+ static int __build_sched_domains(const struct cpumask *cpu_map,
                                  struct sched_domain_attr *attr)
   {
-       int i;
+       int i, err = -ENOMEM;
         struct root_domain *rd;
-       SCHED_CPUMASK_DECLARE(allmasks);
-       cpumask_t *tmpmask;
+       cpumask_var_t nodemask, this_sibling_map, this_core_map, send_covered,
+               tmpmask;
   #ifdef CONFIG_NUMA
+       cpumask_var_t domainspan, covered, notcovered;
         struct sched_group **sched_group_nodes = NULL;
         int sd_allnodes = 0;
   
+       if (!alloc_cpumask_var(&domainspan, GFP_KERNEL))
+               goto out;
+       if (!alloc_cpumask_var(&covered, GFP_KERNEL))
+               goto free_domainspan;
+       if (!alloc_cpumask_var(&notcovered, GFP_KERNEL))
+               goto free_covered;
+ #endif
+ 
+       if (!alloc_cpumask_var(&nodemask, GFP_KERNEL))
+               goto free_notcovered;
+       if (!alloc_cpumask_var(&this_sibling_map, GFP_KERNEL))
+               goto free_nodemask;
+       if (!alloc_cpumask_var(&this_core_map, GFP_KERNEL))
+               goto free_this_sibling_map;
+       if (!alloc_cpumask_var(&send_covered, GFP_KERNEL))
+               goto free_this_core_map;
+       if (!alloc_cpumask_var(&tmpmask, GFP_KERNEL))
+               goto free_send_covered;
+ 
+ #ifdef CONFIG_NUMA
         /*
          * Allocate the per-node list of sched groups
          */
@@@ -7465,54 -7535,35 +7605,35 @@@
                                     GFP_KERNEL);
         if (!sched_group_nodes) {
                 printk(KERN_WARNING "Can not alloc sched group node list\n");
-               return -ENOMEM;
+               goto free_tmpmask;
         }
   #endif
   
         rd = alloc_rootdomain();
         if (!rd) {
                 printk(KERN_WARNING "Cannot alloc root domain\n");
- #ifdef CONFIG_NUMA
-               kfree(sched_group_nodes);
- #endif
-               return -ENOMEM;
-       }
- 
-       /* get space for all scratch cpumask variables */
-       sched_cpumask_alloc(&allmasks);
-       if (!allmasks) {
-               printk(KERN_WARNING "Cannot alloc cpumask array\n");
-               kfree(rd);
- #ifdef CONFIG_NUMA
-               kfree(sched_group_nodes);
- #endif
-               return -ENOMEM;
+               goto free_sched_groups;
         }
   
-       tmpmask = (cpumask_t *)allmasks;
- 
- 
   #ifdef CONFIG_NUMA
-       sched_group_nodes_bycpu[first_cpu(*cpu_map)] = sched_group_nodes;
+       sched_group_nodes_bycpu[cpumask_first(cpu_map)] = sched_group_nodes;
   #endif
   
         /*
          * Set up domains for cpus specified by the cpu_map.
          */
-       for_each_cpu_mask_nr(i, *cpu_map) {
+       for_each_cpu(i, cpu_map) {
                 struct sched_domain *sd = NULL, *p;
-               SCHED_CPUMASK_VAR(nodemask, allmasks);
   
-               *nodemask = node_to_cpumask(cpu_to_node(i));
-               cpus_and(*nodemask, *nodemask, *cpu_map);
+               cpumask_and(nodemask, cpumask_of_node(cpu_to_node(i)), cpu_map);
   
   #ifdef CONFIG_NUMA
-               if (cpus_weight(*cpu_map) >
-                               SD_NODES_PER_DOMAIN*cpus_weight(*nodemask)) {
+               if (cpumask_weight(cpu_map) >
+                               SD_NODES_PER_DOMAIN*cpumask_weight(nodemask)) {
                         sd = &per_cpu(allnodes_domains, i);
                         SD_INIT(sd, ALLNODES);
                         set_domain_attribute(sd, attr);
-                       sd->span = *cpu_map;
+                       cpumask_copy(sched_domain_span(sd), cpu_map);
                         cpu_to_allnodes_group(i, cpu_map, &sd->groups, tmpmask);
                         p = sd;
                         sd_allnodes = 1;
@@@ -7522,18 -7573,19 +7643,19 @@@
                 sd = &per_cpu(node_domains, i);
                 SD_INIT(sd, NODE);
                 set_domain_attribute(sd, attr);
-               sched_domain_node_span(cpu_to_node(i), &sd->span);
+               sched_domain_node_span(cpu_to_node(i), sched_domain_span(sd));
                 sd->parent = p;
                 if (p)
                         p->child = sd;
-               cpus_and(sd->span, sd->span, *cpu_map);
+               cpumask_and(sched_domain_span(sd),
+                           sched_domain_span(sd), cpu_map);
   #endif
   
                 p = sd;
-               sd = &per_cpu(phys_domains, i);
+               sd = &per_cpu(phys_domains, i).sd;
                 SD_INIT(sd, CPU);
                 set_domain_attribute(sd, attr);
-               sd->span = *nodemask;
+               cpumask_copy(sched_domain_span(sd), nodemask);
                 sd->parent = p;
                 if (p)
                         p->child = sd;
@@@ -7541,11 -7593,11 +7663,11 @@@
   
   #ifdef CONFIG_SCHED_MC
                 p = sd;
-               sd = &per_cpu(core_domains, i);
+               sd = &per_cpu(core_domains, i).sd;
                 SD_INIT(sd, MC);
                 set_domain_attribute(sd, attr);
-               sd->span = cpu_coregroup_map(i);
-               cpus_and(sd->span, sd->span, *cpu_map);
+               cpumask_and(sched_domain_span(sd), cpu_map,
+                                                  cpu_coregroup_mask(i));
                 sd->parent = p;
                 p->child = sd;
                 cpu_to_core_group(i, cpu_map, &sd->groups, tmpmask);
@@@ -7553,11 -7605,11 +7675,11 @@@
   
   #ifdef CONFIG_SCHED_SMT
                 p = sd;
-               sd = &per_cpu(cpu_domains, i);
+               sd = &per_cpu(cpu_domains, i).sd;
                 SD_INIT(sd, SIBLING);
                 set_domain_attribute(sd, attr);
-               sd->span = per_cpu(cpu_sibling_map, i);
-               cpus_and(sd->span, sd->span, *cpu_map);
+               cpumask_and(sched_domain_span(sd),
+                           &per_cpu(cpu_sibling_map, i), cpu_map);
                 sd->parent = p;
                 p->child = sd;
                 cpu_to_cpu_group(i, cpu_map, &sd->groups, tmpmask);
@@@ -7566,13 -7618,10 +7688,10 @@@
   
   #ifdef CONFIG_SCHED_SMT
         /* Set up CPU (sibling) groups */
-       for_each_cpu_mask_nr(i, *cpu_map) {
-               SCHED_CPUMASK_VAR(this_sibling_map, allmasks);
-               SCHED_CPUMASK_VAR(send_covered, allmasks);
- 
-               *this_sibling_map = per_cpu(cpu_sibling_map, i);
-               cpus_and(*this_sibling_map, *this_sibling_map, *cpu_map);
-               if (i != first_cpu(*this_sibling_map))
+       for_each_cpu(i, cpu_map) {
+               cpumask_and(this_sibling_map,
+                           &per_cpu(cpu_sibling_map, i), cpu_map);
+               if (i != cpumask_first(this_sibling_map))
                         continue;
   
                 init_sched_build_groups(this_sibling_map, cpu_map,
@@@ -7583,13 -7632,9 +7702,9 @@@
   
   #ifdef CONFIG_SCHED_MC
         /* Set up multi-core groups */
-       for_each_cpu_mask_nr(i, *cpu_map) {
-               SCHED_CPUMASK_VAR(this_core_map, allmasks);
-               SCHED_CPUMASK_VAR(send_covered, allmasks);
- 
-               *this_core_map = cpu_coregroup_map(i);
-               cpus_and(*this_core_map, *this_core_map, *cpu_map);
-               if (i != first_cpu(*this_core_map))
+       for_each_cpu(i, cpu_map) {
+               cpumask_and(this_core_map, cpu_coregroup_mask(i), cpu_map);
+               if (i != cpumask_first(this_core_map))
                         continue;
   
                 init_sched_build_groups(this_core_map, cpu_map,
@@@ -7600,12 -7645,8 +7715,8 @@@
   
         /* Set up physical groups */
         for (i = 0; i < nr_node_ids; i++) {
-               SCHED_CPUMASK_VAR(nodemask, allmasks);
-               SCHED_CPUMASK_VAR(send_covered, allmasks);
- 
-               *nodemask = node_to_cpumask(i);
-               cpus_and(*nodemask, *nodemask, *cpu_map);
-               if (cpus_empty(*nodemask))
+               cpumask_and(nodemask, cpumask_of_node(i), cpu_map);
+               if (cpumask_empty(nodemask))
                         continue;
   
                 init_sched_build_groups(nodemask, cpu_map,
@@@ -7616,8 -7657,6 +7727,6 @@@
   #ifdef CONFIG_NUMA
         /* Set up node groups */
         if (sd_allnodes) {
-               SCHED_CPUMASK_VAR(send_covered, allmasks);
- 
                 init_sched_build_groups(cpu_map, cpu_map,
                                         &cpu_to_allnodes_group,
                                         send_covered, tmpmask);
@@@ -7626,58 -7665,53 +7735,53 @@@
         for (i = 0; i < nr_node_ids; i++) {
                 /* Set up node groups */
                 struct sched_group *sg, *prev;
-               SCHED_CPUMASK_VAR(nodemask, allmasks);
-               SCHED_CPUMASK_VAR(domainspan, allmasks);
-               SCHED_CPUMASK_VAR(covered, allmasks);
                 int j;
   
-               *nodemask = node_to_cpumask(i);
-               cpus_clear(*covered);
- 
-               cpus_and(*nodemask, *nodemask, *cpu_map);
-               if (cpus_empty(*nodemask)) {
+               cpumask_clear(covered);
+               cpumask_and(nodemask, cpumask_of_node(i), cpu_map);
+               if (cpumask_empty(nodemask)) {
                         sched_group_nodes[i] = NULL;
                         continue;
                 }
   
                 sched_domain_node_span(i, domainspan);
-               cpus_and(*domainspan, *domainspan, *cpu_map);
+               cpumask_and(domainspan, domainspan, cpu_map);
   
-               sg = kmalloc_node(sizeof(struct sched_group), GFP_KERNEL, i);
+               sg = kmalloc_node(sizeof(struct sched_group) + cpumask_size(),
+                                 GFP_KERNEL, i);
                 if (!sg) {
                         printk(KERN_WARNING "Can not alloc domain group for "
                                 "node %d\n", i);
                         goto error;
                 }
                 sched_group_nodes[i] = sg;
-               for_each_cpu_mask_nr(j, *nodemask) {
+               for_each_cpu(j, nodemask) {
                         struct sched_domain *sd;
   
                         sd = &per_cpu(node_domains, j);
                         sd->groups = sg;
                 }
                 sg->__cpu_power = 0;
-               sg->cpumask = *nodemask;
+               cpumask_copy(sched_group_cpus(sg), nodemask);
                 sg->next = sg;
-               cpus_or(*covered, *covered, *nodemask);
+               cpumask_or(covered, covered, nodemask);
                 prev = sg;
   
                 for (j = 0; j < nr_node_ids; j++) {
-                       SCHED_CPUMASK_VAR(notcovered, allmasks);
                         int n = (i + j) % nr_node_ids;
-                       node_to_cpumask_ptr(pnodemask, n);
   
-                       cpus_complement(*notcovered, *covered);
-                       cpus_and(*tmpmask, *notcovered, *cpu_map);
-                       cpus_and(*tmpmask, *tmpmask, *domainspan);
-                       if (cpus_empty(*tmpmask))
+                       cpumask_complement(notcovered, covered);
+                       cpumask_and(tmpmask, notcovered, cpu_map);
+                       cpumask_and(tmpmask, tmpmask, domainspan);
+                       if (cpumask_empty(tmpmask))
                                 break;
   
-                       cpus_and(*tmpmask, *tmpmask, *pnodemask);
-                       if (cpus_empty(*tmpmask))
+                       cpumask_and(tmpmask, tmpmask, cpumask_of_node(n));
+                       if (cpumask_empty(tmpmask))
                                 continue;
   
-                       sg = kmalloc_node(sizeof(struct sched_group),
+                       sg = kmalloc_node(sizeof(struct sched_group) +
+                                         cpumask_size(),
                                           GFP_KERNEL, i);
                         if (!sg) {
                                 printk(KERN_WARNING
@@@ -7685,9 -7719,9 +7789,9 @@@
                                 goto error;
                         }
                         sg->__cpu_power = 0;
-                       sg->cpumask = *tmpmask;
+                       cpumask_copy(sched_group_cpus(sg), tmpmask);
                         sg->next = prev->next;
-                       cpus_or(*covered, *covered, *tmpmask);
+                       cpumask_or(covered, covered, tmpmask);
                         prev->next = sg;
                         prev = sg;
                 }
@@@ -7696,22 -7730,22 +7800,22 @@@
   
         /* Calculate CPU power for physical packages and nodes */
   #ifdef CONFIG_SCHED_SMT
-       for_each_cpu_mask_nr(i, *cpu_map) {
-               struct sched_domain *sd = &per_cpu(cpu_domains, i);
+       for_each_cpu(i, cpu_map) {
+               struct sched_domain *sd = &per_cpu(cpu_domains, i).sd;
   
                 init_sched_groups_power(i, sd);
         }
   #endif
   #ifdef CONFIG_SCHED_MC
-       for_each_cpu_mask_nr(i, *cpu_map) {
-               struct sched_domain *sd = &per_cpu(core_domains, i);
+       for_each_cpu(i, cpu_map) {
+               struct sched_domain *sd = &per_cpu(core_domains, i).sd;
   
                 init_sched_groups_power(i, sd);
         }
   #endif
   
-       for_each_cpu_mask_nr(i, *cpu_map) {
-               struct sched_domain *sd = &per_cpu(phys_domains, i);
+       for_each_cpu(i, cpu_map) {
+               struct sched_domain *sd = &per_cpu(phys_domains, i).sd;
   
                 init_sched_groups_power(i, sd);
         }
@@@ -7723,53 -7757,78 +7827,78 @@@
         if (sd_allnodes) {
                 struct sched_group *sg;
   
-               cpu_to_allnodes_group(first_cpu(*cpu_map), cpu_map, &sg,
+               cpu_to_allnodes_group(cpumask_first(cpu_map), cpu_map, &sg,
                                                                 tmpmask);
                 init_numa_sched_groups_power(sg);
         }
   #endif
   
         /* Attach the domains */
-       for_each_cpu_mask_nr(i, *cpu_map) {
+       for_each_cpu(i, cpu_map) {
                 struct sched_domain *sd;
   #ifdef CONFIG_SCHED_SMT
-               sd = &per_cpu(cpu_domains, i);
+               sd = &per_cpu(cpu_domains, i).sd;
   #elif defined(CONFIG_SCHED_MC)
-               sd = &per_cpu(core_domains, i);
+               sd = &per_cpu(core_domains, i).sd;
   #else
-               sd = &per_cpu(phys_domains, i);
+               sd = &per_cpu(phys_domains, i).sd;
   #endif
                 cpu_attach_domain(sd, rd, i);
         }
   
-       sched_cpumask_free(allmasks);
-       return 0;
+       err = 0;
+ 
+ free_tmpmask:
+       free_cpumask_var(tmpmask);
+ free_send_covered:
+       free_cpumask_var(send_covered);
+ free_this_core_map:
+       free_cpumask_var(this_core_map);
+ free_this_sibling_map:
+       free_cpumask_var(this_sibling_map);
+ free_nodemask:
+       free_cpumask_var(nodemask);
+ free_notcovered:
+ #ifdef CONFIG_NUMA
+       free_cpumask_var(notcovered);
+ free_covered:
+       free_cpumask_var(covered);
+ free_domainspan:
+       free_cpumask_var(domainspan);
+ out:
+ #endif
+       return err;
+ 
+ free_sched_groups:
+ #ifdef CONFIG_NUMA
+       kfree(sched_group_nodes);
+ #endif
+       goto free_tmpmask;
   
   #ifdef CONFIG_NUMA
   error:
         free_sched_groups(cpu_map, tmpmask);
-       sched_cpumask_free(allmasks);
-       kfree(rd);
-       return -ENOMEM;
+       free_rootdomain(rd);
+       goto free_tmpmask;
   #endif
   }
   
- static int build_sched_domains(const cpumask_t *cpu_map)
+ static int build_sched_domains(const struct cpumask *cpu_map)
   {
         return __build_sched_domains(cpu_map, NULL);
   }
   
- static cpumask_t *doms_cur;   /* current sched domains */
+ static struct cpumask *doms_cur;      /* current sched domains */
   static int ndoms_cur;         /* number of sched domains in 'doms_cur' */
   static struct sched_domain_attr *dattr_cur;
                                 /* attributes of custom domains in 'doms_cur' */
   
   /*
    * Special case: If a kmalloc of a doms_cur partition (array of
-  * cpumask_t) fails, then fallback to a single sched domain,
-  * as determined by the single cpumask_t fallback_doms.
+  * cpumask) fails, then fallback to a single sched domain,
+  * as determined by the single cpumask fallback_doms.
    */
- static cpumask_t fallback_doms;
+ static cpumask_var_t fallback_doms;
   
   /*
    * arch_update_cpu_topology lets virtualized architectures update the
@@@ -7786,16 -7845,16 +7915,16 @@@ int __attribute__((weak)) arch_update_c
    * For now this just excludes isolated cpus, but could be used to
    * exclude other special cases in the future.
    */
- static int arch_init_sched_domains(const cpumask_t *cpu_map)
+ static int arch_init_sched_domains(const struct cpumask *cpu_map)
   {
         int err;
   
         arch_update_cpu_topology();
         ndoms_cur = 1;
-       doms_cur = kmalloc(sizeof(cpumask_t), GFP_KERNEL);
+       doms_cur = kmalloc(cpumask_size(), GFP_KERNEL);
         if (!doms_cur)
-               doms_cur = &fallback_doms;
-       cpus_andnot(*doms_cur, *cpu_map, cpu_isolated_map);
+               doms_cur = fallback_doms;
+       cpumask_andnot(doms_cur, cpu_map, cpu_isolated_map);
         dattr_cur = NULL;
         err = build_sched_domains(doms_cur);
         register_sched_domain_sysctl();
@@@ -7803,8 -7862,8 +7932,8 @@@
         return err;
   }
   
- static void arch_destroy_sched_domains(const cpumask_t *cpu_map,
-                                      cpumask_t *tmpmask)
+ static void arch_destroy_sched_domains(const struct cpumask *cpu_map,
+                                      struct cpumask *tmpmask)
   {
         free_sched_groups(cpu_map, tmpmask);
   }
@@@ -7813,15 -7872,16 +7942,16 @@@
    * Detach sched domains from a group of cpus specified in cpu_map
    * These cpus will now be attached to the NULL domain
    */
- static void detach_destroy_domains(const cpumask_t *cpu_map)
+ static void detach_destroy_domains(const struct cpumask *cpu_map)
   {
-       cpumask_t tmpmask;
+       /* Safe because hotplug lock held. */
+       static DECLARE_BITMAP(tmpmask, CONFIG_NR_CPUS);
         int i;
   
-       for_each_cpu_mask_nr(i, *cpu_map)
+       for_each_cpu(i, cpu_map)
                 cpu_attach_domain(NULL, &def_root_domain, i);
         synchronize_sched();
-       arch_destroy_sched_domains(cpu_map, &tmpmask);
+       arch_destroy_sched_domains(cpu_map, to_cpumask(tmpmask));
   }
   
   /* handle null as "default" */
@@@ -7846,7 -7906,7 +7976,7 @@@ static int dattrs_equal(struct sched_do
    * doms_new[] to the current sched domain partitioning, doms_cur[].
    * It destroys each deleted domain and builds each new domain.
    *
-  * 'doms_new' is an array of cpumask_t's of length 'ndoms_new'.
+  * 'doms_new' is an array of cpumask's of length 'ndoms_new'.
    * The masks don't intersect (don't overlap.) We should setup one
    * sched domain for each mask. CPUs not in any of the cpumasks will
    * not be load balanced. If the same cpumask appears both in the
@@@ -7860,13 -7920,14 +7990,14 @@@
    * the single partition 'fallback_doms', it also forces the domains
    * to be rebuilt.
    *
-  * If doms_new == NULL it will be replaced with cpu_online_map.
+  * If doms_new == NULL it will be replaced with cpu_online_mask.
    * ndoms_new == 0 is a special case for destroying existing domains,
    * and it will not create the default domain.
    *
    * Call with hotplug lock held
    */
- void partition_sched_domains(int ndoms_new, cpumask_t *doms_new,
+ /* FIXME: Change to struct cpumask *doms_new[] */
+ void partition_sched_domains(int ndoms_new, struct cpumask *doms_new,
                              struct sched_domain_attr *dattr_new)
   {
         int i, j, n;
@@@ -7885,7 -7946,7 +8016,7 @@@
         /* Destroy deleted domains */
         for (i = 0; i < ndoms_cur; i++) {
                 for (j = 0; j < n && !new_topology; j++) {
-                       if (cpus_equal(doms_cur[i], doms_new[j])
+                       if (cpumask_equal(&doms_cur[i], &doms_new[j])
                             && dattrs_equal(dattr_cur, i, dattr_new, j))
                                 goto match1;
                 }
@@@ -7897,15 -7958,15 +8028,15 @@@ match1
   
         if (doms_new == NULL) {
                 ndoms_cur = 0;
-               doms_new = &fallback_doms;
-               cpus_andnot(doms_new[0], cpu_online_map, cpu_isolated_map);
+               doms_new = fallback_doms;
+               cpumask_andnot(&doms_new[0], cpu_online_mask, cpu_isolated_map);
                 WARN_ON_ONCE(dattr_new);
         }
   
         /* Build new domains */
         for (i = 0; i < ndoms_new; i++) {
                 for (j = 0; j < ndoms_cur && !new_topology; j++) {
-                       if (cpus_equal(doms_new[i], doms_cur[j])
+                       if (cpumask_equal(&doms_new[i], &doms_cur[j])
                             && dattrs_equal(dattr_new, i, dattr_cur, j))
                                 goto match2;
                 }
@@@ -7917,7 -7978,7 +8048,7 @@@ match2
         }
   
         /* Remember the new sched domains */
-       if (doms_cur != &fallback_doms)
+       if (doms_cur != fallback_doms)
                 kfree(doms_cur);
         kfree(dattr_cur);       /* kfree(NULL) is safe */
         doms_cur = doms_new;
@@@ -7930,7 -7991,7 +8061,7 @@@
   }
   
   #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
- int arch_reinit_sched_domains(void)
+ static void arch_reinit_sched_domains(void)
   {
         get_online_cpus();
   
@@@ -7939,25 -8000,33 +8070,33 @@@
   
         rebuild_sched_domains();
         put_online_cpus();
- 
-       return 0;
   }
   
   static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt)
   {
-       int ret;
+       unsigned int level = 0;
+ 
+       if (sscanf(buf, "%u", &level) != 1)
+               return -EINVAL;
+ 
+       /*
+        * level is unsigned and never negative, so don't check for
+        * level < POWERSAVINGS_BALANCE_NONE which is 0
+        * What happens on 0 or 1 byte write,
+        * need to check for count as well?
+        */
   
-       if (buf[0] != '0' && buf[0] != '1')
+       if (level >= MAX_POWERSAVINGS_BALANCE_LEVELS)
                 return -EINVAL;
   
         if (smt)
-               sched_smt_power_savings = (buf[0] == '1');
+               sched_smt_power_savings = level;
         else
-               sched_mc_power_savings = (buf[0] == '1');
+               sched_mc_power_savings = level;
   
-       ret = arch_reinit_sched_domains();
+       arch_reinit_sched_domains();
   
-       return ret ? ret : count;
+       return count;
   }
   
   #ifdef CONFIG_SCHED_MC
@@@ -7992,7 -8061,7 +8131,7 @@@ static SYSDEV_CLASS_ATTR(sched_smt_powe
                    sched_smt_power_savings_store);
   #endif
   
- int sched_create_sysfs_power_savings_entries(struct sysdev_class *cls)
+ int __init sched_create_sysfs_power_savings_entries(struct sysdev_class *cls)
   {
         int err = 0;
   
@@@ -8057,7 -8126,9 +8196,9 @@@ static int update_runtime(struct notifi
   
   void __init sched_init_smp(void)
   {
-       cpumask_t non_isolated_cpus;
+       cpumask_var_t non_isolated_cpus;
+ 
+       alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL);
   
   #if defined(CONFIG_NUMA)
         sched_group_nodes_bycpu = kzalloc(nr_cpu_ids * sizeof(void **),
@@@ -8066,10 -8137,10 +8207,10 @@@
   #endif
         get_online_cpus();
         mutex_lock(&sched_domains_mutex);
-       arch_init_sched_domains(&cpu_online_map);
-       cpus_andnot(non_isolated_cpus, cpu_possible_map, cpu_isolated_map);
-       if (cpus_empty(non_isolated_cpus))
-               cpu_set(smp_processor_id(), non_isolated_cpus);
+       arch_init_sched_domains(cpu_online_mask);
+       cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map);
+       if (cpumask_empty(non_isolated_cpus))
+               cpumask_set_cpu(smp_processor_id(), non_isolated_cpus);
         mutex_unlock(&sched_domains_mutex);
         put_online_cpus();
   
@@@ -8084,9 -8155,13 +8225,13 @@@
         init_hrtick();
   
         /* Move init over to a non-isolated CPU */
-       if (set_cpus_allowed_ptr(current, &non_isolated_cpus) < 0)
+       if (set_cpus_allowed_ptr(current, non_isolated_cpus) < 0)
                 BUG();
         sched_init_granularity();
+       free_cpumask_var(non_isolated_cpus);
+ 
+       alloc_cpumask_var(&fallback_doms, GFP_KERNEL);
+       init_sched_rt_class();
   }
   #else
   void __init sched_init_smp(void)
@@@ -8401,6 -8476,15 +8546,15 @@@ void __init sched_init(void
          */
         current->sched_class = &fair_sched_class;
   
+       /* Allocate the nohz_cpu_mask if CONFIG_CPUMASK_OFFSTACK */
+       alloc_bootmem_cpumask_var(&nohz_cpu_mask);
+ #ifdef CONFIG_SMP
+ #ifdef CONFIG_NO_HZ
+       alloc_bootmem_cpumask_var(&nohz.cpu_mask);
+ #endif
+       alloc_bootmem_cpumask_var(&cpu_isolated_map);
+ #endif /* SMP */
+ 
         scheduler_running = 1;
   }
   
diff --combined kernel/sys.c

index 1544c305751e146e1d48bc2e7e26f053801e704b,763c3c17ded3e6c818283da75dd0865b93858c7a..c2a951ae4223937de1c563aec821262908e7cbc8
--- 1/kernel/sys.c
--- 2/kernel/sys.c
+++ b/kernel/sys.c
@@@ -14,7 -14,6 +14,7 @@@
   #include <linux/prctl.h>
   #include <linux/highuid.h>
   #include <linux/fs.h>
+ +#include <linux/perf_counter.h>
   #include <linux/resource.h>
   #include <linux/kernel.h>
   #include <linux/kexec.h>
@@@ -34,6 -33,7 +34,7 @@@
   #include <linux/task_io_accounting_ops.h>
   #include <linux/seccomp.h>
   #include <linux/cpu.h>
+ #include <linux/ptrace.h>
   
   #include <linux/compat.h>
   #include <linux/syscalls.h>
@@@ -908,8 -908,8 +909,8 @@@ void do_sys_times(struct tms *tms
         struct task_cputime cputime;
         cputime_t cutime, cstime;
   
-       spin_lock_irq(&current->sighand->siglock);
         thread_group_cputime(current, &cputime);
+       spin_lock_irq(&current->sighand->siglock);
         cutime = current->signal->cutime;
         cstime = current->signal->cstime;
         spin_unlock_irq(&current->sighand->siglock);
@@@ -928,6 -928,7 +929,7 @@@ asmlinkage long sys_times(struct tms __
                 if (copy_to_user(tbuf, &tmp, sizeof(struct tms)))
                         return -EFAULT;
         }
+       force_successful_syscall_return();
         return (long) jiffies_64_to_clock_t(get_jiffies_64());
   }
   
@@@ -1628,6 -1629,8 +1630,8 @@@ static void k_getrusage(struct task_str
         utime = stime = cputime_zero;
   
         if (who == RUSAGE_THREAD) {
+               utime = task_utime(current);
+               stime = task_stime(current);
                 accumulate_thread_rusage(p, r);
                 goto out;
         }
@@@ -1794,12 -1797,6 +1798,12 @@@ asmlinkage long sys_prctl(int option, u
                 case PR_SET_TSC:
                         error = SET_TSC_CTL(arg2);
                         break;
+ +              case PR_TASK_PERF_COUNTERS_DISABLE:
+ +                      error = perf_counter_task_disable();
+ +                      break;
+ +              case PR_TASK_PERF_COUNTERS_ENABLE:
+ +                      error = perf_counter_task_enable();
+ +                      break;
                 case PR_GET_TIMERSLACK:
                         error = current->timer_slack_ns;
                         break;
author	Ingo Molnar <mingo@elte.hu>
	Sun, 11 Jan 2009 01:42:53 +0000 (02:42 +0100)
committer	Ingo Molnar <mingo@elte.hu>
	Sun, 11 Jan 2009 01:42:53 +0000 (02:42 +0100)
		1	2
arch/x86/Kconfig	patch \|	diff1 \|	diff2 \|	blob \| history
arch/x86/include/asm/atomic_32.h	patch \|	diff1 \|	diff2 \|	blob \| history
arch/x86/include/asm/irq_vectors.h	patch \|	diff1 \|	diff2 \|	blob \| history
arch/x86/kernel/apic.c	patch \|	diff1 \|	diff2 \|	blob \| history
arch/x86/kernel/cpu/common.c	patch \|	diff1 \|	diff2 \|	blob \| history
arch/x86/kernel/irq.c	patch \|	diff1 \|	diff2 \|	blob \| history
arch/x86/kernel/irqinit_32.c	patch \|	diff1 \|	diff2 \|	blob \| history
arch/x86/kernel/irqinit_64.c	patch \|	diff1 \|	diff2 \|	blob \| history
drivers/acpi/processor_idle.c	patch \|	diff1 \|	diff2 \|	blob \| history
drivers/char/sysrq.c	patch \|	diff1 \|	diff2 \|	blob \| history
fs/exec.c	patch \|	diff1 \|	diff2 \|	blob \| history
include/linux/init_task.h	patch \|	diff1 \|	diff2 \|	blob \| history
include/linux/kernel_stat.h	patch \|	diff1 \|	diff2 \|	blob \| history
include/linux/sched.h	patch \|	diff1 \|	diff2 \|	blob \| history
include/linux/syscalls.h	patch \|	diff1 \|	diff2 \|	blob \| history
init/Kconfig	patch \|	diff1 \|	diff2 \|	blob \| history
kernel/Makefile	patch \|	diff1 \|	diff2 \|	blob \| history
kernel/exit.c	patch \|	diff1 \|	diff2 \|	blob \| history
kernel/fork.c	patch \|	diff1 \|	diff2 \|	blob \| history
kernel/sched.c	patch \|	diff1 \|	diff2 \|	blob \| history
kernel/sys.c	patch \|	diff1 \|	diff2 \|	blob \| history