* 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/percpu: (46 commits)
powerpc64: convert to dynamic percpu allocator
sparc64: use embedding percpu first chunk allocator
percpu: kill lpage first chunk allocator
x86,percpu: use embedding for 64bit NUMA and page for 32bit NUMA
percpu: update embedding first chunk allocator to handle sparse units
percpu: use group information to allocate vmap areas sparsely
vmalloc: implement pcpu_get_vm_areas()
vmalloc: separate out insert_vmalloc_vm()
percpu: add chunk->base_addr
percpu: add pcpu_unit_offsets[]
percpu: introduce pcpu_alloc_info and pcpu_group_info
percpu: move pcpu_lpage_build_unit_map() and pcpul_lpage_dump_cfg() upward
percpu: add @align to pcpu_fc_alloc_fn_t
percpu: make @dyn_size mandatory for pcpu_setup_first_chunk()
percpu: drop @static_size from first chunk allocators
percpu: generalize first chunk allocator selection
percpu: build first chunk allocators selectively
percpu: rename 4k first chunk allocator to page
percpu: improve boot messages
percpu: fix pcpu_reclaim() locking
...
Fix trivial conflict in kernel/sched.c, as done by Tejun Heo.
ISAPNP ISA PnP code is enabled.
ISDN Appropriate ISDN support is enabled.
JOY Appropriate joystick support is enabled.
+ KVM Kernel Virtual Machine support is enabled.
LIBATA Libata driver is enabled
LP Printer support is enabled.
LOOP Loopback device support is enabled.
kstack=N [X86] Print N words from the kernel stack
in oops dumps.
+ kvm.ignore_msrs=[KVM] Ignore guest accesses to unhandled MSRs.
+ Default is 0 (don't ignore, but inject #GP)
+
+ kvm.oos_shadow= [KVM] Disable out-of-sync shadow paging.
+ Default is 1 (enabled)
+
+ kvm-amd.nested= [KVM,AMD] Allow nested virtualization in KVM/SVM.
+ Default is 0 (off)
+
+ kvm-amd.npt= [KVM,AMD] Disable nested paging (virtualized MMU)
+ for all guests.
+ Default is 1 (enabled) if in 64bit or 32bit-PAE mode
+
+ kvm-intel.bypass_guest_pf=
+ [KVM,Intel] Disables bypassing of guest page faults
+ on Intel chips. Default is 1 (enabled)
+
+ kvm-intel.ept= [KVM,Intel] Disable extended page tables
+ (virtualized MMU) support on capable Intel chips.
+ Default is 1 (enabled)
+
+ kvm-intel.emulate_invalid_guest_state=
+ [KVM,Intel] Enable emulation of invalid guest states
+ Default is 0 (disabled)
+
+ kvm-intel.flexpriority=
+ [KVM,Intel] Disable FlexPriority feature (TPR shadow).
+ Default is 1 (enabled)
+
+ kvm-intel.unrestricted_guest=
+ [KVM,Intel] Disable unrestricted guest feature
+ (virtualized real and unpaged mode) on capable
+ Intel chips. Default is 1 (enabled)
+
+ kvm-intel.vpid= [KVM,Intel] Disable Virtual Processor Identification
+ feature (tagged TLBs) on capable Intel chips.
+ Default is 1 (enabled)
+
l2cr= [PPC]
l3cr= [PPC]
[NFS] set the TCP port on which the NFSv4 callback
channel should listen.
+ nfs.cache_getent=
+ [NFS] sets the pathname to the program which is used
+ to update the NFS client cache entries.
+
+ nfs.cache_getent_timeout=
+ [NFS] sets the timeout after which an attempt to
+ update a cache entry is deemed to have failed.
+
nfs.idmap_cache_timeout=
[NFS] set the maximum lifetime for idmapper cache
entries.
symbolic names: lapic and ioapic
Example: nmi_watchdog=2 or nmi_watchdog=panic,lapic
+ netpoll.carrier_timeout=
+ [NET] Specifies the amount of time (in seconds) that
+ netpoll should wait for a carrier. By default netpoll
+ waits 4 seconds.
+
no387 [BUGS=X86-32] Tells the kernel to use the 387 maths
emulation library even if a 387 maths coprocessor
is present.
Format: { 0 | 1 }
See arch/parisc/kernel/pdc_chassis.c
- percpu_alloc= [X86] Select which percpu first chunk allocator to use.
- Allowed values are one of "lpage", "embed" and "4k".
- See comments in arch/x86/kernel/setup_percpu.c for
- details on each allocator. This parameter is primarily
- for debugging and performance comparison.
+ percpu_alloc= Select which percpu first chunk allocator to use.
+ Currently supported values are "embed" and "page".
+ Archs may support a subset or none of the selections.
+ See comments in mm/percpu.c for details on each
+ allocator. This parameter is primarily for debugging
+ and performance comparison.
pf. [PARIDE]
See Documentation/blockdev/paride.txt.
stifb= [HW]
Format: bpp:<bpp1>[:<bpp2>[:<bpp3>...]]
+ sunrpc.min_resvport=
+ sunrpc.max_resvport=
+ [NFS,SUNRPC]
+ SunRPC servers often require that client requests
+ originate from a privileged port (i.e. a port in the
+ range 0 < portnr < 1024).
+ An administrator who wishes to reserve some of these
+ ports for other uses may adjust the range that the
+ kernel's sunrpc client considers to be privileged
+ using these two parameters to set the minimum and
+ maximum port values.
+
sunrpc.pool_mode=
[NFS]
Control how the NFS server code allocates CPUs to
pernode one pool for each NUMA node (equivalent
to global on non-NUMA machines)
+ sunrpc.tcp_slot_table_entries=
+ sunrpc.udp_slot_table_entries=
+ [NFS,SUNRPC]
+ Sets the upper limit on the number of simultaneous
+ RPC calls that can be sent from the client to a
+ server. Increasing these values may allow you to
+ improve throughput, but will also increase the
+ amount of memory reserved for use by the client.
+
swiotlb= [IA-64] Number of I/O TLB slabs
switches= [HW,M68k]
trace_buf_size=nn[KMG]
[FTRACE] will set tracing buffer size.
+ trace_event=[event-list]
+ [FTRACE] Set and start specified trace events in order
+ to facilitate early boot debugging.
+ See also Documentation/trace/events.txt
+
trix= [HW,OSS] MediaTrix AudioTrix Pro
Format:
<io>,<irq>,<dma>,<dma2>,<sb_io>,<sb_irq>,<sb_dma>,<mpu_io>,<mpu_irq>
VERSION = 2
PATCHLEVEL = 6
SUBLEVEL = 31
-EXTRAVERSION = -rc6
+EXTRAVERSION =
NAME = Man-Eating Seals of Antiquity
# *DOCUMENTATION*
MODFLAGS = -DMODULE
CFLAGS_MODULE = $(MODFLAGS)
AFLAGS_MODULE = $(MODFLAGS)
- LDFLAGS_MODULE =
+ LDFLAGS_MODULE = -T $(srctree)/scripts/module-common.lds
CFLAGS_KERNEL =
AFLAGS_KERNEL =
CFLAGS_GCOV = -fprofile-arcs -ftest-coverage
bool
default y
+ config HAVE_LEGACY_PER_CPU_AREA
+ def_bool y
+
config HAVE_SETUP_PER_CPU_AREA
def_bool y
bool
select GENERIC_ALLOCATOR
+config ARCH_USES_PG_UNCACHED
+ def_bool y
+ depends on IA64_UNCACHED_ALLOCATOR
+
config AUDIT_ARCH
bool
default y
#define NR_STAB_CACHE_ENTRIES 8
static DEFINE_PER_CPU(long, stab_cache_ptr);
- static DEFINE_PER_CPU(long, stab_cache[NR_STAB_CACHE_ENTRIES]);
+ static DEFINE_PER_CPU(long [NR_STAB_CACHE_ENTRIES], stab_cache);
/*
* Create a segment table entry for the given esid/vsid pair.
{
struct stab_entry *stab = (struct stab_entry *) get_paca()->stab_addr;
struct stab_entry *ste;
- unsigned long offset = __get_cpu_var(stab_cache_ptr);
+ unsigned long offset;
unsigned long pc = KSTK_EIP(tsk);
unsigned long stack = KSTK_ESP(tsk);
unsigned long unmapped_base;
/* Force previous translations to complete. DRENG */
asm volatile("isync" : : : "memory");
+ /*
+ * We need interrupts hard-disabled here, not just soft-disabled,
+ * so that a PMU interrupt can't occur, which might try to access
+ * user memory (to get a stack trace) and possibly cause an STAB miss
+ * which would update the stab_cache/stab_cache_ptr per-cpu variables.
+ */
+ hard_irq_disable();
+
+ offset = __get_cpu_var(stab_cache_ptr);
if (offset <= NR_STAB_CACHE_ENTRIES) {
int i;
. = ALIGN(PAGE_SIZE);
_eshared = .; /* End of shareable data */
- . = ALIGN(16); /* Exception table */
- __ex_table : {
- __start___ex_table = .;
- *(__ex_table)
- __stop___ex_table = .;
- } :data
-
- .data : { /* Data */
- DATA_DATA
- CONSTRUCTORS
- }
-
- . = ALIGN(PAGE_SIZE);
- .data_nosave : {
- __nosave_begin = .;
- *(.data.nosave)
- }
- . = ALIGN(PAGE_SIZE);
- __nosave_end = .;
-
- . = ALIGN(PAGE_SIZE);
- .data.page_aligned : {
- *(.data.idt)
- }
+ EXCEPTION_TABLE(16) :data
- . = ALIGN(0x100);
- .data.cacheline_aligned : {
- *(.data.cacheline_aligned)
- }
+ RW_DATA_SECTION(0x100, PAGE_SIZE, THREAD_SIZE)
- . = ALIGN(0x100);
- .data.read_mostly : {
- *(.data.read_mostly)
- }
_edata = .; /* End of data section */
- . = ALIGN(THREAD_SIZE); /* init_task */
- .data.init_task : {
- *(.data.init_task)
- }
-
/* will be freed after init */
. = ALIGN(PAGE_SIZE); /* Init code and data */
__init_begin = .;
- .init.text : {
- _sinittext = .;
- INIT_TEXT
- _einittext = .;
- }
+
+ INIT_TEXT_SECTION(PAGE_SIZE)
+
/*
* .exit.text is discarded at runtime, not link time,
* to deal with references from __bug_table
/* early.c uses stsi, which requires page aligned data. */
. = ALIGN(PAGE_SIZE);
- .init.data : {
- INIT_DATA
- }
- . = ALIGN(0x100);
- .init.setup : {
- __setup_start = .;
- *(.init.setup)
- __setup_end = .;
- }
- .initcall.init : {
- __initcall_start = .;
- INITCALLS
- __initcall_end = .;
- }
-
- .con_initcall.init : {
- __con_initcall_start = .;
- *(.con_initcall.init)
- __con_initcall_end = .;
- }
- SECURITY_INIT
-
-#ifdef CONFIG_BLK_DEV_INITRD
- . = ALIGN(0x100);
- .init.ramfs : {
- __initramfs_start = .;
- *(.init.ramfs)
- . = ALIGN(2);
- __initramfs_end = .;
- }
-#endif
+ INIT_DATA_SECTION(0x100)
PERCPU(PAGE_SIZE)
. = ALIGN(PAGE_SIZE);
__init_end = .; /* freed after init ends here */
- /* BSS */
- .bss : {
- __bss_start = .;
- *(.bss)
- . = ALIGN(2);
- __bss_stop = .;
- }
+ BSS_SECTION(0, 2, 0)
_end = . ;
- /* Sections to be discarded */
- /DISCARD/ : {
- EXIT_DATA
- *(.exitcall.exit)
- }
-
/* Debugging sections. */
STABS_DEBUG
DWARF_DEBUG
+
+ /* Sections to be discarded */
+ DISCARDS
}
select ARCH_WANT_OPTIONAL_GPIOLIB
select RTC_CLASS
select RTC_DRV_M48T59
+ select HAVE_PERF_COUNTERS
+ select HAVE_DMA_ATTRS
+ select HAVE_DMA_API_DEBUG
config SPARC32
def_bool !64BIT
select RTC_DRV_BQ4802
select RTC_DRV_SUN4V
select RTC_DRV_STARFIRE
+ select HAVE_PERF_COUNTERS
config ARCH_DEFCONFIG
string
config HAVE_SETUP_PER_CPU_AREA
def_bool y if SPARC64
- config HAVE_DYNAMIC_PER_CPU_AREA
+ config NEED_PER_CPU_EMBED_FIRST_CHUNK
def_bool y if SPARC64
config GENERIC_HARDIRQS_NO__DO_IRQ
If unsure, say N.
+config SPARC_LEON
+ bool "Sparc Leon processor family"
+ depends on SPARC32
+ ---help---
+ Say Y here if you are running on a SPARC-LEON processor.
+ The LEON processor is a synthesizable VHDL model of the
+ SPARC-v8 standard. LEON is part of the GRLIB collection of
+ IP cores that are distributed under GPL. GRLIB can be downloaded
+ from www.gaisler.com. You can download a sparc-linux cross-compilation
+ toolchain at www.gaisler.com.
+
endmenu
menu "Bus options (PCI etc.)"
select HAVE_FUNCTION_GRAPH_FP_TEST
select HAVE_FUNCTION_TRACE_MCOUNT_TEST
select HAVE_FTRACE_NMI_ENTER if DYNAMIC_FTRACE
- select HAVE_FTRACE_SYSCALLS
+ select HAVE_SYSCALL_TRACEPOINTS
select HAVE_KVM
select HAVE_ARCH_KGDB
select HAVE_ARCH_TRACEHOOK
config HAVE_SETUP_PER_CPU_AREA
def_bool y
- config HAVE_DYNAMIC_PER_CPU_AREA
+ config NEED_PER_CPU_EMBED_FIRST_CHUNK
+ def_bool y
+
+ config NEED_PER_CPU_PAGE_FIRST_CHUNK
def_bool y
config HAVE_CPUMASK_OF_CPU_MAP
config ARCH_SUPPORTS_DEBUG_PAGEALLOC
def_bool y
+config HAVE_INTEL_TXT
+ def_bool y
+ depends on EXPERIMENTAL && DMAR && ACPI
+
# Use the generic interrupt handling code in kernel/irq/:
config GENERIC_HARDIRQS
bool
bool "GART IOMMU support" if EMBEDDED
default y
select SWIOTLB
- select AGP
depends on X86_64 && PCI
---help---
Support for full DMA access of devices with 32bit memory access only
If unsure, say Y.
+config ARCH_USES_PG_UNCACHED
+ def_bool y
+ depends on X86_PAT
+
config EFI
bool "EFI runtime service support"
depends on ACPI
#define __percpu_arg(x) "%%"__stringify(__percpu_seg)":%P" #x
#define __my_cpu_offset percpu_read(this_cpu_off)
#else
-#define __percpu_arg(x) "%" #x
+#define __percpu_arg(x) "%P" #x
#endif
/*
} \
} while (0)
-#define percpu_from_op(op, var) \
+#define percpu_from_op(op, var, constraint) \
({ \
typeof(var) ret__; \
switch (sizeof(var)) { \
case 1: \
asm(op "b "__percpu_arg(1)",%0" \
: "=q" (ret__) \
- : "m" (var)); \
+ : constraint); \
break; \
case 2: \
asm(op "w "__percpu_arg(1)",%0" \
: "=r" (ret__) \
- : "m" (var)); \
+ : constraint); \
break; \
case 4: \
asm(op "l "__percpu_arg(1)",%0" \
: "=r" (ret__) \
- : "m" (var)); \
+ : constraint); \
break; \
case 8: \
asm(op "q "__percpu_arg(1)",%0" \
: "=r" (ret__) \
- : "m" (var)); \
+ : constraint); \
break; \
default: __bad_percpu_size(); \
} \
ret__; \
})
-#define percpu_read(var) percpu_from_op("mov", per_cpu__##var)
+/*
+ * percpu_read() makes gcc load the percpu variable every time it is
+ * accessed while percpu_read_stable() allows the value to be cached.
+ * percpu_read_stable() is more efficient and can be used if its value
+ * is guaranteed to be valid across cpus. The current users include
+ * get_current() and get_thread_info() both of which are actually
+ * per-thread variables implemented as per-cpu variables and thus
+ * stable for the duration of the respective task.
+ */
+#define percpu_read(var) percpu_from_op("mov", per_cpu__##var, \
+ "m" (per_cpu__##var))
+#define percpu_read_stable(var) percpu_from_op("mov", per_cpu__##var, \
+ "p" (&per_cpu__##var))
#define percpu_write(var, val) percpu_to_op("mov", per_cpu__##var, val)
#define percpu_add(var, val) percpu_to_op("add", per_cpu__##var, val)
#define percpu_sub(var, val) percpu_to_op("sub", per_cpu__##var, val)
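
To illustrate the distinction drawn in the comment above, the intended user of percpu_read_stable() is a per-thread-stable variable such as x86's current_task. A minimal sketch (not part of this patch, assuming the usual current_task per-cpu declaration) would be:

	DECLARE_PER_CPU(struct task_struct *, current_task);

	static __always_inline struct task_struct *get_current(void)
	{
		/*
		 * current_task only changes on a context switch, so the
		 * compiler is allowed to cache this load for the duration
		 * of the running task.
		 */
		return percpu_read_stable(current_task);
	}

A plain percpu_read() would instead force a fresh load on every access, which matters for variables that may change under the caller.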
/* We can use this directly for local CPU (faster). */
DECLARE_PER_CPU(unsigned long, this_cpu_off);
- #ifdef CONFIG_NEED_MULTIPLE_NODES
- void *pcpu_lpage_remapped(void *kaddr);
- #else
- static inline void *pcpu_lpage_remapped(void *kaddr)
- {
- return NULL;
- }
- #endif
-
#endif /* !__ASSEMBLY__ */
#ifdef CONFIG_SMP
set_bit(0, &mce_need_notify);
}
+void __weak decode_mce(struct mce *m)
+{
+ return;
+}
+
static void print_mce(struct mce *m)
{
printk(KERN_EMERG
printk(KERN_EMERG "PROCESSOR %u:%x TIME %llu SOCKET %u APIC %x\n",
m->cpuvendor, m->cpuid, m->time, m->socketid,
m->apicid);
+
+ decode_mce(m);
}
static void print_mce_head(void)
static void print_mce_tail(void)
{
printk(KERN_EMERG "This is not a software problem!\n"
- "Run through mcelog --ascii to decode and contact your hardware vendor\n");
+#if (!defined(CONFIG_EDAC) || !defined(CONFIG_CPU_SUP_AMD))
+ "Run through mcelog --ascii to decode and contact your hardware vendor\n"
+#endif
+ );
}
#define PANIC_TIMEOUT 5 /* 5 seconds */
*/
static int check_interval = 5 * 60; /* 5 minutes */
- static DEFINE_PER_CPU(int, next_interval); /* in jiffies */
+ static DEFINE_PER_CPU(int, mce_next_interval); /* in jiffies */
static DEFINE_PER_CPU(struct timer_list, mce_timer);
static void mcheck_timer(unsigned long data)
* Alert userspace if needed. If we logged an MCE, reduce the
* polling interval, otherwise increase the polling interval.
*/
- n = &__get_cpu_var(next_interval);
+ n = &__get_cpu_var(mce_next_interval);
if (mce_notify_irq())
*n = max(*n/2, HZ/100);
else
}
/* Add per CPU specific workarounds here */
-static void mce_cpu_quirks(struct cpuinfo_x86 *c)
+static int mce_cpu_quirks(struct cpuinfo_x86 *c)
{
+ if (c->x86_vendor == X86_VENDOR_UNKNOWN) {
+ pr_info("MCE: unknown CPU type - not enabling MCE support.\n");
+ return -EOPNOTSUPP;
+ }
+
/* This should be disabled by the BIOS, but isn't always */
if (c->x86_vendor == X86_VENDOR_AMD) {
if (c->x86 == 15 && banks > 4) {
if ((c->x86 > 6 || (c->x86 == 6 && c->x86_model >= 0xe)) &&
monarch_timeout < 0)
monarch_timeout = USEC_PER_SEC;
+
+ /*
+ * There are also broken BIOSes on some Pentium M and
+ * earlier systems:
+ */
+ if (c->x86 == 6 && c->x86_model <= 13 && mce_bootlog < 0)
+ mce_bootlog = 0;
}
if (monarch_timeout < 0)
monarch_timeout = 0;
if (mce_bootlog != 0)
mce_panic_timeout = 30;
+
+ return 0;
}
static void __cpuinit mce_ancient_init(struct cpuinfo_x86 *c)
static void mce_init_timer(void)
{
struct timer_list *t = &__get_cpu_var(mce_timer);
- int *n = &__get_cpu_var(next_interval);
+ int *n = &__get_cpu_var(mce_next_interval);
if (mce_ignore_ce)
return;
if (!mce_available(c))
return;
- if (mce_cap_init() < 0) {
+ if (mce_cap_init() < 0 || mce_cpu_quirks(c) < 0) {
mce_disabled = 1;
return;
}
- mce_cpu_quirks(c);
machine_check_vector = do_machine_check;
case CPU_DOWN_FAILED:
case CPU_DOWN_FAILED_FROZEN:
t->expires = round_jiffies(jiffies +
- __get_cpu_var(next_interval));
+ __get_cpu_var(mce_next_interval));
add_timer_on(t, cpu);
smp_call_function_single(cpu, mce_reenable_cpu, &action, 1);
break;
struct threshold_block *blocks;
cpumask_var_t cpus;
};
- static DEFINE_PER_CPU(struct threshold_bank *, threshold_banks[NR_BANKS]);
+ static DEFINE_PER_CPU(struct threshold_bank * [NR_BANKS], threshold_banks);
#ifdef CONFIG_SMP
static unsigned char shared_bank[NR_BANKS] = {
int i, err = 0;
struct threshold_bank *b = NULL;
char name[32];
+ struct cpuinfo_x86 *c = &cpu_data(cpu);
+
sprintf(name, "threshold_bank%i", bank);
#ifdef CONFIG_SMP
if (cpu_data(cpu).cpu_core_id && shared_bank[bank]) { /* symlink */
- i = cpumask_first(cpu_core_mask(cpu));
+ i = cpumask_first(c->llc_shared_map);
/* first core not up yet */
if (cpu_data(i).cpu_core_id)
if (err)
goto out;
- cpumask_copy(b->cpus, cpu_core_mask(cpu));
+ cpumask_copy(b->cpus, c->llc_shared_map);
per_cpu(threshold_banks, cpu)[bank] = b;
goto out;
#ifndef CONFIG_SMP
cpumask_setall(b->cpus);
#else
- cpumask_copy(b->cpus, cpu_core_mask(cpu));
+ cpumask_copy(b->cpus, c->llc_shared_map);
#endif
per_cpu(threshold_banks, cpu)[bank] = b;
* Copyright (C) 2009 Jaswinder Singh Rajput
* Copyright (C) 2009 Advanced Micro Devices, Inc., Robert Richter
* Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
+ * Copyright (C) 2009 Intel Corporation, <markus.t.metzger@intel.com>
*
* For licencing details see kernel-base/COPYING
*/
#include <linux/sched.h>
#include <linux/uaccess.h>
#include <linux/highmem.h>
+#include <linux/cpu.h>
#include <asm/apic.h>
#include <asm/stacktrace.h>
static u64 perf_counter_mask __read_mostly;
+/* The maximal number of PEBS counters: */
+#define MAX_PEBS_COUNTERS 4
+
+/* The size of a BTS record in bytes: */
+#define BTS_RECORD_SIZE 24
+
+/* The size of a per-cpu BTS buffer in bytes: */
+#define BTS_BUFFER_SIZE (BTS_RECORD_SIZE * 1024)
+
+/* The BTS overflow threshold in bytes from the end of the buffer: */
+#define BTS_OVFL_TH (BTS_RECORD_SIZE * 64)
+
+
+/*
+ * Bits in the debugctlmsr controlling branch tracing.
+ */
+#define X86_DEBUGCTL_TR (1 << 6)
+#define X86_DEBUGCTL_BTS (1 << 7)
+#define X86_DEBUGCTL_BTINT (1 << 8)
+#define X86_DEBUGCTL_BTS_OFF_OS (1 << 9)
+#define X86_DEBUGCTL_BTS_OFF_USR (1 << 10)
+
+/*
+ * A debug store configuration.
+ *
+ * We only support architectures that use 64bit fields.
+ */
+struct debug_store {
+ u64 bts_buffer_base;
+ u64 bts_index;
+ u64 bts_absolute_maximum;
+ u64 bts_interrupt_threshold;
+ u64 pebs_buffer_base;
+ u64 pebs_index;
+ u64 pebs_absolute_maximum;
+ u64 pebs_interrupt_threshold;
+ u64 pebs_counter_reset[MAX_PEBS_COUNTERS];
+};
+
struct cpu_hw_counters {
struct perf_counter *counters[X86_PMC_IDX_MAX];
unsigned long used_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
unsigned long active_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
unsigned long interrupts;
int enabled;
+ struct debug_store *ds;
};
/*
int apic;
u64 max_period;
u64 intel_ctrl;
+ void (*enable_bts)(u64 config);
+ void (*disable_bts)(void);
};
static struct x86_pmu x86_pmu __read_mostly;
u64 prev_raw_count, new_raw_count;
s64 delta;
+ if (idx == X86_PMC_IDX_FIXED_BTS)
+ return 0;
+
/*
* Careful: an NMI might modify the previous counter value.
*
#endif
}
+static inline bool bts_available(void)
+{
+ return x86_pmu.enable_bts != NULL;
+}
+
+static inline void init_debug_store_on_cpu(int cpu)
+{
+ struct debug_store *ds = per_cpu(cpu_hw_counters, cpu).ds;
+
+ if (!ds)
+ return;
+
+ wrmsr_on_cpu(cpu, MSR_IA32_DS_AREA,
+ (u32)((u64)(unsigned long)ds),
+ (u32)((u64)(unsigned long)ds >> 32));
+}
+
+static inline void fini_debug_store_on_cpu(int cpu)
+{
+ if (!per_cpu(cpu_hw_counters, cpu).ds)
+ return;
+
+ wrmsr_on_cpu(cpu, MSR_IA32_DS_AREA, 0, 0);
+}
+
+static void release_bts_hardware(void)
+{
+ int cpu;
+
+ if (!bts_available())
+ return;
+
+ get_online_cpus();
+
+ for_each_online_cpu(cpu)
+ fini_debug_store_on_cpu(cpu);
+
+ for_each_possible_cpu(cpu) {
+ struct debug_store *ds = per_cpu(cpu_hw_counters, cpu).ds;
+
+ if (!ds)
+ continue;
+
+ per_cpu(cpu_hw_counters, cpu).ds = NULL;
+
+ kfree((void *)(unsigned long)ds->bts_buffer_base);
+ kfree(ds);
+ }
+
+ put_online_cpus();
+}
+
+static int reserve_bts_hardware(void)
+{
+ int cpu, err = 0;
+
+ if (!bts_available())
+ return 0;
+
+ get_online_cpus();
+
+ for_each_possible_cpu(cpu) {
+ struct debug_store *ds;
+ void *buffer;
+
+ err = -ENOMEM;
+ buffer = kzalloc(BTS_BUFFER_SIZE, GFP_KERNEL);
+ if (unlikely(!buffer))
+ break;
+
+ ds = kzalloc(sizeof(*ds), GFP_KERNEL);
+ if (unlikely(!ds)) {
+ kfree(buffer);
+ break;
+ }
+
+ ds->bts_buffer_base = (u64)(unsigned long)buffer;
+ ds->bts_index = ds->bts_buffer_base;
+ ds->bts_absolute_maximum =
+ ds->bts_buffer_base + BTS_BUFFER_SIZE;
+ ds->bts_interrupt_threshold =
+ ds->bts_absolute_maximum - BTS_OVFL_TH;
+
+ per_cpu(cpu_hw_counters, cpu).ds = ds;
+ err = 0;
+ }
+
+ if (err)
+ release_bts_hardware();
+ else {
+ for_each_online_cpu(cpu)
+ init_debug_store_on_cpu(cpu);
+ }
+
+ put_online_cpus();
+
+ return err;
+}
+
static void hw_perf_counter_destroy(struct perf_counter *counter)
{
if (atomic_dec_and_mutex_lock(&active_counters, &pmc_reserve_mutex)) {
release_pmc_hardware();
+ release_bts_hardware();
mutex_unlock(&pmc_reserve_mutex);
}
}
return 0;
}
+static void intel_pmu_enable_bts(u64 config)
+{
+ unsigned long debugctlmsr;
+
+ debugctlmsr = get_debugctlmsr();
+
+ debugctlmsr |= X86_DEBUGCTL_TR;
+ debugctlmsr |= X86_DEBUGCTL_BTS;
+ debugctlmsr |= X86_DEBUGCTL_BTINT;
+
+ if (!(config & ARCH_PERFMON_EVENTSEL_OS))
+ debugctlmsr |= X86_DEBUGCTL_BTS_OFF_OS;
+
+ if (!(config & ARCH_PERFMON_EVENTSEL_USR))
+ debugctlmsr |= X86_DEBUGCTL_BTS_OFF_USR;
+
+ update_debugctlmsr(debugctlmsr);
+}
+
+static void intel_pmu_disable_bts(void)
+{
+ struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
+ unsigned long debugctlmsr;
+
+ if (!cpuc->ds)
+ return;
+
+ debugctlmsr = get_debugctlmsr();
+
+ debugctlmsr &=
+ ~(X86_DEBUGCTL_TR | X86_DEBUGCTL_BTS | X86_DEBUGCTL_BTINT |
+ X86_DEBUGCTL_BTS_OFF_OS | X86_DEBUGCTL_BTS_OFF_USR);
+
+ update_debugctlmsr(debugctlmsr);
+}
+
/*
* Setup the hardware configuration for a given attr_type
*/
err = 0;
if (!atomic_inc_not_zero(&active_counters)) {
mutex_lock(&pmc_reserve_mutex);
- if (atomic_read(&active_counters) == 0 && !reserve_pmc_hardware())
- err = -EBUSY;
- else
+ if (atomic_read(&active_counters) == 0) {
+ if (!reserve_pmc_hardware())
+ err = -EBUSY;
+ else
+ err = reserve_bts_hardware();
+ }
+ if (!err)
atomic_inc(&active_counters);
mutex_unlock(&pmc_reserve_mutex);
}
if (config == -1LL)
return -EINVAL;
+ /*
+ * Branch tracing:
+ */
+ if ((attr->config == PERF_COUNT_HW_BRANCH_INSTRUCTIONS) &&
+ (hwc->sample_period == 1)) {
+ /* BTS is not supported by this architecture. */
+ if (!bts_available())
+ return -EOPNOTSUPP;
+
+ /* BTS is currently only allowed for user-mode. */
+ if (hwc->config & ARCH_PERFMON_EVENTSEL_OS)
+ return -EOPNOTSUPP;
+ }
+
hwc->config |= config;
return 0;
static void intel_pmu_disable_all(void)
{
+ struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
+
+ if (!cpuc->enabled)
+ return;
+
+ cpuc->enabled = 0;
+ barrier();
+
wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0);
+
+ if (test_bit(X86_PMC_IDX_FIXED_BTS, cpuc->active_mask))
+ intel_pmu_disable_bts();
}
static void amd_pmu_disable_all(void)
static void intel_pmu_enable_all(void)
{
+ struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
+
+ if (cpuc->enabled)
+ return;
+
+ cpuc->enabled = 1;
+ barrier();
+
wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, x86_pmu.intel_ctrl);
+
+ if (test_bit(X86_PMC_IDX_FIXED_BTS, cpuc->active_mask)) {
+ struct perf_counter *counter =
+ cpuc->counters[X86_PMC_IDX_FIXED_BTS];
+
+ if (WARN_ON_ONCE(!counter))
+ return;
+
+ intel_pmu_enable_bts(counter->hw.config);
+ }
}
static void amd_pmu_enable_all(void)
static inline void
intel_pmu_disable_counter(struct hw_perf_counter *hwc, int idx)
{
+ if (unlikely(idx == X86_PMC_IDX_FIXED_BTS)) {
+ intel_pmu_disable_bts();
+ return;
+ }
+
if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) {
intel_pmu_disable_fixed(hwc, idx);
return;
x86_pmu_disable_counter(hwc, idx);
}
- static DEFINE_PER_CPU(u64, prev_left[X86_PMC_IDX_MAX]);
+ static DEFINE_PER_CPU(u64 [X86_PMC_IDX_MAX], pmc_prev_left);
/*
* Set the next IRQ period, based on the hwc->period_left value.
s64 period = hwc->sample_period;
int err, ret = 0;
+ if (idx == X86_PMC_IDX_FIXED_BTS)
+ return 0;
+
/*
* If we are way outside a reasonable range then just skip forward:
*/
if (left > x86_pmu.max_period)
left = x86_pmu.max_period;
- per_cpu(prev_left[idx], smp_processor_id()) = left;
+ per_cpu(pmc_prev_left[idx], smp_processor_id()) = left;
/*
* The hw counter starts counting from this counter offset,
static void intel_pmu_enable_counter(struct hw_perf_counter *hwc, int idx)
{
+ if (unlikely(idx == X86_PMC_IDX_FIXED_BTS)) {
+ if (!__get_cpu_var(cpu_hw_counters).enabled)
+ return;
+
+ intel_pmu_enable_bts(hwc->config);
+ return;
+ }
+
if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) {
intel_pmu_enable_fixed(hwc, idx);
return;
{
unsigned int event;
+ event = hwc->config & ARCH_PERFMON_EVENT_MASK;
+
+ if (unlikely((event ==
+ x86_pmu.event_map(PERF_COUNT_HW_BRANCH_INSTRUCTIONS)) &&
+ (hwc->sample_period == 1)))
+ return X86_PMC_IDX_FIXED_BTS;
+
if (!x86_pmu.num_counters_fixed)
return -1;
- event = hwc->config & ARCH_PERFMON_EVENT_MASK;
-
if (unlikely(event == x86_pmu.event_map(PERF_COUNT_HW_INSTRUCTIONS)))
return X86_PMC_IDX_FIXED_INSTRUCTIONS;
if (unlikely(event == x86_pmu.event_map(PERF_COUNT_HW_CPU_CYCLES)))
int idx;
idx = fixed_mode_idx(counter, hwc);
- if (idx >= 0) {
+ if (idx == X86_PMC_IDX_FIXED_BTS) {
+ /* BTS is already occupied. */
+ if (test_and_set_bit(idx, cpuc->used_mask))
+ return -EAGAIN;
+
+ hwc->config_base = 0;
+ hwc->counter_base = 0;
+ hwc->idx = idx;
+ } else if (idx >= 0) {
/*
* Try to get the fixed counter, if that is already taken
* then try to get a generic counter:
rdmsrl(x86_pmu.eventsel + idx, pmc_ctrl);
rdmsrl(x86_pmu.perfctr + idx, pmc_count);
- prev_left = per_cpu(prev_left[idx], cpu);
+ prev_left = per_cpu(pmc_prev_left[idx], cpu);
pr_info("CPU#%d: gen-PMC%d ctrl: %016llx\n",
cpu, idx, pmc_ctrl);
local_irq_restore(flags);
}
+static void intel_pmu_drain_bts_buffer(struct cpu_hw_counters *cpuc,
+ struct perf_sample_data *data)
+{
+ struct debug_store *ds = cpuc->ds;
+ struct bts_record {
+ u64 from;
+ u64 to;
+ u64 flags;
+ };
+ struct perf_counter *counter = cpuc->counters[X86_PMC_IDX_FIXED_BTS];
+ unsigned long orig_ip = data->regs->ip;
+ struct bts_record *at, *top;
+
+ if (!counter)
+ return;
+
+ if (!ds)
+ return;
+
+ at = (struct bts_record *)(unsigned long)ds->bts_buffer_base;
+ top = (struct bts_record *)(unsigned long)ds->bts_index;
+
+ ds->bts_index = ds->bts_buffer_base;
+
+ for (; at < top; at++) {
+ data->regs->ip = at->from;
+ data->addr = at->to;
+
+ perf_counter_output(counter, 1, data);
+ }
+
+ data->regs->ip = orig_ip;
+ data->addr = 0;
+
+ /* There's new data available. */
+ counter->pending_kill = POLL_IN;
+}
+
static void x86_pmu_disable(struct perf_counter *counter)
{
struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
* that we are disabling:
*/
x86_perf_counter_update(counter, hwc, idx);
+
+ /* Drain the remaining BTS records. */
+ if (unlikely(idx == X86_PMC_IDX_FIXED_BTS)) {
+ struct perf_sample_data data;
+ struct pt_regs regs;
+
+ data.regs = &regs;
+ intel_pmu_drain_bts_buffer(cpuc, &data);
+ }
cpuc->counters[idx] = NULL;
clear_bit(idx, cpuc->used_mask);
static void intel_pmu_reset(void)
{
+ struct debug_store *ds = __get_cpu_var(cpu_hw_counters).ds;
unsigned long flags;
int idx;
for (idx = 0; idx < x86_pmu.num_counters_fixed; idx++) {
checking_wrmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, 0ull);
}
+ if (ds)
+ ds->bts_index = ds->bts_buffer_base;
local_irq_restore(flags);
}
cpuc = &__get_cpu_var(cpu_hw_counters);
perf_disable();
+ intel_pmu_drain_bts_buffer(cpuc, &data);
status = intel_pmu_get_status();
if (!status) {
perf_enable();
* the generic counter period:
*/
.max_period = (1ULL << 31) - 1,
+ .enable_bts = intel_pmu_enable_bts,
+ .disable_bts = intel_pmu_disable_bts,
};
static struct x86_pmu amd_pmu = {
entry->ip[entry->nr++] = ip;
}
- static DEFINE_PER_CPU(struct perf_callchain_entry, irq_entry);
- static DEFINE_PER_CPU(struct perf_callchain_entry, nmi_entry);
+ static DEFINE_PER_CPU(struct perf_callchain_entry, pmc_irq_entry);
+ static DEFINE_PER_CPU(struct perf_callchain_entry, pmc_nmi_entry);
static DEFINE_PER_CPU(int, in_nmi_frame);
struct perf_callchain_entry *entry;
if (in_nmi())
- entry = &__get_cpu_var(nmi_entry);
+ entry = &__get_cpu_var(pmc_nmi_entry);
else
- entry = &__get_cpu_var(irq_entry);
+ entry = &__get_cpu_var(pmc_irq_entry);
entry->nr = 0;
return entry;
}
+
+void hw_perf_counter_setup_online(int cpu)
+{
+ init_debug_store_on_cpu(cpu);
+}
data PT_LOAD FLAGS(7); /* RWE */
#ifdef CONFIG_X86_64
user PT_LOAD FLAGS(7); /* RWE */
- data.init PT_LOAD FLAGS(7); /* RWE */
#ifdef CONFIG_SMP
percpu PT_LOAD FLAGS(7); /* RWE */
#endif
- data.init2 PT_LOAD FLAGS(7); /* RWE */
+ init PT_LOAD FLAGS(7); /* RWE */
#endif
note PT_NOTE FLAGS(0); /* ___ */
}
__stop___ex_table = .;
} :text = 0x9090
- RODATA
+ RO_DATA(PAGE_SIZE)
/* Data */
- . = ALIGN(PAGE_SIZE);
.data : AT(ADDR(.data) - LOAD_OFFSET) {
/* Start of data section */
_sdata = .;
- DATA_DATA
- CONSTRUCTORS
- } :data
+
+ /* init_task */
+ INIT_TASK_DATA(THREAD_SIZE)
#ifdef CONFIG_X86_32
- /* 32 bit has nosave before _edata */
- . = ALIGN(PAGE_SIZE);
- .data_nosave : AT(ADDR(.data_nosave) - LOAD_OFFSET) {
- __nosave_begin = .;
- *(.data.nosave)
- . = ALIGN(PAGE_SIZE);
- __nosave_end = .;
- }
+ /* 32 bit has nosave before _edata */
+ NOSAVE_DATA
#endif
- . = ALIGN(PAGE_SIZE);
- .data.page_aligned : AT(ADDR(.data.page_aligned) - LOAD_OFFSET) {
- *(.data.page_aligned)
+ PAGE_ALIGNED_DATA(PAGE_SIZE)
*(.data.idt)
- }
-#ifdef CONFIG_X86_32
- . = ALIGN(32);
-#else
- . = ALIGN(PAGE_SIZE);
- . = ALIGN(CONFIG_X86_L1_CACHE_BYTES);
-#endif
- .data.cacheline_aligned :
- AT(ADDR(.data.cacheline_aligned) - LOAD_OFFSET) {
- *(.data.cacheline_aligned)
- }
+ CACHELINE_ALIGNED_DATA(CONFIG_X86_L1_CACHE_BYTES)
- /* rarely changed data like cpu maps */
-#ifdef CONFIG_X86_32
- . = ALIGN(32);
-#else
- . = ALIGN(CONFIG_X86_INTERNODE_CACHE_BYTES);
-#endif
- .data.read_mostly : AT(ADDR(.data.read_mostly) - LOAD_OFFSET) {
- *(.data.read_mostly)
+ DATA_DATA
+ CONSTRUCTORS
+
+ /* rarely changed data like cpu maps */
+ READ_MOSTLY_DATA(CONFIG_X86_INTERNODE_CACHE_BYTES)
/* End of data section */
_edata = .;
- }
+ } :data
#ifdef CONFIG_X86_64
#define VSYSCALL_ADDR (-10*1024*1024)
-#define VSYSCALL_PHYS_ADDR ((LOADADDR(.data.read_mostly) + \
- SIZEOF(.data.read_mostly) + 4095) & ~(4095))
-#define VSYSCALL_VIRT_ADDR ((ADDR(.data.read_mostly) + \
- SIZEOF(.data.read_mostly) + 4095) & ~(4095))
+#define VSYSCALL_PHYS_ADDR ((LOADADDR(.data) + SIZEOF(.data) + \
+ PAGE_SIZE - 1) & ~(PAGE_SIZE - 1))
+#define VSYSCALL_VIRT_ADDR ((ADDR(.data) + SIZEOF(.data) + \
+ PAGE_SIZE - 1) & ~(PAGE_SIZE - 1))
#define VLOAD_OFFSET (VSYSCALL_ADDR - VSYSCALL_PHYS_ADDR)
#define VLOAD(x) (ADDR(x) - VLOAD_OFFSET)
#endif /* CONFIG_X86_64 */
- /* init_task */
- . = ALIGN(THREAD_SIZE);
- .data.init_task : AT(ADDR(.data.init_task) - LOAD_OFFSET) {
- *(.data.init_task)
+ /* Init code and data - will be freed after init */
+ . = ALIGN(PAGE_SIZE);
+ .init.begin : AT(ADDR(.init.begin) - LOAD_OFFSET) {
+ __init_begin = .; /* paired with __init_end */
}
-#ifdef CONFIG_X86_64
- :data.init
-#endif
+#if defined(CONFIG_X86_64) && defined(CONFIG_SMP)
/*
- * smp_locks might be freed after init
- * start/end must be page aligned
+ * percpu offsets are zero-based on SMP. PERCPU_VADDR() changes the
+ * output PHDR, so the next output section - .init.text - should
+ * start another segment - init.
*/
- . = ALIGN(PAGE_SIZE);
- .smp_locks : AT(ADDR(.smp_locks) - LOAD_OFFSET) {
- __smp_locks = .;
- *(.smp_locks)
- __smp_locks_end = .;
- . = ALIGN(PAGE_SIZE);
- }
+ PERCPU_VADDR(0, :percpu)
+#endif
- /* Init code and data - will be freed after init */
- . = ALIGN(PAGE_SIZE);
.init.text : AT(ADDR(.init.text) - LOAD_OFFSET) {
- __init_begin = .; /* paired with __init_end */
_sinittext = .;
INIT_TEXT
_einittext = .;
}
+#ifdef CONFIG_X86_64
+ :init
+#endif
.init.data : AT(ADDR(.init.data) - LOAD_OFFSET) {
INIT_DATA
}
#endif
-#if defined(CONFIG_X86_64) && defined(CONFIG_SMP)
- /*
- * percpu offsets are zero-based on SMP. PERCPU_VADDR() changes the
- * output PHDR, so the next output section - __data_nosave - should
- * start another section data.init2. Also, pda should be at the head of
- * percpu area. Preallocate it and define the percpu offset symbol
- * so that it can be accessed as a percpu variable.
- */
- . = ALIGN(PAGE_SIZE);
- PERCPU_VADDR(0, :percpu)
-#else
+#if !defined(CONFIG_X86_64) || !defined(CONFIG_SMP)
PERCPU(PAGE_SIZE)
#endif
__init_end = .;
}
+ /*
+ * smp_locks might be freed after init
+ * start/end must be page aligned
+ */
+ . = ALIGN(PAGE_SIZE);
+ .smp_locks : AT(ADDR(.smp_locks) - LOAD_OFFSET) {
+ __smp_locks = .;
+ *(.smp_locks)
+ __smp_locks_end = .;
+ . = ALIGN(PAGE_SIZE);
+ }
+
#ifdef CONFIG_X86_64
.data_nosave : AT(ADDR(.data_nosave) - LOAD_OFFSET) {
- . = ALIGN(PAGE_SIZE);
- __nosave_begin = .;
- *(.data.nosave)
- . = ALIGN(PAGE_SIZE);
- __nosave_end = .;
- } :data.init2
- /* use another section data.init2, see PERCPU_VADDR() above */
+ NOSAVE_DATA
+ }
#endif
/* BSS */
_end = .;
}
- /* Sections to be discarded */
- /DISCARD/ : {
- *(.exitcall.exit)
- *(.eh_frame)
- *(.discard)
- }
-
STABS_DEBUG
DWARF_DEBUG
+
+ /* Sections to be discarded */
+ DISCARDS
+ /DISCARD/ : { *(.eh_frame) }
}
#include <linux/seq_file.h>
#include <linux/debugfs.h>
#include <linux/pfn.h>
+ #include <linux/percpu.h>
#include <asm/e820.h>
#include <asm/processor.h>
{
struct cpa_data alias_cpa;
unsigned long laddr = (unsigned long)__va(cpa->pfn << PAGE_SHIFT);
- unsigned long vaddr, remapped;
+ unsigned long vaddr;
int ret;
if (cpa->pfn >= max_pfn_mapped)
}
#endif
- /*
- * If the PMD page was partially used for per-cpu remapping,
- * the recycled area needs to be split and modified. Because
- * the area is always proper subset of a PMD page
- * cpa->numpages is guaranteed to be 1 for these areas, so
- * there's no need to loop over and check for further remaps.
- */
- remapped = (unsigned long)pcpu_lpage_remapped((void *)laddr);
- if (remapped) {
- WARN_ON(cpa->numpages > 1);
- alias_cpa = *cpa;
- alias_cpa.vaddr = &remapped;
- alias_cpa.flags &= ~(CPA_PAGES_ARRAY | CPA_ARRAY);
- ret = __change_page_attr_set_clr(&alias_cpa, 0);
- if (ret)
- return ret;
- }
-
return 0;
}
{
struct cpa_data cpa;
int ret, cache, checkalias;
+ unsigned long baddr = 0;
/*
* Check, if we are requested to change a not supported
*/
WARN_ON_ONCE(1);
}
+ /*
+ * Save address for cache flush. *addr is modified in the call
+ * to __change_page_attr_set_clr() below.
+ */
+ baddr = *addr;
}
/* Must avoid aliasing mappings in the highmem code */
cpa_flush_array(addr, numpages, cache,
cpa.flags, pages);
} else
- cpa_flush_range(*addr, numpages, cache);
+ cpa_flush_range(baddr, numpages, cache);
} else
cpa_flush_all(cache);
static struct kmem_cache *cfq_pool;
static struct kmem_cache *cfq_ioc_pool;
- static DEFINE_PER_CPU(unsigned long, ioc_count);
+ static DEFINE_PER_CPU(unsigned long, cfq_ioc_count);
static struct completion *ioc_gone;
static DEFINE_SPINLOCK(ioc_gone_lock);
struct rb_root prio_trees[CFQ_PRIO_LISTS];
unsigned int busy_queues;
- /*
- * Used to track any pending rt requests so we can pre-empt current
- * non-RT cfqq in service when this value is non-zero.
- */
- unsigned int busy_rt_queues;
- int rq_in_driver;
+ int rq_in_driver[2];
int sync_flight;
/*
CFQ_CFQQ_FLAG_on_rr = 0, /* on round-robin busy list */
CFQ_CFQQ_FLAG_wait_request, /* waiting for a request */
CFQ_CFQQ_FLAG_must_dispatch, /* must be allowed a dispatch */
- CFQ_CFQQ_FLAG_must_alloc, /* must be allowed rq alloc */
CFQ_CFQQ_FLAG_must_alloc_slice, /* per-slice must_alloc flag */
CFQ_CFQQ_FLAG_fifo_expire, /* FIFO checked in this slice */
CFQ_CFQQ_FLAG_idle_window, /* slice idling enabled */
CFQ_CFQQ_FNS(on_rr);
CFQ_CFQQ_FNS(wait_request);
CFQ_CFQQ_FNS(must_dispatch);
-CFQ_CFQQ_FNS(must_alloc);
CFQ_CFQQ_FNS(must_alloc_slice);
CFQ_CFQQ_FNS(fifo_expire);
CFQ_CFQQ_FNS(idle_window);
static struct cfq_io_context *cfq_cic_lookup(struct cfq_data *,
struct io_context *);
+static inline int rq_in_driver(struct cfq_data *cfqd)
+{
+ return cfqd->rq_in_driver[0] + cfqd->rq_in_driver[1];
+}
+
static inline struct cfq_queue *cic_to_cfqq(struct cfq_io_context *cic,
int is_sync)
{
*/
static inline int cfq_bio_sync(struct bio *bio)
{
- if (bio_data_dir(bio) == READ || bio_sync(bio))
+ if (bio_data_dir(bio) == READ || bio_rw_flagged(bio, BIO_RW_SYNCIO))
return 1;
return 0;
BUG_ON(cfq_cfqq_on_rr(cfqq));
cfq_mark_cfqq_on_rr(cfqq);
cfqd->busy_queues++;
- if (cfq_class_rt(cfqq))
- cfqd->busy_rt_queues++;
cfq_resort_rr_list(cfqd, cfqq);
}
BUG_ON(!cfqd->busy_queues);
cfqd->busy_queues--;
- if (cfq_class_rt(cfqq))
- cfqd->busy_rt_queues--;
}
/*
{
struct cfq_data *cfqd = q->elevator->elevator_data;
- cfqd->rq_in_driver++;
+ cfqd->rq_in_driver[rq_is_sync(rq)]++;
cfq_log_cfqq(cfqd, RQ_CFQQ(rq), "activate rq, drv=%d",
- cfqd->rq_in_driver);
+ rq_in_driver(cfqd));
cfqd->last_position = blk_rq_pos(rq) + blk_rq_sectors(rq);
}
static void cfq_deactivate_request(struct request_queue *q, struct request *rq)
{
struct cfq_data *cfqd = q->elevator->elevator_data;
+ const int sync = rq_is_sync(rq);
- WARN_ON(!cfqd->rq_in_driver);
- cfqd->rq_in_driver--;
+ WARN_ON(!cfqd->rq_in_driver[sync]);
+ cfqd->rq_in_driver[sync]--;
cfq_log_cfqq(cfqd, RQ_CFQQ(rq), "deactivate rq, drv=%d",
- cfqd->rq_in_driver);
+ rq_in_driver(cfqd));
}
static void cfq_remove_request(struct request *rq)
/*
* still requests with the driver, don't idle
*/
- if (cfqd->rq_in_driver)
+ if (rq_in_driver(cfqd))
return;
/*
cfq_log_cfqq(cfqd, cfqq, "dispatch_insert");
+ cfqq->next_rq = cfq_find_next_rq(cfqd, cfqq, rq);
cfq_remove_request(rq);
cfqq->dispatched++;
elv_dispatch_sort(q, rq);
if (cfq_slice_used(cfqq) && !cfq_cfqq_must_dispatch(cfqq))
goto expire;
- /*
- * If we have a RT cfqq waiting, then we pre-empt the current non-rt
- * cfqq.
- */
- if (!cfq_class_rt(cfqq) && cfqd->busy_rt_queues) {
- /*
- * We simulate this as cfqq timed out so that it gets to bank
- * the remaining of its time slice.
- */
- cfq_log_cfqq(cfqd, cfqq, "preempt");
- cfq_slice_expired(cfqd, 1);
- goto new_queue;
- }
-
/*
* The active queue has requests and isn't expired, allow it to
* dispatch.
if (!cfqq)
return 0;
+ /*
+ * Drain async requests before we start sync IO
+ */
+ if (cfq_cfqq_idle_window(cfqq) && cfqd->rq_in_driver[BLK_RW_ASYNC])
+ return 0;
+
/*
* If this is an async queue and we have sync IO in flight, let it wait
*/
cfq_slice_expired(cfqd, 0);
}
- cfq_log(cfqd, "dispatched a request");
+ cfq_log_cfqq(cfqd, cfqq, "dispatched a request");
return 1;
}
cic = container_of(head, struct cfq_io_context, rcu_head);
kmem_cache_free(cfq_ioc_pool, cic);
- elv_ioc_count_dec(ioc_count);
+ elv_ioc_count_dec(cfq_ioc_count);
if (ioc_gone) {
/*
* complete ioc_gone and set it back to NULL
*/
spin_lock(&ioc_gone_lock);
- if (ioc_gone && !elv_ioc_count_read(ioc_count)) {
+ if (ioc_gone && !elv_ioc_count_read(cfq_ioc_count)) {
complete(ioc_gone);
ioc_gone = NULL;
}
INIT_HLIST_NODE(&cic->cic_list);
cic->dtor = cfq_free_io_context;
cic->exit = cfq_exit_io_context;
- elv_ioc_count_inc(ioc_count);
+ elv_ioc_count_inc(cfq_ioc_count);
}
return cic;
*/
static void cfq_update_hw_tag(struct cfq_data *cfqd)
{
- if (cfqd->rq_in_driver > cfqd->rq_in_driver_peak)
- cfqd->rq_in_driver_peak = cfqd->rq_in_driver;
+ if (rq_in_driver(cfqd) > cfqd->rq_in_driver_peak)
+ cfqd->rq_in_driver_peak = rq_in_driver(cfqd);
if (cfqd->rq_queued <= CFQ_HW_QUEUE_MIN &&
- cfqd->rq_in_driver <= CFQ_HW_QUEUE_MIN)
+ rq_in_driver(cfqd) <= CFQ_HW_QUEUE_MIN)
return;
if (cfqd->hw_tag_samples++ < 50)
cfq_update_hw_tag(cfqd);
- WARN_ON(!cfqd->rq_in_driver);
+ WARN_ON(!cfqd->rq_in_driver[sync]);
WARN_ON(!cfqq->dispatched);
- cfqd->rq_in_driver--;
+ cfqd->rq_in_driver[sync]--;
cfqq->dispatched--;
if (cfq_cfqq_sync(cfqq))
cfq_arm_slice_timer(cfqd);
}
- if (!cfqd->rq_in_driver)
+ if (!rq_in_driver(cfqd))
cfq_schedule_dispatch(cfqd);
}
static inline int __cfq_may_queue(struct cfq_queue *cfqq)
{
- if ((cfq_cfqq_wait_request(cfqq) || cfq_cfqq_must_alloc(cfqq)) &&
- !cfq_cfqq_must_alloc_slice(cfqq)) {
+ if (cfq_cfqq_wait_request(cfqq) && !cfq_cfqq_must_alloc_slice(cfqq)) {
cfq_mark_cfqq_must_alloc_slice(cfqq);
return ELV_MQUEUE_MUST;
}
}
cfqq->allocated[rw]++;
- cfq_clear_cfqq_must_alloc(cfqq);
atomic_inc(&cfqq->ref);
spin_unlock_irqrestore(q->queue_lock, flags);
* this also protects us from entering cfq_slab_kill() with
* pending RCU callbacks
*/
- if (elv_ioc_count_read(ioc_count))
+ if (elv_ioc_count_read(cfq_ioc_count))
wait_for_completion(&all_gone);
cfq_slab_kill();
}
/*
* Base implementations of per-CPU variable declarations and definitions, where
* the section in which the variable is to be placed is provided by the
- * 'section' argument. This may be used to affect the parameters governing the
+ * 'sec' argument. This may be used to affect the parameters governing the
* variable's storage.
*
* NOTE! The sections for the DECLARE and for the DEFINE must match, lest
* linkage errors occur due the compiler generating the wrong code to access
* that section.
*/
- #define DECLARE_PER_CPU_SECTION(type, name, section) \
- extern \
- __attribute__((__section__(PER_CPU_BASE_SECTION section))) \
- PER_CPU_ATTRIBUTES __typeof__(type) per_cpu__##name
-
- #define DEFINE_PER_CPU_SECTION(type, name, section) \
- __attribute__((__section__(PER_CPU_BASE_SECTION section))) \
- PER_CPU_ATTRIBUTES PER_CPU_DEF_ATTRIBUTES \
+ #define __PCPU_ATTRS(sec) \
+ __attribute__((section(PER_CPU_BASE_SECTION sec))) \
+ PER_CPU_ATTRIBUTES
+
+ #define __PCPU_DUMMY_ATTRS \
+ __attribute__((section(".discard"), unused))
+
+ /*
+ * s390 and alpha modules require percpu variables to be defined as
+ * weak to force the compiler to generate GOT based external
+ * references for them. This is necessary because percpu sections
+ * will be located outside of the usually addressable area.
+ *
+ * This definition puts the following two extra restrictions when
+ * defining percpu variables.
+ *
+ * 1. The symbol must be globally unique, even the static ones.
+ * 2. Static percpu variables cannot be defined inside a function.
+ *
+ * Archs which need weak percpu definitions should define
+ * ARCH_NEEDS_WEAK_PER_CPU in asm/percpu.h when necessary.
+ *
+ * To ensure that the generic code observes the above two
+ * restrictions, if CONFIG_DEBUG_FORCE_WEAK_PER_CPU is set, the weak
+ * definition is used for all cases.
+ */
+ #if defined(ARCH_NEEDS_WEAK_PER_CPU) || defined(CONFIG_DEBUG_FORCE_WEAK_PER_CPU)
+ /*
+ * __pcpu_scope_* dummy variable is used to enforce scope. It
+ * receives the static modifier when it's used in front of
+ * DEFINE_PER_CPU() and will trigger build failure if
+ * DECLARE_PER_CPU() is used for the same variable.
+ *
+ * __pcpu_unique_* dummy variable is used to enforce symbol uniqueness
+ * such that hidden weak symbol collision, which will cause unrelated
+ * variables to share the same address, can be detected during build.
+ */
+ #define DECLARE_PER_CPU_SECTION(type, name, sec) \
+ extern __PCPU_DUMMY_ATTRS char __pcpu_scope_##name; \
+ extern __PCPU_ATTRS(sec) __typeof__(type) per_cpu__##name
+
+ #define DEFINE_PER_CPU_SECTION(type, name, sec) \
+ __PCPU_DUMMY_ATTRS char __pcpu_scope_##name; \
+ __PCPU_DUMMY_ATTRS char __pcpu_unique_##name; \
+ __PCPU_ATTRS(sec) PER_CPU_DEF_ATTRIBUTES __weak \
+ __typeof__(type) per_cpu__##name
+ #else
+ /*
+ * Normal declaration and definition macros.
+ */
+ #define DECLARE_PER_CPU_SECTION(type, name, sec) \
+ extern __PCPU_ATTRS(sec) __typeof__(type) per_cpu__##name
+
+ #define DEFINE_PER_CPU_SECTION(type, name, sec) \
+ __PCPU_ATTRS(sec) PER_CPU_DEF_ATTRIBUTES \
__typeof__(type) per_cpu__##name
+ #endif
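
For concreteness (this expansion is not part of the patch), with ARCH_NEEDS_WEAK_PER_CPU or CONFIG_DEBUG_FORCE_WEAK_PER_CPU in effect, and assuming PER_CPU_BASE_SECTION is ".data.percpu" with empty PER_CPU_ATTRIBUTES/PER_CPU_DEF_ATTRIBUTES, a definition such as static DEFINE_PER_CPU(int, foo) expands roughly to:

	/*
	 * The "static" lands on this dummy; paired with the extern
	 * declaration emitted by DECLARE_PER_CPU() it becomes a build error.
	 */
	static __attribute__((section(".discard"), unused)) char __pcpu_scope_foo;
	/* a second definition of "foo" anywhere in the tree collides here */
	__attribute__((section(".discard"), unused)) char __pcpu_unique_foo;
	/* the variable itself is emitted weak and global (restriction 1 above) */
	__attribute__((section(".data.percpu"))) __weak int per_cpu__foo;

The weak, global symbol is what forces both restrictions: the name must be unique kernel-wide even for "static" definitions, and the definition cannot live inside a function.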
/*
* Variant on the per-CPU variable declaration/definition theme used for
DEFINE_PER_CPU_SECTION(type, name, PER_CPU_SHARED_ALIGNED_SECTION) \
____cacheline_aligned_in_smp
+#define DECLARE_PER_CPU_ALIGNED(type, name) \
+ DECLARE_PER_CPU_SECTION(type, name, PER_CPU_ALIGNED_SECTION) \
+ ____cacheline_aligned
+
+#define DEFINE_PER_CPU_ALIGNED(type, name) \
+ DEFINE_PER_CPU_SECTION(type, name, PER_CPU_ALIGNED_SECTION) \
+ ____cacheline_aligned
+
/*
* Declaration/definition used for per-CPU variables that must be page aligned.
*/
-#define DECLARE_PER_CPU_PAGE_ALIGNED(type, name) \
- DECLARE_PER_CPU_SECTION(type, name, ".page_aligned")
+#define DECLARE_PER_CPU_PAGE_ALIGNED(type, name) \
+ DECLARE_PER_CPU_SECTION(type, name, ".page_aligned") \
+ __aligned(PAGE_SIZE)
#define DEFINE_PER_CPU_PAGE_ALIGNED(type, name) \
- DEFINE_PER_CPU_SECTION(type, name, ".page_aligned")
+ DEFINE_PER_CPU_SECTION(type, name, ".page_aligned") \
+ __aligned(PAGE_SIZE)
/*
* Intermodule exports for per-CPU variables.
#define smp_init() do { } while (0)
#endif
- static inline void setup_per_cpu_areas(void) { }
static inline void setup_nr_cpu_ids(void) { }
static inline void smp_prepare_cpus(unsigned int maxcpus) { }
nr_cpu_ids = find_last_bit(cpumask_bits(cpu_possible_mask),NR_CPUS) + 1;
}
- #ifndef CONFIG_HAVE_SETUP_PER_CPU_AREA
- unsigned long __per_cpu_offset[NR_CPUS] __read_mostly;
-
- EXPORT_SYMBOL(__per_cpu_offset);
-
- static void __init setup_per_cpu_areas(void)
- {
- unsigned long size, i;
- char *ptr;
- unsigned long nr_possible_cpus = num_possible_cpus();
-
- /* Copy section for each CPU (we discard the original) */
- size = ALIGN(PERCPU_ENOUGH_ROOM, PAGE_SIZE);
- ptr = alloc_bootmem_pages(size * nr_possible_cpus);
-
- for_each_possible_cpu(i) {
- __per_cpu_offset[i] = ptr - __per_cpu_start;
- memcpy(ptr, __per_cpu_start, __per_cpu_end - __per_cpu_start);
- ptr += size;
- }
- }
- #endif /* CONFIG_HAVE_SETUP_PER_CPU_AREA */
-
/* Called by boot processor to activate the rest. */
static void __init smp_init(void)
{
{
int pid;
+ rcu_scheduler_starting();
kernel_thread(kernel_init, NULL, CLONE_FS | CLONE_SIGHAND);
numa_default_policy();
pid = kernel_thread(kthreadd, NULL, CLONE_FS | CLONE_FILES);
* at least once to get things moving:
*/
init_idle_bootup_task(current);
- rcu_scheduler_starting();
preempt_enable_no_resched();
schedule();
preempt_disable();
softirq_init();
timekeeping_init();
time_init();
- sched_clock_init();
profile_init();
if (!irqs_disabled())
printk(KERN_CRIT "start_kernel(): bug: interrupts were "
numa_policy_init();
if (late_time_init)
late_time_init();
+ sched_clock_init();
calibrate_delay();
pidmap_init();
anon_vma_init();
int initcall_debug;
core_param(initcall_debug, initcall_debug, bool, 0644);
+static char msgbuf[64];
+static struct boot_trace_call call;
+static struct boot_trace_ret ret;
+
int do_one_initcall(initcall_t fn)
{
int count = preempt_count();
ktime_t calltime, delta, rettime;
- char msgbuf[64];
- struct boot_trace_call call;
- struct boot_trace_ret ret;
if (initcall_debug) {
call.caller = task_pid_nr(current);
#include <linux/percpu.h>
#include <linux/kmemleak.h>
+#define CREATE_TRACE_POINTS
+#include <trace/events/module.h>
+
+EXPORT_TRACEPOINT_SYMBOL(module_get);
+
#if 0
#define DEBUGP printk
#else
#ifdef CONFIG_SMP
- #ifdef CONFIG_HAVE_DYNAMIC_PER_CPU_AREA
+ #ifndef CONFIG_HAVE_LEGACY_PER_CPU_AREA
static void *percpu_modalloc(unsigned long size, unsigned long align,
const char *name)
free_percpu(freeme);
}
- #else /* ... !CONFIG_HAVE_DYNAMIC_PER_CPU_AREA */
+ #else /* ... CONFIG_HAVE_LEGACY_PER_CPU_AREA */
/* Number of blocks used and allocated. */
static unsigned int pcpu_num_used, pcpu_num_allocated;
}
__initcall(percpu_modinit);
- #endif /* CONFIG_HAVE_DYNAMIC_PER_CPU_AREA */
+ #endif /* CONFIG_HAVE_LEGACY_PER_CPU_AREA */
static unsigned int find_pcpusec(Elf_Ehdr *hdr,
Elf_Shdr *sechdrs,
}
EXPORT_SYMBOL(__symbol_put);
+/* Note this assumes addr is a function, which it currently always is. */
void symbol_put_addr(void *addr)
{
struct module *modaddr;
+ unsigned long a = (unsigned long)dereference_function_descriptor(addr);
- if (core_kernel_text((unsigned long)addr))
+ if (core_kernel_text(a))
return;
/* module_text_address is safe here: we're supposed to have reference
* to module from symbol_get, so it can't go away. */
- modaddr = __module_text_address((unsigned long)addr);
+ modaddr = __module_text_address(a);
BUG_ON(!modaddr);
module_put(modaddr);
}
if (module) {
unsigned int cpu = get_cpu();
local_dec(__module_ref_addr(module, cpu));
+ trace_module_put(module, _RET_IP_,
+ local_read(__module_ref_addr(module, cpu)));
/* Maybe they're waiting for us to drop reference? */
if (unlikely(!module_is_live(module)))
wake_up_process(module->waiter);
struct module_notes_attrs *notes_attrs;
struct bin_attribute *nattr;
+ /* failed to create section attributes, so can't create notes */
+ if (!mod->sect_attrs)
+ return;
+
/* Count notes sections and allocate structures. */
notes = 0;
for (i = 0; i < nsect; i++)
/* Free a module, remove from lists, etc (must hold module_mutex). */
static void free_module(struct module *mod)
{
+ trace_module_free(mod);
+
/* Delete from various lists */
stop_machine(__unlink_module, mod, NULL);
remove_notes_attrs(mod);
/* Get rid of temporary copy */
vfree(hdr);
+ trace_module_load(mod);
+
/* Done! */
return mod;
/*
* perf counter paranoia level:
- * 0 - not paranoid
- * 1 - disallow cpu counters to unpriv
- * 2 - disallow kernel profiling to unpriv
+ * -1 - not paranoid at all
+ * 0 - disallow raw tracepoint access for unpriv
+ * 1 - disallow cpu counters for unpriv
+ * 2 - disallow kernel profiling for unpriv
*/
-int sysctl_perf_counter_paranoid __read_mostly;
+int sysctl_perf_counter_paranoid __read_mostly = 1;
+
+static inline bool perf_paranoid_tracepoint_raw(void)
+{
+ return sysctl_perf_counter_paranoid > -1;
+}
static inline bool perf_paranoid_cpu(void)
{
void __weak perf_counter_print_debug(void) { }
- static DEFINE_PER_CPU(int, disable_count);
+ static DEFINE_PER_CPU(int, perf_disable_count);
void __perf_disable(void)
{
- __get_cpu_var(disable_count)++;
+ __get_cpu_var(perf_disable_count)++;
}
bool __perf_enable(void)
{
- return !--__get_cpu_var(disable_count);
+ return !--__get_cpu_var(perf_disable_count);
}
void perf_disable(void)
struct perf_counter_context *ctx = counter->ctx;
u64 run_end;
- if (counter->state < PERF_COUNTER_STATE_INACTIVE)
+ if (counter->state < PERF_COUNTER_STATE_INACTIVE ||
+ counter->group_leader->state < PERF_COUNTER_STATE_INACTIVE)
return;
counter->total_time_enabled = ctx->time - counter->tstamp_enabled;
*/
if (counter->state >= PERF_COUNTER_STATE_INACTIVE) {
update_context_time(ctx);
- update_counter_times(counter);
+ update_group_times(counter);
if (counter == counter->group_leader)
group_sched_out(counter, cpuctx, ctx);
else
* in, so we can change the state safely.
*/
if (counter->state == PERF_COUNTER_STATE_INACTIVE) {
- update_counter_times(counter);
+ update_group_times(counter);
counter->state = PERF_COUNTER_STATE_OFF;
}
spin_unlock_irq(&ctx->lock);
}
+/*
+ * Put a counter into inactive state and update time fields.
+ * Enabling the leader of a group effectively enables all
+ * the group members that aren't explicitly disabled, so we
+ * have to update their ->tstamp_enabled also.
+ * Note: this works for group members as well as group leaders
+ * since the non-leader members' sibling_lists will be empty.
+ */
+static void __perf_counter_mark_enabled(struct perf_counter *counter,
+ struct perf_counter_context *ctx)
+{
+ struct perf_counter *sub;
+
+ counter->state = PERF_COUNTER_STATE_INACTIVE;
+ counter->tstamp_enabled = ctx->time - counter->total_time_enabled;
+ list_for_each_entry(sub, &counter->sibling_list, list_entry)
+ if (sub->state >= PERF_COUNTER_STATE_INACTIVE)
+ sub->tstamp_enabled =
+ ctx->time - sub->total_time_enabled;
+}
+
/*
* Cross CPU call to enable a performance counter
*/
if (counter->state >= PERF_COUNTER_STATE_INACTIVE)
goto unlock;
- counter->state = PERF_COUNTER_STATE_INACTIVE;
- counter->tstamp_enabled = ctx->time - counter->total_time_enabled;
+ __perf_counter_mark_enabled(counter, ctx);
/*
* If the counter is in a group and isn't the group leader,
* Since we have the lock this context can't be scheduled
* in, so we can change the state safely.
*/
- if (counter->state == PERF_COUNTER_STATE_OFF) {
- counter->state = PERF_COUNTER_STATE_INACTIVE;
- counter->tstamp_enabled =
- ctx->time - counter->total_time_enabled;
- }
+ if (counter->state == PERF_COUNTER_STATE_OFF)
+ __perf_counter_mark_enabled(counter, ctx);
+
out:
spin_unlock_irq(&ctx->lock);
}
counter->attr.enable_on_exec = 0;
if (counter->state >= PERF_COUNTER_STATE_INACTIVE)
continue;
- counter->state = PERF_COUNTER_STATE_INACTIVE;
- counter->tstamp_enabled =
- ctx->time - counter->total_time_enabled;
+ __perf_counter_mark_enabled(counter, ctx);
enabled = 1;
}
*/
static void __perf_counter_read(void *info)
{
+ struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
struct perf_counter *counter = info;
struct perf_counter_context *ctx = counter->ctx;
unsigned long flags;
+ /*
+ * If this is a task context, we need to check whether it is
+ * the current task context of this cpu. If not it has been
+ * scheduled out before the smp call arrived. In that case
+ * counter->count would have been updated to a recent sample
+ * when the counter was scheduled out.
+ */
+ if (ctx->task && cpuctx->task_ctx != ctx)
+ return;
+
local_irq_save(flags);
if (ctx->is_active)
update_context_time(ctx);
atomic_dec(&nr_task_counters);
}
+ if (counter->output) {
+ fput(counter->output->filp);
+ counter->output = NULL;
+ }
+
if (counter->destroy)
counter->destroy(counter);
size += err;
list_for_each_entry(sub, &leader->sibling_list, list_entry) {
- err = perf_counter_read_entry(counter, read_format,
+ err = perf_counter_read_entry(sub, read_format,
buf + size);
if (err < 0)
return err;
return ret;
}
+int perf_counter_set_output(struct perf_counter *counter, int output_fd);
+
static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
{
struct perf_counter *counter = file->private_data;
case PERF_COUNTER_IOC_PERIOD:
return perf_counter_period(counter, (u64 __user *)arg);
+ case PERF_COUNTER_IOC_SET_OUTPUT:
+ return perf_counter_set_output(counter, arg);
+
default:
return -ENOTTY;
}
return 0;
}
+#ifndef PERF_COUNTER_INDEX_OFFSET
+# define PERF_COUNTER_INDEX_OFFSET 0
+#endif
+
static int perf_counter_index(struct perf_counter *counter)
{
if (counter->state != PERF_COUNTER_STATE_ACTIVE)
WARN_ON_ONCE(counter->ctx->parent_ctx);
mutex_lock(&counter->mmap_mutex);
+ if (counter->output) {
+ ret = -EINVAL;
+ goto unlock;
+ }
+
if (atomic_inc_not_zero(&counter->mmap_count)) {
if (nr_pages != counter->data->nr_pages)
ret = -EINVAL;
struct perf_counter *counter, unsigned int size,
int nmi, int sample)
{
+ struct perf_counter *output_counter;
struct perf_mmap_data *data;
unsigned int offset, head;
int have_lost;
u64 lost;
} lost_event;
+ rcu_read_lock();
/*
* For inherited counters we send all the output towards the parent.
*/
if (counter->parent)
counter = counter->parent;
- rcu_read_lock();
+ output_counter = rcu_dereference(counter->output);
+ if (output_counter)
+ counter = output_counter;
+
data = rcu_dereference(counter->data);
if (!data)
goto out;
* have these.
*/
if ((counter->attr.sample_type & PERF_SAMPLE_RAW) &&
+ perf_paranoid_tracepoint_raw() &&
!capable(CAP_SYS_ADMIN))
return ERR_PTR(-EPERM);
hwc->sample_period = attr->sample_period;
if (attr->freq && attr->sample_freq)
hwc->sample_period = 1;
+ hwc->last_period = hwc->sample_period;
atomic64_set(&hwc->period_left, hwc->sample_period);
if (val)
goto err_size;
}
+ size = sizeof(*attr);
}
ret = copy_from_user(attr, uattr, size);
goto out;
}
+int perf_counter_set_output(struct perf_counter *counter, int output_fd)
+{
+ struct perf_counter *output_counter = NULL;
+ struct file *output_file = NULL;
+ struct perf_counter *old_output;
+ int fput_needed = 0;
+ int ret = -EINVAL;
+
+ if (!output_fd)
+ goto set;
+
+ output_file = fget_light(output_fd, &fput_needed);
+ if (!output_file)
+ return -EBADF;
+
+ if (output_file->f_op != &perf_fops)
+ goto out;
+
+ output_counter = output_file->private_data;
+
+ /* Don't chain output fds */
+ if (output_counter->output)
+ goto out;
+
+ /* Don't set an output fd when we already have an output channel */
+ if (counter->data)
+ goto out;
+
+ atomic_long_inc(&output_file->f_count);
+
+set:
+ mutex_lock(&counter->mmap_mutex);
+ old_output = counter->output;
+ rcu_assign_pointer(counter->output, output_counter);
+ mutex_unlock(&counter->mmap_mutex);
+
+ if (old_output) {
+ /*
+ * we need to make sure no existing perf_output_*()
+ * is still referencing this counter.
+ */
+ synchronize_rcu();
+ fput(old_output->filp);
+ }
+
+ ret = 0;
+out:
+ fput_light(output_file, fput_needed);
+ return ret;
+}
+
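The net effect of perf_counter_set_output() above is that a counter's samples are written into another counter's mmap()ed ring buffer instead of its own. A rough userspace sketch, not part of this patch (the perf_counter_open() wrapper around the raw syscall, the attr setup and buf_size are placeholders):

	/* Route counter B's samples into counter A's buffer. */
	int fd_a = perf_counter_open(&attr_a, pid, cpu, -1, 0);
	int fd_b = perf_counter_open(&attr_b, pid, cpu, -1, 0);

	/*
	 * Only fd_a owns a buffer; fd_b must not be mmap()ed itself,
	 * and fd_a must not already redirect elsewhere.
	 */
	void *buf = mmap(NULL, buf_size, PROT_READ | PROT_WRITE,
			 MAP_SHARED, fd_a, 0);

	if (ioctl(fd_b, PERF_COUNTER_IOC_SET_OUTPUT, fd_a) < 0)
		perror("PERF_COUNTER_IOC_SET_OUTPUT");

As the sys_perf_counter_open() hunk further down shows, passing PERF_FLAG_FD_OUTPUT together with group_fd requests the same redirection at open time.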
/**
* sys_perf_counter_open - open a performance counter, associate it to a task/cpu
*
struct file *group_file = NULL;
int fput_needed = 0;
int fput_needed2 = 0;
- int ret;
+ int err;
/* for future expandability... */
- if (flags)
+ if (flags & ~(PERF_FLAG_FD_NO_GROUP | PERF_FLAG_FD_OUTPUT))
return -EINVAL;
- ret = perf_copy_attr(attr_uptr, &attr);
- if (ret)
- return ret;
+ err = perf_copy_attr(attr_uptr, &attr);
+ if (err)
+ return err;
if (!attr.exclude_kernel) {
if (perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN))
* Look up the group leader (we will attach this counter to it):
*/
group_leader = NULL;
- if (group_fd != -1) {
- ret = -EINVAL;
+ if (group_fd != -1 && !(flags & PERF_FLAG_FD_NO_GROUP)) {
+ err = -EINVAL;
group_file = fget_light(group_fd, &fput_needed);
if (!group_file)
goto err_put_context;
counter = perf_counter_alloc(&attr, cpu, ctx, group_leader,
NULL, GFP_KERNEL);
- ret = PTR_ERR(counter);
+ err = PTR_ERR(counter);
if (IS_ERR(counter))
goto err_put_context;
- ret = anon_inode_getfd("[perf_counter]", &perf_fops, counter, 0);
- if (ret < 0)
+ err = anon_inode_getfd("[perf_counter]", &perf_fops, counter, 0);
+ if (err < 0)
goto err_free_put_context;
- counter_file = fget_light(ret, &fput_needed2);
+ counter_file = fget_light(err, &fput_needed2);
if (!counter_file)
goto err_free_put_context;
+ if (flags & PERF_FLAG_FD_OUTPUT) {
+ err = perf_counter_set_output(counter, group_fd);
+ if (err)
+ goto err_fput_free_put_context;
+ }
+
counter->filp = counter_file;
WARN_ON_ONCE(ctx->parent_ctx);
mutex_lock(&ctx->mutex);
 list_add_tail(&counter->owner_entry, &current->perf_counter_list);
 mutex_unlock(&current->perf_counter_mutex);
+err_fput_free_put_context:
fput_light(counter_file, fput_needed2);
-out_fput:
- fput_light(group_file, fput_needed);
-
- return ret;
-
err_free_put_context:
- kfree(counter);
+ if (err < 0)
+ kfree(counter);
err_put_context:
- put_ctx(ctx);
+ if (err < 0)
+ put_ctx(ctx);
+
+ fput_light(group_file, fput_needed);
- goto out_fput;
+ return err;
}
/*
#include <linux/tsacct_kern.h>
#include <linux/kprobes.h>
#include <linux/delayacct.h>
-#include <linux/reciprocal_div.h>
#include <linux/unistd.h>
#include <linux/pagemap.h>
#include <linux/hrtimer.h>
*/
#define RUNTIME_INF ((u64)~0ULL)
-#ifdef CONFIG_SMP
-
static void double_rq_lock(struct rq *rq1, struct rq *rq2);
-/*
- * Divide a load by a sched group cpu_power : (load / sg->__cpu_power)
- * Since cpu_power is a 'constant', we can use a reciprocal divide.
- */
-static inline u32 sg_div_cpu_power(const struct sched_group *sg, u32 load)
-{
- return reciprocal_divide(load, sg->reciprocal_cpu_power);
-}
-
-/*
- * Each time a sched group cpu_power is changed,
- * we must compute its reciprocal value
- */
-static inline void sg_inc_cpu_power(struct sched_group *sg, u32 val)
-{
- sg->__cpu_power += val;
- sg->reciprocal_cpu_power = reciprocal_value(sg->__cpu_power);
-}
-#endif
-
static inline int rt_policy(int policy)
{
if (unlikely(policy == SCHED_FIFO || policy == SCHED_RR))
/*
* Root task group.
- * Every UID task group (including init_task_group aka UID-0) will
- * be a child to this group.
+ * Every UID task group (including init_task_group aka UID-0) will
+ * be a child to this group.
*/
struct task_group root_task_group;
/* Default task group's sched entity on each cpu */
static DEFINE_PER_CPU(struct sched_entity, init_sched_entity);
/* Default task group's cfs_rq on each cpu */
- static DEFINE_PER_CPU(struct cfs_rq, init_tg_cfs_rq) ____cacheline_aligned_in_smp;
-static DEFINE_PER_CPU_SHARED_ALIGNED(struct cfs_rq, init_cfs_rq);
++static DEFINE_PER_CPU_SHARED_ALIGNED(struct cfs_rq, init_tg_cfs_rq);
#endif /* CONFIG_FAIR_GROUP_SCHED */
#ifdef CONFIG_RT_GROUP_SCHED
static DEFINE_PER_CPU(struct sched_rt_entity, init_sched_rt_entity);
- static DEFINE_PER_CPU(struct rt_rq, init_rt_rq) ____cacheline_aligned_in_smp;
+ static DEFINE_PER_CPU_SHARED_ALIGNED(struct rt_rq, init_rt_rq);
#endif /* CONFIG_RT_GROUP_SCHED */
#else /* !CONFIG_USER_SCHED */
#define root_task_group init_task_group
unsigned char idle_at_tick;
/* For active balancing */
+ int post_schedule;
int active_balance;
int push_cpu;
/* cpu of this runqueue: */
struct task_struct *migration_thread;
struct list_head migration_queue;
+
+ u64 rt_avg;
+ u64 age_stamp;
#endif
/* calc_load related fields */
#define this_rq() (&__get_cpu_var(runqueues))
#define task_rq(p) cpu_rq(task_cpu(p))
#define cpu_curr(cpu) (cpu_rq(cpu)->curr)
+#define raw_rq() (&__raw_get_cpu_var(runqueues))
inline void update_rq_clock(struct rq *rq)
{
*/
unsigned int sysctl_sched_shares_thresh = 4;
+/*
+ * period over which we average the RT time consumption, measured
+ * in ms.
+ *
+ * default: 1s
+ */
+const_debug unsigned int sysctl_sched_time_avg = MSEC_PER_SEC;
+
/*
* period over which we measure -rt task cpu usage in us.
* default: 1s
}
#endif /* CONFIG_NO_HZ */
+static u64 sched_avg_period(void)
+{
+ return (u64)sysctl_sched_time_avg * NSEC_PER_MSEC / 2;
+}
+
+static void sched_avg_update(struct rq *rq)
+{
+ s64 period = sched_avg_period();
+
+ while ((s64)(rq->clock - rq->age_stamp) > period) {
+ rq->age_stamp += period;
+ rq->rt_avg /= 2;
+ }
+}
+
+static void sched_rt_avg_update(struct rq *rq, u64 rt_delta)
+{
+ rq->rt_avg += rt_delta;
+ sched_avg_update(rq);
+}
+
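For reference, with the default sysctl_sched_time_avg of 1000 ms, sched_avg_period() above evaluates to 500 ms worth of nanoseconds; each time rq->clock advances that far past rq->age_stamp, rq->rt_avg is halved, so the accumulated RT runtime decays exponentially with a half-life of half the averaging period.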
#else /* !CONFIG_SMP */
static void resched_task(struct task_struct *p)
{
assert_spin_locked(&task_rq(p)->lock);
set_tsk_need_resched(p);
}
+
+static void sched_rt_avg_update(struct rq *rq, u64 rt_delta)
+{
+}
#endif /* CONFIG_SMP */
#if BITS_PER_LONG == 32
#ifdef CONFIG_FAIR_GROUP_SCHED
+struct update_shares_data {
+ unsigned long rq_weight[NR_CPUS];
+};
+
+static DEFINE_PER_CPU(struct update_shares_data, update_shares_data);
+
static void __set_se_shares(struct sched_entity *se, unsigned long shares);
/*
* Calculate and set the cpu's group shares.
*/
-static void
-update_group_shares_cpu(struct task_group *tg, int cpu,
- unsigned long sd_shares, unsigned long sd_rq_weight)
+static void update_group_shares_cpu(struct task_group *tg, int cpu,
+ unsigned long sd_shares,
+ unsigned long sd_rq_weight,
+ struct update_shares_data *usd)
{
- unsigned long shares;
- unsigned long rq_weight;
-
- if (!tg->se[cpu])
- return;
+ unsigned long shares, rq_weight;
+ int boost = 0;
- rq_weight = tg->cfs_rq[cpu]->rq_weight;
+ rq_weight = usd->rq_weight[cpu];
+ if (!rq_weight) {
+ boost = 1;
+ rq_weight = NICE_0_LOAD;
+ }
/*
- * \Sum shares * rq_weight
- * shares = -----------------------
- * \Sum rq_weight
- *
+ * \Sum_j shares_j * rq_weight_i
+ * shares_i = -----------------------------
+ * \Sum_j rq_weight_j
*/
shares = (sd_shares * rq_weight) / sd_rq_weight;
shares = clamp_t(unsigned long, shares, MIN_SHARES, MAX_SHARES);
unsigned long flags;
spin_lock_irqsave(&rq->lock, flags);
- tg->cfs_rq[cpu]->shares = shares;
-
+ tg->cfs_rq[cpu]->rq_weight = boost ? 0 : rq_weight;
+ tg->cfs_rq[cpu]->shares = boost ? 0 : shares;
__set_se_shares(tg->se[cpu], shares);
spin_unlock_irqrestore(&rq->lock, flags);
}
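Worked example of the per-cpu shares formula above (numbers are illustrative): with tg->shares = 1024 and a CPU contributing one quarter of the group's runqueue weight summed over the domain, that CPU's group entity is given 1024 * (W/4) / W = 256 shares, subject to the clamp to [MIN_SHARES, MAX_SHARES] and to the boost path when the local runqueue weight is zero.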
*/
static int tg_shares_up(struct task_group *tg, void *data)
{
- unsigned long weight, rq_weight = 0;
- unsigned long shares = 0;
+ unsigned long weight, rq_weight = 0, shares = 0;
+ struct update_shares_data *usd;
struct sched_domain *sd = data;
+ unsigned long flags;
int i;
+ if (!tg->se[0])
+ return 0;
+
+ local_irq_save(flags);
+ usd = &__get_cpu_var(update_shares_data);
+
for_each_cpu(i, sched_domain_span(sd)) {
+ weight = tg->cfs_rq[i]->load.weight;
+ usd->rq_weight[i] = weight;
+
/*
* If there are currently no tasks on the cpu pretend there
* is one of average load so that when a new task gets to
* run here it will not get delayed by group starvation.
*/
- weight = tg->cfs_rq[i]->load.weight;
if (!weight)
weight = NICE_0_LOAD;
- tg->cfs_rq[i]->rq_weight = weight;
rq_weight += weight;
shares += tg->cfs_rq[i]->shares;
}
shares = tg->shares;
for_each_cpu(i, sched_domain_span(sd))
- update_group_shares_cpu(tg, i, shares, rq_weight);
+ update_group_shares_cpu(tg, i, shares, rq_weight, usd);
+
+ local_irq_restore(flags);
return 0;
}
static void update_shares(struct sched_domain *sd)
{
- u64 now = cpu_clock(raw_smp_processor_id());
- s64 elapsed = now - sd->last_update;
+ s64 elapsed;
+ u64 now;
+
+ if (root_task_group_empty())
+ return;
+
+ now = cpu_clock(raw_smp_processor_id());
+ elapsed = now - sd->last_update;
if (elapsed >= (s64)(u64)sysctl_sched_shares_ratelimit) {
sd->last_update = now;
static void update_shares_locked(struct rq *rq, struct sched_domain *sd)
{
+ if (root_task_group_empty())
+ return;
+
spin_unlock(&rq->lock);
update_shares(sd);
spin_lock(&rq->lock);
static void update_h_load(long cpu)
{
+ if (root_task_group_empty())
+ return;
+
walk_tg_tree(tg_load_down, tg_nop, (void *)cpu);
}
}
/* Adjust by relative CPU power of the group */
- avg_load = sg_div_cpu_power(group,
- avg_load * SCHED_LOAD_SCALE);
+ avg_load = (avg_load * SCHED_LOAD_SCALE) / group->cpu_power;
if (local_group) {
this_load = avg_load;
set_task_cpu(p, cpu);
/*
- * Make sure we do not leak PI boosting priority to the child:
+ * Make sure we do not leak PI boosting priority to the child.
*/
p->prio = current->normal_prio;
+
+ /*
+ * Revert to default priority/policy on fork if requested.
+ */
+ if (unlikely(p->sched_reset_on_fork)) {
+ if (p->policy == SCHED_FIFO || p->policy == SCHED_RR)
+ p->policy = SCHED_NORMAL;
+
+ if (p->normal_prio < DEFAULT_PRIO)
+ p->prio = DEFAULT_PRIO;
+
+ if (PRIO_TO_NICE(p->static_prio) < 0) {
+ p->static_prio = NICE_TO_PRIO(0);
+ set_load_weight(p);
+ }
+
+ /*
+ * We don't need the reset flag anymore after the fork. It has
+ * fulfilled its duty:
+ */
+ p->sched_reset_on_fork = 0;
+ }
+
if (!rt_prio(p->prio))
p->sched_class = &fair_sched_class;
{
struct mm_struct *mm = rq->prev_mm;
long prev_state;
-#ifdef CONFIG_SMP
- int post_schedule = 0;
-
- if (current->sched_class->needs_post_schedule)
- post_schedule = current->sched_class->needs_post_schedule(rq);
-#endif
rq->prev_mm = NULL;
finish_arch_switch(prev);
perf_counter_task_sched_in(current, cpu_of(rq));
finish_lock_switch(rq, prev);
-#ifdef CONFIG_SMP
- if (post_schedule)
- current->sched_class->post_schedule(rq);
-#endif
fire_sched_in_preempt_notifiers(current);
if (mm)
}
}
+#ifdef CONFIG_SMP
+
+/* assumes rq->lock is held */
+static inline void pre_schedule(struct rq *rq, struct task_struct *prev)
+{
+ if (prev->sched_class->pre_schedule)
+ prev->sched_class->pre_schedule(rq, prev);
+}
+
+/* rq->lock is NOT held, but preemption is disabled */
+static inline void post_schedule(struct rq *rq)
+{
+ if (rq->post_schedule) {
+ unsigned long flags;
+
+ spin_lock_irqsave(&rq->lock, flags);
+ if (rq->curr->sched_class->post_schedule)
+ rq->curr->sched_class->post_schedule(rq);
+ spin_unlock_irqrestore(&rq->lock, flags);
+
+ rq->post_schedule = 0;
+ }
+}
+
+#else
+
+static inline void pre_schedule(struct rq *rq, struct task_struct *p)
+{
+}
+
+static inline void post_schedule(struct rq *rq)
+{
+}
+
+#endif
+
/**
* schedule_tail - first thing a freshly forked thread must call.
* @prev: the thread we just switched away from.
struct rq *rq = this_rq();
finish_task_switch(rq, prev);
+
+ /*
+ * FIXME: do we need to worry about rq being invalidated by the
+ * task_switch?
+ */
+ post_schedule(rq);
+
#ifdef __ARCH_WANT_UNLOCKED_CTXSW
/* In this case, finish_task_switch does not reenable preemption */
preempt_enable();
{
const struct sched_class *class;
- for (class = sched_class_highest; class; class = class->next)
+ for_each_class(class) {
if (class->move_one_task(this_rq, this_cpu, busiest, sd, idle))
return 1;
+ }
return 0;
}
* capacity but still has some space to pick up some load
* from other group and save more power
*/
- if (sgs->sum_nr_running > sgs->group_capacity - 1)
+ if (sgs->sum_nr_running + 1 > sgs->group_capacity)
return;
if (sgs->sum_nr_running > sds->leader_nr_running ||
}
#endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */
+unsigned long __weak arch_scale_smt_power(struct sched_domain *sd, int cpu)
+{
+ unsigned long weight = cpumask_weight(sched_domain_span(sd));
+ unsigned long smt_gain = sd->smt_gain;
+
+ smt_gain /= weight;
+
+ return smt_gain;
+}
+
+unsigned long scale_rt_power(int cpu)
+{
+ struct rq *rq = cpu_rq(cpu);
+ u64 total, available;
+
+ sched_avg_update(rq);
+
+ total = sched_avg_period() + (rq->clock - rq->age_stamp);
+ available = total - rq->rt_avg;
+
+ if (unlikely((s64)total < SCHED_LOAD_SCALE))
+ total = SCHED_LOAD_SCALE;
+
+ total >>= SCHED_LOAD_SHIFT;
+
+ return div_u64(available, total);
+}
+
+static void update_cpu_power(struct sched_domain *sd, int cpu)
+{
+ unsigned long weight = cpumask_weight(sched_domain_span(sd));
+ unsigned long power = SCHED_LOAD_SCALE;
+ struct sched_group *sdg = sd->groups;
+
+ /* here we could scale based on cpufreq */
+
+ if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) {
+ power *= arch_scale_smt_power(sd, cpu);
+ power >>= SCHED_LOAD_SHIFT;
+ }
+
+ power *= scale_rt_power(cpu);
+ power >>= SCHED_LOAD_SHIFT;
+
+ if (!power)
+ power = 1;
+
+ sdg->cpu_power = power;
+}
+
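As a worked example of the two functions above: if RT tasks consumed a quarter of the averaging window, scale_rt_power() returns roughly (3/4) * SCHED_LOAD_SCALE = 768, so for a non-SMT CPU update_cpu_power() ends up with sdg->cpu_power = (1024 * 768) >> SCHED_LOAD_SHIFT = 768, i.e. the load balancer treats the CPU as having about 75% of its nominal capacity.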
+static void update_group_power(struct sched_domain *sd, int cpu)
+{
+ struct sched_domain *child = sd->child;
+ struct sched_group *group, *sdg = sd->groups;
+ unsigned long power;
+
+ if (!child) {
+ update_cpu_power(sd, cpu);
+ return;
+ }
+
+ power = 0;
+
+ group = child->groups;
+ do {
+ power += group->cpu_power;
+ group = group->next;
+ } while (group != child->groups);
+
+ sdg->cpu_power = power;
+}
/**
* update_sg_lb_stats - Update sched_group's statistics for load balancing.
* @balance: Should we balance.
* @sgs: variable to hold the statistics for this group.
*/
-static inline void update_sg_lb_stats(struct sched_group *group, int this_cpu,
+static inline void update_sg_lb_stats(struct sched_domain *sd,
+ struct sched_group *group, int this_cpu,
enum cpu_idle_type idle, int load_idx, int *sd_idle,
int local_group, const struct cpumask *cpus,
int *balance, struct sg_lb_stats *sgs)
unsigned long sum_avg_load_per_task;
unsigned long avg_load_per_task;
- if (local_group)
+ if (local_group) {
balance_cpu = group_first_cpu(group);
+ if (balance_cpu == this_cpu)
+ update_group_power(sd, this_cpu);
+ }
/* Tally up the load of all CPUs in the group */
sum_avg_load_per_task = avg_load_per_task = 0;
}
/* Adjust by relative CPU power of the group */
- sgs->avg_load = sg_div_cpu_power(group,
- sgs->group_load * SCHED_LOAD_SCALE);
+ sgs->avg_load = (sgs->group_load * SCHED_LOAD_SCALE) / group->cpu_power;
/*
* normalized nr_running number somewhere that negates
* the hierarchy?
*/
- avg_load_per_task = sg_div_cpu_power(group,
- sum_avg_load_per_task * SCHED_LOAD_SCALE);
+ avg_load_per_task = (sum_avg_load_per_task * SCHED_LOAD_SCALE) /
+ group->cpu_power;
if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task)
sgs->group_imb = 1;
- sgs->group_capacity = group->__cpu_power / SCHED_LOAD_SCALE;
-
+ sgs->group_capacity =
+ DIV_ROUND_CLOSEST(group->cpu_power, SCHED_LOAD_SCALE);
}
/**
const struct cpumask *cpus, int *balance,
struct sd_lb_stats *sds)
{
+ struct sched_domain *child = sd->child;
struct sched_group *group = sd->groups;
struct sg_lb_stats sgs;
- int load_idx;
+ int load_idx, prefer_sibling = 0;
+
+ if (child && child->flags & SD_PREFER_SIBLING)
+ prefer_sibling = 1;
init_sd_power_savings_stats(sd, sds, idle);
load_idx = get_sd_load_idx(sd, idle);
local_group = cpumask_test_cpu(this_cpu,
sched_group_cpus(group));
memset(&sgs, 0, sizeof(sgs));
- update_sg_lb_stats(group, this_cpu, idle, load_idx, sd_idle,
+ update_sg_lb_stats(sd, group, this_cpu, idle, load_idx, sd_idle,
local_group, cpus, balance, &sgs);
if (local_group && balance && !(*balance))
return;
sds->total_load += sgs.group_load;
- sds->total_pwr += group->__cpu_power;
+ sds->total_pwr += group->cpu_power;
+
+ /*
+ * In case the child domain prefers tasks go to siblings
+ * first, lower the group capacity to one so that we'll try
+ * and move all the excess tasks away.
+ */
+ if (prefer_sibling)
+ sgs.group_capacity = min(sgs.group_capacity, 1UL);
if (local_group) {
sds->this_load = sgs.avg_load;
update_sd_power_savings_stats(group, sds, local_group, &sgs);
group = group->next;
} while (group != sd->groups);
-
}
/**
* moving them.
*/
- pwr_now += sds->busiest->__cpu_power *
+ pwr_now += sds->busiest->cpu_power *
min(sds->busiest_load_per_task, sds->max_load);
- pwr_now += sds->this->__cpu_power *
+ pwr_now += sds->this->cpu_power *
min(sds->this_load_per_task, sds->this_load);
pwr_now /= SCHED_LOAD_SCALE;
/* Amount of load we'd subtract */
- tmp = sg_div_cpu_power(sds->busiest,
- sds->busiest_load_per_task * SCHED_LOAD_SCALE);
+ tmp = (sds->busiest_load_per_task * SCHED_LOAD_SCALE) /
+ sds->busiest->cpu_power;
if (sds->max_load > tmp)
- pwr_move += sds->busiest->__cpu_power *
+ pwr_move += sds->busiest->cpu_power *
min(sds->busiest_load_per_task, sds->max_load - tmp);
/* Amount of load we'd add */
- if (sds->max_load * sds->busiest->__cpu_power <
+ if (sds->max_load * sds->busiest->cpu_power <
sds->busiest_load_per_task * SCHED_LOAD_SCALE)
- tmp = sg_div_cpu_power(sds->this,
- sds->max_load * sds->busiest->__cpu_power);
+ tmp = (sds->max_load * sds->busiest->cpu_power) /
+ sds->this->cpu_power;
else
- tmp = sg_div_cpu_power(sds->this,
- sds->busiest_load_per_task * SCHED_LOAD_SCALE);
- pwr_move += sds->this->__cpu_power *
+ tmp = (sds->busiest_load_per_task * SCHED_LOAD_SCALE) /
+ sds->this->cpu_power;
+ pwr_move += sds->this->cpu_power *
min(sds->this_load_per_task, sds->this_load + tmp);
pwr_move /= SCHED_LOAD_SCALE;
sds->max_load - sds->busiest_load_per_task);
/* How much load to actually move to equalise the imbalance */
- *imbalance = min(max_pull * sds->busiest->__cpu_power,
- (sds->avg_load - sds->this_load) * sds->this->__cpu_power)
+ *imbalance = min(max_pull * sds->busiest->cpu_power,
+ (sds->avg_load - sds->this_load) * sds->this->cpu_power)
/ SCHED_LOAD_SCALE;
/*
return NULL;
}
+static struct sched_group *group_of(int cpu)
+{
+ struct sched_domain *sd = rcu_dereference(cpu_rq(cpu)->sd);
+
+ if (!sd)
+ return NULL;
+
+ return sd->groups;
+}
+
+static unsigned long power_of(int cpu)
+{
+ struct sched_group *group = group_of(cpu);
+
+ if (!group)
+ return SCHED_LOAD_SCALE;
+
+ return group->cpu_power;
+}
+
/*
* find_busiest_queue - find the busiest runqueue among the cpus in group.
*/
int i;
for_each_cpu(i, sched_group_cpus(group)) {
+ unsigned long power = power_of(i);
+ unsigned long capacity = DIV_ROUND_CLOSEST(power, SCHED_LOAD_SCALE);
unsigned long wl;
if (!cpumask_test_cpu(i, cpus))
continue;
rq = cpu_rq(i);
- wl = weighted_cpuload(i);
+ wl = weighted_cpuload(i) * SCHED_LOAD_SCALE;
+ wl /= power;
- if (rq->nr_running == 1 && wl > imbalance)
+ if (capacity && rq->nr_running == 1 && wl > imbalance)
continue;
if (wl > max_load) {
preempt_disable();
cpu = smp_processor_id();
rq = cpu_rq(cpu);
- rcu_qsctr_inc(cpu);
+ rcu_sched_qs(cpu);
prev = rq->curr;
switch_count = &prev->nivcsw;
switch_count = &prev->nvcsw;
}
-#ifdef CONFIG_SMP
- if (prev->sched_class->pre_schedule)
- prev->sched_class->pre_schedule(rq, prev);
-#endif
+ pre_schedule(rq, prev);
if (unlikely(!rq->nr_running))
idle_balance(cpu, rq);
} else
spin_unlock_irq(&rq->lock);
+ post_schedule(rq);
+
if (unlikely(reacquire_kernel_lock(current) < 0))
goto need_resched_nonpreemptible;
unsigned long flags;
const struct sched_class *prev_class = p->sched_class;
struct rq *rq;
+ int reset_on_fork;
/* may grab non-irq protected spin_locks */
BUG_ON(in_interrupt());
recheck:
/* double check policy once rq lock held */
- if (policy < 0)
+ if (policy < 0) {
+ reset_on_fork = p->sched_reset_on_fork;
policy = oldpolicy = p->policy;
- else if (policy != SCHED_FIFO && policy != SCHED_RR &&
- policy != SCHED_NORMAL && policy != SCHED_BATCH &&
- policy != SCHED_IDLE)
- return -EINVAL;
+ } else {
+ reset_on_fork = !!(policy & SCHED_RESET_ON_FORK);
+ policy &= ~SCHED_RESET_ON_FORK;
+
+ if (policy != SCHED_FIFO && policy != SCHED_RR &&
+ policy != SCHED_NORMAL && policy != SCHED_BATCH &&
+ policy != SCHED_IDLE)
+ return -EINVAL;
+ }
+
/*
* Valid priorities for SCHED_FIFO and SCHED_RR are
* 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL,
/* can't change other user's priorities */
if (!check_same_owner(p))
return -EPERM;
+
+ /* Normal users shall not reset the sched_reset_on_fork flag */
+ if (p->sched_reset_on_fork && !reset_on_fork)
+ return -EPERM;
}
if (user) {
if (running)
p->sched_class->put_prev_task(rq, p);
+ p->sched_reset_on_fork = reset_on_fork;
+
oldprio = p->prio;
__setscheduler(rq, p, policy, param->sched_priority);
if (p) {
retval = security_task_getscheduler(p);
if (!retval)
- retval = p->policy;
+ retval = p->policy
+ | (p->sched_reset_on_fork ? SCHED_RESET_ON_FORK : 0);
}
read_unlock(&tasklist_lock);
return retval;
}
/**
- * sys_sched_getscheduler - get the RT priority of a thread
+ * sys_sched_getparam - get the RT priority of a thread
* @pid: the pid in question.
* @param: structure containing the RT priority.
*/
static void __cond_resched(void)
{
-#ifdef CONFIG_DEBUG_SPINLOCK_SLEEP
- __might_sleep(__FILE__, __LINE__);
-#endif
- /*
- * The BKS might be reacquired before we have dropped
- * PREEMPT_ACTIVE, which could trigger a second
- * cond_resched() call.
- */
- do {
- add_preempt_count(PREEMPT_ACTIVE);
- schedule();
- sub_preempt_count(PREEMPT_ACTIVE);
- } while (need_resched());
+ add_preempt_count(PREEMPT_ACTIVE);
+ schedule();
+ sub_preempt_count(PREEMPT_ACTIVE);
}
int __sched _cond_resched(void)
EXPORT_SYMBOL(_cond_resched);
/*
- * cond_resched_lock() - if a reschedule is pending, drop the given lock,
+ * __cond_resched_lock() - if a reschedule is pending, drop the given lock,
* call schedule, and on return reacquire the lock.
*
* This works OK both with and without CONFIG_PREEMPT. We do strange low-level
* operations here to prevent schedule() from being called twice (once via
* spin_unlock(), once by hand).
*/
-int cond_resched_lock(spinlock_t *lock)
+int __cond_resched_lock(spinlock_t *lock)
{
int resched = should_resched();
int ret = 0;
+ lockdep_assert_held(lock);
+
if (spin_needbreak(lock) || resched) {
spin_unlock(lock);
if (resched)
}
return ret;
}
-EXPORT_SYMBOL(cond_resched_lock);
+EXPORT_SYMBOL(__cond_resched_lock);
-int __sched cond_resched_softirq(void)
+int __sched __cond_resched_softirq(void)
{
BUG_ON(!in_softirq());
}
return 0;
}
-EXPORT_SYMBOL(cond_resched_softirq);
+EXPORT_SYMBOL(__cond_resched_softirq);
/**
* yield - yield the current processor to other threads.
*/
void __sched io_schedule(void)
{
- struct rq *rq = &__raw_get_cpu_var(runqueues);
+ struct rq *rq = raw_rq();
delayacct_blkio_start();
atomic_inc(&rq->nr_iowait);
+ current->in_iowait = 1;
schedule();
+ current->in_iowait = 0;
atomic_dec(&rq->nr_iowait);
delayacct_blkio_end();
}
long __sched io_schedule_timeout(long timeout)
{
- struct rq *rq = &__raw_get_cpu_var(runqueues);
+ struct rq *rq = raw_rq();
long ret;
delayacct_blkio_start();
atomic_inc(&rq->nr_iowait);
+ current->in_iowait = 1;
ret = schedule_timeout(timeout);
+ current->in_iowait = 0;
atomic_dec(&rq->nr_iowait);
delayacct_blkio_end();
return ret;
if (migrate_task(p, cpumask_any_and(cpu_online_mask, new_mask), &req)) {
/* Need help from migration thread: drop lock and wait. */
+ struct task_struct *mt = rq->migration_thread;
+
+ get_task_struct(mt);
task_rq_unlock(rq, &flags);
wake_up_process(rq->migration_thread);
+ put_task_struct(mt);
wait_for_completion(&req.done);
tlb_migrate_finish(p->mm);
return 0;
return ret;
}
+#define RCU_MIGRATION_IDLE 0
+#define RCU_MIGRATION_NEED_QS 1
+#define RCU_MIGRATION_GOT_QS 2
+#define RCU_MIGRATION_MUST_SYNC 3
+
/*
* migration_thread - this is a highprio system thread that performs
* thread migration by bumping thread off CPU then 'pushing' onto
*/
static int migration_thread(void *data)
{
+ int badcpu;
int cpu = (long)data;
struct rq *rq;
req = list_entry(head->next, struct migration_req, list);
list_del_init(head->next);
- spin_unlock(&rq->lock);
- __migrate_task(req->task, cpu, req->dest_cpu);
+ if (req->task != NULL) {
+ spin_unlock(&rq->lock);
+ __migrate_task(req->task, cpu, req->dest_cpu);
+ } else if (likely(cpu == (badcpu = smp_processor_id()))) {
+ req->dest_cpu = RCU_MIGRATION_GOT_QS;
+ spin_unlock(&rq->lock);
+ } else {
+ req->dest_cpu = RCU_MIGRATION_MUST_SYNC;
+ spin_unlock(&rq->lock);
+ WARN_ONCE(1, "migration_thread() on CPU %d, expected %d\n", badcpu, cpu);
+ }
local_irq_enable();
complete(&req->done);
migration_call(&migration_notifier, CPU_ONLINE, cpu);
register_cpu_notifier(&migration_notifier);
- return err;
+ return 0;
}
early_initcall(migration_init);
#endif
break;
}
- if (!group->__cpu_power) {
+ if (!group->cpu_power) {
printk(KERN_CONT "\n");
printk(KERN_ERR "ERROR: domain->cpu_power not "
"set\n");
cpulist_scnprintf(str, sizeof(str), sched_group_cpus(group));
printk(KERN_CONT " %s", str);
- if (group->__cpu_power != SCHED_LOAD_SCALE) {
- printk(KERN_CONT " (__cpu_power = %d)",
- group->__cpu_power);
+ if (group->cpu_power != SCHED_LOAD_SCALE) {
+ printk(KERN_CONT " (cpu_power = %d)",
+ group->cpu_power);
}
group = group->next;
rq->rd = rd;
cpumask_set_cpu(rq->cpu, rd->span);
- if (cpumask_test_cpu(rq->cpu, cpu_online_mask))
+ if (cpumask_test_cpu(rq->cpu, cpu_active_mask))
set_rq_online(rq);
spin_unlock_irqrestore(&rq->lock, flags);
continue;
cpumask_clear(sched_group_cpus(sg));
- sg->__cpu_power = 0;
+ sg->cpu_power = 0;
for_each_cpu(j, span) {
if (group_fn(j, cpu_map, NULL, tmpmask) != group)
DECLARE_BITMAP(span, CONFIG_NR_CPUS);
};
+struct s_data {
+#ifdef CONFIG_NUMA
+ int sd_allnodes;
+ cpumask_var_t domainspan;
+ cpumask_var_t covered;
+ cpumask_var_t notcovered;
+#endif
+ cpumask_var_t nodemask;
+ cpumask_var_t this_sibling_map;
+ cpumask_var_t this_core_map;
+ cpumask_var_t send_covered;
+ cpumask_var_t tmpmask;
+ struct sched_group **sched_group_nodes;
+ struct root_domain *rd;
+};
+
+enum s_alloc {
+ sa_sched_groups = 0,
+ sa_rootdomain,
+ sa_tmpmask,
+ sa_send_covered,
+ sa_this_core_map,
+ sa_this_sibling_map,
+ sa_nodemask,
+ sa_sched_group_nodes,
+#ifdef CONFIG_NUMA
+ sa_notcovered,
+ sa_covered,
+ sa_domainspan,
+#endif
+ sa_none,
+};
+
/*
* SMT sched-domains:
*/
continue;
}
- sg_inc_cpu_power(sg, sd->groups->__cpu_power);
+ sg->cpu_power += sd->groups->cpu_power;
}
sg = sg->next;
} while (sg != group_head);
}
+
+static int build_numa_sched_groups(struct s_data *d,
+ const struct cpumask *cpu_map, int num)
+{
+ struct sched_domain *sd;
+ struct sched_group *sg, *prev;
+ int n, j;
+
+ cpumask_clear(d->covered);
+ cpumask_and(d->nodemask, cpumask_of_node(num), cpu_map);
+ if (cpumask_empty(d->nodemask)) {
+ d->sched_group_nodes[num] = NULL;
+ goto out;
+ }
+
+ sched_domain_node_span(num, d->domainspan);
+ cpumask_and(d->domainspan, d->domainspan, cpu_map);
+
+ sg = kmalloc_node(sizeof(struct sched_group) + cpumask_size(),
+ GFP_KERNEL, num);
+ if (!sg) {
+ printk(KERN_WARNING "Can not alloc domain group for node %d\n",
+ num);
+ return -ENOMEM;
+ }
+ d->sched_group_nodes[num] = sg;
+
+ for_each_cpu(j, d->nodemask) {
+ sd = &per_cpu(node_domains, j).sd;
+ sd->groups = sg;
+ }
+
+ sg->cpu_power = 0;
+ cpumask_copy(sched_group_cpus(sg), d->nodemask);
+ sg->next = sg;
+ cpumask_or(d->covered, d->covered, d->nodemask);
+
+ prev = sg;
+ for (j = 0; j < nr_node_ids; j++) {
+ n = (num + j) % nr_node_ids;
+ cpumask_complement(d->notcovered, d->covered);
+ cpumask_and(d->tmpmask, d->notcovered, cpu_map);
+ cpumask_and(d->tmpmask, d->tmpmask, d->domainspan);
+ if (cpumask_empty(d->tmpmask))
+ break;
+ cpumask_and(d->tmpmask, d->tmpmask, cpumask_of_node(n));
+ if (cpumask_empty(d->tmpmask))
+ continue;
+ sg = kmalloc_node(sizeof(struct sched_group) + cpumask_size(),
+ GFP_KERNEL, num);
+ if (!sg) {
+ printk(KERN_WARNING
+ "Can not alloc domain group for node %d\n", j);
+ return -ENOMEM;
+ }
+ sg->cpu_power = 0;
+ cpumask_copy(sched_group_cpus(sg), d->tmpmask);
+ sg->next = prev->next;
+ cpumask_or(d->covered, d->covered, d->tmpmask);
+ prev->next = sg;
+ prev = sg;
+ }
+out:
+ return 0;
+}
#endif /* CONFIG_NUMA */
#ifdef CONFIG_NUMA
* there are asymmetries in the topology. If there are asymmetries, group
* having more cpu_power will pickup more load compared to the group having
* less cpu_power.
- *
- * cpu_power will be a multiple of SCHED_LOAD_SCALE. This multiple represents
- * the maximum number of tasks a group can handle in the presence of other idle
- * or lightly loaded groups in the same sched domain.
*/
static void init_sched_groups_power(int cpu, struct sched_domain *sd)
{
struct sched_domain *child;
struct sched_group *group;
+ long power;
+ int weight;
WARN_ON(!sd || !sd->groups);
child = sd->child;
- sd->groups->__cpu_power = 0;
+ sd->groups->cpu_power = 0;
- /*
- * For perf policy, if the groups in child domain share resources
- * (for example cores sharing some portions of the cache hierarchy
- * or SMT), then set this domain groups cpu_power such that each group
- * can handle only one task, when there are other idle groups in the
- * same sched domain.
- */
- if (!child || (!(sd->flags & SD_POWERSAVINGS_BALANCE) &&
- (child->flags &
- (SD_SHARE_CPUPOWER | SD_SHARE_PKG_RESOURCES)))) {
- sg_inc_cpu_power(sd->groups, SCHED_LOAD_SCALE);
+ if (!child) {
+ power = SCHED_LOAD_SCALE;
+ weight = cpumask_weight(sched_domain_span(sd));
+ /*
+ * SMT siblings share the power of a single core.
+ * Usually multiple threads get a better yield out of
+ * that one core than a single thread would have,
+ * reflect that in sd->smt_gain.
+ */
+ if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) {
+ power *= sd->smt_gain;
+ power /= weight;
+ power >>= SCHED_LOAD_SHIFT;
+ }
+ sd->groups->cpu_power += power;
return;
}
/*
- * add cpu_power of each child group to this groups cpu_power
+ * Add cpu_power of each child group to this group's cpu_power.
*/
group = child->groups;
do {
- sg_inc_cpu_power(sd->groups, group->__cpu_power);
+ sd->groups->cpu_power += group->cpu_power;
group = group->next;
} while (group != child->groups);
}
}
}
-/*
- * Build sched domains for a given set of cpus and attach the sched domains
- * to the individual cpus
- */
-static int __build_sched_domains(const struct cpumask *cpu_map,
- struct sched_domain_attr *attr)
-{
- int i, err = -ENOMEM;
- struct root_domain *rd;
- cpumask_var_t nodemask, this_sibling_map, this_core_map, send_covered,
- tmpmask;
+static void __free_domain_allocs(struct s_data *d, enum s_alloc what,
+ const struct cpumask *cpu_map)
+{
+ switch (what) {
+ case sa_sched_groups:
+ free_sched_groups(cpu_map, d->tmpmask); /* fall through */
+ d->sched_group_nodes = NULL;
+ case sa_rootdomain:
+ free_rootdomain(d->rd); /* fall through */
+ case sa_tmpmask:
+ free_cpumask_var(d->tmpmask); /* fall through */
+ case sa_send_covered:
+ free_cpumask_var(d->send_covered); /* fall through */
+ case sa_this_core_map:
+ free_cpumask_var(d->this_core_map); /* fall through */
+ case sa_this_sibling_map:
+ free_cpumask_var(d->this_sibling_map); /* fall through */
+ case sa_nodemask:
+ free_cpumask_var(d->nodemask); /* fall through */
+ case sa_sched_group_nodes:
#ifdef CONFIG_NUMA
- cpumask_var_t domainspan, covered, notcovered;
- struct sched_group **sched_group_nodes = NULL;
- int sd_allnodes = 0;
-
- if (!alloc_cpumask_var(&domainspan, GFP_KERNEL))
- goto out;
- if (!alloc_cpumask_var(&covered, GFP_KERNEL))
- goto free_domainspan;
- if (!alloc_cpumask_var(&notcovered, GFP_KERNEL))
- goto free_covered;
-#endif
-
- if (!alloc_cpumask_var(&nodemask, GFP_KERNEL))
- goto free_notcovered;
- if (!alloc_cpumask_var(&this_sibling_map, GFP_KERNEL))
- goto free_nodemask;
- if (!alloc_cpumask_var(&this_core_map, GFP_KERNEL))
- goto free_this_sibling_map;
- if (!alloc_cpumask_var(&send_covered, GFP_KERNEL))
- goto free_this_core_map;
- if (!alloc_cpumask_var(&tmpmask, GFP_KERNEL))
- goto free_send_covered;
+ kfree(d->sched_group_nodes); /* fall through */
+ case sa_notcovered:
+ free_cpumask_var(d->notcovered); /* fall through */
+ case sa_covered:
+ free_cpumask_var(d->covered); /* fall through */
+ case sa_domainspan:
+ free_cpumask_var(d->domainspan); /* fall through */
+#endif
+ case sa_none:
+ break;
+ }
+}
+static enum s_alloc __visit_domain_allocation_hell(struct s_data *d,
+ const struct cpumask *cpu_map)
+{
#ifdef CONFIG_NUMA
- /*
- * Allocate the per-node list of sched groups
- */
- sched_group_nodes = kcalloc(nr_node_ids, sizeof(struct sched_group *),
- GFP_KERNEL);
- if (!sched_group_nodes) {
+ if (!alloc_cpumask_var(&d->domainspan, GFP_KERNEL))
+ return sa_none;
+ if (!alloc_cpumask_var(&d->covered, GFP_KERNEL))
+ return sa_domainspan;
+ if (!alloc_cpumask_var(&d->notcovered, GFP_KERNEL))
+ return sa_covered;
+ /* Allocate the per-node list of sched groups */
+ d->sched_group_nodes = kcalloc(nr_node_ids,
+ sizeof(struct sched_group *), GFP_KERNEL);
+ if (!d->sched_group_nodes) {
printk(KERN_WARNING "Can not alloc sched group node list\n");
- goto free_tmpmask;
- }
-#endif
-
- rd = alloc_rootdomain();
- if (!rd) {
+ return sa_notcovered;
+ }
+ sched_group_nodes_bycpu[cpumask_first(cpu_map)] = d->sched_group_nodes;
+#endif
+ if (!alloc_cpumask_var(&d->nodemask, GFP_KERNEL))
+ return sa_sched_group_nodes;
+ if (!alloc_cpumask_var(&d->this_sibling_map, GFP_KERNEL))
+ return sa_nodemask;
+ if (!alloc_cpumask_var(&d->this_core_map, GFP_KERNEL))
+ return sa_this_sibling_map;
+ if (!alloc_cpumask_var(&d->send_covered, GFP_KERNEL))
+ return sa_this_core_map;
+ if (!alloc_cpumask_var(&d->tmpmask, GFP_KERNEL))
+ return sa_send_covered;
+ d->rd = alloc_rootdomain();
+ if (!d->rd) {
printk(KERN_WARNING "Cannot alloc root domain\n");
- goto free_sched_groups;
+ return sa_tmpmask;
}
+ return sa_rootdomain;
+}
+static struct sched_domain *__build_numa_sched_domains(struct s_data *d,
+ const struct cpumask *cpu_map, struct sched_domain_attr *attr, int i)
+{
+ struct sched_domain *sd = NULL;
#ifdef CONFIG_NUMA
- sched_group_nodes_bycpu[cpumask_first(cpu_map)] = sched_group_nodes;
-#endif
-
- /*
- * Set up domains for cpus specified by the cpu_map.
- */
- for_each_cpu(i, cpu_map) {
- struct sched_domain *sd = NULL, *p;
-
- cpumask_and(nodemask, cpumask_of_node(cpu_to_node(i)), cpu_map);
-
-#ifdef CONFIG_NUMA
- if (cpumask_weight(cpu_map) >
- SD_NODES_PER_DOMAIN*cpumask_weight(nodemask)) {
- sd = &per_cpu(allnodes_domains, i).sd;
- SD_INIT(sd, ALLNODES);
- set_domain_attribute(sd, attr);
- cpumask_copy(sched_domain_span(sd), cpu_map);
- cpu_to_allnodes_group(i, cpu_map, &sd->groups, tmpmask);
- p = sd;
- sd_allnodes = 1;
- } else
- p = NULL;
+ struct sched_domain *parent;
- sd = &per_cpu(node_domains, i).sd;
- SD_INIT(sd, NODE);
+ d->sd_allnodes = 0;
+ if (cpumask_weight(cpu_map) >
+ SD_NODES_PER_DOMAIN * cpumask_weight(d->nodemask)) {
+ sd = &per_cpu(allnodes_domains, i).sd;
+ SD_INIT(sd, ALLNODES);
set_domain_attribute(sd, attr);
- sched_domain_node_span(cpu_to_node(i), sched_domain_span(sd));
- sd->parent = p;
- if (p)
- p->child = sd;
- cpumask_and(sched_domain_span(sd),
- sched_domain_span(sd), cpu_map);
+ cpumask_copy(sched_domain_span(sd), cpu_map);
+ cpu_to_allnodes_group(i, cpu_map, &sd->groups, d->tmpmask);
+ d->sd_allnodes = 1;
+ }
+ parent = sd;
+
+ sd = &per_cpu(node_domains, i).sd;
+ SD_INIT(sd, NODE);
+ set_domain_attribute(sd, attr);
+ sched_domain_node_span(cpu_to_node(i), sched_domain_span(sd));
+ sd->parent = parent;
+ if (parent)
+ parent->child = sd;
+ cpumask_and(sched_domain_span(sd), sched_domain_span(sd), cpu_map);
#endif
+ return sd;
+}
- p = sd;
- sd = &per_cpu(phys_domains, i).sd;
- SD_INIT(sd, CPU);
- set_domain_attribute(sd, attr);
- cpumask_copy(sched_domain_span(sd), nodemask);
- sd->parent = p;
- if (p)
- p->child = sd;
- cpu_to_phys_group(i, cpu_map, &sd->groups, tmpmask);
+static struct sched_domain *__build_cpu_sched_domain(struct s_data *d,
+ const struct cpumask *cpu_map, struct sched_domain_attr *attr,
+ struct sched_domain *parent, int i)
+{
+ struct sched_domain *sd;
+ sd = &per_cpu(phys_domains, i).sd;
+ SD_INIT(sd, CPU);
+ set_domain_attribute(sd, attr);
+ cpumask_copy(sched_domain_span(sd), d->nodemask);
+ sd->parent = parent;
+ if (parent)
+ parent->child = sd;
+ cpu_to_phys_group(i, cpu_map, &sd->groups, d->tmpmask);
+ return sd;
+}
+static struct sched_domain *__build_mc_sched_domain(struct s_data *d,
+ const struct cpumask *cpu_map, struct sched_domain_attr *attr,
+ struct sched_domain *parent, int i)
+{
+ struct sched_domain *sd = parent;
#ifdef CONFIG_SCHED_MC
- p = sd;
- sd = &per_cpu(core_domains, i).sd;
- SD_INIT(sd, MC);
- set_domain_attribute(sd, attr);
- cpumask_and(sched_domain_span(sd), cpu_map,
- cpu_coregroup_mask(i));
- sd->parent = p;
- p->child = sd;
- cpu_to_core_group(i, cpu_map, &sd->groups, tmpmask);
+ sd = &per_cpu(core_domains, i).sd;
+ SD_INIT(sd, MC);
+ set_domain_attribute(sd, attr);
+ cpumask_and(sched_domain_span(sd), cpu_map, cpu_coregroup_mask(i));
+ sd->parent = parent;
+ parent->child = sd;
+ cpu_to_core_group(i, cpu_map, &sd->groups, d->tmpmask);
#endif
+ return sd;
+}
+static struct sched_domain *__build_smt_sched_domain(struct s_data *d,
+ const struct cpumask *cpu_map, struct sched_domain_attr *attr,
+ struct sched_domain *parent, int i)
+{
+ struct sched_domain *sd = parent;
#ifdef CONFIG_SCHED_SMT
- p = sd;
- sd = &per_cpu(cpu_domains, i).sd;
- SD_INIT(sd, SIBLING);
- set_domain_attribute(sd, attr);
- cpumask_and(sched_domain_span(sd),
- topology_thread_cpumask(i), cpu_map);
- sd->parent = p;
- p->child = sd;
- cpu_to_cpu_group(i, cpu_map, &sd->groups, tmpmask);
+ sd = &per_cpu(cpu_domains, i).sd;
+ SD_INIT(sd, SIBLING);
+ set_domain_attribute(sd, attr);
+ cpumask_and(sched_domain_span(sd), cpu_map, topology_thread_cpumask(i));
+ sd->parent = parent;
+ parent->child = sd;
+ cpu_to_cpu_group(i, cpu_map, &sd->groups, d->tmpmask);
#endif
- }
+ return sd;
+}
+static void build_sched_groups(struct s_data *d, enum sched_domain_level l,
+ const struct cpumask *cpu_map, int cpu)
+{
+ switch (l) {
#ifdef CONFIG_SCHED_SMT
- /* Set up CPU (sibling) groups */
- for_each_cpu(i, cpu_map) {
- cpumask_and(this_sibling_map,
- topology_thread_cpumask(i), cpu_map);
- if (i != cpumask_first(this_sibling_map))
- continue;
-
- init_sched_build_groups(this_sibling_map, cpu_map,
- &cpu_to_cpu_group,
- send_covered, tmpmask);
- }
+ case SD_LV_SIBLING: /* set up CPU (sibling) groups */
+ cpumask_and(d->this_sibling_map, cpu_map,
+ topology_thread_cpumask(cpu));
+ if (cpu == cpumask_first(d->this_sibling_map))
+ init_sched_build_groups(d->this_sibling_map, cpu_map,
+ &cpu_to_cpu_group,
+ d->send_covered, d->tmpmask);
+ break;
#endif
-
#ifdef CONFIG_SCHED_MC
- /* Set up multi-core groups */
- for_each_cpu(i, cpu_map) {
- cpumask_and(this_core_map, cpu_coregroup_mask(i), cpu_map);
- if (i != cpumask_first(this_core_map))
- continue;
-
- init_sched_build_groups(this_core_map, cpu_map,
- &cpu_to_core_group,
- send_covered, tmpmask);
- }
+ case SD_LV_MC: /* set up multi-core groups */
+ cpumask_and(d->this_core_map, cpu_map, cpu_coregroup_mask(cpu));
+ if (cpu == cpumask_first(d->this_core_map))
+ init_sched_build_groups(d->this_core_map, cpu_map,
+ &cpu_to_core_group,
+ d->send_covered, d->tmpmask);
+ break;
#endif
-
- /* Set up physical groups */
- for (i = 0; i < nr_node_ids; i++) {
- cpumask_and(nodemask, cpumask_of_node(i), cpu_map);
- if (cpumask_empty(nodemask))
- continue;
-
- init_sched_build_groups(nodemask, cpu_map,
- &cpu_to_phys_group,
- send_covered, tmpmask);
- }
-
+ case SD_LV_CPU: /* set up physical groups */
+ cpumask_and(d->nodemask, cpumask_of_node(cpu), cpu_map);
+ if (!cpumask_empty(d->nodemask))
+ init_sched_build_groups(d->nodemask, cpu_map,
+ &cpu_to_phys_group,
+ d->send_covered, d->tmpmask);
+ break;
#ifdef CONFIG_NUMA
- /* Set up node groups */
- if (sd_allnodes) {
- init_sched_build_groups(cpu_map, cpu_map,
- &cpu_to_allnodes_group,
- send_covered, tmpmask);
+ case SD_LV_ALLNODES:
+ init_sched_build_groups(cpu_map, cpu_map, &cpu_to_allnodes_group,
+ d->send_covered, d->tmpmask);
+ break;
+#endif
+ default:
+ break;
}
+}
- for (i = 0; i < nr_node_ids; i++) {
- /* Set up node groups */
- struct sched_group *sg, *prev;
- int j;
-
- cpumask_clear(covered);
- cpumask_and(nodemask, cpumask_of_node(i), cpu_map);
- if (cpumask_empty(nodemask)) {
- sched_group_nodes[i] = NULL;
- continue;
- }
+/*
+ * Build sched domains for a given set of cpus and attach the sched domains
+ * to the individual cpus
+ */
+static int __build_sched_domains(const struct cpumask *cpu_map,
+ struct sched_domain_attr *attr)
+{
+ enum s_alloc alloc_state = sa_none;
+ struct s_data d;
+ struct sched_domain *sd;
+ int i;
+#ifdef CONFIG_NUMA
+ d.sd_allnodes = 0;
+#endif
- sched_domain_node_span(i, domainspan);
- cpumask_and(domainspan, domainspan, cpu_map);
+ alloc_state = __visit_domain_allocation_hell(&d, cpu_map);
+ if (alloc_state != sa_rootdomain)
+ goto error;
+ alloc_state = sa_sched_groups;
- sg = kmalloc_node(sizeof(struct sched_group) + cpumask_size(),
- GFP_KERNEL, i);
- if (!sg) {
- printk(KERN_WARNING "Can not alloc domain group for "
- "node %d\n", i);
- goto error;
- }
- sched_group_nodes[i] = sg;
- for_each_cpu(j, nodemask) {
- struct sched_domain *sd;
+ /*
+ * Set up domains for cpus specified by the cpu_map.
+ */
+ for_each_cpu(i, cpu_map) {
+ cpumask_and(d.nodemask, cpumask_of_node(cpu_to_node(i)),
+ cpu_map);
- sd = &per_cpu(node_domains, j).sd;
- sd->groups = sg;
- }
- sg->__cpu_power = 0;
- cpumask_copy(sched_group_cpus(sg), nodemask);
- sg->next = sg;
- cpumask_or(covered, covered, nodemask);
- prev = sg;
+ sd = __build_numa_sched_domains(&d, cpu_map, attr, i);
+ sd = __build_cpu_sched_domain(&d, cpu_map, attr, sd, i);
+ sd = __build_mc_sched_domain(&d, cpu_map, attr, sd, i);
+ sd = __build_smt_sched_domain(&d, cpu_map, attr, sd, i);
+ }
- for (j = 0; j < nr_node_ids; j++) {
- int n = (i + j) % nr_node_ids;
+ for_each_cpu(i, cpu_map) {
+ build_sched_groups(&d, SD_LV_SIBLING, cpu_map, i);
+ build_sched_groups(&d, SD_LV_MC, cpu_map, i);
+ }
- cpumask_complement(notcovered, covered);
- cpumask_and(tmpmask, notcovered, cpu_map);
- cpumask_and(tmpmask, tmpmask, domainspan);
- if (cpumask_empty(tmpmask))
- break;
+ /* Set up physical groups */
+ for (i = 0; i < nr_node_ids; i++)
+ build_sched_groups(&d, SD_LV_CPU, cpu_map, i);
- cpumask_and(tmpmask, tmpmask, cpumask_of_node(n));
- if (cpumask_empty(tmpmask))
- continue;
+#ifdef CONFIG_NUMA
+ /* Set up node groups */
+ if (d.sd_allnodes)
+ build_sched_groups(&d, SD_LV_ALLNODES, cpu_map, 0);
- sg = kmalloc_node(sizeof(struct sched_group) +
- cpumask_size(),
- GFP_KERNEL, i);
- if (!sg) {
- printk(KERN_WARNING
- "Can not alloc domain group for node %d\n", j);
- goto error;
- }
- sg->__cpu_power = 0;
- cpumask_copy(sched_group_cpus(sg), tmpmask);
- sg->next = prev->next;
- cpumask_or(covered, covered, tmpmask);
- prev->next = sg;
- prev = sg;
- }
- }
+ for (i = 0; i < nr_node_ids; i++)
+ if (build_numa_sched_groups(&d, cpu_map, i))
+ goto error;
#endif
/* Calculate CPU power for physical packages and nodes */
#ifdef CONFIG_SCHED_SMT
for_each_cpu(i, cpu_map) {
- struct sched_domain *sd = &per_cpu(cpu_domains, i).sd;
-
+ sd = &per_cpu(cpu_domains, i).sd;
init_sched_groups_power(i, sd);
}
#endif
#ifdef CONFIG_SCHED_MC
for_each_cpu(i, cpu_map) {
- struct sched_domain *sd = &per_cpu(core_domains, i).sd;
-
+ sd = &per_cpu(core_domains, i).sd;
init_sched_groups_power(i, sd);
}
#endif
for_each_cpu(i, cpu_map) {
- struct sched_domain *sd = &per_cpu(phys_domains, i).sd;
-
+ sd = &per_cpu(phys_domains, i).sd;
init_sched_groups_power(i, sd);
}
#ifdef CONFIG_NUMA
for (i = 0; i < nr_node_ids; i++)
- init_numa_sched_groups_power(sched_group_nodes[i]);
+ init_numa_sched_groups_power(d.sched_group_nodes[i]);
- if (sd_allnodes) {
+ if (d.sd_allnodes) {
struct sched_group *sg;
cpu_to_allnodes_group(cpumask_first(cpu_map), cpu_map, &sg,
- tmpmask);
+ d.tmpmask);
init_numa_sched_groups_power(sg);
}
#endif
/* Attach the domains */
for_each_cpu(i, cpu_map) {
- struct sched_domain *sd;
#ifdef CONFIG_SCHED_SMT
sd = &per_cpu(cpu_domains, i).sd;
#elif defined(CONFIG_SCHED_MC)
#else
sd = &per_cpu(phys_domains, i).sd;
#endif
- cpu_attach_domain(sd, rd, i);
+ cpu_attach_domain(sd, d.rd, i);
}
- err = 0;
-
-free_tmpmask:
- free_cpumask_var(tmpmask);
-free_send_covered:
- free_cpumask_var(send_covered);
-free_this_core_map:
- free_cpumask_var(this_core_map);
-free_this_sibling_map:
- free_cpumask_var(this_sibling_map);
-free_nodemask:
- free_cpumask_var(nodemask);
-free_notcovered:
-#ifdef CONFIG_NUMA
- free_cpumask_var(notcovered);
-free_covered:
- free_cpumask_var(covered);
-free_domainspan:
- free_cpumask_var(domainspan);
-out:
-#endif
- return err;
-
-free_sched_groups:
-#ifdef CONFIG_NUMA
- kfree(sched_group_nodes);
-#endif
- goto free_tmpmask;
+ d.sched_group_nodes = NULL; /* don't free this we still need it */
+ __free_domain_allocs(&d, sa_tmpmask, cpu_map);
+ return 0;
-#ifdef CONFIG_NUMA
error:
- free_sched_groups(cpu_map, tmpmask);
- free_rootdomain(rd);
- goto free_tmpmask;
-#endif
+ __free_domain_allocs(&d, alloc_state, cpu_map);
+ return -ENOMEM;
}
static int build_sched_domains(const struct cpumask *cpu_map)
* system cpu resource, based on the weight assigned to root
* user's cpu share (INIT_TASK_GROUP_LOAD). This is accomplished
* by letting tasks of init_task_group sit in a separate cfs_rq
- * (init_cfs_rq) and having one entity represent this group of
+ * (init_tg_cfs_rq) and having one entity represent this group of
* tasks in rq->cfs (i.e init_task_group->se[] != NULL).
*/
init_tg_cfs_entry(&init_task_group,
- &per_cpu(init_cfs_rq, i),
+ &per_cpu(init_tg_cfs_rq, i),
&per_cpu(init_sched_entity, i), i, 1,
root_task_group.se[i]);
#ifdef CONFIG_SMP
rq->sd = NULL;
rq->rd = NULL;
+ rq->post_schedule = 0;
rq->active_balance = 0;
rq->next_balance = jiffies;
rq->push_cpu = 0;
}
#ifdef CONFIG_DEBUG_SPINLOCK_SLEEP
-void __might_sleep(char *file, int line)
+static inline int preempt_count_equals(int preempt_offset)
+{
+ int nested = preempt_count() & ~PREEMPT_ACTIVE;
+
+ return (nested == PREEMPT_INATOMIC_BASE + preempt_offset);
+}
+
+void __might_sleep(char *file, int line, int preempt_offset)
{
#ifdef in_atomic
static unsigned long prev_jiffy; /* ratelimiting */
- if ((!in_atomic() && !irqs_disabled()) ||
- system_state != SYSTEM_RUNNING || oops_in_progress)
+ if ((preempt_count_equals(preempt_offset) && !irqs_disabled()) ||
+ system_state != SYSTEM_RUNNING || oops_in_progress)
return;
if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
return;
.subsys_id = cpuacct_subsys_id,
};
#endif /* CONFIG_CGROUP_CPUACCT */
+
+#ifndef CONFIG_SMP
+
+int rcu_expedited_torture_stats(char *page)
+{
+ return 0;
+}
+EXPORT_SYMBOL_GPL(rcu_expedited_torture_stats);
+
+void synchronize_sched_expedited(void)
+{
+}
+EXPORT_SYMBOL_GPL(synchronize_sched_expedited);
+
+#else /* #ifndef CONFIG_SMP */
+
+static DEFINE_PER_CPU(struct migration_req, rcu_migration_req);
+static DEFINE_MUTEX(rcu_sched_expedited_mutex);
+
+#define RCU_EXPEDITED_STATE_POST -2
+#define RCU_EXPEDITED_STATE_IDLE -1
+
+static int rcu_expedited_state = RCU_EXPEDITED_STATE_IDLE;
+
+int rcu_expedited_torture_stats(char *page)
+{
+ int cnt = 0;
+ int cpu;
+
+ cnt += sprintf(&page[cnt], "state: %d /", rcu_expedited_state);
+ for_each_online_cpu(cpu) {
+ cnt += sprintf(&page[cnt], " %d:%d",
+ cpu, per_cpu(rcu_migration_req, cpu).dest_cpu);
+ }
+ cnt += sprintf(&page[cnt], "\n");
+ return cnt;
+}
+EXPORT_SYMBOL_GPL(rcu_expedited_torture_stats);
+
+static long synchronize_sched_expedited_count;
+
+/*
+ * Wait for an rcu-sched grace period to elapse, but use "big hammer"
+ * approach to force grace period to end quickly. This consumes
+ * significant time on all CPUs, and is thus not recommended for
+ * any sort of common-case code.
+ *
+ * Note that it is illegal to call this function while holding any
+ * lock that is acquired by a CPU-hotplug notifier. Failing to
+ * observe this restriction will result in deadlock.
+ */
+void synchronize_sched_expedited(void)
+{
+ int cpu;
+ unsigned long flags;
+ bool need_full_sync = 0;
+ struct rq *rq;
+ struct migration_req *req;
+ long snap;
+ int trycount = 0;
+
+ smp_mb(); /* ensure prior mod happens before capturing snap. */
+ snap = ACCESS_ONCE(synchronize_sched_expedited_count) + 1;
+ get_online_cpus();
+ while (!mutex_trylock(&rcu_sched_expedited_mutex)) {
+ put_online_cpus();
+ if (trycount++ < 10)
+ udelay(trycount * num_online_cpus());
+ else {
+ synchronize_sched();
+ return;
+ }
+ if (ACCESS_ONCE(synchronize_sched_expedited_count) - snap > 0) {
+ smp_mb(); /* ensure test happens before caller kfree */
+ return;
+ }
+ get_online_cpus();
+ }
+ rcu_expedited_state = RCU_EXPEDITED_STATE_POST;
+ for_each_online_cpu(cpu) {
+ rq = cpu_rq(cpu);
+ req = &per_cpu(rcu_migration_req, cpu);
+ init_completion(&req->done);
+ req->task = NULL;
+ req->dest_cpu = RCU_MIGRATION_NEED_QS;
+ spin_lock_irqsave(&rq->lock, flags);
+ list_add(&req->list, &rq->migration_queue);
+ spin_unlock_irqrestore(&rq->lock, flags);
+ wake_up_process(rq->migration_thread);
+ }
+ for_each_online_cpu(cpu) {
+ rcu_expedited_state = cpu;
+ req = &per_cpu(rcu_migration_req, cpu);
+ rq = cpu_rq(cpu);
+ wait_for_completion(&req->done);
+ spin_lock_irqsave(&rq->lock, flags);
+ if (unlikely(req->dest_cpu == RCU_MIGRATION_MUST_SYNC))
+ need_full_sync = 1;
+ req->dest_cpu = RCU_MIGRATION_IDLE;
+ spin_unlock_irqrestore(&rq->lock, flags);
+ }
+ rcu_expedited_state = RCU_EXPEDITED_STATE_IDLE;
+ mutex_unlock(&rcu_sched_expedited_mutex);
+ put_online_cpus();
+ if (need_full_sync)
+ synchronize_sched();
+}
+EXPORT_SYMBOL_GPL(synchronize_sched_expedited);
+
+#endif /* #else #ifndef CONFIG_SMP */
#include <linux/ctype.h>
#include <linux/delay.h>
+#include <asm/setup.h>
+
#include "trace_output.h"
#define TRACE_SYSTEM "TRACE_SYSTEM"
LIST_HEAD(ftrace_events);
-int trace_define_field(struct ftrace_event_call *call, char *type,
- char *name, int offset, int size, int is_signed)
+int trace_define_field(struct ftrace_event_call *call, const char *type,
+ const char *name, int offset, int size, int is_signed,
+ int filter_type)
{
struct ftrace_event_field *field;
if (!field->type)
goto err;
+ if (filter_type == FILTER_OTHER)
+ field->filter_type = filter_assign_type(type);
+ else
+ field->filter_type = filter_type;
+
field->offset = offset;
field->size = size;
field->is_signed = is_signed;
+
list_add(&field->link, &call->fields);
return 0;
}
EXPORT_SYMBOL_GPL(trace_define_field);
+#define __common_field(type, item) \
+ ret = trace_define_field(call, #type, "common_" #item, \
+ offsetof(typeof(ent), item), \
+ sizeof(ent.item), \
+ is_signed_type(type), FILTER_OTHER); \
+ if (ret) \
+ return ret;
+
+int trace_define_common_fields(struct ftrace_event_call *call)
+{
+ int ret;
+ struct trace_entry ent;
+
+ __common_field(unsigned short, type);
+ __common_field(unsigned char, flags);
+ __common_field(unsigned char, preempt_count);
+ __common_field(int, pid);
+ __common_field(int, tgid);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(trace_define_common_fields);
+
#ifdef CONFIG_MODULES
static void trace_destroy_fields(struct ftrace_event_call *call)
if (call->enabled) {
call->enabled = 0;
tracing_stop_cmdline_record();
- call->unregfunc();
+ call->unregfunc(call->data);
}
break;
case 1:
if (!call->enabled) {
call->enabled = 1;
tracing_start_cmdline_record();
- call->regfunc();
+ call->regfunc(call->data);
}
break;
}
trace_seq_printf(s, "format:\n");
trace_write_header(s);
- r = call->show_format(s);
+ r = call->show_format(call, s);
if (!r) {
/*
* ug! The format output is bigger than a PAGE!!
/* First see if we did not already create this dir */
list_for_each_entry(system, &event_subsystems, list) {
- if (strcmp(system->name, name) == 0)
+ if (strcmp(system->name, name) == 0) {
+ system->nr_events++;
return system->entry;
+ }
}
/* need to create new entry */
return d_events;
}
+ system->nr_events = 1;
system->name = kstrdup(name, GFP_KERNEL);
if (!system->name) {
debugfs_remove(system->entry);
if (strcmp(call->system, TRACE_SYSTEM) != 0)
d_events = event_subsystem_dir(call->system, d_events);
- if (call->raw_init) {
- ret = call->raw_init();
- if (ret < 0) {
- pr_warning("Could not initialize trace point"
- " events/%s\n", call->name);
- return ret;
- }
- }
-
call->dir = debugfs_create_dir(call->name, d_events);
if (!call->dir) {
pr_warning("Could not create debugfs "
id);
if (call->define_fields) {
- ret = call->define_fields();
+ ret = call->define_fields(call);
if (ret < 0) {
pr_warning("Could not initialize trace point"
" events/%s\n", call->name);
struct file_operations filter;
};
+static void remove_subsystem_dir(const char *name)
+{
+ struct event_subsystem *system;
+
+ if (strcmp(name, TRACE_SYSTEM) == 0)
+ return;
+
+ list_for_each_entry(system, &event_subsystems, list) {
+ if (strcmp(system->name, name) == 0) {
+ if (!--system->nr_events) {
+ struct event_filter *filter = system->filter;
+
+ debugfs_remove_recursive(system->entry);
+ list_del(&system->list);
+ if (filter) {
+ kfree(filter->filter_string);
+ kfree(filter);
+ }
+ kfree(system->name);
+ kfree(system);
+ }
+ break;
+ }
+ }
+}
+
static struct ftrace_module_file_ops *
trace_create_file_ops(struct module *mod)
{
struct ftrace_module_file_ops *file_ops = NULL;
struct ftrace_event_call *call, *start, *end;
struct dentry *d_events;
+ int ret;
start = mod->trace_events;
end = mod->trace_events + mod->num_trace_events;
/* The linker may leave blanks */
if (!call->name)
continue;
-
+ if (call->raw_init) {
+ ret = call->raw_init();
+ if (ret < 0) {
+ if (ret != -ENOSYS)
+ pr_warning("Could not initialize trace "
+ "point events/%s\n", call->name);
+ continue;
+ }
+ }
/*
* This module has events, create file ops for this module
* if not already done.
list_del(&call->list);
trace_destroy_fields(call);
destroy_preds(call);
+ remove_subsystem_dir(call->system);
}
}
extern struct ftrace_event_call __start_ftrace_events[];
extern struct ftrace_event_call __stop_ftrace_events[];
+static char bootup_event_buf[COMMAND_LINE_SIZE] __initdata;
+
+static __init int setup_trace_event(char *str)
+{
+ strlcpy(bootup_event_buf, str, COMMAND_LINE_SIZE);
+ ring_buffer_expanded = 1;
+ tracing_selftest_disabled = 1;
+
+ return 1;
+}
+__setup("trace_event=", setup_trace_event);
+
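With this in place, the events to enable at boot can be passed on the kernel command line and are split on commas by the strsep() loop added to event_trace_init() below, e.g. (event names illustrative):

	trace_event=sched_wakeup,sched_switch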
static __init int event_trace_init(void)
{
struct ftrace_event_call *call;
struct dentry *entry;
struct dentry *d_events;
int ret;
+ char *buf = bootup_event_buf;
+ char *token;
d_tracer = tracing_init_dentry();
if (!d_tracer)
/* The linker may leave blanks */
if (!call->name)
continue;
+ if (call->raw_init) {
+ ret = call->raw_init();
+ if (ret < 0) {
+ if (ret != -ENOSYS)
+ pr_warning("Could not initialize trace "
+ "point events/%s\n", call->name);
+ continue;
+ }
+ }
list_add(&call->list, &ftrace_events);
event_create_dir(call, d_events, &ftrace_event_id_fops,
&ftrace_enable_fops, &ftrace_event_filter_fops,
&ftrace_event_format_fops);
}
+ while (true) {
+ token = strsep(&buf, ",");
+
+ if (!token)
+ break;
+ if (!*token)
+ continue;
+
+ ret = ftrace_set_clr_event(token, 1);
+ if (ret)
+ pr_warning("Failed to enable trace event: %s\n", token);
+ }
+
ret = register_module_notifier(&trace_module_nb);
if (ret)
pr_warning("Failed to register trace events module notifier\n");
#ifdef CONFIG_FUNCTION_TRACER
- static DEFINE_PER_CPU(atomic_t, test_event_disable);
+ static DEFINE_PER_CPU(atomic_t, ftrace_test_event_disable);
static void
function_test_events_call(unsigned long ip, unsigned long parent_ip)
{
struct ring_buffer_event *event;
+ struct ring_buffer *buffer;
struct ftrace_entry *entry;
unsigned long flags;
long disabled;
pc = preempt_count();
resched = ftrace_preempt_disable();
cpu = raw_smp_processor_id();
- disabled = atomic_inc_return(&per_cpu(test_event_disable, cpu));
+ disabled = atomic_inc_return(&per_cpu(ftrace_test_event_disable, cpu));
if (disabled != 1)
goto out;
local_save_flags(flags);
- event = trace_current_buffer_lock_reserve(TRACE_FN, sizeof(*entry),
+ event = trace_current_buffer_lock_reserve(&buffer,
+ TRACE_FN, sizeof(*entry),
flags, pc);
if (!event)
goto out;
entry->ip = ip;
entry->parent_ip = parent_ip;
- trace_nowake_buffer_unlock_commit(event, flags, pc);
+ trace_nowake_buffer_unlock_commit(buffer, event, flags, pc);
out:
- atomic_dec(&per_cpu(test_event_disable, cpu));
+ atomic_dec(&per_cpu(ftrace_test_event_disable, cpu));
ftrace_preempt_enable(resched);
}
static __init int event_trace_self_tests_init(void)
{
-
- event_trace_self_tests();
-
- event_trace_self_test_with_function();
+ if (!tracing_selftest_disabled) {
+ event_trace_self_tests();
+ event_trace_self_test_with_function();
+ }
return 0;
}
This is a relatively cheap check but if you care about maximum
performance, say N.
+config DEBUG_CREDENTIALS
+ bool "Debug credential management"
+ depends on DEBUG_KERNEL
+ help
+ Enable this to turn on some debug checking for credential
+ management. The additional code keeps track of the number of
+ pointers from task_structs to any given cred struct, and checks to
+ see that this number never exceeds the usage count of the cred
+ struct.
+
+ Furthermore, if SELinux is enabled, this also checks that the
+ security pointer in the cred struct is never seen to be invalid.
+
+ If unsure, say N.
+
#
# Select this config option from the architecture Kconfig, if it
# is preferred to always offer frame pointers as a config
config RCU_CPU_STALL_DETECTOR
bool "Check for stalled CPUs delaying RCU grace periods"
- depends on CLASSIC_RCU || TREE_RCU
+ depends on TREE_RCU || TREE_PREEMPT_RCU
default n
help
This option causes RCU to printk information on which
Say N if you are unsure.
+ config DEBUG_FORCE_WEAK_PER_CPU
+ bool "Force weak per-cpu definitions"
+ depends on DEBUG_KERNEL
+ help
+ s390 and alpha require percpu variables in modules to be
+ defined weak to work around an addressing range issue, which
+ puts the following two restrictions on percpu variable
+ definitions:
+
+ 1. percpu symbols must be unique whether static or not
+ 2. percpu variables can't be defined inside a function
+
+ To ensure that generic code follows the above rules, this
+ option forces all percpu variables to be defined as weak.
+
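Several later hunks in this series follow directly from these rules (the ftrace_test_event_disable rename and the move of bdp_ratelimits to file scope below are examples). A minimal sketch of what the rules allow and forbid, using made-up names:

        #include <linux/percpu.h>

        /* OK: file-scope definition; even though it is static, the symbol
         * name must be unique kernel-wide (rule 1). */
        static DEFINE_PER_CPU(unsigned long, foo_counter);

        void foo_bump(void)
        {
                /* Not OK (rule 2): a percpu variable may not be defined
                 * inside a function, e.g.
                 *      static DEFINE_PER_CPU(unsigned long, bar_counter);
                 */
                get_cpu_var(foo_counter)++;
                put_cpu_var(foo_counter);
        }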
config LKDTM
tristate "Linux Kernel Dump Test Tool Module"
depends on DEBUG_KERNEL
vmalloc.o
obj-y := bootmem.o filemap.o mempool.o oom_kill.o fadvise.o \
- maccess.o page_alloc.o page-writeback.o pdflush.o \
+ maccess.o page_alloc.o page-writeback.o \
readahead.o swap.o truncate.o vmscan.o shmem.o \
prio_tree.o util.o mmzone.o vmstat.o backing-dev.o \
page_isolation.o mm_init.o $(mmu-y)
obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o
obj-$(CONFIG_FS_XIP) += filemap_xip.o
obj-$(CONFIG_MIGRATION) += migrate.o
- ifdef CONFIG_HAVE_DYNAMIC_PER_CPU_AREA
+ ifndef CONFIG_HAVE_LEGACY_PER_CPU_AREA
obj-$(CONFIG_SMP) += percpu.o
else
obj-$(CONFIG_SMP) += allocpercpu.o
#include <linux/buffer_head.h>
#include <linux/pagevec.h>
-/*
- * The maximum number of pages to writeout in a single bdflush/kupdate
- * operation. We do this so we don't hold I_SYNC against an inode for
- * enormous amounts of time, which would block a userspace task which has
- * been forced to throttle against that inode. Also, the code reevaluates
- * the dirty each time it has written this many pages.
- */
-#define MAX_WRITEBACK_PAGES 1024
-
/*
* After a CPU has dirtied this many pages, balance_dirty_pages_ratelimited
* will look to see if it needs to force writeback or throttling.
/* End of sysctl-exported parameters */
-static void background_writeout(unsigned long _min_pages);
-
/*
* Scale the writeback cache size proportional to the relative writeout speeds.
*
/*
*
*/
-static DEFINE_SPINLOCK(bdi_lock);
static unsigned int bdi_min_ratio;
int bdi_set_min_ratio(struct backing_dev_info *bdi, unsigned int min_ratio)
{
int ret = 0;
- unsigned long flags;
- spin_lock_irqsave(&bdi_lock, flags);
+ spin_lock(&bdi_lock);
if (min_ratio > bdi->max_ratio) {
ret = -EINVAL;
} else {
ret = -EINVAL;
}
}
- spin_unlock_irqrestore(&bdi_lock, flags);
+ spin_unlock(&bdi_lock);
return ret;
}
int bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned max_ratio)
{
- unsigned long flags;
int ret = 0;
if (max_ratio > 100)
return -EINVAL;
- spin_lock_irqsave(&bdi_lock, flags);
+ spin_lock(&bdi_lock);
if (bdi->min_ratio > max_ratio) {
ret = -EINVAL;
} else {
bdi->max_ratio = max_ratio;
bdi->max_prop_frac = (PROP_FRAC_BASE * max_ratio) / 100;
}
- spin_unlock_irqrestore(&bdi_lock, flags);
+ spin_unlock(&bdi_lock);
return ret;
}
* up.
*/
if (bdi_nr_reclaimable > bdi_thresh) {
- writeback_inodes(&wbc);
+ writeback_inodes_wbc(&wbc);
pages_written += write_chunk - wbc.nr_to_write;
get_dirty_limits(&background_thresh, &dirty_thresh,
&bdi_thresh, bdi);
if (pages_written >= write_chunk)
break; /* We've done our duty */
- congestion_wait(BLK_RW_ASYNC, HZ/10);
+ schedule_timeout(1);
}
if (bdi_nr_reclaimable + bdi_nr_writeback < bdi_thresh &&
* background_thresh, to keep the amount of dirty memory low.
*/
if ((laptop_mode && pages_written) ||
- (!laptop_mode && (global_page_state(NR_FILE_DIRTY)
- + global_page_state(NR_UNSTABLE_NFS)
- > background_thresh)))
- pdflush_operation(background_writeout, 0);
+ (!laptop_mode && ((nr_writeback = global_page_state(NR_FILE_DIRTY)
+ + global_page_state(NR_UNSTABLE_NFS))
+ > background_thresh))) {
+ struct writeback_control wbc = {
+ .bdi = bdi,
+ .sync_mode = WB_SYNC_NONE,
+ .nr_to_write = nr_writeback,
+ };
+
+ bdi_start_writeback(&wbc);
+ }
}
void set_page_dirty_balance(struct page *page, int page_mkwrite)
}
}
+ static DEFINE_PER_CPU(unsigned long, bdp_ratelimits) = 0;
+
/**
* balance_dirty_pages_ratelimited_nr - balance dirty memory state
* @mapping: address_space which was dirtied
void balance_dirty_pages_ratelimited_nr(struct address_space *mapping,
unsigned long nr_pages_dirtied)
{
- static DEFINE_PER_CPU(unsigned long, ratelimits) = 0;
unsigned long ratelimit;
unsigned long *p;
* tasks in balance_dirty_pages(). Period.
*/
preempt_disable();
- p = &__get_cpu_var(ratelimits);
+ p = &__get_cpu_var(bdp_ratelimits);
*p += nr_pages_dirtied;
if (unlikely(*p >= ratelimit)) {
*p = 0;
}
}
-/*
- * writeback at least _min_pages, and keep writing until the amount of dirty
- * memory is less than the background threshold, or until we're all clean.
- */
-static void background_writeout(unsigned long _min_pages)
-{
- long min_pages = _min_pages;
- struct writeback_control wbc = {
- .bdi = NULL,
- .sync_mode = WB_SYNC_NONE,
- .older_than_this = NULL,
- .nr_to_write = 0,
- .nonblocking = 1,
- .range_cyclic = 1,
- };
-
- for ( ; ; ) {
- unsigned long background_thresh;
- unsigned long dirty_thresh;
-
- get_dirty_limits(&background_thresh, &dirty_thresh, NULL, NULL);
- if (global_page_state(NR_FILE_DIRTY) +
- global_page_state(NR_UNSTABLE_NFS) < background_thresh
- && min_pages <= 0)
- break;
- wbc.more_io = 0;
- wbc.encountered_congestion = 0;
- wbc.nr_to_write = MAX_WRITEBACK_PAGES;
- wbc.pages_skipped = 0;
- writeback_inodes(&wbc);
- min_pages -= MAX_WRITEBACK_PAGES - wbc.nr_to_write;
- if (wbc.nr_to_write > 0 || wbc.pages_skipped > 0) {
- /* Wrote less than expected */
- if (wbc.encountered_congestion || wbc.more_io)
- congestion_wait(BLK_RW_ASYNC, HZ/10);
- else
- break;
- }
- }
-}
-
-/*
- * Start writeback of `nr_pages' pages. If `nr_pages' is zero, write back
- * the whole world. Returns 0 if a pdflush thread was dispatched. Returns
- * -1 if all pdflush threads were busy.
- */
-int wakeup_pdflush(long nr_pages)
-{
- if (nr_pages == 0)
- nr_pages = global_page_state(NR_FILE_DIRTY) +
- global_page_state(NR_UNSTABLE_NFS);
- return pdflush_operation(background_writeout, nr_pages);
-}
-
-static void wb_timer_fn(unsigned long unused);
static void laptop_timer_fn(unsigned long unused);
-static DEFINE_TIMER(wb_timer, wb_timer_fn, 0, 0);
static DEFINE_TIMER(laptop_mode_wb_timer, laptop_timer_fn, 0, 0);
-/*
- * Periodic writeback of "old" data.
- *
- * Define "old": the first time one of an inode's pages is dirtied, we mark the
- * dirtying-time in the inode's address_space. So this periodic writeback code
- * just walks the superblock inode list, writing back any inodes which are
- * older than a specific point in time.
- *
- * Try to run once per dirty_writeback_interval. But if a writeback event
- * takes longer than a dirty_writeback_interval interval, then leave a
- * one-second gap.
- *
- * older_than_this takes precedence over nr_to_write. So we'll only write back
- * all dirty pages if they are all attached to "old" mappings.
- */
-static void wb_kupdate(unsigned long arg)
-{
- unsigned long oldest_jif;
- unsigned long start_jif;
- unsigned long next_jif;
- long nr_to_write;
- struct writeback_control wbc = {
- .bdi = NULL,
- .sync_mode = WB_SYNC_NONE,
- .older_than_this = &oldest_jif,
- .nr_to_write = 0,
- .nonblocking = 1,
- .for_kupdate = 1,
- .range_cyclic = 1,
- };
-
- sync_supers();
-
- oldest_jif = jiffies - msecs_to_jiffies(dirty_expire_interval * 10);
- start_jif = jiffies;
- next_jif = start_jif + msecs_to_jiffies(dirty_writeback_interval * 10);
- nr_to_write = global_page_state(NR_FILE_DIRTY) +
- global_page_state(NR_UNSTABLE_NFS) +
- (inodes_stat.nr_inodes - inodes_stat.nr_unused);
- while (nr_to_write > 0) {
- wbc.more_io = 0;
- wbc.encountered_congestion = 0;
- wbc.nr_to_write = MAX_WRITEBACK_PAGES;
- writeback_inodes(&wbc);
- if (wbc.nr_to_write > 0) {
- if (wbc.encountered_congestion || wbc.more_io)
- congestion_wait(BLK_RW_ASYNC, HZ/10);
- else
- break; /* All the old data is written */
- }
- nr_to_write -= MAX_WRITEBACK_PAGES - wbc.nr_to_write;
- }
- if (time_before(next_jif, jiffies + HZ))
- next_jif = jiffies + HZ;
- if (dirty_writeback_interval)
- mod_timer(&wb_timer, next_jif);
-}
-
/*
* sysctl handler for /proc/sys/vm/dirty_writeback_centisecs
*/
struct file *file, void __user *buffer, size_t *length, loff_t *ppos)
{
proc_dointvec(table, write, file, buffer, length, ppos);
- if (dirty_writeback_interval)
- mod_timer(&wb_timer, jiffies +
- msecs_to_jiffies(dirty_writeback_interval * 10));
- else
- del_timer(&wb_timer);
return 0;
}
-static void wb_timer_fn(unsigned long unused)
+static void do_laptop_sync(struct work_struct *work)
{
- if (pdflush_operation(wb_kupdate, 0) < 0)
- mod_timer(&wb_timer, jiffies + HZ); /* delay 1 second */
-}
-
-static void laptop_flush(unsigned long unused)
-{
- sys_sync();
+ wakeup_flusher_threads(0);
+ kfree(work);
}
static void laptop_timer_fn(unsigned long unused)
{
- pdflush_operation(laptop_flush, 0);
+ struct work_struct *work;
+
+ work = kmalloc(sizeof(*work), GFP_ATOMIC);
+ if (work) {
+ INIT_WORK(work, do_laptop_sync);
+ schedule_work(work);
+ }
}
/*
{
int shift;
- mod_timer(&wb_timer,
- jiffies + msecs_to_jiffies(dirty_writeback_interval * 10));
writeback_set_ratelimit();
register_cpu_notifier(&ratelimit_nb);
#define DEBUG_DEFAULT_FLAGS (SLAB_DEBUG_FREE | SLAB_RED_ZONE | \
SLAB_POISON | SLAB_STORE_USER)
+/*
+ * Debugging flags that require metadata to be stored in the slab. These get
+ * disabled when slub_debug=O is used and a cache's min order increases with
+ * metadata.
+ */
+#define DEBUG_METADATA_FLAGS (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER)
+
/*
* Set of flags that will prevent slab merging
*/
#endif
static char *slub_debug_slabs;
+static int disable_higher_order_debug;
/*
* Object debugging
slab_err(s, page, "Padding overwritten. 0x%p-0x%p", fault, end - 1);
print_section("Padding", end - remainder, remainder);
- restore_bytes(s, "slab padding", POISON_INUSE, start, end);
+ restore_bytes(s, "slab padding", POISON_INUSE, end - remainder, end);
return 0;
}
*/
goto check_slabs;
+ if (tolower(*str) == 'o') {
+ /*
+ * Avoid enabling debugging on caches if their minimum order
+ * would increase as a result.
+ */
+ disable_higher_order_debug = 1;
+ goto out;
+ }
+
slub_debug = 0;
if (*str == '-')
/*
* Enable debugging if selected on the kernel commandline.
*/
if (slub_debug && (!slub_debug_slabs ||
- strncmp(slub_debug_slabs, name, strlen(slub_debug_slabs)) == 0))
- flags |= slub_debug;
+ !strncmp(slub_debug_slabs, name, strlen(slub_debug_slabs))))
+ flags |= slub_debug;
return flags;
}
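Taken together with the existing flag/slab-list syntax of slub_debug, the new 'O' option parsed above gives command lines along these lines (the cache name is only an example); the warning added below points users at the same option:

        slub_debug=O            default debugging, but dropped on caches whose
                                minimum order would otherwise grow
        slub_debug=FZ,dentry    sanity checks plus red zoning, restricted to
                                the dentry cache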
}
if (kmemcheck_enabled
- && !(s->flags & (SLAB_NOTRACK | DEBUG_DEFAULT_FLAGS)))
- {
+ && !(s->flags & (SLAB_NOTRACK | DEBUG_DEFAULT_FLAGS))) {
int pages = 1 << oo_order(oo);
kmemcheck_alloc_shadow(page, oo_order(oo), flags, node);
"default order: %d, min order: %d\n", s->name, s->objsize,
s->size, oo_order(s->oo), oo_order(s->min));
+ if (oo_order(s->min) > get_order(s->objsize))
+ printk(KERN_WARNING " %s debugging increased min order, use "
+ "slub_debug=O to disable.\n", s->name);
+
for_each_online_node(node) {
struct kmem_cache_node *n = get_node(s, node);
unsigned long nr_slabs;
return order;
fraction /= 2;
}
- min_objects --;
+ min_objects--;
}
/*
*/
#define NR_KMEM_CACHE_CPU 100
- static DEFINE_PER_CPU(struct kmem_cache_cpu,
- kmem_cache_cpu)[NR_KMEM_CACHE_CPU];
+ static DEFINE_PER_CPU(struct kmem_cache_cpu [NR_KMEM_CACHE_CPU],
+ kmem_cache_cpu);
static DEFINE_PER_CPU(struct kmem_cache_cpu *, kmem_cache_cpu_free);
static DECLARE_BITMAP(kmem_cach_cpu_free_init_once, CONFIG_NR_CPUS);
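A side note on the reshaped definition above: the array dimension now sits inside the type argument of DEFINE_PER_CPU(), so the percpu symbol itself carries the array type and the usual accessors index into it. A sketch of the resulting access pattern:

        /* element i of this CPU's NR_KMEM_CACHE_CPU-sized array */
        struct kmem_cache_cpu *c = &per_cpu(kmem_cache_cpu, cpu)[i];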
* on bootup.
*/
align = calculate_alignment(flags, align, s->objsize);
+ s->align = align;
/*
* SLUB stores one object immediately after another beginning from
if (!calculate_sizes(s, -1))
goto error;
+ if (disable_higher_order_debug) {
+ /*
+ * Disable debugging flags that store metadata if the min slab
+ * order increased.
+ */
+ if (get_order(s->size) > get_order(s->objsize)) {
+ s->flags &= ~DEBUG_METADATA_FLAGS;
+ s->offset = 0;
+ if (!calculate_sizes(s, -1))
+ goto error;
+ }
+ }
/*
* The larger the object size is, the more pages we want on the partial
*/
void kmem_cache_destroy(struct kmem_cache *s)
{
- if (s->flags & SLAB_DESTROY_BY_RCU)
- rcu_barrier();
down_write(&slub_lock);
s->refcount--;
if (!s->refcount) {
"still has objects.\n", s->name, __func__);
dump_stack();
}
+ if (s->flags & SLAB_DESTROY_BY_RCU)
+ rcu_barrier();
sysfs_slab_remove(s);
} else
up_write(&slub_lock);
2 /* 192 */
};
+static inline int size_index_elem(size_t bytes)
+{
+ return (bytes - 1) / 8;
+}
+
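The helper merely centralises the open-coded (size - 1) / 8 mapping from a request size to a size_index slot; worked out from the formula:

        size_index_elem(8)  == 0     /* sizes 1..8   share slot 0  */
        size_index_elem(96) == 11    /* sizes 89..96 share slot 11 */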
static struct kmem_cache *get_slab(size_t size, gfp_t flags)
{
int index;
if (!size)
return ZERO_SIZE_PTR;
- index = size_index[(size - 1) / 8];
+ index = size_index[size_index_elem(size)];
} else
index = fls(size - 1);
slab_state = PARTIAL;
/* Caches that are not of the two-to-the-power-of size */
- if (KMALLOC_MIN_SIZE <= 64) {
+ if (KMALLOC_MIN_SIZE <= 32) {
create_kmalloc_cache(&kmalloc_caches[1],
"kmalloc-96", 96, GFP_NOWAIT);
caches++;
+ }
+ if (KMALLOC_MIN_SIZE <= 64) {
create_kmalloc_cache(&kmalloc_caches[2],
"kmalloc-192", 192, GFP_NOWAIT);
caches++;
BUILD_BUG_ON(KMALLOC_MIN_SIZE > 256 ||
(KMALLOC_MIN_SIZE & (KMALLOC_MIN_SIZE - 1)));
- for (i = 8; i < KMALLOC_MIN_SIZE; i += 8)
- size_index[(i - 1) / 8] = KMALLOC_SHIFT_LOW;
+ for (i = 8; i < KMALLOC_MIN_SIZE; i += 8) {
+ int elem = size_index_elem(i);
+ if (elem >= ARRAY_SIZE(size_index))
+ break;
+ size_index[elem] = KMALLOC_SHIFT_LOW;
+ }
- if (KMALLOC_MIN_SIZE == 128) {
+ if (KMALLOC_MIN_SIZE == 64) {
+ /*
+ * The 96 byte sized cache is not used if the alignment
+ * is 64 bytes.
+ */
+ for (i = 64 + 8; i <= 96; i += 8)
+ size_index[size_index_elem(i)] = 7;
+ } else if (KMALLOC_MIN_SIZE == 128) {
/*
* The 192 byte sized cache is not used if the alignment
* is 128 bytes. Redirect kmalloc to use the 256 byte cache
* instead.
*/
for (i = 128 + 8; i <= 192; i += 8)
- size_index[(i - 1) / 8] = 8;
+ size_index[size_index_elem(i)] = 8;
}
slab_state = UP;
}
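A worked example of the redirection above, under the stated alignment assumptions: with KMALLOC_MIN_SIZE == 64 the 96-byte cache is unused, so requests of 65..96 bytes are pointed at index 7, the 128-byte general cache; with KMALLOC_MIN_SIZE == 128 the 192-byte cache is unused and 129..192-byte requests fall through to index 8, the 256-byte cache:

        /* KMALLOC_MIN_SIZE == 64 */
        kmalloc(80, GFP_KERNEL);    /* size_index slot set to 7  -> 128-byte cache */

        /* KMALLOC_MIN_SIZE == 128 */
        kmalloc(160, GFP_KERNEL);   /* size_index slot set to 8  -> 256-byte cache */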
err = sysfs_create_group(&s->kobj, &slab_attr_group);
- if (err)
+ if (err) {
+ kobject_del(&s->kobj);
+ kobject_put(&s->kobj);
return err;
+ }
kobject_uevent(&s->kobj, KOBJ_ADD);
if (!unmergeable) {
/* Setup first alias */
static int __init slab_proc_init(void)
{
- proc_create("slabinfo",S_IWUSR|S_IRUGO,NULL,&proc_slabinfo_operations);
+ proc_create("slabinfo", S_IRUGO, NULL, &proc_slabinfo_operations);
return 0;
}
module_init(slab_proc_init);
#include "rds.h"
#include "ib.h"
- DEFINE_PER_CPU(struct rds_ib_statistics, rds_ib_stats) ____cacheline_aligned;
+ DEFINE_PER_CPU_SHARED_ALIGNED(struct rds_ib_statistics, rds_ib_stats);
-static char *rds_ib_stat_names[] = {
+static const char *const rds_ib_stat_names[] = {
"ib_connect_raced",
"ib_listen_closed_stale",
"ib_tx_cq_call",
#include "rds.h"
#include "iw.h"
- DEFINE_PER_CPU(struct rds_iw_statistics, rds_iw_stats) ____cacheline_aligned;
+ DEFINE_PER_CPU_SHARED_ALIGNED(struct rds_iw_statistics, rds_iw_stats);
-static char *rds_iw_stat_names[] = {
+static const char *const rds_iw_stat_names[] = {
"iw_connect_raced",
"iw_listen_closed_stale",
"iw_tx_cq_call",
unsigned long r_offset;
};
- DEFINE_PER_CPU(struct rds_page_remainder, rds_page_remainders) ____cacheline_aligned;
+ DEFINE_PER_CPU_SHARED_ALIGNED(struct rds_page_remainder, rds_page_remainders);
/*
* returns 0 on success or -errno on failure.
return 0;
}
+EXPORT_SYMBOL_GPL(rds_page_copy_user);
/*
* Message allocation uses this to build up regions of a message.