Merge git://git.kernel.org/pub/scm/linux/kernel/git/davem/net-next
author     Linus Torvalds <torvalds@linux-foundation.org>
           Wed, 4 Nov 2015 17:41:05 +0000 (09:41 -0800)
committer  Linus Torvalds <torvalds@linux-foundation.org>
           Wed, 4 Nov 2015 17:41:05 +0000 (09:41 -0800)
Pull networking updates from David Miller:

Changes of note:

 1) Allow scheduling of ICMP packets in IPVS, from Alex Gartrell.

 2) Provide FIB table ID in ipv4 route dumps just as ipv6 does, from
    David Ahern.

 3) Allow the user to ask for the statistics to be filtered out of
    ipv4/ipv6 address netlink dumps.  From Sowmini Varadhan.

 4) More work to pass the network namespace context around deep into
    various packet path APIs, starting with the netfilter hooks.  From
    Eric W Biederman.

 5) Add layer 2 TX/RX checksum offloading to qeth driver, from Thomas
    Richter.

 6) Use usec resolution for SYN/ACK RTTs in TCP, from Yuchung Cheng.

 7) Support Very High Throughput in wireless MESH code, from Bob
    Copeland.

 8) Allow setting the ageing_time in switchdev/rocker.  From Scott
    Feldman.

 9) Properly autoload L2TP type modules, from Stephen Hemminger.

10) Fix and enable offload features by default in 8139cp driver, from
    David Woodhouse.

11) Support both ipv4 and ipv6 sockets in a single vxlan device, from
    Jiri Benc.

12) Fix CWND limiting of thin streams in TCP, from Bendik Rønning
    Opstad.

13) Fix IPSEC flowcache overflows on large systems, from Steffen
    Klassert.

14) Convert bridging to track VLANs using rhashtable entries rather than
    a bitmap.  From Nikolay Aleksandrov.

15) Make TCP listener handling completely lockless; this is a major
    accomplishment.  Incoming request sockets now live in the
    established hash table just like any other socket.

    From Eric Dumazet.

16) Provide more bridging attributes to netlink, from Nikolay
    Aleksandrov.

17) Use a hash-based algorithm for ipv4 multipath routing; this was
    very long overdue.  From Peter Nørlund.

18) Several y2038 cures, mostly avoiding timespec.  From Arnd Bergmann.

19) Allow non-root execution of EBPF programs, from Alexei Starovoitov.

20) Support SO_INCOMING_CPU as setsockopt, from Eric Dumazet.  This
    influences the port binding selection logic used by SO_REUSEPORT;
    a minimal usage sketch follows this list.

21) Add ipv6 support to VRF, from David Ahern.

22) Add support for Mellanox Spectrum switch ASIC, from Jiri Pirko.

23) Add rtl8xxxu Realtek wireless driver, from Jes Sorensen.

24) Implement RACK loss recovery in TCP, from Yuchung Cheng.

25) Support multipath routes in MPLS, from Roopa Prabhu.

26) Fix POLLOUT notification for listening sockets in AF_UNIX, from Eric
    Dumazet.

27) Add new QED QLogic driver, from Yuval Mintz, Manish Chopra, and
    Sudarsana Kalluru.

28) Don't fetch timestamps on AF_UNIX sockets, from Hannes Frederic
    Sowa.

29) Support ipv6 geneve tunnels, from John W Linville.

30) Add flood control support to switchdev layer, from Ido Schimmel.

31) Fix CHECKSUM_PARTIAL handling of potentially fragmented frames, from
    Hannes Frederic Sowa.

32) Support persistent maps and progs in bpf, from Daniel Borkmann; a
    pinning sketch follows this list.
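
As a minimal, hedged illustration of item 20 above (this sketch is not
taken from the merged patches): a SO_REUSEPORT TCP listener that also
sets SO_INCOMING_CPU, so reuseport selection can prefer the listener
whose value matches the CPU that received the packet.  The port and
CPU numbers are arbitrary, and the SO_INCOMING_CPU constant needs
userspace headers from this kernel generation.

  /* Sketch only: SO_REUSEPORT listener pinned to CPU 0. */
  #include <arpa/inet.h>
  #include <netinet/in.h>
  #include <sys/socket.h>
  #include <unistd.h>

  int main(void)
  {
          int one = 1, cpu = 0;   /* prefer this listener on CPU 0 */
          struct sockaddr_in addr = {
                  .sin_family = AF_INET,
                  .sin_port = htons(8080),
                  .sin_addr.s_addr = htonl(INADDR_ANY),
          };
          int fd = socket(AF_INET, SOCK_STREAM, 0);

          if (fd < 0)
                  return 1;
          setsockopt(fd, SOL_SOCKET, SO_REUSEPORT, &one, sizeof(one));
          /* With this series, SO_INCOMING_CPU is also settable;
           * reuseport lookup then favours the listener whose value
           * matches the CPU handling the incoming packet. */
          setsockopt(fd, SOL_SOCKET, SO_INCOMING_CPU, &cpu, sizeof(cpu));
          if (bind(fd, (struct sockaddr *)&addr, sizeof(addr)) ||
              listen(fd, 128))
                  return 1;
          /* accept() loop would follow */
          close(fd);
          return 0;
  }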
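
Similarly hedged, for item 32: pinning an already loaded map or program
file descriptor with the new BPF_OBJ_PIN command so it survives the
loading process.  The path is only an example location on the new bpf
pseudo filesystem, and __NR_bpf availability depends on the arch
headers in use.

  /* Sketch only: pin an existing bpf fd (map or prog) to a path such
   * as /sys/fs/bpf/my_map; BPF_OBJ_GET reopens it later. */
  #include <linux/bpf.h>
  #include <stdint.h>
  #include <string.h>
  #include <sys/syscall.h>
  #include <unistd.h>

  static int bpf_obj_pin(int fd, const char *path)
  {
          union bpf_attr attr;

          memset(&attr, 0, sizeof(attr));
          attr.pathname = (uint64_t)(unsigned long)path;
          attr.bpf_fd   = fd;
          return syscall(__NR_bpf, BPF_OBJ_PIN, &attr, sizeof(attr));
  }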

* git://git.kernel.org/pub/scm/linux/kernel/git/davem/net-next: (1790 commits)
  sh_eth: use DMA barriers
  switchdev: respect SKIP_EOPNOTSUPP flag in case there is no recursion
  net: sched: kill dead code in sch_choke.c
  irda: Delete an unnecessary check before the function call "irlmp_unregister_service"
  net: dsa: mv88e6xxx: include DSA ports in VLANs
  net: dsa: mv88e6xxx: disable SA learning for DSA and CPU ports
  net/core: fix for_each_netdev_feature
  vlan: Invoke driver vlan hooks only if device is present
  arcnet/com20020: add LEDS_CLASS dependency
  bpf, verifier: annotate verbose printer with __printf
  dp83640: Only wait for timestamps for packets with timestamping enabled.
  ptp: Change ptp_class to a proper bitmask
  dp83640: Prune rx timestamp list before reading from it
  dp83640: Delay scheduled work.
  dp83640: Include hash in timestamp/packet matching
  ipv6: fix tunnel error handling
  net/mlx5e: Fix LSO vlan insertion
  net/mlx5e: Re-eanble client vlan TX acceleration
  net/mlx5e: Return error in case mlx5e_set_features() fails
  net/mlx5e: Don't allow more than max supported channels
  ...

MAINTAINERS
arch/arm64/boot/dts/apm/apm-storm.dtsi
drivers/irqchip/irq-armada-370-xp.c
drivers/net/ethernet/sfc/mcdi.c
include/linux/sched.h
include/uapi/linux/perf_event.h
kernel/events/core.c
kernel/irq/manage.c

diff --combined MAINTAINERS
index 3036f5a02107385b07141c8ebe8e343182ec37cb,27b27c0a4a115d56fa9198a10494b3fd6c495e2f..5974a0f676d6e1b6c53a4f96256a2255c2947f87
@@@ -240,12 -240,6 +240,12 @@@ L:       lm-sensors@lm-sensors.or
  S:    Maintained
  F:    drivers/hwmon/abituguru3.c
  
 +ACCES 104-IDIO-16 GPIO DRIVER
 +M:    "William Breathitt Gray" <vilhelm.gray@gmail.com>
 +L:    linux-gpio@vger.kernel.org
 +S:    Maintained
 +F:    drivers/gpio/gpio-104-idio-16.c
 +
  ACENIC DRIVER
  M:    Jes Sorensen <jes@trained-monkey.org>
  L:    linux-acenic@sunsite.dk
@@@ -660,6 -654,11 +660,6 @@@ F:        drivers/gpu/drm/radeon/radeon_kfd.
  F:    drivers/gpu/drm/radeon/radeon_kfd.h
  F:    include/uapi/linux/kfd_ioctl.h
  
 -AMD MICROCODE UPDATE SUPPORT
 -M:    Borislav Petkov <bp@alien8.de>
 -S:    Maintained
 -F:    arch/x86/kernel/cpu/microcode/amd*
 -
  AMD XGBE DRIVER
  M:    Tom Lendacky <thomas.lendacky@amd.com>
  L:    netdev@vger.kernel.org
@@@ -1529,7 -1528,6 +1529,7 @@@ W:      http://www.stlinux.co
  S:    Maintained
  F:    arch/arm/mach-sti/
  F:    arch/arm/boot/dts/sti*
 +F:    drivers/char/hw_random/st-rng.c
  F:    drivers/clocksource/arm_global_timer.c
  F:    drivers/clocksource/clksrc_st_lpc.c
  F:    drivers/i2c/busses/i2c-st.c
@@@ -1782,14 -1780,6 +1782,14 @@@ S:    Supporte
  F:    Documentation/aoe/
  F:    drivers/block/aoe/
  
 +ATHEROS 71XX/9XXX GPIO DRIVER
 +M:    Alban Bedel <albeu@free.fr>
 +W:    https://github.com/AlbanBedel/linux
 +T:    git git://github.com/AlbanBedel/linux
 +S:    Maintained
 +F:    drivers/gpio/gpio-ath79.c
 +F:    Documentation/devicetree/bindings/gpio/gpio-ath79.txt
 +
  ATHEROS ATH GENERIC UTILITIES
  M:    "Luis R. Rodriguez" <mcgrof@do-not-panic.com>
  L:    linux-wireless@vger.kernel.org
@@@ -5464,6 -5454,12 +5464,6 @@@ W:     https://01.org/linux-acp
  S:    Supported
  F:    drivers/platform/x86/intel_menlow.c
  
 -INTEL IA32 MICROCODE UPDATE SUPPORT
 -M:    Borislav Petkov <bp@alien8.de>
 -S:    Maintained
 -F:    arch/x86/kernel/cpu/microcode/core*
 -F:    arch/x86/kernel/cpu/microcode/intel*
 -
  INTEL I/OAT DMA DRIVER
  M:    Dave Jiang <dave.jiang@intel.com>
  R:    Dan Williams <dan.j.williams@intel.com>
@@@ -5574,7 -5570,7 +5574,7 @@@ F:      drivers/net/wireless/iwlegacy
  INTEL WIRELESS WIFI LINK (iwlwifi)
  M:    Johannes Berg <johannes.berg@intel.com>
  M:    Emmanuel Grumbach <emmanuel.grumbach@intel.com>
- M:    Intel Linux Wireless <ilw@linux.intel.com>
+ M:    Intel Linux Wireless <linuxwifi@intel.com>
  L:    linux-wireless@vger.kernel.org
  W:    http://intellinuxwireless.org
  T:    git git://git.kernel.org/pub/scm/linux/kernel/git/iwlwifi/iwlwifi.git
@@@ -6121,6 -6117,13 +6121,13 @@@ F:    Documentation/auxdisplay/ks010
  F:    drivers/auxdisplay/ks0108.c
  F:    include/linux/ks0108.h
  
+ L3MDEV
+ M:    David Ahern <dsa@cumulusnetworks.com>
+ L:    netdev@vger.kernel.org
+ S:    Maintained
+ F:    net/l3mdev
+ F:    include/net/l3mdev.h
  LAPB module
  L:    linux-x25@vger.kernel.org
  S:    Orphan
@@@ -6588,13 -6591,6 +6595,13 @@@ M:    Guenter Roeck <linux@roeck-us.net
  S:    Maintained
  F:    drivers/net/dsa/mv88e6352.c
  
 +MARVELL CRYPTO DRIVER
 +M:    Boris Brezillon <boris.brezillon@free-electrons.com>
 +M:    Arnaud Ebalard <arno@natisbad.org>
 +F:    drivers/crypto/marvell/
 +S:    Maintained
 +L:    linux-crypto@vger.kernel.org
 +
  MARVELL GIGABIT ETHERNET DRIVERS (skge/sky2)
  M:    Mirko Lindner <mlindner@marvell.com>
  M:    Stephen Hemminger <stephen@networkplumber.org>
@@@ -7005,6 -7001,7 +7012,7 @@@ M:      Alan Ott <alan@signal11.us
  L:    linux-wpan@vger.kernel.org
  S:    Maintained
  F:    drivers/net/ieee802154/mrf24j40.c
+ F:    Documentation/devicetree/bindings/net/ieee802154/mrf24j40.txt
  
  MSI LAPTOP SUPPORT
  M:    "Lee, Chun-Yi" <jlee@suse.com>
@@@ -7339,7 -7336,6 +7347,6 @@@ S:      Odd Fixe
  F:    drivers/net/
  F:    include/linux/if_*
  F:    include/linux/netdevice.h
- F:    include/linux/arcdevice.h
  F:    include/linux/etherdevice.h
  F:    include/linux/fcdevice.h
  F:    include/linux/fddidevice.h
@@@ -8192,13 -8188,6 +8199,13 @@@ L:    linux-arm-kernel@lists.infradead.or
  S:    Maintained
  F:    drivers/pinctrl/pinctrl-at91.*
  
 +PIN CONTROLLER - ATMEL AT91 PIO4
 +M:    Ludovic Desroches <ludovic.desroches@atmel.com>
 +L:    linux-arm-kernel@lists.infradead.org (moderated for non-subscribers)
 +L:    linux-gpio@vger.kernel.org
 +S:    Supported
 +F:    drivers/pinctrl/pinctrl-at91-pio4.*
 +
  PIN CONTROLLER - INTEL
  M:    Mika Westerberg <mika.westerberg@linux.intel.com>
  M:    Heikki Krogerus <heikki.krogerus@linux.intel.com>
@@@ -8560,6 -8549,16 +8567,16 @@@ L:    netdev@vger.kernel.or
  S:    Supported
  F:    drivers/net/ethernet/qlogic/qlge/
  
+ QLOGIC QL4xxx ETHERNET DRIVER
+ M:    Yuval Mintz <Yuval.Mintz@qlogic.com>
+ M:    Ariel Elior <Ariel.Elior@qlogic.com>
+ M:    everest-linux-l2@qlogic.com
+ L:    netdev@vger.kernel.org
+ S:    Supported
+ F:    drivers/net/ethernet/qlogic/qed/
+ F:    include/linux/qed/
+ F:    drivers/net/ethernet/qlogic/qede/
  QNX4 FILESYSTEM
  M:    Anders Larsen <al@alarsen.net>
  W:    http://www.alarsen.net/linux/qnx4fs/
@@@ -8911,6 -8910,13 +8928,13 @@@ S:    Maintaine
  F:    drivers/net/wireless/rtlwifi/
  F:    drivers/net/wireless/rtlwifi/rtl8192ce/
  
+ RTL8XXXU WIRELESS DRIVER (rtl8xxxu)
+ M:    Jes Sorensen <Jes.Sorensen@redhat.com>
+ L:    linux-wireless@vger.kernel.org
+ T:    git git://git.kernel.org/pub/scm/linux/kernel/git/jes/linux.git rtl8723au-mac80211
+ S:    Maintained
+ F:    drivers/net/wireless/realtek/rtl8xxxu/
  S3 SAVAGE FRAMEBUFFER DRIVER
  M:    Antonino Daplas <adaplas@gmail.com>
  L:    linux-fbdev@vger.kernel.org
@@@ -10118,7 -10124,6 +10142,7 @@@ F:   include/net/switchdev.
  
  SYNOPSYS ARC ARCHITECTURE
  M:    Vineet Gupta <vgupta@synopsys.com>
 +L:    linux-snps-arc@lists.infraded.org
  S:    Supported
  F:    arch/arc/
  F:    Documentation/devicetree/bindings/arc/*
@@@ -11315,7 -11320,6 +11339,6 @@@ M:   Shrijeet Mukherjee <shm@cumulusnetwo
  L:    netdev@vger.kernel.org
  S:    Maintained
  F:    drivers/net/vrf.c
- F:    include/net/vrf.h
  F:    Documentation/networking/vrf.txt
  
  VT1211 HARDWARE MONITOR DRIVER
@@@ -11502,11 -11506,6 +11525,11 @@@ L: linux-edac@vger.kernel.or
  S:    Maintained
  F:    arch/x86/kernel/cpu/mcheck/*
  
 +X86 MICROCODE UPDATE SUPPORT
 +M:    Borislav Petkov <bp@alien8.de>
 +S:    Maintained
 +F:    arch/x86/kernel/cpu/microcode/*
 +
  X86 VDSO
  M:    Andy Lutomirski <luto@amacapital.net>
  L:    linux-kernel@vger.kernel.org
diff --combined arch/arm64/boot/dts/apm/apm-storm.dtsi
index fac1720472f9f6ce4e9ae9bbf538f1bf6a036557,d483e7e4f9211a331b638646c0cec47bba85270a..d6c9630a5c20a817e840e8f5ed8e00c3a25b3c38
                                clock-output-names = "xge0clk";
                        };
  
+                       xge1clk: xge1clk@1f62c000 {
+                               compatible = "apm,xgene-device-clock";
+                               status = "disabled";
+                               #clock-cells = <1>;
+                               clocks = <&socplldiv2 0>;
+                               reg = <0x0 0x1f62c000 0x0 0x1000>;
+                               reg-names = "csr-reg";
+                               csr-mask = <0x3>;
+                               clock-output-names = "xge1clk";
+                       };
                        sataphy1clk: sataphy1clk@1f21c000 {
                                compatible = "apm,xgene-device-clock";
                                #clock-cells = <1>;
                                reg = <0x0 0x7c600000 0x0 0x200000>;
                                pmd-controller = <3>;
                        };
 +
 +                      edacl3@7e600000 {
 +                              compatible = "apm,xgene-edac-l3";
 +                              reg = <0x0 0x7e600000 0x0 0x1000>;
 +                      };
 +
 +                      edacsoc@7e930000 {
 +                              compatible = "apm,xgene-edac-soc-v1";
 +                              reg = <0x0 0x7e930000 0x0 0x1000>;
 +                      };
                };
  
                pcie0: pcie@1f2b0000 {
                        phy-connection-type = "xgmii";
                };
  
+               xgenet1: ethernet@1f620000 {
+                       compatible = "apm,xgene1-xgenet";
+                       status = "disabled";
+                       reg = <0x0 0x1f620000 0x0 0xd100>,
+                             <0x0 0x1f600000 0x0 0Xc300>,
+                             <0x0 0x18000000 0x0 0X8000>;
+                       reg-names = "enet_csr", "ring_csr", "ring_cmd";
+                       interrupts = <0x0 0x6C 0x4>,
+                                    <0x0 0x6D 0x4>;
+                       port-id = <1>;
+                       dma-coherent;
+                       clocks = <&xge1clk 0>;
+                       /* mac address will be overwritten by the bootloader */
+                       local-mac-address = [00 00 00 00 00 00];
+                       phy-connection-type = "xgmii";
+               };
                rng: rng@10520000 {
                        compatible = "apm,xgene-rng";
                        reg = <0x0 0x10520000 0x0 0x100>;
diff --combined drivers/irqchip/irq-armada-370-xp.c
index 389318a3be820a560dbc02c7b5e2bd9b37cdeb35,9d89900a3f887218e99fbb33ade929431ec7c5ca..3f3a8c3d2175ed6a49727bc86d8afde6e7cd9520
@@@ -56,9 -56,6 +56,6 @@@
  
  #define ARMADA_370_XP_MAX_PER_CPU_IRQS                (28)
  
- #define ARMADA_370_XP_TIMER0_PER_CPU_IRQ      (5)
- #define ARMADA_370_XP_FABRIC_IRQ              (3)
  #define IPI_DOORBELL_START                      (0)
  #define IPI_DOORBELL_END                        (8)
  #define IPI_DOORBELL_MASK                       0xFF
@@@ -81,13 -78,10 +78,10 @@@ static phys_addr_t msi_doorbell_addr
  
  static inline bool is_percpu_irq(irq_hw_number_t irq)
  {
-       switch (irq) {
-       case ARMADA_370_XP_TIMER0_PER_CPU_IRQ:
-       case ARMADA_370_XP_FABRIC_IRQ:
+       if (irq <= ARMADA_370_XP_MAX_PER_CPU_IRQS)
                return true;
-       default:
-               return false;
-       }
+       return false;
  }
  
  /*
@@@ -317,7 -311,6 +311,7 @@@ static int armada_370_xp_mpic_irq_map(s
                                        handle_level_irq);
        }
        irq_set_probe(virq);
 +      irq_clear_status_flags(virq, IRQ_NOAUTOEN);
  
        return 0;
  }
@@@ -550,7 -543,7 +544,7 @@@ static void armada_370_xp_mpic_resume(v
                if (virq == 0)
                        continue;
  
-               if (irq != ARMADA_370_XP_TIMER0_PER_CPU_IRQ)
+               if (!is_percpu_irq(irq))
                        writel(irq, per_cpu_int_base +
                               ARMADA_370_XP_INT_CLEAR_MASK_OFFS);
                else
diff --combined drivers/net/ethernet/sfc/mcdi.c
index a9b9460de0d63b2369062ce1fc9dee90909878ee,d3f307e5c070f8d15538d260336b08d2a251e130..41fb6b60a3f07e11444e0d62430aca937f7e84ea
@@@ -9,7 -9,7 +9,7 @@@
  
  #include <linux/delay.h>
  #include <linux/moduleparam.h>
 -#include <asm/cmpxchg.h>
 +#include <linux/atomic.h>
  #include "net_driver.h"
  #include "nic.h"
  #include "io.h"
@@@ -1028,10 -1028,21 +1028,21 @@@ static void efx_mcdi_ev_death(struct ef
  
                /* Consume the status word since efx_mcdi_rpc_finish() won't */
                for (count = 0; count < MCDI_STATUS_DELAY_COUNT; ++count) {
-                       if (efx_mcdi_poll_reboot(efx))
+                       rc = efx_mcdi_poll_reboot(efx);
+                       if (rc)
                                break;
                        udelay(MCDI_STATUS_DELAY_US);
                }
+               /* On EF10, a CODE_MC_REBOOT event can be received without the
+                * reboot detection in efx_mcdi_poll_reboot() being triggered.
+                * If zero was returned from the final call to
+                * efx_mcdi_poll_reboot(), the MC reboot wasn't noticed but the
+                * MC has definitely rebooted so prepare for the reset.
+                */
+               if (!rc && efx->type->mcdi_reboot_detected)
+                       efx->type->mcdi_reboot_detected(efx);
                mcdi->new_epoch = true;
  
                /* Nobody was waiting for an MCDI request, so trigger a reset */
diff --combined include/linux/sched.h
index 9e1e06c3ce051e63862a22e7edb5897e7027591e,4817df5fffae86c1ccbcd385200c053e8eaad2ea..c115d617739d8e6f1f388c13069ad32cfca1bb3b
@@@ -599,42 -599,33 +599,42 @@@ struct task_cputime_atomic 
                .sum_exec_runtime = ATOMIC64_INIT(0),           \
        }
  
 -#ifdef CONFIG_PREEMPT_COUNT
 -#define PREEMPT_DISABLED      (1 + PREEMPT_ENABLED)
 -#else
 -#define PREEMPT_DISABLED      PREEMPT_ENABLED
 -#endif
 +#define PREEMPT_DISABLED      (PREEMPT_DISABLE_OFFSET + PREEMPT_ENABLED)
 +
 +/*
 + * Disable preemption until the scheduler is running -- use an unconditional
 + * value so that it also works on !PREEMPT_COUNT kernels.
 + *
 + * Reset by start_kernel()->sched_init()->init_idle()->init_idle_preempt_count().
 + */
 +#define INIT_PREEMPT_COUNT    PREEMPT_OFFSET
  
  /*
 - * Disable preemption until the scheduler is running.
 - * Reset by start_kernel()->sched_init()->init_idle().
 + * Initial preempt_count value; reflects the preempt_count schedule invariant
 + * which states that during context switches:
   *
 - * We include PREEMPT_ACTIVE to avoid cond_resched() from working
 - * before the scheduler is active -- see should_resched().
 + *    preempt_count() == 2*PREEMPT_DISABLE_OFFSET
 + *
 + * Note: PREEMPT_DISABLE_OFFSET is 0 for !PREEMPT_COUNT kernels.
 + * Note: See finish_task_switch().
   */
 -#define INIT_PREEMPT_COUNT    (PREEMPT_DISABLED + PREEMPT_ACTIVE)
 +#define FORK_PREEMPT_COUNT    (2*PREEMPT_DISABLE_OFFSET + PREEMPT_ENABLED)
  
  /**
   * struct thread_group_cputimer - thread group interval timer counts
   * @cputime_atomic:   atomic thread group interval timers.
 - * @running:          non-zero when there are timers running and
 - *                    @cputime receives updates.
 + * @running:          true when there are timers running and
 + *                    @cputime_atomic receives updates.
 + * @checking_timer:   true when a thread in the group is in the
 + *                    process of checking for thread group timers.
   *
   * This structure contains the version of task_cputime, above, that is
   * used for thread group CPU timer calculations.
   */
  struct thread_group_cputimer {
        struct task_cputime_atomic cputime_atomic;
 -      int running;
 +      bool running;
 +      bool checking_timer;
  };
  
  #include <linux/rwsem.h>
@@@ -849,7 -840,7 +849,7 @@@ struct user_struct 
        struct hlist_node uidhash_node;
        kuid_t uid;
  
- #ifdef CONFIG_PERF_EVENTS
+ #if defined(CONFIG_PERF_EVENTS) || defined(CONFIG_BPF_SYSCALL)
        atomic_long_t locked_vm;
  #endif
  };
@@@ -1148,6 -1139,8 +1148,6 @@@ struct sched_domain_topology_level 
  #endif
  };
  
 -extern struct sched_domain_topology_level *sched_domain_topology;
 -
  extern void set_sched_topology(struct sched_domain_topology_level *tl);
  extern void wake_up_if_idle(int cpu);
  
@@@ -1196,10 -1189,10 +1196,10 @@@ struct load_weight 
  
  /*
   * The load_avg/util_avg accumulates an infinite geometric series.
 - * 1) load_avg factors the amount of time that a sched_entity is
 - * runnable on a rq into its weight. For cfs_rq, it is the aggregated
 - * such weights of all runnable and blocked sched_entities.
 - * 2) util_avg factors frequency scaling into the amount of time
 + * 1) load_avg factors frequency scaling into the amount of time that a
 + * sched_entity is runnable on a rq into its weight. For cfs_rq, it is the
 + * aggregated such weights of all runnable and blocked sched_entities.
 + * 2) util_avg factors frequency and cpu scaling into the amount of time
   * that a sched_entity is running on a CPU, in the range [0..SCHED_LOAD_SCALE].
   * For cfs_rq, it is the aggregated such times of all runnable and
   * blocked sched_entities.
@@@ -1349,12 -1342,10 +1349,12 @@@ struct sched_dl_entity 
  
  union rcu_special {
        struct {
 -              bool blocked;
 -              bool need_qs;
 -      } b;
 -      short s;
 +              u8 blocked;
 +              u8 need_qs;
 +              u8 exp_need_qs;
 +              u8 pad; /* Otherwise the compiler can store garbage here. */
 +      } b; /* Bits. */
 +      u32 s; /* Set of bits. */
  };
  struct rcu_node;
  
diff --combined include/uapi/linux/perf_event.h
index 651221334f4980c189c9b713d75f5f8f912c40f0,d3c4176153611c25a3f3baf091fde5738b7078ee..d801bb0d9f6d2b08b6be600565716d045d8b85ca
@@@ -110,6 -110,7 +110,7 @@@ enum perf_sw_ids 
        PERF_COUNT_SW_ALIGNMENT_FAULTS          = 7,
        PERF_COUNT_SW_EMULATION_FAULTS          = 8,
        PERF_COUNT_SW_DUMMY                     = 9,
+       PERF_COUNT_SW_BPF_OUTPUT                = 10,
  
        PERF_COUNT_SW_MAX,                      /* non-ABI */
  };
@@@ -168,7 -169,6 +169,7 @@@ enum perf_branch_sample_type_shift 
  
        PERF_SAMPLE_BRANCH_CALL_STACK_SHIFT     = 11, /* call/ret stack */
        PERF_SAMPLE_BRANCH_IND_JUMP_SHIFT       = 12, /* indirect jumps */
 +      PERF_SAMPLE_BRANCH_CALL_SHIFT           = 13, /* direct call */
  
        PERF_SAMPLE_BRANCH_MAX_SHIFT            /* non-ABI */
  };
@@@ -189,7 -189,6 +190,7 @@@ enum perf_branch_sample_type 
  
        PERF_SAMPLE_BRANCH_CALL_STACK   = 1U << PERF_SAMPLE_BRANCH_CALL_STACK_SHIFT,
        PERF_SAMPLE_BRANCH_IND_JUMP     = 1U << PERF_SAMPLE_BRANCH_IND_JUMP_SHIFT,
 +      PERF_SAMPLE_BRANCH_CALL         = 1U << PERF_SAMPLE_BRANCH_CALL_SHIFT,
  
        PERF_SAMPLE_BRANCH_MAX          = 1U << PERF_SAMPLE_BRANCH_MAX_SHIFT,
  };
@@@ -478,7 -477,7 +479,7 @@@ struct perf_event_mmap_page 
         *   u64 delta;
         *
         *   quot = (cyc >> time_shift);
 -       *   rem = cyc & ((1 << time_shift) - 1);
 +       *   rem = cyc & (((u64)1 << time_shift) - 1);
         *   delta = time_offset + quot * time_mult +
         *              ((rem * time_mult) >> time_shift);
         *
         * And vice versa:
         *
         *   quot = cyc >> time_shift;
 -       *   rem  = cyc & ((1 << time_shift) - 1);
 +       *   rem  = cyc & (((u64)1 << time_shift) - 1);
         *   timestamp = time_zero + quot * time_mult +
         *               ((rem * time_mult) >> time_shift);
         */
diff --combined kernel/events/core.c
index ea02109aee77e057713ff8787f5e2d6eb9f9fb23,64754bfecd700dd4a0fc7f6738f2d07a718100c6..39db20c6248e47c940bd8721c41ade530e1eb5c9
@@@ -196,7 -196,7 +196,7 @@@ static int perf_sample_period_ns __read
  static int perf_sample_allowed_ns __read_mostly =
        DEFAULT_SAMPLE_PERIOD_NS * DEFAULT_CPU_TIME_MAX_PERCENT / 100;
  
 -void update_perf_cpu_limits(void)
 +static void update_perf_cpu_limits(void)
  {
        u64 tmp = perf_sample_period_ns;
  
@@@ -472,7 -472,7 +472,7 @@@ perf_cgroup_set_timestamp(struct task_s
   * mode SWOUT : schedule out everything
   * mode SWIN : schedule in based on cgroup for next
   */
 -void perf_cgroup_switch(struct task_struct *task, int mode)
 +static void perf_cgroup_switch(struct task_struct *task, int mode)
  {
        struct perf_cpu_context *cpuctx;
        struct pmu *pmu;
@@@ -1939,7 -1939,7 +1939,7 @@@ group_sched_in(struct perf_event *group
        if (group_event->state == PERF_EVENT_STATE_OFF)
                return 0;
  
 -      pmu->start_txn(pmu);
 +      pmu->start_txn(pmu, PERF_PMU_TXN_ADD);
  
        if (event_sched_in(group_event, cpuctx, ctx)) {
                pmu->cancel_txn(pmu);
@@@ -3209,22 -3209,14 +3209,22 @@@ void perf_event_exec(void
        rcu_read_unlock();
  }
  
 +struct perf_read_data {
 +      struct perf_event *event;
 +      bool group;
 +      int ret;
 +};
 +
  /*
   * Cross CPU call to read the hardware event
   */
  static void __perf_event_read(void *info)
  {
 -      struct perf_event *event = info;
 +      struct perf_read_data *data = info;
 +      struct perf_event *sub, *event = data->event;
        struct perf_event_context *ctx = event->ctx;
        struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
 +      struct pmu *pmu = event->pmu;
  
        /*
         * If this is a task context, we need to check whether it is
                update_context_time(ctx);
                update_cgrp_time_from_event(event);
        }
 +
        update_event_times(event);
 -      if (event->state == PERF_EVENT_STATE_ACTIVE)
 -              event->pmu->read(event);
 +      if (event->state != PERF_EVENT_STATE_ACTIVE)
 +              goto unlock;
 +
 +      if (!data->group) {
 +              pmu->read(event);
 +              data->ret = 0;
 +              goto unlock;
 +      }
 +
 +      pmu->start_txn(pmu, PERF_PMU_TXN_READ);
 +
 +      pmu->read(event);
 +
 +      list_for_each_entry(sub, &event->sibling_list, group_entry) {
 +              update_event_times(sub);
 +              if (sub->state == PERF_EVENT_STATE_ACTIVE) {
 +                      /*
 +                       * Use sibling's PMU rather than @event's since
 +                       * sibling could be on different (eg: software) PMU.
 +                       */
 +                      sub->pmu->read(sub);
 +              }
 +      }
 +
 +      data->ret = pmu->commit_txn(pmu);
 +
 +unlock:
        raw_spin_unlock(&ctx->lock);
  }
  
@@@ -3334,23 -3300,15 +3334,23 @@@ u64 perf_event_read_local(struct perf_e
        return val;
  }
  
 -static u64 perf_event_read(struct perf_event *event)
 +static int perf_event_read(struct perf_event *event, bool group)
  {
 +      int ret = 0;
 +
        /*
         * If event is enabled and currently active on a CPU, update the
         * value in the event structure:
         */
        if (event->state == PERF_EVENT_STATE_ACTIVE) {
 +              struct perf_read_data data = {
 +                      .event = event,
 +                      .group = group,
 +                      .ret = 0,
 +              };
                smp_call_function_single(event->oncpu,
 -                                       __perf_event_read, event, 1);
 +                                       __perf_event_read, &data, 1);
 +              ret = data.ret;
        } else if (event->state == PERF_EVENT_STATE_INACTIVE) {
                struct perf_event_context *ctx = event->ctx;
                unsigned long flags;
                        update_context_time(ctx);
                        update_cgrp_time_from_event(event);
                }
 -              update_event_times(event);
 +              if (group)
 +                      update_group_times(event);
 +              else
 +                      update_event_times(event);
                raw_spin_unlock_irqrestore(&ctx->lock, flags);
        }
  
 -      return perf_event_count(event);
 +      return ret;
  }
  
  /*
@@@ -3814,7 -3769,7 +3814,7 @@@ static void put_event(struct perf_even
         *     see the comment there.
         *
         *  2) there is a lock-inversion with mmap_sem through
 -       *     perf_event_read_group(), which takes faults while
 +       *     perf_read_group(), which takes faults while
         *     holding ctx->mutex, however this is called after
         *     the last filedesc died, so there is no possibility
         *     to trigger the AB-BA case.
@@@ -3888,18 -3843,14 +3888,18 @@@ u64 perf_event_read_value(struct perf_e
        *running = 0;
  
        mutex_lock(&event->child_mutex);
 -      total += perf_event_read(event);
 +
 +      (void)perf_event_read(event, false);
 +      total += perf_event_count(event);
 +
        *enabled += event->total_time_enabled +
                        atomic64_read(&event->child_total_time_enabled);
        *running += event->total_time_running +
                        atomic64_read(&event->child_total_time_running);
  
        list_for_each_entry(child, &event->child_list, child_list) {
 -              total += perf_event_read(child);
 +              (void)perf_event_read(child, false);
 +              total += perf_event_count(child);
                *enabled += child->total_time_enabled;
                *running += child->total_time_running;
        }
  }
  EXPORT_SYMBOL_GPL(perf_event_read_value);
  
 -static int perf_event_read_group(struct perf_event *event,
 -                                 u64 read_format, char __user *buf)
 +static int __perf_read_group_add(struct perf_event *leader,
 +                                      u64 read_format, u64 *values)
  {
 -      struct perf_event *leader = event->group_leader, *sub;
 -      struct perf_event_context *ctx = leader->ctx;
 -      int n = 0, size = 0, ret;
 -      u64 count, enabled, running;
 -      u64 values[5];
 +      struct perf_event *sub;
 +      int n = 1; /* skip @nr */
 +      int ret;
  
 -      lockdep_assert_held(&ctx->mutex);
 +      ret = perf_event_read(leader, true);
 +      if (ret)
 +              return ret;
  
 -      count = perf_event_read_value(leader, &enabled, &running);
 +      /*
 +       * Since we co-schedule groups, {enabled,running} times of siblings
 +       * will be identical to those of the leader, so we only publish one
 +       * set.
 +       */
 +      if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) {
 +              values[n++] += leader->total_time_enabled +
 +                      atomic64_read(&leader->child_total_time_enabled);
 +      }
  
 -      values[n++] = 1 + leader->nr_siblings;
 -      if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
 -              values[n++] = enabled;
 -      if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
 -              values[n++] = running;
 -      values[n++] = count;
 +      if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) {
 +              values[n++] += leader->total_time_running +
 +                      atomic64_read(&leader->child_total_time_running);
 +      }
 +
 +      /*
 +       * Write {count,id} tuples for every sibling.
 +       */
 +      values[n++] += perf_event_count(leader);
        if (read_format & PERF_FORMAT_ID)
                values[n++] = primary_event_id(leader);
  
 -      size = n * sizeof(u64);
 +      list_for_each_entry(sub, &leader->sibling_list, group_entry) {
 +              values[n++] += perf_event_count(sub);
 +              if (read_format & PERF_FORMAT_ID)
 +                      values[n++] = primary_event_id(sub);
 +      }
  
 -      if (copy_to_user(buf, values, size))
 -              return -EFAULT;
 +      return 0;
 +}
 +
 +static int perf_read_group(struct perf_event *event,
 +                                 u64 read_format, char __user *buf)
 +{
 +      struct perf_event *leader = event->group_leader, *child;
 +      struct perf_event_context *ctx = leader->ctx;
 +      int ret;
 +      u64 *values;
  
 -      ret = size;
 +      lockdep_assert_held(&ctx->mutex);
  
 -      list_for_each_entry(sub, &leader->sibling_list, group_entry) {
 -              n = 0;
 +      values = kzalloc(event->read_size, GFP_KERNEL);
 +      if (!values)
 +              return -ENOMEM;
  
 -              values[n++] = perf_event_read_value(sub, &enabled, &running);
 -              if (read_format & PERF_FORMAT_ID)
 -                      values[n++] = primary_event_id(sub);
 +      values[0] = 1 + leader->nr_siblings;
  
 -              size = n * sizeof(u64);
 +      /*
 +       * By locking the child_mutex of the leader we effectively
 +       * lock the child list of all siblings.. XXX explain how.
 +       */
 +      mutex_lock(&leader->child_mutex);
  
 -              if (copy_to_user(buf + ret, values, size)) {
 -                      return -EFAULT;
 -              }
 +      ret = __perf_read_group_add(leader, read_format, values);
 +      if (ret)
 +              goto unlock;
  
 -              ret += size;
 +      list_for_each_entry(child, &leader->child_list, child_list) {
 +              ret = __perf_read_group_add(child, read_format, values);
 +              if (ret)
 +                      goto unlock;
        }
  
 +      mutex_unlock(&leader->child_mutex);
 +
 +      ret = event->read_size;
 +      if (copy_to_user(buf, values, event->read_size))
 +              ret = -EFAULT;
 +      goto out;
 +
 +unlock:
 +      mutex_unlock(&leader->child_mutex);
 +out:
 +      kfree(values);
        return ret;
  }
  
 -static int perf_event_read_one(struct perf_event *event,
 +static int perf_read_one(struct perf_event *event,
                                 u64 read_format, char __user *buf)
  {
        u64 enabled, running;
@@@ -4035,7 -3946,7 +4035,7 @@@ static bool is_event_hup(struct perf_ev
   * Read the performance event - simple non blocking version for now
   */
  static ssize_t
 -perf_read_hw(struct perf_event *event, char __user *buf, size_t count)
 +__perf_read(struct perf_event *event, char __user *buf, size_t count)
  {
        u64 read_format = event->attr.read_format;
        int ret;
  
        WARN_ON_ONCE(event->ctx->parent_ctx);
        if (read_format & PERF_FORMAT_GROUP)
 -              ret = perf_event_read_group(event, read_format, buf);
 +              ret = perf_read_group(event, read_format, buf);
        else
 -              ret = perf_event_read_one(event, read_format, buf);
 +              ret = perf_read_one(event, read_format, buf);
  
        return ret;
  }
@@@ -4068,7 -3979,7 +4068,7 @@@ perf_read(struct file *file, char __use
        int ret;
  
        ctx = perf_event_ctx_lock(event);
 -      ret = perf_read_hw(event, buf, count);
 +      ret = __perf_read(event, buf, count);
        perf_event_ctx_unlock(event, ctx);
  
        return ret;
@@@ -4099,7 -4010,7 +4099,7 @@@ static unsigned int perf_poll(struct fi
  
  static void _perf_event_reset(struct perf_event *event)
  {
 -      (void)perf_event_read(event);
 +      (void)perf_event_read(event, false);
        local64_set(&event->count, 0);
        perf_event_update_userpage(event);
  }
@@@ -5375,9 -5286,15 +5375,15 @@@ void perf_output_sample(struct perf_out
  
        if (sample_type & PERF_SAMPLE_RAW) {
                if (data->raw) {
-                       perf_output_put(handle, data->raw->size);
-                       __output_copy(handle, data->raw->data,
-                                          data->raw->size);
+                       u32 raw_size = data->raw->size;
+                       u32 real_size = round_up(raw_size + sizeof(u32),
+                                                sizeof(u64)) - sizeof(u32);
+                       u64 zero = 0;
+                       perf_output_put(handle, real_size);
+                       __output_copy(handle, data->raw->data, raw_size);
+                       if (real_size - raw_size)
+                               __output_copy(handle, &zero, real_size - raw_size);
                } else {
                        struct {
                                u32     size;
@@@ -5509,8 -5426,7 +5515,7 @@@ void perf_prepare_sample(struct perf_ev
                else
                        size += sizeof(u32);
  
-               WARN_ON_ONCE(size & (sizeof(u64)-1));
-               header->size += size;
+               header->size += round_up(size, sizeof(u64));
        }
  
        if (sample_type & PERF_SAMPLE_BRANCH_STACK) {
@@@ -7381,49 -7297,24 +7386,49 @@@ static void perf_pmu_nop_void(struct pm
  {
  }
  
 +static void perf_pmu_nop_txn(struct pmu *pmu, unsigned int flags)
 +{
 +}
 +
  static int perf_pmu_nop_int(struct pmu *pmu)
  {
        return 0;
  }
  
 -static void perf_pmu_start_txn(struct pmu *pmu)
 +static DEFINE_PER_CPU(unsigned int, nop_txn_flags);
 +
 +static void perf_pmu_start_txn(struct pmu *pmu, unsigned int flags)
  {
 +      __this_cpu_write(nop_txn_flags, flags);
 +
 +      if (flags & ~PERF_PMU_TXN_ADD)
 +              return;
 +
        perf_pmu_disable(pmu);
  }
  
  static int perf_pmu_commit_txn(struct pmu *pmu)
  {
 +      unsigned int flags = __this_cpu_read(nop_txn_flags);
 +
 +      __this_cpu_write(nop_txn_flags, 0);
 +
 +      if (flags & ~PERF_PMU_TXN_ADD)
 +              return 0;
 +
        perf_pmu_enable(pmu);
        return 0;
  }
  
  static void perf_pmu_cancel_txn(struct pmu *pmu)
  {
 +      unsigned int flags =  __this_cpu_read(nop_txn_flags);
 +
 +      __this_cpu_write(nop_txn_flags, 0);
 +
 +      if (flags & ~PERF_PMU_TXN_ADD)
 +              return;
 +
        perf_pmu_enable(pmu);
  }
  
@@@ -7662,7 -7553,7 +7667,7 @@@ got_cpu_context
                        pmu->commit_txn = perf_pmu_commit_txn;
                        pmu->cancel_txn = perf_pmu_cancel_txn;
                } else {
 -                      pmu->start_txn  = perf_pmu_nop_void;
 +                      pmu->start_txn  = perf_pmu_nop_txn;
                        pmu->commit_txn = perf_pmu_nop_int;
                        pmu->cancel_txn = perf_pmu_nop_void;
                }
@@@ -7750,7 -7641,7 +7755,7 @@@ static int perf_try_init_event(struct p
        return ret;
  }
  
 -struct pmu *perf_init_event(struct perf_event *event)
 +static struct pmu *perf_init_event(struct perf_event *event)
  {
        struct pmu *pmu = NULL;
        int idx;
@@@ -9459,6 -9350,14 +9464,6 @@@ static void perf_cgroup_exit(struct cgr
                             struct cgroup_subsys_state *old_css,
                             struct task_struct *task)
  {
 -      /*
 -       * cgroup_exit() is called in the copy_process() failure path.
 -       * Ignore this case since the task hasn't ran yet, this avoids
 -       * trying to poke a half freed task state from generic code.
 -       */
 -      if (!(task->flags & PF_EXITING))
 -              return;
 -
        task_function_call(task, __perf_cgroup_move, task);
  }
  
diff --combined kernel/irq/manage.c
index a71175ff98d58f7c274786cadd5b6b520678c665,4c213864ec458577719602e7c5a93a33d793a6f3..0eebaeef317bc990e3ee1b1bd1719b9e89211685
@@@ -258,6 -258,37 +258,6 @@@ int irq_set_affinity_hint(unsigned int 
  }
  EXPORT_SYMBOL_GPL(irq_set_affinity_hint);
  
 -/**
 - *    irq_set_vcpu_affinity - Set vcpu affinity for the interrupt
 - *    @irq: interrupt number to set affinity
 - *    @vcpu_info: vCPU specific data
 - *
 - *    This function uses the vCPU specific data to set the vCPU
 - *    affinity for an irq. The vCPU specific data is passed from
 - *    outside, such as KVM. One example code path is as below:
 - *    KVM -> IOMMU -> irq_set_vcpu_affinity().
 - */
 -int irq_set_vcpu_affinity(unsigned int irq, void *vcpu_info)
 -{
 -      unsigned long flags;
 -      struct irq_desc *desc = irq_get_desc_lock(irq, &flags, 0);
 -      struct irq_data *data;
 -      struct irq_chip *chip;
 -      int ret = -ENOSYS;
 -
 -      if (!desc)
 -              return -EINVAL;
 -
 -      data = irq_desc_get_irq_data(desc);
 -      chip = irq_data_get_irq_chip(data);
 -      if (chip && chip->irq_set_vcpu_affinity)
 -              ret = chip->irq_set_vcpu_affinity(data, vcpu_info);
 -      irq_put_desc_unlock(desc, flags);
 -
 -      return ret;
 -}
 -EXPORT_SYMBOL_GPL(irq_set_vcpu_affinity);
 -
  static void irq_affinity_notify(struct work_struct *work)
  {
        struct irq_affinity_notify *notify =
@@@ -393,37 -424,6 +393,37 @@@ setup_affinity(struct irq_desc *desc, s
  }
  #endif
  
 +/**
 + *    irq_set_vcpu_affinity - Set vcpu affinity for the interrupt
 + *    @irq: interrupt number to set affinity
 + *    @vcpu_info: vCPU specific data
 + *
 + *    This function uses the vCPU specific data to set the vCPU
 + *    affinity for an irq. The vCPU specific data is passed from
 + *    outside, such as KVM. One example code path is as below:
 + *    KVM -> IOMMU -> irq_set_vcpu_affinity().
 + */
 +int irq_set_vcpu_affinity(unsigned int irq, void *vcpu_info)
 +{
 +      unsigned long flags;
 +      struct irq_desc *desc = irq_get_desc_lock(irq, &flags, 0);
 +      struct irq_data *data;
 +      struct irq_chip *chip;
 +      int ret = -ENOSYS;
 +
 +      if (!desc)
 +              return -EINVAL;
 +
 +      data = irq_desc_get_irq_data(desc);
 +      chip = irq_data_get_irq_chip(data);
 +      if (chip && chip->irq_set_vcpu_affinity)
 +              ret = chip->irq_set_vcpu_affinity(data, vcpu_info);
 +      irq_put_desc_unlock(desc, flags);
 +
 +      return ret;
 +}
 +EXPORT_SYMBOL_GPL(irq_set_vcpu_affinity);
 +
  void __disable_irq(struct irq_desc *desc)
  {
        if (!desc->depth++)
@@@ -730,12 -730,6 +730,12 @@@ static irqreturn_t irq_nested_primary_h
        return IRQ_NONE;
  }
  
 +static irqreturn_t irq_forced_secondary_handler(int irq, void *dev_id)
 +{
 +      WARN(1, "Secondary action handler called for irq %d\n", irq);
 +      return IRQ_NONE;
 +}
 +
  static int irq_wait_for_interrupt(struct irqaction *action)
  {
        set_current_state(TASK_INTERRUPTIBLE);
  static void irq_finalize_oneshot(struct irq_desc *desc,
                                 struct irqaction *action)
  {
 -      if (!(desc->istate & IRQS_ONESHOT))
 +      if (!(desc->istate & IRQS_ONESHOT) ||
 +          action->handler == irq_forced_secondary_handler)
                return;
  again:
        chip_bus_lock(desc);
@@@ -917,18 -910,6 +917,18 @@@ static void irq_thread_dtor(struct call
        irq_finalize_oneshot(desc, action);
  }
  
 +static void irq_wake_secondary(struct irq_desc *desc, struct irqaction *action)
 +{
 +      struct irqaction *secondary = action->secondary;
 +
 +      if (WARN_ON_ONCE(!secondary))
 +              return;
 +
 +      raw_spin_lock_irq(&desc->lock);
 +      __irq_wake_thread(desc, secondary);
 +      raw_spin_unlock_irq(&desc->lock);
 +}
 +
  /*
   * Interrupt handler thread
   */
@@@ -959,8 -940,6 +959,8 @@@ static int irq_thread(void *data
                action_ret = handler_fn(desc, action);
                if (action_ret == IRQ_HANDLED)
                        atomic_inc(&desc->threads_handled);
 +              if (action_ret == IRQ_WAKE_THREAD)
 +                      irq_wake_secondary(desc, action);
  
                wake_threads_waitq(desc);
        }
@@@ -1005,36 -984,20 +1005,36 @@@ void irq_wake_thread(unsigned int irq, 
  }
  EXPORT_SYMBOL_GPL(irq_wake_thread);
  
 -static void irq_setup_forced_threading(struct irqaction *new)
 +static int irq_setup_forced_threading(struct irqaction *new)
  {
        if (!force_irqthreads)
 -              return;
 +              return 0;
        if (new->flags & (IRQF_NO_THREAD | IRQF_PERCPU | IRQF_ONESHOT))
 -              return;
 +              return 0;
  
        new->flags |= IRQF_ONESHOT;
  
 -      if (!new->thread_fn) {
 -              set_bit(IRQTF_FORCED_THREAD, &new->thread_flags);
 -              new->thread_fn = new->handler;
 -              new->handler = irq_default_primary_handler;
 +      /*
 +       * Handle the case where we have a real primary handler and a
 +       * thread handler. We force thread them as well by creating a
 +       * secondary action.
 +       */
 +      if (new->handler != irq_default_primary_handler && new->thread_fn) {
 +              /* Allocate the secondary action */
 +              new->secondary = kzalloc(sizeof(struct irqaction), GFP_KERNEL);
 +              if (!new->secondary)
 +                      return -ENOMEM;
 +              new->secondary->handler = irq_forced_secondary_handler;
 +              new->secondary->thread_fn = new->thread_fn;
 +              new->secondary->dev_id = new->dev_id;
 +              new->secondary->irq = new->irq;
 +              new->secondary->name = new->name;
        }
 +      /* Deal with the primary handler */
 +      set_bit(IRQTF_FORCED_THREAD, &new->thread_flags);
 +      new->thread_fn = new->handler;
 +      new->handler = irq_default_primary_handler;
 +      return 0;
  }
  
  static int irq_request_resources(struct irq_desc *desc)
@@@ -1054,48 -1017,6 +1054,48 @@@ static void irq_release_resources(struc
                c->irq_release_resources(d);
  }
  
 +static int
 +setup_irq_thread(struct irqaction *new, unsigned int irq, bool secondary)
 +{
 +      struct task_struct *t;
 +      struct sched_param param = {
 +              .sched_priority = MAX_USER_RT_PRIO/2,
 +      };
 +
 +      if (!secondary) {
 +              t = kthread_create(irq_thread, new, "irq/%d-%s", irq,
 +                                 new->name);
 +      } else {
 +              t = kthread_create(irq_thread, new, "irq/%d-s-%s", irq,
 +                                 new->name);
 +              param.sched_priority -= 1;
 +      }
 +
 +      if (IS_ERR(t))
 +              return PTR_ERR(t);
 +
 +      sched_setscheduler_nocheck(t, SCHED_FIFO, &param);
 +
 +      /*
 +       * We keep the reference to the task struct even if
 +       * the thread dies to avoid that the interrupt code
 +       * references an already freed task_struct.
 +       */
 +      get_task_struct(t);
 +      new->thread = t;
 +      /*
 +       * Tell the thread to set its affinity. This is
 +       * important for shared interrupt handlers as we do
 +       * not invoke setup_affinity() for the secondary
 +       * handlers as everything is already set up. Even for
 +       * interrupts marked with IRQF_NO_BALANCE this is
 +       * correct as we want the thread to move to the cpu(s)
 +       * on which the requesting code placed the interrupt.
 +       */
 +      set_bit(IRQTF_AFFINITY, &new->thread_flags);
 +      return 0;
 +}
 +
  /*
   * Internal function to register an irqaction - typically used to
   * allocate special interrupts that are part of the architecture.
@@@ -1116,8 -1037,6 +1116,8 @@@ __setup_irq(unsigned int irq, struct ir
        if (!try_module_get(desc->owner))
                return -ENODEV;
  
 +      new->irq = irq;
 +
        /*
         * Check whether the interrupt nests into another interrupt
         * thread.
                 */
                new->handler = irq_nested_primary_handler;
        } else {
 -              if (irq_settings_can_thread(desc))
 -                      irq_setup_forced_threading(new);
 +              if (irq_settings_can_thread(desc)) {
 +                      ret = irq_setup_forced_threading(new);
 +                      if (ret)
 +                              goto out_mput;
 +              }
        }
  
        /*
         * thread.
         */
        if (new->thread_fn && !nested) {
 -              struct task_struct *t;
 -              static const struct sched_param param = {
 -                      .sched_priority = MAX_USER_RT_PRIO/2,
 -              };
 -
 -              t = kthread_create(irq_thread, new, "irq/%d-%s", irq,
 -                                 new->name);
 -              if (IS_ERR(t)) {
 -                      ret = PTR_ERR(t);
 +              ret = setup_irq_thread(new, irq, false);
 +              if (ret)
                        goto out_mput;
 +              if (new->secondary) {
 +                      ret = setup_irq_thread(new->secondary, irq, true);
 +                      if (ret)
 +                              goto out_thread;
                }
 -
 -              sched_setscheduler_nocheck(t, SCHED_FIFO, &param);
 -
 -              /*
 -               * We keep the reference to the task struct even if
 -               * the thread dies to avoid that the interrupt code
 -               * references an already freed task_struct.
 -               */
 -              get_task_struct(t);
 -              new->thread = t;
 -              /*
 -               * Tell the thread to set its affinity. This is
 -               * important for shared interrupt handlers as we do
 -               * not invoke setup_affinity() for the secondary
 -               * handlers as everything is already set up. Even for
 -               * interrupts marked with IRQF_NO_BALANCE this is
 -               * correct as we want the thread to move to the cpu(s)
 -               * on which the requesting code placed the interrupt.
 -               */
 -              set_bit(IRQTF_AFFINITY, &new->thread_flags);
        }
  
        if (!alloc_cpumask_var(&mask, GFP_KERNEL)) {
                                   irq, nmsk, omsk);
        }
  
 -      new->irq = irq;
        *old_ptr = new;
  
        irq_pm_install_action(desc, new);
         */
        if (new->thread)
                wake_up_process(new->thread);
 +      if (new->secondary)
 +              wake_up_process(new->secondary->thread);
  
        register_irq_proc(irq, desc);
        new->dir = NULL;
@@@ -1385,13 -1323,6 +1385,13 @@@ out_thread
                kthread_stop(t);
                put_task_struct(t);
        }
 +      if (new->secondary && new->secondary->thread) {
 +              struct task_struct *t = new->secondary->thread;
 +
 +              new->secondary->thread = NULL;
 +              kthread_stop(t);
 +              put_task_struct(t);
 +      }
  out_mput:
        module_put(desc->owner);
        return ret;
@@@ -1463,7 -1394,6 +1463,7 @@@ static struct irqaction *__free_irq(uns
  
        /* If this was the last handler, shut down the IRQ line: */
        if (!desc->action) {
 +              irq_settings_clr_disable_unlazy(desc);
                irq_shutdown(desc);
                irq_release_resources(desc);
        }
        if (action->thread) {
                kthread_stop(action->thread);
                put_task_struct(action->thread);
 +              if (action->secondary && action->secondary->thread) {
 +                      kthread_stop(action->secondary->thread);
 +                      put_task_struct(action->secondary->thread);
 +              }
        }
  
        module_put(desc->owner);
 +      kfree(action->secondary);
        return action;
  }
  
@@@ -1651,10 -1576,8 +1651,10 @@@ int request_threaded_irq(unsigned int i
        retval = __setup_irq(irq, desc, action);
        chip_bus_sync_unlock(desc);
  
 -      if (retval)
 +      if (retval) {
 +              kfree(action->secondary);
                kfree(action);
 +      }
  
  #ifdef CONFIG_DEBUG_SHIRQ_FIXME
        if (!retval && (irqflags & IRQF_SHARED)) {
@@@ -1838,6 -1761,7 +1838,7 @@@ void free_percpu_irq(unsigned int irq, 
        kfree(__free_percpu_irq(irq, dev_id));
        chip_bus_sync_unlock(desc);
  }
+ EXPORT_SYMBOL_GPL(free_percpu_irq);
  
  /**
   *    setup_percpu_irq - setup a per-cpu interrupt
@@@ -1867,9 -1791,10 +1868,10 @@@ int setup_percpu_irq(unsigned int irq, 
   *    @devname: An ascii name for the claiming device
   *    @dev_id: A percpu cookie passed back to the handler function
   *
-  *    This call allocates interrupt resources, but doesn't
-  *    automatically enable the interrupt. It has to be done on each
-  *    CPU using enable_percpu_irq().
+  *    This call allocates interrupt resources and enables the
+  *    interrupt on the local CPU. If the interrupt is supposed to be
+  *    enabled on other CPUs, it has to be done on each CPU using
+  *    enable_percpu_irq().
   *
   *    Dev_id must be globally unique. It is a per-cpu variable, and
   *    the handler gets called with the interrupted CPU's instance of
@@@ -1908,6 -1833,7 +1910,7 @@@ int request_percpu_irq(unsigned int irq
  
        return retval;
  }
+ EXPORT_SYMBOL_GPL(request_percpu_irq);
  
  /**
   *    irq_get_irqchip_state - returns the irqchip state of a interrupt.