Merge git://git.kernel.org/pub/scm/linux/kernel/git/davem/net
author    David S. Miller <davem@davemloft.net>
Tue, 14 Apr 2015 19:44:14 +0000 (15:44 -0400)
committer David S. Miller <davem@davemloft.net>
Tue, 14 Apr 2015 19:44:14 +0000 (15:44 -0400)
The dwmac-socfpga.c conflict was a case of a bug fix overlapping
changes in net-next to handle an error pointer differently.

Signed-off-by: David S. Miller <davem@davemloft.net>
drivers/net/ethernet/emulex/benet/be.h
drivers/net/ethernet/emulex/benet/be_main.c
drivers/net/vxlan.c
include/linux/brcmphy.h
net/ipv4/fou.c
net/ipv4/geneve.c
net/ipv4/tcp_output.c
net/ipv6/ip6_vti.c
net/rds/rds.h
net/rds/send.c

diff --cc drivers/net/ethernet/emulex/benet/be.h
index 4b0494b9cc7cf034e8ebdc190d08e46a8a1e790e,204ec43438c4d8edc6ee2e9a0bdaa39023a9718e..1bf1cdce74ac3591d4a2011e6be9399c4a5cdf57
  #include <linux/firmware.h>
  #include <linux/slab.h>
  #include <linux/u64_stats_sync.h>
 +#include <linux/cpumask.h>
  
  #include "be_hw.h"
  #include "be_roce.h"
  
 -#define DRV_VER                       "10.4u"
 +#define DRV_VER                       "10.6.0.1"
  #define DRV_NAME              "be2net"
  #define BE_NAME                       "Emulex BladeEngine2"
  #define BE3_NAME              "Emulex BladeEngine3"
@@@ -88,7 -87,6 +88,7 @@@
  #define BE3_MAX_EVT_QS                16
  #define BE3_SRIOV_MAX_EVT_QS  8
  
 +#define MAX_RSS_IFACES                15
  #define MAX_RX_QS             32
  #define MAX_EVT_QS            32
  #define MAX_TX_QS             32
@@@ -99,6 -97,7 +99,7 @@@
  #define BE_NAPI_WEIGHT                64
  #define MAX_RX_POST           BE_NAPI_WEIGHT /* Frags posted at a time */
  #define RX_FRAGS_REFILL_WM    (RX_Q_LEN - MAX_RX_POST)
+ #define MAX_NUM_POST_ERX_DB   255u
  
  #define MAX_VFS                       30 /* Max VFs supported by BE3 FW */
  #define FW_VER_LEN            32
@@@ -184,7 -183,6 +185,7 @@@ struct be_eq_obj 
        u16 spurious_intr;
        struct napi_struct napi;
        struct be_adapter *adapter;
 +      cpumask_var_t  affinity_mask;
  
  #ifdef CONFIG_NET_RX_BUSY_POLL
  #define BE_EQ_IDLE            0
@@@ -241,17 -239,10 +242,17 @@@ struct be_tx_stats 
        struct u64_stats_sync sync_compl;
  };
  
 +/* Structure to hold some data of interest obtained from a TX CQE */
 +struct be_tx_compl_info {
 +      u8 status;              /* Completion status */
 +      u16 end_index;          /* Completed TXQ Index */
 +};
 +
  struct be_tx_obj {
        u32 db_offset;
        struct be_queue_info q;
        struct be_queue_info cq;
 +      struct be_tx_compl_info txcp;
        /* Remember the skbs that were transmitted */
        struct sk_buff *sent_skb_list[TX_Q_LEN];
        struct be_tx_stats stats;
@@@ -380,7 -371,6 +381,7 @@@ enum vf_state 
  #define BE_FLAGS_VXLAN_OFFLOADS                       BIT(8)
  #define BE_FLAGS_SETUP_DONE                   BIT(9)
  #define BE_FLAGS_EVT_INCOMPATIBLE_SFP         BIT(10)
 +#define BE_FLAGS_ERR_DETECTION_SCHEDULED      BIT(11)
  
  #define BE_UC_PMAC_COUNT                      30
  #define BE_VF_UC_PMAC_COUNT                   2
@@@ -415,11 -405,8 +416,11 @@@ struct be_resources 
        u16 max_tx_qs;
        u16 max_rss_qs;
        u16 max_rx_qs;
 +      u16 max_cq_count;
        u16 max_uc_mac;         /* Max UC MACs programmable */
        u16 max_vlans;          /* Number of vlans supported */
 +      u16 max_iface_count;
 +      u16 max_mcc_count;
        u16 max_evt_qs;
        u32 if_cap_flags;
        u32 vf_if_cap_flags;    /* VF if capability flags */
@@@ -432,39 -419,6 +433,39 @@@ struct rss_info 
        u8 rss_hkey[RSS_HASH_KEY_LEN];
  };
  
 +/* Macros to read/write the 'features' word of the be_wrb_params
 + * structure.
 + */
 +#define       BE_WRB_F_BIT(name)                      BE_WRB_F_##name##_BIT
 +#define       BE_WRB_F_MASK(name)                     BIT_MASK(BE_WRB_F_##name##_BIT)
 +
 +#define       BE_WRB_F_GET(word, name)        \
 +      (((word) & (BE_WRB_F_MASK(name))) >> BE_WRB_F_BIT(name))
 +
 +#define       BE_WRB_F_SET(word, name, val)   \
 +      ((word) |= (((val) << BE_WRB_F_BIT(name)) & BE_WRB_F_MASK(name)))
 +
 +/* Feature/offload bits */
 +enum {
 +      BE_WRB_F_CRC_BIT,               /* Ethernet CRC */
 +      BE_WRB_F_IPCS_BIT,              /* IP csum */
 +      BE_WRB_F_TCPCS_BIT,             /* TCP csum */
 +      BE_WRB_F_UDPCS_BIT,             /* UDP csum */
 +      BE_WRB_F_LSO_BIT,               /* LSO */
 +      BE_WRB_F_LSO6_BIT,              /* LSO6 */
 +      BE_WRB_F_VLAN_BIT,              /* VLAN */
 +      BE_WRB_F_VLAN_SKIP_HW_BIT       /* Skip VLAN tag (workaround) */
 +};
 +
 +/* The structure below provides a HW-agnostic abstraction of WRB params
 + * retrieved from a TX skb. This is in turn passed to chip specific routines
 + * during transmit, to set the corresponding params in the WRB.
 + */
 +struct be_wrb_params {
 +      u32 features;   /* Feature bits */
 +      u16 vlan_tag;   /* VLAN tag */
 +      u16 lso_mss;    /* MSS for LSO */
 +};
 +
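
A minimal sketch of how these accessors are meant to be used (illustrative
only; the real call sites are in be_main.c below). BE_WRB_F_SET() ORs a
field's value into the features word, and BE_WRB_F_GET() masks and shifts
it back out:

	u32 features = 0;

	BE_WRB_F_SET(features, TCPCS, 1);	/* request TCP csum offload */
	BE_WRB_F_SET(features, VLAN, 1);	/* request HW VLAN tagging */

	/* BE_WRB_F_GET(features, TCPCS) == 1; BE_WRB_F_GET(features, LSO) == 0 */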
  struct be_adapter {
        struct pci_dev *pdev;
        struct net_device *netdev;
  
        /* Rx rings */
        u16 num_rx_qs;
 +      u16 num_rss_qs;
 +      u16 need_def_rxq;
        struct be_rx_obj rx_obj[MAX_RX_QS];
        u32 big_page_size;      /* Compounded page size shared by rx wrbs */
  
        struct delayed_work work;
        u16 work_counter;
  
 -      struct delayed_work func_recovery_work;
 +      struct delayed_work be_err_detection_work;
        u32 flags;
        u32 cmd_privileges;
        /* Ethtool knobs and info */
@@@ -645,8 -597,9 +646,8 @@@ extern const struct ethtool_ops be_etht
        for (i = 0, rxo = &adapter->rx_obj[i]; i < adapter->num_rx_qs;  \
                i++, rxo++)
  
 -/* Skip the default non-rss queue (last one)*/
  #define for_all_rss_queues(adapter, rxo, i)                           \
 -      for (i = 0, rxo = &adapter->rx_obj[i]; i < (adapter->num_rx_qs - 1);\
 +      for (i = 0, rxo = &adapter->rx_obj[i]; i < adapter->num_rss_qs; \
                i++, rxo++)
  
  #define for_all_tx_queues(adapter, txo, i)                            \
diff --cc drivers/net/ethernet/emulex/benet/be_main.c
index 5ff7fba9b67c9d39043d1094193db714f7625a6b,ad2b5094a498f2b53be8700e52b330ffba5e364a..fb0bc3c3620e9cf87983b1c425e0f24d431bffc9
@@@ -30,9 -30,6 +30,9 @@@ MODULE_DESCRIPTION(DRV_DESC " " DRV_VER
  MODULE_AUTHOR("Emulex Corporation");
  MODULE_LICENSE("GPL");
  
 +/* num_vfs module param is obsolete.
 + * Use sysfs method to enable/disable VFs.
 + */
  static unsigned int num_vfs;
  module_param(num_vfs, uint, S_IRUGO);
  MODULE_PARM_DESC(num_vfs, "Number of PCI VFs to initialize");
@@@ -730,86 -727,48 +730,86 @@@ static u16 skb_ip_proto(struct sk_buff 
                ip_hdr(skb)->protocol : ipv6_hdr(skb)->nexthdr;
  }
  
 -static void wrb_fill_hdr(struct be_adapter *adapter, struct be_eth_hdr_wrb *hdr,
 -                       struct sk_buff *skb, u32 wrb_cnt, u32 len,
 -                       bool skip_hw_vlan)
 +static inline bool be_is_txq_full(struct be_tx_obj *txo)
  {
 -      u16 vlan_tag, proto;
 +      return atomic_read(&txo->q.used) + BE_MAX_TX_FRAG_COUNT >= txo->q.len;
 +}
  
 -      memset(hdr, 0, sizeof(*hdr));
 +static inline bool be_can_txq_wake(struct be_tx_obj *txo)
 +{
 +      return atomic_read(&txo->q.used) < txo->q.len / 2;
 +}
 +
 +static inline bool be_is_tx_compl_pending(struct be_tx_obj *txo)
 +{
 +      return atomic_read(&txo->q.used) > txo->pend_wrb_cnt;
 +}
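
A worked example of the watermarks these helpers encode, assuming
illustrative values of TX_Q_LEN = 2048 and BE_MAX_TX_FRAG_COUNT = 35 (both
defined in be.h): be_is_txq_full() reports full once used >= 2048 - 35 =
2013, the point at which a worst-case skb might no longer fit, while
be_can_txq_wake() only lets the queue restart after usage drains below
2048 / 2 = 1024. The wide gap between the stop and wake thresholds gives
hysteresis, so the subqueue is not toggled on every completion.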
  
 -      SET_TX_WRB_HDR_BITS(crc, hdr, 1);
 +static void be_get_wrb_params_from_skb(struct be_adapter *adapter,
 +                                     struct sk_buff *skb,
 +                                     struct be_wrb_params *wrb_params)
 +{
 +      u16 proto;
  
        if (skb_is_gso(skb)) {
 -              SET_TX_WRB_HDR_BITS(lso, hdr, 1);
 -              SET_TX_WRB_HDR_BITS(lso_mss, hdr, skb_shinfo(skb)->gso_size);
 +              BE_WRB_F_SET(wrb_params->features, LSO, 1);
 +              wrb_params->lso_mss = skb_shinfo(skb)->gso_size;
                if (skb_is_gso_v6(skb) && !lancer_chip(adapter))
 -                      SET_TX_WRB_HDR_BITS(lso6, hdr, 1);
 +                      BE_WRB_F_SET(wrb_params->features, LSO6, 1);
        } else if (skb->ip_summed == CHECKSUM_PARTIAL) {
                if (skb->encapsulation) {
 -                      SET_TX_WRB_HDR_BITS(ipcs, hdr, 1);
 +                      BE_WRB_F_SET(wrb_params->features, IPCS, 1);
                        proto = skb_inner_ip_proto(skb);
                } else {
                        proto = skb_ip_proto(skb);
                }
                if (proto == IPPROTO_TCP)
 -                      SET_TX_WRB_HDR_BITS(tcpcs, hdr, 1);
 +                      BE_WRB_F_SET(wrb_params->features, TCPCS, 1);
                else if (proto == IPPROTO_UDP)
 -                      SET_TX_WRB_HDR_BITS(udpcs, hdr, 1);
 +                      BE_WRB_F_SET(wrb_params->features, UDPCS, 1);
        }
  
        if (skb_vlan_tag_present(skb)) {
 -              SET_TX_WRB_HDR_BITS(vlan, hdr, 1);
 -              vlan_tag = be_get_tx_vlan_tag(adapter, skb);
 -              SET_TX_WRB_HDR_BITS(vlan_tag, hdr, vlan_tag);
 +              BE_WRB_F_SET(wrb_params->features, VLAN, 1);
 +              wrb_params->vlan_tag = be_get_tx_vlan_tag(adapter, skb);
        }
  
 -      SET_TX_WRB_HDR_BITS(num_wrb, hdr, wrb_cnt);
 -      SET_TX_WRB_HDR_BITS(len, hdr, len);
 +      BE_WRB_F_SET(wrb_params->features, CRC, 1);
 +}
 +
 +static void wrb_fill_hdr(struct be_adapter *adapter,
 +                       struct be_eth_hdr_wrb *hdr,
 +                       struct be_wrb_params *wrb_params,
 +                       struct sk_buff *skb)
 +{
 +      memset(hdr, 0, sizeof(*hdr));
  
 -      /* Hack to skip HW VLAN tagging needs evt = 1, compl = 0
 -       * When this hack is not needed, the evt bit is set while ringing DB
 +      SET_TX_WRB_HDR_BITS(crc, hdr,
 +                          BE_WRB_F_GET(wrb_params->features, CRC));
 +      SET_TX_WRB_HDR_BITS(ipcs, hdr,
 +                          BE_WRB_F_GET(wrb_params->features, IPCS));
 +      SET_TX_WRB_HDR_BITS(tcpcs, hdr,
 +                          BE_WRB_F_GET(wrb_params->features, TCPCS));
 +      SET_TX_WRB_HDR_BITS(udpcs, hdr,
 +                          BE_WRB_F_GET(wrb_params->features, UDPCS));
 +
 +      SET_TX_WRB_HDR_BITS(lso, hdr,
 +                          BE_WRB_F_GET(wrb_params->features, LSO));
 +      SET_TX_WRB_HDR_BITS(lso6, hdr,
 +                          BE_WRB_F_GET(wrb_params->features, LSO6));
 +      SET_TX_WRB_HDR_BITS(lso_mss, hdr, wrb_params->lso_mss);
 +
 +      /* Hack to skip HW VLAN tagging needs evt = 1, compl = 0. When this
 +       * hack is not needed, the evt bit is set while ringing DB.
         */
 -      if (skip_hw_vlan)
 -              SET_TX_WRB_HDR_BITS(event, hdr, 1);
 +      SET_TX_WRB_HDR_BITS(event, hdr,
 +                          BE_WRB_F_GET(wrb_params->features, VLAN_SKIP_HW));
 +      SET_TX_WRB_HDR_BITS(vlan, hdr,
 +                          BE_WRB_F_GET(wrb_params->features, VLAN));
 +      SET_TX_WRB_HDR_BITS(vlan_tag, hdr, wrb_params->vlan_tag);
 +
 +      SET_TX_WRB_HDR_BITS(num_wrb, hdr, skb_wrb_cnt(skb));
 +      SET_TX_WRB_HDR_BITS(len, hdr, skb->len);
  }
  
  static void unmap_tx_frag(struct device *dev, struct be_eth_wrb *wrb,
        }
  }
  
 -/* Returns the number of WRBs used up by the skb */
 +/* Grab a WRB header for xmit */
 +static u16 be_tx_get_wrb_hdr(struct be_tx_obj *txo)
 +{
 +      u16 head = txo->q.head;
 +
 +      queue_head_inc(&txo->q);
 +      return head;
 +}
 +
 +/* Set up the WRB header for xmit */
 +static void be_tx_setup_wrb_hdr(struct be_adapter *adapter,
 +                              struct be_tx_obj *txo,
 +                              struct be_wrb_params *wrb_params,
 +                              struct sk_buff *skb, u16 head)
 +{
 +      u32 num_frags = skb_wrb_cnt(skb);
 +      struct be_queue_info *txq = &txo->q;
 +      struct be_eth_hdr_wrb *hdr = queue_index_node(txq, head);
 +
 +      wrb_fill_hdr(adapter, hdr, wrb_params, skb);
 +      be_dws_cpu_to_le(hdr, sizeof(*hdr));
 +
 +      BUG_ON(txo->sent_skb_list[head]);
 +      txo->sent_skb_list[head] = skb;
 +      txo->last_req_hdr = head;
 +      atomic_add(num_frags, &txq->used);
 +      txo->last_req_wrb_cnt = num_frags;
 +      txo->pend_wrb_cnt += num_frags;
 +}
 +
 +/* Setup a WRB fragment (buffer descriptor) for xmit */
 +static void be_tx_setup_wrb_frag(struct be_tx_obj *txo, dma_addr_t busaddr,
 +                               int len)
 +{
 +      struct be_eth_wrb *wrb;
 +      struct be_queue_info *txq = &txo->q;
 +
 +      wrb = queue_head_node(txq);
 +      wrb_fill(wrb, busaddr, len);
 +      queue_head_inc(txq);
 +}
 +
 +/* Bring the queue back to the state it was in before be_xmit_enqueue() routine
 + * was invoked. The producer index is restored to the previous packet and the
 + * WRBs of the current packet are unmapped. Invoked to handle tx setup errors.
 + */
 +static void be_xmit_restore(struct be_adapter *adapter,
 +                          struct be_tx_obj *txo, u16 head, bool map_single,
 +                          u32 copied)
 +{
 +      struct device *dev;
 +      struct be_eth_wrb *wrb;
 +      struct be_queue_info *txq = &txo->q;
 +
 +      dev = &adapter->pdev->dev;
 +      txq->head = head;
 +
 +      /* skip the first wrb (hdr); it's not mapped */
 +      queue_head_inc(txq);
 +      while (copied) {
 +              wrb = queue_head_node(txq);
 +              unmap_tx_frag(dev, wrb, map_single);
 +              map_single = false;
 +              copied -= le32_to_cpu(wrb->frag_len);
 +              queue_head_inc(txq);
 +      }
 +
 +      txq->head = head;
 +}
 +
 +/* Enqueue the given packet for transmit. This routine allocates WRBs for the
 + * packet, dma maps the packet buffers and sets up the WRBs. Returns the number
 + * of WRBs used up by the packet.
 + */
  static u32 be_xmit_enqueue(struct be_adapter *adapter, struct be_tx_obj *txo,
 -                         struct sk_buff *skb, bool skip_hw_vlan)
 +                         struct sk_buff *skb,
 +                         struct be_wrb_params *wrb_params)
  {
        u32 i, copied = 0, wrb_cnt = skb_wrb_cnt(skb);
        struct device *dev = &adapter->pdev->dev;
        struct be_queue_info *txq = &txo->q;
 -      struct be_eth_hdr_wrb *hdr;
        bool map_single = false;
 -      struct be_eth_wrb *wrb;
 -      dma_addr_t busaddr;
        u16 head = txq->head;
 +      dma_addr_t busaddr;
 +      int len;
  
 -      hdr = queue_head_node(txq);
 -      wrb_fill_hdr(adapter, hdr, skb, wrb_cnt, skb->len, skip_hw_vlan);
 -      be_dws_cpu_to_le(hdr, sizeof(*hdr));
 -
 -      queue_head_inc(txq);
 +      head = be_tx_get_wrb_hdr(txo);
  
        if (skb->len > skb->data_len) {
 -              int len = skb_headlen(skb);
 +              len = skb_headlen(skb);
  
                busaddr = dma_map_single(dev, skb->data, len, DMA_TO_DEVICE);
                if (dma_mapping_error(dev, busaddr))
                        goto dma_err;
                map_single = true;
 -              wrb = queue_head_node(txq);
 -              wrb_fill(wrb, busaddr, len);
 -              queue_head_inc(txq);
 +              be_tx_setup_wrb_frag(txo, busaddr, len);
                copied += len;
        }
  
        for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
                const struct skb_frag_struct *frag = &skb_shinfo(skb)->frags[i];
 +              len = skb_frag_size(frag);
  
 -              busaddr = skb_frag_dma_map(dev, frag, 0,
 -                                         skb_frag_size(frag), DMA_TO_DEVICE);
 +              busaddr = skb_frag_dma_map(dev, frag, 0, len, DMA_TO_DEVICE);
                if (dma_mapping_error(dev, busaddr))
                        goto dma_err;
 -              wrb = queue_head_node(txq);
 -              wrb_fill(wrb, busaddr, skb_frag_size(frag));
 -              queue_head_inc(txq);
 -              copied += skb_frag_size(frag);
 +              be_tx_setup_wrb_frag(txo, busaddr, len);
 +              copied += len;
        }
  
 -      BUG_ON(txo->sent_skb_list[head]);
 -      txo->sent_skb_list[head] = skb;
 -      txo->last_req_hdr = head;
 -      atomic_add(wrb_cnt, &txq->used);
 -      txo->last_req_wrb_cnt = wrb_cnt;
 -      txo->pend_wrb_cnt += wrb_cnt;
 +      be_tx_setup_wrb_hdr(adapter, txo, wrb_params, skb, head);
  
        be_tx_stats_update(txo, skb);
        return wrb_cnt;
  
  dma_err:
 -      /* Bring the queue back to the state it was in before this
 -       * routine was invoked.
 -       */
 -      txq->head = head;
 -      /* skip the first wrb (hdr); it's not mapped */
 -      queue_head_inc(txq);
 -      while (copied) {
 -              wrb = queue_head_node(txq);
 -              unmap_tx_frag(dev, wrb, map_single);
 -              map_single = false;
 -              copied -= le32_to_cpu(wrb->frag_len);
 -              adapter->drv_stats.dma_map_errors++;
 -              queue_head_inc(txq);
 -      }
 -      txq->head = head;
 +      adapter->drv_stats.dma_map_errors++;
 +      be_xmit_restore(adapter, txo, head, map_single, copied);
        return 0;
  }
  
@@@ -957,8 -869,7 +957,8 @@@ static inline int qnq_async_evt_rcvd(st
  
  static struct sk_buff *be_insert_vlan_in_pkt(struct be_adapter *adapter,
                                             struct sk_buff *skb,
 -                                           bool *skip_hw_vlan)
 +                                           struct be_wrb_params
 +                                           *wrb_params)
  {
        u16 vlan_tag = 0;
  
                /* f/w workaround to set skip_hw_vlan = 1, informs the F/W to
                 * skip VLAN insertion
                 */
 -              if (skip_hw_vlan)
 -                      *skip_hw_vlan = true;
 +              BE_WRB_F_SET(wrb_params->features, VLAN_SKIP_HW, 1);
        }
  
        if (vlan_tag) {
                                                vlan_tag);
                if (unlikely(!skb))
                        return skb;
 -              if (skip_hw_vlan)
 -                      *skip_hw_vlan = true;
 +              BE_WRB_F_SET(wrb_params->features, VLAN_SKIP_HW, 1);
        }
  
        return skb;
@@@ -1033,8 -946,7 +1033,8 @@@ static int be_ipv6_tx_stall_chk(struct 
  
  static struct sk_buff *be_lancer_xmit_workarounds(struct be_adapter *adapter,
                                                  struct sk_buff *skb,
 -                                                bool *skip_hw_vlan)
 +                                                struct be_wrb_params
 +                                                *wrb_params)
  {
        struct vlan_ethhdr *veh = (struct vlan_ethhdr *)skb->data;
        unsigned int eth_hdr_len;
         */
        if (be_pvid_tagging_enabled(adapter) &&
            veh->h_vlan_proto == htons(ETH_P_8021Q))
 -              *skip_hw_vlan = true;
 +              BE_WRB_F_SET(wrb_params->features, VLAN_SKIP_HW, 1);
  
        /* HW has a bug wherein it will calculate CSUM for VLAN
         * pkts even though it is disabled.
         */
        if (skb->ip_summed != CHECKSUM_PARTIAL &&
            skb_vlan_tag_present(skb)) {
 -              skb = be_insert_vlan_in_pkt(adapter, skb, skip_hw_vlan);
 +              skb = be_insert_vlan_in_pkt(adapter, skb, wrb_params);
                if (unlikely(!skb))
                        goto err;
        }
         */
        if (be_ipv6_tx_stall_chk(adapter, skb) &&
            be_vlan_tag_tx_chk(adapter, skb)) {
 -              skb = be_insert_vlan_in_pkt(adapter, skb, skip_hw_vlan);
 +              skb = be_insert_vlan_in_pkt(adapter, skb, wrb_params);
                if (unlikely(!skb))
                        goto err;
        }
@@@ -1102,7 -1014,7 +1102,7 @@@ err
  
  static struct sk_buff *be_xmit_workarounds(struct be_adapter *adapter,
                                           struct sk_buff *skb,
 -                                         bool *skip_hw_vlan)
 +                                         struct be_wrb_params *wrb_params)
  {
        /* Lancer, SH-R ASICs have a bug wherein Packets that are 32 bytes or
         * less may cause a transmit stall on that port. So the work-around is
        }
  
        if (BEx_chip(adapter) || lancer_chip(adapter)) {
 -              skb = be_lancer_xmit_workarounds(adapter, skb, skip_hw_vlan);
 +              skb = be_lancer_xmit_workarounds(adapter, skb, wrb_params);
                if (!skb)
                        return NULL;
        }
@@@ -1148,26 -1060,24 +1148,26 @@@ static void be_xmit_flush(struct be_ada
  
  static netdev_tx_t be_xmit(struct sk_buff *skb, struct net_device *netdev)
  {
 -      bool skip_hw_vlan = false, flush = !skb->xmit_more;
        struct be_adapter *adapter = netdev_priv(netdev);
        u16 q_idx = skb_get_queue_mapping(skb);
        struct be_tx_obj *txo = &adapter->tx_obj[q_idx];
 -      struct be_queue_info *txq = &txo->q;
 +      struct be_wrb_params wrb_params = { 0 };
 +      bool flush = !skb->xmit_more;
        u16 wrb_cnt;
  
 -      skb = be_xmit_workarounds(adapter, skb, &skip_hw_vlan);
 +      skb = be_xmit_workarounds(adapter, skb, &wrb_params);
        if (unlikely(!skb))
                goto drop;
  
 -      wrb_cnt = be_xmit_enqueue(adapter, txo, skb, skip_hw_vlan);
 +      be_get_wrb_params_from_skb(adapter, skb, &wrb_params);
 +
 +      wrb_cnt = be_xmit_enqueue(adapter, txo, skb, &wrb_params);
        if (unlikely(!wrb_cnt)) {
                dev_kfree_skb_any(skb);
                goto drop;
        }
  
 -      if ((atomic_read(&txq->used) + BE_MAX_TX_FRAG_COUNT) >= txq->len) {
 +      if (be_is_txq_full(txo)) {
                netif_stop_subqueue(netdev, q_idx);
                tx_stats(txo)->tx_stops++;
        }
@@@ -2122,7 -2032,7 +2122,7 @@@ static void be_post_rx_frags(struct be_
                if (rxo->rx_post_starved)
                        rxo->rx_post_starved = false;
                do {
-                       notify = min(256u, posted);
+                       notify = min(MAX_NUM_POST_ERX_DB, posted);
                        be_rxq_notify(adapter, rxq->id, notify);
                        posted -= notify;
                } while (posted);
        }
  }
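
As a worked example of the chunked notify above: if posted is 300, the loop
rings the ERX doorbell with min(255, 300) = 255, leaving 45, then rings it
once more with 45. Lowering the cap from the hard-coded 256u to
MAX_NUM_POST_ERX_DB (255u) suggests the doorbell's posted-count field
cannot represent 256.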
  
 -static struct be_eth_tx_compl *be_tx_compl_get(struct be_queue_info *tx_cq)
 +static struct be_tx_compl_info *be_tx_compl_get(struct be_tx_obj *txo)
  {
 -      struct be_eth_tx_compl *txcp = queue_tail_node(tx_cq);
 +      struct be_queue_info *tx_cq = &txo->cq;
 +      struct be_tx_compl_info *txcp = &txo->txcp;
 +      struct be_eth_tx_compl *compl = queue_tail_node(tx_cq);
  
 -      if (txcp->dw[offsetof(struct amap_eth_tx_compl, valid) / 32] == 0)
 +      if (compl->dw[offsetof(struct amap_eth_tx_compl, valid) / 32] == 0)
                return NULL;
  
 +      /* Ensure load ordering of valid bit dword and other dwords below */
        rmb();
 -      be_dws_le_to_cpu(txcp, sizeof(*txcp));
 +      be_dws_le_to_cpu(compl, sizeof(*compl));
  
 -      txcp->dw[offsetof(struct amap_eth_tx_compl, valid) / 32] = 0;
 +      txcp->status = GET_TX_COMPL_BITS(status, compl);
 +      txcp->end_index = GET_TX_COMPL_BITS(wrb_index, compl);
  
 +      compl->dw[offsetof(struct amap_eth_tx_compl, valid) / 32] = 0;
        queue_tail_inc(tx_cq);
        return txcp;
  }
@@@ -2269,9 -2174,9 +2269,9 @@@ static void be_tx_compl_clean(struct be
  {
        u16 end_idx, notified_idx, cmpl = 0, timeo = 0, num_wrbs = 0;
        struct device *dev = &adapter->pdev->dev;
 -      struct be_tx_obj *txo;
 +      struct be_tx_compl_info *txcp;
        struct be_queue_info *txq;
 -      struct be_eth_tx_compl *txcp;
 +      struct be_tx_obj *txo;
        int i, pending_txqs;
  
        /* Stop polling for compls when HW has been silent for 10ms */
                        cmpl = 0;
                        num_wrbs = 0;
                        txq = &txo->q;
 -                      while ((txcp = be_tx_compl_get(&txo->cq))) {
 -                              end_idx = GET_TX_COMPL_BITS(wrb_index, txcp);
 -                              num_wrbs += be_tx_compl_process(adapter, txo,
 -                                                              end_idx);
 +                      while ((txcp = be_tx_compl_get(txo))) {
 +                              num_wrbs +=
 +                                      be_tx_compl_process(adapter, txo,
 +                                                          txcp->end_index);
                                cmpl++;
                        }
                        if (cmpl) {
                                atomic_sub(num_wrbs, &txq->used);
                                timeo = 0;
                        }
 -                      if (atomic_read(&txq->used) == txo->pend_wrb_cnt)
 +                      if (!be_is_tx_compl_pending(txo))
                                pending_txqs--;
                }
  
@@@ -2342,7 -2247,6 +2342,7 @@@ static void be_evt_queues_destroy(struc
                        napi_hash_del(&eqo->napi);
                        netif_napi_del(&eqo->napi);
                }
 +              free_cpumask_var(eqo->affinity_mask);
                be_queue_free(adapter, &eqo->q);
        }
  }
@@@ -2358,11 -2262,6 +2358,11 @@@ static int be_evt_queues_create(struct 
                                    adapter->cfg_num_qs);
  
        for_all_evt_queues(adapter, eqo, i) {
 +              if (!zalloc_cpumask_var(&eqo->affinity_mask, GFP_KERNEL))
 +                      return -ENOMEM;
 +              cpumask_set_cpu_local_first(i, dev_to_node(&adapter->pdev->dev),
 +                                          eqo->affinity_mask);
 +
                netif_napi_add(adapter->netdev, &eqo->napi, be_poll,
                               BE_NAPI_WEIGHT);
                napi_hash_add(&eqo->napi);
@@@ -2454,9 -2353,8 +2454,9 @@@ static void be_tx_queues_destroy(struc
  
  static int be_tx_qs_create(struct be_adapter *adapter)
  {
 -      struct be_queue_info *cq, *eq;
 +      struct be_queue_info *cq;
        struct be_tx_obj *txo;
 +      struct be_eq_obj *eqo;
        int status, i;
  
        adapter->num_tx_qs = min(adapter->num_evt_qs, be_max_txqs(adapter));
                /* If num_evt_qs is less than num_tx_qs, then more than
                 * one txq share an eq
                 */
 -              eq = &adapter->eq_obj[i % adapter->num_evt_qs].q;
 -              status = be_cmd_cq_create(adapter, cq, eq, false, 3);
 +              eqo = &adapter->eq_obj[i % adapter->num_evt_qs];
 +              status = be_cmd_cq_create(adapter, cq, &eqo->q, false, 3);
                if (status)
                        return status;
  
                status = be_cmd_txq_create(adapter, txo);
                if (status)
                        return status;
 +
 +              netif_set_xps_queue(adapter->netdev, eqo->affinity_mask,
 +                                  eqo->idx);
        }
  
        dev_info(&adapter->pdev->dev, "created %d TX queue(s)\n",
@@@ -2518,19 -2413,13 +2518,19 @@@ static int be_rx_cqs_create(struct be_a
        int rc, i;
  
        /* We can create as many RSS rings as there are EQs. */
 -      adapter->num_rx_qs = adapter->num_evt_qs;
 +      adapter->num_rss_qs = adapter->num_evt_qs;
 +
 +      /* We'll use RSS only if at least 2 RSS rings are supported. */
 +      if (adapter->num_rss_qs <= 1)
 +              adapter->num_rss_qs = 0;
  
 -      /* We'll use RSS only if atleast 2 RSS rings are supported.
 -       * When RSS is used, we'll need a default RXQ for non-IP traffic.
 +      adapter->num_rx_qs = adapter->num_rss_qs + adapter->need_def_rxq;
 +
 +      /* When the interface does not support RSS rings (and there is no
 +       * need to create a default RXQ), we still need one RXQ.
 +       */
 -      if (adapter->num_rx_qs > 1)
 -              adapter->num_rx_qs++;
 +      if (adapter->num_rx_qs == 0)
 +              adapter->num_rx_qs = 1;
  
        adapter->big_page_size = (1 << get_order(rx_frag_size)) * PAGE_SIZE;
        for_all_rx_queues(adapter, rxo, i) {
        }
  
        dev_info(&adapter->pdev->dev,
 -               "created %d RSS queue(s) and 1 default RX queue\n",
 -               adapter->num_rx_qs - 1);
 +               "created %d RX queue(s)\n", adapter->num_rx_qs);
        return 0;
  }
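
A worked example of the queue accounting above: with 4 EQs and firmware
that lacks BE_IF_FLAGS_DEFQ_RSS (so need_def_rxq is 1), num_rss_qs = 4 and
num_rx_qs = 5. On a single-EQ function num_rss_qs is forced to 0, and if
need_def_rxq is also 0 the final check still creates one RXQ, so num_rx_qs
can never end up zero.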
  
@@@ -2659,7 -2549,7 +2659,7 @@@ loop_continue
        return work_done;
  }
  
 -static inline void be_update_tx_err(struct be_tx_obj *txo, u32 status)
 +static inline void be_update_tx_err(struct be_tx_obj *txo, u8 status)
  {
        switch (status) {
        case BE_TX_COMP_HDR_PARSE_ERR:
        }
  }
  
 -static inline void lancer_update_tx_err(struct be_tx_obj *txo, u32 status)
 +static inline void lancer_update_tx_err(struct be_tx_obj *txo, u8 status)
  {
        switch (status) {
        case LANCER_TX_COMP_LSO_ERR:
  static void be_process_tx(struct be_adapter *adapter, struct be_tx_obj *txo,
                          int idx)
  {
 -      struct be_eth_tx_compl *txcp;
        int num_wrbs = 0, work_done = 0;
 -      u32 compl_status;
 -      u16 last_idx;
 +      struct be_tx_compl_info *txcp;
  
 -      while ((txcp = be_tx_compl_get(&txo->cq))) {
 -              last_idx = GET_TX_COMPL_BITS(wrb_index, txcp);
 -              num_wrbs += be_tx_compl_process(adapter, txo, last_idx);
 +      while ((txcp = be_tx_compl_get(txo))) {
 +              num_wrbs += be_tx_compl_process(adapter, txo, txcp->end_index);
                work_done++;
  
 -              compl_status = GET_TX_COMPL_BITS(status, txcp);
 -              if (compl_status) {
 +              if (txcp->status) {
                        if (lancer_chip(adapter))
 -                              lancer_update_tx_err(txo, compl_status);
 +                              lancer_update_tx_err(txo, txcp->status);
                        else
 -                              be_update_tx_err(txo, compl_status);
 +                              be_update_tx_err(txo, txcp->status);
                }
        }
  
                /* As Tx wrbs have been freed up, wake up netdev queue
                 * if it was stopped due to lack of tx wrbs.  */
                if (__netif_subqueue_stopped(adapter->netdev, idx) &&
 -                  atomic_read(&txo->q.used) < txo->q.len / 2) {
 +                  be_can_txq_wake(txo)) {
                        netif_wake_subqueue(adapter->netdev, idx);
                }
  
@@@ -2913,12 -2807,12 +2913,12 @@@ void be_detect_error(struct be_adapter 
                        sliport_err2 = ioread32(adapter->db +
                                                SLIPORT_ERROR2_OFFSET);
                        adapter->hw_error = true;
 +                      error_detected = true;
                        /* Do not log error messages if its a FW reset */
                        if (sliport_err1 == SLIPORT_ERROR_FW_RESET1 &&
                            sliport_err2 == SLIPORT_ERROR_FW_RESET2) {
                                dev_info(dev, "Firmware update in progress\n");
                        } else {
 -                              error_detected = true;
                                dev_err(dev, "Error detected in the card\n");
                                dev_err(dev, "ERR: sliport status 0x%x\n",
                                        sliport_status);
@@@ -3038,8 -2932,6 +3038,8 @@@ static int be_msix_register(struct be_a
                status = request_irq(vec, be_msix, 0, eqo->desc, eqo);
                if (status)
                        goto err_msix;
 +
 +              irq_set_affinity_hint(vec, eqo->affinity_mask);
        }
  
        return 0;
@@@ -3084,7 -2976,7 +3084,7 @@@ static void be_irq_unregister(struct be
  {
        struct net_device *netdev = adapter->netdev;
        struct be_eq_obj *eqo;
 -      int i;
 +      int i, vec;
  
        if (!adapter->isr_registered)
                return;
        }
  
        /* MSIx */
 -      for_all_evt_queues(adapter, eqo, i)
 -              free_irq(be_msix_vec_get(adapter, eqo), eqo);
 +      for_all_evt_queues(adapter, eqo, i) {
 +              vec = be_msix_vec_get(adapter, eqo);
 +              irq_set_affinity_hint(vec, NULL);
 +              free_irq(vec, eqo);
 +      }
  
  done:
        adapter->isr_registered = false;
@@@ -3182,14 -3071,12 +3182,14 @@@ static int be_rx_qs_create(struct be_ad
                        return rc;
        }
  
 -      /* The FW would like the default RXQ to be created first */
 -      rxo = default_rxo(adapter);
 -      rc = be_cmd_rxq_create(adapter, &rxo->q, rxo->cq.id, rx_frag_size,
 -                             adapter->if_handle, false, &rxo->rss_id);
 -      if (rc)
 -              return rc;
 +      if (adapter->need_def_rxq || !adapter->num_rss_qs) {
 +              rxo = default_rxo(adapter);
 +              rc = be_cmd_rxq_create(adapter, &rxo->q, rxo->cq.id,
 +                                     rx_frag_size, adapter->if_handle,
 +                                     false, &rxo->rss_id);
 +              if (rc)
 +                      return rc;
 +      }
  
        for_all_rss_queues(adapter, rxo, i) {
                rc = be_cmd_rxq_create(adapter, &rxo->q, rxo->cq.id,
        }
  
        if (be_multi_rxq(adapter)) {
 -              for (j = 0; j < RSS_INDIR_TABLE_LEN;
 -                      j += adapter->num_rx_qs - 1) {
 +              for (j = 0; j < RSS_INDIR_TABLE_LEN; j += adapter->num_rss_qs) {
                        for_all_rss_queues(adapter, rxo, i) {
                                if ((j + i) >= RSS_INDIR_TABLE_LEN)
                                        break;
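
To see what the rewritten loop computes, assume num_rss_qs = 4 and a
128-entry indirection table: the outer loop steps j by 4 while the inner
for_all_rss_queues pass fills entries j+0 through j+3, so the table cycles
through the four RSS queues' rss_ids 32 times. The old code stepped by
num_rx_qs - 1 to account for the trailing default queue; now that the
default queue is no longer counted among the RSS queues, stepping by
num_rss_qs is the direct equivalent.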
@@@ -3291,7 -3179,7 +3291,7 @@@ static int be_setup_wol(struct be_adapt
        int status = 0;
        u8 mac[ETH_ALEN];
  
 -      memset(mac, 0, ETH_ALEN);
 +      eth_zero_addr(mac);
  
        cmd.size = sizeof(struct be_cmd_req_acpi_wol_magic_config);
        cmd.va = dma_zalloc_coherent(&adapter->pdev->dev, cmd.size, &cmd.dma,
@@@ -3436,14 -3324,6 +3436,14 @@@ static void be_cancel_worker(struct be_
        }
  }
  
 +static void be_cancel_err_detection(struct be_adapter *adapter)
 +{
 +      if (adapter->flags & BE_FLAGS_ERR_DETECTION_SCHEDULED) {
 +              cancel_delayed_work_sync(&adapter->be_err_detection_work);
 +              adapter->flags &= ~BE_FLAGS_ERR_DETECTION_SCHEDULED;
 +      }
 +}
 +
  static void be_mac_clear(struct be_adapter *adapter)
  {
        if (adapter->pmac_id) {
@@@ -3475,39 -3355,8 +3475,39 @@@ static void be_disable_vxlan_offloads(s
  }
  #endif
  
 +static u16 be_calculate_vf_qs(struct be_adapter *adapter, u16 num_vfs)
 +{
 +      struct be_resources res = adapter->pool_res;
 +      u16 num_vf_qs = 1;
 +
 +      /* Distribute the queue resources equally among the PF and its VFs.
 +       * Do not distribute queue resources in multi-channel configuration.
 +       */
 +      if (num_vfs && !be_is_mc(adapter)) {
 +              /* If the number of VFs requested is more than 8 below the
 +               * max supported, assign 8 queue pairs to the PF and divide
 +               * the remaining resources evenly among the VFs.
 +               */
 +              if (num_vfs < (be_max_vfs(adapter) - 8))
 +                      num_vf_qs = (res.max_rss_qs - 8) / num_vfs;
 +              else
 +                      num_vf_qs = res.max_rss_qs / num_vfs;
 +
 +              /* Skyhawk-R chip supports only MAX_RSS_IFACES RSS capable
 +               * interfaces per port. Provide RSS on VFs, only if number
 +               * of VFs requested is less than MAX_RSS_IFACES limit.
 +               */
 +              if (num_vfs >= MAX_RSS_IFACES)
 +                      num_vf_qs = 1;
 +      }
 +      return num_vf_qs;
 +}
 +
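
A worked example of the distribution above (numbers illustrative): with
res.max_rss_qs = 32 and be_max_vfs() = 30, a request for 10 VFs satisfies
num_vfs < 30 - 8, so the PF keeps 8 queue pairs and each VF gets
(32 - 8) / 10 = 2. A request for 25 VFs falls through to 32 / 25 = 1. And
any request of MAX_RSS_IFACES (15) or more VFs is clamped to num_vf_qs = 1
by the final check, regardless of the earlier arithmetic.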
  static int be_clear(struct be_adapter *adapter)
  {
 +      struct pci_dev *pdev = adapter->pdev;
 +      u16 num_vf_qs;
 +
        be_cancel_worker(adapter);
  
        if (sriov_enabled(adapter))
        /* Re-configure FW to distribute resources evenly across max-supported
         * number of VFs, only when VFs are not already enabled.
         */
 -      if (be_physfn(adapter) && !pci_vfs_assigned(adapter->pdev))
 +      if (skyhawk_chip(adapter) && be_physfn(adapter) &&
 +          !pci_vfs_assigned(pdev)) {
 +              num_vf_qs = be_calculate_vf_qs(adapter,
 +                                             pci_sriov_get_totalvfs(pdev));
                be_cmd_set_sriov_config(adapter, adapter->pool_res,
 -                                      pci_sriov_get_totalvfs(adapter->pdev));
 +                                      pci_sriov_get_totalvfs(pdev),
 +                                      num_vf_qs);
 +      }
  
  #ifdef CONFIG_BE2NET_VXLAN
        be_disable_vxlan_offloads(adapter);
@@@ -3547,7 -3391,7 +3547,7 @@@ static int be_if_create(struct be_adapt
  
        en_flags = BE_IF_FLAGS_UNTAGGED | BE_IF_FLAGS_BROADCAST |
                   BE_IF_FLAGS_MULTICAST | BE_IF_FLAGS_PASS_L3L4_ERRORS |
 -                 BE_IF_FLAGS_RSS;
 +                 BE_IF_FLAGS_RSS | BE_IF_FLAGS_DEFQ_RSS;
  
        en_flags &= cap_flags;
  
@@@ -3568,7 -3412,6 +3568,7 @@@ static int be_vfs_if_create(struct be_a
        for_all_vfs(adapter, vf_cfg, vf) {
                if (!BE3_chip(adapter)) {
                        status = be_cmd_get_profile_config(adapter, &res,
 +                                                         RESOURCE_LIMITS,
                                                           vf + 1);
                        if (!status) {
                                cap_flags = res.if_cap_flags;
@@@ -3742,8 -3585,7 +3742,8 @@@ static void BEx_get_resources(struct be
                /* On a SuperNIC profile, the driver needs to use the
                 * GET_PROFILE_CONFIG cmd to query the per-function TXQ limits
                 */
 -              be_cmd_get_profile_config(adapter, &super_nic_res, 0);
 +              be_cmd_get_profile_config(adapter, &super_nic_res,
 +                                        RESOURCE_LIMITS, 0);
                /* Some old versions of BE3 FW don't report max_tx_qs value */
                res->max_tx_qs = super_nic_res.max_tx_qs ? : BE3_MAX_TX_QS;
        } else {
                res->max_evt_qs = 1;
  
        res->if_cap_flags = BE_IF_CAP_FLAGS_WANT;
 +      res->if_cap_flags &= ~BE_IF_FLAGS_DEFQ_RSS;
        if (!(adapter->function_caps & BE_FUNCTION_CAPS_RSS))
                res->if_cap_flags &= ~BE_IF_FLAGS_RSS;
  }
@@@ -3783,12 -3624,13 +3783,12 @@@ static void be_setup_init(struct be_ada
  
  static int be_get_sriov_config(struct be_adapter *adapter)
  {
 -      struct device *dev = &adapter->pdev->dev;
        struct be_resources res = {0};
        int max_vfs, old_vfs;
  
 -      /* Some old versions of BE3 FW don't report max_vfs value */
 -      be_cmd_get_profile_config(adapter, &res, 0);
 +      be_cmd_get_profile_config(adapter, &res, RESOURCE_LIMITS, 0);
  
 +      /* Some old versions of BE3 FW don't report max_vfs value */
        if (BE3_chip(adapter) && !res.max_vfs) {
                max_vfs = pci_sriov_get_totalvfs(adapter->pdev);
                res.max_vfs = max_vfs > 0 ? min(MAX_VFS, max_vfs) : 0;
  
        adapter->pool_res = res;
  
 -      if (!be_max_vfs(adapter)) {
 -              if (num_vfs)
 -                      dev_warn(dev, "SRIOV is disabled. Ignoring num_vfs\n");
 -              adapter->num_vfs = 0;
 -              return 0;
 -      }
 -
 -      pci_sriov_set_totalvfs(adapter->pdev, be_max_vfs(adapter));
 -
 -      /* validate num_vfs module param */
 +      /* If the VFs were not disabled during a previous unload of the driver,
 +       * then we cannot rely on the PF POOL limits for the TotalVFs value.
 +       * Instead use the TotalVFs value stored in the pci-dev struct.
 +       */
        old_vfs = pci_num_vf(adapter->pdev);
        if (old_vfs) {
 -              dev_info(dev, "%d VFs are already enabled\n", old_vfs);
 -              if (old_vfs != num_vfs)
 -                      dev_warn(dev, "Ignoring num_vfs=%d setting\n", num_vfs);
 +              dev_info(&adapter->pdev->dev, "%d VFs are already enabled\n",
 +                       old_vfs);
 +
 +              adapter->pool_res.max_vfs =
 +                      pci_sriov_get_totalvfs(adapter->pdev);
                adapter->num_vfs = old_vfs;
 -      } else {
 -              if (num_vfs > be_max_vfs(adapter)) {
 -                      dev_info(dev, "Resources unavailable to init %d VFs\n",
 -                               num_vfs);
 -                      dev_info(dev, "Limiting to %d VFs\n",
 -                               be_max_vfs(adapter));
 -              }
 -              adapter->num_vfs = min_t(u16, num_vfs, be_max_vfs(adapter));
        }
  
        return 0;
  }
  
 +static void be_alloc_sriov_res(struct be_adapter *adapter)
 +{
 +      int old_vfs = pci_num_vf(adapter->pdev);
 +      u16 num_vf_qs;
 +      int status;
 +
 +      be_get_sriov_config(adapter);
 +
 +      if (!old_vfs)
 +              pci_sriov_set_totalvfs(adapter->pdev, be_max_vfs(adapter));
 +
 +      /* When the HW is in an SRIOV-capable configuration, the PF-pool
 +       * resources are given to the PF during driver load, if there are
 +       * no old VFs. This facility is not available in BE3 FW.
 +       * Also, this is done by the FW on the Lancer chip.
 +       */
 +      if (skyhawk_chip(adapter) && be_max_vfs(adapter) && !old_vfs) {
 +              num_vf_qs = be_calculate_vf_qs(adapter, 0);
 +              status = be_cmd_set_sriov_config(adapter, adapter->pool_res, 0,
 +                                               num_vf_qs);
 +              if (status)
 +                      dev_err(&adapter->pdev->dev,
 +                              "Failed to optimize SRIOV resources\n");
 +      }
 +}
 +
  static int be_get_resources(struct be_adapter *adapter)
  {
        struct device *dev = &adapter->pdev->dev;
                if (status)
                        return status;
  
 +              /* If a default RXQ must be created, we'll use up one RSSQ */
 +              if (res.max_rss_qs && res.max_rss_qs == res.max_rx_qs &&
 +                  !(res.if_cap_flags & BE_IF_FLAGS_DEFQ_RSS))
 +                      res.max_rss_qs -= 1;
 +
                /* If RoCE may be enabled stash away half the EQs for RoCE */
                if (be_roce_supported(adapter))
                        res.max_evt_qs /= 2;
                adapter->res = res;
        }
  
 +      /* If FW supports RSS default queue, then skip creating non-RSS
 +       * queue for non-IP traffic.
 +       */
 +      adapter->need_def_rxq = (be_if_cap_flags(adapter) &
 +                               BE_IF_FLAGS_DEFQ_RSS) ? 0 : 1;
 +
        dev_info(dev, "Max: txqs %d, rxqs %d, rss %d, eqs %d, vfs %d\n",
                 be_max_txqs(adapter), be_max_rxqs(adapter),
                 be_max_rss(adapter), be_max_eqs(adapter),
                 be_max_uc(adapter), be_max_mc(adapter),
                 be_max_vlans(adapter));
  
 +      /* Sanitize cfg_num_qs based on HW and platform limits */
 +      adapter->cfg_num_qs = min_t(u16, netif_get_num_default_rss_queues(),
 +                                  be_max_qs(adapter));
        return 0;
  }
  
 -static void be_sriov_config(struct be_adapter *adapter)
 -{
 -      struct device *dev = &adapter->pdev->dev;
 -      int status;
 -
 -      status = be_get_sriov_config(adapter);
 -      if (status) {
 -              dev_err(dev, "Failed to query SR-IOV configuration\n");
 -              dev_err(dev, "SR-IOV cannot be enabled\n");
 -              return;
 -      }
 -
 -      /* When the HW is in SRIOV capable configuration, the PF-pool
 -       * resources are equally distributed across the max-number of
 -       * VFs. The user may request only a subset of the max-vfs to be
 -       * enabled. Based on num_vfs, redistribute the resources across
 -       * num_vfs so that each VF will have access to more number of
 -       * resources. This facility is not available in BE3 FW.
 -       * Also, this is done by FW in Lancer chip.
 -       */
 -      if (be_max_vfs(adapter) && !pci_num_vf(adapter->pdev)) {
 -              status = be_cmd_set_sriov_config(adapter,
 -                                               adapter->pool_res,
 -                                               adapter->num_vfs);
 -              if (status)
 -                      dev_err(dev, "Failed to optimize SR-IOV resources\n");
 -      }
 -}
 -
  static int be_get_config(struct be_adapter *adapter)
  {
 +      int status, level;
        u16 profile_id;
 -      int status;
 +
 +      status = be_cmd_get_cntl_attributes(adapter);
 +      if (status)
 +              return status;
  
        status = be_cmd_query_fw_cfg(adapter);
        if (status)
                return status;
  
 +      if (BEx_chip(adapter)) {
 +              level = be_cmd_get_fw_log_level(adapter);
 +              adapter->msg_enable =
 +                      level <= FW_LOG_LEVEL_DEFAULT ? NETIF_MSG_HW : 0;
 +      }
 +
 +      be_cmd_get_acpi_wol_cap(adapter);
 +
        be_cmd_query_port_name(adapter);
  
        if (be_physfn(adapter)) {
                                 "Using profile 0x%x\n", profile_id);
        }
  
 -      if (!BE2_chip(adapter) && be_physfn(adapter))
 -              be_sriov_config(adapter);
 -
        status = be_get_resources(adapter);
        if (status)
                return status;
        if (!adapter->pmac_id)
                return -ENOMEM;
  
 -      /* Sanitize cfg_num_qs based on HW and platform limits */
 -      adapter->cfg_num_qs = min(adapter->cfg_num_qs, be_max_qs(adapter));
 -
        return 0;
  }
  
@@@ -3962,13 -3799,6 +3962,13 @@@ static void be_schedule_worker(struct b
        adapter->flags |= BE_FLAGS_WORKER_SCHEDULED;
  }
  
 +static void be_schedule_err_detection(struct be_adapter *adapter)
 +{
 +      schedule_delayed_work(&adapter->be_err_detection_work,
 +                            msecs_to_jiffies(1000));
 +      adapter->flags |= BE_FLAGS_ERR_DETECTION_SCHEDULED;
 +}
 +
  static int be_setup_queues(struct be_adapter *adapter)
  {
        struct net_device *netdev = adapter->netdev;
@@@ -4051,61 -3881,16 +4051,61 @@@ static inline int fw_major_num(const ch
        return fw_major;
  }
  
 +/* If any VFs are already enabled, don't FLR the PF */
 +static bool be_reset_required(struct be_adapter *adapter)
 +{
 +      return pci_num_vf(adapter->pdev) ? false : true;
 +}
 +
 +/* Wait for the FW to be ready and perform the required initialization */
 +static int be_func_init(struct be_adapter *adapter)
 +{
 +      int status;
 +
 +      status = be_fw_wait_ready(adapter);
 +      if (status)
 +              return status;
 +
 +      if (be_reset_required(adapter)) {
 +              status = be_cmd_reset_function(adapter);
 +              if (status)
 +                      return status;
 +
 +              /* Wait for interrupts to quiesce after an FLR */
 +              msleep(100);
 +
 +              /* We can clear all errors when function reset succeeds */
 +              be_clear_all_error(adapter);
 +      }
 +
 +      /* Tell FW we're ready to fire cmds */
 +      status = be_cmd_fw_init(adapter);
 +      if (status)
 +              return status;
 +
 +      /* Allow interrupts for other ULPs running on NIC function */
 +      be_intr_set(adapter, true);
 +
 +      return 0;
 +}
 +
  static int be_setup(struct be_adapter *adapter)
  {
        struct device *dev = &adapter->pdev->dev;
        int status;
  
 +      status = be_func_init(adapter);
 +      if (status)
 +              return status;
 +
        be_setup_init(adapter);
  
        if (!lancer_chip(adapter))
                be_cmd_req_native_mode(adapter);
  
 +      if (!BE2_chip(adapter) && be_physfn(adapter))
 +              be_alloc_sriov_res(adapter);
 +
        status = be_get_config(adapter);
        if (status)
                goto err;
  
        be_set_rx_mode(adapter->netdev);
  
 -      be_cmd_get_acpi_wol_cap(adapter);
 -
        status = be_cmd_set_flow_control(adapter, adapter->tx_fc,
                                         adapter->rx_fc);
        if (status)
@@@ -5055,165 -4842,29 +5055,165 @@@ static void be_netdev_init(struct net_d
        netdev->ethtool_ops = &be_ethtool_ops;
  }
  
 -static void be_unmap_pci_bars(struct be_adapter *adapter)
 +static void be_cleanup(struct be_adapter *adapter)
  {
 -      if (adapter->csr)
 -              pci_iounmap(adapter->pdev, adapter->csr);
 -      if (adapter->db)
 -              pci_iounmap(adapter->pdev, adapter->db);
 -}
 +      struct net_device *netdev = adapter->netdev;
  
 -static int db_bar(struct be_adapter *adapter)
 -{
 -      if (lancer_chip(adapter) || !be_physfn(adapter))
 -              return 0;
 -      else
 -              return 4;
 +      rtnl_lock();
 +      netif_device_detach(netdev);
 +      if (netif_running(netdev))
 +              be_close(netdev);
 +      rtnl_unlock();
 +
 +      be_clear(adapter);
  }
  
 -static int be_roce_map_pci_bars(struct be_adapter *adapter)
 +static int be_resume(struct be_adapter *adapter)
  {
 -      if (skyhawk_chip(adapter)) {
 -              adapter->roce_db.size = 4096;
 -              adapter->roce_db.io_addr = pci_resource_start(adapter->pdev,
 -                                                            db_bar(adapter));
 -              adapter->roce_db.total_size = pci_resource_len(adapter->pdev,
 +      struct net_device *netdev = adapter->netdev;
 +      int status;
 +
 +      status = be_setup(adapter);
 +      if (status)
 +              return status;
 +
 +      if (netif_running(netdev)) {
 +              status = be_open(netdev);
 +              if (status)
 +                      return status;
 +      }
 +
 +      netif_device_attach(netdev);
 +
 +      return 0;
 +}
 +
 +static int be_err_recover(struct be_adapter *adapter)
 +{
 +      struct device *dev = &adapter->pdev->dev;
 +      int status;
 +
 +      status = be_resume(adapter);
 +      if (status)
 +              goto err;
 +
 +      dev_info(dev, "Adapter recovery successful\n");
 +      return 0;
 +err:
 +      if (be_physfn(adapter))
 +              dev_err(dev, "Adapter recovery failed\n");
 +      else
 +              dev_err(dev, "Re-trying adapter recovery\n");
 +
 +      return status;
 +}
 +
 +static void be_err_detection_task(struct work_struct *work)
 +{
 +      struct be_adapter *adapter =
 +                              container_of(work, struct be_adapter,
 +                                           be_err_detection_work.work);
 +      int status = 0;
 +
 +      be_detect_error(adapter);
 +
 +      if (adapter->hw_error) {
 +              be_cleanup(adapter);
 +
 +              /* As of now, error recovery is supported only on Lancer */
 +              if (lancer_chip(adapter))
 +                      status = be_err_recover(adapter);
 +      }
 +
 +      /* Always attempt recovery on VFs */
 +      if (!status || be_virtfn(adapter))
 +              be_schedule_err_detection(adapter);
 +}
 +
 +static void be_log_sfp_info(struct be_adapter *adapter)
 +{
 +      int status;
 +
 +      status = be_cmd_query_sfp_info(adapter);
 +      if (!status) {
 +              dev_err(&adapter->pdev->dev,
 +                      "Unqualified SFP+ detected on %c from %s part no: %s",
 +                      adapter->port_name, adapter->phy.vendor_name,
 +                      adapter->phy.vendor_pn);
 +      }
 +      adapter->flags &= ~BE_FLAGS_EVT_INCOMPATIBLE_SFP;
 +}
 +
 +static void be_worker(struct work_struct *work)
 +{
 +      struct be_adapter *adapter =
 +              container_of(work, struct be_adapter, work.work);
 +      struct be_rx_obj *rxo;
 +      int i;
 +
 +      /* when interrupts are not yet enabled, just reap any pending
 +       * mcc completions
 +       */
 +      if (!netif_running(adapter->netdev)) {
 +              local_bh_disable();
 +              be_process_mcc(adapter);
 +              local_bh_enable();
 +              goto reschedule;
 +      }
 +
 +      if (!adapter->stats_cmd_sent) {
 +              if (lancer_chip(adapter))
 +                      lancer_cmd_get_pport_stats(adapter,
 +                                                 &adapter->stats_cmd);
 +              else
 +                      be_cmd_get_stats(adapter, &adapter->stats_cmd);
 +      }
 +
 +      if (be_physfn(adapter) &&
 +          MODULO(adapter->work_counter, adapter->be_get_temp_freq) == 0)
 +              be_cmd_get_die_temperature(adapter);
 +
 +      for_all_rx_queues(adapter, rxo, i) {
 +              /* Replenish RX-queues starved due to memory
 +               * allocation failures.
 +               */
 +              if (rxo->rx_post_starved)
 +                      be_post_rx_frags(rxo, GFP_KERNEL, MAX_RX_POST);
 +      }
 +
 +      be_eqd_update(adapter);
 +
 +      if (adapter->flags & BE_FLAGS_EVT_INCOMPATIBLE_SFP)
 +              be_log_sfp_info(adapter);
 +
 +reschedule:
 +      adapter->work_counter++;
 +      schedule_delayed_work(&adapter->work, msecs_to_jiffies(1000));
 +}
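
One consequence of the cadence above: the worker re-arms itself every
1000 ms, and be_get_temp_freq is initialized to 64 in be_drv_init() below,
so the MODULO(work_counter, be_get_temp_freq) test lets the die-temperature
query fire on the PF roughly once every 64 seconds.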
 +
 +static void be_unmap_pci_bars(struct be_adapter *adapter)
 +{
 +      if (adapter->csr)
 +              pci_iounmap(adapter->pdev, adapter->csr);
 +      if (adapter->db)
 +              pci_iounmap(adapter->pdev, adapter->db);
 +}
 +
 +static int db_bar(struct be_adapter *adapter)
 +{
 +      if (lancer_chip(adapter) || !be_physfn(adapter))
 +              return 0;
 +      else
 +              return 4;
 +}
 +
 +static int be_roce_map_pci_bars(struct be_adapter *adapter)
 +{
 +      if (skyhawk_chip(adapter)) {
 +              adapter->roce_db.size = 4096;
 +              adapter->roce_db.io_addr = pci_resource_start(adapter->pdev,
 +                                                            db_bar(adapter));
 +              adapter->roce_db.total_size = pci_resource_len(adapter->pdev,
                                                               db_bar(adapter));
        }
        return 0;
@@@ -5223,12 -4874,6 +5223,12 @@@ static int be_map_pci_bars(struct be_ad
  {
        struct pci_dev *pdev = adapter->pdev;
        u8 __iomem *addr;
 +      u32 sli_intf;
 +
 +      pci_read_config_dword(adapter->pdev, SLI_INTF_REG_OFFSET, &sli_intf);
 +      adapter->sli_family = (sli_intf & SLI_INTF_FAMILY_MASK) >>
 +                              SLI_INTF_FAMILY_SHIFT;
 +      adapter->virtfn = (sli_intf & SLI_INTF_FT_MASK) ? 1 : 0;
  
        if (BEx_chip(adapter) && be_physfn(adapter)) {
                adapter->csr = pci_iomap(pdev, 2, 0);
@@@ -5262,93 -4907,109 +5262,93 @@@ pci_map_err
        return -ENOMEM;
  }
  
 -static void be_ctrl_cleanup(struct be_adapter *adapter)
 +static void be_drv_cleanup(struct be_adapter *adapter)
  {
        struct be_dma_mem *mem = &adapter->mbox_mem_alloced;
 -
 -      be_unmap_pci_bars(adapter);
 +      struct device *dev = &adapter->pdev->dev;
  
        if (mem->va)
 -              dma_free_coherent(&adapter->pdev->dev, mem->size, mem->va,
 -                                mem->dma);
 +              dma_free_coherent(dev, mem->size, mem->va, mem->dma);
  
        mem = &adapter->rx_filter;
        if (mem->va)
 -              dma_free_coherent(&adapter->pdev->dev, mem->size, mem->va,
 -                                mem->dma);
 +              dma_free_coherent(dev, mem->size, mem->va, mem->dma);
 +
 +      mem = &adapter->stats_cmd;
 +      if (mem->va)
 +              dma_free_coherent(dev, mem->size, mem->va, mem->dma);
  }
  
 -static int be_ctrl_init(struct be_adapter *adapter)
 +/* Allocate and initialize various fields in be_adapter struct */
 +static int be_drv_init(struct be_adapter *adapter)
  {
        struct be_dma_mem *mbox_mem_alloc = &adapter->mbox_mem_alloced;
        struct be_dma_mem *mbox_mem_align = &adapter->mbox_mem;
        struct be_dma_mem *rx_filter = &adapter->rx_filter;
 -      u32 sli_intf;
 -      int status;
 -
 -      pci_read_config_dword(adapter->pdev, SLI_INTF_REG_OFFSET, &sli_intf);
 -      adapter->sli_family = (sli_intf & SLI_INTF_FAMILY_MASK) >>
 -                               SLI_INTF_FAMILY_SHIFT;
 -      adapter->virtfn = (sli_intf & SLI_INTF_FT_MASK) ? 1 : 0;
 -
 -      status = be_map_pci_bars(adapter);
 -      if (status)
 -              goto done;
 +      struct be_dma_mem *stats_cmd = &adapter->stats_cmd;
 +      struct device *dev = &adapter->pdev->dev;
 +      int status = 0;
  
        mbox_mem_alloc->size = sizeof(struct be_mcc_mailbox) + 16;
 -      mbox_mem_alloc->va = dma_alloc_coherent(&adapter->pdev->dev,
 -                                              mbox_mem_alloc->size,
 +      mbox_mem_alloc->va = dma_alloc_coherent(dev, mbox_mem_alloc->size,
                                                &mbox_mem_alloc->dma,
                                                GFP_KERNEL);
 -      if (!mbox_mem_alloc->va) {
 -              status = -ENOMEM;
 -              goto unmap_pci_bars;
 -      }
 +      if (!mbox_mem_alloc->va)
 +              return -ENOMEM;
 +
        mbox_mem_align->size = sizeof(struct be_mcc_mailbox);
        mbox_mem_align->va = PTR_ALIGN(mbox_mem_alloc->va, 16);
        mbox_mem_align->dma = PTR_ALIGN(mbox_mem_alloc->dma, 16);
        memset(mbox_mem_align->va, 0, sizeof(struct be_mcc_mailbox));
  
        rx_filter->size = sizeof(struct be_cmd_req_rx_filter);
 -      rx_filter->va = dma_zalloc_coherent(&adapter->pdev->dev,
 -                                          rx_filter->size, &rx_filter->dma,
 -                                          GFP_KERNEL);
 +      rx_filter->va = dma_zalloc_coherent(dev, rx_filter->size,
 +                                          &rx_filter->dma, GFP_KERNEL);
        if (!rx_filter->va) {
                status = -ENOMEM;
                goto free_mbox;
        }
  
 +      if (lancer_chip(adapter))
 +              stats_cmd->size = sizeof(struct lancer_cmd_req_pport_stats);
 +      else if (BE2_chip(adapter))
 +              stats_cmd->size = sizeof(struct be_cmd_req_get_stats_v0);
 +      else if (BE3_chip(adapter))
 +              stats_cmd->size = sizeof(struct be_cmd_req_get_stats_v1);
 +      else
 +              stats_cmd->size = sizeof(struct be_cmd_req_get_stats_v2);
 +      stats_cmd->va = dma_zalloc_coherent(dev, stats_cmd->size,
 +                                          &stats_cmd->dma, GFP_KERNEL);
 +      if (!stats_cmd->va) {
 +              status = -ENOMEM;
 +              goto free_rx_filter;
 +      }
 +
        mutex_init(&adapter->mbox_lock);
        spin_lock_init(&adapter->mcc_lock);
        spin_lock_init(&adapter->mcc_cq_lock);
 -
        init_completion(&adapter->et_cmd_compl);
 -      pci_save_state(adapter->pdev);
 -      return 0;
 -
 -free_mbox:
 -      dma_free_coherent(&adapter->pdev->dev, mbox_mem_alloc->size,
 -                        mbox_mem_alloc->va, mbox_mem_alloc->dma);
 -
 -unmap_pci_bars:
 -      be_unmap_pci_bars(adapter);
 -
 -done:
 -      return status;
 -}
  
 -static void be_stats_cleanup(struct be_adapter *adapter)
 -{
 -      struct be_dma_mem *cmd = &adapter->stats_cmd;
 +      pci_save_state(adapter->pdev);
  
 -      if (cmd->va)
 -              dma_free_coherent(&adapter->pdev->dev, cmd->size,
 -                                cmd->va, cmd->dma);
 -}
 +      INIT_DELAYED_WORK(&adapter->work, be_worker);
 +      INIT_DELAYED_WORK(&adapter->be_err_detection_work,
 +                        be_err_detection_task);
  
 -static int be_stats_init(struct be_adapter *adapter)
 -{
 -      struct be_dma_mem *cmd = &adapter->stats_cmd;
 +      adapter->rx_fc = true;
 +      adapter->tx_fc = true;
  
 -      if (lancer_chip(adapter))
 -              cmd->size = sizeof(struct lancer_cmd_req_pport_stats);
 -      else if (BE2_chip(adapter))
 -              cmd->size = sizeof(struct be_cmd_req_get_stats_v0);
 -      else if (BE3_chip(adapter))
 -              cmd->size = sizeof(struct be_cmd_req_get_stats_v1);
 -      else
 -              /* ALL non-BE ASICs */
 -              cmd->size = sizeof(struct be_cmd_req_get_stats_v2);
 +      /* Must be a power of 2 or else MODULO will BUG_ON */
 +      adapter->be_get_temp_freq = 64;
  
 -      cmd->va = dma_zalloc_coherent(&adapter->pdev->dev, cmd->size, &cmd->dma,
 -                                    GFP_KERNEL);
 -      if (!cmd->va)
 -              return -ENOMEM;
        return 0;
 +
 +free_rx_filter:
 +      dma_free_coherent(dev, rx_filter->size, rx_filter->va, rx_filter->dma);
 +free_mbox:
 +      dma_free_coherent(dev, mbox_mem_alloc->size, mbox_mem_alloc->va,
 +                        mbox_mem_alloc->dma);
 +      return status;
  }
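
The consolidated be_drv_init() above unwinds its DMA allocations in
reverse order through goto labels, so every failure path frees exactly
what has already been allocated. A minimal userspace sketch of the same
unwind pattern, with illustrative names and plain malloc/free standing
in for dma_alloc_coherent/dma_free_coherent:

#include <stdio.h>
#include <stdlib.h>

struct drv {
	void *mbox;		/* stands in for mbox_mem_alloced */
	void *rx_filter;
	void *stats;
};

static int drv_init(struct drv *d)
{
	d->mbox = malloc(64);
	if (!d->mbox)
		return -1;		/* nothing to unwind yet */

	d->rx_filter = malloc(128);
	if (!d->rx_filter)
		goto free_mbox;

	d->stats = malloc(256);
	if (!d->stats)
		goto free_rx_filter;

	return 0;			/* all three resources live */

free_rx_filter:				/* unwind in reverse order */
	free(d->rx_filter);
free_mbox:
	free(d->mbox);
	return -1;
}

int main(void)
{
	struct drv d;

	printf("drv_init: %d\n", drv_init(&d));
	return 0;
}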
  
  static void be_remove(struct pci_dev *pdev)
        be_roce_dev_remove(adapter);
        be_intr_set(adapter, false);
  
 -      cancel_delayed_work_sync(&adapter->func_recovery_work);
 +      be_cancel_err_detection(adapter);
  
        unregister_netdev(adapter->netdev);
  
        /* tell fw we're done with firing cmds */
        be_cmd_fw_clean(adapter);
  
 -      be_stats_cleanup(adapter);
 -
 -      be_ctrl_cleanup(adapter);
 +      be_unmap_pci_bars(adapter);
 +      be_drv_cleanup(adapter);
  
        pci_disable_pcie_error_reporting(pdev);
  
        free_netdev(adapter->netdev);
  }
  
 -static int be_get_initial_config(struct be_adapter *adapter)
 -{
 -      int status, level;
 -
 -      status = be_cmd_get_cntl_attributes(adapter);
 -      if (status)
 -              return status;
 -
 -      /* Must be a power of 2 or else MODULO will BUG_ON */
 -      adapter->be_get_temp_freq = 64;
 -
 -      if (BEx_chip(adapter)) {
 -              level = be_cmd_get_fw_log_level(adapter);
 -              adapter->msg_enable =
 -                      level <= FW_LOG_LEVEL_DEFAULT ? NETIF_MSG_HW : 0;
 -      }
 -
 -      adapter->cfg_num_qs = netif_get_num_default_rss_queues();
 -      return 0;
 -}
 -
 -static int lancer_recover_func(struct be_adapter *adapter)
 -{
 -      struct device *dev = &adapter->pdev->dev;
 -      int status;
 -
 -      status = lancer_test_and_set_rdy_state(adapter);
 -      if (status)
 -              goto err;
 -
 -      if (netif_running(adapter->netdev))
 -              be_close(adapter->netdev);
 -
 -      be_clear(adapter);
 -
 -      be_clear_all_error(adapter);
 -
 -      status = be_setup(adapter);
 -      if (status)
 -              goto err;
 -
 -      if (netif_running(adapter->netdev)) {
 -              status = be_open(adapter->netdev);
 -              if (status)
 -                      goto err;
 -      }
 -
 -      dev_err(dev, "Adapter recovery successful\n");
 -      return 0;
 -err:
 -      if (status == -EAGAIN)
 -              dev_err(dev, "Waiting for resource provisioning\n");
 -      else
 -              dev_err(dev, "Adapter recovery failed\n");
 -
 -      return status;
 -}
 -
 -static void be_func_recovery_task(struct work_struct *work)
 -{
 -      struct be_adapter *adapter =
 -              container_of(work, struct be_adapter,  func_recovery_work.work);
 -      int status = 0;
 -
 -      be_detect_error(adapter);
 -
 -      if (adapter->hw_error && lancer_chip(adapter)) {
 -              rtnl_lock();
 -              netif_device_detach(adapter->netdev);
 -              rtnl_unlock();
 -
 -              status = lancer_recover_func(adapter);
 -              if (!status)
 -                      netif_device_attach(adapter->netdev);
 -      }
 -
 -      /* In Lancer, for all errors other than provisioning error (-EAGAIN),
 -       * no need to attempt further recovery.
 -       */
 -      if (!status || status == -EAGAIN)
 -              schedule_delayed_work(&adapter->func_recovery_work,
 -                                    msecs_to_jiffies(1000));
 -}
 -
 -static void be_log_sfp_info(struct be_adapter *adapter)
 -{
 -      int status;
 -
 -      status = be_cmd_query_sfp_info(adapter);
 -      if (!status) {
 -              dev_err(&adapter->pdev->dev,
 -                      "Unqualified SFP+ detected on %c from %s part no: %s",
 -                      adapter->port_name, adapter->phy.vendor_name,
 -                      adapter->phy.vendor_pn);
 -      }
 -      adapter->flags &= ~BE_FLAGS_EVT_INCOMPATIBLE_SFP;
 -}
 -
 -static void be_worker(struct work_struct *work)
 -{
 -      struct be_adapter *adapter =
 -              container_of(work, struct be_adapter, work.work);
 -      struct be_rx_obj *rxo;
 -      int i;
 -
 -      /* when interrupts are not yet enabled, just reap any pending
 -      * mcc completions */
 -      if (!netif_running(adapter->netdev)) {
 -              local_bh_disable();
 -              be_process_mcc(adapter);
 -              local_bh_enable();
 -              goto reschedule;
 -      }
 -
 -      if (!adapter->stats_cmd_sent) {
 -              if (lancer_chip(adapter))
 -                      lancer_cmd_get_pport_stats(adapter,
 -                                                 &adapter->stats_cmd);
 -              else
 -                      be_cmd_get_stats(adapter, &adapter->stats_cmd);
 -      }
 -
 -      if (be_physfn(adapter) &&
 -          MODULO(adapter->work_counter, adapter->be_get_temp_freq) == 0)
 -              be_cmd_get_die_temperature(adapter);
 -
 -      for_all_rx_queues(adapter, rxo, i) {
 -              /* Replenish RX-queues starved due to memory
 -               * allocation failures.
 -               */
 -              if (rxo->rx_post_starved)
 -                      be_post_rx_frags(rxo, GFP_KERNEL, MAX_RX_POST);
 -      }
 -
 -      be_eqd_update(adapter);
 -
 -      if (adapter->flags & BE_FLAGS_EVT_INCOMPATIBLE_SFP)
 -              be_log_sfp_info(adapter);
 -
 -reschedule:
 -      adapter->work_counter++;
 -      schedule_delayed_work(&adapter->work, msecs_to_jiffies(1000));
 -}
 -
 -/* If any VFs are already enabled don't FLR the PF */
 -static bool be_reset_required(struct be_adapter *adapter)
 -{
 -      return pci_num_vf(adapter->pdev) ? false : true;
 -}
 -
  static char *mc_name(struct be_adapter *adapter)
  {
        char *str = ""; /* default */
@@@ -5479,17 -5291,50 +5479,17 @@@ static int be_probe(struct pci_dev *pde
        if (!status)
                dev_info(&pdev->dev, "PCIe error reporting enabled\n");
  
 -      status = be_ctrl_init(adapter);
 +      status = be_map_pci_bars(adapter);
        if (status)
                goto free_netdev;
  
 -      /* sync up with fw's ready state */
 -      if (be_physfn(adapter)) {
 -              status = be_fw_wait_ready(adapter);
 -              if (status)
 -                      goto ctrl_clean;
 -      }
 -
 -      if (be_reset_required(adapter)) {
 -              status = be_cmd_reset_function(adapter);
 -              if (status)
 -                      goto ctrl_clean;
 -
 -              /* Wait for interrupts to quiesce after an FLR */
 -              msleep(100);
 -      }
 -
 -      /* Allow interrupts for other ULPs running on NIC function */
 -      be_intr_set(adapter, true);
 -
 -      /* tell fw we're ready to fire cmds */
 -      status = be_cmd_fw_init(adapter);
 -      if (status)
 -              goto ctrl_clean;
 -
 -      status = be_stats_init(adapter);
 -      if (status)
 -              goto ctrl_clean;
 -
 -      status = be_get_initial_config(adapter);
 +      status = be_drv_init(adapter);
        if (status)
 -              goto stats_clean;
 -
 -      INIT_DELAYED_WORK(&adapter->work, be_worker);
 -      INIT_DELAYED_WORK(&adapter->func_recovery_work, be_func_recovery_task);
 -      adapter->rx_fc = true;
 -      adapter->tx_fc = true;
 +              goto unmap_bars;
  
        status = be_setup(adapter);
        if (status)
 -              goto stats_clean;
 +              goto drv_cleanup;
  
        be_netdev_init(netdev);
        status = register_netdev(netdev);
  
        be_roce_dev_add(adapter);
  
 -      schedule_delayed_work(&adapter->func_recovery_work,
 -                            msecs_to_jiffies(1000));
 +      be_schedule_err_detection(adapter);
  
        dev_info(&pdev->dev, "%s: %s %s port %c\n", nic_name(pdev),
                 func_name(adapter), mc_name(adapter), adapter->port_name);
  
  unsetup:
        be_clear(adapter);
 -stats_clean:
 -      be_stats_cleanup(adapter);
 -ctrl_clean:
 -      be_ctrl_cleanup(adapter);
 +drv_cleanup:
 +      be_drv_cleanup(adapter);
 +unmap_bars:
 +      be_unmap_pci_bars(adapter);
  free_netdev:
        free_netdev(netdev);
  rel_reg:
@@@ -5525,14 -5371,21 +5525,14 @@@ do_none
  static int be_suspend(struct pci_dev *pdev, pm_message_t state)
  {
        struct be_adapter *adapter = pci_get_drvdata(pdev);
 -      struct net_device *netdev =  adapter->netdev;
  
        if (adapter->wol_en)
                be_setup_wol(adapter, true);
  
        be_intr_set(adapter, false);
 -      cancel_delayed_work_sync(&adapter->func_recovery_work);
 +      be_cancel_err_detection(adapter);
  
 -      netif_device_detach(netdev);
 -      if (netif_running(netdev)) {
 -              rtnl_lock();
 -              be_close(netdev);
 -              rtnl_unlock();
 -      }
 -      be_clear(adapter);
 +      be_cleanup(adapter);
  
        pci_save_state(pdev);
        pci_disable_device(pdev);
        return 0;
  }
  
 -static int be_resume(struct pci_dev *pdev)
 +static int be_pci_resume(struct pci_dev *pdev)
  {
 -      int status = 0;
        struct be_adapter *adapter = pci_get_drvdata(pdev);
 -      struct net_device *netdev =  adapter->netdev;
 -
 -      netif_device_detach(netdev);
 +      int status = 0;
  
        status = pci_enable_device(pdev);
        if (status)
        pci_set_power_state(pdev, PCI_D0);
        pci_restore_state(pdev);
  
 -      status = be_fw_wait_ready(adapter);
 -      if (status)
 -              return status;
 -
 -      status = be_cmd_reset_function(adapter);
 -      if (status)
 -              return status;
 -
 -      be_intr_set(adapter, true);
 -      /* tell fw we're ready to fire cmds */
 -      status = be_cmd_fw_init(adapter);
 +      status = be_resume(adapter);
        if (status)
                return status;
  
 -      be_setup(adapter);
 -      if (netif_running(netdev)) {
 -              rtnl_lock();
 -              be_open(netdev);
 -              rtnl_unlock();
 -      }
 -
 -      schedule_delayed_work(&adapter->func_recovery_work,
 -                            msecs_to_jiffies(1000));
 -      netif_device_attach(netdev);
 +      be_schedule_err_detection(adapter);
  
        if (adapter->wol_en)
                be_setup_wol(adapter, false);
@@@ -5576,7 -5451,7 +5576,7 @@@ static void be_shutdown(struct pci_dev 
  
        be_roce_dev_shutdown(adapter);
        cancel_delayed_work_sync(&adapter->work);
 -      cancel_delayed_work_sync(&adapter->func_recovery_work);
 +      be_cancel_err_detection(adapter);
  
        netif_device_detach(adapter->netdev);
  
@@@ -5589,15 -5464,22 +5589,15 @@@ static pci_ers_result_t be_eeh_err_dete
                                            pci_channel_state_t state)
  {
        struct be_adapter *adapter = pci_get_drvdata(pdev);
 -      struct net_device *netdev =  adapter->netdev;
  
        dev_err(&adapter->pdev->dev, "EEH error detected\n");
  
        if (!adapter->eeh_error) {
                adapter->eeh_error = true;
  
 -              cancel_delayed_work_sync(&adapter->func_recovery_work);
 +              be_cancel_err_detection(adapter);
  
 -              rtnl_lock();
 -              netif_device_detach(netdev);
 -              if (netif_running(netdev))
 -                      be_close(netdev);
 -              rtnl_unlock();
 -
 -              be_clear(adapter);
 +              be_cleanup(adapter);
        }
  
        if (state == pci_channel_io_perm_failure)
@@@ -5648,73 -5530,43 +5648,73 @@@ static void be_eeh_resume(struct pci_de
  {
        int status = 0;
        struct be_adapter *adapter = pci_get_drvdata(pdev);
 -      struct net_device *netdev =  adapter->netdev;
  
        dev_info(&adapter->pdev->dev, "EEH resume\n");
  
        pci_save_state(pdev);
  
 -      status = be_cmd_reset_function(adapter);
 +      status = be_resume(adapter);
        if (status)
                goto err;
  
 -      /* On some BE3 FW versions, after a HW reset,
 -       * interrupts will remain disabled for each function.
 -       * So, explicitly enable interrupts
 +      be_schedule_err_detection(adapter);
 +      return;
 +err:
 +      dev_err(&adapter->pdev->dev, "EEH resume failed\n");
 +}
 +
 +static int be_pci_sriov_configure(struct pci_dev *pdev, int num_vfs)
 +{
 +      struct be_adapter *adapter = pci_get_drvdata(pdev);
 +      u16 num_vf_qs;
 +      int status;
 +
 +      if (!num_vfs)
 +              be_vf_clear(adapter);
 +
 +      adapter->num_vfs = num_vfs;
 +
 +      if (adapter->num_vfs == 0 && pci_vfs_assigned(pdev)) {
 +              dev_warn(&pdev->dev,
 +                       "Cannot disable VFs while they are assigned\n");
 +              return -EBUSY;
 +      }
 +
 +      /* When the HW is in an SRIOV-capable configuration, the PF-pool
 +       * resources are equally distributed across the maximum number of
 +       * VFs. The user may request that only a subset of the max VFs be
 +       * enabled. Based on num_vfs, redistribute the pool resources
 +       * across only the requested VFs, so that each VF has access to
 +       * more resources. This facility is not available in BE3 FW.
 +       * Also, in the Lancer chip this is done by the FW.
 +       */
 -      be_intr_set(adapter, true);
 +      if (skyhawk_chip(adapter) && !pci_num_vf(pdev)) {
 +              num_vf_qs = be_calculate_vf_qs(adapter, adapter->num_vfs);
 +              status = be_cmd_set_sriov_config(adapter, adapter->pool_res,
 +                                               adapter->num_vfs, num_vf_qs);
 +              if (status)
 +                      dev_err(&pdev->dev,
 +                              "Failed to optimize SR-IOV resources\n");
 +      }
  
 -      /* tell fw we're ready to fire cmds */
 -      status = be_cmd_fw_init(adapter);
 +      status = be_get_resources(adapter);
        if (status)
 -              goto err;
 +              return be_cmd_status(status);
  
 -      status = be_setup(adapter);
 +      /* Updating real_num_tx/rx_queues() requires rtnl_lock() */
 +      rtnl_lock();
 +      status = be_update_queues(adapter);
 +      rtnl_unlock();
        if (status)
 -              goto err;
 +              return be_cmd_status(status);
  
 -      if (netif_running(netdev)) {
 -              status = be_open(netdev);
 -              if (status)
 -                      goto err;
 -      }
 +      if (adapter->num_vfs)
 +              status = be_vf_setup(adapter);
  
 -      schedule_delayed_work(&adapter->func_recovery_work,
 -                            msecs_to_jiffies(1000));
 -      netif_device_attach(netdev);
 -      return;
 -err:
 -      dev_err(&adapter->pdev->dev, "EEH resume failed\n");
 +      if (!status)
 +              return adapter->num_vfs;
 +
 +      return 0;
  }
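
The comment in be_pci_sriov_configure() above describes carving the
PF-pool resources across only the VFs the user actually requested,
instead of across the theoretical maximum. be_calculate_vf_qs() is not
part of this hunk, so the arithmetic below is only an assumed
illustration of that idea, not the driver's real computation:

#include <stdio.h>

static unsigned int calc_vf_qs(unsigned int pool_rss_qs,
			       unsigned int num_vfs)
{
	/* Dividing the whole pool by the requested VF count gives each
	 * VF more queues than dividing by the maximum VF count would.
	 */
	return num_vfs ? pool_rss_qs / num_vfs : pool_rss_qs;
}

int main(void)
{
	/* e.g. a 64-queue pool: 4 VFs get 16 queues each, 16 VFs get 4 */
	printf("%u %u\n", calc_vf_qs(64, 4), calc_vf_qs(64, 16));
	return 0;
}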
  
  static const struct pci_error_handlers be_eeh_handlers = {
@@@ -5729,9 -5581,8 +5729,9 @@@ static struct pci_driver be_driver = 
        .probe = be_probe,
        .remove = be_remove,
        .suspend = be_suspend,
 -      .resume = be_resume,
 +      .resume = be_pci_resume,
        .shutdown = be_shutdown,
 +      .sriov_configure = be_pci_sriov_configure,
        .err_handler = &be_eeh_handlers
  };
  
@@@ -5745,11 -5596,6 +5745,11 @@@ static int __init be_init_module(void
                rx_frag_size = 2048;
        }
  
 +      if (num_vfs > 0) {
 +              pr_info(DRV_NAME " : Module param num_vfs is obsolete.\n");
 +              pr_info(DRV_NAME " : Use the sysfs method to enable VFs\n");
 +      }
 +
        return pci_register_driver(&be_driver);
  }
  module_init(be_init_module);
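
With the num_vfs module parameter obsoleted, VFs are enabled by writing
a count to /sys/bus/pci/devices/<bdf>/sriov_numvfs; the PCI core routes
that write to the driver's .sriov_configure callback (writing 0
disables the VFs). A pared-down sketch of such a callback, as a generic
illustration rather than the be2net implementation above:

static int demo_sriov_configure(struct pci_dev *pdev, int num_vfs)
{
	int ret;

	if (num_vfs == 0) {
		/* Refuse to disable while VFs are assigned to guests */
		if (pci_vfs_assigned(pdev))
			return -EBUSY;
		pci_disable_sriov(pdev);
		return 0;
	}

	ret = pci_enable_sriov(pdev, num_vfs);
	/* Convention: return the number of VFs enabled on success */
	return ret ? ret : num_vfs;
}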
diff --combined drivers/net/vxlan.c
index 577c9b071ad9e8568d955a39ce00eb185e52e186,fceb637efd6b1246a836a244600e00d68a504aeb..154116aafd0d8c5cb6caab9056a2245cbc3c783b
@@@ -127,6 -127,10 +127,6 @@@ struct vxlan_dev 
        __u8              ttl;
        u32               flags;        /* VXLAN_F_* in vxlan.h */
  
 -      struct work_struct sock_work;
 -      struct work_struct igmp_join;
 -      struct work_struct igmp_leave;
 -
        unsigned long     age_interval;
        struct timer_list age_timer;
        spinlock_t        hash_lock;
  static u32 vxlan_salt __read_mostly;
  static struct workqueue_struct *vxlan_wq;
  
 -static void vxlan_sock_work(struct work_struct *work);
 -
  #if IS_ENABLED(CONFIG_IPV6)
  static inline
  bool vxlan_addr_equal(const union vxlan_addr *a, const union vxlan_addr *b)
  {
 -       if (a->sa.sa_family != b->sa.sa_family)
 -               return false;
 -       if (a->sa.sa_family == AF_INET6)
 -               return ipv6_addr_equal(&a->sin6.sin6_addr, &b->sin6.sin6_addr);
 -       else
 -               return a->sin.sin_addr.s_addr == b->sin.sin_addr.s_addr;
 +      if (a->sa.sa_family != b->sa.sa_family)
 +              return false;
 +      if (a->sa.sa_family == AF_INET6)
 +              return ipv6_addr_equal(&a->sin6.sin6_addr, &b->sin6.sin6_addr);
 +      else
 +              return a->sin.sin_addr.s_addr == b->sin.sin_addr.s_addr;
  }
  
  static inline bool vxlan_addr_any(const union vxlan_addr *ipa)
  {
 -       if (ipa->sa.sa_family == AF_INET6)
 -               return ipv6_addr_any(&ipa->sin6.sin6_addr);
 -       else
 -               return ipa->sin.sin_addr.s_addr == htonl(INADDR_ANY);
 +      if (ipa->sa.sa_family == AF_INET6)
 +              return ipv6_addr_any(&ipa->sin6.sin6_addr);
 +      else
 +              return ipa->sin.sin_addr.s_addr == htonl(INADDR_ANY);
  }
  
  static inline bool vxlan_addr_multicast(const union vxlan_addr *ipa)
  {
 -       if (ipa->sa.sa_family == AF_INET6)
 -               return ipv6_addr_is_multicast(&ipa->sin6.sin6_addr);
 -       else
 -               return IN_MULTICAST(ntohl(ipa->sin.sin_addr.s_addr));
 +      if (ipa->sa.sa_family == AF_INET6)
 +              return ipv6_addr_is_multicast(&ipa->sin6.sin6_addr);
 +      else
 +              return IN_MULTICAST(ntohl(ipa->sin.sin_addr.s_addr));
  }
  
  static int vxlan_nla_get_addr(union vxlan_addr *ip, struct nlattr *nla)
  {
 -       if (nla_len(nla) >= sizeof(struct in6_addr)) {
 -               nla_memcpy(&ip->sin6.sin6_addr, nla, sizeof(struct in6_addr));
 -               ip->sa.sa_family = AF_INET6;
 -               return 0;
 -       } else if (nla_len(nla) >= sizeof(__be32)) {
 -               ip->sin.sin_addr.s_addr = nla_get_be32(nla);
 -               ip->sa.sa_family = AF_INET;
 -               return 0;
 -       } else {
 -               return -EAFNOSUPPORT;
 -       }
 +      if (nla_len(nla) >= sizeof(struct in6_addr)) {
 +              ip->sin6.sin6_addr = nla_get_in6_addr(nla);
 +              ip->sa.sa_family = AF_INET6;
 +              return 0;
 +      } else if (nla_len(nla) >= sizeof(__be32)) {
 +              ip->sin.sin_addr.s_addr = nla_get_in_addr(nla);
 +              ip->sa.sa_family = AF_INET;
 +              return 0;
 +      } else {
 +              return -EAFNOSUPPORT;
 +      }
  }
  
  static int vxlan_nla_put_addr(struct sk_buff *skb, int attr,
 -                             const union vxlan_addr *ip)
 +                            const union vxlan_addr *ip)
  {
 -       if (ip->sa.sa_family == AF_INET6)
 -               return nla_put(skb, attr, sizeof(struct in6_addr), &ip->sin6.sin6_addr);
 -       else
 -               return nla_put_be32(skb, attr, ip->sin.sin_addr.s_addr);
 +      if (ip->sa.sa_family == AF_INET6)
 +              return nla_put_in6_addr(skb, attr, &ip->sin6.sin6_addr);
 +      else
 +              return nla_put_in_addr(skb, attr, ip->sin.sin_addr.s_addr);
  }
  
  #else /* !CONFIG_IPV6 */
  static inline
  bool vxlan_addr_equal(const union vxlan_addr *a, const union vxlan_addr *b)
  {
 -       return a->sin.sin_addr.s_addr == b->sin.sin_addr.s_addr;
 +      return a->sin.sin_addr.s_addr == b->sin.sin_addr.s_addr;
  }
  
  static inline bool vxlan_addr_any(const union vxlan_addr *ipa)
  {
 -       return ipa->sin.sin_addr.s_addr == htonl(INADDR_ANY);
 +      return ipa->sin.sin_addr.s_addr == htonl(INADDR_ANY);
  }
  
  static inline bool vxlan_addr_multicast(const union vxlan_addr *ipa)
  {
 -       return IN_MULTICAST(ntohl(ipa->sin.sin_addr.s_addr));
 +      return IN_MULTICAST(ntohl(ipa->sin.sin_addr.s_addr));
  }
  
  static int vxlan_nla_get_addr(union vxlan_addr *ip, struct nlattr *nla)
  {
 -       if (nla_len(nla) >= sizeof(struct in6_addr)) {
 -               return -EAFNOSUPPORT;
 -       } else if (nla_len(nla) >= sizeof(__be32)) {
 -               ip->sin.sin_addr.s_addr = nla_get_be32(nla);
 -               ip->sa.sa_family = AF_INET;
 -               return 0;
 -       } else {
 -               return -EAFNOSUPPORT;
 -       }
 +      if (nla_len(nla) >= sizeof(struct in6_addr)) {
 +              return -EAFNOSUPPORT;
 +      } else if (nla_len(nla) >= sizeof(__be32)) {
 +              ip->sin.sin_addr.s_addr = nla_get_in_addr(nla);
 +              ip->sa.sa_family = AF_INET;
 +              return 0;
 +      } else {
 +              return -EAFNOSUPPORT;
 +      }
  }
  
  static int vxlan_nla_put_addr(struct sk_buff *skb, int attr,
 -                             const union vxlan_addr *ip)
 +                            const union vxlan_addr *ip)
  {
 -       return nla_put_be32(skb, attr, ip->sin.sin_addr.s_addr);
 +      return nla_put_in_addr(skb, attr, ip->sin.sin_addr.s_addr);
  }
  #endif
  
@@@ -989,7 -995,7 +989,7 @@@ out
  
  /* Watch incoming packets to learn mapping between Ethernet address
   * and Tunnel endpoint.
 - * Return true if packet is bogus and should be droppped.
 + * Return true if packet is bogus and should be dropped.
   */
  static bool vxlan_snoop(struct net_device *dev,
                        union vxlan_addr *src_ip, const u8 *src_mac)
@@@ -1066,6 -1072,11 +1066,6 @@@ static bool vxlan_group_used(struct vxl
        return false;
  }
  
 -static void vxlan_sock_hold(struct vxlan_sock *vs)
 -{
 -      atomic_inc(&vs->refcnt);
 -}
 -
  void vxlan_sock_release(struct vxlan_sock *vs)
  {
        struct sock *sk = vs->sock->sk;
  }
  EXPORT_SYMBOL_GPL(vxlan_sock_release);
  
 -/* Callback to update multicast group membership when first VNI on
 - * multicast asddress is brought up
 - * Done as workqueue because ip_mc_join_group acquires RTNL.
 +/* Update multicast group membership when first VNI on
 + * multicast address is brought up
   */
 -static void vxlan_igmp_join(struct work_struct *work)
 +static int vxlan_igmp_join(struct vxlan_dev *vxlan)
  {
 -      struct vxlan_dev *vxlan = container_of(work, struct vxlan_dev, igmp_join);
        struct vxlan_sock *vs = vxlan->vn_sock;
        struct sock *sk = vs->sock->sk;
        union vxlan_addr *ip = &vxlan->default_dst.remote_ip;
        int ifindex = vxlan->default_dst.remote_ifindex;
 +      int ret = -EINVAL;
  
        lock_sock(sk);
        if (ip->sa.sa_family == AF_INET) {
                        .imr_ifindex            = ifindex,
                };
  
 -              ip_mc_join_group(sk, &mreq);
 +              ret = ip_mc_join_group(sk, &mreq);
  #if IS_ENABLED(CONFIG_IPV6)
        } else {
 -              ipv6_stub->ipv6_sock_mc_join(sk, ifindex,
 -                                           &ip->sin6.sin6_addr);
 +              ret = ipv6_stub->ipv6_sock_mc_join(sk, ifindex,
 +                                                 &ip->sin6.sin6_addr);
  #endif
        }
        release_sock(sk);
  
 -      vxlan_sock_release(vs);
 -      dev_put(vxlan->dev);
 +      return ret;
  }
  
  /* Inverse of vxlan_igmp_join when last VNI is brought down */
 -static void vxlan_igmp_leave(struct work_struct *work)
 +static int vxlan_igmp_leave(struct vxlan_dev *vxlan)
  {
 -      struct vxlan_dev *vxlan = container_of(work, struct vxlan_dev, igmp_leave);
        struct vxlan_sock *vs = vxlan->vn_sock;
        struct sock *sk = vs->sock->sk;
        union vxlan_addr *ip = &vxlan->default_dst.remote_ip;
        int ifindex = vxlan->default_dst.remote_ifindex;
 +      int ret = -EINVAL;
  
        lock_sock(sk);
        if (ip->sa.sa_family == AF_INET) {
                        .imr_ifindex            = ifindex,
                };
  
 -              ip_mc_leave_group(sk, &mreq);
 +              ret = ip_mc_leave_group(sk, &mreq);
  #if IS_ENABLED(CONFIG_IPV6)
        } else {
 -              ipv6_stub->ipv6_sock_mc_drop(sk, ifindex,
 -                                           &ip->sin6.sin6_addr);
 +              ret = ipv6_stub->ipv6_sock_mc_drop(sk, ifindex,
 +                                                 &ip->sin6.sin6_addr);
  #endif
        }
 -
        release_sock(sk);
  
 -      vxlan_sock_release(vs);
 -      dev_put(vxlan->dev);
 +      return ret;
  }
  
  static struct vxlanhdr *vxlan_remcsum(struct sk_buff *skb, struct vxlanhdr *vh,
@@@ -1229,7 -1244,7 +1229,7 @@@ static int vxlan_udp_encap_recv(struct 
                 * this as a malformed packet. This behavior diverges from
                 * VXLAN RFC (RFC7348) which stipulates that bits in
                 * reserved fields are to be ignored. The approach here
 -               * maintains compatbility with previous stack code, and also
 +               * maintains compatibility with previous stack code, and also
                 * is more robust and provides a little more security in
                 * adding extensions to VXLAN.
                 */
@@@ -1672,8 -1687,7 +1672,8 @@@ static void vxlan_build_gbp_hdr(struct 
  }
  
  #if IS_ENABLED(CONFIG_IPV6)
 -static int vxlan6_xmit_skb(struct dst_entry *dst, struct sk_buff *skb,
 +static int vxlan6_xmit_skb(struct dst_entry *dst, struct sock *sk,
 +                         struct sk_buff *skb,
                           struct net_device *dev, struct in6_addr *saddr,
                           struct in6_addr *daddr, __u8 prio, __u8 ttl,
                           __be16 src_port, __be16 dst_port,
                }
        }
  
-       skb = iptunnel_handle_offloads(skb, udp_sum, type);
-       if (IS_ERR(skb)) {
-               err = -EINVAL;
-               goto err;
-       }
        skb_scrub_packet(skb, xnet);
  
        min_headroom = LL_RESERVED_SPACE(dst->dev) + dst->header_len
                goto err;
        }
  
+       skb = iptunnel_handle_offloads(skb, udp_sum, type);
+       if (IS_ERR(skb)) {
+               err = -EINVAL;
+               goto err;
+       }
        vxh = (struct vxlanhdr *) __skb_push(skb, sizeof(*vxh));
        vxh->vx_flags = htonl(VXLAN_HF_VNI);
        vxh->vx_vni = md->vni;
  
        skb_set_inner_protocol(skb, htons(ETH_P_TEB));
  
 -      udp_tunnel6_xmit_skb(dst, skb, dev, saddr, daddr, prio,
 +      udp_tunnel6_xmit_skb(dst, sk, skb, dev, saddr, daddr, prio,
                             ttl, src_port, dst_port,
                             !!(vxflags & VXLAN_F_UDP_ZERO_CSUM6_TX));
        return 0;
@@@ -1759,7 -1773,7 +1759,7 @@@ err
  }
  #endif
  
 -int vxlan_xmit_skb(struct rtable *rt, struct sk_buff *skb,
 +int vxlan_xmit_skb(struct rtable *rt, struct sock *sk, struct sk_buff *skb,
                   __be32 src, __be32 dst, __u8 tos, __u8 ttl, __be16 df,
                   __be16 src_port, __be16 dst_port,
                   struct vxlan_metadata *md, bool xnet, u32 vxflags)
                }
        }
  
-       skb = iptunnel_handle_offloads(skb, udp_sum, type);
-       if (IS_ERR(skb))
-               return PTR_ERR(skb);
        min_headroom = LL_RESERVED_SPACE(rt->dst.dev) + rt->dst.header_len
                        + VXLAN_HLEN + sizeof(struct iphdr)
                        + (skb_vlan_tag_present(skb) ? VLAN_HLEN : 0);
        if (WARN_ON(!skb))
                return -ENOMEM;
  
+       skb = iptunnel_handle_offloads(skb, udp_sum, type);
+       if (IS_ERR(skb))
+               return PTR_ERR(skb);
        vxh = (struct vxlanhdr *) __skb_push(skb, sizeof(*vxh));
        vxh->vx_flags = htonl(VXLAN_HF_VNI);
        vxh->vx_vni = md->vni;
  
        skb_set_inner_protocol(skb, htons(ETH_P_TEB));
  
 -      return udp_tunnel_xmit_skb(rt, skb, src, dst, tos,
 +      return udp_tunnel_xmit_skb(rt, sk, skb, src, dst, tos,
                                   ttl, df, src_port, dst_port, xnet,
                                   !(vxflags & VXLAN_F_UDP_CSUM));
  }
@@@ -1883,7 -1897,6 +1883,7 @@@ static void vxlan_xmit_one(struct sk_bu
                           struct vxlan_rdst *rdst, bool did_rsc)
  {
        struct vxlan_dev *vxlan = netdev_priv(dev);
 +      struct sock *sk = vxlan->vn_sock->sock->sk;
        struct rtable *rt = NULL;
        const struct iphdr *old_iph;
        struct flowi4 fl4;
                md.vni = htonl(vni << 8);
                md.gbp = skb->mark;
  
 -              err = vxlan_xmit_skb(rt, skb, fl4.saddr,
 +              err = vxlan_xmit_skb(rt, sk, skb, fl4.saddr,
                                     dst->sin.sin_addr.s_addr, tos, ttl, df,
                                     src_port, dst_port, &md,
                                     !net_eq(vxlan->net, dev_net(vxlan->dev)),
                iptunnel_xmit_stats(err, &dev->stats, dev->tstats);
  #if IS_ENABLED(CONFIG_IPV6)
        } else {
 -              struct sock *sk = vxlan->vn_sock->sock->sk;
                struct dst_entry *ndst;
                struct flowi6 fl6;
                u32 flags;
                md.vni = htonl(vni << 8);
                md.gbp = skb->mark;
  
 -              err = vxlan6_xmit_skb(ndst, skb, dev, &fl6.saddr, &fl6.daddr,
 +              err = vxlan6_xmit_skb(ndst, sk, skb, dev, &fl6.saddr, &fl6.daddr,
                                      0, ttl, src_port, dst_port, &md,
                                      !net_eq(vxlan->net, dev_net(vxlan->dev)),
                                      vxlan->flags);
@@@ -2161,22 -2175,37 +2161,22 @@@ static void vxlan_cleanup(unsigned lon
  
  static void vxlan_vs_add_dev(struct vxlan_sock *vs, struct vxlan_dev *vxlan)
  {
 +      struct vxlan_net *vn = net_generic(vxlan->net, vxlan_net_id);
        __u32 vni = vxlan->default_dst.remote_vni;
  
        vxlan->vn_sock = vs;
 +      spin_lock(&vn->sock_lock);
        hlist_add_head_rcu(&vxlan->hlist, vni_head(vs, vni));
 +      spin_unlock(&vn->sock_lock);
  }
  
  /* Setup stats when device is created */
  static int vxlan_init(struct net_device *dev)
  {
 -      struct vxlan_dev *vxlan = netdev_priv(dev);
 -      struct vxlan_net *vn = net_generic(vxlan->net, vxlan_net_id);
 -      struct vxlan_sock *vs;
 -      bool ipv6 = vxlan->flags & VXLAN_F_IPV6;
 -
        dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats);
        if (!dev->tstats)
                return -ENOMEM;
  
 -      spin_lock(&vn->sock_lock);
 -      vs = vxlan_find_sock(vxlan->net, ipv6 ? AF_INET6 : AF_INET,
 -                           vxlan->dst_port, vxlan->flags);
 -      if (vs && atomic_add_unless(&vs->refcnt, 1, 0)) {
 -              /* If we have a socket with same port already, reuse it */
 -              vxlan_vs_add_dev(vs, vxlan);
 -      } else {
 -              /* otherwise make new socket outside of RTNL */
 -              dev_hold(dev);
 -              queue_work(vxlan_wq, &vxlan->sock_work);
 -      }
 -      spin_unlock(&vn->sock_lock);
 -
        return 0;
  }
  
@@@ -2194,9 -2223,12 +2194,9 @@@ static void vxlan_fdb_delete_default(st
  static void vxlan_uninit(struct net_device *dev)
  {
        struct vxlan_dev *vxlan = netdev_priv(dev);
 -      struct vxlan_sock *vs = vxlan->vn_sock;
  
        vxlan_fdb_delete_default(vxlan);
  
 -      if (vs)
 -              vxlan_sock_release(vs);
        free_percpu(dev->tstats);
  }
  
  static int vxlan_open(struct net_device *dev)
  {
        struct vxlan_dev *vxlan = netdev_priv(dev);
 -      struct vxlan_sock *vs = vxlan->vn_sock;
 +      struct vxlan_sock *vs;
 +      int ret = 0;
  
 -      /* socket hasn't been created */
 -      if (!vs)
 -              return -ENOTCONN;
 +      vs = vxlan_sock_add(vxlan->net, vxlan->dst_port, vxlan_rcv, NULL,
 +                          false, vxlan->flags);
 +      if (IS_ERR(vs))
 +              return PTR_ERR(vs);
 +
 +      vxlan_vs_add_dev(vs, vxlan);
  
        if (vxlan_addr_multicast(&vxlan->default_dst.remote_ip)) {
 -              vxlan_sock_hold(vs);
 -              dev_hold(dev);
 -              queue_work(vxlan_wq, &vxlan->igmp_join);
 +              ret = vxlan_igmp_join(vxlan);
 +              if (ret) {
 +                      vxlan_sock_release(vs);
 +                      return ret;
 +              }
        }
  
        if (vxlan->age_interval)
                mod_timer(&vxlan->age_timer, jiffies + FDB_AGE_INTERVAL);
  
 -      return 0;
 +      return ret;
  }
  
  /* Purge the forwarding table */
@@@ -2253,18 -2279,19 +2253,18 @@@ static int vxlan_stop(struct net_devic
        struct vxlan_dev *vxlan = netdev_priv(dev);
        struct vxlan_net *vn = net_generic(vxlan->net, vxlan_net_id);
        struct vxlan_sock *vs = vxlan->vn_sock;
 +      int ret = 0;
  
 -      if (vs && vxlan_addr_multicast(&vxlan->default_dst.remote_ip) &&
 -          !vxlan_group_used(vn, vxlan)) {
 -              vxlan_sock_hold(vs);
 -              dev_hold(dev);
 -              queue_work(vxlan_wq, &vxlan->igmp_leave);
 -      }
 +      if (vxlan_addr_multicast(&vxlan->default_dst.remote_ip) &&
 +          !vxlan_group_used(vn, vxlan))
 +              ret = vxlan_igmp_leave(vxlan);
  
        del_timer_sync(&vxlan->age_timer);
  
        vxlan_flush(vxlan);
 +      vxlan_sock_release(vs);
  
 -      return 0;
 +      return ret;
  }
  
  /* Stub, nothing needs to be done. */
@@@ -2375,6 -2402,9 +2375,6 @@@ static void vxlan_setup(struct net_devi
  
        INIT_LIST_HEAD(&vxlan->next);
        spin_lock_init(&vxlan->hash_lock);
 -      INIT_WORK(&vxlan->igmp_join, vxlan_igmp_join);
 -      INIT_WORK(&vxlan->igmp_leave, vxlan_igmp_leave);
 -      INIT_WORK(&vxlan->sock_work, vxlan_sock_work);
  
        init_timer_deferrable(&vxlan->age_timer);
        vxlan->age_timer.function = vxlan_cleanup;
@@@ -2486,6 -2516,7 +2486,6 @@@ static struct socket *vxlan_create_sock
                    !(flags & VXLAN_F_UDP_ZERO_CSUM6_RX);
        } else {
                udp_conf.family = AF_INET;
 -              udp_conf.local_ip.s_addr = INADDR_ANY;
        }
  
        udp_conf.local_udp_port = port;
@@@ -2521,8 -2552,6 +2521,8 @@@ static struct vxlan_sock *vxlan_socket_
  
        sock = vxlan_create_sock(net, ipv6, port, flags);
        if (IS_ERR(sock)) {
 +              pr_info("Cannot bind port %d, err=%ld\n", ntohs(port),
 +                      PTR_ERR(sock));
                kfree(vs);
                return ERR_CAST(sock);
        }
@@@ -2562,23 -2591,45 +2562,23 @@@ struct vxlan_sock *vxlan_sock_add(struc
        struct vxlan_sock *vs;
        bool ipv6 = flags & VXLAN_F_IPV6;
  
 -      vs = vxlan_socket_create(net, port, rcv, data, flags);
 -      if (!IS_ERR(vs))
 -              return vs;
 -
 -      if (no_share)   /* Return error if sharing is not allowed. */
 -              return vs;
 -
 -      spin_lock(&vn->sock_lock);
 -      vs = vxlan_find_sock(net, ipv6 ? AF_INET6 : AF_INET, port, flags);
 -      if (vs && ((vs->rcv != rcv) ||
 -                 !atomic_add_unless(&vs->refcnt, 1, 0)))
 -                      vs = ERR_PTR(-EBUSY);
 -      spin_unlock(&vn->sock_lock);
 -
 -      if (!vs)
 -              vs = ERR_PTR(-EINVAL);
 +      if (!no_share) {
 +              spin_lock(&vn->sock_lock);
 +              vs = vxlan_find_sock(net, ipv6 ? AF_INET6 : AF_INET, port,
 +                                   flags);
 +              if (vs && vs->rcv == rcv) {
 +                      if (!atomic_add_unless(&vs->refcnt, 1, 0))
 +                              vs = ERR_PTR(-EBUSY);
 +                      spin_unlock(&vn->sock_lock);
 +                      return vs;
 +              }
 +              spin_unlock(&vn->sock_lock);
 +      }
  
 -      return vs;
 +      return vxlan_socket_create(net, port, rcv, data, flags);
  }
  EXPORT_SYMBOL_GPL(vxlan_sock_add);
  
 -/* Scheduled at device creation to bind to a socket */
 -static void vxlan_sock_work(struct work_struct *work)
 -{
 -      struct vxlan_dev *vxlan = container_of(work, struct vxlan_dev, sock_work);
 -      struct net *net = vxlan->net;
 -      struct vxlan_net *vn = net_generic(net, vxlan_net_id);
 -      __be16 port = vxlan->dst_port;
 -      struct vxlan_sock *nvs;
 -
 -      nvs = vxlan_sock_add(net, port, vxlan_rcv, NULL, false, vxlan->flags);
 -      spin_lock(&vn->sock_lock);
 -      if (!IS_ERR(nvs))
 -              vxlan_vs_add_dev(nvs, vxlan);
 -      spin_unlock(&vn->sock_lock);
 -
 -      dev_put(vxlan->dev);
 -}
 -
  static int vxlan_newlink(struct net *src_net, struct net_device *dev,
                         struct nlattr *tb[], struct nlattr *data[])
  {
        /* Unless IPv6 is explicitly requested, assume IPv4 */
        dst->remote_ip.sa.sa_family = AF_INET;
        if (data[IFLA_VXLAN_GROUP]) {
 -              dst->remote_ip.sin.sin_addr.s_addr = nla_get_be32(data[IFLA_VXLAN_GROUP]);
 +              dst->remote_ip.sin.sin_addr.s_addr = nla_get_in_addr(data[IFLA_VXLAN_GROUP]);
        } else if (data[IFLA_VXLAN_GROUP6]) {
                if (!IS_ENABLED(CONFIG_IPV6))
                        return -EPFNOSUPPORT;
  
 -              nla_memcpy(&dst->remote_ip.sin6.sin6_addr, data[IFLA_VXLAN_GROUP6],
 -                         sizeof(struct in6_addr));
 +              dst->remote_ip.sin6.sin6_addr = nla_get_in6_addr(data[IFLA_VXLAN_GROUP6]);
                dst->remote_ip.sa.sa_family = AF_INET6;
                use_ipv6 = true;
        }
  
        if (data[IFLA_VXLAN_LOCAL]) {
 -              vxlan->saddr.sin.sin_addr.s_addr = nla_get_be32(data[IFLA_VXLAN_LOCAL]);
 +              vxlan->saddr.sin.sin_addr.s_addr = nla_get_in_addr(data[IFLA_VXLAN_LOCAL]);
                vxlan->saddr.sa.sa_family = AF_INET;
        } else if (data[IFLA_VXLAN_LOCAL6]) {
                if (!IS_ENABLED(CONFIG_IPV6))
                        return -EPFNOSUPPORT;
  
                /* TODO: respect scope id */
 -              nla_memcpy(&vxlan->saddr.sin6.sin6_addr, data[IFLA_VXLAN_LOCAL6],
 -                         sizeof(struct in6_addr));
 +              vxlan->saddr.sin6.sin6_addr = nla_get_in6_addr(data[IFLA_VXLAN_LOCAL6]);
                vxlan->saddr.sa.sa_family = AF_INET6;
                use_ipv6 = true;
        }
@@@ -2803,13 -2856,13 +2803,13 @@@ static int vxlan_fill_info(struct sk_bu
  
        if (!vxlan_addr_any(&dst->remote_ip)) {
                if (dst->remote_ip.sa.sa_family == AF_INET) {
 -                      if (nla_put_be32(skb, IFLA_VXLAN_GROUP,
 -                                       dst->remote_ip.sin.sin_addr.s_addr))
 +                      if (nla_put_in_addr(skb, IFLA_VXLAN_GROUP,
 +                                          dst->remote_ip.sin.sin_addr.s_addr))
                                goto nla_put_failure;
  #if IS_ENABLED(CONFIG_IPV6)
                } else {
 -                      if (nla_put(skb, IFLA_VXLAN_GROUP6, sizeof(struct in6_addr),
 -                                  &dst->remote_ip.sin6.sin6_addr))
 +                      if (nla_put_in6_addr(skb, IFLA_VXLAN_GROUP6,
 +                                           &dst->remote_ip.sin6.sin6_addr))
                                goto nla_put_failure;
  #endif
                }
  
        if (!vxlan_addr_any(&vxlan->saddr)) {
                if (vxlan->saddr.sa.sa_family == AF_INET) {
 -                      if (nla_put_be32(skb, IFLA_VXLAN_LOCAL,
 -                                       vxlan->saddr.sin.sin_addr.s_addr))
 +                      if (nla_put_in_addr(skb, IFLA_VXLAN_LOCAL,
 +                                          vxlan->saddr.sin.sin_addr.s_addr))
                                goto nla_put_failure;
  #if IS_ENABLED(CONFIG_IPV6)
                } else {
 -                      if (nla_put(skb, IFLA_VXLAN_LOCAL6, sizeof(struct in6_addr),
 -                                  &vxlan->saddr.sin6.sin6_addr))
 +                      if (nla_put_in6_addr(skb, IFLA_VXLAN_LOCAL6,
 +                                           &vxlan->saddr.sin6.sin6_addr))
                                goto nla_put_failure;
  #endif
                }
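
vxlan_sock_add() above reuses an existing socket only when
atomic_add_unless(&vs->refcnt, 1, 0) succeeds, so a socket whose
refcount has already dropped to zero (and is therefore mid-teardown)
is never revived. A standalone C11 sketch of that guard:

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

static bool get_unless_zero(atomic_int *refcnt)
{
	int old = atomic_load(refcnt);

	/* Take a reference only while the count is still positive; an
	 * object already at zero is being freed and must not be reused.
	 */
	while (old != 0) {
		if (atomic_compare_exchange_weak(refcnt, &old, old + 1))
			return true;
	}
	return false;
}

int main(void)
{
	atomic_int live = 1, dying = 0;

	printf("live: %d\n", get_unless_zero(&live));	/* 1: reuse it */
	printf("dying: %d\n", get_unless_zero(&dying));	/* 0: make new */
	return 0;
}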
diff --combined include/linux/brcmphy.h
index cab60661752237f736c817588d1d0e1a01469cdf,1c9920b38fa1b15f8dd91b2e47c653e9f7cebe0b..ae2982c0f7a60ed93339e767feaf1fc89aa02134
@@@ -11,6 -11,7 +11,7 @@@
  #define PHY_ID_BCM5421                        0x002060e0
  #define PHY_ID_BCM5464                        0x002060b0
  #define PHY_ID_BCM5461                        0x002060c0
+ #define PHY_ID_BCM54616S              0x03625d10
  #define PHY_ID_BCM57780                       0x03625d90
  
  #define PHY_ID_BCM7250                        0xae025280
@@@ -19,7 -20,6 +20,7 @@@
  #define PHY_ID_BCM7425                        0x03625e60
  #define PHY_ID_BCM7429                        0x600d8730
  #define PHY_ID_BCM7439                        0x600d8480
 +#define PHY_ID_BCM7439_2              0xae025080
  #define PHY_ID_BCM7445                        0x600d8510
  
  #define PHY_BCM_OUI_MASK              0xfffffc00
diff --combined net/ipv4/fou.c
index 263710259774151e40fa67ba3aa9652d4a1e2955,335e75207284e13eab64b748188d82764818b46d..af150b43b214123b052c43ec7e40449af3d7ecd2
  #include <uapi/linux/fou.h>
  #include <uapi/linux/genetlink.h>
  
 -static DEFINE_SPINLOCK(fou_lock);
 -static LIST_HEAD(fou_list);
 -
  struct fou {
        struct socket *sock;
        u8 protocol;
        u8 flags;
 -      u16 port;
 +      __be16 port;
 +      u16 type;
        struct udp_offload udp_offloads;
        struct list_head list;
  };
@@@ -35,13 -37,6 +35,13 @@@ struct fou_cfg 
        struct udp_port_cfg udp_config;
  };
  
 +static unsigned int fou_net_id;
 +
 +struct fou_net {
 +      struct list_head fou_list;
 +      struct mutex fou_lock;
 +};
 +
  static inline struct fou *fou_from_sock(struct sock *sk)
  {
        return sk->sk_user_data;
@@@ -392,21 -387,20 +392,21 @@@ out_unlock
        return err;
  }
  
 -static int fou_add_to_port_list(struct fou *fou)
 +static int fou_add_to_port_list(struct net *net, struct fou *fou)
  {
 +      struct fou_net *fn = net_generic(net, fou_net_id);
        struct fou *fout;
  
 -      spin_lock(&fou_lock);
 -      list_for_each_entry(fout, &fou_list, list) {
 +      mutex_lock(&fn->fou_lock);
 +      list_for_each_entry(fout, &fn->fou_list, list) {
                if (fou->port == fout->port) {
 -                      spin_unlock(&fou_lock);
 +                      mutex_unlock(&fn->fou_lock);
                        return -EALREADY;
                }
        }
  
 -      list_add(&fou->list, &fou_list);
 -      spin_unlock(&fou_lock);
 +      list_add(&fou->list, &fn->fou_list);
 +      mutex_unlock(&fn->fou_lock);
  
        return 0;
  }
@@@ -416,10 -410,14 +416,10 @@@ static void fou_release(struct fou *fou
        struct socket *sock = fou->sock;
        struct sock *sk = sock->sk;
  
 -      udp_del_offload(&fou->udp_offloads);
 -
 +      if (sk->sk_family == AF_INET)
 +              udp_del_offload(&fou->udp_offloads);
        list_del(&fou->list);
 -
 -      /* Remove hooks into tunnel socket */
 -      sk->sk_user_data = NULL;
 -
 -      sock_release(sock);
 +      udp_tunnel_sock_release(sock);
  
        kfree(fou);
  }
@@@ -449,10 -447,10 +449,10 @@@ static int gue_encap_init(struct sock *
  static int fou_create(struct net *net, struct fou_cfg *cfg,
                      struct socket **sockp)
  {
 -      struct fou *fou = NULL;
 -      int err;
        struct socket *sock = NULL;
 +      struct fou *fou = NULL;
        struct sock *sk;
 +      int err;
  
        /* Open UDP socket */
        err = udp_sock_create(net, &cfg->udp_config, &sock);
                goto error;
        }
  
 +      fou->type = cfg->type;
 +
        udp_sk(sk)->encap_type = 1;
        udp_encap_enable();
  
                        goto error;
        }
  
 -      err = fou_add_to_port_list(fou);
 +      err = fou_add_to_port_list(net, fou);
        if (err)
                goto error;
  
  error:
        kfree(fou);
        if (sock)
 -              sock_release(sock);
 +              udp_tunnel_sock_release(sock);
  
        return err;
  }
  
  static int fou_destroy(struct net *net, struct fou_cfg *cfg)
  {
 -      struct fou *fou;
 -      u16 port = cfg->udp_config.local_udp_port;
 +      struct fou_net *fn = net_generic(net, fou_net_id);
 +      __be16 port = cfg->udp_config.local_udp_port;
        int err = -EINVAL;
 +      struct fou *fou;
  
 -      spin_lock(&fou_lock);
 -      list_for_each_entry(fou, &fou_list, list) {
 +      mutex_lock(&fn->fou_lock);
 +      list_for_each_entry(fou, &fn->fou_list, list) {
                if (fou->port == port) {
 -                      udp_del_offload(&fou->udp_offloads);
                        fou_release(fou);
                        err = 0;
                        break;
                }
        }
 -      spin_unlock(&fou_lock);
 +      mutex_unlock(&fn->fou_lock);
  
        return err;
  }
@@@ -577,7 -573,7 +577,7 @@@ static int parse_nl_config(struct genl_
        }
  
        if (info->attrs[FOU_ATTR_PORT]) {
 -              u16 port = nla_get_u16(info->attrs[FOU_ATTR_PORT]);
 +              __be16 port = nla_get_be16(info->attrs[FOU_ATTR_PORT]);
  
                cfg->udp_config.local_udp_port = port;
        }
  
  static int fou_nl_cmd_add_port(struct sk_buff *skb, struct genl_info *info)
  {
 +      struct net *net = genl_info_net(info);
        struct fou_cfg cfg;
        int err;
  
        if (err)
                return err;
  
 -      return fou_create(&init_net, &cfg, NULL);
 +      return fou_create(net, &cfg, NULL);
  }
  
  static int fou_nl_cmd_rm_port(struct sk_buff *skb, struct genl_info *info)
  {
 +      struct net *net = genl_info_net(info);
 +      struct fou_cfg cfg;
 +      int err;
 +
 +      err = parse_nl_config(info, &cfg);
 +      if (err)
 +              return err;
 +
 +      return fou_destroy(net, &cfg);
 +}
 +
 +static int fou_fill_info(struct fou *fou, struct sk_buff *msg)
 +{
 +      if (nla_put_u8(msg, FOU_ATTR_AF, fou->sock->sk->sk_family) ||
 +          nla_put_be16(msg, FOU_ATTR_PORT, fou->port) ||
 +          nla_put_u8(msg, FOU_ATTR_IPPROTO, fou->protocol) ||
 +          nla_put_u8(msg, FOU_ATTR_TYPE, fou->type))
 +              return -1;
 +
 +      if (fou->flags & FOU_F_REMCSUM_NOPARTIAL)
 +              if (nla_put_flag(msg, FOU_ATTR_REMCSUM_NOPARTIAL))
 +                      return -1;
 +      return 0;
 +}
 +
 +static int fou_dump_info(struct fou *fou, u32 portid, u32 seq,
 +                       u32 flags, struct sk_buff *skb, u8 cmd)
 +{
 +      void *hdr;
 +
 +      hdr = genlmsg_put(skb, portid, seq, &fou_nl_family, flags, cmd);
 +      if (!hdr)
 +              return -ENOMEM;
 +
 +      if (fou_fill_info(fou, skb) < 0)
 +              goto nla_put_failure;
 +
 +      genlmsg_end(skb, hdr);
 +      return 0;
 +
 +nla_put_failure:
 +      genlmsg_cancel(skb, hdr);
 +      return -EMSGSIZE;
 +}
 +
 +static int fou_nl_cmd_get_port(struct sk_buff *skb, struct genl_info *info)
 +{
 +      struct net *net = genl_info_net(info);
 +      struct fou_net *fn = net_generic(net, fou_net_id);
 +      struct sk_buff *msg;
        struct fou_cfg cfg;
 +      struct fou *fout;
 +      __be16 port;
 +      int ret;
 +
 +      ret = parse_nl_config(info, &cfg);
 +      if (ret)
 +              return ret;
 +      port = cfg.udp_config.local_udp_port;
 +      if (port == 0)
 +              return -EINVAL;
 +
 +      msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
 +      if (!msg)
 +              return -ENOMEM;
 +
 +      ret = -ESRCH;
 +      mutex_lock(&fn->fou_lock);
 +      list_for_each_entry(fout, &fn->fou_list, list) {
 +              if (port == fout->port) {
 +                      ret = fou_dump_info(fout, info->snd_portid,
 +                                          info->snd_seq, 0, msg,
 +                                          info->genlhdr->cmd);
 +                      break;
 +              }
 +      }
 +      mutex_unlock(&fn->fou_lock);
 +      if (ret < 0)
 +              goto out_free;
  
 -      parse_nl_config(info, &cfg);
 +      return genlmsg_reply(msg, info);
  
 -      return fou_destroy(&init_net, &cfg);
 +out_free:
 +      nlmsg_free(msg);
 +      return ret;
 +}
 +
 +static int fou_nl_dump(struct sk_buff *skb, struct netlink_callback *cb)
 +{
 +      struct net *net = sock_net(skb->sk);
 +      struct fou_net *fn = net_generic(net, fou_net_id);
 +      struct fou *fout;
 +      int idx = 0, ret;
 +
 +      mutex_lock(&fn->fou_lock);
 +      list_for_each_entry(fout, &fn->fou_list, list) {
 +              if (idx++ < cb->args[0])
 +                      continue;
 +              ret = fou_dump_info(fout, NETLINK_CB(cb->skb).portid,
 +                                  cb->nlh->nlmsg_seq, NLM_F_MULTI,
 +                                  skb, FOU_CMD_GET);
 +              if (ret)
 +                      break;
 +      }
 +      mutex_unlock(&fn->fou_lock);
 +
 +      cb->args[0] = idx;
 +      return skb->len;
  }
  
  static const struct genl_ops fou_nl_ops[] = {
                .policy = fou_nl_policy,
                .flags = GENL_ADMIN_PERM,
        },
 +      {
 +              .cmd = FOU_CMD_GET,
 +              .doit = fou_nl_cmd_get_port,
 +              .dumpit = fou_nl_dump,
 +              .policy = fou_nl_policy,
 +      },
  };
  
  size_t fou_encap_hlen(struct ip_tunnel_encap *e)
@@@ -886,12 -771,12 +886,12 @@@ EXPORT_SYMBOL(gue_build_header)
  
  #ifdef CONFIG_NET_FOU_IP_TUNNELS
  
- static const struct ip_tunnel_encap_ops __read_mostly fou_iptun_ops = {
+ static const struct ip_tunnel_encap_ops fou_iptun_ops = {
        .encap_hlen = fou_encap_hlen,
        .build_header = fou_build_header,
  };
  
- static const struct ip_tunnel_encap_ops __read_mostly gue_iptun_ops = {
+ static const struct ip_tunnel_encap_ops gue_iptun_ops = {
        .encap_hlen = gue_encap_hlen,
        .build_header = gue_build_header,
  };
@@@ -935,63 -820,38 +935,63 @@@ static void ip_tunnel_encap_del_fou_ops
  
  #endif
  
 +static __net_init int fou_init_net(struct net *net)
 +{
 +      struct fou_net *fn = net_generic(net, fou_net_id);
 +
 +      INIT_LIST_HEAD(&fn->fou_list);
 +      mutex_init(&fn->fou_lock);
 +      return 0;
 +}
 +
 +static __net_exit void fou_exit_net(struct net *net)
 +{
 +      struct fou_net *fn = net_generic(net, fou_net_id);
 +      struct fou *fou, *next;
 +
 +      /* Close all the FOU sockets */
 +      mutex_lock(&fn->fou_lock);
 +      list_for_each_entry_safe(fou, next, &fn->fou_list, list)
 +              fou_release(fou);
 +      mutex_unlock(&fn->fou_lock);
 +}
 +
 +static struct pernet_operations fou_net_ops = {
 +      .init = fou_init_net,
 +      .exit = fou_exit_net,
 +      .id   = &fou_net_id,
 +      .size = sizeof(struct fou_net),
 +};
 +
  static int __init fou_init(void)
  {
        int ret;
  
 +      ret = register_pernet_device(&fou_net_ops);
 +      if (ret)
 +              goto exit;
 +
        ret = genl_register_family_with_ops(&fou_nl_family,
                                            fou_nl_ops);
 -
        if (ret < 0)
 -              goto exit;
 +              goto unregister;
  
        ret = ip_tunnel_encap_add_fou_ops();
 -      if (ret < 0)
 -              genl_unregister_family(&fou_nl_family);
 +      if (ret == 0)
 +              return 0;
  
 +      genl_unregister_family(&fou_nl_family);
 +unregister:
 +      unregister_pernet_device(&fou_net_ops);
  exit:
        return ret;
  }
  
  static void __exit fou_fini(void)
  {
 -      struct fou *fou, *next;
 -
        ip_tunnel_encap_del_fou_ops();
 -
        genl_unregister_family(&fou_nl_family);
 -
 -      /* Close all the FOU sockets */
 -
 -      spin_lock(&fou_lock);
 -      list_for_each_entry_safe(fou, next, &fou_list, list)
 -              fou_release(fou);
 -      spin_unlock(&fou_lock);
 +      unregister_pernet_device(&fou_net_ops);
  }
  
  module_init(fou_init);
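
The global fou_list/fou_lock pair is replaced above by per-network-
namespace state registered through register_pernet_device(), so each
netns gets its own list and lock, and fou_exit_net() tears the sockets
down when the namespace dies. A skeletal sketch of the same pattern
(the demo_* names are placeholders; this builds only inside a kernel
tree):

#include <linux/list.h>
#include <linux/mutex.h>
#include <net/net_namespace.h>
#include <net/netns/generic.h>

static unsigned int demo_net_id;

struct demo_net {
	struct list_head entries;	/* one list per namespace */
	struct mutex lock;
};

static __net_init int demo_init_net(struct net *net)
{
	struct demo_net *dn = net_generic(net, demo_net_id);

	INIT_LIST_HEAD(&dn->entries);
	mutex_init(&dn->lock);
	return 0;
}

static __net_exit void demo_exit_net(struct net *net)
{
	/* per-namespace teardown goes here, as in fou_exit_net() */
}

static struct pernet_operations demo_net_ops = {
	.init = demo_init_net,
	.exit = demo_exit_net,
	.id   = &demo_net_id,
	.size = sizeof(struct demo_net),
};

/* registered from module init: register_pernet_device(&demo_net_ops); */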
diff --combined net/ipv4/geneve.c
index b77f5e84c623f055fe277ea2178a29589fadaf1b,a566a2e4715b33d4b4a60e9756afa71f98f00774..8986e63f3bda61a6c8ba980c050b96ec90625107
@@@ -113,10 -113,6 +113,6 @@@ int geneve_xmit_skb(struct geneve_sock 
        int min_headroom;
        int err;
  
-       skb = udp_tunnel_handle_offloads(skb, csum);
-       if (IS_ERR(skb))
-               return PTR_ERR(skb);
        min_headroom = LL_RESERVED_SPACE(rt->dst.dev) + rt->dst.header_len
                        + GENEVE_BASE_HLEN + opt_len + sizeof(struct iphdr)
                        + (skb_vlan_tag_present(skb) ? VLAN_HLEN : 0);
        if (unlikely(!skb))
                return -ENOMEM;
  
+       skb = udp_tunnel_handle_offloads(skb, csum);
+       if (IS_ERR(skb))
+               return PTR_ERR(skb);
        gnvh = (struct genevehdr *)__skb_push(skb, sizeof(*gnvh) + opt_len);
        geneve_build_header(gnvh, tun_flags, vni, opt_len, opt);
  
        skb_set_inner_protocol(skb, htons(ETH_P_TEB));
  
 -      return udp_tunnel_xmit_skb(rt, skb, src, dst,
 +      return udp_tunnel_xmit_skb(rt, gs->sock->sk, skb, src, dst,
                                   tos, ttl, df, src_port, dst_port, xnet,
                                   !csum);
  }
@@@ -196,7 -196,7 +196,7 @@@ static struct sk_buff **geneve_gro_rece
  
        rcu_read_lock();
        ptype = gro_find_receive_by_type(type);
 -      if (ptype == NULL) {
 +      if (!ptype) {
                flush = 1;
                goto out_unlock;
        }
@@@ -230,7 -230,7 +230,7 @@@ static int geneve_gro_complete(struct s
  
        rcu_read_lock();
        ptype = gro_find_complete_by_type(type);
 -      if (ptype != NULL)
 +      if (ptype)
                err = ptype->callbacks.gro_complete(skb, nhoff + gh_len);
  
        rcu_read_unlock();
diff --combined net/ipv4/tcp_output.c
index e662d85d1635d0269b669bb0f726760be3bae0d2,d520492ba698944620fe6207dffff2786eec7dfd..8c8d7e06b72fc1e5c4a50ca55136757f0501f8c0
@@@ -518,26 -518,17 +518,26 @@@ static void tcp_options_write(__be32 *p
  
        if (unlikely(OPTION_FAST_OPEN_COOKIE & options)) {
                struct tcp_fastopen_cookie *foc = opts->fastopen_cookie;
 +              u8 *p = (u8 *)ptr;
 +              u32 len; /* Fast Open option length */
 +
 +              if (foc->exp) {
 +                      len = TCPOLEN_EXP_FASTOPEN_BASE + foc->len;
 +                      *ptr = htonl((TCPOPT_EXP << 24) | (len << 16) |
 +                                   TCPOPT_FASTOPEN_MAGIC);
 +                      p += TCPOLEN_EXP_FASTOPEN_BASE;
 +              } else {
 +                      len = TCPOLEN_FASTOPEN_BASE + foc->len;
 +                      *p++ = TCPOPT_FASTOPEN;
 +                      *p++ = len;
 +              }
  
 -              *ptr++ = htonl((TCPOPT_EXP << 24) |
 -                             ((TCPOLEN_EXP_FASTOPEN_BASE + foc->len) << 16) |
 -                             TCPOPT_FASTOPEN_MAGIC);
 -
 -              memcpy(ptr, foc->val, foc->len);
 -              if ((foc->len & 3) == 2) {
 -                      u8 *align = ((u8 *)ptr) + foc->len;
 -                      align[0] = align[1] = TCPOPT_NOP;
 +              memcpy(p, foc->val, foc->len);
 +              if ((len & 3) == 2) {
 +                      p[foc->len] = TCPOPT_NOP;
 +                      p[foc->len + 1] = TCPOPT_NOP;
                }
 -              ptr += (foc->len + 3) >> 2;
 +              ptr += (len + 3) >> 2;
        }
  }
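
The rewritten Fast Open block above emits the cookie either as the RFC 7413
option (kind TCPOPT_FASTOPEN, 2-byte base) or as the older experimental option
(kind TCPOPT_EXP plus a 2-byte magic, 4-byte base), then rounds the option out
to a 32-bit boundary. With the even cookie sizes used in practice, len & 3 is
either 0 or 2, so the padding is nothing or exactly two NOPs. A standalone
illustration of the arithmetic (plain C, values invented for the example):

#include <stdio.h>

#define TCPOLEN_FASTOPEN_BASE     2U   /* kind + len */
#define TCPOLEN_EXP_FASTOPEN_BASE 4U   /* kind + len + 2-byte magic */

int main(void)
{
        /* An 8-byte cookie in RFC 7413 format: 2 + 8 = 10 bytes,
         * 10 & 3 == 2, so two NOPs pad the option to 12 bytes.
         */
        unsigned int cookie_len = 8;
        unsigned int len = TCPOLEN_FASTOPEN_BASE + cookie_len;

        printf("option length %u, NOP pad %u, 32-bit words %u\n",
               len, (len & 3) == 2 ? 2U : 0U, (len + 3) >> 2);
        return 0;
}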
  
@@@ -574,7 -565,7 +574,7 @@@ static unsigned int tcp_syn_options(str
        opts->mss = tcp_advertise_mss(sk);
        remaining -= TCPOLEN_MSS_ALIGNED;
  
 -      if (likely(sysctl_tcp_timestamps && *md5 == NULL)) {
 +      if (likely(sysctl_tcp_timestamps && !*md5)) {
                opts->options |= OPTION_TS;
                opts->tsval = tcp_skb_timestamp(skb) + tp->tsoffset;
                opts->tsecr = tp->rx_opt.ts_recent;
        }
  
        if (fastopen && fastopen->cookie.len >= 0) {
 -              u32 need = TCPOLEN_EXP_FASTOPEN_BASE + fastopen->cookie.len;
 +              u32 need = fastopen->cookie.len;
 +
 +              need += fastopen->cookie.exp ? TCPOLEN_EXP_FASTOPEN_BASE :
 +                                             TCPOLEN_FASTOPEN_BASE;
                need = (need + 3) & ~3U;  /* Align to 32 bits */
                if (remaining >= need) {
                        opts->options |= OPTION_FAST_OPEN_COOKIE;
                        opts->fastopen_cookie = &fastopen->cookie;
                        remaining -= need;
                        tp->syn_fastopen = 1;
 +                      tp->syn_fastopen_exp = fastopen->cookie.exp ? 1 : 0;
                }
        }
  
@@@ -614,14 -601,15 +614,14 @@@ static unsigned int tcp_synack_options(
                                   struct request_sock *req,
                                   unsigned int mss, struct sk_buff *skb,
                                   struct tcp_out_options *opts,
 -                                 struct tcp_md5sig_key **md5,
 +                                 const struct tcp_md5sig_key *md5,
                                   struct tcp_fastopen_cookie *foc)
  {
        struct inet_request_sock *ireq = inet_rsk(req);
        unsigned int remaining = MAX_TCP_OPTION_SPACE;
  
  #ifdef CONFIG_TCP_MD5SIG
 -      *md5 = tcp_rsk(req)->af_specific->md5_lookup(sk, req);
 -      if (*md5) {
 +      if (md5) {
                opts->options |= OPTION_MD5;
                remaining -= TCPOLEN_MD5SIG_ALIGNED;
  
                 */
                ireq->tstamp_ok &= !ireq->sack_ok;
        }
 -#else
 -      *md5 = NULL;
  #endif
  
        /* We always send an MSS option. */
                        remaining -= TCPOLEN_SACKPERM_ALIGNED;
        }
        if (foc != NULL && foc->len >= 0) {
 -              u32 need = TCPOLEN_EXP_FASTOPEN_BASE + foc->len;
 +              u32 need = foc->len;
 +
 +              need += foc->exp ? TCPOLEN_EXP_FASTOPEN_BASE :
 +                                 TCPOLEN_FASTOPEN_BASE;
                need = (need + 3) & ~3U;  /* Align to 32 bits */
                if (remaining >= need) {
                        opts->options |= OPTION_FAST_OPEN_COOKIE;
@@@ -1002,7 -989,7 +1002,7 @@@ static int tcp_transmit_skb(struct soc
        if (md5) {
                sk_nocaps_add(sk, NETIF_F_GSO_MASK);
                tp->af_specific->calc_md5_hash(opts.hash_location,
 -                                             md5, sk, NULL, skb);
 +                                             md5, sk, skb);
        }
  #endif
  
@@@ -1164,7 -1151,7 +1164,7 @@@ int tcp_fragment(struct sock *sk, struc
  
        /* Get a new skb... force flag on. */
        buff = sk_stream_alloc_skb(sk, nsize, gfp);
 -      if (buff == NULL)
 +      if (!buff)
                return -ENOMEM; /* We'll just try again later. */
  
        sk->sk_wmem_queued += buff->truesize;
@@@ -1367,8 -1354,6 +1367,8 @@@ void tcp_mtup_init(struct sock *sk
                               icsk->icsk_af_ops->net_header_len;
        icsk->icsk_mtup.search_low = tcp_mss_to_mtu(sk, net->ipv4.sysctl_tcp_base_mss);
        icsk->icsk_mtup.probe_size = 0;
 +      if (icsk->icsk_mtup.enabled)
 +              icsk->icsk_mtup.probe_timestamp = tcp_time_stamp;
  }
  EXPORT_SYMBOL(tcp_mtup_init);
  
@@@ -1723,7 -1708,7 +1723,7 @@@ static int tso_fragment(struct sock *sk
                return tcp_fragment(sk, skb, len, mss_now, gfp);
  
        buff = sk_stream_alloc_skb(sk, 0, gfp);
 -      if (unlikely(buff == NULL))
 +      if (unlikely(!buff))
                return -ENOMEM;
  
        sk->sk_wmem_queued += buff->truesize;
  static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb,
                                 bool *is_cwnd_limited, u32 max_segs)
  {
 -      struct tcp_sock *tp = tcp_sk(sk);
        const struct inet_connection_sock *icsk = inet_csk(sk);
 -      u32 send_win, cong_win, limit, in_flight;
 +      u32 age, send_win, cong_win, limit, in_flight;
 +      struct tcp_sock *tp = tcp_sk(sk);
 +      struct skb_mstamp now;
 +      struct sk_buff *head;
        int win_divisor;
  
        if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
                goto send_now;
  
 -      if (icsk->icsk_ca_state != TCP_CA_Open)
 +      if (!((1 << icsk->icsk_ca_state) & (TCPF_CA_Open | TCPF_CA_CWR)))
                goto send_now;
  
 -      /* Defer for less than two clock ticks. */
 -      if (tp->tso_deferred &&
 -          (((u32)jiffies << 1) >> 1) - (tp->tso_deferred >> 1) > 1)
 +      /* Avoid bursty behavior by allowing deferral
 +       * only if the last write was recent.
 +       */
 +      if ((s32)(tcp_time_stamp - tp->lsndtime) > 0)
                goto send_now;
  
        in_flight = tcp_packets_in_flight(tp);
                        goto send_now;
        }
  
 -      /* Ok, it looks like it is advisable to defer.
 -       * Do not rearm the timer if already set to not break TCP ACK clocking.
 -       */
 -      if (!tp->tso_deferred)
 -              tp->tso_deferred = 1 | (jiffies << 1);
 +      head = tcp_write_queue_head(sk);
 +      skb_mstamp_get(&now);
 +      age = skb_mstamp_us_delta(&now, &head->skb_mstamp);
 +      /* If next ACK is likely to come too late (half srtt), do not defer */
 +      if (age < (tp->srtt_us >> 4))
 +              goto send_now;
 +
 +      /* Ok, it looks like it is advisable to defer. */
  
        if (cong_win < send_win && cong_win < skb->len)
                *is_cwnd_limited = true;
        return true;
  
  send_now:
 -      tp->tso_deferred = 0;
        return false;
  }
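
The new deferral gate above keys off the age of the oldest in-flight skb.
tp->srtt_us stores the smoothed RTT in microseconds left-shifted by 3, so
srtt_us >> 4 is half the RTT: if the head skb went out less than srtt/2 ago,
its ACK is still roughly a full RTT away, too long to wait for, so the stack
sends immediately instead of deferring. A toy check of that arithmetic (plain
C, made-up numbers; the <<3 storage convention is the kernel's):

#include <stdio.h>

int main(void)
{
        unsigned int srtt_us = 40000u << 3;  /* 40 ms smoothed RTT, stored <<3 */
        unsigned int age_us  = 15000;        /* head skb sent 15 ms ago */

        /* srtt_us >> 4  ==  (srtt << 3) >> 4  ==  srtt / 2 */
        if (age_us < (srtt_us >> 4))
                printf("ACK for the head skb is still far off: send now\n");
        else
                printf("an ACK should arrive soon: deferring is safe\n");
        return 0;
}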
  
 +static inline void tcp_mtu_check_reprobe(struct sock *sk)
 +{
 +      struct inet_connection_sock *icsk = inet_csk(sk);
 +      struct tcp_sock *tp = tcp_sk(sk);
 +      struct net *net = sock_net(sk);
 +      u32 interval;
 +      s32 delta;
 +
 +      interval = net->ipv4.sysctl_tcp_probe_interval;
 +      delta = tcp_time_stamp - icsk->icsk_mtup.probe_timestamp;
 +      if (unlikely(delta >= interval * HZ)) {
 +              int mss = tcp_current_mss(sk);
 +
 +              /* Update current search range */
 +              icsk->icsk_mtup.probe_size = 0;
 +              icsk->icsk_mtup.search_high = tp->rx_opt.mss_clamp +
 +                      sizeof(struct tcphdr) +
 +                      icsk->icsk_af_ops->net_header_len;
 +              icsk->icsk_mtup.search_low = tcp_mss_to_mtu(sk, mss);
 +
 +              /* Update probe time stamp */
 +              icsk->icsk_mtup.probe_timestamp = tcp_time_stamp;
 +      }
 +}
 +
  /* Create a new MTU probe if we are ready.
   * MTU probe is regularly attempting to increase the path MTU by
   * deliberately sending larger packets.  This discovers routing
@@@ -1882,13 -1837,11 +1882,13 @@@ static int tcp_mtu_probe(struct sock *s
        struct tcp_sock *tp = tcp_sk(sk);
        struct inet_connection_sock *icsk = inet_csk(sk);
        struct sk_buff *skb, *nskb, *next;
 +      struct net *net = sock_net(sk);
        int len;
        int probe_size;
        int size_needed;
        int copy;
        int mss_now;
 +      int interval;
  
        /* Not currently probing/verifying,
         * not in recovery,
            tp->rx_opt.num_sacks || tp->rx_opt.dsack)
                return -1;
  
 -      /* Very simple search strategy: just double the MSS. */
 +      /* Use binary search for probe_size between tcp_base_mss
 +       * and the current mss_clamp. If (search_high - search_low) is
 +       * smaller than a threshold, back off from probing.
 +       */
        mss_now = tcp_current_mss(sk);
 -      probe_size = 2 * tp->mss_cache;
 +      probe_size = tcp_mtu_to_mss(sk, (icsk->icsk_mtup.search_high +
 +                                  icsk->icsk_mtup.search_low) >> 1);
        size_needed = probe_size + (tp->reordering + 1) * tp->mss_cache;
 -      if (probe_size > tcp_mtu_to_mss(sk, icsk->icsk_mtup.search_high)) {
 -              /* TODO: set timer for probe_converge_event */
 +      interval = icsk->icsk_mtup.search_high - icsk->icsk_mtup.search_low;
 +      /* When misfortune strikes (we are actively reprobing and the
 +       * reprobe timer has expired), stick with the current probing
 +       * process by not resetting the search range to its original value.
 +       */
 +      if (probe_size > tcp_mtu_to_mss(sk, icsk->icsk_mtup.search_high) ||
 +              interval < net->ipv4.sysctl_tcp_probe_threshold) {
 +              /* Check whether enough time has elapsed for
 +               * another round of probing.
 +               */
 +              tcp_mtu_check_reprobe(sk);
                return -1;
        }
  
        }
  
        /* We're allowed to probe.  Build it now. */
 -      if ((nskb = sk_stream_alloc_skb(sk, probe_size, GFP_ATOMIC)) == NULL)
 +      nskb = sk_stream_alloc_skb(sk, probe_size, GFP_ATOMIC);
 +      if (!nskb)
                return -1;
        sk->sk_wmem_queued += nskb->truesize;
        sk_mem_charge(sk, nskb->truesize);
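
Probe sizing above replaces "just double the MSS" with a binary search over
the [search_low, search_high] MTU window; once the window is narrower than
sysctl_tcp_probe_threshold the search stops, and tcp_mtu_check_reprobe() may
later reset it after sysctl_tcp_probe_interval seconds. The search step in
isolation (plain C sketch with invented numbers; mtu_to_mss() stands in for
tcp_mtu_to_mss(), and every probe is assumed to succeed so the lower bound
always rises):

#include <stdio.h>

/* Stand-in for tcp_mtu_to_mss(): strip IPv4 + TCP header overhead */
static int mtu_to_mss(int mtu)
{
        return mtu - 20 /* iphdr */ - 20 /* tcphdr */;
}

int main(void)
{
        int search_low = 1024, search_high = 1500, threshold = 160;

        while (search_high - search_low >= threshold) {
                int mid = (search_high + search_low) >> 1;

                printf("probe with mss %d (mtu %d)\n", mtu_to_mss(mid), mid);
                /* The kernel raises search_low on a successful probe and
                 * lowers search_high on loss; assume success here.
                 */
                search_low = mid;
        }
        printf("window %d below threshold: stop probing\n",
               search_high - search_low);
        return 0;
}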
@@@ -2240,7 -2179,7 +2240,7 @@@ void tcp_send_loss_probe(struct sock *s
        int mss = tcp_current_mss(sk);
        int err = -1;
  
 -      if (tcp_send_head(sk) != NULL) {
 +      if (tcp_send_head(sk)) {
                err = tcp_write_xmit(sk, mss, TCP_NAGLE_OFF, 2, GFP_ATOMIC);
                goto rearm_timer;
        }
@@@ -2750,7 -2689,7 +2750,7 @@@ void tcp_xmit_retransmit_queue(struct s
                if (skb == tcp_send_head(sk))
                        break;
                /* we could do better than to assign each time */
 -              if (hole == NULL)
 +              if (!hole)
                        tp->retransmit_skb_hint = skb;
  
                /* Assume this retransmit will generate
@@@ -2774,7 -2713,7 +2774,7 @@@ begin_fwd
                        if (!tcp_can_forward_retransmit(sk))
                                break;
                        /* Backtrack if necessary to non-L'ed skb */
 -                      if (hole != NULL) {
 +                      if (hole) {
                                skb = hole;
                                hole = NULL;
                        }
                        goto begin_fwd;
  
                } else if (!(sacked & TCPCB_LOST)) {
 -                      if (hole == NULL && !(sacked & (TCPCB_SACKED_RETRANS|TCPCB_SACKED_ACKED)))
 +                      if (!hole && !(sacked & (TCPCB_SACKED_RETRANS|TCPCB_SACKED_ACKED)))
                                hole = skb;
                        continue;
  
@@@ -2827,7 -2766,7 +2827,7 @@@ void tcp_send_fin(struct sock *sk
         */
        mss_now = tcp_current_mss(sk);
  
 -      if (tcp_send_head(sk) != NULL) {
 +      if (tcp_send_head(sk)) {
                TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_FIN;
                TCP_SKB_CB(skb)->end_seq++;
                tp->write_seq++;
@@@ -2885,14 -2824,14 +2885,14 @@@ int tcp_send_synack(struct sock *sk
        struct sk_buff *skb;
  
        skb = tcp_write_queue_head(sk);
 -      if (skb == NULL || !(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)) {
 +      if (!skb || !(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)) {
                pr_debug("%s: wrong queue state\n", __func__);
                return -EFAULT;
        }
        if (!(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_ACK)) {
                if (skb_cloned(skb)) {
                        struct sk_buff *nskb = skb_copy(skb, GFP_ATOMIC);
 -                      if (nskb == NULL)
 +                      if (!nskb)
                                return -ENOMEM;
                        tcp_unlink_write_queue(skb, sk);
                        __skb_header_release(nskb);
@@@ -2927,7 -2866,7 +2927,7 @@@ struct sk_buff *tcp_make_synack(struct 
        struct tcp_sock *tp = tcp_sk(sk);
        struct tcphdr *th;
        struct sk_buff *skb;
 -      struct tcp_md5sig_key *md5;
 +      struct tcp_md5sig_key *md5 = NULL;
        int tcp_header_size;
        int mss;
  
        skb_reserve(skb, MAX_TCP_HEADER);
  
        skb_dst_set(skb, dst);
 -      security_skb_owned_by(skb, sk);
  
        mss = dst_metric_advmss(dst);
        if (tp->rx_opt.user_mss && tp->rx_opt.user_mss < mss)
        else
  #endif
        skb_mstamp_get(&skb->skb_mstamp);
 -      tcp_header_size = tcp_synack_options(sk, req, mss, skb, &opts, &md5,
 +
 +#ifdef CONFIG_TCP_MD5SIG
 +      rcu_read_lock();
 +      md5 = tcp_rsk(req)->af_specific->req_md5_lookup(sk, req_to_sk(req));
 +#endif
 +      tcp_header_size = tcp_synack_options(sk, req, mss, skb, &opts, md5,
                                             foc) + sizeof(*th);
  
        skb_push(skb, tcp_header_size);
  
  #ifdef CONFIG_TCP_MD5SIG
        /* Okay, we have all we need - do the md5 hash if needed */
 -      if (md5) {
 +      if (md5)
                tcp_rsk(req)->af_specific->calc_md5_hash(opts.hash_location,
 -                                             md5, NULL, req, skb);
 -      }
 +                                             md5, req_to_sk(req), skb);
 +      rcu_read_unlock();
  #endif
  
+       /* Do not fool tcpdump (if any); clean up our debris */
+       skb->tstamp.tv64 = 0;
        return skb;
  }
  EXPORT_SYMBOL(tcp_make_synack);
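
tcp_make_synack() now does the MD5 key lookup itself via req_md5_lookup() and
holds rcu_read_lock() across both option sizing and the hash computation, so
the key cannot be freed while in use; tcp_synack_options() becomes a pure
consumer of the already-looked-up key. The shape of that read-side critical
section, reduced to its outline (illustrative fragment, not the exact TCP
code; lookup_key_rcu() and the other helpers are hypothetical stand-ins):

        rcu_read_lock();
        key = lookup_key_rcu(sk);       /* may return NULL */
        size_options(opts, key);        /* key only dereferenced while locked */
        if (key)
                compute_hash(hash_location, key, skb);
        rcu_read_unlock();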
@@@ -3031,7 -2968,7 +3033,7 @@@ static void tcp_connect_init(struct soc
                (sysctl_tcp_timestamps ? TCPOLEN_TSTAMP_ALIGNED : 0);
  
  #ifdef CONFIG_TCP_MD5SIG
 -      if (tp->af_specific->md5_lookup(sk, sk) != NULL)
 +      if (tp->af_specific->md5_lookup(sk, sk))
                tp->tcp_header_len += TCPOLEN_MD5SIG_ALIGNED;
  #endif
  
@@@ -3317,7 -3254,7 +3319,7 @@@ void tcp_send_ack(struct sock *sk
         * sock.
         */
        buff = alloc_skb(MAX_TCP_HEADER, sk_gfp_atomic(sk, GFP_ATOMIC));
 -      if (buff == NULL) {
 +      if (!buff) {
                inet_csk_schedule_ack(sk);
                inet_csk(sk)->icsk_ack.ato = TCP_ATO_MIN;
                inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK,
@@@ -3361,7 -3298,7 +3363,7 @@@ static int tcp_xmit_probe_skb(struct so
  
        /* We don't queue it, tcp_transmit_skb() sets ownership. */
        skb = alloc_skb(MAX_TCP_HEADER, sk_gfp_atomic(sk, GFP_ATOMIC));
 -      if (skb == NULL)
 +      if (!skb)
                return -1;
  
        /* Reserve space for headers and set control bits. */
@@@ -3392,8 -3329,8 +3394,8 @@@ int tcp_write_wakeup(struct sock *sk
        if (sk->sk_state == TCP_CLOSE)
                return -1;
  
 -      if ((skb = tcp_send_head(sk)) != NULL &&
 -          before(TCP_SKB_CB(skb)->seq, tcp_wnd_end(tp))) {
 +      skb = tcp_send_head(sk);
 +      if (skb && before(TCP_SKB_CB(skb)->seq, tcp_wnd_end(tp))) {
                int err;
                unsigned int mss = tcp_current_mss(sk);
                unsigned int seg_size = tcp_wnd_end(tp) - TCP_SKB_CB(skb)->seq;
diff --combined net/ipv6/ip6_vti.c
index b53148444e157f821c86b467b166fc9ce7bd5ccb,a4ac85052e44e9ce2091ecd4ab5df2f9cbde2512..ed9d681207fa340881fd100db0ea1cb3eb9a2ffb
@@@ -218,7 -218,7 +218,7 @@@ static struct ip6_tnl *vti6_tnl_create(
                sprintf(name, "ip6_vti%%d");
  
        dev = alloc_netdev(sizeof(*t), name, NET_NAME_UNKNOWN, vti6_dev_setup);
 -      if (dev == NULL)
 +      if (!dev)
                goto failed;
  
        dev_net_set(dev, net);
@@@ -288,8 -288,7 +288,7 @@@ static struct ip6_tnl *vti6_locate(stru
  static void vti6_dev_uninit(struct net_device *dev)
  {
        struct ip6_tnl *t = netdev_priv(dev);
-       struct net *net = dev_net(dev);
-       struct vti6_net *ip6n = net_generic(net, vti6_net_id);
+       struct vti6_net *ip6n = net_generic(t->net, vti6_net_id);
  
        if (dev == ip6n->fb_tnl_dev)
                RCU_INIT_POINTER(ip6n->tnls_wc[0], NULL);
@@@ -305,7 -304,7 +304,7 @@@ static int vti6_rcv(struct sk_buff *skb
  
        rcu_read_lock();
        t = vti6_tnl_lookup(dev_net(skb->dev), &ipv6h->saddr, &ipv6h->daddr);
 -      if (t != NULL) {
 +      if (t) {
                if (t->parms.proto != IPPROTO_IPV6 && t->parms.proto != 0) {
                        rcu_read_unlock();
                        goto discard;
@@@ -601,6 -600,8 +600,6 @@@ static void vti6_link_config(struct ip6
                dev->flags |= IFF_POINTOPOINT;
        else
                dev->flags &= ~IFF_POINTOPOINT;
 -
 -      dev->iflink = p->link;
  }
  
  /**
@@@ -714,7 -715,7 +713,7 @@@ vti6_ioctl(struct net_device *dev, stru
                } else {
                        memset(&p, 0, sizeof(p));
                }
 -              if (t == NULL)
 +              if (!t)
                        t = netdev_priv(dev);
                vti6_parm_to_user(&p, &t->parms);
                if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p)))
                vti6_parm_from_user(&p1, &p);
                t = vti6_locate(net, &p1, cmd == SIOCADDTUNNEL);
                if (dev != ip6n->fb_tnl_dev && cmd == SIOCCHGTUNNEL) {
 -                      if (t != NULL) {
 +                      if (t) {
                                if (t->dev != dev) {
                                        err = -EEXIST;
                                        break;
                        err = -ENOENT;
                        vti6_parm_from_user(&p1, &p);
                        t = vti6_locate(net, &p1, 0);
 -                      if (t == NULL)
 +                      if (!t)
                                break;
                        err = -EPERM;
                        if (t->dev == ip6n->fb_tnl_dev)
@@@ -806,7 -807,6 +805,7 @@@ static const struct net_device_ops vti6
        .ndo_do_ioctl   = vti6_ioctl,
        .ndo_change_mtu = vti6_change_mtu,
        .ndo_get_stats64 = ip_tunnel_get_stats64,
 +      .ndo_get_iflink = ip6_tnl_get_iflink,
  };
  
  /**
@@@ -896,10 -896,12 +895,10 @@@ static void vti6_netlink_parms(struct n
                parms->link = nla_get_u32(data[IFLA_VTI_LINK]);
  
        if (data[IFLA_VTI_LOCAL])
 -              nla_memcpy(&parms->laddr, data[IFLA_VTI_LOCAL],
 -                         sizeof(struct in6_addr));
 +              parms->laddr = nla_get_in6_addr(data[IFLA_VTI_LOCAL]);
  
        if (data[IFLA_VTI_REMOTE])
 -              nla_memcpy(&parms->raddr, data[IFLA_VTI_REMOTE],
 -                         sizeof(struct in6_addr));
 +              parms->raddr = nla_get_in6_addr(data[IFLA_VTI_REMOTE]);
  
        if (data[IFLA_VTI_IKEY])
                parms->i_key = nla_get_be32(data[IFLA_VTI_IKEY]);
@@@ -980,8 -982,10 +979,8 @@@ static int vti6_fill_info(struct sk_buf
        struct __ip6_tnl_parm *parm = &tunnel->parms;
  
        if (nla_put_u32(skb, IFLA_VTI_LINK, parm->link) ||
 -          nla_put(skb, IFLA_VTI_LOCAL, sizeof(struct in6_addr),
 -                  &parm->laddr) ||
 -          nla_put(skb, IFLA_VTI_REMOTE, sizeof(struct in6_addr),
 -                  &parm->raddr) ||
 +          nla_put_in6_addr(skb, IFLA_VTI_LOCAL, &parm->laddr) ||
 +          nla_put_in6_addr(skb, IFLA_VTI_REMOTE, &parm->raddr) ||
            nla_put_be32(skb, IFLA_VTI_IKEY, parm->i_key) ||
            nla_put_be32(skb, IFLA_VTI_OKEY, parm->o_key))
                goto nla_put_failure;
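
The netlink attribute hunks above swap open-coded 16-byte nla_memcpy() and
nla_put() calls for the typed nla_get_in6_addr()/nla_put_in6_addr() helpers,
which carry the same wire format but document intent and sizes at the call
site. The helpers are, in essence, thin wrappers (a sketch of their likely
shape, not copied from the tree):

static inline struct in6_addr nla_get_in6_addr(const struct nlattr *nla)
{
        struct in6_addr tmp;

        nla_memcpy(&tmp, nla, sizeof(tmp));
        return tmp;
}

static inline int nla_put_in6_addr(struct sk_buff *skb, int attrtype,
                                   const struct in6_addr *addr)
{
        return nla_put(skb, attrtype, sizeof(*addr), addr);
}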
@@@ -1022,7 -1026,7 +1021,7 @@@ static void __net_exit vti6_destroy_tun
  
        for (h = 0; h < HASH_SIZE; h++) {
                t = rtnl_dereference(ip6n->tnls_r_l[h]);
 -              while (t != NULL) {
 +              while (t) {
                        unregister_netdevice_queue(t->dev, &list);
                        t = rtnl_dereference(t->next);
                }
diff --combined net/rds/rds.h
index c3f2855c3d8432272f7899608513a499558d9ad8,02d8fd5b40c08336bfd115ccdbc782357ca262c1..0d41155a2258cbbd16e19171c3daa376e3a83877
@@@ -110,6 -110,7 +110,7 @@@ struct rds_connection 
        void                    *c_transport_data;
  
        atomic_t                c_state;
+       unsigned long           c_send_gen;
        unsigned long           c_flags;
        unsigned long           c_reconnect_jiffies;
        struct delayed_work     c_send_w;
@@@ -702,8 -703,8 +703,8 @@@ void rds_inc_init(struct rds_incoming *
  void rds_inc_put(struct rds_incoming *inc);
  void rds_recv_incoming(struct rds_connection *conn, __be32 saddr, __be32 daddr,
                       struct rds_incoming *inc, gfp_t gfp);
 -int rds_recvmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg,
 -              size_t size, int msg_flags);
 +int rds_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
 +              int msg_flags);
  void rds_clear_recv_queue(struct rds_sock *rs);
  int rds_notify_queue_get(struct rds_sock *rs, struct msghdr *msg);
  void rds_inc_info_copy(struct rds_incoming *inc,
                       __be32 saddr, __be32 daddr, int flip);
  
  /* send.c */
 -int rds_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg,
 -              size_t payload_len);
 +int rds_sendmsg(struct socket *sock, struct msghdr *msg, size_t payload_len);
  void rds_send_reset(struct rds_connection *conn);
  int rds_send_xmit(struct rds_connection *conn);
  struct sockaddr_in;
diff --combined net/rds/send.c
index 44672befc0ee29a3e04ca01768c087fd0abd2f36,49f77efd82b9783260cda91a58b5b62ea9e98d1c..e9430f537f9c2bb23bbaeeb66933e1e85058bd34
@@@ -140,8 -140,11 +140,11 @@@ int rds_send_xmit(struct rds_connectio
        struct scatterlist *sg;
        int ret = 0;
        LIST_HEAD(to_be_dropped);
+       int batch_count;
+       unsigned long send_gen = 0;
  
  restart:
+       batch_count = 0;
  
        /*
         * sendmsg calls here after having queued its message on the send
                goto out;
        }
  
+       /*
+        * We record the send generation after doing the xmit acquire.
+        * If someone else manages to jump in and do some work, we'll use
+        * this to avoid a goto restart farther down.
+        *
+        * The acquire_in_xmit() check above ensures that only one
+        * caller can increment c_send_gen at any time.
+        */
+       conn->c_send_gen++;
+       send_gen = conn->c_send_gen;
        /*
         * rds_conn_shutdown() sets the conn state and then tests RDS_IN_XMIT,
         * we do the opposite to avoid races.
                if (!rm) {
                        unsigned int len;
  
+                       batch_count++;
+                       /* We want to process as big a batch as we can, but
+                        * we also want to avoid softlockups.  If we've been
+                        * through a lot of messages, let's back off and see
+                        * if anyone else jumps in.
+                        */
+                       if (batch_count >= 1024)
+                               goto over_batch;
                        spin_lock_irqsave(&conn->c_lock, flags);
  
                        if (!list_empty(&conn->c_send_queue)) {
                }
        }
  
+ over_batch:
        if (conn->c_trans->xmit_complete)
                conn->c_trans->xmit_complete(conn);
        release_in_xmit(conn);
  
        /* Nuke any messages we decided not to retransmit. */
         * If the transport cannot continue (i.e ret != 0), then it must
         * call us when more room is available, such as from the tx
         * completion handler.
+        *
+        * We have an extra generation check here so that if someone manages
+        * to jump in after our release_in_xmit, we'll see that they have done
+        * some work and we will skip our goto restart.
         */
        if (ret == 0) {
                smp_mb();
-               if (!list_empty(&conn->c_send_queue)) {
+               if (!list_empty(&conn->c_send_queue) &&
+                   send_gen == conn->c_send_gen) {
                        rds_stats_inc(s_send_lock_queue_raced);
                        goto restart;
                }
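
Two cooperating mechanisms appear in rds_send_xmit() above: batch_count caps a
single pass at 1024 messages to avoid softlockups, and c_send_gen lets a caller
that just dropped the xmit lock tell whether another CPU got in behind it. Only
the lock holder increments the generation, so if the snapshot taken at acquire
time still matches after release_in_xmit() and the queue is non-empty, nobody
else is draining it and this caller must loop. A userland toy of the generation
handoff (single-threaded, control flow only; names loosely mirror the RDS
ones):

#include <stdio.h>
#include <stdbool.h>

struct conn {
        bool          in_xmit;   /* models the RDS_IN_XMIT bit */
        unsigned long send_gen;
        int           queued;
};

static bool acquire_in_xmit(struct conn *c)
{
        if (c->in_xmit)
                return false;
        c->in_xmit = true;
        return true;
}

static int xmit(struct conn *c)
{
        unsigned long send_gen;

restart:
        if (!acquire_in_xmit(c))
                return -1;      /* someone else is transmitting */

        /* Only the lock holder bumps the generation, so this snapshot
         * uniquely identifies the current pass.
         */
        send_gen = ++c->send_gen;

        while (c->queued > 0)   /* drain the queue */
                c->queued--;

        c->in_xmit = false;     /* release_in_xmit() */

        /* New work queued and no newer pass started? Then it is on us. */
        if (c->queued > 0 && send_gen == c->send_gen)
                goto restart;
        return 0;
}

int main(void)
{
        struct conn c = { .queued = 3 };

        xmit(&c);
        printf("gen=%lu queued=%d\n", c.send_gen, c.queued);
        return 0;
}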
@@@ -920,7 -949,8 +949,7 @@@ static int rds_cmsg_send(struct rds_soc
        return ret;
  }
  
 -int rds_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg,
 -              size_t payload_len)
 +int rds_sendmsg(struct socket *sock, struct msghdr *msg, size_t payload_len)
  {
        struct sock *sk = sock->sk;
        struct rds_sock *rs = rds_sk_to_rs(sk);