MODULE_AUTHOR("Emulex Corporation");
MODULE_LICENSE("GPL");
+/* num_vfs module param is obsolete.
+ * Use sysfs method to enable/disable VFs.
+ */
static unsigned int num_vfs;
module_param(num_vfs, uint, S_IRUGO);
MODULE_PARM_DESC(num_vfs, "Number of PCI VFs to initialize");
ip_hdr(skb)->protocol : ipv6_hdr(skb)->nexthdr;
}
-static void wrb_fill_hdr(struct be_adapter *adapter, struct be_eth_hdr_wrb *hdr,
- struct sk_buff *skb, u32 wrb_cnt, u32 len,
- bool skip_hw_vlan)
+static inline bool be_is_txq_full(struct be_tx_obj *txo)
{
- u16 vlan_tag, proto;
+ return atomic_read(&txo->q.used) + BE_MAX_TX_FRAG_COUNT >= txo->q.len;
+}
- memset(hdr, 0, sizeof(*hdr));
+static inline bool be_can_txq_wake(struct be_tx_obj *txo)
+{
+ return atomic_read(&txo->q.used) < txo->q.len / 2;
+}
+
+static inline bool be_is_tx_compl_pending(struct be_tx_obj *txo)
+{
+ return atomic_read(&txo->q.used) > txo->pend_wrb_cnt;
+}
- SET_TX_WRB_HDR_BITS(crc, hdr, 1);
+static void be_get_wrb_params_from_skb(struct be_adapter *adapter,
+ struct sk_buff *skb,
+ struct be_wrb_params *wrb_params)
+{
+ u16 proto;
if (skb_is_gso(skb)) {
- SET_TX_WRB_HDR_BITS(lso, hdr, 1);
- SET_TX_WRB_HDR_BITS(lso_mss, hdr, skb_shinfo(skb)->gso_size);
+ BE_WRB_F_SET(wrb_params->features, LSO, 1);
+ wrb_params->lso_mss = skb_shinfo(skb)->gso_size;
if (skb_is_gso_v6(skb) && !lancer_chip(adapter))
- SET_TX_WRB_HDR_BITS(lso6, hdr, 1);
+ BE_WRB_F_SET(wrb_params->features, LSO6, 1);
} else if (skb->ip_summed == CHECKSUM_PARTIAL) {
if (skb->encapsulation) {
- SET_TX_WRB_HDR_BITS(ipcs, hdr, 1);
+ BE_WRB_F_SET(wrb_params->features, IPCS, 1);
proto = skb_inner_ip_proto(skb);
} else {
proto = skb_ip_proto(skb);
}
if (proto == IPPROTO_TCP)
- SET_TX_WRB_HDR_BITS(tcpcs, hdr, 1);
+ BE_WRB_F_SET(wrb_params->features, TCPCS, 1);
else if (proto == IPPROTO_UDP)
- SET_TX_WRB_HDR_BITS(udpcs, hdr, 1);
+ BE_WRB_F_SET(wrb_params->features, UDPCS, 1);
}
if (skb_vlan_tag_present(skb)) {
- SET_TX_WRB_HDR_BITS(vlan, hdr, 1);
- vlan_tag = be_get_tx_vlan_tag(adapter, skb);
- SET_TX_WRB_HDR_BITS(vlan_tag, hdr, vlan_tag);
+ BE_WRB_F_SET(wrb_params->features, VLAN, 1);
+ wrb_params->vlan_tag = be_get_tx_vlan_tag(adapter, skb);
}
- SET_TX_WRB_HDR_BITS(num_wrb, hdr, wrb_cnt);
- SET_TX_WRB_HDR_BITS(len, hdr, len);
+ BE_WRB_F_SET(wrb_params->features, CRC, 1);
+}
+
+static void wrb_fill_hdr(struct be_adapter *adapter,
+ struct be_eth_hdr_wrb *hdr,
+ struct be_wrb_params *wrb_params,
+ struct sk_buff *skb)
+{
+ memset(hdr, 0, sizeof(*hdr));
- /* Hack to skip HW VLAN tagging needs evt = 1, compl = 0
- * When this hack is not needed, the evt bit is set while ringing DB
+ SET_TX_WRB_HDR_BITS(crc, hdr,
+ BE_WRB_F_GET(wrb_params->features, CRC));
+ SET_TX_WRB_HDR_BITS(ipcs, hdr,
+ BE_WRB_F_GET(wrb_params->features, IPCS));
+ SET_TX_WRB_HDR_BITS(tcpcs, hdr,
+ BE_WRB_F_GET(wrb_params->features, TCPCS));
+ SET_TX_WRB_HDR_BITS(udpcs, hdr,
+ BE_WRB_F_GET(wrb_params->features, UDPCS));
+
+ SET_TX_WRB_HDR_BITS(lso, hdr,
+ BE_WRB_F_GET(wrb_params->features, LSO));
+ SET_TX_WRB_HDR_BITS(lso6, hdr,
+ BE_WRB_F_GET(wrb_params->features, LSO6));
+ SET_TX_WRB_HDR_BITS(lso_mss, hdr, wrb_params->lso_mss);
+
+ /* Hack to skip HW VLAN tagging needs evt = 1, compl = 0. When this
+ * hack is not needed, the evt bit is set while ringing DB.
*/
- if (skip_hw_vlan)
- SET_TX_WRB_HDR_BITS(event, hdr, 1);
+ SET_TX_WRB_HDR_BITS(event, hdr,
+ BE_WRB_F_GET(wrb_params->features, VLAN_SKIP_HW));
+ SET_TX_WRB_HDR_BITS(vlan, hdr,
+ BE_WRB_F_GET(wrb_params->features, VLAN));
+ SET_TX_WRB_HDR_BITS(vlan_tag, hdr, wrb_params->vlan_tag);
+
+ SET_TX_WRB_HDR_BITS(num_wrb, hdr, skb_wrb_cnt(skb));
+ SET_TX_WRB_HDR_BITS(len, hdr, skb->len);
}
static void unmap_tx_frag(struct device *dev, struct be_eth_wrb *wrb,
}
}
-/* Returns the number of WRBs used up by the skb */
+/* Grab a WRB header for xmit */
+static u16 be_tx_get_wrb_hdr(struct be_tx_obj *txo)
+{
+ u16 head = txo->q.head;
+
+ queue_head_inc(&txo->q);
+ return head;
+}
+
+/* Set up the WRB header for xmit */
+static void be_tx_setup_wrb_hdr(struct be_adapter *adapter,
+ struct be_tx_obj *txo,
+ struct be_wrb_params *wrb_params,
+ struct sk_buff *skb, u16 head)
+{
+ u32 num_frags = skb_wrb_cnt(skb);
+ struct be_queue_info *txq = &txo->q;
+ struct be_eth_hdr_wrb *hdr = queue_index_node(txq, head);
+
+ wrb_fill_hdr(adapter, hdr, wrb_params, skb);
+ be_dws_cpu_to_le(hdr, sizeof(*hdr));
+
+ BUG_ON(txo->sent_skb_list[head]);
+ txo->sent_skb_list[head] = skb;
+ txo->last_req_hdr = head;
+ atomic_add(num_frags, &txq->used);
+ txo->last_req_wrb_cnt = num_frags;
+ txo->pend_wrb_cnt += num_frags;
+}
+
+/* Setup a WRB fragment (buffer descriptor) for xmit */
+static void be_tx_setup_wrb_frag(struct be_tx_obj *txo, dma_addr_t busaddr,
+ int len)
+{
+ struct be_eth_wrb *wrb;
+ struct be_queue_info *txq = &txo->q;
+
+ wrb = queue_head_node(txq);
+ wrb_fill(wrb, busaddr, len);
+ queue_head_inc(txq);
+}
+
+/* Bring the queue back to the state it was in before be_xmit_enqueue() routine
+ * was invoked. The producer index is restored to the previous packet and the
+ * WRBs of the current packet are unmapped. Invoked to handle tx setup errors.
+ */
+static void be_xmit_restore(struct be_adapter *adapter,
+ struct be_tx_obj *txo, u16 head, bool map_single,
+ u32 copied)
+{
+ struct device *dev;
+ struct be_eth_wrb *wrb;
+ struct be_queue_info *txq = &txo->q;
+
+ dev = &adapter->pdev->dev;
+ txq->head = head;
+
+ /* skip the first wrb (hdr); it's not mapped */
+ queue_head_inc(txq);
+ while (copied) {
+ wrb = queue_head_node(txq);
+ unmap_tx_frag(dev, wrb, map_single);
+ map_single = false;
+ copied -= le32_to_cpu(wrb->frag_len);
+ queue_head_inc(txq);
+ }
+
+ txq->head = head;
+}
+
+/* Enqueue the given packet for transmit. This routine allocates WRBs for the
+ * packet, dma maps the packet buffers and sets up the WRBs. Returns the number
+ * of WRBs used up by the packet.
+ */
static u32 be_xmit_enqueue(struct be_adapter *adapter, struct be_tx_obj *txo,
- struct sk_buff *skb, bool skip_hw_vlan)
+ struct sk_buff *skb,
+ struct be_wrb_params *wrb_params)
{
u32 i, copied = 0, wrb_cnt = skb_wrb_cnt(skb);
struct device *dev = &adapter->pdev->dev;
struct be_queue_info *txq = &txo->q;
- struct be_eth_hdr_wrb *hdr;
bool map_single = false;
- struct be_eth_wrb *wrb;
- dma_addr_t busaddr;
u16 head = txq->head;
+ dma_addr_t busaddr;
+ int len;
- hdr = queue_head_node(txq);
- wrb_fill_hdr(adapter, hdr, skb, wrb_cnt, skb->len, skip_hw_vlan);
- be_dws_cpu_to_le(hdr, sizeof(*hdr));
-
- queue_head_inc(txq);
+ head = be_tx_get_wrb_hdr(txo);
if (skb->len > skb->data_len) {
- int len = skb_headlen(skb);
+ len = skb_headlen(skb);
busaddr = dma_map_single(dev, skb->data, len, DMA_TO_DEVICE);
if (dma_mapping_error(dev, busaddr))
goto dma_err;
map_single = true;
- wrb = queue_head_node(txq);
- wrb_fill(wrb, busaddr, len);
- queue_head_inc(txq);
+ be_tx_setup_wrb_frag(txo, busaddr, len);
copied += len;
}
for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
const struct skb_frag_struct *frag = &skb_shinfo(skb)->frags[i];
+ len = skb_frag_size(frag);
- busaddr = skb_frag_dma_map(dev, frag, 0,
- skb_frag_size(frag), DMA_TO_DEVICE);
+ busaddr = skb_frag_dma_map(dev, frag, 0, len, DMA_TO_DEVICE);
if (dma_mapping_error(dev, busaddr))
goto dma_err;
- wrb = queue_head_node(txq);
- wrb_fill(wrb, busaddr, skb_frag_size(frag));
- queue_head_inc(txq);
- copied += skb_frag_size(frag);
+ be_tx_setup_wrb_frag(txo, busaddr, len);
+ copied += len;
}
- BUG_ON(txo->sent_skb_list[head]);
- txo->sent_skb_list[head] = skb;
- txo->last_req_hdr = head;
- atomic_add(wrb_cnt, &txq->used);
- txo->last_req_wrb_cnt = wrb_cnt;
- txo->pend_wrb_cnt += wrb_cnt;
+ be_tx_setup_wrb_hdr(adapter, txo, wrb_params, skb, head);
be_tx_stats_update(txo, skb);
return wrb_cnt;
dma_err:
- /* Bring the queue back to the state it was in before this
- * routine was invoked.
- */
- txq->head = head;
- /* skip the first wrb (hdr); it's not mapped */
- queue_head_inc(txq);
- while (copied) {
- wrb = queue_head_node(txq);
- unmap_tx_frag(dev, wrb, map_single);
- map_single = false;
- copied -= le32_to_cpu(wrb->frag_len);
- adapter->drv_stats.dma_map_errors++;
- queue_head_inc(txq);
- }
- txq->head = head;
+ adapter->drv_stats.dma_map_errors++;
+ be_xmit_restore(adapter, txo, head, map_single, copied);
return 0;
}
static struct sk_buff *be_insert_vlan_in_pkt(struct be_adapter *adapter,
struct sk_buff *skb,
- bool *skip_hw_vlan)
+ struct be_wrb_params
+ *wrb_params)
{
u16 vlan_tag = 0;
/* f/w workaround to set skip_hw_vlan = 1, informs the F/W to
* skip VLAN insertion
*/
- if (skip_hw_vlan)
- *skip_hw_vlan = true;
+ BE_WRB_F_SET(wrb_params->features, VLAN_SKIP_HW, 1);
}
if (vlan_tag) {
vlan_tag);
if (unlikely(!skb))
return skb;
- if (skip_hw_vlan)
- *skip_hw_vlan = true;
+ BE_WRB_F_SET(wrb_params->features, VLAN_SKIP_HW, 1);
}
return skb;
static struct sk_buff *be_lancer_xmit_workarounds(struct be_adapter *adapter,
struct sk_buff *skb,
- bool *skip_hw_vlan)
+ struct be_wrb_params
+ *wrb_params)
{
struct vlan_ethhdr *veh = (struct vlan_ethhdr *)skb->data;
unsigned int eth_hdr_len;
*/
if (be_pvid_tagging_enabled(adapter) &&
veh->h_vlan_proto == htons(ETH_P_8021Q))
- *skip_hw_vlan = true;
+ BE_WRB_F_SET(wrb_params->features, VLAN_SKIP_HW, 1);
/* HW has a bug wherein it will calculate CSUM for VLAN
* pkts even though it is disabled.
*/
if (skb->ip_summed != CHECKSUM_PARTIAL &&
skb_vlan_tag_present(skb)) {
- skb = be_insert_vlan_in_pkt(adapter, skb, skip_hw_vlan);
+ skb = be_insert_vlan_in_pkt(adapter, skb, wrb_params);
if (unlikely(!skb))
goto err;
}
*/
if (be_ipv6_tx_stall_chk(adapter, skb) &&
be_vlan_tag_tx_chk(adapter, skb)) {
- skb = be_insert_vlan_in_pkt(adapter, skb, skip_hw_vlan);
+ skb = be_insert_vlan_in_pkt(adapter, skb, wrb_params);
if (unlikely(!skb))
goto err;
}
static struct sk_buff *be_xmit_workarounds(struct be_adapter *adapter,
struct sk_buff *skb,
- bool *skip_hw_vlan)
+ struct be_wrb_params *wrb_params)
{
/* Lancer, SH-R ASICs have a bug wherein Packets that are 32 bytes or
* less may cause a transmit stall on that port. So the work-around is
}
if (BEx_chip(adapter) || lancer_chip(adapter)) {
- skb = be_lancer_xmit_workarounds(adapter, skb, skip_hw_vlan);
+ skb = be_lancer_xmit_workarounds(adapter, skb, wrb_params);
if (!skb)
return NULL;
}
static netdev_tx_t be_xmit(struct sk_buff *skb, struct net_device *netdev)
{
- bool skip_hw_vlan = false, flush = !skb->xmit_more;
struct be_adapter *adapter = netdev_priv(netdev);
u16 q_idx = skb_get_queue_mapping(skb);
struct be_tx_obj *txo = &adapter->tx_obj[q_idx];
- struct be_queue_info *txq = &txo->q;
+ struct be_wrb_params wrb_params = { 0 };
+ bool flush = !skb->xmit_more;
u16 wrb_cnt;
- skb = be_xmit_workarounds(adapter, skb, &skip_hw_vlan);
+ skb = be_xmit_workarounds(adapter, skb, &wrb_params);
if (unlikely(!skb))
goto drop;
- wrb_cnt = be_xmit_enqueue(adapter, txo, skb, skip_hw_vlan);
+ be_get_wrb_params_from_skb(adapter, skb, &wrb_params);
+
+ wrb_cnt = be_xmit_enqueue(adapter, txo, skb, &wrb_params);
if (unlikely(!wrb_cnt)) {
dev_kfree_skb_any(skb);
goto drop;
}
- if ((atomic_read(&txq->used) + BE_MAX_TX_FRAG_COUNT) >= txq->len) {
+ if (be_is_txq_full(txo)) {
netif_stop_subqueue(netdev, q_idx);
tx_stats(txo)->tx_stops++;
}
if (rxo->rx_post_starved)
rxo->rx_post_starved = false;
do {
- notify = min(256u, posted);
+ notify = min(MAX_NUM_POST_ERX_DB, posted);
be_rxq_notify(adapter, rxq->id, notify);
posted -= notify;
} while (posted);
}
}
-static struct be_eth_tx_compl *be_tx_compl_get(struct be_queue_info *tx_cq)
+static struct be_tx_compl_info *be_tx_compl_get(struct be_tx_obj *txo)
{
- struct be_eth_tx_compl *txcp = queue_tail_node(tx_cq);
+ struct be_queue_info *tx_cq = &txo->cq;
+ struct be_tx_compl_info *txcp = &txo->txcp;
+ struct be_eth_tx_compl *compl = queue_tail_node(tx_cq);
- if (txcp->dw[offsetof(struct amap_eth_tx_compl, valid) / 32] == 0)
+ if (compl->dw[offsetof(struct amap_eth_tx_compl, valid) / 32] == 0)
return NULL;
+ /* Ensure load ordering of valid bit dword and other dwords below */
rmb();
- be_dws_le_to_cpu(txcp, sizeof(*txcp));
+ be_dws_le_to_cpu(compl, sizeof(*compl));
- txcp->dw[offsetof(struct amap_eth_tx_compl, valid) / 32] = 0;
+ txcp->status = GET_TX_COMPL_BITS(status, compl);
+ txcp->end_index = GET_TX_COMPL_BITS(wrb_index, compl);
+ compl->dw[offsetof(struct amap_eth_tx_compl, valid) / 32] = 0;
queue_tail_inc(tx_cq);
return txcp;
}
{
u16 end_idx, notified_idx, cmpl = 0, timeo = 0, num_wrbs = 0;
struct device *dev = &adapter->pdev->dev;
- struct be_tx_obj *txo;
+ struct be_tx_compl_info *txcp;
struct be_queue_info *txq;
- struct be_eth_tx_compl *txcp;
+ struct be_tx_obj *txo;
int i, pending_txqs;
/* Stop polling for compls when HW has been silent for 10ms */
cmpl = 0;
num_wrbs = 0;
txq = &txo->q;
- while ((txcp = be_tx_compl_get(&txo->cq))) {
- end_idx = GET_TX_COMPL_BITS(wrb_index, txcp);
- num_wrbs += be_tx_compl_process(adapter, txo,
- end_idx);
+ while ((txcp = be_tx_compl_get(txo))) {
+ num_wrbs +=
+ be_tx_compl_process(adapter, txo,
+ txcp->end_index);
cmpl++;
}
if (cmpl) {
atomic_sub(num_wrbs, &txq->used);
timeo = 0;
}
- if (atomic_read(&txq->used) == txo->pend_wrb_cnt)
+ if (!be_is_tx_compl_pending(txo))
pending_txqs--;
}
napi_hash_del(&eqo->napi);
netif_napi_del(&eqo->napi);
}
+ free_cpumask_var(eqo->affinity_mask);
be_queue_free(adapter, &eqo->q);
}
}
adapter->cfg_num_qs);
for_all_evt_queues(adapter, eqo, i) {
+ if (!zalloc_cpumask_var(&eqo->affinity_mask, GFP_KERNEL))
+ return -ENOMEM;
+ cpumask_set_cpu_local_first(i, dev_to_node(&adapter->pdev->dev),
+ eqo->affinity_mask);
+
netif_napi_add(adapter->netdev, &eqo->napi, be_poll,
BE_NAPI_WEIGHT);
napi_hash_add(&eqo->napi);
static int be_tx_qs_create(struct be_adapter *adapter)
{
- struct be_queue_info *cq, *eq;
+ struct be_queue_info *cq;
struct be_tx_obj *txo;
+ struct be_eq_obj *eqo;
int status, i;
adapter->num_tx_qs = min(adapter->num_evt_qs, be_max_txqs(adapter));
/* If num_evt_qs is less than num_tx_qs, then more than
* one txq share an eq
*/
- eq = &adapter->eq_obj[i % adapter->num_evt_qs].q;
- status = be_cmd_cq_create(adapter, cq, eq, false, 3);
+ eqo = &adapter->eq_obj[i % adapter->num_evt_qs];
+ status = be_cmd_cq_create(adapter, cq, &eqo->q, false, 3);
if (status)
return status;
status = be_cmd_txq_create(adapter, txo);
if (status)
return status;
+
+ netif_set_xps_queue(adapter->netdev, eqo->affinity_mask,
+ eqo->idx);
}
dev_info(&adapter->pdev->dev, "created %d TX queue(s)\n",
int rc, i;
/* We can create as many RSS rings as there are EQs. */
- adapter->num_rx_qs = adapter->num_evt_qs;
+ adapter->num_rss_qs = adapter->num_evt_qs;
+
+ /* We'll use RSS only if atleast 2 RSS rings are supported. */
+ if (adapter->num_rss_qs <= 1)
+ adapter->num_rss_qs = 0;
- /* We'll use RSS only if atleast 2 RSS rings are supported.
- * When RSS is used, we'll need a default RXQ for non-IP traffic.
+ adapter->num_rx_qs = adapter->num_rss_qs + adapter->need_def_rxq;
+
+ /* When the interface is not capable of RSS rings (and there is no
+ * need to create a default RXQ) we'll still need one RXQ
*/
- if (adapter->num_rx_qs > 1)
- adapter->num_rx_qs++;
+ if (adapter->num_rx_qs == 0)
+ adapter->num_rx_qs = 1;
adapter->big_page_size = (1 << get_order(rx_frag_size)) * PAGE_SIZE;
for_all_rx_queues(adapter, rxo, i) {
}
dev_info(&adapter->pdev->dev,
- "created %d RSS queue(s) and 1 default RX queue\n",
- adapter->num_rx_qs - 1);
+ "created %d RX queue(s)\n", adapter->num_rx_qs);
return 0;
}
return work_done;
}
-static inline void be_update_tx_err(struct be_tx_obj *txo, u32 status)
+static inline void be_update_tx_err(struct be_tx_obj *txo, u8 status)
{
switch (status) {
case BE_TX_COMP_HDR_PARSE_ERR:
}
}
-static inline void lancer_update_tx_err(struct be_tx_obj *txo, u32 status)
+static inline void lancer_update_tx_err(struct be_tx_obj *txo, u8 status)
{
switch (status) {
case LANCER_TX_COMP_LSO_ERR:
static void be_process_tx(struct be_adapter *adapter, struct be_tx_obj *txo,
int idx)
{
- struct be_eth_tx_compl *txcp;
int num_wrbs = 0, work_done = 0;
- u32 compl_status;
- u16 last_idx;
+ struct be_tx_compl_info *txcp;
- while ((txcp = be_tx_compl_get(&txo->cq))) {
- last_idx = GET_TX_COMPL_BITS(wrb_index, txcp);
- num_wrbs += be_tx_compl_process(adapter, txo, last_idx);
+ while ((txcp = be_tx_compl_get(txo))) {
+ num_wrbs += be_tx_compl_process(adapter, txo, txcp->end_index);
work_done++;
- compl_status = GET_TX_COMPL_BITS(status, txcp);
- if (compl_status) {
+ if (txcp->status) {
if (lancer_chip(adapter))
- lancer_update_tx_err(txo, compl_status);
+ lancer_update_tx_err(txo, txcp->status);
else
- be_update_tx_err(txo, compl_status);
+ be_update_tx_err(txo, txcp->status);
}
}
/* As Tx wrbs have been freed up, wake up netdev queue
* if it was stopped due to lack of tx wrbs. */
if (__netif_subqueue_stopped(adapter->netdev, idx) &&
- atomic_read(&txo->q.used) < txo->q.len / 2) {
+ be_can_txq_wake(txo)) {
netif_wake_subqueue(adapter->netdev, idx);
}
sliport_err2 = ioread32(adapter->db +
SLIPORT_ERROR2_OFFSET);
adapter->hw_error = true;
+ error_detected = true;
/* Do not log error messages if its a FW reset */
if (sliport_err1 == SLIPORT_ERROR_FW_RESET1 &&
sliport_err2 == SLIPORT_ERROR_FW_RESET2) {
dev_info(dev, "Firmware update in progress\n");
} else {
- error_detected = true;
dev_err(dev, "Error detected in the card\n");
dev_err(dev, "ERR: sliport status 0x%x\n",
sliport_status);
status = request_irq(vec, be_msix, 0, eqo->desc, eqo);
if (status)
goto err_msix;
+
+ irq_set_affinity_hint(vec, eqo->affinity_mask);
}
return 0;
{
struct net_device *netdev = adapter->netdev;
struct be_eq_obj *eqo;
- int i;
+ int i, vec;
if (!adapter->isr_registered)
return;
}
/* MSIx */
- for_all_evt_queues(adapter, eqo, i)
- free_irq(be_msix_vec_get(adapter, eqo), eqo);
+ for_all_evt_queues(adapter, eqo, i) {
+ vec = be_msix_vec_get(adapter, eqo);
+ irq_set_affinity_hint(vec, NULL);
+ free_irq(vec, eqo);
+ }
done:
adapter->isr_registered = false;
return rc;
}
- /* The FW would like the default RXQ to be created first */
- rxo = default_rxo(adapter);
- rc = be_cmd_rxq_create(adapter, &rxo->q, rxo->cq.id, rx_frag_size,
- adapter->if_handle, false, &rxo->rss_id);
- if (rc)
- return rc;
+ if (adapter->need_def_rxq || !adapter->num_rss_qs) {
+ rxo = default_rxo(adapter);
+ rc = be_cmd_rxq_create(adapter, &rxo->q, rxo->cq.id,
+ rx_frag_size, adapter->if_handle,
+ false, &rxo->rss_id);
+ if (rc)
+ return rc;
+ }
for_all_rss_queues(adapter, rxo, i) {
rc = be_cmd_rxq_create(adapter, &rxo->q, rxo->cq.id,
}
if (be_multi_rxq(adapter)) {
- for (j = 0; j < RSS_INDIR_TABLE_LEN;
- j += adapter->num_rx_qs - 1) {
+ for (j = 0; j < RSS_INDIR_TABLE_LEN; j += adapter->num_rss_qs) {
for_all_rss_queues(adapter, rxo, i) {
if ((j + i) >= RSS_INDIR_TABLE_LEN)
break;
int status = 0;
u8 mac[ETH_ALEN];
- memset(mac, 0, ETH_ALEN);
+ eth_zero_addr(mac);
cmd.size = sizeof(struct be_cmd_req_acpi_wol_magic_config);
cmd.va = dma_zalloc_coherent(&adapter->pdev->dev, cmd.size, &cmd.dma,
}
}
+static void be_cancel_err_detection(struct be_adapter *adapter)
+{
+ if (adapter->flags & BE_FLAGS_ERR_DETECTION_SCHEDULED) {
+ cancel_delayed_work_sync(&adapter->be_err_detection_work);
+ adapter->flags &= ~BE_FLAGS_ERR_DETECTION_SCHEDULED;
+ }
+}
+
static void be_mac_clear(struct be_adapter *adapter)
{
if (adapter->pmac_id) {
}
#endif
+static u16 be_calculate_vf_qs(struct be_adapter *adapter, u16 num_vfs)
+{
+ struct be_resources res = adapter->pool_res;
+ u16 num_vf_qs = 1;
+
+ /* Distribute the queue resources equally among the PF and it's VFs
+ * Do not distribute queue resources in multi-channel configuration.
+ */
+ if (num_vfs && !be_is_mc(adapter)) {
+ /* If number of VFs requested is 8 less than max supported,
+ * assign 8 queue pairs to the PF and divide the remaining
+ * resources evenly among the VFs
+ */
+ if (num_vfs < (be_max_vfs(adapter) - 8))
+ num_vf_qs = (res.max_rss_qs - 8) / num_vfs;
+ else
+ num_vf_qs = res.max_rss_qs / num_vfs;
+
+ /* Skyhawk-R chip supports only MAX_RSS_IFACES RSS capable
+ * interfaces per port. Provide RSS on VFs, only if number
+ * of VFs requested is less than MAX_RSS_IFACES limit.
+ */
+ if (num_vfs >= MAX_RSS_IFACES)
+ num_vf_qs = 1;
+ }
+ return num_vf_qs;
+}
+
static int be_clear(struct be_adapter *adapter)
{
+ struct pci_dev *pdev = adapter->pdev;
+ u16 num_vf_qs;
+
be_cancel_worker(adapter);
if (sriov_enabled(adapter))
/* Re-configure FW to distribute resources evenly across max-supported
* number of VFs, only when VFs are not already enabled.
*/
- if (be_physfn(adapter) && !pci_vfs_assigned(adapter->pdev))
+ if (skyhawk_chip(adapter) && be_physfn(adapter) &&
+ !pci_vfs_assigned(pdev)) {
+ num_vf_qs = be_calculate_vf_qs(adapter,
+ pci_sriov_get_totalvfs(pdev));
be_cmd_set_sriov_config(adapter, adapter->pool_res,
- pci_sriov_get_totalvfs(adapter->pdev));
+ pci_sriov_get_totalvfs(pdev),
+ num_vf_qs);
+ }
#ifdef CONFIG_BE2NET_VXLAN
be_disable_vxlan_offloads(adapter);
en_flags = BE_IF_FLAGS_UNTAGGED | BE_IF_FLAGS_BROADCAST |
BE_IF_FLAGS_MULTICAST | BE_IF_FLAGS_PASS_L3L4_ERRORS |
- BE_IF_FLAGS_RSS;
+ BE_IF_FLAGS_RSS | BE_IF_FLAGS_DEFQ_RSS;
en_flags &= cap_flags;
for_all_vfs(adapter, vf_cfg, vf) {
if (!BE3_chip(adapter)) {
status = be_cmd_get_profile_config(adapter, &res,
+ RESOURCE_LIMITS,
vf + 1);
if (!status) {
cap_flags = res.if_cap_flags;
/* On a SuperNIC profile, the driver needs to use the
* GET_PROFILE_CONFIG cmd to query the per-function TXQ limits
*/
- be_cmd_get_profile_config(adapter, &super_nic_res, 0);
+ be_cmd_get_profile_config(adapter, &super_nic_res,
+ RESOURCE_LIMITS, 0);
/* Some old versions of BE3 FW don't report max_tx_qs value */
res->max_tx_qs = super_nic_res.max_tx_qs ? : BE3_MAX_TX_QS;
} else {
res->max_evt_qs = 1;
res->if_cap_flags = BE_IF_CAP_FLAGS_WANT;
+ res->if_cap_flags &= ~BE_IF_FLAGS_DEFQ_RSS;
if (!(adapter->function_caps & BE_FUNCTION_CAPS_RSS))
res->if_cap_flags &= ~BE_IF_FLAGS_RSS;
}
static int be_get_sriov_config(struct be_adapter *adapter)
{
- struct device *dev = &adapter->pdev->dev;
struct be_resources res = {0};
int max_vfs, old_vfs;
- /* Some old versions of BE3 FW don't report max_vfs value */
- be_cmd_get_profile_config(adapter, &res, 0);
+ be_cmd_get_profile_config(adapter, &res, RESOURCE_LIMITS, 0);
+ /* Some old versions of BE3 FW don't report max_vfs value */
if (BE3_chip(adapter) && !res.max_vfs) {
max_vfs = pci_sriov_get_totalvfs(adapter->pdev);
res.max_vfs = max_vfs > 0 ? min(MAX_VFS, max_vfs) : 0;
adapter->pool_res = res;
- if (!be_max_vfs(adapter)) {
- if (num_vfs)
- dev_warn(dev, "SRIOV is disabled. Ignoring num_vfs\n");
- adapter->num_vfs = 0;
- return 0;
- }
-
- pci_sriov_set_totalvfs(adapter->pdev, be_max_vfs(adapter));
-
- /* validate num_vfs module param */
+ /* If during previous unload of the driver, the VFs were not disabled,
+ * then we cannot rely on the PF POOL limits for the TotalVFs value.
+ * Instead use the TotalVFs value stored in the pci-dev struct.
+ */
old_vfs = pci_num_vf(adapter->pdev);
if (old_vfs) {
- dev_info(dev, "%d VFs are already enabled\n", old_vfs);
- if (old_vfs != num_vfs)
- dev_warn(dev, "Ignoring num_vfs=%d setting\n", num_vfs);
+ dev_info(&adapter->pdev->dev, "%d VFs are already enabled\n",
+ old_vfs);
+
+ adapter->pool_res.max_vfs =
+ pci_sriov_get_totalvfs(adapter->pdev);
adapter->num_vfs = old_vfs;
- } else {
- if (num_vfs > be_max_vfs(adapter)) {
- dev_info(dev, "Resources unavailable to init %d VFs\n",
- num_vfs);
- dev_info(dev, "Limiting to %d VFs\n",
- be_max_vfs(adapter));
- }
- adapter->num_vfs = min_t(u16, num_vfs, be_max_vfs(adapter));
}
return 0;
}
+static void be_alloc_sriov_res(struct be_adapter *adapter)
+{
+ int old_vfs = pci_num_vf(adapter->pdev);
+ u16 num_vf_qs;
+ int status;
+
+ be_get_sriov_config(adapter);
+
+ if (!old_vfs)
+ pci_sriov_set_totalvfs(adapter->pdev, be_max_vfs(adapter));
+
+ /* When the HW is in SRIOV capable configuration, the PF-pool
+ * resources are given to PF during driver load, if there are no
+ * old VFs. This facility is not available in BE3 FW.
+ * Also, this is done by FW in Lancer chip.
+ */
+ if (skyhawk_chip(adapter) && be_max_vfs(adapter) && !old_vfs) {
+ num_vf_qs = be_calculate_vf_qs(adapter, 0);
+ status = be_cmd_set_sriov_config(adapter, adapter->pool_res, 0,
+ num_vf_qs);
+ if (status)
+ dev_err(&adapter->pdev->dev,
+ "Failed to optimize SRIOV resources\n");
+ }
+}
+
static int be_get_resources(struct be_adapter *adapter)
{
struct device *dev = &adapter->pdev->dev;
if (status)
return status;
+ /* If a deafault RXQ must be created, we'll use up one RSSQ*/
+ if (res.max_rss_qs && res.max_rss_qs == res.max_rx_qs &&
+ !(res.if_cap_flags & BE_IF_FLAGS_DEFQ_RSS))
+ res.max_rss_qs -= 1;
+
/* If RoCE may be enabled stash away half the EQs for RoCE */
if (be_roce_supported(adapter))
res.max_evt_qs /= 2;
adapter->res = res;
}
+ /* If FW supports RSS default queue, then skip creating non-RSS
+ * queue for non-IP traffic.
+ */
+ adapter->need_def_rxq = (be_if_cap_flags(adapter) &
+ BE_IF_FLAGS_DEFQ_RSS) ? 0 : 1;
+
dev_info(dev, "Max: txqs %d, rxqs %d, rss %d, eqs %d, vfs %d\n",
be_max_txqs(adapter), be_max_rxqs(adapter),
be_max_rss(adapter), be_max_eqs(adapter),
be_max_uc(adapter), be_max_mc(adapter),
be_max_vlans(adapter));
+ /* Sanitize cfg_num_qs based on HW and platform limits */
+ adapter->cfg_num_qs = min_t(u16, netif_get_num_default_rss_queues(),
+ be_max_qs(adapter));
return 0;
}
-static void be_sriov_config(struct be_adapter *adapter)
-{
- struct device *dev = &adapter->pdev->dev;
- int status;
-
- status = be_get_sriov_config(adapter);
- if (status) {
- dev_err(dev, "Failed to query SR-IOV configuration\n");
- dev_err(dev, "SR-IOV cannot be enabled\n");
- return;
- }
-
- /* When the HW is in SRIOV capable configuration, the PF-pool
- * resources are equally distributed across the max-number of
- * VFs. The user may request only a subset of the max-vfs to be
- * enabled. Based on num_vfs, redistribute the resources across
- * num_vfs so that each VF will have access to more number of
- * resources. This facility is not available in BE3 FW.
- * Also, this is done by FW in Lancer chip.
- */
- if (be_max_vfs(adapter) && !pci_num_vf(adapter->pdev)) {
- status = be_cmd_set_sriov_config(adapter,
- adapter->pool_res,
- adapter->num_vfs);
- if (status)
- dev_err(dev, "Failed to optimize SR-IOV resources\n");
- }
-}
-
static int be_get_config(struct be_adapter *adapter)
{
+ int status, level;
u16 profile_id;
- int status;
+
+ status = be_cmd_get_cntl_attributes(adapter);
+ if (status)
+ return status;
status = be_cmd_query_fw_cfg(adapter);
if (status)
return status;
+ if (BEx_chip(adapter)) {
+ level = be_cmd_get_fw_log_level(adapter);
+ adapter->msg_enable =
+ level <= FW_LOG_LEVEL_DEFAULT ? NETIF_MSG_HW : 0;
+ }
+
+ be_cmd_get_acpi_wol_cap(adapter);
+
be_cmd_query_port_name(adapter);
if (be_physfn(adapter)) {
"Using profile 0x%x\n", profile_id);
}
- if (!BE2_chip(adapter) && be_physfn(adapter))
- be_sriov_config(adapter);
-
status = be_get_resources(adapter);
if (status)
return status;
if (!adapter->pmac_id)
return -ENOMEM;
- /* Sanitize cfg_num_qs based on HW and platform limits */
- adapter->cfg_num_qs = min(adapter->cfg_num_qs, be_max_qs(adapter));
-
return 0;
}
adapter->flags |= BE_FLAGS_WORKER_SCHEDULED;
}
+static void be_schedule_err_detection(struct be_adapter *adapter)
+{
+ schedule_delayed_work(&adapter->be_err_detection_work,
+ msecs_to_jiffies(1000));
+ adapter->flags |= BE_FLAGS_ERR_DETECTION_SCHEDULED;
+}
+
static int be_setup_queues(struct be_adapter *adapter)
{
struct net_device *netdev = adapter->netdev;
return fw_major;
}
+/* If any VFs are already enabled don't FLR the PF */
+static bool be_reset_required(struct be_adapter *adapter)
+{
+ return pci_num_vf(adapter->pdev) ? false : true;
+}
+
+/* Wait for the FW to be ready and perform the required initialization */
+static int be_func_init(struct be_adapter *adapter)
+{
+ int status;
+
+ status = be_fw_wait_ready(adapter);
+ if (status)
+ return status;
+
+ if (be_reset_required(adapter)) {
+ status = be_cmd_reset_function(adapter);
+ if (status)
+ return status;
+
+ /* Wait for interrupts to quiesce after an FLR */
+ msleep(100);
+
+ /* We can clear all errors when function reset succeeds */
+ be_clear_all_error(adapter);
+ }
+
+ /* Tell FW we're ready to fire cmds */
+ status = be_cmd_fw_init(adapter);
+ if (status)
+ return status;
+
+ /* Allow interrupts for other ULPs running on NIC function */
+ be_intr_set(adapter, true);
+
+ return 0;
+}
+
static int be_setup(struct be_adapter *adapter)
{
struct device *dev = &adapter->pdev->dev;
int status;
+ status = be_func_init(adapter);
+ if (status)
+ return status;
+
be_setup_init(adapter);
if (!lancer_chip(adapter))
be_cmd_req_native_mode(adapter);
+ if (!BE2_chip(adapter) && be_physfn(adapter))
+ be_alloc_sriov_res(adapter);
+
status = be_get_config(adapter);
if (status)
goto err;
be_set_rx_mode(adapter->netdev);
- be_cmd_get_acpi_wol_cap(adapter);
-
status = be_cmd_set_flow_control(adapter, adapter->tx_fc,
adapter->rx_fc);
if (status)
netdev->ethtool_ops = &be_ethtool_ops;
}
-static void be_unmap_pci_bars(struct be_adapter *adapter)
+static void be_cleanup(struct be_adapter *adapter)
{
- if (adapter->csr)
- pci_iounmap(adapter->pdev, adapter->csr);
- if (adapter->db)
- pci_iounmap(adapter->pdev, adapter->db);
-}
+ struct net_device *netdev = adapter->netdev;
-static int db_bar(struct be_adapter *adapter)
-{
- if (lancer_chip(adapter) || !be_physfn(adapter))
- return 0;
- else
- return 4;
+ rtnl_lock();
+ netif_device_detach(netdev);
+ if (netif_running(netdev))
+ be_close(netdev);
+ rtnl_unlock();
+
+ be_clear(adapter);
}
-static int be_roce_map_pci_bars(struct be_adapter *adapter)
+static int be_resume(struct be_adapter *adapter)
{
- if (skyhawk_chip(adapter)) {
- adapter->roce_db.size = 4096;
- adapter->roce_db.io_addr = pci_resource_start(adapter->pdev,
- db_bar(adapter));
- adapter->roce_db.total_size = pci_resource_len(adapter->pdev,
+ struct net_device *netdev = adapter->netdev;
+ int status;
+
+ status = be_setup(adapter);
+ if (status)
+ return status;
+
+ if (netif_running(netdev)) {
+ status = be_open(netdev);
+ if (status)
+ return status;
+ }
+
+ netif_device_attach(netdev);
+
+ return 0;
+}
+
+static int be_err_recover(struct be_adapter *adapter)
+{
+ struct device *dev = &adapter->pdev->dev;
+ int status;
+
+ status = be_resume(adapter);
+ if (status)
+ goto err;
+
+ dev_info(dev, "Adapter recovery successful\n");
+ return 0;
+err:
+ if (be_physfn(adapter))
+ dev_err(dev, "Adapter recovery failed\n");
+ else
+ dev_err(dev, "Re-trying adapter recovery\n");
+
+ return status;
+}
+
+static void be_err_detection_task(struct work_struct *work)
+{
+ struct be_adapter *adapter =
+ container_of(work, struct be_adapter,
+ be_err_detection_work.work);
+ int status = 0;
+
+ be_detect_error(adapter);
+
+ if (adapter->hw_error) {
+ be_cleanup(adapter);
+
+ /* As of now error recovery support is in Lancer only */
+ if (lancer_chip(adapter))
+ status = be_err_recover(adapter);
+ }
+
+ /* Always attempt recovery on VFs */
+ if (!status || be_virtfn(adapter))
+ be_schedule_err_detection(adapter);
+}
+
+static void be_log_sfp_info(struct be_adapter *adapter)
+{
+ int status;
+
+ status = be_cmd_query_sfp_info(adapter);
+ if (!status) {
+ dev_err(&adapter->pdev->dev,
+ "Unqualified SFP+ detected on %c from %s part no: %s",
+ adapter->port_name, adapter->phy.vendor_name,
+ adapter->phy.vendor_pn);
+ }
+ adapter->flags &= ~BE_FLAGS_EVT_INCOMPATIBLE_SFP;
+}
+
+static void be_worker(struct work_struct *work)
+{
+ struct be_adapter *adapter =
+ container_of(work, struct be_adapter, work.work);
+ struct be_rx_obj *rxo;
+ int i;
+
+ /* when interrupts are not yet enabled, just reap any pending
+ * mcc completions
+ */
+ if (!netif_running(adapter->netdev)) {
+ local_bh_disable();
+ be_process_mcc(adapter);
+ local_bh_enable();
+ goto reschedule;
+ }
+
+ if (!adapter->stats_cmd_sent) {
+ if (lancer_chip(adapter))
+ lancer_cmd_get_pport_stats(adapter,
+ &adapter->stats_cmd);
+ else
+ be_cmd_get_stats(adapter, &adapter->stats_cmd);
+ }
+
+ if (be_physfn(adapter) &&
+ MODULO(adapter->work_counter, adapter->be_get_temp_freq) == 0)
+ be_cmd_get_die_temperature(adapter);
+
+ for_all_rx_queues(adapter, rxo, i) {
+ /* Replenish RX-queues starved due to memory
+ * allocation failures.
+ */
+ if (rxo->rx_post_starved)
+ be_post_rx_frags(rxo, GFP_KERNEL, MAX_RX_POST);
+ }
+
+ be_eqd_update(adapter);
+
+ if (adapter->flags & BE_FLAGS_EVT_INCOMPATIBLE_SFP)
+ be_log_sfp_info(adapter);
+
+reschedule:
+ adapter->work_counter++;
+ schedule_delayed_work(&adapter->work, msecs_to_jiffies(1000));
+}
+
+static void be_unmap_pci_bars(struct be_adapter *adapter)
+{
+ if (adapter->csr)
+ pci_iounmap(adapter->pdev, adapter->csr);
+ if (adapter->db)
+ pci_iounmap(adapter->pdev, adapter->db);
+}
+
+static int db_bar(struct be_adapter *adapter)
+{
+ if (lancer_chip(adapter) || !be_physfn(adapter))
+ return 0;
+ else
+ return 4;
+}
+
+static int be_roce_map_pci_bars(struct be_adapter *adapter)
+{
+ if (skyhawk_chip(adapter)) {
+ adapter->roce_db.size = 4096;
+ adapter->roce_db.io_addr = pci_resource_start(adapter->pdev,
+ db_bar(adapter));
+ adapter->roce_db.total_size = pci_resource_len(adapter->pdev,
db_bar(adapter));
}
return 0;
{
struct pci_dev *pdev = adapter->pdev;
u8 __iomem *addr;
+ u32 sli_intf;
+
+ pci_read_config_dword(adapter->pdev, SLI_INTF_REG_OFFSET, &sli_intf);
+ adapter->sli_family = (sli_intf & SLI_INTF_FAMILY_MASK) >>
+ SLI_INTF_FAMILY_SHIFT;
+ adapter->virtfn = (sli_intf & SLI_INTF_FT_MASK) ? 1 : 0;
if (BEx_chip(adapter) && be_physfn(adapter)) {
adapter->csr = pci_iomap(pdev, 2, 0);
return -ENOMEM;
}
-static void be_ctrl_cleanup(struct be_adapter *adapter)
+static void be_drv_cleanup(struct be_adapter *adapter)
{
struct be_dma_mem *mem = &adapter->mbox_mem_alloced;
-
- be_unmap_pci_bars(adapter);
+ struct device *dev = &adapter->pdev->dev;
if (mem->va)
- dma_free_coherent(&adapter->pdev->dev, mem->size, mem->va,
- mem->dma);
+ dma_free_coherent(dev, mem->size, mem->va, mem->dma);
mem = &adapter->rx_filter;
if (mem->va)
- dma_free_coherent(&adapter->pdev->dev, mem->size, mem->va,
- mem->dma);
+ dma_free_coherent(dev, mem->size, mem->va, mem->dma);
+
+ mem = &adapter->stats_cmd;
+ if (mem->va)
+ dma_free_coherent(dev, mem->size, mem->va, mem->dma);
}
-static int be_ctrl_init(struct be_adapter *adapter)
+/* Allocate and initialize various fields in be_adapter struct */
+static int be_drv_init(struct be_adapter *adapter)
{
struct be_dma_mem *mbox_mem_alloc = &adapter->mbox_mem_alloced;
struct be_dma_mem *mbox_mem_align = &adapter->mbox_mem;
struct be_dma_mem *rx_filter = &adapter->rx_filter;
- u32 sli_intf;
- int status;
-
- pci_read_config_dword(adapter->pdev, SLI_INTF_REG_OFFSET, &sli_intf);
- adapter->sli_family = (sli_intf & SLI_INTF_FAMILY_MASK) >>
- SLI_INTF_FAMILY_SHIFT;
- adapter->virtfn = (sli_intf & SLI_INTF_FT_MASK) ? 1 : 0;
-
- status = be_map_pci_bars(adapter);
- if (status)
- goto done;
+ struct be_dma_mem *stats_cmd = &adapter->stats_cmd;
+ struct device *dev = &adapter->pdev->dev;
+ int status = 0;
mbox_mem_alloc->size = sizeof(struct be_mcc_mailbox) + 16;
- mbox_mem_alloc->va = dma_alloc_coherent(&adapter->pdev->dev,
- mbox_mem_alloc->size,
+ mbox_mem_alloc->va = dma_alloc_coherent(dev, mbox_mem_alloc->size,
&mbox_mem_alloc->dma,
GFP_KERNEL);
- if (!mbox_mem_alloc->va) {
- status = -ENOMEM;
- goto unmap_pci_bars;
- }
+ if (!mbox_mem_alloc->va)
+ return -ENOMEM;
+
mbox_mem_align->size = sizeof(struct be_mcc_mailbox);
mbox_mem_align->va = PTR_ALIGN(mbox_mem_alloc->va, 16);
mbox_mem_align->dma = PTR_ALIGN(mbox_mem_alloc->dma, 16);
memset(mbox_mem_align->va, 0, sizeof(struct be_mcc_mailbox));
rx_filter->size = sizeof(struct be_cmd_req_rx_filter);
- rx_filter->va = dma_zalloc_coherent(&adapter->pdev->dev,
- rx_filter->size, &rx_filter->dma,
- GFP_KERNEL);
+ rx_filter->va = dma_zalloc_coherent(dev, rx_filter->size,
+ &rx_filter->dma, GFP_KERNEL);
if (!rx_filter->va) {
status = -ENOMEM;
goto free_mbox;
}
+ if (lancer_chip(adapter))
+ stats_cmd->size = sizeof(struct lancer_cmd_req_pport_stats);
+ else if (BE2_chip(adapter))
+ stats_cmd->size = sizeof(struct be_cmd_req_get_stats_v0);
+ else if (BE3_chip(adapter))
+ stats_cmd->size = sizeof(struct be_cmd_req_get_stats_v1);
+ else
+ stats_cmd->size = sizeof(struct be_cmd_req_get_stats_v2);
+ stats_cmd->va = dma_zalloc_coherent(dev, stats_cmd->size,
+ &stats_cmd->dma, GFP_KERNEL);
+ if (!stats_cmd->va) {
+ status = -ENOMEM;
+ goto free_rx_filter;
+ }
+
mutex_init(&adapter->mbox_lock);
spin_lock_init(&adapter->mcc_lock);
spin_lock_init(&adapter->mcc_cq_lock);
-
init_completion(&adapter->et_cmd_compl);
- pci_save_state(adapter->pdev);
- return 0;
-
-free_mbox:
- dma_free_coherent(&adapter->pdev->dev, mbox_mem_alloc->size,
- mbox_mem_alloc->va, mbox_mem_alloc->dma);
-
-unmap_pci_bars:
- be_unmap_pci_bars(adapter);
-
-done:
- return status;
-}
-static void be_stats_cleanup(struct be_adapter *adapter)
-{
- struct be_dma_mem *cmd = &adapter->stats_cmd;
+ pci_save_state(adapter->pdev);
- if (cmd->va)
- dma_free_coherent(&adapter->pdev->dev, cmd->size,
- cmd->va, cmd->dma);
-}
+ INIT_DELAYED_WORK(&adapter->work, be_worker);
+ INIT_DELAYED_WORK(&adapter->be_err_detection_work,
+ be_err_detection_task);
-static int be_stats_init(struct be_adapter *adapter)
-{
- struct be_dma_mem *cmd = &adapter->stats_cmd;
+ adapter->rx_fc = true;
+ adapter->tx_fc = true;
- if (lancer_chip(adapter))
- cmd->size = sizeof(struct lancer_cmd_req_pport_stats);
- else if (BE2_chip(adapter))
- cmd->size = sizeof(struct be_cmd_req_get_stats_v0);
- else if (BE3_chip(adapter))
- cmd->size = sizeof(struct be_cmd_req_get_stats_v1);
- else
- /* ALL non-BE ASICs */
- cmd->size = sizeof(struct be_cmd_req_get_stats_v2);
+ /* Must be a power of 2 or else MODULO will BUG_ON */
+ adapter->be_get_temp_freq = 64;
- cmd->va = dma_zalloc_coherent(&adapter->pdev->dev, cmd->size, &cmd->dma,
- GFP_KERNEL);
- if (!cmd->va)
- return -ENOMEM;
return 0;
+
+free_rx_filter:
+ dma_free_coherent(dev, rx_filter->size, rx_filter->va, rx_filter->dma);
+free_mbox:
+ dma_free_coherent(dev, mbox_mem_alloc->size, mbox_mem_alloc->va,
+ mbox_mem_alloc->dma);
+ return status;
}
static void be_remove(struct pci_dev *pdev)
be_roce_dev_remove(adapter);
be_intr_set(adapter, false);
- cancel_delayed_work_sync(&adapter->func_recovery_work);
+ be_cancel_err_detection(adapter);
unregister_netdev(adapter->netdev);
/* tell fw we're done with firing cmds */
be_cmd_fw_clean(adapter);
- be_stats_cleanup(adapter);
-
- be_ctrl_cleanup(adapter);
+ be_unmap_pci_bars(adapter);
+ be_drv_cleanup(adapter);
pci_disable_pcie_error_reporting(pdev);
free_netdev(adapter->netdev);
}
-static int be_get_initial_config(struct be_adapter *adapter)
-{
- int status, level;
-
- status = be_cmd_get_cntl_attributes(adapter);
- if (status)
- return status;
-
- /* Must be a power of 2 or else MODULO will BUG_ON */
- adapter->be_get_temp_freq = 64;
-
- if (BEx_chip(adapter)) {
- level = be_cmd_get_fw_log_level(adapter);
- adapter->msg_enable =
- level <= FW_LOG_LEVEL_DEFAULT ? NETIF_MSG_HW : 0;
- }
-
- adapter->cfg_num_qs = netif_get_num_default_rss_queues();
- return 0;
-}
-
-static int lancer_recover_func(struct be_adapter *adapter)
-{
- struct device *dev = &adapter->pdev->dev;
- int status;
-
- status = lancer_test_and_set_rdy_state(adapter);
- if (status)
- goto err;
-
- if (netif_running(adapter->netdev))
- be_close(adapter->netdev);
-
- be_clear(adapter);
-
- be_clear_all_error(adapter);
-
- status = be_setup(adapter);
- if (status)
- goto err;
-
- if (netif_running(adapter->netdev)) {
- status = be_open(adapter->netdev);
- if (status)
- goto err;
- }
-
- dev_err(dev, "Adapter recovery successful\n");
- return 0;
-err:
- if (status == -EAGAIN)
- dev_err(dev, "Waiting for resource provisioning\n");
- else
- dev_err(dev, "Adapter recovery failed\n");
-
- return status;
-}
-
-static void be_func_recovery_task(struct work_struct *work)
-{
- struct be_adapter *adapter =
- container_of(work, struct be_adapter, func_recovery_work.work);
- int status = 0;
-
- be_detect_error(adapter);
-
- if (adapter->hw_error && lancer_chip(adapter)) {
- rtnl_lock();
- netif_device_detach(adapter->netdev);
- rtnl_unlock();
-
- status = lancer_recover_func(adapter);
- if (!status)
- netif_device_attach(adapter->netdev);
- }
-
- /* In Lancer, for all errors other than provisioning error (-EAGAIN),
- * no need to attempt further recovery.
- */
- if (!status || status == -EAGAIN)
- schedule_delayed_work(&adapter->func_recovery_work,
- msecs_to_jiffies(1000));
-}
-
-static void be_log_sfp_info(struct be_adapter *adapter)
-{
- int status;
-
- status = be_cmd_query_sfp_info(adapter);
- if (!status) {
- dev_err(&adapter->pdev->dev,
- "Unqualified SFP+ detected on %c from %s part no: %s",
- adapter->port_name, adapter->phy.vendor_name,
- adapter->phy.vendor_pn);
- }
- adapter->flags &= ~BE_FLAGS_EVT_INCOMPATIBLE_SFP;
-}
-
-static void be_worker(struct work_struct *work)
-{
- struct be_adapter *adapter =
- container_of(work, struct be_adapter, work.work);
- struct be_rx_obj *rxo;
- int i;
-
- /* when interrupts are not yet enabled, just reap any pending
- * mcc completions */
- if (!netif_running(adapter->netdev)) {
- local_bh_disable();
- be_process_mcc(adapter);
- local_bh_enable();
- goto reschedule;
- }
-
- if (!adapter->stats_cmd_sent) {
- if (lancer_chip(adapter))
- lancer_cmd_get_pport_stats(adapter,
- &adapter->stats_cmd);
- else
- be_cmd_get_stats(adapter, &adapter->stats_cmd);
- }
-
- if (be_physfn(adapter) &&
- MODULO(adapter->work_counter, adapter->be_get_temp_freq) == 0)
- be_cmd_get_die_temperature(adapter);
-
- for_all_rx_queues(adapter, rxo, i) {
- /* Replenish RX-queues starved due to memory
- * allocation failures.
- */
- if (rxo->rx_post_starved)
- be_post_rx_frags(rxo, GFP_KERNEL, MAX_RX_POST);
- }
-
- be_eqd_update(adapter);
-
- if (adapter->flags & BE_FLAGS_EVT_INCOMPATIBLE_SFP)
- be_log_sfp_info(adapter);
-
-reschedule:
- adapter->work_counter++;
- schedule_delayed_work(&adapter->work, msecs_to_jiffies(1000));
-}
-
-/* If any VFs are already enabled don't FLR the PF */
-static bool be_reset_required(struct be_adapter *adapter)
-{
- return pci_num_vf(adapter->pdev) ? false : true;
-}
-
static char *mc_name(struct be_adapter *adapter)
{
char *str = ""; /* default */
if (!status)
dev_info(&pdev->dev, "PCIe error reporting enabled\n");
- status = be_ctrl_init(adapter);
+ status = be_map_pci_bars(adapter);
if (status)
goto free_netdev;
- /* sync up with fw's ready state */
- if (be_physfn(adapter)) {
- status = be_fw_wait_ready(adapter);
- if (status)
- goto ctrl_clean;
- }
-
- if (be_reset_required(adapter)) {
- status = be_cmd_reset_function(adapter);
- if (status)
- goto ctrl_clean;
-
- /* Wait for interrupts to quiesce after an FLR */
- msleep(100);
- }
-
- /* Allow interrupts for other ULPs running on NIC function */
- be_intr_set(adapter, true);
-
- /* tell fw we're ready to fire cmds */
- status = be_cmd_fw_init(adapter);
- if (status)
- goto ctrl_clean;
-
- status = be_stats_init(adapter);
- if (status)
- goto ctrl_clean;
-
- status = be_get_initial_config(adapter);
+ status = be_drv_init(adapter);
if (status)
- goto stats_clean;
-
- INIT_DELAYED_WORK(&adapter->work, be_worker);
- INIT_DELAYED_WORK(&adapter->func_recovery_work, be_func_recovery_task);
- adapter->rx_fc = true;
- adapter->tx_fc = true;
+ goto unmap_bars;
status = be_setup(adapter);
if (status)
- goto stats_clean;
+ goto drv_cleanup;
be_netdev_init(netdev);
status = register_netdev(netdev);
be_roce_dev_add(adapter);
- schedule_delayed_work(&adapter->func_recovery_work,
- msecs_to_jiffies(1000));
+ be_schedule_err_detection(adapter);
dev_info(&pdev->dev, "%s: %s %s port %c\n", nic_name(pdev),
func_name(adapter), mc_name(adapter), adapter->port_name);
unsetup:
be_clear(adapter);
-stats_clean:
- be_stats_cleanup(adapter);
-ctrl_clean:
- be_ctrl_cleanup(adapter);
+drv_cleanup:
+ be_drv_cleanup(adapter);
+unmap_bars:
+ be_unmap_pci_bars(adapter);
free_netdev:
free_netdev(netdev);
rel_reg:
static int be_suspend(struct pci_dev *pdev, pm_message_t state)
{
struct be_adapter *adapter = pci_get_drvdata(pdev);
- struct net_device *netdev = adapter->netdev;
if (adapter->wol_en)
be_setup_wol(adapter, true);
be_intr_set(adapter, false);
- cancel_delayed_work_sync(&adapter->func_recovery_work);
+ be_cancel_err_detection(adapter);
- netif_device_detach(netdev);
- if (netif_running(netdev)) {
- rtnl_lock();
- be_close(netdev);
- rtnl_unlock();
- }
- be_clear(adapter);
+ be_cleanup(adapter);
pci_save_state(pdev);
pci_disable_device(pdev);
return 0;
}
-static int be_resume(struct pci_dev *pdev)
+static int be_pci_resume(struct pci_dev *pdev)
{
- int status = 0;
struct be_adapter *adapter = pci_get_drvdata(pdev);
- struct net_device *netdev = adapter->netdev;
-
- netif_device_detach(netdev);
+ int status = 0;
status = pci_enable_device(pdev);
if (status)
pci_set_power_state(pdev, PCI_D0);
pci_restore_state(pdev);
- status = be_fw_wait_ready(adapter);
- if (status)
- return status;
-
- status = be_cmd_reset_function(adapter);
- if (status)
- return status;
-
- be_intr_set(adapter, true);
- /* tell fw we're ready to fire cmds */
- status = be_cmd_fw_init(adapter);
+ status = be_resume(adapter);
if (status)
return status;
- be_setup(adapter);
- if (netif_running(netdev)) {
- rtnl_lock();
- be_open(netdev);
- rtnl_unlock();
- }
-
- schedule_delayed_work(&adapter->func_recovery_work,
- msecs_to_jiffies(1000));
- netif_device_attach(netdev);
+ be_schedule_err_detection(adapter);
if (adapter->wol_en)
be_setup_wol(adapter, false);
be_roce_dev_shutdown(adapter);
cancel_delayed_work_sync(&adapter->work);
- cancel_delayed_work_sync(&adapter->func_recovery_work);
+ be_cancel_err_detection(adapter);
netif_device_detach(adapter->netdev);
pci_channel_state_t state)
{
struct be_adapter *adapter = pci_get_drvdata(pdev);
- struct net_device *netdev = adapter->netdev;
dev_err(&adapter->pdev->dev, "EEH error detected\n");
if (!adapter->eeh_error) {
adapter->eeh_error = true;
- cancel_delayed_work_sync(&adapter->func_recovery_work);
+ be_cancel_err_detection(adapter);
- rtnl_lock();
- netif_device_detach(netdev);
- if (netif_running(netdev))
- be_close(netdev);
- rtnl_unlock();
-
- be_clear(adapter);
+ be_cleanup(adapter);
}
if (state == pci_channel_io_perm_failure)
{
int status = 0;
struct be_adapter *adapter = pci_get_drvdata(pdev);
- struct net_device *netdev = adapter->netdev;
dev_info(&adapter->pdev->dev, "EEH resume\n");
pci_save_state(pdev);
- status = be_cmd_reset_function(adapter);
+ status = be_resume(adapter);
if (status)
goto err;
- /* On some BE3 FW versions, after a HW reset,
- * interrupts will remain disabled for each function.
- * So, explicitly enable interrupts
+ be_schedule_err_detection(adapter);
+ return;
+err:
+ dev_err(&adapter->pdev->dev, "EEH resume failed\n");
+}
+
+static int be_pci_sriov_configure(struct pci_dev *pdev, int num_vfs)
+{
+ struct be_adapter *adapter = pci_get_drvdata(pdev);
+ u16 num_vf_qs;
+ int status;
+
+ if (!num_vfs)
+ be_vf_clear(adapter);
+
+ adapter->num_vfs = num_vfs;
+
+ if (adapter->num_vfs == 0 && pci_vfs_assigned(pdev)) {
+ dev_warn(&pdev->dev,
+ "Cannot disable VFs while they are assigned\n");
+ return -EBUSY;
+ }
+
+ /* When the HW is in SRIOV capable configuration, the PF-pool resources
+ * are equally distributed across the max-number of VFs. The user may
+ * request only a subset of the max-vfs to be enabled.
+ * Based on num_vfs, redistribute the resources across num_vfs so that
+ * each VF will have access to more number of resources.
+ * This facility is not available in BE3 FW.
+ * Also, this is done by FW in Lancer chip.
*/
- be_intr_set(adapter, true);
+ if (skyhawk_chip(adapter) && !pci_num_vf(pdev)) {
+ num_vf_qs = be_calculate_vf_qs(adapter, adapter->num_vfs);
+ status = be_cmd_set_sriov_config(adapter, adapter->pool_res,
+ adapter->num_vfs, num_vf_qs);
+ if (status)
+ dev_err(&pdev->dev,
+ "Failed to optimize SR-IOV resources\n");
+ }
- /* tell fw we're ready to fire cmds */
- status = be_cmd_fw_init(adapter);
+ status = be_get_resources(adapter);
if (status)
- goto err;
+ return be_cmd_status(status);
- status = be_setup(adapter);
+ /* Updating real_num_tx/rx_queues() requires rtnl_lock() */
+ rtnl_lock();
+ status = be_update_queues(adapter);
+ rtnl_unlock();
if (status)
- goto err;
+ return be_cmd_status(status);
- if (netif_running(netdev)) {
- status = be_open(netdev);
- if (status)
- goto err;
- }
+ if (adapter->num_vfs)
+ status = be_vf_setup(adapter);
- schedule_delayed_work(&adapter->func_recovery_work,
- msecs_to_jiffies(1000));
- netif_device_attach(netdev);
- return;
-err:
- dev_err(&adapter->pdev->dev, "EEH resume failed\n");
+ if (!status)
+ return adapter->num_vfs;
+
+ return 0;
}
static const struct pci_error_handlers be_eeh_handlers = {
.probe = be_probe,
.remove = be_remove,
.suspend = be_suspend,
- .resume = be_resume,
+ .resume = be_pci_resume,
.shutdown = be_shutdown,
+ .sriov_configure = be_pci_sriov_configure,
.err_handler = &be_eeh_handlers
};
rx_frag_size = 2048;
}
+ if (num_vfs > 0) {
+ pr_info(DRV_NAME " : Module param num_vfs is obsolete.");
+ pr_info(DRV_NAME " : Use sysfs method to enable VFs\n");
+ }
+
return pci_register_driver(&be_driver);
}
module_init(be_init_module);
__u8 ttl;
u32 flags; /* VXLAN_F_* in vxlan.h */
- struct work_struct sock_work;
- struct work_struct igmp_join;
- struct work_struct igmp_leave;
-
unsigned long age_interval;
struct timer_list age_timer;
spinlock_t hash_lock;
static u32 vxlan_salt __read_mostly;
static struct workqueue_struct *vxlan_wq;
-static void vxlan_sock_work(struct work_struct *work);
-
#if IS_ENABLED(CONFIG_IPV6)
static inline
bool vxlan_addr_equal(const union vxlan_addr *a, const union vxlan_addr *b)
{
- if (a->sa.sa_family != b->sa.sa_family)
- return false;
- if (a->sa.sa_family == AF_INET6)
- return ipv6_addr_equal(&a->sin6.sin6_addr, &b->sin6.sin6_addr);
- else
- return a->sin.sin_addr.s_addr == b->sin.sin_addr.s_addr;
+ if (a->sa.sa_family != b->sa.sa_family)
+ return false;
+ if (a->sa.sa_family == AF_INET6)
+ return ipv6_addr_equal(&a->sin6.sin6_addr, &b->sin6.sin6_addr);
+ else
+ return a->sin.sin_addr.s_addr == b->sin.sin_addr.s_addr;
}
static inline bool vxlan_addr_any(const union vxlan_addr *ipa)
{
- if (ipa->sa.sa_family == AF_INET6)
- return ipv6_addr_any(&ipa->sin6.sin6_addr);
- else
- return ipa->sin.sin_addr.s_addr == htonl(INADDR_ANY);
+ if (ipa->sa.sa_family == AF_INET6)
+ return ipv6_addr_any(&ipa->sin6.sin6_addr);
+ else
+ return ipa->sin.sin_addr.s_addr == htonl(INADDR_ANY);
}
static inline bool vxlan_addr_multicast(const union vxlan_addr *ipa)
{
- if (ipa->sa.sa_family == AF_INET6)
- return ipv6_addr_is_multicast(&ipa->sin6.sin6_addr);
- else
- return IN_MULTICAST(ntohl(ipa->sin.sin_addr.s_addr));
+ if (ipa->sa.sa_family == AF_INET6)
+ return ipv6_addr_is_multicast(&ipa->sin6.sin6_addr);
+ else
+ return IN_MULTICAST(ntohl(ipa->sin.sin_addr.s_addr));
}
static int vxlan_nla_get_addr(union vxlan_addr *ip, struct nlattr *nla)
{
- if (nla_len(nla) >= sizeof(struct in6_addr)) {
- nla_memcpy(&ip->sin6.sin6_addr, nla, sizeof(struct in6_addr));
- ip->sa.sa_family = AF_INET6;
- return 0;
- } else if (nla_len(nla) >= sizeof(__be32)) {
- ip->sin.sin_addr.s_addr = nla_get_be32(nla);
- ip->sa.sa_family = AF_INET;
- return 0;
- } else {
- return -EAFNOSUPPORT;
- }
+ if (nla_len(nla) >= sizeof(struct in6_addr)) {
+ ip->sin6.sin6_addr = nla_get_in6_addr(nla);
+ ip->sa.sa_family = AF_INET6;
+ return 0;
+ } else if (nla_len(nla) >= sizeof(__be32)) {
+ ip->sin.sin_addr.s_addr = nla_get_in_addr(nla);
+ ip->sa.sa_family = AF_INET;
+ return 0;
+ } else {
+ return -EAFNOSUPPORT;
+ }
}
static int vxlan_nla_put_addr(struct sk_buff *skb, int attr,
- const union vxlan_addr *ip)
+ const union vxlan_addr *ip)
{
- if (ip->sa.sa_family == AF_INET6)
- return nla_put(skb, attr, sizeof(struct in6_addr), &ip->sin6.sin6_addr);
- else
- return nla_put_be32(skb, attr, ip->sin.sin_addr.s_addr);
+ if (ip->sa.sa_family == AF_INET6)
+ return nla_put_in6_addr(skb, attr, &ip->sin6.sin6_addr);
+ else
+ return nla_put_in_addr(skb, attr, ip->sin.sin_addr.s_addr);
}
#else /* !CONFIG_IPV6 */
static inline
bool vxlan_addr_equal(const union vxlan_addr *a, const union vxlan_addr *b)
{
- return a->sin.sin_addr.s_addr == b->sin.sin_addr.s_addr;
+ return a->sin.sin_addr.s_addr == b->sin.sin_addr.s_addr;
}
static inline bool vxlan_addr_any(const union vxlan_addr *ipa)
{
- return ipa->sin.sin_addr.s_addr == htonl(INADDR_ANY);
+ return ipa->sin.sin_addr.s_addr == htonl(INADDR_ANY);
}
static inline bool vxlan_addr_multicast(const union vxlan_addr *ipa)
{
- return IN_MULTICAST(ntohl(ipa->sin.sin_addr.s_addr));
+ return IN_MULTICAST(ntohl(ipa->sin.sin_addr.s_addr));
}
static int vxlan_nla_get_addr(union vxlan_addr *ip, struct nlattr *nla)
{
- if (nla_len(nla) >= sizeof(struct in6_addr)) {
- return -EAFNOSUPPORT;
- } else if (nla_len(nla) >= sizeof(__be32)) {
- ip->sin.sin_addr.s_addr = nla_get_be32(nla);
- ip->sa.sa_family = AF_INET;
- return 0;
- } else {
- return -EAFNOSUPPORT;
- }
+ if (nla_len(nla) >= sizeof(struct in6_addr)) {
+ return -EAFNOSUPPORT;
+ } else if (nla_len(nla) >= sizeof(__be32)) {
+ ip->sin.sin_addr.s_addr = nla_get_in_addr(nla);
+ ip->sa.sa_family = AF_INET;
+ return 0;
+ } else {
+ return -EAFNOSUPPORT;
+ }
}
static int vxlan_nla_put_addr(struct sk_buff *skb, int attr,
- const union vxlan_addr *ip)
+ const union vxlan_addr *ip)
{
- return nla_put_be32(skb, attr, ip->sin.sin_addr.s_addr);
+ return nla_put_in_addr(skb, attr, ip->sin.sin_addr.s_addr);
}
#endif
/* Watch incoming packets to learn mapping between Ethernet address
* and Tunnel endpoint.
- * Return true if packet is bogus and should be droppped.
+ * Return true if packet is bogus and should be dropped.
*/
static bool vxlan_snoop(struct net_device *dev,
union vxlan_addr *src_ip, const u8 *src_mac)
return false;
}
-static void vxlan_sock_hold(struct vxlan_sock *vs)
-{
- atomic_inc(&vs->refcnt);
-}
-
void vxlan_sock_release(struct vxlan_sock *vs)
{
struct sock *sk = vs->sock->sk;
}
EXPORT_SYMBOL_GPL(vxlan_sock_release);
-/* Callback to update multicast group membership when first VNI on
- * multicast asddress is brought up
- * Done as workqueue because ip_mc_join_group acquires RTNL.
+/* Update multicast group membership when first VNI on
+ * multicast address is brought up
*/
-static void vxlan_igmp_join(struct work_struct *work)
+static int vxlan_igmp_join(struct vxlan_dev *vxlan)
{
- struct vxlan_dev *vxlan = container_of(work, struct vxlan_dev, igmp_join);
struct vxlan_sock *vs = vxlan->vn_sock;
struct sock *sk = vs->sock->sk;
union vxlan_addr *ip = &vxlan->default_dst.remote_ip;
int ifindex = vxlan->default_dst.remote_ifindex;
+ int ret = -EINVAL;
lock_sock(sk);
if (ip->sa.sa_family == AF_INET) {
.imr_ifindex = ifindex,
};
- ip_mc_join_group(sk, &mreq);
+ ret = ip_mc_join_group(sk, &mreq);
#if IS_ENABLED(CONFIG_IPV6)
} else {
- ipv6_stub->ipv6_sock_mc_join(sk, ifindex,
- &ip->sin6.sin6_addr);
+ ret = ipv6_stub->ipv6_sock_mc_join(sk, ifindex,
+ &ip->sin6.sin6_addr);
#endif
}
release_sock(sk);
- vxlan_sock_release(vs);
- dev_put(vxlan->dev);
+ return ret;
}
/* Inverse of vxlan_igmp_join when last VNI is brought down */
-static void vxlan_igmp_leave(struct work_struct *work)
+static int vxlan_igmp_leave(struct vxlan_dev *vxlan)
{
- struct vxlan_dev *vxlan = container_of(work, struct vxlan_dev, igmp_leave);
struct vxlan_sock *vs = vxlan->vn_sock;
struct sock *sk = vs->sock->sk;
union vxlan_addr *ip = &vxlan->default_dst.remote_ip;
int ifindex = vxlan->default_dst.remote_ifindex;
+ int ret = -EINVAL;
lock_sock(sk);
if (ip->sa.sa_family == AF_INET) {
.imr_ifindex = ifindex,
};
- ip_mc_leave_group(sk, &mreq);
+ ret = ip_mc_leave_group(sk, &mreq);
#if IS_ENABLED(CONFIG_IPV6)
} else {
- ipv6_stub->ipv6_sock_mc_drop(sk, ifindex,
- &ip->sin6.sin6_addr);
+ ret = ipv6_stub->ipv6_sock_mc_drop(sk, ifindex,
+ &ip->sin6.sin6_addr);
#endif
}
-
release_sock(sk);
- vxlan_sock_release(vs);
- dev_put(vxlan->dev);
+ return ret;
}
static struct vxlanhdr *vxlan_remcsum(struct sk_buff *skb, struct vxlanhdr *vh,
* this as a malformed packet. This behavior diverges from
* VXLAN RFC (RFC7348) which stipulates that bits in reserved
* in reserved fields are to be ignored. The approach here
- * maintains compatbility with previous stack code, and also
+ * maintains compatibility with previous stack code, and also
* is more robust and provides a little more security in
* adding extensions to VXLAN.
*/
}
#if IS_ENABLED(CONFIG_IPV6)
-static int vxlan6_xmit_skb(struct dst_entry *dst, struct sk_buff *skb,
+static int vxlan6_xmit_skb(struct dst_entry *dst, struct sock *sk,
+ struct sk_buff *skb,
struct net_device *dev, struct in6_addr *saddr,
struct in6_addr *daddr, __u8 prio, __u8 ttl,
__be16 src_port, __be16 dst_port,
}
}
- skb = iptunnel_handle_offloads(skb, udp_sum, type);
- if (IS_ERR(skb)) {
- err = -EINVAL;
- goto err;
- }
-
skb_scrub_packet(skb, xnet);
min_headroom = LL_RESERVED_SPACE(dst->dev) + dst->header_len
goto err;
}
+ skb = iptunnel_handle_offloads(skb, udp_sum, type);
+ if (IS_ERR(skb)) {
+ err = -EINVAL;
+ goto err;
+ }
+
vxh = (struct vxlanhdr *) __skb_push(skb, sizeof(*vxh));
vxh->vx_flags = htonl(VXLAN_HF_VNI);
vxh->vx_vni = md->vni;
skb_set_inner_protocol(skb, htons(ETH_P_TEB));
- udp_tunnel6_xmit_skb(dst, skb, dev, saddr, daddr, prio,
+ udp_tunnel6_xmit_skb(dst, sk, skb, dev, saddr, daddr, prio,
ttl, src_port, dst_port,
!!(vxflags & VXLAN_F_UDP_ZERO_CSUM6_TX));
return 0;
}
#endif
-int vxlan_xmit_skb(struct rtable *rt, struct sk_buff *skb,
+int vxlan_xmit_skb(struct rtable *rt, struct sock *sk, struct sk_buff *skb,
__be32 src, __be32 dst, __u8 tos, __u8 ttl, __be16 df,
__be16 src_port, __be16 dst_port,
struct vxlan_metadata *md, bool xnet, u32 vxflags)
}
}
- skb = iptunnel_handle_offloads(skb, udp_sum, type);
- if (IS_ERR(skb))
- return PTR_ERR(skb);
-
min_headroom = LL_RESERVED_SPACE(rt->dst.dev) + rt->dst.header_len
+ VXLAN_HLEN + sizeof(struct iphdr)
+ (skb_vlan_tag_present(skb) ? VLAN_HLEN : 0);
if (WARN_ON(!skb))
return -ENOMEM;
+ skb = iptunnel_handle_offloads(skb, udp_sum, type);
+ if (IS_ERR(skb))
+ return PTR_ERR(skb);
+
vxh = (struct vxlanhdr *) __skb_push(skb, sizeof(*vxh));
vxh->vx_flags = htonl(VXLAN_HF_VNI);
vxh->vx_vni = md->vni;
skb_set_inner_protocol(skb, htons(ETH_P_TEB));
- return udp_tunnel_xmit_skb(rt, skb, src, dst, tos,
+ return udp_tunnel_xmit_skb(rt, sk, skb, src, dst, tos,
ttl, df, src_port, dst_port, xnet,
!(vxflags & VXLAN_F_UDP_CSUM));
}
struct vxlan_rdst *rdst, bool did_rsc)
{
struct vxlan_dev *vxlan = netdev_priv(dev);
+ struct sock *sk = vxlan->vn_sock->sock->sk;
struct rtable *rt = NULL;
const struct iphdr *old_iph;
struct flowi4 fl4;
md.vni = htonl(vni << 8);
md.gbp = skb->mark;
- err = vxlan_xmit_skb(rt, skb, fl4.saddr,
+ err = vxlan_xmit_skb(rt, sk, skb, fl4.saddr,
dst->sin.sin_addr.s_addr, tos, ttl, df,
src_port, dst_port, &md,
!net_eq(vxlan->net, dev_net(vxlan->dev)),
iptunnel_xmit_stats(err, &dev->stats, dev->tstats);
#if IS_ENABLED(CONFIG_IPV6)
} else {
- struct sock *sk = vxlan->vn_sock->sock->sk;
struct dst_entry *ndst;
struct flowi6 fl6;
u32 flags;
md.vni = htonl(vni << 8);
md.gbp = skb->mark;
- err = vxlan6_xmit_skb(ndst, skb, dev, &fl6.saddr, &fl6.daddr,
+ err = vxlan6_xmit_skb(ndst, sk, skb, dev, &fl6.saddr, &fl6.daddr,
0, ttl, src_port, dst_port, &md,
!net_eq(vxlan->net, dev_net(vxlan->dev)),
vxlan->flags);
static void vxlan_vs_add_dev(struct vxlan_sock *vs, struct vxlan_dev *vxlan)
{
+ struct vxlan_net *vn = net_generic(vxlan->net, vxlan_net_id);
__u32 vni = vxlan->default_dst.remote_vni;
vxlan->vn_sock = vs;
+ spin_lock(&vn->sock_lock);
hlist_add_head_rcu(&vxlan->hlist, vni_head(vs, vni));
+ spin_unlock(&vn->sock_lock);
}
/* Setup stats when device is created */
static int vxlan_init(struct net_device *dev)
{
- struct vxlan_dev *vxlan = netdev_priv(dev);
- struct vxlan_net *vn = net_generic(vxlan->net, vxlan_net_id);
- struct vxlan_sock *vs;
- bool ipv6 = vxlan->flags & VXLAN_F_IPV6;
-
dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats);
if (!dev->tstats)
return -ENOMEM;
- spin_lock(&vn->sock_lock);
- vs = vxlan_find_sock(vxlan->net, ipv6 ? AF_INET6 : AF_INET,
- vxlan->dst_port, vxlan->flags);
- if (vs && atomic_add_unless(&vs->refcnt, 1, 0)) {
- /* If we have a socket with same port already, reuse it */
- vxlan_vs_add_dev(vs, vxlan);
- } else {
- /* otherwise make new socket outside of RTNL */
- dev_hold(dev);
- queue_work(vxlan_wq, &vxlan->sock_work);
- }
- spin_unlock(&vn->sock_lock);
-
return 0;
}
static void vxlan_uninit(struct net_device *dev)
{
struct vxlan_dev *vxlan = netdev_priv(dev);
- struct vxlan_sock *vs = vxlan->vn_sock;
vxlan_fdb_delete_default(vxlan);
- if (vs)
- vxlan_sock_release(vs);
free_percpu(dev->tstats);
}
static int vxlan_open(struct net_device *dev)
{
struct vxlan_dev *vxlan = netdev_priv(dev);
- struct vxlan_sock *vs = vxlan->vn_sock;
+ struct vxlan_sock *vs;
+ int ret = 0;
- /* socket hasn't been created */
- if (!vs)
- return -ENOTCONN;
+ vs = vxlan_sock_add(vxlan->net, vxlan->dst_port, vxlan_rcv, NULL,
+ false, vxlan->flags);
+ if (IS_ERR(vs))
+ return PTR_ERR(vs);
+
+ vxlan_vs_add_dev(vs, vxlan);
if (vxlan_addr_multicast(&vxlan->default_dst.remote_ip)) {
- vxlan_sock_hold(vs);
- dev_hold(dev);
- queue_work(vxlan_wq, &vxlan->igmp_join);
+ ret = vxlan_igmp_join(vxlan);
+ if (ret) {
+ vxlan_sock_release(vs);
+ return ret;
+ }
}
if (vxlan->age_interval)
mod_timer(&vxlan->age_timer, jiffies + FDB_AGE_INTERVAL);
- return 0;
+ return ret;
}
/* Purge the forwarding table */
struct vxlan_dev *vxlan = netdev_priv(dev);
struct vxlan_net *vn = net_generic(vxlan->net, vxlan_net_id);
struct vxlan_sock *vs = vxlan->vn_sock;
+ int ret = 0;
- if (vs && vxlan_addr_multicast(&vxlan->default_dst.remote_ip) &&
- !vxlan_group_used(vn, vxlan)) {
- vxlan_sock_hold(vs);
- dev_hold(dev);
- queue_work(vxlan_wq, &vxlan->igmp_leave);
- }
+ if (vxlan_addr_multicast(&vxlan->default_dst.remote_ip) &&
+ !vxlan_group_used(vn, vxlan))
+ ret = vxlan_igmp_leave(vxlan);
del_timer_sync(&vxlan->age_timer);
vxlan_flush(vxlan);
+ vxlan_sock_release(vs);
- return 0;
+ return ret;
}
/* Stub, nothing needs to be done. */
INIT_LIST_HEAD(&vxlan->next);
spin_lock_init(&vxlan->hash_lock);
- INIT_WORK(&vxlan->igmp_join, vxlan_igmp_join);
- INIT_WORK(&vxlan->igmp_leave, vxlan_igmp_leave);
- INIT_WORK(&vxlan->sock_work, vxlan_sock_work);
init_timer_deferrable(&vxlan->age_timer);
vxlan->age_timer.function = vxlan_cleanup;
!(flags & VXLAN_F_UDP_ZERO_CSUM6_RX);
} else {
udp_conf.family = AF_INET;
- udp_conf.local_ip.s_addr = INADDR_ANY;
}
udp_conf.local_udp_port = port;
sock = vxlan_create_sock(net, ipv6, port, flags);
if (IS_ERR(sock)) {
+ pr_info("Cannot bind port %d, err=%ld\n", ntohs(port),
+ PTR_ERR(sock));
kfree(vs);
return ERR_CAST(sock);
}
struct vxlan_sock *vs;
bool ipv6 = flags & VXLAN_F_IPV6;
- vs = vxlan_socket_create(net, port, rcv, data, flags);
- if (!IS_ERR(vs))
- return vs;
-
- if (no_share) /* Return error if sharing is not allowed. */
- return vs;
-
- spin_lock(&vn->sock_lock);
- vs = vxlan_find_sock(net, ipv6 ? AF_INET6 : AF_INET, port, flags);
- if (vs && ((vs->rcv != rcv) ||
- !atomic_add_unless(&vs->refcnt, 1, 0)))
- vs = ERR_PTR(-EBUSY);
- spin_unlock(&vn->sock_lock);
-
- if (!vs)
- vs = ERR_PTR(-EINVAL);
+ if (!no_share) {
+ spin_lock(&vn->sock_lock);
+ vs = vxlan_find_sock(net, ipv6 ? AF_INET6 : AF_INET, port,
+ flags);
+ if (vs && vs->rcv == rcv) {
+ if (!atomic_add_unless(&vs->refcnt, 1, 0))
+ vs = ERR_PTR(-EBUSY);
+ spin_unlock(&vn->sock_lock);
+ return vs;
+ }
+ spin_unlock(&vn->sock_lock);
+ }
- return vs;
+ return vxlan_socket_create(net, port, rcv, data, flags);
}
EXPORT_SYMBOL_GPL(vxlan_sock_add);
-/* Scheduled at device creation to bind to a socket */
-static void vxlan_sock_work(struct work_struct *work)
-{
- struct vxlan_dev *vxlan = container_of(work, struct vxlan_dev, sock_work);
- struct net *net = vxlan->net;
- struct vxlan_net *vn = net_generic(net, vxlan_net_id);
- __be16 port = vxlan->dst_port;
- struct vxlan_sock *nvs;
-
- nvs = vxlan_sock_add(net, port, vxlan_rcv, NULL, false, vxlan->flags);
- spin_lock(&vn->sock_lock);
- if (!IS_ERR(nvs))
- vxlan_vs_add_dev(nvs, vxlan);
- spin_unlock(&vn->sock_lock);
-
- dev_put(vxlan->dev);
-}
-
static int vxlan_newlink(struct net *src_net, struct net_device *dev,
struct nlattr *tb[], struct nlattr *data[])
{
/* Unless IPv6 is explicitly requested, assume IPv4 */
dst->remote_ip.sa.sa_family = AF_INET;
if (data[IFLA_VXLAN_GROUP]) {
- dst->remote_ip.sin.sin_addr.s_addr = nla_get_be32(data[IFLA_VXLAN_GROUP]);
+ dst->remote_ip.sin.sin_addr.s_addr = nla_get_in_addr(data[IFLA_VXLAN_GROUP]);
} else if (data[IFLA_VXLAN_GROUP6]) {
if (!IS_ENABLED(CONFIG_IPV6))
return -EPFNOSUPPORT;
- nla_memcpy(&dst->remote_ip.sin6.sin6_addr, data[IFLA_VXLAN_GROUP6],
- sizeof(struct in6_addr));
+ dst->remote_ip.sin6.sin6_addr = nla_get_in6_addr(data[IFLA_VXLAN_GROUP6]);
dst->remote_ip.sa.sa_family = AF_INET6;
use_ipv6 = true;
}
if (data[IFLA_VXLAN_LOCAL]) {
- vxlan->saddr.sin.sin_addr.s_addr = nla_get_be32(data[IFLA_VXLAN_LOCAL]);
+ vxlan->saddr.sin.sin_addr.s_addr = nla_get_in_addr(data[IFLA_VXLAN_LOCAL]);
vxlan->saddr.sa.sa_family = AF_INET;
} else if (data[IFLA_VXLAN_LOCAL6]) {
if (!IS_ENABLED(CONFIG_IPV6))
return -EPFNOSUPPORT;
/* TODO: respect scope id */
- nla_memcpy(&vxlan->saddr.sin6.sin6_addr, data[IFLA_VXLAN_LOCAL6],
- sizeof(struct in6_addr));
+ vxlan->saddr.sin6.sin6_addr = nla_get_in6_addr(data[IFLA_VXLAN_LOCAL6]);
vxlan->saddr.sa.sa_family = AF_INET6;
use_ipv6 = true;
}
if (!vxlan_addr_any(&dst->remote_ip)) {
if (dst->remote_ip.sa.sa_family == AF_INET) {
- if (nla_put_be32(skb, IFLA_VXLAN_GROUP,
- dst->remote_ip.sin.sin_addr.s_addr))
+ if (nla_put_in_addr(skb, IFLA_VXLAN_GROUP,
+ dst->remote_ip.sin.sin_addr.s_addr))
goto nla_put_failure;
#if IS_ENABLED(CONFIG_IPV6)
} else {
- if (nla_put(skb, IFLA_VXLAN_GROUP6, sizeof(struct in6_addr),
- &dst->remote_ip.sin6.sin6_addr))
+ if (nla_put_in6_addr(skb, IFLA_VXLAN_GROUP6,
+ &dst->remote_ip.sin6.sin6_addr))
goto nla_put_failure;
#endif
}
if (!vxlan_addr_any(&vxlan->saddr)) {
if (vxlan->saddr.sa.sa_family == AF_INET) {
- if (nla_put_be32(skb, IFLA_VXLAN_LOCAL,
- vxlan->saddr.sin.sin_addr.s_addr))
+ if (nla_put_in_addr(skb, IFLA_VXLAN_LOCAL,
+ vxlan->saddr.sin.sin_addr.s_addr))
goto nla_put_failure;
#if IS_ENABLED(CONFIG_IPV6)
} else {
- if (nla_put(skb, IFLA_VXLAN_LOCAL6, sizeof(struct in6_addr),
- &vxlan->saddr.sin6.sin6_addr))
+ if (nla_put_in6_addr(skb, IFLA_VXLAN_LOCAL6,
+ &vxlan->saddr.sin6.sin6_addr))
goto nla_put_failure;
#endif
}
if (unlikely(OPTION_FAST_OPEN_COOKIE & options)) {
struct tcp_fastopen_cookie *foc = opts->fastopen_cookie;
+ u8 *p = (u8 *)ptr;
+ u32 len; /* Fast Open option length */
+
+ if (foc->exp) {
+ len = TCPOLEN_EXP_FASTOPEN_BASE + foc->len;
+ *ptr = htonl((TCPOPT_EXP << 24) | (len << 16) |
+ TCPOPT_FASTOPEN_MAGIC);
+ p += TCPOLEN_EXP_FASTOPEN_BASE;
+ } else {
+ len = TCPOLEN_FASTOPEN_BASE + foc->len;
+ *p++ = TCPOPT_FASTOPEN;
+ *p++ = len;
+ }
- *ptr++ = htonl((TCPOPT_EXP << 24) |
- ((TCPOLEN_EXP_FASTOPEN_BASE + foc->len) << 16) |
- TCPOPT_FASTOPEN_MAGIC);
-
- memcpy(ptr, foc->val, foc->len);
- if ((foc->len & 3) == 2) {
- u8 *align = ((u8 *)ptr) + foc->len;
- align[0] = align[1] = TCPOPT_NOP;
+ memcpy(p, foc->val, foc->len);
+ if ((len & 3) == 2) {
+ p[foc->len] = TCPOPT_NOP;
+ p[foc->len + 1] = TCPOPT_NOP;
}
- ptr += (foc->len + 3) >> 2;
+ ptr += (len + 3) >> 2;
}
}
opts->mss = tcp_advertise_mss(sk);
remaining -= TCPOLEN_MSS_ALIGNED;
- if (likely(sysctl_tcp_timestamps && *md5 == NULL)) {
+ if (likely(sysctl_tcp_timestamps && !*md5)) {
opts->options |= OPTION_TS;
opts->tsval = tcp_skb_timestamp(skb) + tp->tsoffset;
opts->tsecr = tp->rx_opt.ts_recent;
}
if (fastopen && fastopen->cookie.len >= 0) {
- u32 need = TCPOLEN_EXP_FASTOPEN_BASE + fastopen->cookie.len;
+ u32 need = fastopen->cookie.len;
+
+ need += fastopen->cookie.exp ? TCPOLEN_EXP_FASTOPEN_BASE :
+ TCPOLEN_FASTOPEN_BASE;
need = (need + 3) & ~3U; /* Align to 32 bits */
if (remaining >= need) {
opts->options |= OPTION_FAST_OPEN_COOKIE;
opts->fastopen_cookie = &fastopen->cookie;
remaining -= need;
tp->syn_fastopen = 1;
+ tp->syn_fastopen_exp = fastopen->cookie.exp ? 1 : 0;
}
}
struct request_sock *req,
unsigned int mss, struct sk_buff *skb,
struct tcp_out_options *opts,
- struct tcp_md5sig_key **md5,
+ const struct tcp_md5sig_key *md5,
struct tcp_fastopen_cookie *foc)
{
struct inet_request_sock *ireq = inet_rsk(req);
unsigned int remaining = MAX_TCP_OPTION_SPACE;
#ifdef CONFIG_TCP_MD5SIG
- *md5 = tcp_rsk(req)->af_specific->md5_lookup(sk, req);
- if (*md5) {
+ if (md5) {
opts->options |= OPTION_MD5;
remaining -= TCPOLEN_MD5SIG_ALIGNED;
*/
ireq->tstamp_ok &= !ireq->sack_ok;
}
-#else
- *md5 = NULL;
#endif
/* We always send an MSS option. */
remaining -= TCPOLEN_SACKPERM_ALIGNED;
}
if (foc != NULL && foc->len >= 0) {
- u32 need = TCPOLEN_EXP_FASTOPEN_BASE + foc->len;
+ u32 need = foc->len;
+
+ need += foc->exp ? TCPOLEN_EXP_FASTOPEN_BASE :
+ TCPOLEN_FASTOPEN_BASE;
need = (need + 3) & ~3U; /* Align to 32 bits */
if (remaining >= need) {
opts->options |= OPTION_FAST_OPEN_COOKIE;
if (md5) {
sk_nocaps_add(sk, NETIF_F_GSO_MASK);
tp->af_specific->calc_md5_hash(opts.hash_location,
- md5, sk, NULL, skb);
+ md5, sk, skb);
}
#endif
/* Get a new skb... force flag on. */
buff = sk_stream_alloc_skb(sk, nsize, gfp);
- if (buff == NULL)
+ if (!buff)
return -ENOMEM; /* We'll just try again later. */
sk->sk_wmem_queued += buff->truesize;
icsk->icsk_af_ops->net_header_len;
icsk->icsk_mtup.search_low = tcp_mss_to_mtu(sk, net->ipv4.sysctl_tcp_base_mss);
icsk->icsk_mtup.probe_size = 0;
+ if (icsk->icsk_mtup.enabled)
+ icsk->icsk_mtup.probe_timestamp = tcp_time_stamp;
}
EXPORT_SYMBOL(tcp_mtup_init);
return tcp_fragment(sk, skb, len, mss_now, gfp);
buff = sk_stream_alloc_skb(sk, 0, gfp);
- if (unlikely(buff == NULL))
+ if (unlikely(!buff))
return -ENOMEM;
sk->sk_wmem_queued += buff->truesize;
static bool tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb,
bool *is_cwnd_limited, u32 max_segs)
{
- struct tcp_sock *tp = tcp_sk(sk);
const struct inet_connection_sock *icsk = inet_csk(sk);
- u32 send_win, cong_win, limit, in_flight;
+ u32 age, send_win, cong_win, limit, in_flight;
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct skb_mstamp now;
+ struct sk_buff *head;
int win_divisor;
if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
goto send_now;
- if (icsk->icsk_ca_state != TCP_CA_Open)
+ if (!((1 << icsk->icsk_ca_state) & (TCPF_CA_Open | TCPF_CA_CWR)))
goto send_now;
- /* Defer for less than two clock ticks. */
- if (tp->tso_deferred &&
- (((u32)jiffies << 1) >> 1) - (tp->tso_deferred >> 1) > 1)
+ /* Avoid bursty behavior by allowing defer
+ * only if the last write was recent.
+ */
+ if ((s32)(tcp_time_stamp - tp->lsndtime) > 0)
goto send_now;
in_flight = tcp_packets_in_flight(tp);
goto send_now;
}
- /* Ok, it looks like it is advisable to defer.
- * Do not rearm the timer if already set to not break TCP ACK clocking.
- */
- if (!tp->tso_deferred)
- tp->tso_deferred = 1 | (jiffies << 1);
+ head = tcp_write_queue_head(sk);
+ skb_mstamp_get(&now);
+ age = skb_mstamp_us_delta(&now, &head->skb_mstamp);
+ /* If next ACK is likely to come too late (half srtt), do not defer */
+ if (age < (tp->srtt_us >> 4))
+ goto send_now;
+
+ /* Ok, it looks like it is advisable to defer. */
if (cong_win < send_win && cong_win < skb->len)
*is_cwnd_limited = true;
return true;
send_now:
- tp->tso_deferred = 0;
return false;
}
+static inline void tcp_mtu_check_reprobe(struct sock *sk)
+{
+ struct inet_connection_sock *icsk = inet_csk(sk);
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct net *net = sock_net(sk);
+ u32 interval;
+ s32 delta;
+
+ interval = net->ipv4.sysctl_tcp_probe_interval;
+ delta = tcp_time_stamp - icsk->icsk_mtup.probe_timestamp;
+ if (unlikely(delta >= interval * HZ)) {
+ int mss = tcp_current_mss(sk);
+
+ /* Update current search range */
+ icsk->icsk_mtup.probe_size = 0;
+ icsk->icsk_mtup.search_high = tp->rx_opt.mss_clamp +
+ sizeof(struct tcphdr) +
+ icsk->icsk_af_ops->net_header_len;
+ icsk->icsk_mtup.search_low = tcp_mss_to_mtu(sk, mss);
+
+ /* Update probe time stamp */
+ icsk->icsk_mtup.probe_timestamp = tcp_time_stamp;
+ }
+}
+
/* Create a new MTU probe if we are ready.
* MTU probe is regularly attempting to increase the path MTU by
* deliberately sending larger packets. This discovers routing
struct tcp_sock *tp = tcp_sk(sk);
struct inet_connection_sock *icsk = inet_csk(sk);
struct sk_buff *skb, *nskb, *next;
+ struct net *net = sock_net(sk);
int len;
int probe_size;
int size_needed;
int copy;
int mss_now;
+ int interval;
/* Not currently probing/verifying,
* not in recovery,
tp->rx_opt.num_sacks || tp->rx_opt.dsack)
return -1;
- /* Very simple search strategy: just double the MSS. */
+ /* Use binary search for probe_size between tcp_mss_base,
+ * and current mss_clamp. if (search_high - search_low)
+ * smaller than a threshold, backoff from probing.
+ */
mss_now = tcp_current_mss(sk);
- probe_size = 2 * tp->mss_cache;
+ probe_size = tcp_mtu_to_mss(sk, (icsk->icsk_mtup.search_high +
+ icsk->icsk_mtup.search_low) >> 1);
size_needed = probe_size + (tp->reordering + 1) * tp->mss_cache;
- if (probe_size > tcp_mtu_to_mss(sk, icsk->icsk_mtup.search_high)) {
- /* TODO: set timer for probe_converge_event */
+ interval = icsk->icsk_mtup.search_high - icsk->icsk_mtup.search_low;
+ /* When misfortune happens, we are reprobing actively,
+ * and then reprobe timer has expired. We stick with current
+ * probing process by not resetting search range to its orignal.
+ */
+ if (probe_size > tcp_mtu_to_mss(sk, icsk->icsk_mtup.search_high) ||
+ interval < net->ipv4.sysctl_tcp_probe_threshold) {
+ /* Check whether enough time has elaplased for
+ * another round of probing.
+ */
+ tcp_mtu_check_reprobe(sk);
return -1;
}
}
/* We're allowed to probe. Build it now. */
- if ((nskb = sk_stream_alloc_skb(sk, probe_size, GFP_ATOMIC)) == NULL)
+ nskb = sk_stream_alloc_skb(sk, probe_size, GFP_ATOMIC);
+ if (!nskb)
return -1;
sk->sk_wmem_queued += nskb->truesize;
sk_mem_charge(sk, nskb->truesize);
int mss = tcp_current_mss(sk);
int err = -1;
- if (tcp_send_head(sk) != NULL) {
+ if (tcp_send_head(sk)) {
err = tcp_write_xmit(sk, mss, TCP_NAGLE_OFF, 2, GFP_ATOMIC);
goto rearm_timer;
}
if (skb == tcp_send_head(sk))
break;
/* we could do better than to assign each time */
- if (hole == NULL)
+ if (!hole)
tp->retransmit_skb_hint = skb;
/* Assume this retransmit will generate
if (!tcp_can_forward_retransmit(sk))
break;
/* Backtrack if necessary to non-L'ed skb */
- if (hole != NULL) {
+ if (hole) {
skb = hole;
hole = NULL;
}
goto begin_fwd;
} else if (!(sacked & TCPCB_LOST)) {
- if (hole == NULL && !(sacked & (TCPCB_SACKED_RETRANS|TCPCB_SACKED_ACKED)))
+ if (!hole && !(sacked & (TCPCB_SACKED_RETRANS|TCPCB_SACKED_ACKED)))
hole = skb;
continue;
*/
mss_now = tcp_current_mss(sk);
- if (tcp_send_head(sk) != NULL) {
+ if (tcp_send_head(sk)) {
TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_FIN;
TCP_SKB_CB(skb)->end_seq++;
tp->write_seq++;
struct sk_buff *skb;
skb = tcp_write_queue_head(sk);
- if (skb == NULL || !(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)) {
+ if (!skb || !(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)) {
pr_debug("%s: wrong queue state\n", __func__);
return -EFAULT;
}
if (!(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_ACK)) {
if (skb_cloned(skb)) {
struct sk_buff *nskb = skb_copy(skb, GFP_ATOMIC);
- if (nskb == NULL)
+ if (!nskb)
return -ENOMEM;
tcp_unlink_write_queue(skb, sk);
__skb_header_release(nskb);
struct tcp_sock *tp = tcp_sk(sk);
struct tcphdr *th;
struct sk_buff *skb;
- struct tcp_md5sig_key *md5;
+ struct tcp_md5sig_key *md5 = NULL;
int tcp_header_size;
int mss;
skb_reserve(skb, MAX_TCP_HEADER);
skb_dst_set(skb, dst);
- security_skb_owned_by(skb, sk);
mss = dst_metric_advmss(dst);
if (tp->rx_opt.user_mss && tp->rx_opt.user_mss < mss)
else
#endif
skb_mstamp_get(&skb->skb_mstamp);
- tcp_header_size = tcp_synack_options(sk, req, mss, skb, &opts, &md5,
+
+#ifdef CONFIG_TCP_MD5SIG
+ rcu_read_lock();
+ md5 = tcp_rsk(req)->af_specific->req_md5_lookup(sk, req_to_sk(req));
+#endif
+ tcp_header_size = tcp_synack_options(sk, req, mss, skb, &opts, md5,
foc) + sizeof(*th);
skb_push(skb, tcp_header_size);
#ifdef CONFIG_TCP_MD5SIG
/* Okay, we have all we need - do the md5 hash if needed */
- if (md5) {
+ if (md5)
tcp_rsk(req)->af_specific->calc_md5_hash(opts.hash_location,
- md5, NULL, req, skb);
- }
+ md5, req_to_sk(req), skb);
+ rcu_read_unlock();
#endif
+ /* Do not fool tcpdump (if any), clean our debris */
+ skb->tstamp.tv64 = 0;
return skb;
}
EXPORT_SYMBOL(tcp_make_synack);
(sysctl_tcp_timestamps ? TCPOLEN_TSTAMP_ALIGNED : 0);
#ifdef CONFIG_TCP_MD5SIG
- if (tp->af_specific->md5_lookup(sk, sk) != NULL)
+ if (tp->af_specific->md5_lookup(sk, sk))
tp->tcp_header_len += TCPOLEN_MD5SIG_ALIGNED;
#endif
* sock.
*/
buff = alloc_skb(MAX_TCP_HEADER, sk_gfp_atomic(sk, GFP_ATOMIC));
- if (buff == NULL) {
+ if (!buff) {
inet_csk_schedule_ack(sk);
inet_csk(sk)->icsk_ack.ato = TCP_ATO_MIN;
inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK,
/* We don't queue it, tcp_transmit_skb() sets ownership. */
skb = alloc_skb(MAX_TCP_HEADER, sk_gfp_atomic(sk, GFP_ATOMIC));
- if (skb == NULL)
+ if (!skb)
return -1;
/* Reserve space for headers and set control bits. */
if (sk->sk_state == TCP_CLOSE)
return -1;
- if ((skb = tcp_send_head(sk)) != NULL &&
- before(TCP_SKB_CB(skb)->seq, tcp_wnd_end(tp))) {
+ skb = tcp_send_head(sk);
+ if (skb && before(TCP_SKB_CB(skb)->seq, tcp_wnd_end(tp))) {
int err;
unsigned int mss = tcp_current_mss(sk);
unsigned int seg_size = tcp_wnd_end(tp) - TCP_SKB_CB(skb)->seq;