From: David S. Miller <davem@davemloft.net>
Date: Fri, 23 Oct 2015 13:58:09 +0000 (-0700)
Subject: Merge branch 'master' of git://git.kernel.org/pub/scm/linux/kernel/git/jkirsher/next... 
X-Git-Tag: firefly_0821_release~176^2~818^2~76
X-Git-Url: http://plrg.eecs.uci.edu/git/?a=commitdiff_plain;h=bf7958607d0b792e0c43482c0c78786023a69832;hp=8443c1a4b192089e62642d847ebac3e4d15134c3;p=firefly-linux-kernel-4.4.55.git

Merge branch 'master' of git://git./linux/kernel/git/jkirsher/next-queue

Jeff Kirsher says:

====================
Intel Wired LAN Driver Updates 2015-10-23

This series contains updates to i40e, i40evf, if_link, ixgbe and ixgbevf.

Anjali adds a workaround to drop any flow control frames from being
transmitted from any VSI, so that a malicious VF cannot send flow control
or PFC packets out on the wire.  Also fixed a bug in debugfs by grabbing
the filter list lock before adding or deleting a filter.

Akeem fixes an issue where we were unconditionally returning VEB bridge
mode before allowing LB in the add VSI routine, resolve by checking if
the bridge is actually in VEB mode first.

Mitch fixed an issue where the incorrect structure was being used for
VLAN filter list, which meant the VLAN filter list did not get
processed correctly and VLAN filters would not be re-enabled after any
kind of reset.

Helin fixed a problem of possibly getting inconsistent flow control
status after a PF reset.  The issue was requested_mode was being set
with a default value during probe, but the hardware state could be a
different value from this mode.

Carolyn fixed a problem where the driver output of the OEM version
string varied from the other tools.

Jean Sacren fixes up kernel documentation by fixing function header
comments to match actual variables used in the functions.  Also
cleaned up variable initialization, when the variable would be
over-written immediately.

Hiroshi Shimanoto provides three patches to add "trusted" VF by adding
netlink directives and an NDO entry.  Then implement these new controls
in ixgbe and ixgbevf.  This series has gone through several iterations
to address all the suggested community changes and concerns.
====================

Signed-off-by: David S. Miller <davem@davemloft.net>
---

diff --git a/drivers/net/dsa/mv88e6060.c b/drivers/net/dsa/mv88e6060.c
index c29aebe1e62b..9093577755f6 100644
--- a/drivers/net/dsa/mv88e6060.c
+++ b/drivers/net/dsa/mv88e6060.c
@@ -26,7 +26,7 @@ static int reg_read(struct dsa_switch *ds, int addr, int reg)
 	if (bus == NULL)
 		return -EINVAL;
 
-	return mdiobus_read(bus, ds->pd->sw_addr + addr, reg);
+	return mdiobus_read_nested(bus, ds->pd->sw_addr + addr, reg);
 }
 
 #define REG_READ(addr, reg)					\
@@ -47,7 +47,7 @@ static int reg_write(struct dsa_switch *ds, int addr, int reg, u16 val)
 	if (bus == NULL)
 		return -EINVAL;
 
-	return mdiobus_write(bus, ds->pd->sw_addr + addr, reg, val);
+	return mdiobus_write_nested(bus, ds->pd->sw_addr + addr, reg, val);
 }
 
 #define REG_WRITE(addr, reg, val)				\
diff --git a/drivers/net/dsa/mv88e6xxx.c b/drivers/net/dsa/mv88e6xxx.c
index 7af96309bc07..b1b14f519d8b 100644
--- a/drivers/net/dsa/mv88e6xxx.c
+++ b/drivers/net/dsa/mv88e6xxx.c
@@ -24,34 +24,6 @@
 #include <net/switchdev.h>
 #include "mv88e6xxx.h"
 
-/* MDIO bus access can be nested in the case of PHYs connected to the
- * internal MDIO bus of the switch, which is accessed via MDIO bus of
- * the Ethernet interface. Avoid lockdep false positives by using
- * mutex_lock_nested().
- */
-static int mv88e6xxx_mdiobus_read(struct mii_bus *bus, int addr, u32 regnum)
-{
-	int ret;
-
-	mutex_lock_nested(&bus->mdio_lock, SINGLE_DEPTH_NESTING);
-	ret = bus->read(bus, addr, regnum);
-	mutex_unlock(&bus->mdio_lock);
-
-	return ret;
-}
-
-static int mv88e6xxx_mdiobus_write(struct mii_bus *bus, int addr, u32 regnum,
-				   u16 val)
-{
-	int ret;
-
-	mutex_lock_nested(&bus->mdio_lock, SINGLE_DEPTH_NESTING);
-	ret = bus->write(bus, addr, regnum, val);
-	mutex_unlock(&bus->mdio_lock);
-
-	return ret;
-}
-
 /* If the switch's ADDR[4:0] strap pins are strapped to zero, it will
  * use all 32 SMI bus addresses on its SMI bus, and all switch registers
  * will be directly accessible on some {device address,register address}
@@ -66,7 +38,7 @@ static int mv88e6xxx_reg_wait_ready(struct mii_bus *bus, int sw_addr)
 	int i;
 
 	for (i = 0; i < 16; i++) {
-		ret = mv88e6xxx_mdiobus_read(bus, sw_addr, SMI_CMD);
+		ret = mdiobus_read_nested(bus, sw_addr, SMI_CMD);
 		if (ret < 0)
 			return ret;
 
@@ -82,7 +54,7 @@ int __mv88e6xxx_reg_read(struct mii_bus *bus, int sw_addr, int addr, int reg)
 	int ret;
 
 	if (sw_addr == 0)
-		return mv88e6xxx_mdiobus_read(bus, addr, reg);
+		return mdiobus_read_nested(bus, addr, reg);
 
 	/* Wait for the bus to become free. */
 	ret = mv88e6xxx_reg_wait_ready(bus, sw_addr);
@@ -90,8 +62,8 @@ int __mv88e6xxx_reg_read(struct mii_bus *bus, int sw_addr, int addr, int reg)
 		return ret;
 
 	/* Transmit the read command. */
-	ret = mv88e6xxx_mdiobus_write(bus, sw_addr, SMI_CMD,
-				      SMI_CMD_OP_22_READ | (addr << 5) | reg);
+	ret = mdiobus_write_nested(bus, sw_addr, SMI_CMD,
+				   SMI_CMD_OP_22_READ | (addr << 5) | reg);
 	if (ret < 0)
 		return ret;
 
@@ -101,7 +73,7 @@ int __mv88e6xxx_reg_read(struct mii_bus *bus, int sw_addr, int addr, int reg)
 		return ret;
 
 	/* Read the data. */
-	ret = mv88e6xxx_mdiobus_read(bus, sw_addr, SMI_DATA);
+	ret = mdiobus_read_nested(bus, sw_addr, SMI_DATA);
 	if (ret < 0)
 		return ret;
 
@@ -145,7 +117,7 @@ int __mv88e6xxx_reg_write(struct mii_bus *bus, int sw_addr, int addr,
 	int ret;
 
 	if (sw_addr == 0)
-		return mv88e6xxx_mdiobus_write(bus, addr, reg, val);
+		return mdiobus_write_nested(bus, addr, reg, val);
 
 	/* Wait for the bus to become free. */
 	ret = mv88e6xxx_reg_wait_ready(bus, sw_addr);
@@ -153,13 +125,13 @@ int __mv88e6xxx_reg_write(struct mii_bus *bus, int sw_addr, int addr,
 		return ret;
 
 	/* Transmit the data to write. */
-	ret = mv88e6xxx_mdiobus_write(bus, sw_addr, SMI_DATA, val);
+	ret = mdiobus_write_nested(bus, sw_addr, SMI_DATA, val);
 	if (ret < 0)
 		return ret;
 
 	/* Transmit the write command. */
-	ret = mv88e6xxx_mdiobus_write(bus, sw_addr, SMI_CMD,
-				      SMI_CMD_OP_22_WRITE | (addr << 5) | reg);
+	ret = mdiobus_write_nested(bus, sw_addr, SMI_CMD,
+				   SMI_CMD_OP_22_WRITE | (addr << 5) | reg);
 	if (ret < 0)
 		return ret;
 
diff --git a/drivers/net/ethernet/ti/cpsw.c b/drivers/net/ethernet/ti/cpsw.c
index 3b75adfb3f37..55d2d8577d07 100644
--- a/drivers/net/ethernet/ti/cpsw.c
+++ b/drivers/net/ethernet/ti/cpsw.c
@@ -2583,17 +2583,7 @@ static struct platform_driver cpsw_driver = {
 	.remove = cpsw_remove,
 };
 
-static int __init cpsw_init(void)
-{
-	return platform_driver_register(&cpsw_driver);
-}
-late_initcall(cpsw_init);
-
-static void __exit cpsw_exit(void)
-{
-	platform_driver_unregister(&cpsw_driver);
-}
-module_exit(cpsw_exit);
+module_platform_driver(cpsw_driver);
 
 MODULE_LICENSE("GPL");
 MODULE_AUTHOR("Cyril Chemparathy <cyril@ti.com>");
diff --git a/drivers/net/phy/mdio_bus.c b/drivers/net/phy/mdio_bus.c
index 12f44c53cc8e..88cb4592b6fb 100644
--- a/drivers/net/phy/mdio_bus.c
+++ b/drivers/net/phy/mdio_bus.c
@@ -371,6 +371,33 @@ struct phy_device *mdiobus_scan(struct mii_bus *bus, int addr)
 }
 EXPORT_SYMBOL(mdiobus_scan);
 
+/**
+ * mdiobus_read_nested - Nested version of the mdiobus_read function
+ * @bus: the mii_bus struct
+ * @addr: the phy address
+ * @regnum: register number to read
+ *
+ * In case of nested MDIO bus access avoid lockdep false positives by
+ * using mutex_lock_nested().
+ *
+ * NOTE: MUST NOT be called from interrupt context,
+ * because the bus read/write functions may wait for an interrupt
+ * to conclude the operation.
+ */
+int mdiobus_read_nested(struct mii_bus *bus, int addr, u32 regnum)
+{
+	int retval;
+
+	BUG_ON(in_interrupt());
+
+	mutex_lock_nested(&bus->mdio_lock, SINGLE_DEPTH_NESTING);
+	retval = bus->read(bus, addr, regnum);
+	mutex_unlock(&bus->mdio_lock);
+
+	return retval;
+}
+EXPORT_SYMBOL(mdiobus_read_nested);
+
 /**
  * mdiobus_read - Convenience function for reading a given MII mgmt register
  * @bus: the mii_bus struct
@@ -395,6 +422,34 @@ int mdiobus_read(struct mii_bus *bus, int addr, u32 regnum)
 }
 EXPORT_SYMBOL(mdiobus_read);
 
+/**
+ * mdiobus_write_nested - Nested version of the mdiobus_write function
+ * @bus: the mii_bus struct
+ * @addr: the phy address
+ * @regnum: register number to write
+ * @val: value to write to @regnum
+ *
+ * In case of nested MDIO bus access avoid lockdep false positives by
+ * using mutex_lock_nested().
+ *
+ * NOTE: MUST NOT be called from interrupt context,
+ * because the bus read/write functions may wait for an interrupt
+ * to conclude the operation.
+ */
+int mdiobus_write_nested(struct mii_bus *bus, int addr, u32 regnum, u16 val)
+{
+	int err;
+
+	BUG_ON(in_interrupt());
+
+	mutex_lock_nested(&bus->mdio_lock, SINGLE_DEPTH_NESTING);
+	err = bus->write(bus, addr, regnum, val);
+	mutex_unlock(&bus->mdio_lock);
+
+	return err;
+}
+EXPORT_SYMBOL(mdiobus_write_nested);
+
 /**
  * mdiobus_write - Convenience function for writing a given MII mgmt register
  * @bus: the mii_bus struct
diff --git a/include/linux/phy.h b/include/linux/phy.h
index 4c477e6ece33..05fde31b6dc6 100644
--- a/include/linux/phy.h
+++ b/include/linux/phy.h
@@ -213,7 +213,9 @@ static inline struct mii_bus *devm_mdiobus_alloc(struct device *dev)
 void devm_mdiobus_free(struct device *dev, struct mii_bus *bus);
 struct phy_device *mdiobus_scan(struct mii_bus *bus, int addr);
 int mdiobus_read(struct mii_bus *bus, int addr, u32 regnum);
+int mdiobus_read_nested(struct mii_bus *bus, int addr, u32 regnum);
 int mdiobus_write(struct mii_bus *bus, int addr, u32 regnum, u16 val);
+int mdiobus_write_nested(struct mii_bus *bus, int addr, u32 regnum, u16 val);
 
 
 #define PHY_INTERRUPT_DISABLED	0x0
diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h
index 63615709839d..481fe1c9044c 100644
--- a/include/net/inet_connection_sock.h
+++ b/include/net/inet_connection_sock.h
@@ -43,7 +43,9 @@ struct inet_connection_sock_af_ops {
 	int	    (*conn_request)(struct sock *sk, struct sk_buff *skb);
 	struct sock *(*syn_recv_sock)(const struct sock *sk, struct sk_buff *skb,
 				      struct request_sock *req,
-				      struct dst_entry *dst);
+				      struct dst_entry *dst,
+				      struct request_sock *req_unhash,
+				      bool *own_req);
 	u16	    net_header_len;
 	u16	    net_frag_header_len;
 	u16	    sockaddr_len;
@@ -272,6 +274,9 @@ void inet_csk_reqsk_queue_add(struct sock *sk, struct request_sock *req,
 			      struct sock *child);
 void inet_csk_reqsk_queue_hash_add(struct sock *sk, struct request_sock *req,
 				   unsigned long timeout);
+struct sock *inet_csk_complete_hashdance(struct sock *sk, struct sock *child,
+					 struct request_sock *req,
+					 bool own_req);
 
 static inline void inet_csk_reqsk_queue_added(struct sock *sk)
 {
diff --git a/include/net/inet_hashtables.h b/include/net/inet_hashtables.h
index 6683ada25fef..de2e3ade6102 100644
--- a/include/net/inet_hashtables.h
+++ b/include/net/inet_hashtables.h
@@ -205,8 +205,8 @@ void inet_put_port(struct sock *sk);
 
 void inet_hashinfo_init(struct inet_hashinfo *h);
 
-int inet_ehash_insert(struct sock *sk, struct sock *osk);
-void __inet_hash_nolisten(struct sock *sk, struct sock *osk);
+bool inet_ehash_insert(struct sock *sk, struct sock *osk);
+bool inet_ehash_nolisten(struct sock *sk, struct sock *osk);
 void __inet_hash(struct sock *sk, struct sock *osk);
 void inet_hash(struct sock *sk);
 void inet_unhash(struct sock *sk);
diff --git a/include/net/mpls_iptunnel.h b/include/net/mpls_iptunnel.h
index 4757997f76ed..179253f9dcfd 100644
--- a/include/net/mpls_iptunnel.h
+++ b/include/net/mpls_iptunnel.h
@@ -18,7 +18,7 @@
 
 struct mpls_iptunnel_encap {
 	u32	label[MAX_NEW_LABELS];
-	u32	labels;
+	u8	labels;
 };
 
 static inline struct mpls_iptunnel_encap *mpls_lwtunnel_encap(struct lwtunnel_state *lwtstate)
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 11e320412216..f80e74c5ad18 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -457,7 +457,9 @@ struct sock *tcp_create_openreq_child(const struct sock *sk,
 void tcp_ca_openreq_child(struct sock *sk, const struct dst_entry *dst);
 struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
 				  struct request_sock *req,
-				  struct dst_entry *dst);
+				  struct dst_entry *dst,
+				  struct request_sock *req_unhash,
+				  bool *own_req);
 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb);
 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len);
 int tcp_connect(struct sock *sk);
diff --git a/net/dccp/dccp.h b/net/dccp/dccp.h
index 923f5a180134..b0e28d24e1a7 100644
--- a/net/dccp/dccp.h
+++ b/net/dccp/dccp.h
@@ -278,7 +278,9 @@ int dccp_v4_do_rcv(struct sock *sk, struct sk_buff *skb);
 
 struct sock *dccp_v4_request_recv_sock(const struct sock *sk, struct sk_buff *skb,
 				       struct request_sock *req,
-				       struct dst_entry *dst);
+				       struct dst_entry *dst,
+				       struct request_sock *req_unhash,
+				       bool *own_req);
 struct sock *dccp_check_req(struct sock *sk, struct sk_buff *skb,
 			    struct request_sock *req);
 
diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c
index 59bc180b02d8..5684e14932bd 100644
--- a/net/dccp/ipv4.c
+++ b/net/dccp/ipv4.c
@@ -393,7 +393,9 @@ static inline u64 dccp_v4_init_sequence(const struct sk_buff *skb)
 struct sock *dccp_v4_request_recv_sock(const struct sock *sk,
 				       struct sk_buff *skb,
 				       struct request_sock *req,
-				       struct dst_entry *dst)
+				       struct dst_entry *dst,
+				       struct request_sock *req_unhash,
+				       bool *own_req)
 {
 	struct inet_request_sock *ireq;
 	struct inet_sock *newinet;
@@ -426,7 +428,7 @@ struct sock *dccp_v4_request_recv_sock(const struct sock *sk,
 
 	if (__inet_inherit_port(sk, newsk) < 0)
 		goto put_and_exit;
-	__inet_hash_nolisten(newsk, NULL);
+	*own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash));
 
 	return newsk;
 
diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c
index d9cc731f2619..ef4e48ce9143 100644
--- a/net/dccp/ipv6.c
+++ b/net/dccp/ipv6.c
@@ -380,7 +380,9 @@ drop:
 static struct sock *dccp_v6_request_recv_sock(const struct sock *sk,
 					      struct sk_buff *skb,
 					      struct request_sock *req,
-					      struct dst_entry *dst)
+					      struct dst_entry *dst,
+					      struct request_sock *req_unhash,
+					      bool *own_req)
 {
 	struct inet_request_sock *ireq = inet_rsk(req);
 	struct ipv6_pinfo *newnp;
@@ -393,7 +395,8 @@ static struct sock *dccp_v6_request_recv_sock(const struct sock *sk,
 		/*
 		 *	v6 mapped
 		 */
-		newsk = dccp_v4_request_recv_sock(sk, skb, req, dst);
+		newsk = dccp_v4_request_recv_sock(sk, skb, req, dst,
+						  req_unhash, own_req);
 		if (newsk == NULL)
 			return NULL;
 
@@ -511,7 +514,7 @@ static struct sock *dccp_v6_request_recv_sock(const struct sock *sk,
 		dccp_done(newsk);
 		goto out;
 	}
-	__inet_hash(newsk, NULL);
+	*own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash));
 
 	return newsk;
 
diff --git a/net/dccp/minisocks.c b/net/dccp/minisocks.c
index d10aace43672..1994f8af646b 100644
--- a/net/dccp/minisocks.c
+++ b/net/dccp/minisocks.c
@@ -143,6 +143,7 @@ struct sock *dccp_check_req(struct sock *sk, struct sk_buff *skb,
 {
 	struct sock *child = NULL;
 	struct dccp_request_sock *dreq = dccp_rsk(req);
+	bool own_req;
 
 	/* Check for retransmitted REQUEST */
 	if (dccp_hdr(skb)->dccph_type == DCCP_PKT_REQUEST) {
@@ -182,14 +183,13 @@ struct sock *dccp_check_req(struct sock *sk, struct sk_buff *skb,
 	if (dccp_parse_options(sk, dreq, skb))
 		 goto drop;
 
-	child = inet_csk(sk)->icsk_af_ops->syn_recv_sock(sk, skb, req, NULL);
-	if (child == NULL)
+	child = inet_csk(sk)->icsk_af_ops->syn_recv_sock(sk, skb, req, NULL,
+							 req, &own_req);
+	if (!child)
 		goto listen_overflow;
 
-	inet_csk_reqsk_queue_drop(sk, req);
-	inet_csk_reqsk_queue_add(sk, req, child);
-out:
-	return child;
+	return inet_csk_complete_hashdance(sk, child, req, own_req);
+
 listen_overflow:
 	dccp_pr_debug("listen_overflow!\n");
 	DCCP_SKB_CB(skb)->dccpd_reset_code = DCCP_RESET_CODE_TOO_BUSY;
@@ -198,7 +198,7 @@ drop:
 		req->rsk_ops->send_reset(sk, skb);
 
 	inet_csk_reqsk_queue_drop(sk, req);
-	goto out;
+	return NULL;
 }
 
 EXPORT_SYMBOL_GPL(dccp_check_req);
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index 8430bc8ccd58..1feb15f23de8 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -523,15 +523,15 @@ static bool reqsk_queue_unlink(struct request_sock_queue *queue,
 			       struct request_sock *req)
 {
 	struct inet_hashinfo *hashinfo = req_to_sk(req)->sk_prot->h.hashinfo;
-	spinlock_t *lock;
-	bool found;
+	bool found = false;
 
-	lock = inet_ehash_lockp(hashinfo, req->rsk_hash);
-
-	spin_lock(lock);
-	found = __sk_nulls_del_node_init_rcu(req_to_sk(req));
-	spin_unlock(lock);
+	if (sk_hashed(req_to_sk(req))) {
+		spinlock_t *lock = inet_ehash_lockp(hashinfo, req->rsk_hash);
 
+		spin_lock(lock);
+		found = __sk_nulls_del_node_init_rcu(req_to_sk(req));
+		spin_unlock(lock);
+	}
 	if (timer_pending(&req->rsk_timer) && del_timer_sync(&req->rsk_timer))
 		reqsk_put(req);
 	return found;
@@ -811,6 +811,25 @@ void inet_csk_reqsk_queue_add(struct sock *sk, struct request_sock *req,
 }
 EXPORT_SYMBOL(inet_csk_reqsk_queue_add);
 
+struct sock *inet_csk_complete_hashdance(struct sock *sk, struct sock *child,
+					 struct request_sock *req, bool own_req)
+{
+	if (own_req) {
+		inet_csk_reqsk_queue_drop(sk, req);
+		reqsk_queue_removed(&inet_csk(sk)->icsk_accept_queue, req);
+		inet_csk_reqsk_queue_add(sk, req, child);
+		/* Warning: caller must not call reqsk_put(req);
+		 * child stole last reference on it.
+		 */
+		return child;
+	}
+	/* Too bad, another child took ownership of the request, undo. */
+	bh_unlock_sock(child);
+	sock_put(child);
+	return NULL;
+}
+EXPORT_SYMBOL(inet_csk_complete_hashdance);
+
 /*
  *	This routine closes sockets which have been at least partially
  *	opened, but not yet accepted.
diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c
index 958728a22001..ccc5980797fc 100644
--- a/net/ipv4/inet_hashtables.c
+++ b/net/ipv4/inet_hashtables.c
@@ -407,13 +407,13 @@ static u32 inet_sk_port_offset(const struct sock *sk)
 /* insert a socket into ehash, and eventually remove another one
  * (The another one can be a SYN_RECV or TIMEWAIT
  */
-int inet_ehash_insert(struct sock *sk, struct sock *osk)
+bool inet_ehash_insert(struct sock *sk, struct sock *osk)
 {
 	struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo;
 	struct hlist_nulls_head *list;
 	struct inet_ehash_bucket *head;
 	spinlock_t *lock;
-	int ret = 0;
+	bool ret = true;
 
 	WARN_ON_ONCE(!sk_unhashed(sk));
 
@@ -423,30 +423,41 @@ int inet_ehash_insert(struct sock *sk, struct sock *osk)
 	lock = inet_ehash_lockp(hashinfo, sk->sk_hash);
 
 	spin_lock(lock);
-	__sk_nulls_add_node_rcu(sk, list);
 	if (osk) {
-		WARN_ON(sk->sk_hash != osk->sk_hash);
-		sk_nulls_del_node_init_rcu(osk);
+		WARN_ON_ONCE(sk->sk_hash != osk->sk_hash);
+		ret = sk_nulls_del_node_init_rcu(osk);
 	}
+	if (ret)
+		__sk_nulls_add_node_rcu(sk, list);
 	spin_unlock(lock);
 	return ret;
 }
 
-void __inet_hash_nolisten(struct sock *sk, struct sock *osk)
+bool inet_ehash_nolisten(struct sock *sk, struct sock *osk)
 {
-	inet_ehash_insert(sk, osk);
-	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
+	bool ok = inet_ehash_insert(sk, osk);
+
+	if (ok) {
+		sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
+	} else {
+		percpu_counter_inc(sk->sk_prot->orphan_count);
+		sk->sk_state = TCP_CLOSE;
+		sock_set_flag(sk, SOCK_DEAD);
+		inet_csk_destroy_sock(sk);
+	}
+	return ok;
 }
-EXPORT_SYMBOL_GPL(__inet_hash_nolisten);
+EXPORT_SYMBOL_GPL(inet_ehash_nolisten);
 
 void __inet_hash(struct sock *sk, struct sock *osk)
 {
 	struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo;
 	struct inet_listen_hashbucket *ilb;
 
-	if (sk->sk_state != TCP_LISTEN)
-		return __inet_hash_nolisten(sk, osk);
-
+	if (sk->sk_state != TCP_LISTEN) {
+		inet_ehash_nolisten(sk, osk);
+		return;
+	}
 	WARN_ON(!sk_unhashed(sk));
 	ilb = &hashinfo->listening_hash[inet_sk_listen_hashfn(sk)];
 
@@ -567,7 +578,7 @@ ok:
 		inet_bind_hash(sk, tb, port);
 		if (sk_unhashed(sk)) {
 			inet_sk(sk)->inet_sport = htons(port);
-			__inet_hash_nolisten(sk, (struct sock *)tw);
+			inet_ehash_nolisten(sk, (struct sock *)tw);
 		}
 		if (tw)
 			inet_twsk_bind_unhash(tw, hinfo);
@@ -584,7 +595,7 @@ ok:
 	tb  = inet_csk(sk)->icsk_bind_hash;
 	spin_lock_bh(&head->lock);
 	if (sk_head(&tb->owners) == sk && !sk->sk_bind_node.next) {
-		__inet_hash_nolisten(sk, NULL);
+		inet_ehash_nolisten(sk, NULL);
 		spin_unlock_bh(&head->lock);
 		return 0;
 	} else {
diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c
index 4c0892badb8b..4cbe9f0a4281 100644
--- a/net/ipv4/syncookies.c
+++ b/net/ipv4/syncookies.c
@@ -221,8 +221,10 @@ struct sock *tcp_get_cookie_sock(struct sock *sk, struct sk_buff *skb,
 {
 	struct inet_connection_sock *icsk = inet_csk(sk);
 	struct sock *child;
+	bool own_req;
 
-	child = icsk->icsk_af_ops->syn_recv_sock(sk, skb, req, dst);
+	child = icsk->icsk_af_ops->syn_recv_sock(sk, skb, req, dst,
+						 NULL, &own_req);
 	if (child) {
 		atomic_set(&req->rsk_refcnt, 1);
 		sock_rps_save_rxhash(child, skb);
diff --git a/net/ipv4/tcp_fastopen.c b/net/ipv4/tcp_fastopen.c
index 93396bf7b475..55be6ac70cff 100644
--- a/net/ipv4/tcp_fastopen.c
+++ b/net/ipv4/tcp_fastopen.c
@@ -133,12 +133,14 @@ static struct sock *tcp_fastopen_create_child(struct sock *sk,
 	struct request_sock_queue *queue = &inet_csk(sk)->icsk_accept_queue;
 	struct sock *child;
 	u32 end_seq;
+	bool own_req;
 
 	req->num_retrans = 0;
 	req->num_timeout = 0;
 	req->sk = NULL;
 
-	child = inet_csk(sk)->icsk_af_ops->syn_recv_sock(sk, skb, req, NULL);
+	child = inet_csk(sk)->icsk_af_ops->syn_recv_sock(sk, skb, req, NULL,
+							 NULL, &own_req);
 	if (!child)
 		return NULL;
 
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 30dd45c1f568..1c2648bbac4b 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -1247,7 +1247,9 @@ EXPORT_SYMBOL(tcp_v4_conn_request);
  */
 struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
 				  struct request_sock *req,
-				  struct dst_entry *dst)
+				  struct dst_entry *dst,
+				  struct request_sock *req_unhash,
+				  bool *own_req)
 {
 	struct inet_request_sock *ireq;
 	struct inet_sock *newinet;
@@ -1323,7 +1325,7 @@ struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
 
 	if (__inet_inherit_port(sk, newsk) < 0)
 		goto put_and_exit;
-	__inet_hash_nolisten(newsk, NULL);
+	*own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash));
 
 	return newsk;
 
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index 1fd5d413a664..3575dd1e5b67 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -580,6 +580,7 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
 	const struct tcphdr *th = tcp_hdr(skb);
 	__be32 flg = tcp_flag_word(th) & (TCP_FLAG_RST|TCP_FLAG_SYN|TCP_FLAG_ACK);
 	bool paws_reject = false;
+	bool own_req;
 
 	tmp_opt.saw_tstamp = 0;
 	if (th->doff > (sizeof(struct tcphdr)>>2)) {
@@ -767,18 +768,14 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
 	 * ESTABLISHED STATE. If it will be dropped after
 	 * socket is created, wait for troubles.
 	 */
-	child = inet_csk(sk)->icsk_af_ops->syn_recv_sock(sk, skb, req, NULL);
+	child = inet_csk(sk)->icsk_af_ops->syn_recv_sock(sk, skb, req, NULL,
+							 req, &own_req);
 	if (!child)
 		goto listen_overflow;
 
 	sock_rps_save_rxhash(child, skb);
 	tcp_synack_rtt_meas(child, req);
-	inet_csk_reqsk_queue_drop(sk, req);
-	inet_csk_reqsk_queue_add(sk, req, child);
-	/* Warning: caller must not call reqsk_put(req);
-	 * child stole last reference on it.
-	 */
-	return child;
+	return inet_csk_complete_hashdance(sk, child, req, own_req);
 
 listen_overflow:
 	if (!sysctl_tcp_abort_on_overflow) {
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index f495d189f5e0..714bc5ad096e 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -965,7 +965,9 @@ drop:
 
 static struct sock *tcp_v6_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
 					 struct request_sock *req,
-					 struct dst_entry *dst)
+					 struct dst_entry *dst,
+					 struct request_sock *req_unhash,
+					 bool *own_req)
 {
 	struct inet_request_sock *ireq;
 	struct ipv6_pinfo *newnp;
@@ -984,7 +986,8 @@ static struct sock *tcp_v6_syn_recv_sock(const struct sock *sk, struct sk_buff *
 		 *	v6 mapped
 		 */
 
-		newsk = tcp_v4_syn_recv_sock(sk, skb, req, dst);
+		newsk = tcp_v4_syn_recv_sock(sk, skb, req, dst,
+					     req_unhash, own_req);
 
 		if (!newsk)
 			return NULL;
@@ -1145,7 +1148,7 @@ static struct sock *tcp_v6_syn_recv_sock(const struct sock *sk, struct sk_buff *
 		tcp_done(newsk);
 		goto out;
 	}
-	__inet_hash(newsk, NULL);
+	*own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash));
 
 	return newsk;
 
diff --git a/net/mpls/af_mpls.c b/net/mpls/af_mpls.c
index bb185a28de98..cc972e30355b 100644
--- a/net/mpls/af_mpls.c
+++ b/net/mpls/af_mpls.c
@@ -19,36 +19,13 @@
 #include <net/ipv6.h>
 #include <net/addrconf.h>
 #endif
+#include <net/nexthop.h>
 #include "internal.h"
 
-#define LABEL_NOT_SPECIFIED (1<<20)
-#define MAX_NEW_LABELS 2
-
-/* This maximum ha length copied from the definition of struct neighbour */
-#define MAX_VIA_ALEN (ALIGN(MAX_ADDR_LEN, sizeof(unsigned long)))
-
-enum mpls_payload_type {
-	MPT_UNSPEC, /* IPv4 or IPv6 */
-	MPT_IPV4 = 4,
-	MPT_IPV6 = 6,
-
-	/* Other types not implemented:
-	 *  - Pseudo-wire with or without control word (RFC4385)
-	 *  - GAL (RFC5586)
-	 */
-};
-
-struct mpls_route { /* next hop label forwarding entry */
-	struct net_device __rcu *rt_dev;
-	struct rcu_head		rt_rcu;
-	u32			rt_label[MAX_NEW_LABELS];
-	u8			rt_protocol; /* routing protocol that set this entry */
-	u8                      rt_payload_type;
-	u8			rt_labels;
-	u8			rt_via_alen;
-	u8			rt_via_table;
-	u8			rt_via[0];
-};
+/* Maximum number of labels to look ahead at when selecting a path of
+ * a multipath route
+ */
+#define MAX_MP_SELECT_LABELS 4
 
 static int zero = 0;
 static int label_limit = (1 << 20) - 1;
@@ -80,10 +57,10 @@ bool mpls_output_possible(const struct net_device *dev)
 }
 EXPORT_SYMBOL_GPL(mpls_output_possible);
 
-static unsigned int mpls_rt_header_size(const struct mpls_route *rt)
+static unsigned int mpls_nh_header_size(const struct mpls_nh *nh)
 {
 	/* The size of the layer 2.5 labels to be added for this route */
-	return rt->rt_labels * sizeof(struct mpls_shim_hdr);
+	return nh->nh_labels * sizeof(struct mpls_shim_hdr);
 }
 
 unsigned int mpls_dev_mtu(const struct net_device *dev)
@@ -105,6 +82,80 @@ bool mpls_pkt_too_big(const struct sk_buff *skb, unsigned int mtu)
 }
 EXPORT_SYMBOL_GPL(mpls_pkt_too_big);
 
+static struct mpls_nh *mpls_select_multipath(struct mpls_route *rt,
+					     struct sk_buff *skb, bool bos)
+{
+	struct mpls_entry_decoded dec;
+	struct mpls_shim_hdr *hdr;
+	bool eli_seen = false;
+	int label_index;
+	int nh_index = 0;
+	u32 hash = 0;
+
+	/* No need to look further into packet if there's only
+	 * one path
+	 */
+	if (rt->rt_nhn == 1)
+		goto out;
+
+	for (label_index = 0; label_index < MAX_MP_SELECT_LABELS && !bos;
+	     label_index++) {
+		if (!pskb_may_pull(skb, sizeof(*hdr) * label_index))
+			break;
+
+		/* Read and decode the current label */
+		hdr = mpls_hdr(skb) + label_index;
+		dec = mpls_entry_decode(hdr);
+
+		/* RFC6790 - reserved labels MUST NOT be used as keys
+		 * for the load-balancing function
+		 */
+		if (likely(dec.label >= MPLS_LABEL_FIRST_UNRESERVED)) {
+			hash = jhash_1word(dec.label, hash);
+
+			/* The entropy label follows the entropy label
+			 * indicator, so this means that the entropy
+			 * label was just added to the hash - no need to
+			 * go any deeper either in the label stack or in the
+			 * payload
+			 */
+			if (eli_seen)
+				break;
+		} else if (dec.label == MPLS_LABEL_ENTROPY) {
+			eli_seen = true;
+		}
+
+		bos = dec.bos;
+		if (bos && pskb_may_pull(skb, sizeof(*hdr) * label_index +
+					 sizeof(struct iphdr))) {
+			const struct iphdr *v4hdr;
+
+			v4hdr = (const struct iphdr *)(mpls_hdr(skb) +
+						       label_index);
+			if (v4hdr->version == 4) {
+				hash = jhash_3words(ntohl(v4hdr->saddr),
+						    ntohl(v4hdr->daddr),
+						    v4hdr->protocol, hash);
+			} else if (v4hdr->version == 6 &&
+				pskb_may_pull(skb, sizeof(*hdr) * label_index +
+					      sizeof(struct ipv6hdr))) {
+				const struct ipv6hdr *v6hdr;
+
+				v6hdr = (const struct ipv6hdr *)(mpls_hdr(skb) +
+								label_index);
+
+				hash = __ipv6_addr_jhash(&v6hdr->saddr, hash);
+				hash = __ipv6_addr_jhash(&v6hdr->daddr, hash);
+				hash = jhash_1word(v6hdr->nexthdr, hash);
+			}
+		}
+	}
+
+	nh_index = hash % rt->rt_nhn;
+out:
+	return &rt->rt_nh[nh_index];
+}
+
 static bool mpls_egress(struct mpls_route *rt, struct sk_buff *skb,
 			struct mpls_entry_decoded dec)
 {
@@ -159,6 +210,7 @@ static int mpls_forward(struct sk_buff *skb, struct net_device *dev,
 	struct net *net = dev_net(dev);
 	struct mpls_shim_hdr *hdr;
 	struct mpls_route *rt;
+	struct mpls_nh *nh;
 	struct mpls_entry_decoded dec;
 	struct net_device *out_dev;
 	struct mpls_dev *mdev;
@@ -196,8 +248,12 @@ static int mpls_forward(struct sk_buff *skb, struct net_device *dev,
 	if (!rt)
 		goto drop;
 
+	nh = mpls_select_multipath(rt, skb, dec.bos);
+	if (!nh)
+		goto drop;
+
 	/* Find the output device */
-	out_dev = rcu_dereference(rt->rt_dev);
+	out_dev = rcu_dereference(nh->nh_dev);
 	if (!mpls_output_possible(out_dev))
 		goto drop;
 
@@ -212,7 +268,7 @@ static int mpls_forward(struct sk_buff *skb, struct net_device *dev,
 	dec.ttl -= 1;
 
 	/* Verify the destination can hold the packet */
-	new_header_size = mpls_rt_header_size(rt);
+	new_header_size = mpls_nh_header_size(nh);
 	mtu = mpls_dev_mtu(out_dev);
 	if (mpls_pkt_too_big(skb, mtu - new_header_size))
 		goto drop;
@@ -240,13 +296,14 @@ static int mpls_forward(struct sk_buff *skb, struct net_device *dev,
 		/* Push the new labels */
 		hdr = mpls_hdr(skb);
 		bos = dec.bos;
-		for (i = rt->rt_labels - 1; i >= 0; i--) {
-			hdr[i] = mpls_entry_encode(rt->rt_label[i], dec.ttl, 0, bos);
+		for (i = nh->nh_labels - 1; i >= 0; i--) {
+			hdr[i] = mpls_entry_encode(nh->nh_label[i],
+						   dec.ttl, 0, bos);
 			bos = false;
 		}
 	}
 
-	err = neigh_xmit(rt->rt_via_table, out_dev, rt->rt_via, skb);
+	err = neigh_xmit(nh->nh_via_table, out_dev, nh->nh_via, skb);
 	if (err)
 		net_dbg_ratelimited("%s: packet transmission failed: %d\n",
 				    __func__, err);
@@ -270,24 +327,28 @@ static const struct nla_policy rtm_mpls_policy[RTA_MAX+1] = {
 struct mpls_route_config {
 	u32			rc_protocol;
 	u32			rc_ifindex;
-	u16			rc_via_table;
-	u16			rc_via_alen;
+	u8			rc_via_table;
+	u8			rc_via_alen;
 	u8			rc_via[MAX_VIA_ALEN];
 	u32			rc_label;
-	u32			rc_output_labels;
+	u8			rc_output_labels;
 	u32			rc_output_label[MAX_NEW_LABELS];
 	u32			rc_nlflags;
 	enum mpls_payload_type	rc_payload_type;
 	struct nl_info		rc_nlinfo;
+	struct rtnexthop	*rc_mp;
+	int			rc_mp_len;
 };
 
-static struct mpls_route *mpls_rt_alloc(size_t alen)
+static struct mpls_route *mpls_rt_alloc(int num_nh)
 {
 	struct mpls_route *rt;
 
-	rt = kzalloc(sizeof(*rt) + alen, GFP_KERNEL);
+	rt = kzalloc(sizeof(*rt) + (num_nh * sizeof(struct mpls_nh)),
+		     GFP_KERNEL);
 	if (rt)
-		rt->rt_via_alen = alen;
+		rt->rt_nhn = num_nh;
+
 	return rt;
 }
 
@@ -312,25 +373,22 @@ static void mpls_notify_route(struct net *net, unsigned index,
 }
 
 static void mpls_route_update(struct net *net, unsigned index,
-			      struct net_device *dev, struct mpls_route *new,
+			      struct mpls_route *new,
 			      const struct nl_info *info)
 {
 	struct mpls_route __rcu **platform_label;
-	struct mpls_route *rt, *old = NULL;
+	struct mpls_route *rt;
 
 	ASSERT_RTNL();
 
 	platform_label = rtnl_dereference(net->mpls.platform_label);
 	rt = rtnl_dereference(platform_label[index]);
-	if (!dev || (rt && (rtnl_dereference(rt->rt_dev) == dev))) {
-		rcu_assign_pointer(platform_label[index], new);
-		old = rt;
-	}
+	rcu_assign_pointer(platform_label[index], new);
 
-	mpls_notify_route(net, index, old, new, info);
+	mpls_notify_route(net, index, rt, new, info);
 
 	/* If we removed a route free it now */
-	mpls_rt_free(old);
+	mpls_rt_free(rt);
 }
 
 static unsigned find_free_label(struct net *net)
@@ -406,40 +464,199 @@ static struct net_device *inet6_fib_lookup_dev(struct net *net, void *addr)
 #endif
 
 static struct net_device *find_outdev(struct net *net,
-				      struct mpls_route_config *cfg)
+				      struct mpls_nh *nh, int oif)
 {
 	struct net_device *dev = NULL;
 
-	if (!cfg->rc_ifindex) {
-		switch (cfg->rc_via_table) {
+	if (!oif) {
+		switch (nh->nh_via_table) {
 		case NEIGH_ARP_TABLE:
-			dev = inet_fib_lookup_dev(net, cfg->rc_via);
+			dev = inet_fib_lookup_dev(net, nh->nh_via);
 			break;
 		case NEIGH_ND_TABLE:
-			dev = inet6_fib_lookup_dev(net, cfg->rc_via);
+			dev = inet6_fib_lookup_dev(net, nh->nh_via);
 			break;
 		case NEIGH_LINK_TABLE:
 			break;
 		}
 	} else {
-		dev = dev_get_by_index(net, cfg->rc_ifindex);
+		dev = dev_get_by_index(net, oif);
 	}
 
 	if (!dev)
 		return ERR_PTR(-ENODEV);
 
+	/* The caller is holding rtnl anyways, so release the dev reference */
+	dev_put(dev);
+
 	return dev;
 }
 
+static int mpls_nh_assign_dev(struct net *net, struct mpls_nh *nh, int oif)
+{
+	struct net_device *dev = NULL;
+	int err = -ENODEV;
+
+	dev = find_outdev(net, nh, oif);
+	if (IS_ERR(dev)) {
+		err = PTR_ERR(dev);
+		dev = NULL;
+		goto errout;
+	}
+
+	/* Ensure this is a supported device */
+	err = -EINVAL;
+	if (!mpls_dev_get(dev))
+		goto errout;
+
+	RCU_INIT_POINTER(nh->nh_dev, dev);
+
+	return 0;
+
+errout:
+	return err;
+}
+
+static int mpls_nh_build_from_cfg(struct mpls_route_config *cfg,
+				  struct mpls_route *rt)
+{
+	struct net *net = cfg->rc_nlinfo.nl_net;
+	struct mpls_nh *nh = rt->rt_nh;
+	int err;
+	int i;
+
+	if (!nh)
+		return -ENOMEM;
+
+	err = -EINVAL;
+	/* Ensure only a supported number of labels are present */
+	if (cfg->rc_output_labels > MAX_NEW_LABELS)
+		goto errout;
+
+	nh->nh_labels = cfg->rc_output_labels;
+	for (i = 0; i < nh->nh_labels; i++)
+		nh->nh_label[i] = cfg->rc_output_label[i];
+
+	nh->nh_via_table = cfg->rc_via_table;
+	memcpy(nh->nh_via, cfg->rc_via, cfg->rc_via_alen);
+	nh->nh_via_alen = cfg->rc_via_alen;
+
+	err = mpls_nh_assign_dev(net, nh, cfg->rc_ifindex);
+	if (err)
+		goto errout;
+
+	return 0;
+
+errout:
+	return err;
+}
+
+static int mpls_nh_build(struct net *net, struct mpls_nh *nh,
+			 int oif, struct nlattr *via, struct nlattr *newdst)
+{
+	int err = -ENOMEM;
+
+	if (!nh)
+		goto errout;
+
+	if (newdst) {
+		err = nla_get_labels(newdst, MAX_NEW_LABELS,
+				     &nh->nh_labels, nh->nh_label);
+		if (err)
+			goto errout;
+	}
+
+	err = nla_get_via(via, &nh->nh_via_alen, &nh->nh_via_table,
+			  nh->nh_via);
+	if (err)
+		goto errout;
+
+	err = mpls_nh_assign_dev(net, nh, oif);
+	if (err)
+		goto errout;
+
+	return 0;
+
+errout:
+	return err;
+}
+
+static int mpls_count_nexthops(struct rtnexthop *rtnh, int len)
+{
+	int nhs = 0;
+	int remaining = len;
+
+	while (rtnh_ok(rtnh, remaining)) {
+		nhs++;
+		rtnh = rtnh_next(rtnh, &remaining);
+	}
+
+	/* leftover implies invalid nexthop configuration, discard it */
+	return remaining > 0 ? 0 : nhs;
+}
+
+static int mpls_nh_build_multi(struct mpls_route_config *cfg,
+			       struct mpls_route *rt)
+{
+	struct rtnexthop *rtnh = cfg->rc_mp;
+	struct nlattr *nla_via, *nla_newdst;
+	int remaining = cfg->rc_mp_len;
+	int nhs = 0;
+	int err = 0;
+
+	change_nexthops(rt) {
+		int attrlen;
+
+		nla_via = NULL;
+		nla_newdst = NULL;
+
+		err = -EINVAL;
+		if (!rtnh_ok(rtnh, remaining))
+			goto errout;
+
+		/* neither weighted multipath nor any flags
+		 * are supported
+		 */
+		if (rtnh->rtnh_hops || rtnh->rtnh_flags)
+			goto errout;
+
+		attrlen = rtnh_attrlen(rtnh);
+		if (attrlen > 0) {
+			struct nlattr *attrs = rtnh_attrs(rtnh);
+
+			nla_via = nla_find(attrs, attrlen, RTA_VIA);
+			nla_newdst = nla_find(attrs, attrlen, RTA_NEWDST);
+		}
+
+		if (!nla_via)
+			goto errout;
+
+		err = mpls_nh_build(cfg->rc_nlinfo.nl_net, nh,
+				    rtnh->rtnh_ifindex, nla_via,
+				    nla_newdst);
+		if (err)
+			goto errout;
+
+		rtnh = rtnh_next(rtnh, &remaining);
+		nhs++;
+	} endfor_nexthops(rt);
+
+	rt->rt_nhn = nhs;
+
+	return 0;
+
+errout:
+	return err;
+}
+
 static int mpls_route_add(struct mpls_route_config *cfg)
 {
 	struct mpls_route __rcu **platform_label;
 	struct net *net = cfg->rc_nlinfo.nl_net;
-	struct net_device *dev = NULL;
 	struct mpls_route *rt, *old;
-	unsigned index;
-	int i;
 	int err = -EINVAL;
+	unsigned index;
+	int nhs = 1; /* default to one nexthop */
 
 	index = cfg->rc_label;
 
@@ -457,27 +674,6 @@ static int mpls_route_add(struct mpls_route_config *cfg)
 	if (index >= net->mpls.platform_labels)
 		goto errout;
 
-	/* Ensure only a supported number of labels are present */
-	if (cfg->rc_output_labels > MAX_NEW_LABELS)
-		goto errout;
-
-	dev = find_outdev(net, cfg);
-	if (IS_ERR(dev)) {
-		err = PTR_ERR(dev);
-		dev = NULL;
-		goto errout;
-	}
-
-	/* Ensure this is a supported device */
-	err = -EINVAL;
-	if (!mpls_dev_get(dev))
-		goto errout;
-
-	err = -EINVAL;
-	if ((cfg->rc_via_table == NEIGH_LINK_TABLE) &&
-	    (dev->addr_len != cfg->rc_via_alen))
-		goto errout;
-
 	/* Append makes no sense with mpls */
 	err = -EOPNOTSUPP;
 	if (cfg->rc_nlflags & NLM_F_APPEND)
@@ -497,28 +693,35 @@ static int mpls_route_add(struct mpls_route_config *cfg)
 	if (!(cfg->rc_nlflags & NLM_F_CREATE) && !old)
 		goto errout;
 
+	if (cfg->rc_mp) {
+		err = -EINVAL;
+		nhs = mpls_count_nexthops(cfg->rc_mp, cfg->rc_mp_len);
+		if (nhs == 0)
+			goto errout;
+	}
+
 	err = -ENOMEM;
-	rt = mpls_rt_alloc(cfg->rc_via_alen);
+	rt = mpls_rt_alloc(nhs);
 	if (!rt)
 		goto errout;
 
-	rt->rt_labels = cfg->rc_output_labels;
-	for (i = 0; i < rt->rt_labels; i++)
-		rt->rt_label[i] = cfg->rc_output_label[i];
 	rt->rt_protocol = cfg->rc_protocol;
-	RCU_INIT_POINTER(rt->rt_dev, dev);
 	rt->rt_payload_type = cfg->rc_payload_type;
-	rt->rt_via_table = cfg->rc_via_table;
-	memcpy(rt->rt_via, cfg->rc_via, cfg->rc_via_alen);
 
-	mpls_route_update(net, index, NULL, rt, &cfg->rc_nlinfo);
+	if (cfg->rc_mp)
+		err = mpls_nh_build_multi(cfg, rt);
+	else
+		err = mpls_nh_build_from_cfg(cfg, rt);
+	if (err)
+		goto freert;
+
+	mpls_route_update(net, index, rt, &cfg->rc_nlinfo);
 
-	dev_put(dev);
 	return 0;
 
+freert:
+	mpls_rt_free(rt);
 errout:
-	if (dev)
-		dev_put(dev);
 	return err;
 }
 
@@ -538,7 +741,7 @@ static int mpls_route_del(struct mpls_route_config *cfg)
 	if (index >= net->mpls.platform_labels)
 		goto errout;
 
-	mpls_route_update(net, index, NULL, NULL, &cfg->rc_nlinfo);
+	mpls_route_update(net, index, NULL, &cfg->rc_nlinfo);
 
 	err = 0;
 errout:
@@ -635,9 +838,11 @@ static void mpls_ifdown(struct net_device *dev)
 		struct mpls_route *rt = rtnl_dereference(platform_label[index]);
 		if (!rt)
 			continue;
-		if (rtnl_dereference(rt->rt_dev) != dev)
-			continue;
-		rt->rt_dev = NULL;
+		for_nexthops(rt) {
+			if (rtnl_dereference(nh->nh_dev) != dev)
+				continue;
+			nh->nh_dev = NULL;
+		} endfor_nexthops(rt);
 	}
 
 	mdev = mpls_dev_get(dev);
@@ -736,7 +941,7 @@ int nla_put_labels(struct sk_buff *skb, int attrtype,
 EXPORT_SYMBOL_GPL(nla_put_labels);
 
 int nla_get_labels(const struct nlattr *nla,
-		   u32 max_labels, u32 *labels, u32 label[])
+		   u32 max_labels, u8 *labels, u32 label[])
 {
 	unsigned len = nla_len(nla);
 	unsigned nla_labels;
@@ -781,6 +986,48 @@ int nla_get_labels(const struct nlattr *nla,
 }
 EXPORT_SYMBOL_GPL(nla_get_labels);
 
+int nla_get_via(const struct nlattr *nla, u8 *via_alen,
+		u8 *via_table, u8 via_addr[])
+{
+	struct rtvia *via = nla_data(nla);
+	int err = -EINVAL;
+	int alen;
+
+	if (nla_len(nla) < offsetof(struct rtvia, rtvia_addr))
+		goto errout;
+	alen = nla_len(nla) -
+			offsetof(struct rtvia, rtvia_addr);
+	if (alen > MAX_VIA_ALEN)
+		goto errout;
+
+	/* Validate the address family */
+	switch (via->rtvia_family) {
+	case AF_PACKET:
+		*via_table = NEIGH_LINK_TABLE;
+		break;
+	case AF_INET:
+		*via_table = NEIGH_ARP_TABLE;
+		if (alen != 4)
+			goto errout;
+		break;
+	case AF_INET6:
+		*via_table = NEIGH_ND_TABLE;
+		if (alen != 16)
+			goto errout;
+		break;
+	default:
+		/* Unsupported address family */
+		goto errout;
+	}
+
+	memcpy(via_addr, via->rtvia_addr, alen);
+	*via_alen = alen;
+	err = 0;
+
+errout:
+	return err;
+}
+
 static int rtm_to_route_config(struct sk_buff *skb,  struct nlmsghdr *nlh,
 			       struct mpls_route_config *cfg)
 {
@@ -844,7 +1091,7 @@ static int rtm_to_route_config(struct sk_buff *skb,  struct nlmsghdr *nlh,
 			break;
 		case RTA_DST:
 		{
-			u32 label_count;
+			u8 label_count;
 			if (nla_get_labels(nla, 1, &label_count,
 					   &cfg->rc_label))
 				goto errout;
@@ -857,35 +1104,15 @@ static int rtm_to_route_config(struct sk_buff *skb,  struct nlmsghdr *nlh,
 		}
 		case RTA_VIA:
 		{
-			struct rtvia *via = nla_data(nla);
-			if (nla_len(nla) < offsetof(struct rtvia, rtvia_addr))
+			if (nla_get_via(nla, &cfg->rc_via_alen,
+					&cfg->rc_via_table, cfg->rc_via))
 				goto errout;
-			cfg->rc_via_alen   = nla_len(nla) -
-				offsetof(struct rtvia, rtvia_addr);
-			if (cfg->rc_via_alen > MAX_VIA_ALEN)
-				goto errout;
-
-			/* Validate the address family */
-			switch(via->rtvia_family) {
-			case AF_PACKET:
-				cfg->rc_via_table = NEIGH_LINK_TABLE;
-				break;
-			case AF_INET:
-				cfg->rc_via_table = NEIGH_ARP_TABLE;
-				if (cfg->rc_via_alen != 4)
-					goto errout;
-				break;
-			case AF_INET6:
-				cfg->rc_via_table = NEIGH_ND_TABLE;
-				if (cfg->rc_via_alen != 16)
-					goto errout;
-				break;
-			default:
-				/* Unsupported address family */
-				goto errout;
-			}
-
-			memcpy(cfg->rc_via, via->rtvia_addr, cfg->rc_via_alen);
+			break;
+		}
+		case RTA_MULTIPATH:
+		{
+			cfg->rc_mp = nla_data(nla);
+			cfg->rc_mp_len = nla_len(nla);
 			break;
 		}
 		default:
@@ -946,16 +1173,52 @@ static int mpls_dump_route(struct sk_buff *skb, u32 portid, u32 seq, int event,
 	rtm->rtm_type = RTN_UNICAST;
 	rtm->rtm_flags = 0;
 
-	if (rt->rt_labels &&
-	    nla_put_labels(skb, RTA_NEWDST, rt->rt_labels, rt->rt_label))
-		goto nla_put_failure;
-	if (nla_put_via(skb, rt->rt_via_table, rt->rt_via, rt->rt_via_alen))
-		goto nla_put_failure;
-	dev = rtnl_dereference(rt->rt_dev);
-	if (dev && nla_put_u32(skb, RTA_OIF, dev->ifindex))
-		goto nla_put_failure;
 	if (nla_put_labels(skb, RTA_DST, 1, &label))
 		goto nla_put_failure;
+	if (rt->rt_nhn == 1) {
+		struct mpls_nh *nh = rt->rt_nh;
+
+		if (nh->nh_labels &&
+		    nla_put_labels(skb, RTA_NEWDST, nh->nh_labels,
+				   nh->nh_label))
+			goto nla_put_failure;
+		if (nla_put_via(skb, nh->nh_via_table, nh->nh_via,
+				nh->nh_via_alen))
+			goto nla_put_failure;
+		dev = rtnl_dereference(nh->nh_dev);
+		if (dev && nla_put_u32(skb, RTA_OIF, dev->ifindex))
+			goto nla_put_failure;
+	} else {
+		struct rtnexthop *rtnh;
+		struct nlattr *mp;
+
+		mp = nla_nest_start(skb, RTA_MULTIPATH);
+		if (!mp)
+			goto nla_put_failure;
+
+		for_nexthops(rt) {
+			rtnh = nla_reserve_nohdr(skb, sizeof(*rtnh));
+			if (!rtnh)
+				goto nla_put_failure;
+
+			dev = rtnl_dereference(nh->nh_dev);
+			if (dev)
+				rtnh->rtnh_ifindex = dev->ifindex;
+			if (nh->nh_labels && nla_put_labels(skb, RTA_NEWDST,
+							    nh->nh_labels,
+							    nh->nh_label))
+				goto nla_put_failure;
+			if (nla_put_via(skb, nh->nh_via_table,
+					nh->nh_via,
+					nh->nh_via_alen))
+				goto nla_put_failure;
+
+			/* length of rtnetlink header + attributes */
+			rtnh->rtnh_len = nlmsg_get_pos(skb) - (void *)rtnh;
+		} endfor_nexthops(rt);
+
+		nla_nest_end(skb, mp);
+	}
 
 	nlmsg_end(skb, nlh);
 	return 0;
@@ -1000,12 +1263,30 @@ static inline size_t lfib_nlmsg_size(struct mpls_route *rt)
 {
 	size_t payload =
 		NLMSG_ALIGN(sizeof(struct rtmsg))
-		+ nla_total_size(2 + rt->rt_via_alen)	/* RTA_VIA */
 		+ nla_total_size(4);			/* RTA_DST */
-	if (rt->rt_labels)				/* RTA_NEWDST */
-		payload += nla_total_size(rt->rt_labels * 4);
-	if (rt->rt_dev)					/* RTA_OIF */
-		payload += nla_total_size(4);
+
+	if (rt->rt_nhn == 1) {
+		struct mpls_nh *nh = rt->rt_nh;
+
+		if (nh->nh_dev)
+			payload += nla_total_size(4); /* RTA_OIF */
+		payload += nla_total_size(2 + nh->nh_via_alen); /* RTA_VIA */
+		if (nh->nh_labels) /* RTA_NEWDST */
+			payload += nla_total_size(nh->nh_labels * 4);
+	} else {
+		/* each nexthop is packed in an attribute */
+		size_t nhsize = 0;
+
+		for_nexthops(rt) {
+			nhsize += nla_total_size(sizeof(struct rtnexthop));
+			nhsize += nla_total_size(2 + nh->nh_via_alen);
+			if (nh->nh_labels)
+				nhsize += nla_total_size(nh->nh_labels * 4);
+		} endfor_nexthops(rt);
+		/* nested attribute */
+		payload += nla_total_size(nhsize);
+	}
+
 	return payload;
 }
 
@@ -1057,25 +1338,25 @@ static int resize_platform_label_table(struct net *net, size_t limit)
 	/* In case the predefined labels need to be populated */
 	if (limit > MPLS_LABEL_IPV4NULL) {
 		struct net_device *lo = net->loopback_dev;
-		rt0 = mpls_rt_alloc(lo->addr_len);
+		rt0 = mpls_rt_alloc(1);
 		if (!rt0)
 			goto nort0;
-		RCU_INIT_POINTER(rt0->rt_dev, lo);
+		RCU_INIT_POINTER(rt0->rt_nh->nh_dev, lo);
 		rt0->rt_protocol = RTPROT_KERNEL;
 		rt0->rt_payload_type = MPT_IPV4;
-		rt0->rt_via_table = NEIGH_LINK_TABLE;
-		memcpy(rt0->rt_via, lo->dev_addr, lo->addr_len);
+		rt0->rt_nh->nh_via_table = NEIGH_LINK_TABLE;
+		memcpy(rt0->rt_nh->nh_via, lo->dev_addr, lo->addr_len);
 	}
 	if (limit > MPLS_LABEL_IPV6NULL) {
 		struct net_device *lo = net->loopback_dev;
-		rt2 = mpls_rt_alloc(lo->addr_len);
+		rt2 = mpls_rt_alloc(1);
 		if (!rt2)
 			goto nort2;
-		RCU_INIT_POINTER(rt2->rt_dev, lo);
+		RCU_INIT_POINTER(rt2->rt_nh->nh_dev, lo);
 		rt2->rt_protocol = RTPROT_KERNEL;
 		rt2->rt_payload_type = MPT_IPV6;
-		rt2->rt_via_table = NEIGH_LINK_TABLE;
-		memcpy(rt2->rt_via, lo->dev_addr, lo->addr_len);
+		rt2->rt_nh->nh_via_table = NEIGH_LINK_TABLE;
+		memcpy(rt2->rt_nh->nh_via, lo->dev_addr, lo->addr_len);
 	}
 
 	rtnl_lock();
@@ -1085,7 +1366,7 @@ static int resize_platform_label_table(struct net *net, size_t limit)
 
 	/* Free any labels beyond the new table */
 	for (index = limit; index < old_limit; index++)
-		mpls_route_update(net, index, NULL, NULL, NULL);
+		mpls_route_update(net, index, NULL, NULL);
 
 	/* Copy over the old labels */
 	cp_size = size;
diff --git a/net/mpls/internal.h b/net/mpls/internal.h
index 2681a4ba6c37..d7757be39877 100644
--- a/net/mpls/internal.h
+++ b/net/mpls/internal.h
@@ -21,6 +21,54 @@ struct mpls_dev {
 
 struct sk_buff;
 
+#define LABEL_NOT_SPECIFIED (1 << 20)
+#define MAX_NEW_LABELS 2
+
+/* This maximum ha length copied from the definition of struct neighbour */
+#define MAX_VIA_ALEN (ALIGN(MAX_ADDR_LEN, sizeof(unsigned long)))
+
+enum mpls_payload_type {
+	MPT_UNSPEC, /* IPv4 or IPv6 */
+	MPT_IPV4 = 4,
+	MPT_IPV6 = 6,
+
+	/* Other types not implemented:
+	 *  - Pseudo-wire with or without control word (RFC4385)
+	 *  - GAL (RFC5586)
+	 */
+};
+
+struct mpls_nh { /* next hop label forwarding entry */
+	struct net_device __rcu *nh_dev;
+	u32			nh_label[MAX_NEW_LABELS];
+	u8			nh_labels;
+	u8			nh_via_alen;
+	u8			nh_via_table;
+	u8			nh_via[MAX_VIA_ALEN];
+};
+
+struct mpls_route { /* next hop label forwarding entry */
+	struct rcu_head		rt_rcu;
+	u8			rt_protocol;
+	u8			rt_payload_type;
+	int			rt_nhn;
+	struct mpls_nh		rt_nh[0];
+};
+
+#define for_nexthops(rt) {						\
+	int nhsel; struct mpls_nh *nh;			\
+	for (nhsel = 0, nh = (rt)->rt_nh;				\
+	     nhsel < (rt)->rt_nhn;					\
+	     nh++, nhsel++)
+
+#define change_nexthops(rt) {						\
+	int nhsel; struct mpls_nh *nh;				\
+	for (nhsel = 0,	nh = (struct mpls_nh *)((rt)->rt_nh);	\
+	     nhsel < (rt)->rt_nhn;					\
+	     nh++, nhsel++)
+
+#define endfor_nexthops(rt) }
+
 static inline struct mpls_shim_hdr *mpls_hdr(const struct sk_buff *skb)
 {
 	return (struct mpls_shim_hdr *)skb_network_header(skb);
@@ -52,8 +100,10 @@ static inline struct mpls_entry_decoded mpls_entry_decode(struct mpls_shim_hdr *
 
 int nla_put_labels(struct sk_buff *skb, int attrtype,  u8 labels,
 		   const u32 label[]);
-int nla_get_labels(const struct nlattr *nla, u32 max_labels, u32 *labels,
+int nla_get_labels(const struct nlattr *nla, u32 max_labels, u8 *labels,
 		   u32 label[]);
+int nla_get_via(const struct nlattr *nla, u8 *via_alen, u8 *via_table,
+		u8 via[]);
 bool mpls_output_possible(const struct net_device *dev);
 unsigned int mpls_dev_mtu(const struct net_device *dev);
 bool mpls_pkt_too_big(const struct sk_buff *skb, unsigned int mtu);