Merge branch 'mpls-fragmentation-and-gso-fixes' David Ahern says: ==================== net: mpls: fragmentation and gso fixes for locally originated traffic This series fixes mtu and fragmentation for tunnels using lwtunnel output redirect, and fixes GSO for MPLS for locally originated traffic reported by Lennert Buytenhek. A follow on series will address fragmentation and GSO for forwarded MPLS traffic. Hardware offload of GSO with MPLS also needs to be addressed. Simon: Can you verify this works with OVS for single and multiple labels? v4 - more updates to mpls_gso_segment per Alex's comments (thanks, Alex) - updates to teaching OVS about marking MPLS labels as the network header v3 - updates to mpls_gso_segment per Alex's comments - dropped skb->encapsulation = 1 from mpls_xmit per Alex's comment v2 - consistent use of network_header in skb to fix GSO for MPLS - update MPLS code in OVS to network_header and inner_network_header ==================== Tested-by: Simon Horman <simon.horman@netronome.com> Signed-off-by: David S. Miller <davem@davemloft.net>

commit: 7ba9d103466e6c8568fae6db1e79bbfcc8f4a680 [log] [tgz]
author: David S. Miller <davem@davemloft.net> Tue Aug 30 22:27:18 2016 -0700
committer: David S. Miller <davem@davemloft.net> Tue Aug 30 22:27:52 2016 -0700
tree: bfa1f1a6174f542d98eba376a54cdd8f568f4f09
parent: 41852497a9205964b958a245a9526040b980926f [diff]
parent: 607fca9acfb62a9caf7bd46771ecbc4d8fec7388 [diff]
diff --git a/drivers/net/veth.c b/drivers/net/veth.c
index 4bda502..fbc853e 100644
--- a/drivers/net/veth.c
+++ b/drivers/net/veth.c

@@ -340,6 +340,7 @@
 
 	dev->hw_features = VETH_FEATURES;
 	dev->hw_enc_features = VETH_FEATURES;
+	dev->mpls_features = NETIF_F_HW_CSUM | NETIF_F_GSO_SOFTWARE;
 }
 
 /*

diff --git a/include/net/lwtunnel.h b/include/net/lwtunnel.h
index e9f116e..ea3f80f 100644
--- a/include/net/lwtunnel.h
+++ b/include/net/lwtunnel.h

@@ -13,6 +13,13 @@
 /* lw tunnel state flags */
 #define LWTUNNEL_STATE_OUTPUT_REDIRECT	BIT(0)
 #define LWTUNNEL_STATE_INPUT_REDIRECT	BIT(1)
+#define LWTUNNEL_STATE_XMIT_REDIRECT	BIT(2)
+
+enum {
+	LWTUNNEL_XMIT_DONE,
+	LWTUNNEL_XMIT_CONTINUE,
+};
+
 
 struct lwtunnel_state {
 	__u16		type;
@@ -21,6 +28,7 @@
 	int		(*orig_output)(struct net *net, struct sock *sk, struct sk_buff *skb);
 	int		(*orig_input)(struct sk_buff *);
 	int             len;
+	__u16		headroom;
 	__u8            data[0];
 };
 
@@ -34,6 +42,7 @@
 			  struct lwtunnel_state *lwtstate);
 	int (*get_encap_size)(struct lwtunnel_state *lwtstate);
 	int (*cmp_encap)(struct lwtunnel_state *a, struct lwtunnel_state *b);
+	int (*xmit)(struct sk_buff *skb);
 };
 
 #ifdef CONFIG_LWTUNNEL
@@ -75,6 +84,24 @@
 
 	return false;
 }
+
+static inline bool lwtunnel_xmit_redirect(struct lwtunnel_state *lwtstate)
+{
+	if (lwtstate && (lwtstate->flags & LWTUNNEL_STATE_XMIT_REDIRECT))
+		return true;
+
+	return false;
+}
+
+static inline unsigned int lwtunnel_headroom(struct lwtunnel_state *lwtstate,
+					     unsigned int mtu)
+{
+	if (lwtunnel_xmit_redirect(lwtstate) && lwtstate->headroom < mtu)
+		return lwtstate->headroom;
+
+	return 0;
+}
+
 int lwtunnel_encap_add_ops(const struct lwtunnel_encap_ops *op,
 			   unsigned int num);
 int lwtunnel_encap_del_ops(const struct lwtunnel_encap_ops *op,
@@ -90,6 +117,7 @@
 int lwtunnel_cmp_encap(struct lwtunnel_state *a, struct lwtunnel_state *b);
 int lwtunnel_output(struct net *net, struct sock *sk, struct sk_buff *skb);
 int lwtunnel_input(struct sk_buff *skb);
+int lwtunnel_xmit(struct sk_buff *skb);
 
 #else
 
@@ -117,6 +145,17 @@
 	return false;
 }
 
+static inline bool lwtunnel_xmit_redirect(struct lwtunnel_state *lwtstate)
+{
+	return false;
+}
+
+static inline unsigned int lwtunnel_headroom(struct lwtunnel_state *lwtstate,
+					     unsigned int mtu)
+{
+	return 0;
+}
+
 static inline int lwtunnel_encap_add_ops(const struct lwtunnel_encap_ops *op,
 					 unsigned int num)
 {
@@ -170,6 +209,11 @@
 	return -EOPNOTSUPP;
 }
 
+static inline int lwtunnel_xmit(struct sk_buff *skb)
+{
+	return -EOPNOTSUPP;
+}
+
 #endif /* CONFIG_LWTUNNEL */
 
 #define MODULE_ALIAS_RTNL_LWT(encap_type) MODULE_ALIAS("rtnl-lwt-" __stringify(encap_type))

diff --git a/net/core/lwtunnel.c b/net/core/lwtunnel.c
index 669ecc9..e5f84c2 100644
--- a/net/core/lwtunnel.c
+++ b/net/core/lwtunnel.c

@@ -251,6 +251,41 @@
 }
 EXPORT_SYMBOL(lwtunnel_output);
 
+int lwtunnel_xmit(struct sk_buff *skb)
+{
+	struct dst_entry *dst = skb_dst(skb);
+	const struct lwtunnel_encap_ops *ops;
+	struct lwtunnel_state *lwtstate;
+	int ret = -EINVAL;
+
+	if (!dst)
+		goto drop;
+
+	lwtstate = dst->lwtstate;
+
+	if (lwtstate->type == LWTUNNEL_ENCAP_NONE ||
+	    lwtstate->type > LWTUNNEL_ENCAP_MAX)
+		return 0;
+
+	ret = -EOPNOTSUPP;
+	rcu_read_lock();
+	ops = rcu_dereference(lwtun_encaps[lwtstate->type]);
+	if (likely(ops && ops->xmit))
+		ret = ops->xmit(skb);
+	rcu_read_unlock();
+
+	if (ret == -EOPNOTSUPP)
+		goto drop;
+
+	return ret;
+
+drop:
+	kfree_skb(skb);
+
+	return ret;
+}
+EXPORT_SYMBOL(lwtunnel_xmit);
+
 int lwtunnel_input(struct sk_buff *skb)
 {
 	struct dst_entry *dst = skb_dst(skb);

diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index dde37fb..6556927 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c

@@ -73,6 +73,7 @@
 #include <net/icmp.h>
 #include <net/checksum.h>
 #include <net/inetpeer.h>
+#include <net/lwtunnel.h>
 #include <linux/igmp.h>
 #include <linux/netfilter_ipv4.h>
 #include <linux/netfilter_bridge.h>
@@ -197,6 +198,13 @@
 		skb = skb2;
 	}
 
+	if (lwtunnel_xmit_redirect(dst->lwtstate)) {
+		int res = lwtunnel_xmit(skb);
+
+		if (res < 0 || res == LWTUNNEL_XMIT_DONE)
+			return res;
+	}
+
 	rcu_read_lock_bh();
 	nexthop = (__force u32) rt_nexthop(rt, ip_hdr(skb)->daddr);
 	neigh = __ipv4_neigh_lookup_noref(dev, nexthop);

diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index a1f2830..3e99278 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c

@@ -1246,7 +1246,9 @@
 			mtu = 576;
 	}
 
-	return min_t(unsigned int, mtu, IP_MAX_MTU);
+	mtu = min_t(unsigned int, mtu, IP_MAX_MTU);
+
+	return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
 }
 
 static struct fib_nh_exception *find_exception(struct fib_nh *nh, __be32 daddr)

diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index 1dfc402..993fd96 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c

@@ -56,6 +56,7 @@
 #include <net/checksum.h>
 #include <linux/mroute6.h>
 #include <net/l3mdev.h>
+#include <net/lwtunnel.h>
 
 static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff *skb)
 {
@@ -104,6 +105,13 @@
 		}
 	}
 
+	if (lwtunnel_xmit_redirect(dst->lwtstate)) {
+		int res = lwtunnel_xmit(skb);
+
+		if (res < 0 || res == LWTUNNEL_XMIT_DONE)
+			return res;
+	}
+
 	rcu_read_lock_bh();
 	nexthop = rt6_nexthop((struct rt6_info *)dst, &ipv6_hdr(skb)->daddr);
 	neigh = __ipv6_neigh_lookup_noref(dst->dev, nexthop);

diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 4981755..09d43ff 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c

@@ -1604,7 +1604,9 @@
 	rcu_read_unlock();
 
 out:
-	return min_t(unsigned int, mtu, IP6_MAX_MTU);
+	mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);
+
+	return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
 }
 
 static struct dst_entry *icmp6_dst_gc_list;

diff --git a/net/mpls/mpls_gso.c b/net/mpls/mpls_gso.c
index 2055e57..b4da6d8 100644
--- a/net/mpls/mpls_gso.c
+++ b/net/mpls/mpls_gso.c

@@ -23,32 +23,50 @@
 				       netdev_features_t features)
 {
 	struct sk_buff *segs = ERR_PTR(-EINVAL);
+	u16 mac_offset = skb->mac_header;
 	netdev_features_t mpls_features;
+	u16 mac_len = skb->mac_len;
 	__be16 mpls_protocol;
+	unsigned int mpls_hlen;
+
+	skb_reset_network_header(skb);
+	mpls_hlen = skb_inner_network_header(skb) - skb_network_header(skb);
+	if (unlikely(!pskb_may_pull(skb, mpls_hlen)))
+		goto out;
 
 	/* Setup inner SKB. */
 	mpls_protocol = skb->protocol;
 	skb->protocol = skb->inner_protocol;
 
-	/* Push back the mac header that skb_mac_gso_segment() has pulled.
-	 * It will be re-pulled by the call to skb_mac_gso_segment() below
-	 */
-	__skb_push(skb, skb->mac_len);
+	__skb_pull(skb, mpls_hlen);
+
+	skb->mac_len = 0;
+	skb_reset_mac_header(skb);
 
 	/* Segment inner packet. */
 	mpls_features = skb->dev->mpls_features & features;
 	segs = skb_mac_gso_segment(skb, mpls_features);
+	if (IS_ERR_OR_NULL(segs)) {
+		skb_gso_error_unwind(skb, mpls_protocol, mpls_hlen, mac_offset,
+				     mac_len);
+		goto out;
+	}
+	skb = segs;
 
+	mpls_hlen += mac_len;
+	do {
+		skb->mac_len = mac_len;
+		skb->protocol = mpls_protocol;
 
-	/* Restore outer protocol. */
-	skb->protocol = mpls_protocol;
+		skb_reset_inner_network_header(skb);
 
-	/* Re-pull the mac header that the call to skb_mac_gso_segment()
-	 * above pulled.  It will be re-pushed after returning
-	 * skb_mac_gso_segment(), an indirect caller of this function.
-	 */
-	__skb_pull(skb, skb->data - skb_mac_header(skb));
+		__skb_push(skb, mpls_hlen);
 
+		skb_reset_mac_header(skb);
+		skb_set_network_header(skb, mac_len);
+	} while ((skb = skb->next));
+
+out:
 	return segs;
 }
 

diff --git a/net/mpls/mpls_iptunnel.c b/net/mpls/mpls_iptunnel.c
index 644a8da..cf52cf3 100644
--- a/net/mpls/mpls_iptunnel.c
+++ b/net/mpls/mpls_iptunnel.c

@@ -37,7 +37,7 @@
 	return en->labels * sizeof(struct mpls_shim_hdr);
 }
 
-static int mpls_output(struct net *net, struct sock *sk, struct sk_buff *skb)
+static int mpls_xmit(struct sk_buff *skb)
 {
 	struct mpls_iptunnel_encap *tun_encap_info;
 	struct mpls_shim_hdr *hdr;
@@ -90,7 +90,11 @@
 	if (skb_cow(skb, hh_len + new_header_size))
 		goto drop;
 
+	skb_set_inner_protocol(skb, skb->protocol);
+	skb_reset_inner_network_header(skb);
+
 	skb_push(skb, new_header_size);
+
 	skb_reset_network_header(skb);
 
 	skb->dev = out_dev;
@@ -115,7 +119,7 @@
 		net_dbg_ratelimited("%s: packet transmission failed: %d\n",
 				    __func__, err);
 
-	return 0;
+	return LWTUNNEL_XMIT_DONE;
 
 drop:
 	kfree_skb(skb);
@@ -153,7 +157,8 @@
 	if (ret)
 		goto errout;
 	newts->type = LWTUNNEL_ENCAP_MPLS;
-	newts->flags |= LWTUNNEL_STATE_OUTPUT_REDIRECT;
+	newts->flags |= LWTUNNEL_STATE_XMIT_REDIRECT;
+	newts->headroom = mpls_encap_size(tun_encap_info);
 
 	*ts = newts;
 
@@ -209,7 +214,7 @@
 
 static const struct lwtunnel_encap_ops mpls_iptun_ops = {
 	.build_state = mpls_build_state,
-	.output = mpls_output,
+	.xmit = mpls_xmit,
 	.fill_encap = mpls_fill_encap_info,
 	.get_encap_size = mpls_encap_nlsize,
 	.cmp_encap = mpls_encap_cmp,

diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c
index 1ecbd77..ca91fc3 100644
--- a/net/openvswitch/actions.c
+++ b/net/openvswitch/actions.c

@@ -162,10 +162,16 @@
 	if (skb_cow_head(skb, MPLS_HLEN) < 0)
 		return -ENOMEM;
 
+	if (!skb->inner_protocol) {
+		skb_set_inner_network_header(skb, skb->mac_len);
+		skb_set_inner_protocol(skb, skb->protocol);
+	}
+
 	skb_push(skb, MPLS_HLEN);
 	memmove(skb_mac_header(skb) - MPLS_HLEN, skb_mac_header(skb),
 		skb->mac_len);
 	skb_reset_mac_header(skb);
+	skb_set_network_header(skb, skb->mac_len);
 
 	new_mpls_lse = (__be32 *)skb_mpls_header(skb);
 	*new_mpls_lse = mpls->mpls_lse;
@@ -173,8 +179,6 @@
 	skb_postpush_rcsum(skb, new_mpls_lse, MPLS_HLEN);
 
 	update_ethertype(skb, eth_hdr(skb), mpls->mpls_ethertype);
-	if (!skb->inner_protocol)
-		skb_set_inner_protocol(skb, skb->protocol);
 	skb->protocol = mpls->mpls_ethertype;
 
 	invalidate_flow_key(key);
@@ -198,6 +202,7 @@
 
 	__skb_pull(skb, MPLS_HLEN);
 	skb_reset_mac_header(skb);
+	skb_set_network_header(skb, skb->mac_len);
 
 	/* skb_mpls_header() is used to locate the ethertype
 	 * field correctly in the presence of VLAN tags.
commit	7ba9d103466e6c8568fae6db1e79bbfcc8f4a680	[log] [tgz]
author	David S. Miller <davem@davemloft.net>	Tue Aug 30 22:27:18 2016 -0700
committer	David S. Miller <davem@davemloft.net>	Tue Aug 30 22:27:52 2016 -0700
tree	bfa1f1a6174f542d98eba376a54cdd8f568f4f09
parent	41852497a9205964b958a245a9526040b980926f [diff]
parent	607fca9acfb62a9caf7bd46771ecbc4d8fec7388 [diff]