openvswitch: Add basic MPLS support to kernel
Allow the datapath to recognize MPLS packets, extract the top label stack
entry into the flow key, and execute actions which push, pop, and set labels on packets.
Based heavily on work by Leo Alterman, Ravi K, Isaku Yamahata and Joe Stringer.
Cc: Ravi K <rkerur@gmail.com>
Cc: Leo Alterman <lalterman@nicira.com>
Cc: Isaku Yamahata <yamahata@valinux.co.jp>
Cc: Joe Stringer <joe@wand.net.nz>
Signed-off-by: Simon Horman <horms@verge.net.au>
Signed-off-by: Jesse Gross <jesse@nicira.com>
Signed-off-by: Pravin B Shelar <pshelar@nicira.com>
diff --git a/net/core/dev.c b/net/core/dev.c
index 40be481..70bb609 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -118,6 +118,7 @@
#include <linux/if_vlan.h>
#include <linux/ip.h>
#include <net/ip.h>
+#include <net/mpls.h>
#include <linux/ipv6.h>
#include <linux/in.h>
#include <linux/jhash.h>
@@ -2530,7 +2531,7 @@
netdev_features_t features,
__be16 type)
{
- if (type == htons(ETH_P_MPLS_UC) || type == htons(ETH_P_MPLS_MC))
+ if (eth_p_mpls(type))
features &= skb->dev->mpls_features;
return features;
diff --git a/net/openvswitch/Kconfig b/net/openvswitch/Kconfig
index 2a9673e..454ce12 100644
--- a/net/openvswitch/Kconfig
+++ b/net/openvswitch/Kconfig
@@ -30,6 +30,7 @@
config OPENVSWITCH_GRE
tristate "Open vSwitch GRE tunneling support"
+ select NET_MPLS_GSO
depends on INET
depends on OPENVSWITCH
depends on NET_IPGRE_DEMUX
diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c
index 922c133..930b1b6 100644
--- a/net/openvswitch/actions.c
+++ b/net/openvswitch/actions.c
@@ -28,10 +28,12 @@
#include <linux/in6.h>
#include <linux/if_arp.h>
#include <linux/if_vlan.h>
+
#include <net/ip.h>
#include <net/ipv6.h>
#include <net/checksum.h>
#include <net/dsfield.h>
+#include <net/mpls.h>
#include <net/sctp/checksum.h>
#include "datapath.h"
@@ -118,6 +120,92 @@
return pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
}
+static int push_mpls(struct sk_buff *skb,
+ const struct ovs_action_push_mpls *mpls)
+{
+ __be32 *new_mpls_lse;
+ struct ethhdr *hdr;
+
+ /* Networking stack does not allow simultaneous tunnel and MPLS GSO. */
+ if (skb->encapsulation)
+ return -ENOTSUPP;
+
+ if (skb_cow_head(skb, MPLS_HLEN) < 0)
+ return -ENOMEM;
+
+ skb_push(skb, MPLS_HLEN);
+ memmove(skb_mac_header(skb) - MPLS_HLEN, skb_mac_header(skb),
+ skb->mac_len);
+ skb_reset_mac_header(skb);
+
+ new_mpls_lse = (__be32 *)skb_mpls_header(skb);
+ *new_mpls_lse = mpls->mpls_lse;
+
+ if (skb->ip_summed == CHECKSUM_COMPLETE)
+ skb->csum = csum_add(skb->csum, csum_partial(new_mpls_lse,
+ MPLS_HLEN, 0));
+
+ hdr = eth_hdr(skb);
+ hdr->h_proto = mpls->mpls_ethertype;
+
+ skb_set_inner_protocol(skb, skb->protocol);
+ skb->protocol = mpls->mpls_ethertype;
+
+ return 0;
+}
+
+static int pop_mpls(struct sk_buff *skb, const __be16 ethertype)
+{
+ struct ethhdr *hdr;
+ int err;
+
+ err = make_writable(skb, skb->mac_len + MPLS_HLEN);
+ if (unlikely(err))
+ return err;
+
+ if (skb->ip_summed == CHECKSUM_COMPLETE)
+ skb->csum = csum_sub(skb->csum,
+ csum_partial(skb_mpls_header(skb),
+ MPLS_HLEN, 0));
+
+ memmove(skb_mac_header(skb) + MPLS_HLEN, skb_mac_header(skb),
+ skb->mac_len);
+
+ __skb_pull(skb, MPLS_HLEN);
+ skb_reset_mac_header(skb);
+
+ /* skb_mpls_header() is used to locate the ethertype
+ * field correctly in the presence of VLAN tags.
+ */
+ hdr = (struct ethhdr *)(skb_mpls_header(skb) - ETH_HLEN);
+ hdr->h_proto = ethertype;
+ if (eth_p_mpls(skb->protocol))
+ skb->protocol = ethertype;
+ return 0;
+}
+
+static int set_mpls(struct sk_buff *skb, const __be32 *mpls_lse)
+{
+ __be32 *stack;
+ int err;
+
+ err = make_writable(skb, skb->mac_len + MPLS_HLEN);
+ if (unlikely(err))
+ return err;
+
+ stack = (__be32 *)skb_mpls_header(skb);
+ if (skb->ip_summed == CHECKSUM_COMPLETE) {
+ __be32 diff[] = { ~(*stack), *mpls_lse };
+
+ skb->csum = ~csum_partial((char *)diff, sizeof(diff),
+ ~skb->csum);
+ }
+
+ *stack = *mpls_lse;
+
+ return 0;
+}
+
/* remove VLAN header from packet and update csum accordingly. */
static int __pop_vlan_tci(struct sk_buff *skb, __be16 *current_tci)
{
@@ -140,10 +228,12 @@
vlan_set_encap_proto(skb, vhdr);
skb->mac_header += VLAN_HLEN;
+
if (skb_network_offset(skb) < ETH_HLEN)
skb_set_network_header(skb, ETH_HLEN);
- skb_reset_mac_len(skb);
+ /* Update mac_len for subsequent MPLS actions */
+ skb_reset_mac_len(skb);
return 0;
}
@@ -186,6 +276,8 @@
if (!__vlan_put_tag(skb, skb->vlan_proto, current_tag))
return -ENOMEM;
+ /* Update mac_len for subsequent MPLS actions */
+ skb->mac_len += VLAN_HLEN;
if (skb->ip_summed == CHECKSUM_COMPLETE)
skb->csum = csum_add(skb->csum, csum_partial(skb->data
@@ -612,6 +704,10 @@
case OVS_KEY_ATTR_SCTP:
err = set_sctp(skb, nla_data(nested_attr));
break;
+
+ case OVS_KEY_ATTR_MPLS:
+ err = set_mpls(skb, nla_data(nested_attr));
+ break;
}
return err;
@@ -690,6 +786,14 @@
execute_hash(skb, key, a);
break;
+ case OVS_ACTION_ATTR_PUSH_MPLS:
+ err = push_mpls(skb, nla_data(a));
+ break;
+
+ case OVS_ACTION_ATTR_POP_MPLS:
+ err = pop_mpls(skb, nla_get_be16(a));
+ break;
+
case OVS_ACTION_ATTR_PUSH_VLAN:
err = push_vlan(skb, nla_data(a));
if (unlikely(err)) /* skb already freed. */
diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c
index f18302f..688cb9b 100644
--- a/net/openvswitch/datapath.c
+++ b/net/openvswitch/datapath.c
@@ -560,7 +560,7 @@
goto err_flow_free;
err = ovs_nla_copy_actions(a[OVS_PACKET_ATTR_ACTIONS],
- &flow->key, 0, &acts);
+ &flow->key, &acts);
if (err)
goto err_flow_free;
@@ -846,7 +846,7 @@
goto err_kfree_flow;
error = ovs_nla_copy_actions(a[OVS_FLOW_ATTR_ACTIONS], &new_flow->key,
- 0, &acts);
+ &acts);
if (error) {
OVS_NLERR("Flow actions may not be safe on all matching packets.\n");
goto err_kfree_acts;
@@ -953,7 +953,7 @@
return acts;
ovs_flow_mask_key(&masked_key, key, mask);
- error = ovs_nla_copy_actions(a, &masked_key, 0, &acts);
+ error = ovs_nla_copy_actions(a, &masked_key, &acts);
if (error) {
OVS_NLERR("Flow actions may not be safe on all matching packets.\n");
kfree(acts);
diff --git a/net/openvswitch/flow.c b/net/openvswitch/flow.c
index 2b78789..90a2101 100644
--- a/net/openvswitch/flow.c
+++ b/net/openvswitch/flow.c
@@ -32,6 +32,7 @@
#include <linux/if_arp.h>
#include <linux/ip.h>
#include <linux/ipv6.h>
+#include <linux/mpls.h>
#include <linux/sctp.h>
#include <linux/smp.h>
#include <linux/tcp.h>
@@ -42,6 +43,7 @@
#include <net/ip.h>
#include <net/ip_tunnels.h>
#include <net/ipv6.h>
+#include <net/mpls.h>
#include <net/ndisc.h>
#include "datapath.h"
@@ -480,6 +482,7 @@
return -ENOMEM;
skb_reset_network_header(skb);
+ skb_reset_mac_len(skb);
__skb_push(skb, skb->data - skb_mac_header(skb));
/* Network layer. */
@@ -584,6 +587,33 @@
memset(&key->ip, 0, sizeof(key->ip));
memset(&key->ipv4, 0, sizeof(key->ipv4));
}
+ } else if (eth_p_mpls(key->eth.type)) {
+ size_t stack_len = MPLS_HLEN;
+
+ /* In the presence of an MPLS label stack the end of the L2
+ * header and the beginning of the L3 header differ.
+ *
+ * Advance network_header to the beginning of the L3
+ * header. mac_len corresponds to the end of the L2 header.
+ */
+ while (1) {
+ __be32 lse;
+
+ error = check_header(skb, skb->mac_len + stack_len);
+ if (unlikely(error))
+ return 0;
+
+ memcpy(&lse, skb_network_header(skb), MPLS_HLEN);
+
+ if (stack_len == MPLS_HLEN)
+ memcpy(&key->mpls.top_lse, &lse, MPLS_HLEN);
+
+ skb_set_network_header(skb, skb->mac_len + stack_len);
+ if (lse & htonl(MPLS_LS_S_MASK))
+ break;
+
+ stack_len += MPLS_HLEN;
+ }
} else if (key->eth.type == htons(ETH_P_IPV6)) {
int nh_len; /* IPv6 Header + Extensions */
diff --git a/net/openvswitch/flow.h b/net/openvswitch/flow.h
index 7181331..4962bee 100644
--- a/net/openvswitch/flow.h
+++ b/net/openvswitch/flow.h
@@ -102,12 +102,17 @@
__be16 tci; /* 0 if no VLAN, VLAN_TAG_PRESENT set otherwise. */
__be16 type; /* Ethernet frame type. */
} eth;
- struct {
- u8 proto; /* IP protocol or lower 8 bits of ARP opcode. */
- u8 tos; /* IP ToS. */
- u8 ttl; /* IP TTL/hop limit. */
- u8 frag; /* One of OVS_FRAG_TYPE_*. */
- } ip;
+ union {
+ struct {
+ __be32 top_lse; /* top label stack entry */
+ } mpls;
+ struct {
+ u8 proto; /* IP protocol or lower 8 bits of ARP opcode. */
+ u8 tos; /* IP ToS. */
+ u8 ttl; /* IP TTL/hop limit. */
+ u8 frag; /* One of OVS_FRAG_TYPE_*. */
+ } ip;
+ };
struct {
__be16 src; /* TCP/UDP/SCTP source port. */
__be16 dst; /* TCP/UDP/SCTP destination port. */
diff --git a/net/openvswitch/flow_netlink.c b/net/openvswitch/flow_netlink.c
index 939bcb3..569309c 100644
--- a/net/openvswitch/flow_netlink.c
+++ b/net/openvswitch/flow_netlink.c
@@ -46,6 +46,7 @@
#include <net/ip.h>
#include <net/ipv6.h>
#include <net/ndisc.h>
+#include <net/mpls.h>
#include "flow_netlink.h"
@@ -134,7 +135,8 @@
| (1 << OVS_KEY_ATTR_ICMP)
| (1 << OVS_KEY_ATTR_ICMPV6)
| (1 << OVS_KEY_ATTR_ARP)
- | (1 << OVS_KEY_ATTR_ND));
+ | (1 << OVS_KEY_ATTR_ND)
+ | (1 << OVS_KEY_ATTR_MPLS));
/* Always allowed mask fields. */
mask_allowed |= ((1 << OVS_KEY_ATTR_TUNNEL)
@@ -149,6 +151,12 @@
mask_allowed |= 1 << OVS_KEY_ATTR_ARP;
}
+ if (eth_p_mpls(match->key->eth.type)) {
+ key_expected |= 1 << OVS_KEY_ATTR_MPLS;
+ if (match->mask && (match->mask->key.eth.type == htons(0xffff)))
+ mask_allowed |= 1 << OVS_KEY_ATTR_MPLS;
+ }
+
if (match->key->eth.type == htons(ETH_P_IP)) {
key_expected |= 1 << OVS_KEY_ATTR_IPV4;
if (match->mask && (match->mask->key.eth.type == htons(0xffff)))
@@ -266,6 +274,7 @@
[OVS_KEY_ATTR_RECIRC_ID] = sizeof(u32),
[OVS_KEY_ATTR_DP_HASH] = sizeof(u32),
[OVS_KEY_ATTR_TUNNEL] = -1,
+ [OVS_KEY_ATTR_MPLS] = sizeof(struct ovs_key_mpls),
};
static bool is_all_zero(const u8 *fp, size_t size)
@@ -735,6 +744,16 @@
attrs &= ~(1 << OVS_KEY_ATTR_ARP);
}
+ if (attrs & (1 << OVS_KEY_ATTR_MPLS)) {
+ const struct ovs_key_mpls *mpls_key;
+
+ mpls_key = nla_data(a[OVS_KEY_ATTR_MPLS]);
+ SW_FLOW_KEY_PUT(match, mpls.top_lse,
+ mpls_key->mpls_lse, is_mask);
+
+ attrs &= ~(1 << OVS_KEY_ATTR_MPLS);
+ }
+
if (attrs & (1 << OVS_KEY_ATTR_TCP)) {
const struct ovs_key_tcp *tcp_key;
@@ -1140,6 +1159,14 @@
arp_key->arp_op = htons(output->ip.proto);
ether_addr_copy(arp_key->arp_sha, output->ipv4.arp.sha);
ether_addr_copy(arp_key->arp_tha, output->ipv4.arp.tha);
+ } else if (eth_p_mpls(swkey->eth.type)) {
+ struct ovs_key_mpls *mpls_key;
+
+ nla = nla_reserve(skb, OVS_KEY_ATTR_MPLS, sizeof(*mpls_key));
+ if (!nla)
+ goto nla_put_failure;
+ mpls_key = nla_data(nla);
+ mpls_key->mpls_lse = output->mpls.top_lse;
}
if ((swkey->eth.type == htons(ETH_P_IP) ||
@@ -1336,9 +1363,15 @@
a->nla_len = sfa->actions_len - st_offset;
}
+static int ovs_nla_copy_actions__(const struct nlattr *attr,
+ const struct sw_flow_key *key,
+ int depth, struct sw_flow_actions **sfa,
+ __be16 eth_type, __be16 vlan_tci);
+
static int validate_and_copy_sample(const struct nlattr *attr,
const struct sw_flow_key *key, int depth,
- struct sw_flow_actions **sfa)
+ struct sw_flow_actions **sfa,
+ __be16 eth_type, __be16 vlan_tci)
{
const struct nlattr *attrs[OVS_SAMPLE_ATTR_MAX + 1];
const struct nlattr *probability, *actions;
@@ -1375,7 +1408,8 @@
if (st_acts < 0)
return st_acts;
- err = ovs_nla_copy_actions(actions, key, depth + 1, sfa);
+ err = ovs_nla_copy_actions__(actions, key, depth + 1, sfa,
+ eth_type, vlan_tci);
if (err)
return err;
@@ -1385,10 +1419,10 @@
return 0;
}
-static int validate_tp_port(const struct sw_flow_key *flow_key)
+static int validate_tp_port(const struct sw_flow_key *flow_key,
+ __be16 eth_type)
{
- if ((flow_key->eth.type == htons(ETH_P_IP) ||
- flow_key->eth.type == htons(ETH_P_IPV6)) &&
+ if ((eth_type == htons(ETH_P_IP) || eth_type == htons(ETH_P_IPV6)) &&
(flow_key->tp.src || flow_key->tp.dst))
return 0;
@@ -1483,7 +1517,7 @@
static int validate_set(const struct nlattr *a,
const struct sw_flow_key *flow_key,
struct sw_flow_actions **sfa,
- bool *set_tun)
+ bool *set_tun, __be16 eth_type)
{
const struct nlattr *ovs_key = nla_data(a);
int key_type = nla_type(ovs_key);
@@ -1508,6 +1542,9 @@
break;
case OVS_KEY_ATTR_TUNNEL:
+ if (eth_p_mpls(eth_type))
+ return -EINVAL;
+
*set_tun = true;
err = validate_and_copy_set_tun(a, sfa);
if (err)
@@ -1515,7 +1552,7 @@
break;
case OVS_KEY_ATTR_IPV4:
- if (flow_key->eth.type != htons(ETH_P_IP))
+ if (eth_type != htons(ETH_P_IP))
return -EINVAL;
if (!flow_key->ip.proto)
@@ -1531,7 +1568,7 @@
break;
case OVS_KEY_ATTR_IPV6:
- if (flow_key->eth.type != htons(ETH_P_IPV6))
+ if (eth_type != htons(ETH_P_IPV6))
return -EINVAL;
if (!flow_key->ip.proto)
@@ -1553,19 +1590,24 @@
if (flow_key->ip.proto != IPPROTO_TCP)
return -EINVAL;
- return validate_tp_port(flow_key);
+ return validate_tp_port(flow_key, eth_type);
case OVS_KEY_ATTR_UDP:
if (flow_key->ip.proto != IPPROTO_UDP)
return -EINVAL;
- return validate_tp_port(flow_key);
+ return validate_tp_port(flow_key, eth_type);
+
+ case OVS_KEY_ATTR_MPLS:
+ if (!eth_p_mpls(eth_type))
+ return -EINVAL;
+ break;
case OVS_KEY_ATTR_SCTP:
if (flow_key->ip.proto != IPPROTO_SCTP)
return -EINVAL;
- return validate_tp_port(flow_key);
+ return validate_tp_port(flow_key, eth_type);
default:
return -EINVAL;
@@ -1609,12 +1651,13 @@
return 0;
}
-int ovs_nla_copy_actions(const struct nlattr *attr,
- const struct sw_flow_key *key,
- int depth,
- struct sw_flow_actions **sfa)
+static int ovs_nla_copy_actions__(const struct nlattr *attr,
+ const struct sw_flow_key *key,
+ int depth, struct sw_flow_actions **sfa,
+ __be16 eth_type, __be16 vlan_tci)
{
const struct nlattr *a;
+ bool out_tnl_port = false;
int rem, err;
if (depth >= SAMPLE_ACTION_DEPTH)
@@ -1626,6 +1669,8 @@
[OVS_ACTION_ATTR_OUTPUT] = sizeof(u32),
[OVS_ACTION_ATTR_RECIRC] = sizeof(u32),
[OVS_ACTION_ATTR_USERSPACE] = (u32)-1,
+ [OVS_ACTION_ATTR_PUSH_MPLS] = sizeof(struct ovs_action_push_mpls),
+ [OVS_ACTION_ATTR_POP_MPLS] = sizeof(__be16),
[OVS_ACTION_ATTR_PUSH_VLAN] = sizeof(struct ovs_action_push_vlan),
[OVS_ACTION_ATTR_POP_VLAN] = 0,
[OVS_ACTION_ATTR_SET] = (u32)-1,
@@ -1655,6 +1700,8 @@
case OVS_ACTION_ATTR_OUTPUT:
if (nla_get_u32(a) >= DP_MAX_PORTS)
return -EINVAL;
+ out_tnl_port = false;
+
break;
case OVS_ACTION_ATTR_HASH: {
@@ -1671,6 +1718,7 @@
}
case OVS_ACTION_ATTR_POP_VLAN:
+ vlan_tci = htons(0);
break;
case OVS_ACTION_ATTR_PUSH_VLAN:
@@ -1679,19 +1727,66 @@
return -EINVAL;
if (!(vlan->vlan_tci & htons(VLAN_TAG_PRESENT)))
return -EINVAL;
+ vlan_tci = vlan->vlan_tci;
break;
case OVS_ACTION_ATTR_RECIRC:
break;
+ case OVS_ACTION_ATTR_PUSH_MPLS: {
+ const struct ovs_action_push_mpls *mpls = nla_data(a);
+
+ /* Networking stack does not allow simultaneous tunnel
+ * and MPLS GSO.
+ */
+ if (out_tnl_port)
+ return -EINVAL;
+
+ if (!eth_p_mpls(mpls->mpls_ethertype))
+ return -EINVAL;
+ /* Only permit push MPLS for a whitelist of packet
+ * types that have a known tag order.
+ */
+ if (vlan_tci & htons(VLAN_TAG_PRESENT) ||
+ (eth_type != htons(ETH_P_IP) &&
+ eth_type != htons(ETH_P_IPV6) &&
+ eth_type != htons(ETH_P_ARP) &&
+ eth_type != htons(ETH_P_RARP) &&
+ !eth_p_mpls(eth_type)))
+ return -EINVAL;
+ eth_type = mpls->mpls_ethertype;
+ break;
+ }
+
+ case OVS_ACTION_ATTR_POP_MPLS:
+ if (vlan_tci & htons(VLAN_TAG_PRESENT) ||
+ !eth_p_mpls(eth_type))
+ return -EINVAL;
+
+ /* Disallow subsequent L2.5+ set and mpls_pop actions
+ * as there is no check here to ensure that the new
+ * eth_type is valid and thus set actions could
+ * write off the end of the packet or otherwise
+ * corrupt it.
+ *
+ * Support for these actions is planned using packet
+ * recirculation.
+ */
+ eth_type = htons(0);
+ break;
+
case OVS_ACTION_ATTR_SET:
- err = validate_set(a, key, sfa, &skip_copy);
+ err = validate_set(a, key, sfa,
+ &out_tnl_port, eth_type);
if (err)
return err;
+
+ skip_copy = out_tnl_port;
break;
case OVS_ACTION_ATTR_SAMPLE:
- err = validate_and_copy_sample(a, key, depth, sfa);
+ err = validate_and_copy_sample(a, key, depth, sfa,
+ eth_type, vlan_tci);
if (err)
return err;
skip_copy = true;
@@ -1713,6 +1808,14 @@
return 0;
}
+int ovs_nla_copy_actions(const struct nlattr *attr,
+ const struct sw_flow_key *key,
+ struct sw_flow_actions **sfa)
+{
+ return ovs_nla_copy_actions__(attr, key, 0, sfa, key->eth.type,
+ key->eth.tci);
+}
+
static int sample_action_to_attr(const struct nlattr *attr, struct sk_buff *skb)
{
const struct nlattr *a;
diff --git a/net/openvswitch/flow_netlink.h b/net/openvswitch/flow_netlink.h
index 206e45a..6355b1d 100644
--- a/net/openvswitch/flow_netlink.h
+++ b/net/openvswitch/flow_netlink.h
@@ -49,7 +49,7 @@
const struct nlattr *);
int ovs_nla_copy_actions(const struct nlattr *attr,
- const struct sw_flow_key *key, int depth,
+ const struct sw_flow_key *key,
struct sw_flow_actions **sfa);
int ovs_nla_put_actions(const struct nlattr *attr,
int len, struct sk_buff *skb);