bpf: make programs see skb->data == L2 for ingress and egress

eBPF programs attached to ingress and egress qdiscs see inconsistent skb->data.
For ingress L2 header is already pulled, whereas for egress it's present.
This is known to program writers which are currently forced to use
BPF_LL_OFF workaround.
Since programs don't change skb internal pointers it is safe to do
pull/push right around invocation of the program and earlier taps and
later pt->func() will not be affected.
Multiple taps via packet_rcv(), tpacket_rcv() are doing the same trick
around run_filter/BPF_PROG_RUN even if skb_shared.

This fix finally allows programs to use optimized LD_ABS/IND instructions
without BPF_LL_OFF for higher performance.
tc ingress + cls_bpf + samples/bpf/tcbpf1_kern.o
       w/o JIT   w/JIT
before  20.5     23.6 Mpps
after   21.8     26.6 Mpps

Old programs with BPF_LL_OFF will still work as-is.

We can now undo most of the earlier workaround commit:
a166151cbe33 ("bpf: fix bpf helpers to use skb->mac_header relative offsets")

Signed-off-by: Alexei Starovoitov <ast@plumgrid.com>
Acked-by: Jamal Hadi Salim <jhs@mojatatu.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
diff --git a/samples/bpf/tcbpf1_kern.c b/samples/bpf/tcbpf1_kern.c
index 7c27710..9bfb2eb 100644
--- a/samples/bpf/tcbpf1_kern.c
+++ b/samples/bpf/tcbpf1_kern.c
@@ -21,7 +21,7 @@
 
 static inline void set_ip_tos(struct __sk_buff *skb, __u8 new_tos)
 {
-	__u8 old_tos = load_byte(skb, BPF_LL_OFF + TOS_OFF);
+	__u8 old_tos = load_byte(skb, TOS_OFF);
 
 	bpf_l3_csum_replace(skb, IP_CSUM_OFF, htons(old_tos), htons(new_tos), 2);
 	bpf_skb_store_bytes(skb, TOS_OFF, &new_tos, sizeof(new_tos), 0);
@@ -34,7 +34,7 @@
 
 static inline void set_tcp_ip_src(struct __sk_buff *skb, __u32 new_ip)
 {
-	__u32 old_ip = _htonl(load_word(skb, BPF_LL_OFF + IP_SRC_OFF));
+	__u32 old_ip = _htonl(load_word(skb, IP_SRC_OFF));
 
 	bpf_l4_csum_replace(skb, TCP_CSUM_OFF, old_ip, new_ip, IS_PSEUDO | sizeof(new_ip));
 	bpf_l3_csum_replace(skb, IP_CSUM_OFF, old_ip, new_ip, sizeof(new_ip));
@@ -44,7 +44,7 @@
 #define TCP_DPORT_OFF (ETH_HLEN + sizeof(struct iphdr) + offsetof(struct tcphdr, dest))
 static inline void set_tcp_dest_port(struct __sk_buff *skb, __u16 new_port)
 {
-	__u16 old_port = htons(load_half(skb, BPF_LL_OFF + TCP_DPORT_OFF));
+	__u16 old_port = htons(load_half(skb, TCP_DPORT_OFF));
 
 	bpf_l4_csum_replace(skb, TCP_CSUM_OFF, old_port, new_port, sizeof(new_port));
 	bpf_skb_store_bytes(skb, TCP_DPORT_OFF, &new_port, sizeof(new_port), 0);
@@ -53,7 +53,7 @@
 SEC("classifier")
 int bpf_prog1(struct __sk_buff *skb)
 {
-	__u8 proto = load_byte(skb, BPF_LL_OFF + ETH_HLEN + offsetof(struct iphdr, protocol));
+	__u8 proto = load_byte(skb, ETH_HLEN + offsetof(struct iphdr, protocol));
 	long *value;
 
 	if (proto == IPPROTO_TCP) {