[NETFILTER]: connection tracking event notifiers
This adds a notifier chain based event mechanism for ip_conntrack state
changes. As opposed to the previous implementations in patch-o-matic, we
do no longer need a field in the skb to achieve this.
Thanks to the valuable input from Patrick McHardy and Rusty on the idea
of a per_cpu implementation.
Signed-off-by: Harald Welte <laforge@netfilter.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
diff --git a/include/linux/netfilter_ipv4/ip_conntrack.h b/include/linux/netfilter_ipv4/ip_conntrack.h
index 4ed720f..ae1270c 100644
--- a/include/linux/netfilter_ipv4/ip_conntrack.h
+++ b/include/linux/netfilter_ipv4/ip_conntrack.h
@@ -65,6 +65,63 @@
/* Both together */
IPS_NAT_DONE_MASK = (IPS_DST_NAT_DONE | IPS_SRC_NAT_DONE),
+
+ /* Connection is dying (removed from lists), can not be unset. */
+ IPS_DYING_BIT = 9,
+ IPS_DYING = (1 << IPS_DYING_BIT),
+};
+
+/* Connection tracking event bits */
+enum ip_conntrack_events
+{
+ /* New conntrack */
+ IPCT_NEW_BIT = 0,
+ IPCT_NEW = (1 << IPCT_NEW_BIT),
+
+ /* Expected connection */
+ IPCT_RELATED_BIT = 1,
+ IPCT_RELATED = (1 << IPCT_RELATED_BIT),
+
+ /* Destroyed conntrack */
+ IPCT_DESTROY_BIT = 2,
+ IPCT_DESTROY = (1 << IPCT_DESTROY_BIT),
+
+ /* Timer has been refreshed */
+ IPCT_REFRESH_BIT = 3,
+ IPCT_REFRESH = (1 << IPCT_REFRESH_BIT),
+
+ /* Status has changed */
+ IPCT_STATUS_BIT = 4,
+ IPCT_STATUS = (1 << IPCT_STATUS_BIT),
+
+ /* Update of protocol info */
+ IPCT_PROTOINFO_BIT = 5,
+ IPCT_PROTOINFO = (1 << IPCT_PROTOINFO_BIT),
+
+ /* Volatile protocol info */
+ IPCT_PROTOINFO_VOLATILE_BIT = 6,
+ IPCT_PROTOINFO_VOLATILE = (1 << IPCT_PROTOINFO_VOLATILE_BIT),
+
+ /* New helper for conntrack */
+ IPCT_HELPER_BIT = 7,
+ IPCT_HELPER = (1 << IPCT_HELPER_BIT),
+
+ /* Update of helper info */
+ IPCT_HELPINFO_BIT = 8,
+ IPCT_HELPINFO = (1 << IPCT_HELPINFO_BIT),
+
+ /* Volatile helper info */
+ IPCT_HELPINFO_VOLATILE_BIT = 9,
+ IPCT_HELPINFO_VOLATILE = (1 << IPCT_HELPINFO_VOLATILE_BIT),
+
+ /* NAT info */
+ IPCT_NATINFO_BIT = 10,
+ IPCT_NATINFO = (1 << IPCT_NATINFO_BIT),
+};
+
+enum ip_conntrack_expect_events {
+ IPEXP_NEW_BIT = 0,
+ IPEXP_NEW = (1 << IPEXP_NEW_BIT),
};
#ifdef __KERNEL__
@@ -280,6 +337,11 @@
return test_bit(IPS_CONFIRMED_BIT, &ct->status);
}
+static inline int is_dying(struct ip_conntrack *ct)
+{
+ return test_bit(IPS_DYING_BIT, &ct->status);
+}
+
extern unsigned int ip_conntrack_htable_size;
struct ip_conntrack_stat
@@ -303,6 +365,88 @@
#define CONNTRACK_STAT_INC(count) (__get_cpu_var(ip_conntrack_stat).count++)
+#ifdef CONFIG_IP_NF_CONNTRACK_EVENTS
+#include <linux/notifier.h>
+
+struct ip_conntrack_ecache {
+ struct ip_conntrack *ct;
+ unsigned int events;
+};
+DECLARE_PER_CPU(struct ip_conntrack_ecache, ip_conntrack_ecache);
+
+#define CONNTRACK_ECACHE(x) (__get_cpu_var(ip_conntrack_ecache).x)
+
+extern struct notifier_block *ip_conntrack_chain;
+extern struct notifier_block *ip_conntrack_expect_chain;
+
+static inline int ip_conntrack_register_notifier(struct notifier_block *nb)
+{
+ return notifier_chain_register(&ip_conntrack_chain, nb);
+}
+
+static inline int ip_conntrack_unregister_notifier(struct notifier_block *nb)
+{
+ return notifier_chain_unregister(&ip_conntrack_chain, nb);
+}
+
+static inline int
+ip_conntrack_expect_register_notifier(struct notifier_block *nb)
+{
+ return notifier_chain_register(&ip_conntrack_expect_chain, nb);
+}
+
+static inline int
+ip_conntrack_expect_unregister_notifier(struct notifier_block *nb)
+{
+ return notifier_chain_unregister(&ip_conntrack_expect_chain, nb);
+}
+
+static inline void
+ip_conntrack_event_cache(enum ip_conntrack_events event,
+ const struct sk_buff *skb)
+{
+ struct ip_conntrack_ecache *ecache =
+ &__get_cpu_var(ip_conntrack_ecache);
+
+ if (unlikely((struct ip_conntrack *) skb->nfct != ecache->ct)) {
+ if (net_ratelimit()) {
+ printk(KERN_ERR "ctevent: skb->ct != ecache->ct !!!\n");
+ dump_stack();
+ }
+ }
+ ecache->events |= event;
+}
+
+extern void
+ip_conntrack_deliver_cached_events_for(const struct ip_conntrack *ct);
+extern void ip_conntrack_event_cache_init(const struct sk_buff *skb);
+
+static inline void ip_conntrack_event(enum ip_conntrack_events event,
+ struct ip_conntrack *ct)
+{
+ if (is_confirmed(ct) && !is_dying(ct))
+ notifier_call_chain(&ip_conntrack_chain, event, ct);
+}
+
+static inline void
+ip_conntrack_expect_event(enum ip_conntrack_expect_events event,
+ struct ip_conntrack_expect *exp)
+{
+ notifier_call_chain(&ip_conntrack_expect_chain, event, exp);
+}
+#else /* CONFIG_IP_NF_CONNTRACK_EVENTS */
+static inline void ip_conntrack_event_cache(enum ip_conntrack_events event,
+ const struct sk_buff *skb) {}
+static inline void ip_conntrack_event(enum ip_conntrack_events event,
+ struct ip_conntrack *ct) {}
+static inline void ip_conntrack_deliver_cached_events_for(
+ struct ip_conntrack *ct) {}
+static inline void ip_conntrack_event_cache_init(const struct sk_buff *skb) {}
+static inline void
+ip_conntrack_expect_event(enum ip_conntrack_expect_events event,
+ struct ip_conntrack_expect *exp) {}
+#endif /* CONFIG_IP_NF_CONNTRACK_EVENTS */
+
#ifdef CONFIG_IP_NF_NAT_NEEDED
static inline int ip_nat_initialized(struct ip_conntrack *conntrack,
enum ip_nat_manip_type manip)
diff --git a/include/linux/netfilter_ipv4/ip_conntrack_core.h b/include/linux/netfilter_ipv4/ip_conntrack_core.h
index 694aec9..46eeea1 100644
--- a/include/linux/netfilter_ipv4/ip_conntrack_core.h
+++ b/include/linux/netfilter_ipv4/ip_conntrack_core.h
@@ -38,12 +38,21 @@
/* Confirm a connection: returns NF_DROP if packet must be dropped. */
static inline int ip_conntrack_confirm(struct sk_buff **pskb)
{
- if ((*pskb)->nfct
- && !is_confirmed((struct ip_conntrack *)(*pskb)->nfct))
- return __ip_conntrack_confirm(pskb);
- return NF_ACCEPT;
+ struct ip_conntrack *ct = (struct ip_conntrack *)(*pskb)->nfct;
+ int ret = NF_ACCEPT;
+
+ if (ct && !is_confirmed(ct))
+ ret = __ip_conntrack_confirm(pskb);
+ ip_conntrack_deliver_cached_events_for(ct);
+
+ return ret;
}
+#ifdef CONFIG_IP_NF_CONNTRACK_EVENTS
+struct ip_conntrack_ecache;
+extern void __ip_ct_deliver_cached_events(struct ip_conntrack_ecache *ec);
+#endif
+
extern struct list_head *ip_conntrack_hash;
extern struct list_head ip_conntrack_expect_list;
extern rwlock_t ip_conntrack_lock;
diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig
index 46d4cb1..ff3393e 100644
--- a/net/ipv4/netfilter/Kconfig
+++ b/net/ipv4/netfilter/Kconfig
@@ -40,6 +40,16 @@
of packets, but this mark value is kept in the conntrack session
instead of the individual packets.
+config IP_NF_CONNTRACK_EVENTS
+ bool "Connection tracking events"
+ depends on IP_NF_CONNTRACK
+ help
+ If this option is enabled, the connection tracking code will
+ provide a notifier chain that can be used by other kernel code
+ to get notified about changes in the connection tracking state.
+
+ IF unsure, say `N'.
+
config IP_NF_CT_PROTO_SCTP
tristate 'SCTP protocol connection tracking support (EXPERIMENTAL)'
depends on IP_NF_CONNTRACK && EXPERIMENTAL
diff --git a/net/ipv4/netfilter/ip_conntrack_core.c b/net/ipv4/netfilter/ip_conntrack_core.c
index 04c3414..caf89de 100644
--- a/net/ipv4/netfilter/ip_conntrack_core.c
+++ b/net/ipv4/netfilter/ip_conntrack_core.c
@@ -37,6 +37,7 @@
#include <linux/err.h>
#include <linux/percpu.h>
#include <linux/moduleparam.h>
+#include <linux/notifier.h>
/* ip_conntrack_lock protects the main hash table, protocol/helper/expected
registrations, conntrack timers*/
@@ -49,7 +50,7 @@
#include <linux/netfilter_ipv4/ip_conntrack_core.h>
#include <linux/netfilter_ipv4/listhelp.h>
-#define IP_CONNTRACK_VERSION "2.1"
+#define IP_CONNTRACK_VERSION "2.2"
#if 0
#define DEBUGP printk
@@ -76,6 +77,81 @@
static LIST_HEAD(unconfirmed);
static int ip_conntrack_vmalloc;
+#ifdef CONFIG_IP_NF_CONNTRACK_EVENTS
+struct notifier_block *ip_conntrack_chain;
+struct notifier_block *ip_conntrack_expect_chain;
+
+DEFINE_PER_CPU(struct ip_conntrack_ecache, ip_conntrack_ecache);
+
+static inline void __deliver_cached_events(struct ip_conntrack_ecache *ecache)
+{
+ if (is_confirmed(ecache->ct) && !is_dying(ecache->ct) && ecache->events)
+ notifier_call_chain(&ip_conntrack_chain, ecache->events,
+ ecache->ct);
+ ecache->events = 0;
+}
+
+void __ip_ct_deliver_cached_events(struct ip_conntrack_ecache *ecache)
+{
+ __deliver_cached_events(ecache);
+}
+
+/* Deliver all cached events for a particular conntrack. This is called
+ * by code prior to async packet handling or freeing the skb */
+void
+ip_conntrack_deliver_cached_events_for(const struct ip_conntrack *ct)
+{
+ struct ip_conntrack_ecache *ecache =
+ &__get_cpu_var(ip_conntrack_ecache);
+
+ if (!ct)
+ return;
+
+ if (ecache->ct == ct) {
+ DEBUGP("ecache: delivering event for %p\n", ct);
+ __deliver_cached_events(ecache);
+ } else {
+ if (net_ratelimit())
+ printk(KERN_WARNING "ecache: want to deliver for %p, "
+ "but cache has %p\n", ct, ecache->ct);
+ }
+
+ /* signalize that events have already been delivered */
+ ecache->ct = NULL;
+}
+
+/* Deliver cached events for old pending events, if current conntrack != old */
+void ip_conntrack_event_cache_init(const struct sk_buff *skb)
+{
+ struct ip_conntrack *ct = (struct ip_conntrack *) skb->nfct;
+ struct ip_conntrack_ecache *ecache =
+ &__get_cpu_var(ip_conntrack_ecache);
+
+ /* take care of delivering potentially old events */
+ if (ecache->ct != ct) {
+ enum ip_conntrack_info ctinfo;
+ /* we have to check, since at startup the cache is NULL */
+ if (likely(ecache->ct)) {
+ DEBUGP("ecache: entered for different conntrack: "
+ "ecache->ct=%p, skb->nfct=%p. delivering "
+ "events\n", ecache->ct, ct);
+ __deliver_cached_events(ecache);
+ ip_conntrack_put(ecache->ct);
+ } else {
+ DEBUGP("ecache: entered for conntrack %p, "
+ "cache was clean before\n", ct);
+ }
+
+ /* initialize for this conntrack/packet */
+ ecache->ct = ip_conntrack_get(skb, &ctinfo);
+ /* ecache->events cleared by __deliver_cached_devents() */
+ } else {
+ DEBUGP("ecache: re-entered for conntrack %p.\n", ct);
+ }
+}
+
+#endif /* CONFIG_IP_NF_CONNTRACK_EVENTS */
+
DEFINE_PER_CPU(struct ip_conntrack_stat, ip_conntrack_stat);
void
@@ -223,6 +299,8 @@
IP_NF_ASSERT(atomic_read(&nfct->use) == 0);
IP_NF_ASSERT(!timer_pending(&ct->timeout));
+ set_bit(IPS_DYING_BIT, &ct->status);
+
/* To make sure we don't get any weird locking issues here:
* destroy_conntrack() MUST NOT be called with a write lock
* to ip_conntrack_lock!!! -HW */
@@ -261,6 +339,7 @@
{
struct ip_conntrack *ct = (void *)ul_conntrack;
+ ip_conntrack_event(IPCT_DESTROY, ct);
write_lock_bh(&ip_conntrack_lock);
/* Inside lock so preempt is disabled on module removal path.
* Otherwise we can get spurious warnings. */
@@ -374,6 +453,16 @@
set_bit(IPS_CONFIRMED_BIT, &ct->status);
CONNTRACK_STAT_INC(insert);
write_unlock_bh(&ip_conntrack_lock);
+ if (ct->helper)
+ ip_conntrack_event_cache(IPCT_HELPER, *pskb);
+#ifdef CONFIG_IP_NF_NAT_NEEDED
+ if (test_bit(IPS_SRC_NAT_DONE_BIT, &ct->status) ||
+ test_bit(IPS_DST_NAT_DONE_BIT, &ct->status))
+ ip_conntrack_event_cache(IPCT_NATINFO, *pskb);
+#endif
+ ip_conntrack_event_cache(master_ct(ct) ?
+ IPCT_RELATED : IPCT_NEW, *pskb);
+
return NF_ACCEPT;
}
@@ -607,7 +696,7 @@
struct ip_conntrack *ct;
enum ip_conntrack_info ctinfo;
struct ip_conntrack_protocol *proto;
- int set_reply;
+ int set_reply = 0;
int ret;
/* Previously seen (loopback or untracked)? Ignore. */
@@ -666,6 +755,8 @@
IP_NF_ASSERT((*pskb)->nfct);
+ ip_conntrack_event_cache_init(*pskb);
+
ret = proto->packet(ct, *pskb, ctinfo);
if (ret < 0) {
/* Invalid: inverse of the return code tells
@@ -676,8 +767,8 @@
return -ret;
}
- if (set_reply)
- set_bit(IPS_SEEN_REPLY_BIT, &ct->status);
+ if (set_reply && !test_and_set_bit(IPS_SEEN_REPLY_BIT, &ct->status))
+ ip_conntrack_event_cache(IPCT_STATUS, *pskb);
return ret;
}
@@ -824,6 +915,7 @@
evict_oldest_expect(expect->master);
ip_conntrack_expect_insert(expect);
+ ip_conntrack_expect_event(IPEXP_NEW, expect);
ret = 0;
out:
write_unlock_bh(&ip_conntrack_lock);
@@ -861,8 +953,10 @@
static inline int unhelp(struct ip_conntrack_tuple_hash *i,
const struct ip_conntrack_helper *me)
{
- if (tuplehash_to_ctrack(i)->helper == me)
+ if (tuplehash_to_ctrack(i)->helper == me) {
+ ip_conntrack_event(IPCT_HELPER, tuplehash_to_ctrack(i));
tuplehash_to_ctrack(i)->helper = NULL;
+ }
return 0;
}
@@ -924,6 +1018,7 @@
if (del_timer(&ct->timeout)) {
ct->timeout.expires = jiffies + extra_jiffies;
add_timer(&ct->timeout);
+ ip_conntrack_event_cache(IPCT_REFRESH, skb);
}
ct_add_counters(ct, ctinfo, skb);
write_unlock_bh(&ip_conntrack_lock);
@@ -1012,6 +1107,23 @@
ip_conntrack_put(ct);
}
+
+#ifdef CONFIG_IP_NF_CONNTRACK_EVENTS
+ {
+ /* we need to deliver all cached events in order to drop
+ * the reference counts */
+ int cpu;
+ for_each_cpu(cpu) {
+ struct ip_conntrack_ecache *ecache =
+ &per_cpu(ip_conntrack_ecache, cpu);
+ if (ecache->ct) {
+ __ip_ct_deliver_cached_events(ecache);
+ ip_conntrack_put(ecache->ct);
+ ecache->ct = NULL;
+ }
+ }
+ }
+#endif
}
/* Fast function for those who don't want to parse /proc (and I don't
diff --git a/net/ipv4/netfilter/ip_conntrack_ftp.c b/net/ipv4/netfilter/ip_conntrack_ftp.c
index 7a3b773..9658896 100644
--- a/net/ipv4/netfilter/ip_conntrack_ftp.c
+++ b/net/ipv4/netfilter/ip_conntrack_ftp.c
@@ -262,7 +262,8 @@
}
/* We don't update if it's older than what we have. */
-static void update_nl_seq(u32 nl_seq, struct ip_ct_ftp_master *info, int dir)
+static void update_nl_seq(u32 nl_seq, struct ip_ct_ftp_master *info, int dir,
+ struct sk_buff *skb)
{
unsigned int i, oldest = NUM_SEQ_TO_REMEMBER;
@@ -276,10 +277,13 @@
oldest = i;
}
- if (info->seq_aft_nl_num[dir] < NUM_SEQ_TO_REMEMBER)
+ if (info->seq_aft_nl_num[dir] < NUM_SEQ_TO_REMEMBER) {
info->seq_aft_nl[dir][info->seq_aft_nl_num[dir]++] = nl_seq;
- else if (oldest != NUM_SEQ_TO_REMEMBER)
+ ip_conntrack_event_cache(IPCT_HELPINFO_VOLATILE, skb);
+ } else if (oldest != NUM_SEQ_TO_REMEMBER) {
info->seq_aft_nl[dir][oldest] = nl_seq;
+ ip_conntrack_event_cache(IPCT_HELPINFO_VOLATILE, skb);
+ }
}
static int help(struct sk_buff **pskb,
@@ -439,7 +443,7 @@
/* Now if this ends in \n, update ftp info. Seq may have been
* adjusted by NAT code. */
if (ends_in_nl)
- update_nl_seq(seq, ct_ftp_info,dir);
+ update_nl_seq(seq, ct_ftp_info,dir, *pskb);
out:
spin_unlock_bh(&ip_ftp_lock);
return ret;
diff --git a/net/ipv4/netfilter/ip_conntrack_proto_icmp.c b/net/ipv4/netfilter/ip_conntrack_proto_icmp.c
index 602c74db..dca1f63 100644
--- a/net/ipv4/netfilter/ip_conntrack_proto_icmp.c
+++ b/net/ipv4/netfilter/ip_conntrack_proto_icmp.c
@@ -102,6 +102,7 @@
ct->timeout.function((unsigned long)ct);
} else {
atomic_inc(&ct->proto.icmp.count);
+ ip_conntrack_event_cache(IPCT_PROTOINFO_VOLATILE, skb);
ip_ct_refresh_acct(ct, ctinfo, skb, ip_ct_icmp_timeout);
}
diff --git a/net/ipv4/netfilter/ip_conntrack_proto_sctp.c b/net/ipv4/netfilter/ip_conntrack_proto_sctp.c
index 31d7539..3d5f878 100644
--- a/net/ipv4/netfilter/ip_conntrack_proto_sctp.c
+++ b/net/ipv4/netfilter/ip_conntrack_proto_sctp.c
@@ -404,6 +404,8 @@
}
conntrack->proto.sctp.state = newconntrack;
+ if (oldsctpstate != newconntrack)
+ ip_conntrack_event_cache(IPCT_PROTOINFO, skb);
write_unlock_bh(&sctp_lock);
}
diff --git a/net/ipv4/netfilter/ip_conntrack_proto_tcp.c b/net/ipv4/netfilter/ip_conntrack_proto_tcp.c
index 809dfed..a569ad1 100644
--- a/net/ipv4/netfilter/ip_conntrack_proto_tcp.c
+++ b/net/ipv4/netfilter/ip_conntrack_proto_tcp.c
@@ -973,6 +973,10 @@
? ip_ct_tcp_timeout_max_retrans : *tcp_timeouts[new_state];
write_unlock_bh(&tcp_lock);
+ ip_conntrack_event_cache(IPCT_PROTOINFO_VOLATILE, skb);
+ if (new_state != old_state)
+ ip_conntrack_event_cache(IPCT_PROTOINFO, skb);
+
if (!test_bit(IPS_SEEN_REPLY_BIT, &conntrack->status)) {
/* If only reply is a RST, we can consider ourselves not to
have an established connection: this is a fairly common
diff --git a/net/ipv4/netfilter/ip_conntrack_proto_udp.c b/net/ipv4/netfilter/ip_conntrack_proto_udp.c
index 8c1eaba..6066eaf 100644
--- a/net/ipv4/netfilter/ip_conntrack_proto_udp.c
+++ b/net/ipv4/netfilter/ip_conntrack_proto_udp.c
@@ -73,7 +73,8 @@
ip_ct_refresh_acct(conntrack, ctinfo, skb,
ip_ct_udp_timeout_stream);
/* Also, more likely to be important, and not a probe */
- set_bit(IPS_ASSURED_BIT, &conntrack->status);
+ if (!test_and_set_bit(IPS_ASSURED_BIT, &conntrack->status))
+ ip_conntrack_event_cache(IPCT_STATUS, skb);
} else
ip_ct_refresh_acct(conntrack, ctinfo, skb, ip_ct_udp_timeout);
diff --git a/net/ipv4/netfilter/ip_conntrack_standalone.c b/net/ipv4/netfilter/ip_conntrack_standalone.c
index dccd4ab..f088000 100644
--- a/net/ipv4/netfilter/ip_conntrack_standalone.c
+++ b/net/ipv4/netfilter/ip_conntrack_standalone.c
@@ -402,6 +402,7 @@
const struct net_device *out,
int (*okfn)(struct sk_buff *))
{
+ ip_conntrack_event_cache_init(*pskb);
/* We've seen it coming out the other side: confirm it */
return ip_conntrack_confirm(pskb);
}
@@ -419,6 +420,7 @@
ct = ip_conntrack_get(*pskb, &ctinfo);
if (ct && ct->helper) {
unsigned int ret;
+ ip_conntrack_event_cache_init(*pskb);
ret = ct->helper->help(pskb, ct, ctinfo);
if (ret != NF_ACCEPT)
return ret;
@@ -889,6 +891,7 @@
return ret;
cleanup:
+ synchronize_net();
#ifdef CONFIG_SYSCTL
unregister_sysctl_table(ip_ct_sysctl_header);
cleanup_localinops:
@@ -971,6 +974,13 @@
{
}
+#ifdef CONFIG_IP_NF_CONNTRACK_EVENTS
+EXPORT_SYMBOL_GPL(ip_conntrack_chain);
+EXPORT_SYMBOL_GPL(ip_conntrack_expect_chain);
+EXPORT_SYMBOL_GPL(ip_conntrack_register_notifier);
+EXPORT_SYMBOL_GPL(ip_conntrack_unregister_notifier);
+EXPORT_PER_CPU_SYMBOL_GPL(ip_conntrack_ecache);
+#endif
EXPORT_SYMBOL(ip_conntrack_protocol_register);
EXPORT_SYMBOL(ip_conntrack_protocol_unregister);
EXPORT_SYMBOL(ip_ct_get_tuple);