Merge branch 'master' of git://git.kernel.org/pub/scm/linux/kernel/git/kaber/nf-next-2.6
diff --git a/include/linux/list_nulls.h b/include/linux/list_nulls.h
index 93150ec..5d10ae3 100644
--- a/include/linux/list_nulls.h
+++ b/include/linux/list_nulls.h
@@ -56,6 +56,18 @@
return is_a_nulls(h->first);
}
+static inline void hlist_nulls_add_head(struct hlist_nulls_node *n,
+ struct hlist_nulls_head *h)
+{
+ struct hlist_nulls_node *first = h->first;
+
+ n->next = first;
+ n->pprev = &h->first;
+ h->first = n;
+ if (!is_a_nulls(first))
+ first->pprev = &n->next;
+}
+
static inline void __hlist_nulls_del(struct hlist_nulls_node *n)
{
struct hlist_nulls_node *next = n->next;
@@ -65,6 +77,12 @@
next->pprev = pprev;
}
+static inline void hlist_nulls_del(struct hlist_nulls_node *n)
+{
+ __hlist_nulls_del(n);
+ n->pprev = LIST_POISON2;
+}
+
/**
* hlist_nulls_for_each_entry - iterate over list of given type
* @tpos: the type * to use as a loop cursor.
diff --git a/include/net/netfilter/nf_conntrack.h b/include/net/netfilter/nf_conntrack.h
index ecc79f9..a632689 100644
--- a/include/net/netfilter/nf_conntrack.h
+++ b/include/net/netfilter/nf_conntrack.h
@@ -201,6 +201,8 @@
__nf_conntrack_find(struct net *net, const struct nf_conntrack_tuple *tuple);
extern void nf_conntrack_hash_insert(struct nf_conn *ct);
+extern void nf_ct_delete_from_lists(struct nf_conn *ct);
+extern void nf_ct_insert_dying_list(struct nf_conn *ct);
extern void nf_conntrack_flush_report(struct net *net, u32 pid, int report);
diff --git a/include/net/netfilter/nf_conntrack_ecache.h b/include/net/netfilter/nf_conntrack_ecache.h
index 1afb907..4f20d58 100644
--- a/include/net/netfilter/nf_conntrack_ecache.h
+++ b/include/net/netfilter/nf_conntrack_ecache.h
@@ -6,61 +6,54 @@
#define _NF_CONNTRACK_ECACHE_H
#include <net/netfilter/nf_conntrack.h>
-#include <linux/interrupt.h>
#include <net/net_namespace.h>
#include <net/netfilter/nf_conntrack_expect.h>
+#include <linux/netfilter/nf_conntrack_common.h>
+#include <linux/netfilter/nf_conntrack_tuple_common.h>
+#include <net/netfilter/nf_conntrack_extend.h>
-/* Connection tracking event bits */
+/* Connection tracking event types */
enum ip_conntrack_events
{
- /* New conntrack */
- IPCT_NEW_BIT = 0,
- IPCT_NEW = (1 << IPCT_NEW_BIT),
-
- /* Expected connection */
- IPCT_RELATED_BIT = 1,
- IPCT_RELATED = (1 << IPCT_RELATED_BIT),
-
- /* Destroyed conntrack */
- IPCT_DESTROY_BIT = 2,
- IPCT_DESTROY = (1 << IPCT_DESTROY_BIT),
-
- /* Status has changed */
- IPCT_STATUS_BIT = 3,
- IPCT_STATUS = (1 << IPCT_STATUS_BIT),
-
- /* Update of protocol info */
- IPCT_PROTOINFO_BIT = 4,
- IPCT_PROTOINFO = (1 << IPCT_PROTOINFO_BIT),
-
- /* New helper for conntrack */
- IPCT_HELPER_BIT = 5,
- IPCT_HELPER = (1 << IPCT_HELPER_BIT),
-
- /* Mark is set */
- IPCT_MARK_BIT = 6,
- IPCT_MARK = (1 << IPCT_MARK_BIT),
-
- /* NAT sequence adjustment */
- IPCT_NATSEQADJ_BIT = 7,
- IPCT_NATSEQADJ = (1 << IPCT_NATSEQADJ_BIT),
-
- /* Secmark is set */
- IPCT_SECMARK_BIT = 8,
- IPCT_SECMARK = (1 << IPCT_SECMARK_BIT),
+ IPCT_NEW = 0, /* new conntrack */
+ IPCT_RELATED = 1, /* related conntrack */
+ IPCT_DESTROY = 2, /* destroyed conntrack */
+ IPCT_STATUS = 3, /* status has changed */
+ IPCT_PROTOINFO = 4, /* protocol information has changed */
+ IPCT_HELPER = 5, /* new helper has been set */
+ IPCT_MARK = 6, /* new mark has been set */
+ IPCT_NATSEQADJ = 7, /* NAT is doing sequence adjustment */
+ IPCT_SECMARK = 8, /* new security mark has been set */
};
enum ip_conntrack_expect_events {
- IPEXP_NEW_BIT = 0,
- IPEXP_NEW = (1 << IPEXP_NEW_BIT),
+ IPEXP_NEW = 0, /* new expectation */
+};
+
+struct nf_conntrack_ecache {
+ unsigned long cache; /* bitops want long */
+ unsigned long missed; /* missed events */
+ u32 pid; /* netlink pid of destroyer */
+};
+
+static inline struct nf_conntrack_ecache *
+nf_ct_ecache_find(const struct nf_conn *ct)
+{
+ return nf_ct_ext_find(ct, NF_CT_EXT_ECACHE);
+}
+
+static inline struct nf_conntrack_ecache *
+nf_ct_ecache_ext_add(struct nf_conn *ct, gfp_t gfp)
+{
+ struct net *net = nf_ct_net(ct);
+
+ if (!net->ct.sysctl_events)
+ return NULL;
+
+ return nf_ct_ext_add(ct, NF_CT_EXT_ECACHE, gfp);
};
#ifdef CONFIG_NF_CONNTRACK_EVENTS
-struct nf_conntrack_ecache {
- struct nf_conn *ct;
- unsigned int events;
-};
-
/* This structure is passed to event handler */
struct nf_ct_event {
struct nf_conn *ct;
@@ -76,53 +69,88 @@
extern int nf_conntrack_register_notifier(struct nf_ct_event_notifier *nb);
extern void nf_conntrack_unregister_notifier(struct nf_ct_event_notifier *nb);
-extern void nf_ct_deliver_cached_events(const struct nf_conn *ct);
-extern void __nf_ct_event_cache_init(struct nf_conn *ct);
-extern void nf_ct_event_cache_flush(struct net *net);
+extern void nf_ct_deliver_cached_events(struct nf_conn *ct);
static inline void
nf_conntrack_event_cache(enum ip_conntrack_events event, struct nf_conn *ct)
{
- struct net *net = nf_ct_net(ct);
- struct nf_conntrack_ecache *ecache;
+ struct nf_conntrack_ecache *e;
- local_bh_disable();
- ecache = per_cpu_ptr(net->ct.ecache, raw_smp_processor_id());
- if (ct != ecache->ct)
- __nf_ct_event_cache_init(ct);
- ecache->events |= event;
- local_bh_enable();
+ if (nf_conntrack_event_cb == NULL)
+ return;
+
+ e = nf_ct_ecache_find(ct);
+ if (e == NULL)
+ return;
+
+ set_bit(event, &e->cache);
}
-static inline void
-nf_conntrack_event_report(enum ip_conntrack_events event,
- struct nf_conn *ct,
- u32 pid,
- int report)
+static inline int
+nf_conntrack_eventmask_report(unsigned int eventmask,
+ struct nf_conn *ct,
+ u32 pid,
+ int report)
{
+ int ret = 0;
+ struct net *net = nf_ct_net(ct);
struct nf_ct_event_notifier *notify;
+ struct nf_conntrack_ecache *e;
rcu_read_lock();
notify = rcu_dereference(nf_conntrack_event_cb);
if (notify == NULL)
goto out_unlock;
+ if (!net->ct.sysctl_events)
+ goto out_unlock;
+
+ e = nf_ct_ecache_find(ct);
+ if (e == NULL)
+ goto out_unlock;
+
if (nf_ct_is_confirmed(ct) && !nf_ct_is_dying(ct)) {
struct nf_ct_event item = {
.ct = ct,
- .pid = pid,
+ .pid = e->pid ? e->pid : pid,
.report = report
};
- notify->fcn(event, &item);
+ /* This is a resent of a destroy event? If so, skip missed */
+ unsigned long missed = e->pid ? 0 : e->missed;
+
+ ret = notify->fcn(eventmask | missed, &item);
+ if (unlikely(ret < 0 || missed)) {
+ spin_lock_bh(&ct->lock);
+ if (ret < 0) {
+ /* This is a destroy event that has been
+ * triggered by a process, we store the PID
+ * to include it in the retransmission. */
+ if (eventmask & (1 << IPCT_DESTROY) &&
+ e->pid == 0 && pid != 0)
+ e->pid = pid;
+ else
+ e->missed |= eventmask;
+ } else
+ e->missed &= ~missed;
+ spin_unlock_bh(&ct->lock);
+ }
}
out_unlock:
rcu_read_unlock();
+ return ret;
}
-static inline void
+static inline int
+nf_conntrack_event_report(enum ip_conntrack_events event, struct nf_conn *ct,
+ u32 pid, int report)
+{
+ return nf_conntrack_eventmask_report(1 << event, ct, pid, report);
+}
+
+static inline int
nf_conntrack_event(enum ip_conntrack_events event, struct nf_conn *ct)
{
- nf_conntrack_event_report(event, ct, 0, 0);
+ return nf_conntrack_eventmask_report(1 << event, ct, 0, 0);
}
struct nf_exp_event {
@@ -145,6 +173,7 @@
u32 pid,
int report)
{
+ struct net *net = nf_ct_exp_net(exp);
struct nf_exp_event_notifier *notify;
rcu_read_lock();
@@ -152,13 +181,16 @@
if (notify == NULL)
goto out_unlock;
+ if (!net->ct.sysctl_events)
+ goto out_unlock;
+
{
struct nf_exp_event item = {
.exp = exp,
.pid = pid,
.report = report
};
- notify->fcn(event, &item);
+ notify->fcn(1 << event, &item);
}
out_unlock:
rcu_read_unlock();
@@ -178,12 +210,16 @@
static inline void nf_conntrack_event_cache(enum ip_conntrack_events event,
struct nf_conn *ct) {}
-static inline void nf_conntrack_event(enum ip_conntrack_events event,
- struct nf_conn *ct) {}
-static inline void nf_conntrack_event_report(enum ip_conntrack_events event,
- struct nf_conn *ct,
- u32 pid,
- int report) {}
+static inline int nf_conntrack_eventmask_report(unsigned int eventmask,
+ struct nf_conn *ct,
+ u32 pid,
+ int report) { return 0; }
+static inline int nf_conntrack_event(enum ip_conntrack_events event,
+ struct nf_conn *ct) { return 0; }
+static inline int nf_conntrack_event_report(enum ip_conntrack_events event,
+ struct nf_conn *ct,
+ u32 pid,
+ int report) { return 0; }
static inline void nf_ct_deliver_cached_events(const struct nf_conn *ct) {}
static inline void nf_ct_expect_event(enum ip_conntrack_expect_events event,
struct nf_conntrack_expect *exp) {}
@@ -191,7 +227,6 @@
struct nf_conntrack_expect *exp,
u32 pid,
int report) {}
-static inline void nf_ct_event_cache_flush(struct net *net) {}
static inline int nf_conntrack_ecache_init(struct net *net)
{
diff --git a/include/net/netfilter/nf_conntrack_extend.h b/include/net/netfilter/nf_conntrack_extend.h
index da8ee52..7f8fc5d 100644
--- a/include/net/netfilter/nf_conntrack_extend.h
+++ b/include/net/netfilter/nf_conntrack_extend.h
@@ -8,12 +8,14 @@
NF_CT_EXT_HELPER,
NF_CT_EXT_NAT,
NF_CT_EXT_ACCT,
+ NF_CT_EXT_ECACHE,
NF_CT_EXT_NUM,
};
#define NF_CT_EXT_HELPER_TYPE struct nf_conn_help
#define NF_CT_EXT_NAT_TYPE struct nf_conn_nat
#define NF_CT_EXT_ACCT_TYPE struct nf_conn_counter
+#define NF_CT_EXT_ECACHE_TYPE struct nf_conntrack_ecache
/* Extensions: optional stuff which isn't permanently in struct. */
struct nf_ct_ext {
diff --git a/include/net/netfilter/nf_conntrack_helper.h b/include/net/netfilter/nf_conntrack_helper.h
index ee2a4b3..1b70680 100644
--- a/include/net/netfilter/nf_conntrack_helper.h
+++ b/include/net/netfilter/nf_conntrack_helper.h
@@ -50,6 +50,8 @@
extern int __nf_ct_try_assign_helper(struct nf_conn *ct, gfp_t flags);
+extern void nf_ct_helper_destroy(struct nf_conn *ct);
+
static inline struct nf_conn_help *nfct_help(const struct nf_conn *ct)
{
return nf_ct_ext_find(ct, NF_CT_EXT_HELPER);
diff --git a/include/net/netns/conntrack.h b/include/net/netns/conntrack.h
index 9dc5840..ba1ba0c 100644
--- a/include/net/netns/conntrack.h
+++ b/include/net/netns/conntrack.h
@@ -14,16 +14,17 @@
struct hlist_nulls_head *hash;
struct hlist_head *expect_hash;
struct hlist_nulls_head unconfirmed;
+ struct hlist_nulls_head dying;
struct ip_conntrack_stat *stat;
-#ifdef CONFIG_NF_CONNTRACK_EVENTS
- struct nf_conntrack_ecache *ecache;
-#endif
+ int sysctl_events;
+ unsigned int sysctl_events_retry_timeout;
int sysctl_acct;
int sysctl_checksum;
unsigned int sysctl_log_invalid; /* Log invalid packets */
#ifdef CONFIG_SYSCTL
struct ctl_table_header *sysctl_header;
struct ctl_table_header *acct_sysctl_header;
+ struct ctl_table_header *event_sysctl_header;
#endif
int hash_vmalloc;
int expect_vmalloc;
diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
index edf9569..5f72b94 100644
--- a/net/netfilter/nf_conntrack_core.c
+++ b/net/netfilter/nf_conntrack_core.c
@@ -39,6 +39,7 @@
#include <net/netfilter/nf_conntrack_core.h>
#include <net/netfilter/nf_conntrack_extend.h>
#include <net/netfilter/nf_conntrack_acct.h>
+#include <net/netfilter/nf_conntrack_ecache.h>
#include <net/netfilter/nf_nat.h>
#include <net/netfilter/nf_nat_core.h>
@@ -182,10 +183,6 @@
NF_CT_ASSERT(atomic_read(&nfct->use) == 0);
NF_CT_ASSERT(!timer_pending(&ct->timeout));
- if (!test_bit(IPS_DYING_BIT, &ct->status))
- nf_conntrack_event(IPCT_DESTROY, ct);
- set_bit(IPS_DYING_BIT, &ct->status);
-
/* To make sure we don't get any weird locking issues here:
* destroy_conntrack() MUST NOT be called with a write lock
* to nf_conntrack_lock!!! -HW */
@@ -219,27 +216,70 @@
nf_conntrack_free(ct);
}
-static void death_by_timeout(unsigned long ul_conntrack)
+void nf_ct_delete_from_lists(struct nf_conn *ct)
{
- struct nf_conn *ct = (void *)ul_conntrack;
struct net *net = nf_ct_net(ct);
- struct nf_conn_help *help = nfct_help(ct);
- struct nf_conntrack_helper *helper;
- if (help) {
- rcu_read_lock();
- helper = rcu_dereference(help->helper);
- if (helper && helper->destroy)
- helper->destroy(ct);
- rcu_read_unlock();
- }
-
+ nf_ct_helper_destroy(ct);
spin_lock_bh(&nf_conntrack_lock);
/* Inside lock so preempt is disabled on module removal path.
* Otherwise we can get spurious warnings. */
NF_CT_STAT_INC(net, delete_list);
clean_from_lists(ct);
spin_unlock_bh(&nf_conntrack_lock);
+}
+EXPORT_SYMBOL_GPL(nf_ct_delete_from_lists);
+
+static void death_by_event(unsigned long ul_conntrack)
+{
+ struct nf_conn *ct = (void *)ul_conntrack;
+ struct net *net = nf_ct_net(ct);
+
+ if (nf_conntrack_event(IPCT_DESTROY, ct) < 0) {
+ /* bad luck, let's retry again */
+ ct->timeout.expires = jiffies +
+ (random32() % net->ct.sysctl_events_retry_timeout);
+ add_timer(&ct->timeout);
+ return;
+ }
+ /* we've got the event delivered, now it's dying */
+ set_bit(IPS_DYING_BIT, &ct->status);
+ spin_lock(&nf_conntrack_lock);
+ hlist_nulls_del(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode);
+ spin_unlock(&nf_conntrack_lock);
+ nf_ct_put(ct);
+}
+
+void nf_ct_insert_dying_list(struct nf_conn *ct)
+{
+ struct net *net = nf_ct_net(ct);
+
+ /* add this conntrack to the dying list */
+ spin_lock_bh(&nf_conntrack_lock);
+ hlist_nulls_add_head(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode,
+ &net->ct.dying);
+ spin_unlock_bh(&nf_conntrack_lock);
+ /* set a new timer to retry event delivery */
+ setup_timer(&ct->timeout, death_by_event, (unsigned long)ct);
+ ct->timeout.expires = jiffies +
+ (random32() % net->ct.sysctl_events_retry_timeout);
+ add_timer(&ct->timeout);
+}
+EXPORT_SYMBOL_GPL(nf_ct_insert_dying_list);
+
+static void death_by_timeout(unsigned long ul_conntrack)
+{
+ struct nf_conn *ct = (void *)ul_conntrack;
+
+ if (!test_bit(IPS_DYING_BIT, &ct->status) &&
+ unlikely(nf_conntrack_event(IPCT_DESTROY, ct) < 0)) {
+ /* destroy event was not delivered */
+ nf_ct_delete_from_lists(ct);
+ nf_ct_insert_dying_list(ct);
+ return;
+ }
+ set_bit(IPS_DYING_BIT, &ct->status);
+ nf_ct_delete_from_lists(ct);
nf_ct_put(ct);
}
@@ -577,6 +617,7 @@
}
nf_ct_acct_ext_add(ct, GFP_ATOMIC);
+ nf_ct_ecache_ext_add(ct, GFP_ATOMIC);
spin_lock_bh(&nf_conntrack_lock);
exp = nf_ct_find_expectation(net, tuple);
@@ -807,8 +848,6 @@
NF_CT_ASSERT(ct->timeout.data == (unsigned long)ct);
NF_CT_ASSERT(skb);
- spin_lock_bh(&nf_conntrack_lock);
-
/* Only update if this is not a fixed timeout */
if (test_bit(IPS_FIXED_TIMEOUT_BIT, &ct->status))
goto acct;
@@ -822,11 +861,8 @@
/* Only update the timeout if the new timeout is at least
HZ jiffies from the old timeout. Need del_timer for race
avoidance (may already be dying). */
- if (newtime - ct->timeout.expires >= HZ
- && del_timer(&ct->timeout)) {
- ct->timeout.expires = newtime;
- add_timer(&ct->timeout);
- }
+ if (newtime - ct->timeout.expires >= HZ)
+ mod_timer_pending(&ct->timeout, newtime);
}
acct:
@@ -835,13 +871,13 @@
acct = nf_conn_acct_find(ct);
if (acct) {
+ spin_lock_bh(&ct->lock);
acct[CTINFO2DIR(ctinfo)].packets++;
acct[CTINFO2DIR(ctinfo)].bytes +=
skb->len - skb_network_offset(skb);
+ spin_unlock_bh(&ct->lock);
}
}
-
- spin_unlock_bh(&nf_conntrack_lock);
}
EXPORT_SYMBOL_GPL(__nf_ct_refresh_acct);
@@ -853,14 +889,14 @@
if (do_acct) {
struct nf_conn_counter *acct;
- spin_lock_bh(&nf_conntrack_lock);
acct = nf_conn_acct_find(ct);
if (acct) {
+ spin_lock_bh(&ct->lock);
acct[CTINFO2DIR(ctinfo)].packets++;
acct[CTINFO2DIR(ctinfo)].bytes +=
skb->len - skb_network_offset(skb);
+ spin_unlock_bh(&ct->lock);
}
- spin_unlock_bh(&nf_conntrack_lock);
}
if (del_timer(&ct->timeout)) {
@@ -994,11 +1030,13 @@
{
struct __nf_ct_flush_report *fr = (struct __nf_ct_flush_report *)data;
- /* get_next_corpse sets the dying bit for us */
- nf_conntrack_event_report(IPCT_DESTROY,
- i,
- fr->pid,
- fr->report);
+ /* If we fail to deliver the event, death_by_timeout() will retry */
+ if (nf_conntrack_event_report(IPCT_DESTROY, i,
+ fr->pid, fr->report) < 0)
+ return 1;
+
+ /* Avoid the delivery of the destroy event in death_by_timeout(). */
+ set_bit(IPS_DYING_BIT, &i->status);
return 1;
}
@@ -1027,6 +1065,21 @@
}
EXPORT_SYMBOL_GPL(nf_conntrack_flush_report);
+static void nf_ct_release_dying_list(void)
+{
+ struct nf_conntrack_tuple_hash *h;
+ struct nf_conn *ct;
+ struct hlist_nulls_node *n;
+
+ spin_lock_bh(&nf_conntrack_lock);
+ hlist_nulls_for_each_entry(h, n, &init_net.ct.dying, hnnode) {
+ ct = nf_ct_tuplehash_to_ctrack(h);
+ /* never fails to remove them, no listeners at this point */
+ nf_ct_kill(ct);
+ }
+ spin_unlock_bh(&nf_conntrack_lock);
+}
+
static void nf_conntrack_cleanup_init_net(void)
{
nf_conntrack_helper_fini();
@@ -1036,10 +1089,9 @@
static void nf_conntrack_cleanup_net(struct net *net)
{
- nf_ct_event_cache_flush(net);
- nf_conntrack_ecache_fini(net);
i_see_dead_people:
nf_ct_iterate_cleanup(net, kill_all, NULL);
+ nf_ct_release_dying_list();
if (atomic_read(&net->ct.count) != 0) {
schedule();
goto i_see_dead_people;
@@ -1050,6 +1102,7 @@
nf_ct_free_hashtable(net->ct.hash, net->ct.hash_vmalloc,
nf_conntrack_htable_size);
+ nf_conntrack_ecache_fini(net);
nf_conntrack_acct_fini(net);
nf_conntrack_expect_fini(net);
free_percpu(net->ct.stat);
@@ -1220,14 +1273,12 @@
atomic_set(&net->ct.count, 0);
INIT_HLIST_NULLS_HEAD(&net->ct.unconfirmed, 0);
+ INIT_HLIST_NULLS_HEAD(&net->ct.dying, 0);
net->ct.stat = alloc_percpu(struct ip_conntrack_stat);
if (!net->ct.stat) {
ret = -ENOMEM;
goto err_stat;
}
- ret = nf_conntrack_ecache_init(net);
- if (ret < 0)
- goto err_ecache;
net->ct.hash = nf_ct_alloc_hashtable(&nf_conntrack_htable_size,
&net->ct.hash_vmalloc, 1);
if (!net->ct.hash) {
@@ -1241,6 +1292,9 @@
ret = nf_conntrack_acct_init(net);
if (ret < 0)
goto err_acct;
+ ret = nf_conntrack_ecache_init(net);
+ if (ret < 0)
+ goto err_ecache;
/* Set up fake conntrack:
- to never be deleted, not in any hashes */
@@ -1253,14 +1307,14 @@
return 0;
+err_ecache:
+ nf_conntrack_acct_fini(net);
err_acct:
nf_conntrack_expect_fini(net);
err_expect:
nf_ct_free_hashtable(net->ct.hash, net->ct.hash_vmalloc,
nf_conntrack_htable_size);
err_hash:
- nf_conntrack_ecache_fini(net);
-err_ecache:
free_percpu(net->ct.stat);
err_stat:
return ret;
diff --git a/net/netfilter/nf_conntrack_ecache.c b/net/netfilter/nf_conntrack_ecache.c
index 5516b3e..aee560b 100644
--- a/net/netfilter/nf_conntrack_ecache.c
+++ b/net/netfilter/nf_conntrack_ecache.c
@@ -21,6 +21,7 @@
#include <net/netfilter/nf_conntrack.h>
#include <net/netfilter/nf_conntrack_core.h>
+#include <net/netfilter/nf_conntrack_extend.h>
static DEFINE_MUTEX(nf_ct_ecache_mutex);
@@ -32,94 +33,51 @@
/* deliver cached events and clear cache entry - must be called with locally
* disabled softirqs */
-static inline void
-__nf_ct_deliver_cached_events(struct nf_conntrack_ecache *ecache)
+void nf_ct_deliver_cached_events(struct nf_conn *ct)
{
+ unsigned long events;
struct nf_ct_event_notifier *notify;
+ struct nf_conntrack_ecache *e;
rcu_read_lock();
notify = rcu_dereference(nf_conntrack_event_cb);
if (notify == NULL)
goto out_unlock;
- if (nf_ct_is_confirmed(ecache->ct) && !nf_ct_is_dying(ecache->ct)
- && ecache->events) {
+ e = nf_ct_ecache_find(ct);
+ if (e == NULL)
+ goto out_unlock;
+
+ events = xchg(&e->cache, 0);
+
+ if (nf_ct_is_confirmed(ct) && !nf_ct_is_dying(ct) && events) {
struct nf_ct_event item = {
- .ct = ecache->ct,
+ .ct = ct,
.pid = 0,
.report = 0
};
+ int ret;
+ /* We make a copy of the missed event cache without taking
+ * the lock, thus we may send missed events twice. However,
+ * this does not harm and it happens very rarely. */
+ unsigned long missed = e->missed;
- notify->fcn(ecache->events, &item);
+ ret = notify->fcn(events | missed, &item);
+ if (unlikely(ret < 0 || missed)) {
+ spin_lock_bh(&ct->lock);
+ if (ret < 0)
+ e->missed |= events;
+ else
+ e->missed &= ~missed;
+ spin_unlock_bh(&ct->lock);
+ }
}
- ecache->events = 0;
- nf_ct_put(ecache->ct);
- ecache->ct = NULL;
-
out_unlock:
rcu_read_unlock();
}
-
-/* Deliver all cached events for a particular conntrack. This is called
- * by code prior to async packet handling for freeing the skb */
-void nf_ct_deliver_cached_events(const struct nf_conn *ct)
-{
- struct net *net = nf_ct_net(ct);
- struct nf_conntrack_ecache *ecache;
-
- local_bh_disable();
- ecache = per_cpu_ptr(net->ct.ecache, raw_smp_processor_id());
- if (ecache->ct == ct)
- __nf_ct_deliver_cached_events(ecache);
- local_bh_enable();
-}
EXPORT_SYMBOL_GPL(nf_ct_deliver_cached_events);
-/* Deliver cached events for old pending events, if current conntrack != old */
-void __nf_ct_event_cache_init(struct nf_conn *ct)
-{
- struct net *net = nf_ct_net(ct);
- struct nf_conntrack_ecache *ecache;
-
- /* take care of delivering potentially old events */
- ecache = per_cpu_ptr(net->ct.ecache, raw_smp_processor_id());
- BUG_ON(ecache->ct == ct);
- if (ecache->ct)
- __nf_ct_deliver_cached_events(ecache);
- /* initialize for this conntrack/packet */
- ecache->ct = ct;
- nf_conntrack_get(&ct->ct_general);
-}
-EXPORT_SYMBOL_GPL(__nf_ct_event_cache_init);
-
-/* flush the event cache - touches other CPU's data and must not be called
- * while packets are still passing through the code */
-void nf_ct_event_cache_flush(struct net *net)
-{
- struct nf_conntrack_ecache *ecache;
- int cpu;
-
- for_each_possible_cpu(cpu) {
- ecache = per_cpu_ptr(net->ct.ecache, cpu);
- if (ecache->ct)
- nf_ct_put(ecache->ct);
- }
-}
-
-int nf_conntrack_ecache_init(struct net *net)
-{
- net->ct.ecache = alloc_percpu(struct nf_conntrack_ecache);
- if (!net->ct.ecache)
- return -ENOMEM;
- return 0;
-}
-
-void nf_conntrack_ecache_fini(struct net *net)
-{
- free_percpu(net->ct.ecache);
-}
-
int nf_conntrack_register_notifier(struct nf_ct_event_notifier *new)
{
int ret = 0;
@@ -185,3 +143,118 @@
mutex_unlock(&nf_ct_ecache_mutex);
}
EXPORT_SYMBOL_GPL(nf_ct_expect_unregister_notifier);
+
+#define NF_CT_EVENTS_DEFAULT 1
+static int nf_ct_events __read_mostly = NF_CT_EVENTS_DEFAULT;
+static int nf_ct_events_retry_timeout __read_mostly = 15*HZ;
+
+#ifdef CONFIG_SYSCTL
+static struct ctl_table event_sysctl_table[] = {
+ {
+ .ctl_name = CTL_UNNUMBERED,
+ .procname = "nf_conntrack_events",
+ .data = &init_net.ct.sysctl_events,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .ctl_name = CTL_UNNUMBERED,
+ .procname = "nf_conntrack_events_retry_timeout",
+ .data = &init_net.ct.sysctl_events_retry_timeout,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_jiffies,
+ },
+ {}
+};
+#endif /* CONFIG_SYSCTL */
+
+static struct nf_ct_ext_type event_extend __read_mostly = {
+ .len = sizeof(struct nf_conntrack_ecache),
+ .align = __alignof__(struct nf_conntrack_ecache),
+ .id = NF_CT_EXT_ECACHE,
+};
+
+#ifdef CONFIG_SYSCTL
+static int nf_conntrack_event_init_sysctl(struct net *net)
+{
+ struct ctl_table *table;
+
+ table = kmemdup(event_sysctl_table, sizeof(event_sysctl_table),
+ GFP_KERNEL);
+ if (!table)
+ goto out;
+
+ table[0].data = &net->ct.sysctl_events;
+ table[1].data = &net->ct.sysctl_events_retry_timeout;
+
+ net->ct.event_sysctl_header =
+ register_net_sysctl_table(net,
+ nf_net_netfilter_sysctl_path, table);
+ if (!net->ct.event_sysctl_header) {
+ printk(KERN_ERR "nf_ct_event: can't register to sysctl.\n");
+ goto out_register;
+ }
+ return 0;
+
+out_register:
+ kfree(table);
+out:
+ return -ENOMEM;
+}
+
+static void nf_conntrack_event_fini_sysctl(struct net *net)
+{
+ struct ctl_table *table;
+
+ table = net->ct.event_sysctl_header->ctl_table_arg;
+ unregister_net_sysctl_table(net->ct.event_sysctl_header);
+ kfree(table);
+}
+#else
+static int nf_conntrack_event_init_sysctl(struct net *net)
+{
+ return 0;
+}
+
+static void nf_conntrack_event_fini_sysctl(struct net *net)
+{
+}
+#endif /* CONFIG_SYSCTL */
+
+int nf_conntrack_ecache_init(struct net *net)
+{
+ int ret;
+
+ net->ct.sysctl_events = nf_ct_events;
+ net->ct.sysctl_events_retry_timeout = nf_ct_events_retry_timeout;
+
+ if (net_eq(net, &init_net)) {
+ ret = nf_ct_extend_register(&event_extend);
+ if (ret < 0) {
+ printk(KERN_ERR "nf_ct_event: Unable to register "
+ "event extension.\n");
+ goto out_extend_register;
+ }
+ }
+
+ ret = nf_conntrack_event_init_sysctl(net);
+ if (ret < 0)
+ goto out_sysctl;
+
+ return 0;
+
+out_sysctl:
+ if (net_eq(net, &init_net))
+ nf_ct_extend_unregister(&event_extend);
+out_extend_register:
+ return ret;
+}
+
+void nf_conntrack_ecache_fini(struct net *net)
+{
+ nf_conntrack_event_fini_sysctl(net);
+ if (net_eq(net, &init_net))
+ nf_ct_extend_unregister(&event_extend);
+}
diff --git a/net/netfilter/nf_conntrack_helper.c b/net/netfilter/nf_conntrack_helper.c
index 0fa5a42..65c2a7b 100644
--- a/net/netfilter/nf_conntrack_helper.c
+++ b/net/netfilter/nf_conntrack_helper.c
@@ -136,6 +136,20 @@
return 0;
}
+void nf_ct_helper_destroy(struct nf_conn *ct)
+{
+ struct nf_conn_help *help = nfct_help(ct);
+ struct nf_conntrack_helper *helper;
+
+ if (help) {
+ rcu_read_lock();
+ helper = rcu_dereference(help->helper);
+ if (helper && helper->destroy)
+ helper->destroy(ct);
+ rcu_read_unlock();
+ }
+}
+
int nf_conntrack_helper_register(struct nf_conntrack_helper *me)
{
unsigned int h = helper_hash(&me->tuple);
diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c
index 4e503ad..49479d1 100644
--- a/net/netfilter/nf_conntrack_netlink.c
+++ b/net/netfilter/nf_conntrack_netlink.c
@@ -463,15 +463,16 @@
struct sk_buff *skb;
unsigned int type;
unsigned int flags = 0, group;
+ int err;
/* ignore our fake conntrack entry */
if (ct == &nf_conntrack_untracked)
return 0;
- if (events & IPCT_DESTROY) {
+ if (events & (1 << IPCT_DESTROY)) {
type = IPCTNL_MSG_CT_DELETE;
group = NFNLGRP_CONNTRACK_DESTROY;
- } else if (events & (IPCT_NEW | IPCT_RELATED)) {
+ } else if (events & ((1 << IPCT_NEW) | (1 << IPCT_RELATED))) {
type = IPCTNL_MSG_CT_NEW;
flags = NLM_F_CREATE|NLM_F_EXCL;
group = NFNLGRP_CONNTRACK_NEW;
@@ -519,7 +520,7 @@
if (ctnetlink_dump_status(skb, ct) < 0)
goto nla_put_failure;
- if (events & IPCT_DESTROY) {
+ if (events & (1 << IPCT_DESTROY)) {
if (ctnetlink_dump_counters(skb, ct, IP_CT_DIR_ORIGINAL) < 0 ||
ctnetlink_dump_counters(skb, ct, IP_CT_DIR_REPLY) < 0)
goto nla_put_failure;
@@ -527,38 +528,41 @@
if (ctnetlink_dump_timeout(skb, ct) < 0)
goto nla_put_failure;
- if (events & IPCT_PROTOINFO
+ if (events & (1 << IPCT_PROTOINFO)
&& ctnetlink_dump_protoinfo(skb, ct) < 0)
goto nla_put_failure;
- if ((events & IPCT_HELPER || nfct_help(ct))
+ if ((events & (1 << IPCT_HELPER) || nfct_help(ct))
&& ctnetlink_dump_helpinfo(skb, ct) < 0)
goto nla_put_failure;
#ifdef CONFIG_NF_CONNTRACK_SECMARK
- if ((events & IPCT_SECMARK || ct->secmark)
+ if ((events & (1 << IPCT_SECMARK) || ct->secmark)
&& ctnetlink_dump_secmark(skb, ct) < 0)
goto nla_put_failure;
#endif
- if (events & IPCT_RELATED &&
+ if (events & (1 << IPCT_RELATED) &&
ctnetlink_dump_master(skb, ct) < 0)
goto nla_put_failure;
- if (events & IPCT_NATSEQADJ &&
+ if (events & (1 << IPCT_NATSEQADJ) &&
ctnetlink_dump_nat_seq_adj(skb, ct) < 0)
goto nla_put_failure;
}
#ifdef CONFIG_NF_CONNTRACK_MARK
- if ((events & IPCT_MARK || ct->mark)
+ if ((events & (1 << IPCT_MARK) || ct->mark)
&& ctnetlink_dump_mark(skb, ct) < 0)
goto nla_put_failure;
#endif
rcu_read_unlock();
nlmsg_end(skb, nlh);
- nfnetlink_send(skb, item->pid, group, item->report, GFP_ATOMIC);
+ err = nfnetlink_send(skb, item->pid, group, item->report, GFP_ATOMIC);
+ if (err == -ENOBUFS || err == -EAGAIN)
+ return -ENOBUFS;
+
return 0;
nla_put_failure:
@@ -798,10 +802,15 @@
}
}
- nf_conntrack_event_report(IPCT_DESTROY,
- ct,
- NETLINK_CB(skb).pid,
- nlmsg_report(nlh));
+ if (nf_conntrack_event_report(IPCT_DESTROY, ct,
+ NETLINK_CB(skb).pid,
+ nlmsg_report(nlh)) < 0) {
+ nf_ct_delete_from_lists(ct);
+ /* we failed to report the event, try later */
+ nf_ct_insert_dying_list(ct);
+ nf_ct_put(ct);
+ return 0;
+ }
/* death_by_timeout would report the event again */
set_bit(IPS_DYING_BIT, &ct->status);
@@ -1253,6 +1262,7 @@
}
nf_ct_acct_ext_add(ct, GFP_ATOMIC);
+ nf_ct_ecache_ext_add(ct, GFP_ATOMIC);
#if defined(CONFIG_NF_CONNTRACK_MARK)
if (cda[CTA_MARK])
@@ -1340,13 +1350,13 @@
else
events = IPCT_NEW;
- nf_conntrack_event_report(IPCT_STATUS |
- IPCT_HELPER |
- IPCT_PROTOINFO |
- IPCT_NATSEQADJ |
- IPCT_MARK | events,
- ct, NETLINK_CB(skb).pid,
- nlmsg_report(nlh));
+ nf_conntrack_eventmask_report((1 << IPCT_STATUS) |
+ (1 << IPCT_HELPER) |
+ (1 << IPCT_PROTOINFO) |
+ (1 << IPCT_NATSEQADJ) |
+ (1 << IPCT_MARK) | events,
+ ct, NETLINK_CB(skb).pid,
+ nlmsg_report(nlh));
nf_ct_put(ct);
} else
spin_unlock_bh(&nf_conntrack_lock);
@@ -1365,13 +1375,13 @@
if (err == 0) {
nf_conntrack_get(&ct->ct_general);
spin_unlock_bh(&nf_conntrack_lock);
- nf_conntrack_event_report(IPCT_STATUS |
- IPCT_HELPER |
- IPCT_PROTOINFO |
- IPCT_NATSEQADJ |
- IPCT_MARK,
- ct, NETLINK_CB(skb).pid,
- nlmsg_report(nlh));
+ nf_conntrack_eventmask_report((1 << IPCT_STATUS) |
+ (1 << IPCT_HELPER) |
+ (1 << IPCT_PROTOINFO) |
+ (1 << IPCT_NATSEQADJ) |
+ (1 << IPCT_MARK),
+ ct, NETLINK_CB(skb).pid,
+ nlmsg_report(nlh));
nf_ct_put(ct);
} else
spin_unlock_bh(&nf_conntrack_lock);
@@ -1515,7 +1525,7 @@
unsigned int type;
int flags = 0;
- if (events & IPEXP_NEW) {
+ if (events & (1 << IPEXP_NEW)) {
type = IPCTNL_MSG_EXP_NEW;
flags = NLM_F_CREATE|NLM_F_EXCL;
} else
diff --git a/net/netfilter/nf_log.c b/net/netfilter/nf_log.c
index beb3731..2fefe14 100644
--- a/net/netfilter/nf_log.c
+++ b/net/netfilter/nf_log.c
@@ -248,14 +248,14 @@
rcu_assign_pointer(nf_loggers[tindex], logger);
mutex_unlock(&nf_log_mutex);
} else {
- rcu_read_lock();
- logger = rcu_dereference(nf_loggers[tindex]);
+ mutex_lock(&nf_log_mutex);
+ logger = nf_loggers[tindex];
if (!logger)
table->data = "NONE";
else
table->data = logger->name;
r = proc_dostring(table, write, filp, buffer, lenp, ppos);
- rcu_read_unlock();
+ mutex_unlock(&nf_log_mutex);
}
return r;
diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c
index 46dba5f..025d1a0 100644
--- a/net/netfilter/x_tables.c
+++ b/net/netfilter/x_tables.c
@@ -364,14 +364,14 @@
* ebt_among is exempt from centralized matchsize checking
* because it uses a dynamic-size data set.
*/
- printk("%s_tables: %s match: invalid size %Zu != %u\n",
+ pr_err("%s_tables: %s match: invalid size %Zu != %u\n",
xt_prefix[par->family], par->match->name,
XT_ALIGN(par->match->matchsize), size);
return -EINVAL;
}
if (par->match->table != NULL &&
strcmp(par->match->table, par->table) != 0) {
- printk("%s_tables: %s match: only valid in %s table, not %s\n",
+ pr_err("%s_tables: %s match: only valid in %s table, not %s\n",
xt_prefix[par->family], par->match->name,
par->match->table, par->table);
return -EINVAL;
@@ -379,7 +379,7 @@
if (par->match->hooks && (par->hook_mask & ~par->match->hooks) != 0) {
char used[64], allow[64];
- printk("%s_tables: %s match: used from hooks %s, but only "
+ pr_err("%s_tables: %s match: used from hooks %s, but only "
"valid from %s\n",
xt_prefix[par->family], par->match->name,
textify_hooks(used, sizeof(used), par->hook_mask),
@@ -387,7 +387,7 @@
return -EINVAL;
}
if (par->match->proto && (par->match->proto != proto || inv_proto)) {
- printk("%s_tables: %s match: only valid for protocol %u\n",
+ pr_err("%s_tables: %s match: only valid for protocol %u\n",
xt_prefix[par->family], par->match->name,
par->match->proto);
return -EINVAL;
@@ -514,14 +514,14 @@
unsigned int size, u_int8_t proto, bool inv_proto)
{
if (XT_ALIGN(par->target->targetsize) != size) {
- printk("%s_tables: %s target: invalid size %Zu != %u\n",
+ pr_err("%s_tables: %s target: invalid size %Zu != %u\n",
xt_prefix[par->family], par->target->name,
XT_ALIGN(par->target->targetsize), size);
return -EINVAL;
}
if (par->target->table != NULL &&
strcmp(par->target->table, par->table) != 0) {
- printk("%s_tables: %s target: only valid in %s table, not %s\n",
+ pr_err("%s_tables: %s target: only valid in %s table, not %s\n",
xt_prefix[par->family], par->target->name,
par->target->table, par->table);
return -EINVAL;
@@ -529,7 +529,7 @@
if (par->target->hooks && (par->hook_mask & ~par->target->hooks) != 0) {
char used[64], allow[64];
- printk("%s_tables: %s target: used from hooks %s, but only "
+ pr_err("%s_tables: %s target: used from hooks %s, but only "
"usable from %s\n",
xt_prefix[par->family], par->target->name,
textify_hooks(used, sizeof(used), par->hook_mask),
@@ -537,7 +537,7 @@
return -EINVAL;
}
if (par->target->proto && (par->target->proto != proto || inv_proto)) {
- printk("%s_tables: %s target: only valid for protocol %u\n",
+ pr_err("%s_tables: %s target: only valid for protocol %u\n",
xt_prefix[par->family], par->target->name,
par->target->proto);
return -EINVAL;