net dst: use a percpu_counter to track entries
struct dst_ops tracks number of allocated dst in an atomic_t field,
subject to high cache line contention in stress workload.
Switch to a percpu_counter, to reduce number of time we need to dirty a
central location. Place it on a separate cache line to avoid dirtying
read only fields.
Stress test :
(Sending 160.000.000 UDP frames,
IP route cache disabled, dual E5540 @2.53GHz,
32bit kernel, FIB_TRIE, SLUB/NUMA)
Before:
real 0m51.179s
user 0m15.329s
sys 10m15.942s
After:
real 0m45.570s
user 0m15.525s
sys 9m56.669s
With a small reordering of struct neighbour fields, subject of a
following patch, (to separate refcnt from other read mostly fields)
real 0m41.841s
user 0m15.261s
sys 8m45.949s
Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 17e2179..25661f9 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -109,7 +109,6 @@
.link_failure = ip6_link_failure,
.update_pmtu = ip6_rt_update_pmtu,
.local_out = __ip6_local_out,
- .entries = ATOMIC_INIT(0),
};
static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
@@ -122,7 +121,6 @@
.destroy = ip6_dst_destroy,
.check = ip6_dst_check,
.update_pmtu = ip6_rt_blackhole_update_pmtu,
- .entries = ATOMIC_INIT(0),
};
static struct rt6_info ip6_null_entry_template = {
@@ -1058,19 +1056,22 @@
int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity;
int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout;
unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc;
+ int entries;
+ entries = dst_entries_get_fast(ops);
if (time_after(rt_last_gc + rt_min_interval, now) &&
- atomic_read(&ops->entries) <= rt_max_size)
+ entries <= rt_max_size)
goto out;
net->ipv6.ip6_rt_gc_expire++;
fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net);
net->ipv6.ip6_rt_last_gc = now;
- if (atomic_read(&ops->entries) < ops->gc_thresh)
+ entries = dst_entries_get_slow(ops);
+ if (entries < ops->gc_thresh)
net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1;
out:
net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity;
- return atomic_read(&ops->entries) > rt_max_size;
+ return entries > rt_max_size;
}
/* Clean host part of a prefix. Not necessary in radix tree,
@@ -2524,7 +2525,7 @@
net->ipv6.rt6_stats->fib_rt_alloc,
net->ipv6.rt6_stats->fib_rt_entries,
net->ipv6.rt6_stats->fib_rt_cache,
- atomic_read(&net->ipv6.ip6_dst_ops.entries),
+ dst_entries_get_slow(&net->ipv6.ip6_dst_ops),
net->ipv6.rt6_stats->fib_discarded_routes);
return 0;
@@ -2666,11 +2667,14 @@
memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template,
sizeof(net->ipv6.ip6_dst_ops));
+ if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0)
+ goto out_ip6_dst_ops;
+
net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template,
sizeof(*net->ipv6.ip6_null_entry),
GFP_KERNEL);
if (!net->ipv6.ip6_null_entry)
- goto out_ip6_dst_ops;
+ goto out_ip6_dst_entries;
net->ipv6.ip6_null_entry->dst.path =
(struct dst_entry *)net->ipv6.ip6_null_entry;
net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops;
@@ -2720,6 +2724,8 @@
out_ip6_null_entry:
kfree(net->ipv6.ip6_null_entry);
#endif
+out_ip6_dst_entries:
+ dst_entries_destroy(&net->ipv6.ip6_dst_ops);
out_ip6_dst_ops:
goto out;
}
@@ -2758,10 +2764,14 @@
if (!ip6_dst_ops_template.kmem_cachep)
goto out;
- ret = register_pernet_subsys(&ip6_route_net_ops);
+ ret = dst_entries_init(&ip6_dst_blackhole_ops);
if (ret)
goto out_kmem_cache;
+ ret = register_pernet_subsys(&ip6_route_net_ops);
+ if (ret)
+ goto out_dst_entries;
+
ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep;
/* Registering of the loopback is done before this portion of code,
@@ -2808,6 +2818,8 @@
fib6_gc_cleanup();
out_register_subsys:
unregister_pernet_subsys(&ip6_route_net_ops);
+out_dst_entries:
+ dst_entries_destroy(&ip6_dst_blackhole_ops);
out_kmem_cache:
kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
goto out;