From 056ff3e3bd1563969a311697323ff929df94415c Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Wed, 3 Feb 2010 12:58:06 +0100 Subject: [PATCH 1/9] netfilter: nf_conntrack: fix memory corruption with multiple namespaces As discovered by Jon Masters , the "untracked" conntrack, which is located in the data section, might be accidentally freed when a new namespace is instantiated while the untracked conntrack is attached to a skb because the reference count it re-initialized. The best fix would be to use a seperate untracked conntrack per namespace since it includes a namespace pointer. Unfortunately this is not possible without larger changes since the namespace is not easily available everywhere we need it. For now move the untracked conntrack initialization to the init_net setup function to make sure the reference count is not re-initialized and handle cleanup in the init_net cleanup function to make sure namespaces can exit properly while the untracked conntrack is in use in other namespaces. Signed-off-by: Patrick McHardy --- net/netfilter/nf_conntrack_core.c | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index 0e98c3282d42..37e2b88313f2 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -1113,6 +1113,10 @@ static void nf_ct_release_dying_list(struct net *net) static void nf_conntrack_cleanup_init_net(void) { + /* wait until all references to nf_conntrack_untracked are dropped */ + while (atomic_read(&nf_conntrack_untracked.ct_general.use) > 1) + schedule(); + nf_conntrack_helper_fini(); nf_conntrack_proto_fini(); kmem_cache_destroy(nf_conntrack_cachep); @@ -1127,9 +1131,6 @@ static void nf_conntrack_cleanup_net(struct net *net) schedule(); goto i_see_dead_people; } - /* wait until all references to nf_conntrack_untracked are dropped */ - while (atomic_read(&nf_conntrack_untracked.ct_general.use) > 1) - schedule(); nf_ct_free_hashtable(net->ct.hash, net->ct.hash_vmalloc, nf_conntrack_htable_size); @@ -1288,6 +1289,14 @@ static int nf_conntrack_init_init_net(void) if (ret < 0) goto err_helper; + /* Set up fake conntrack: to never be deleted, not in any hashes */ +#ifdef CONFIG_NET_NS + nf_conntrack_untracked.ct_net = &init_net; +#endif + atomic_set(&nf_conntrack_untracked.ct_general.use, 1); + /* - and look it like as a confirmed connection */ + set_bit(IPS_CONFIRMED_BIT, &nf_conntrack_untracked.status); + return 0; err_helper: @@ -1333,15 +1342,6 @@ static int nf_conntrack_init_net(struct net *net) if (ret < 0) goto err_ecache; - /* Set up fake conntrack: - - to never be deleted, not in any hashes */ -#ifdef CONFIG_NET_NS - nf_conntrack_untracked.ct_net = &init_net; -#endif - atomic_set(&nf_conntrack_untracked.ct_general.use, 1); - /* - and look it like as a confirmed connection */ - set_bit(IPS_CONFIRMED_BIT, &nf_conntrack_untracked.status); - return 0; err_ecache: From ab59b19be78aac65cdd599fb5002c9019885e061 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 4 Feb 2010 14:54:05 +0100 Subject: [PATCH 2/9] netfilter: nf_conntrack: per netns nf_conntrack_cachep nf_conntrack_cachep is currently shared by all netns instances, but because of SLAB_DESTROY_BY_RCU special semantics, this is wrong. If we use a shared slab cache, one object can instantly flight between one hash table (netns ONE) to another one (netns TWO), and concurrent reader (doing a lookup in netns ONE, 'finding' an object of netns TWO) can be fooled without notice, because no RCU grace period has to be observed between object freeing and its reuse. We dont have this problem with UDP/TCP slab caches because TCP/UDP hashtables are global to the machine (and each object has a pointer to its netns). If we use per netns conntrack hash tables, we also *must* use per netns conntrack slab caches, to guarantee an object can not escape from one namespace to another one. Signed-off-by: Eric Dumazet [Patrick: added unique slab name allocation] Signed-off-by: Patrick McHardy --- include/net/netns/conntrack.h | 2 ++ net/netfilter/nf_conntrack_core.c | 39 ++++++++++++++++++------------- 2 files changed, 25 insertions(+), 16 deletions(-) diff --git a/include/net/netns/conntrack.h b/include/net/netns/conntrack.h index ba1ba0c5efd1..aed23b6c8478 100644 --- a/include/net/netns/conntrack.h +++ b/include/net/netns/conntrack.h @@ -11,6 +11,7 @@ struct nf_conntrack_ecache; struct netns_ct { atomic_t count; unsigned int expect_count; + struct kmem_cache *nf_conntrack_cachep; struct hlist_nulls_head *hash; struct hlist_head *expect_hash; struct hlist_nulls_head unconfirmed; @@ -28,5 +29,6 @@ struct netns_ct { #endif int hash_vmalloc; int expect_vmalloc; + char *slabname; }; #endif diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index 37e2b88313f2..9de4bd4c0dd7 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -63,8 +63,6 @@ EXPORT_SYMBOL_GPL(nf_conntrack_max); struct nf_conn nf_conntrack_untracked __read_mostly; EXPORT_SYMBOL_GPL(nf_conntrack_untracked); -static struct kmem_cache *nf_conntrack_cachep __read_mostly; - static int nf_conntrack_hash_rnd_initted; static unsigned int nf_conntrack_hash_rnd; @@ -572,7 +570,7 @@ struct nf_conn *nf_conntrack_alloc(struct net *net, * Do not use kmem_cache_zalloc(), as this cache uses * SLAB_DESTROY_BY_RCU. */ - ct = kmem_cache_alloc(nf_conntrack_cachep, gfp); + ct = kmem_cache_alloc(net->ct.nf_conntrack_cachep, gfp); if (ct == NULL) { pr_debug("nf_conntrack_alloc: Can't alloc conntrack.\n"); atomic_dec(&net->ct.count); @@ -611,7 +609,7 @@ void nf_conntrack_free(struct nf_conn *ct) nf_ct_ext_destroy(ct); atomic_dec(&net->ct.count); nf_ct_ext_free(ct); - kmem_cache_free(nf_conntrack_cachep, ct); + kmem_cache_free(net->ct.nf_conntrack_cachep, ct); } EXPORT_SYMBOL_GPL(nf_conntrack_free); @@ -1119,7 +1117,6 @@ static void nf_conntrack_cleanup_init_net(void) nf_conntrack_helper_fini(); nf_conntrack_proto_fini(); - kmem_cache_destroy(nf_conntrack_cachep); } static void nf_conntrack_cleanup_net(struct net *net) @@ -1137,6 +1134,8 @@ static void nf_conntrack_cleanup_net(struct net *net) nf_conntrack_ecache_fini(net); nf_conntrack_acct_fini(net); nf_conntrack_expect_fini(net); + kmem_cache_destroy(net->ct.nf_conntrack_cachep); + kfree(net->ct.slabname); free_percpu(net->ct.stat); } @@ -1272,15 +1271,6 @@ static int nf_conntrack_init_init_net(void) NF_CONNTRACK_VERSION, nf_conntrack_htable_size, nf_conntrack_max); - nf_conntrack_cachep = kmem_cache_create("nf_conntrack", - sizeof(struct nf_conn), - 0, SLAB_DESTROY_BY_RCU, NULL); - if (!nf_conntrack_cachep) { - printk(KERN_ERR "Unable to create nf_conn slab cache\n"); - ret = -ENOMEM; - goto err_cache; - } - ret = nf_conntrack_proto_init(); if (ret < 0) goto err_proto; @@ -1302,8 +1292,6 @@ static int nf_conntrack_init_init_net(void) err_helper: nf_conntrack_proto_fini(); err_proto: - kmem_cache_destroy(nf_conntrack_cachep); -err_cache: return ret; } @@ -1325,6 +1313,21 @@ static int nf_conntrack_init_net(struct net *net) ret = -ENOMEM; goto err_stat; } + + net->ct.slabname = kasprintf(GFP_KERNEL, "nf_conntrack_%p", net); + if (!net->ct.slabname) { + ret = -ENOMEM; + goto err_slabname; + } + + net->ct.nf_conntrack_cachep = kmem_cache_create(net->ct.slabname, + sizeof(struct nf_conn), 0, + SLAB_DESTROY_BY_RCU, NULL); + if (!net->ct.nf_conntrack_cachep) { + printk(KERN_ERR "Unable to create nf_conn slab cache\n"); + ret = -ENOMEM; + goto err_cache; + } net->ct.hash = nf_ct_alloc_hashtable(&nf_conntrack_htable_size, &net->ct.hash_vmalloc, 1); if (!net->ct.hash) { @@ -1352,6 +1355,10 @@ static int nf_conntrack_init_net(struct net *net) nf_ct_free_hashtable(net->ct.hash, net->ct.hash_vmalloc, nf_conntrack_htable_size); err_hash: + kmem_cache_destroy(net->ct.nf_conntrack_cachep); +err_cache: + kfree(net->ct.slabname); +err_slabname: free_percpu(net->ct.stat); err_stat: return ret; From 38c7233b287481dfb3327dde136801ce500aba58 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Thu, 4 Feb 2010 18:24:06 +0100 Subject: [PATCH 3/9] netfilter: nf_conntrack: restrict runtime expect hashsize modifications Expectation hashtable size was simply glued to a variable with no code to rehash expectations, so it was a bug to allow writing to it. Make "expect_hashsize" readonly. Signed-off-by: Alexey Dobriyan Cc: stable@kernel.org Signed-off-by: Patrick McHardy --- net/netfilter/nf_conntrack_expect.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/netfilter/nf_conntrack_expect.c b/net/netfilter/nf_conntrack_expect.c index fdf5d2a1d9b4..4ad7d1d809af 100644 --- a/net/netfilter/nf_conntrack_expect.c +++ b/net/netfilter/nf_conntrack_expect.c @@ -569,7 +569,7 @@ static void exp_proc_remove(struct net *net) #endif /* CONFIG_PROC_FS */ } -module_param_named(expect_hashsize, nf_ct_expect_hsize, uint, 0600); +module_param_named(expect_hashsize, nf_ct_expect_hsize, uint, 0400); int nf_conntrack_expect_init(struct net *net) { From dab1531a07ad7c5be4ebe715a3d08742f0c638e3 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Mon, 8 Feb 2010 15:44:07 +0100 Subject: [PATCH 4/9] netfilter: xtables: compat out of scope fix As per C99 6.2.4(2) when temporary table data goes out of scope, the behaviour is undefined: if (compat) { struct foo tmp; ... private = &tmp; } [dereference private] Signed-off-by: Alexey Dobriyan Cc: stable@kernel.org Signed-off-by: Patrick McHardy --- net/ipv4/netfilter/arp_tables.c | 4 ++-- net/ipv4/netfilter/ip_tables.c | 4 ++-- net/ipv6/netfilter/ip6_tables.c | 4 ++-- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c index 06632762ba5f..90203e1b9187 100644 --- a/net/ipv4/netfilter/arp_tables.c +++ b/net/ipv4/netfilter/arp_tables.c @@ -925,10 +925,10 @@ static int get_info(struct net *net, void __user *user, int *len, int compat) if (t && !IS_ERR(t)) { struct arpt_getinfo info; const struct xt_table_info *private = t->private; - #ifdef CONFIG_COMPAT + struct xt_table_info tmp; + if (compat) { - struct xt_table_info tmp; ret = compat_table_info(private, &tmp); xt_compat_flush_offsets(NFPROTO_ARP); private = &tmp; diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c index 572330a552ef..3ce53cf13d5a 100644 --- a/net/ipv4/netfilter/ip_tables.c +++ b/net/ipv4/netfilter/ip_tables.c @@ -1132,10 +1132,10 @@ static int get_info(struct net *net, void __user *user, int *len, int compat) if (t && !IS_ERR(t)) { struct ipt_getinfo info; const struct xt_table_info *private = t->private; - #ifdef CONFIG_COMPAT + struct xt_table_info tmp; + if (compat) { - struct xt_table_info tmp; ret = compat_table_info(private, &tmp); xt_compat_flush_offsets(AF_INET); private = &tmp; diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c index 480d7f8c9802..8a7e0f52e177 100644 --- a/net/ipv6/netfilter/ip6_tables.c +++ b/net/ipv6/netfilter/ip6_tables.c @@ -1164,10 +1164,10 @@ static int get_info(struct net *net, void __user *user, int *len, int compat) if (t && !IS_ERR(t)) { struct ip6t_getinfo info; const struct xt_table_info *private = t->private; - #ifdef CONFIG_COMPAT + struct xt_table_info tmp; + if (compat) { - struct xt_table_info tmp; ret = compat_table_info(private, &tmp); xt_compat_flush_offsets(AF_INET6); private = &tmp; From 9ab48ddcb144fdee908708669448dd136cf4894a Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Mon, 8 Feb 2010 17:35:23 +0100 Subject: [PATCH 5/9] netfilter: nf_conntrack: fix hash resizing with namespaces As noticed by Jon Masters , the conntrack hash size is global and not per namespace, but modifiable at runtime through /sys/module/nf_conntrack/hashsize. Changing the hash size will only resize the hash in the current namespace however, so other namespaces will use an invalid hash size. This can cause crashes when enlarging the hashsize, or false negative lookups when shrinking it. Move the hash size into the per-namespace data and only use the global hash size to initialize the per-namespace value when instanciating a new namespace. Additionally restrict hash resizing to init_net for now as other namespaces are not handled currently. Cc: stable@kernel.org Signed-off-by: Patrick McHardy --- include/net/netns/conntrack.h | 1 + include/net/netns/ipv4.h | 1 + .../netfilter/nf_conntrack_l3proto_ipv4.c | 2 +- .../nf_conntrack_l3proto_ipv4_compat.c | 4 +- net/ipv4/netfilter/nf_nat_core.c | 22 ++++---- net/netfilter/nf_conntrack_core.c | 53 ++++++++++--------- net/netfilter/nf_conntrack_expect.c | 2 +- net/netfilter/nf_conntrack_helper.c | 2 +- net/netfilter/nf_conntrack_netlink.c | 2 +- net/netfilter/nf_conntrack_standalone.c | 7 +-- 10 files changed, 49 insertions(+), 47 deletions(-) diff --git a/include/net/netns/conntrack.h b/include/net/netns/conntrack.h index aed23b6c8478..63d449807d9b 100644 --- a/include/net/netns/conntrack.h +++ b/include/net/netns/conntrack.h @@ -11,6 +11,7 @@ struct nf_conntrack_ecache; struct netns_ct { atomic_t count; unsigned int expect_count; + unsigned int htable_size; struct kmem_cache *nf_conntrack_cachep; struct hlist_nulls_head *hash; struct hlist_head *expect_hash; diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index 2eb3814d6258..9a4b8b714079 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -40,6 +40,7 @@ struct netns_ipv4 { struct xt_table *iptable_security; struct xt_table *nat_table; struct hlist_head *nat_bysource; + unsigned int nat_htable_size; int nat_vmalloced; #endif diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c index d171b123a656..d1ea38a7c490 100644 --- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c +++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c @@ -210,7 +210,7 @@ static ctl_table ip_ct_sysctl_table[] = { }, { .procname = "ip_conntrack_buckets", - .data = &nf_conntrack_htable_size, + .data = &init_net.ct.htable_size, .maxlen = sizeof(unsigned int), .mode = 0444, .proc_handler = proc_dointvec, diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c index 8668a3defda6..2fb7b76da94f 100644 --- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c +++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c @@ -32,7 +32,7 @@ static struct hlist_nulls_node *ct_get_first(struct seq_file *seq) struct hlist_nulls_node *n; for (st->bucket = 0; - st->bucket < nf_conntrack_htable_size; + st->bucket < net->ct.htable_size; st->bucket++) { n = rcu_dereference(net->ct.hash[st->bucket].first); if (!is_a_nulls(n)) @@ -50,7 +50,7 @@ static struct hlist_nulls_node *ct_get_next(struct seq_file *seq, head = rcu_dereference(head->next); while (is_a_nulls(head)) { if (likely(get_nulls_value(head) == st->bucket)) { - if (++st->bucket >= nf_conntrack_htable_size) + if (++st->bucket >= net->ct.htable_size) return NULL; } head = rcu_dereference(net->ct.hash[st->bucket].first); diff --git a/net/ipv4/netfilter/nf_nat_core.c b/net/ipv4/netfilter/nf_nat_core.c index fe1a64479dd0..26066a2327ad 100644 --- a/net/ipv4/netfilter/nf_nat_core.c +++ b/net/ipv4/netfilter/nf_nat_core.c @@ -35,9 +35,6 @@ static DEFINE_SPINLOCK(nf_nat_lock); static struct nf_conntrack_l3proto *l3proto __read_mostly; -/* Calculated at init based on memory size */ -static unsigned int nf_nat_htable_size __read_mostly; - #define MAX_IP_NAT_PROTO 256 static const struct nf_nat_protocol *nf_nat_protos[MAX_IP_NAT_PROTO] __read_mostly; @@ -72,7 +69,7 @@ EXPORT_SYMBOL_GPL(nf_nat_proto_put); /* We keep an extra hash for each conntrack, for fast searching. */ static inline unsigned int -hash_by_src(const struct nf_conntrack_tuple *tuple) +hash_by_src(const struct net *net, const struct nf_conntrack_tuple *tuple) { unsigned int hash; @@ -80,7 +77,7 @@ hash_by_src(const struct nf_conntrack_tuple *tuple) hash = jhash_3words((__force u32)tuple->src.u3.ip, (__force u32)tuple->src.u.all, tuple->dst.protonum, 0); - return ((u64)hash * nf_nat_htable_size) >> 32; + return ((u64)hash * net->ipv4.nat_htable_size) >> 32; } /* Is this tuple already taken? (not by us) */ @@ -147,7 +144,7 @@ find_appropriate_src(struct net *net, struct nf_conntrack_tuple *result, const struct nf_nat_range *range) { - unsigned int h = hash_by_src(tuple); + unsigned int h = hash_by_src(net, tuple); const struct nf_conn_nat *nat; const struct nf_conn *ct; const struct hlist_node *n; @@ -330,7 +327,7 @@ nf_nat_setup_info(struct nf_conn *ct, if (have_to_hash) { unsigned int srchash; - srchash = hash_by_src(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple); + srchash = hash_by_src(net, &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple); spin_lock_bh(&nf_nat_lock); /* nf_conntrack_alter_reply might re-allocate exntension aera */ nat = nfct_nat(ct); @@ -679,8 +676,10 @@ nfnetlink_parse_nat_setup(struct nf_conn *ct, static int __net_init nf_nat_net_init(struct net *net) { - net->ipv4.nat_bysource = nf_ct_alloc_hashtable(&nf_nat_htable_size, - &net->ipv4.nat_vmalloced, 0); + /* Leave them the same for the moment. */ + net->ipv4.nat_htable_size = net->ct.htable_size; + net->ipv4.nat_bysource = nf_ct_alloc_hashtable(&net->ipv4.nat_htable_size, + &net->ipv4.nat_vmalloced, 0); if (!net->ipv4.nat_bysource) return -ENOMEM; return 0; @@ -703,7 +702,7 @@ static void __net_exit nf_nat_net_exit(struct net *net) nf_ct_iterate_cleanup(net, &clean_nat, NULL); synchronize_rcu(); nf_ct_free_hashtable(net->ipv4.nat_bysource, net->ipv4.nat_vmalloced, - nf_nat_htable_size); + net->ipv4.nat_htable_size); } static struct pernet_operations nf_nat_net_ops = { @@ -724,9 +723,6 @@ static int __init nf_nat_init(void) return ret; } - /* Leave them the same for the moment. */ - nf_nat_htable_size = nf_conntrack_htable_size; - ret = register_pernet_subsys(&nf_nat_net_ops); if (ret < 0) goto cleanup_extend; diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index 9de4bd4c0dd7..4d79e3c1616c 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -30,6 +30,7 @@ #include #include #include +#include #include #include @@ -84,9 +85,10 @@ static u_int32_t __hash_conntrack(const struct nf_conntrack_tuple *tuple, return ((u64)h * size) >> 32; } -static inline u_int32_t hash_conntrack(const struct nf_conntrack_tuple *tuple) +static inline u_int32_t hash_conntrack(const struct net *net, + const struct nf_conntrack_tuple *tuple) { - return __hash_conntrack(tuple, nf_conntrack_htable_size, + return __hash_conntrack(tuple, net->ct.htable_size, nf_conntrack_hash_rnd); } @@ -294,7 +296,7 @@ __nf_conntrack_find(struct net *net, const struct nf_conntrack_tuple *tuple) { struct nf_conntrack_tuple_hash *h; struct hlist_nulls_node *n; - unsigned int hash = hash_conntrack(tuple); + unsigned int hash = hash_conntrack(net, tuple); /* Disable BHs the entire time since we normally need to disable them * at least once for the stats anyway. @@ -364,10 +366,11 @@ static void __nf_conntrack_hash_insert(struct nf_conn *ct, void nf_conntrack_hash_insert(struct nf_conn *ct) { + struct net *net = nf_ct_net(ct); unsigned int hash, repl_hash; - hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple); - repl_hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_REPLY].tuple); + hash = hash_conntrack(net, &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple); + repl_hash = hash_conntrack(net, &ct->tuplehash[IP_CT_DIR_REPLY].tuple); __nf_conntrack_hash_insert(ct, hash, repl_hash); } @@ -395,8 +398,8 @@ __nf_conntrack_confirm(struct sk_buff *skb) if (CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL) return NF_ACCEPT; - hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple); - repl_hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_REPLY].tuple); + hash = hash_conntrack(net, &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple); + repl_hash = hash_conntrack(net, &ct->tuplehash[IP_CT_DIR_REPLY].tuple); /* We're not in hash table, and we refuse to set up related connections for unconfirmed conns. But packet copies and @@ -466,7 +469,7 @@ nf_conntrack_tuple_taken(const struct nf_conntrack_tuple *tuple, struct net *net = nf_ct_net(ignored_conntrack); struct nf_conntrack_tuple_hash *h; struct hlist_nulls_node *n; - unsigned int hash = hash_conntrack(tuple); + unsigned int hash = hash_conntrack(net, tuple); /* Disable BHs the entire time since we need to disable them at * least once for the stats anyway. @@ -501,7 +504,7 @@ static noinline int early_drop(struct net *net, unsigned int hash) int dropped = 0; rcu_read_lock(); - for (i = 0; i < nf_conntrack_htable_size; i++) { + for (i = 0; i < net->ct.htable_size; i++) { hlist_nulls_for_each_entry_rcu(h, n, &net->ct.hash[hash], hnnode) { tmp = nf_ct_tuplehash_to_ctrack(h); @@ -521,7 +524,7 @@ static noinline int early_drop(struct net *net, unsigned int hash) if (cnt >= NF_CT_EVICTION_RANGE) break; - hash = (hash + 1) % nf_conntrack_htable_size; + hash = (hash + 1) % net->ct.htable_size; } rcu_read_unlock(); @@ -555,7 +558,7 @@ struct nf_conn *nf_conntrack_alloc(struct net *net, if (nf_conntrack_max && unlikely(atomic_read(&net->ct.count) > nf_conntrack_max)) { - unsigned int hash = hash_conntrack(orig); + unsigned int hash = hash_conntrack(net, orig); if (!early_drop(net, hash)) { atomic_dec(&net->ct.count); if (net_ratelimit()) @@ -1012,7 +1015,7 @@ get_next_corpse(struct net *net, int (*iter)(struct nf_conn *i, void *data), struct hlist_nulls_node *n; spin_lock_bh(&nf_conntrack_lock); - for (; *bucket < nf_conntrack_htable_size; (*bucket)++) { + for (; *bucket < net->ct.htable_size; (*bucket)++) { hlist_nulls_for_each_entry(h, n, &net->ct.hash[*bucket], hnnode) { ct = nf_ct_tuplehash_to_ctrack(h); if (iter(ct, data)) @@ -1130,7 +1133,7 @@ static void nf_conntrack_cleanup_net(struct net *net) } nf_ct_free_hashtable(net->ct.hash, net->ct.hash_vmalloc, - nf_conntrack_htable_size); + net->ct.htable_size); nf_conntrack_ecache_fini(net); nf_conntrack_acct_fini(net); nf_conntrack_expect_fini(net); @@ -1190,10 +1193,12 @@ int nf_conntrack_set_hashsize(const char *val, struct kernel_param *kp) { int i, bucket, vmalloced, old_vmalloced; unsigned int hashsize, old_size; - int rnd; struct hlist_nulls_head *hash, *old_hash; struct nf_conntrack_tuple_hash *h; + if (current->nsproxy->net_ns != &init_net) + return -EOPNOTSUPP; + /* On boot, we can set this without any fancy locking. */ if (!nf_conntrack_htable_size) return param_set_uint(val, kp); @@ -1206,33 +1211,29 @@ int nf_conntrack_set_hashsize(const char *val, struct kernel_param *kp) if (!hash) return -ENOMEM; - /* We have to rehahs for the new table anyway, so we also can - * use a newrandom seed */ - get_random_bytes(&rnd, sizeof(rnd)); - /* Lookups in the old hash might happen in parallel, which means we * might get false negatives during connection lookup. New connections * created because of a false negative won't make it into the hash * though since that required taking the lock. */ spin_lock_bh(&nf_conntrack_lock); - for (i = 0; i < nf_conntrack_htable_size; i++) { + for (i = 0; i < init_net.ct.htable_size; i++) { while (!hlist_nulls_empty(&init_net.ct.hash[i])) { h = hlist_nulls_entry(init_net.ct.hash[i].first, struct nf_conntrack_tuple_hash, hnnode); hlist_nulls_del_rcu(&h->hnnode); - bucket = __hash_conntrack(&h->tuple, hashsize, rnd); + bucket = __hash_conntrack(&h->tuple, hashsize, + nf_conntrack_hash_rnd); hlist_nulls_add_head_rcu(&h->hnnode, &hash[bucket]); } } - old_size = nf_conntrack_htable_size; + old_size = init_net.ct.htable_size; old_vmalloced = init_net.ct.hash_vmalloc; old_hash = init_net.ct.hash; - nf_conntrack_htable_size = hashsize; + init_net.ct.htable_size = nf_conntrack_htable_size = hashsize; init_net.ct.hash_vmalloc = vmalloced; init_net.ct.hash = hash; - nf_conntrack_hash_rnd = rnd; spin_unlock_bh(&nf_conntrack_lock); nf_ct_free_hashtable(old_hash, old_vmalloced, old_size); @@ -1328,7 +1329,9 @@ static int nf_conntrack_init_net(struct net *net) ret = -ENOMEM; goto err_cache; } - net->ct.hash = nf_ct_alloc_hashtable(&nf_conntrack_htable_size, + + net->ct.htable_size = nf_conntrack_htable_size; + net->ct.hash = nf_ct_alloc_hashtable(&net->ct.htable_size, &net->ct.hash_vmalloc, 1); if (!net->ct.hash) { ret = -ENOMEM; @@ -1353,7 +1356,7 @@ static int nf_conntrack_init_net(struct net *net) nf_conntrack_expect_fini(net); err_expect: nf_ct_free_hashtable(net->ct.hash, net->ct.hash_vmalloc, - nf_conntrack_htable_size); + net->ct.htable_size); err_hash: kmem_cache_destroy(net->ct.nf_conntrack_cachep); err_cache: diff --git a/net/netfilter/nf_conntrack_expect.c b/net/netfilter/nf_conntrack_expect.c index 4ad7d1d809af..2f25ff610982 100644 --- a/net/netfilter/nf_conntrack_expect.c +++ b/net/netfilter/nf_conntrack_expect.c @@ -577,7 +577,7 @@ int nf_conntrack_expect_init(struct net *net) if (net_eq(net, &init_net)) { if (!nf_ct_expect_hsize) { - nf_ct_expect_hsize = nf_conntrack_htable_size / 256; + nf_ct_expect_hsize = net->ct.htable_size / 256; if (!nf_ct_expect_hsize) nf_ct_expect_hsize = 1; } diff --git a/net/netfilter/nf_conntrack_helper.c b/net/netfilter/nf_conntrack_helper.c index 65c2a7bc3afc..4b1a56bd074c 100644 --- a/net/netfilter/nf_conntrack_helper.c +++ b/net/netfilter/nf_conntrack_helper.c @@ -192,7 +192,7 @@ static void __nf_conntrack_helper_unregister(struct nf_conntrack_helper *me, /* Get rid of expecteds, set helpers to NULL. */ hlist_nulls_for_each_entry(h, nn, &net->ct.unconfirmed, hnnode) unhelp(h, me); - for (i = 0; i < nf_conntrack_htable_size; i++) { + for (i = 0; i < net->ct.htable_size; i++) { hlist_nulls_for_each_entry(h, nn, &net->ct.hash[i], hnnode) unhelp(h, me); } diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c index 42f21c01a93e..0ffe689dfe97 100644 --- a/net/netfilter/nf_conntrack_netlink.c +++ b/net/netfilter/nf_conntrack_netlink.c @@ -594,7 +594,7 @@ ctnetlink_dump_table(struct sk_buff *skb, struct netlink_callback *cb) rcu_read_lock(); last = (struct nf_conn *)cb->args[1]; - for (; cb->args[0] < nf_conntrack_htable_size; cb->args[0]++) { + for (; cb->args[0] < init_net.ct.htable_size; cb->args[0]++) { restart: hlist_nulls_for_each_entry_rcu(h, n, &init_net.ct.hash[cb->args[0]], hnnode) { diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c index 028aba667ef7..e310f1561bb2 100644 --- a/net/netfilter/nf_conntrack_standalone.c +++ b/net/netfilter/nf_conntrack_standalone.c @@ -51,7 +51,7 @@ static struct hlist_nulls_node *ct_get_first(struct seq_file *seq) struct hlist_nulls_node *n; for (st->bucket = 0; - st->bucket < nf_conntrack_htable_size; + st->bucket < net->ct.htable_size; st->bucket++) { n = rcu_dereference(net->ct.hash[st->bucket].first); if (!is_a_nulls(n)) @@ -69,7 +69,7 @@ static struct hlist_nulls_node *ct_get_next(struct seq_file *seq, head = rcu_dereference(head->next); while (is_a_nulls(head)) { if (likely(get_nulls_value(head) == st->bucket)) { - if (++st->bucket >= nf_conntrack_htable_size) + if (++st->bucket >= net->ct.htable_size) return NULL; } head = rcu_dereference(net->ct.hash[st->bucket].first); @@ -355,7 +355,7 @@ static ctl_table nf_ct_sysctl_table[] = { }, { .procname = "nf_conntrack_buckets", - .data = &nf_conntrack_htable_size, + .data = &init_net.ct.htable_size, .maxlen = sizeof(unsigned int), .mode = 0444, .proc_handler = proc_dointvec, @@ -421,6 +421,7 @@ static int nf_conntrack_standalone_init_sysctl(struct net *net) goto out_kmemdup; table[1].data = &net->ct.count; + table[2].data = &net->ct.htable_size; table[3].data = &net->ct.sysctl_checksum; table[4].data = &net->ct.sysctl_log_invalid; From 63e690caf24e8f43ba019fe1107669746b072d80 Mon Sep 17 00:00:00 2001 From: Andrea Gelmini Date: Mon, 8 Mar 2010 13:13:07 +0100 Subject: [PATCH 6/9] netfilter: include/linux/netfilter/nf_conntrack_tuple_common.h: Checkpatch cleanup include/linux/netfilter/nf_conntrack_tuple_common.h:5: ERROR: open brace '{' following enum go on the same line Signed-off-by: Andrea Gelmini Acked-by: David S. Miller Signed-off-by: Patrick McHardy --- include/linux/netfilter/nf_conntrack_tuple_common.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/include/linux/netfilter/nf_conntrack_tuple_common.h b/include/linux/netfilter/nf_conntrack_tuple_common.h index 8e145f0d61cb..2ea22b018a87 100644 --- a/include/linux/netfilter/nf_conntrack_tuple_common.h +++ b/include/linux/netfilter/nf_conntrack_tuple_common.h @@ -1,8 +1,7 @@ #ifndef _NF_CONNTRACK_TUPLE_COMMON_H #define _NF_CONNTRACK_TUPLE_COMMON_H -enum ip_conntrack_dir -{ +enum ip_conntrack_dir { IP_CT_DIR_ORIGINAL, IP_CT_DIR_REPLY, IP_CT_DIR_MAX From 0898f99a267f89a7dc72cc687955f17613a711b8 Mon Sep 17 00:00:00 2001 From: YOSHIFUJI Hideaki Date: Mon, 8 Mar 2010 13:15:59 +0100 Subject: [PATCH 7/9] netfilter: ebt_ip6: Use ipv6_masked_addr_cmp() Signed-off-by: YOSHIFUJI Hideaki Signed-off-by: Bart De Schuymer Signed-off-by: Patrick McHardy --- net/bridge/netfilter/ebt_ip6.c | 18 ++++-------------- 1 file changed, 4 insertions(+), 14 deletions(-) diff --git a/net/bridge/netfilter/ebt_ip6.c b/net/bridge/netfilter/ebt_ip6.c index bbf2534ef026..4644cc9c0579 100644 --- a/net/bridge/netfilter/ebt_ip6.c +++ b/net/bridge/netfilter/ebt_ip6.c @@ -35,8 +35,6 @@ ebt_ip6_mt(const struct sk_buff *skb, const struct xt_match_param *par) struct ipv6hdr _ip6h; const struct tcpudphdr *pptr; struct tcpudphdr _ports; - struct in6_addr tmp_addr; - int i; ih6 = skb_header_pointer(skb, 0, sizeof(_ip6h), &_ip6h); if (ih6 == NULL) @@ -44,18 +42,10 @@ ebt_ip6_mt(const struct sk_buff *skb, const struct xt_match_param *par) if (info->bitmask & EBT_IP6_TCLASS && FWINV(info->tclass != ipv6_get_dsfield(ih6), EBT_IP6_TCLASS)) return false; - for (i = 0; i < 4; i++) - tmp_addr.in6_u.u6_addr32[i] = ih6->saddr.in6_u.u6_addr32[i] & - info->smsk.in6_u.u6_addr32[i]; - if (info->bitmask & EBT_IP6_SOURCE && - FWINV((ipv6_addr_cmp(&tmp_addr, &info->saddr) != 0), - EBT_IP6_SOURCE)) - return false; - for (i = 0; i < 4; i++) - tmp_addr.in6_u.u6_addr32[i] = ih6->daddr.in6_u.u6_addr32[i] & - info->dmsk.in6_u.u6_addr32[i]; - if (info->bitmask & EBT_IP6_DEST && - FWINV((ipv6_addr_cmp(&tmp_addr, &info->daddr) != 0), EBT_IP6_DEST)) + if (FWINV(ipv6_masked_addr_cmp(&ih6->saddr, &info->smsk, + &info->saddr), EBT_IP6_SOURCE) || + FWINV(ipv6_masked_addr_cmp(&ih6->daddr, &info->dmsk, + &info->daddr), EBT_IP6_DEST)) return false; if (info->bitmask & EBT_IP6_PROTO) { uint8_t nexthdr = ih6->nexthdr; From 7b4df05537f4e6c0c3524055ece7f99b5c98cc87 Mon Sep 17 00:00:00 2001 From: YOSHIFUJI Hideaki Date: Mon, 8 Mar 2010 13:17:01 +0100 Subject: [PATCH 8/9] netfilter: remove stale declaration for ip6_masked_addrcmp() Commit f2ffd9ee... ("[NETFILTER]: Move ip6_masked_addrcmp to include/net/ipv6.h") replaced ip6_masked_addrcmp() with ipv6_masked_addr_cmp(). Function definition went away. Let's remove its declaration as well in header file. Signed-off-by: YOSHIFUJI Hideaki Signed-off-by: Patrick McHardy --- include/linux/netfilter_ipv6/ip6_tables.h | 4 ---- 1 file changed, 4 deletions(-) diff --git a/include/linux/netfilter_ipv6/ip6_tables.h b/include/linux/netfilter_ipv6/ip6_tables.h index e5ba03d783c6..18442ff19c07 100644 --- a/include/linux/netfilter_ipv6/ip6_tables.h +++ b/include/linux/netfilter_ipv6/ip6_tables.h @@ -316,10 +316,6 @@ extern int ip6t_ext_hdr(u8 nexthdr); extern int ipv6_find_hdr(const struct sk_buff *skb, unsigned int *offset, int target, unsigned short *fragoff); -extern int ip6_masked_addrcmp(const struct in6_addr *addr1, - const struct in6_addr *mask, - const struct in6_addr *addr2); - #define IP6T_ALIGN(s) XT_ALIGN(s) #ifdef CONFIG_COMPAT From 1da05f50f6a766c7611102382f85183b4db96c2d Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Mon, 15 Mar 2010 18:03:05 +0100 Subject: [PATCH 9/9] netfilter: net/netfilter/ipvs/ip_vs_ftp.c: Remove use of NIPQUAD NIPQUAD has very few uses left. Remove this use and make the code have the identical form of the only other use of "%u,%u,%u,%u,%u,%u" in net/ipv4/netfilter/nf_nat_ftp.c Signed-off-by: Joe Perches Acked-by: Simon Horman Signed-off-by: Patrick McHardy --- net/netfilter/ipvs/ip_vs_ftp.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/net/netfilter/ipvs/ip_vs_ftp.c b/net/netfilter/ipvs/ip_vs_ftp.c index 73f38ea98f25..9f6328303844 100644 --- a/net/netfilter/ipvs/ip_vs_ftp.c +++ b/net/netfilter/ipvs/ip_vs_ftp.c @@ -208,8 +208,14 @@ static int ip_vs_ftp_out(struct ip_vs_app *app, struct ip_vs_conn *cp, */ from.ip = n_cp->vaddr.ip; port = n_cp->vport; - sprintf(buf, "%u,%u,%u,%u,%u,%u", NIPQUAD(from.ip), - (ntohs(port)>>8)&255, ntohs(port)&255); + snprintf(buf, sizeof(buf), "%u,%u,%u,%u,%u,%u", + ((unsigned char *)&from.ip)[0], + ((unsigned char *)&from.ip)[1], + ((unsigned char *)&from.ip)[2], + ((unsigned char *)&from.ip)[3], + ntohs(port) >> 8, + ntohs(port) & 0xFF); + buf_len = strlen(buf); /*