ipv4: tcp: get rid of ugly unicast_sock
In commit be9f4a44e7 ("ipv4: tcp: remove per net tcp_sock") I tried to address contention on a socket lock, but the solution I chose was horrible:
commit 3a7c384ffd ("ipv4: tcp: unicast_sock should not land outside of TCP stack") addressed a selinux regression.
commit 0980e56e50 ("ipv4: tcp: set unicast_sock uc_ttl to -1") took care of another regression.
commit b5ec8eeac4 ("ipv4: fix ip_send_skb()") fixed another regression.
commit 811230cd85 ("tcp: ipv4: initialize unicast_sock sk_pacing_rate") was another shot in the dark.
Really, just use a proper socket per cpu, and remove the skb_orphan() call, to re-enable flow control. This solves a serious problem with FQ packet scheduler when used in hostile environments, as we do not want to allocate a flow structure for every RST packet sent in response to a spoofed packet.
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
This commit is contained in:
parent
0d32ef8cef
commit
bdbbb8527b
4 changed files with 40 additions and 36 deletions
|
@ -181,7 +181,7 @@ static inline __u8 ip_reply_arg_flowi_flags(const struct ip_reply_arg *arg)
|
||||||
return (arg->flags & IP_REPLY_ARG_NOSRCCHECK) ? FLOWI_FLAG_ANYSRC : 0;
|
return (arg->flags & IP_REPLY_ARG_NOSRCCHECK) ? FLOWI_FLAG_ANYSRC : 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
void ip_send_unicast_reply(struct net *net, struct sk_buff *skb,
|
void ip_send_unicast_reply(struct sock *sk, struct sk_buff *skb,
|
||||||
const struct ip_options *sopt,
|
const struct ip_options *sopt,
|
||||||
__be32 daddr, __be32 saddr,
|
__be32 daddr, __be32 saddr,
|
||||||
const struct ip_reply_arg *arg,
|
const struct ip_reply_arg *arg,
|
||||||
|
|
|
@ -52,6 +52,7 @@ struct netns_ipv4 {
|
||||||
struct inet_peer_base *peers;
|
struct inet_peer_base *peers;
|
||||||
struct tcpm_hash_bucket *tcp_metrics_hash;
|
struct tcpm_hash_bucket *tcp_metrics_hash;
|
||||||
unsigned int tcp_metrics_hash_log;
|
unsigned int tcp_metrics_hash_log;
|
||||||
|
struct sock * __percpu *tcp_sk;
|
||||||
struct netns_frags frags;
|
struct netns_frags frags;
|
||||||
#ifdef CONFIG_NETFILTER
|
#ifdef CONFIG_NETFILTER
|
||||||
struct xt_table *iptable_filter;
|
struct xt_table *iptable_filter;
|
||||||
|
|
|
@ -1506,24 +1506,8 @@ static int ip_reply_glue_bits(void *dptr, char *to, int offset,
|
||||||
/*
|
/*
|
||||||
* Generic function to send a packet as reply to another packet.
|
* Generic function to send a packet as reply to another packet.
|
||||||
* Used to send some TCP resets/acks so far.
|
* Used to send some TCP resets/acks so far.
|
||||||
*
|
|
||||||
* Use a fake percpu inet socket to avoid false sharing and contention.
|
|
||||||
*/
|
*/
|
||||||
static DEFINE_PER_CPU(struct inet_sock, unicast_sock) = {
|
void ip_send_unicast_reply(struct sock *sk, struct sk_buff *skb,
|
||||||
.sk = {
|
|
||||||
.__sk_common = {
|
|
||||||
.skc_refcnt = ATOMIC_INIT(1),
|
|
||||||
},
|
|
||||||
.sk_wmem_alloc = ATOMIC_INIT(1),
|
|
||||||
.sk_allocation = GFP_ATOMIC,
|
|
||||||
.sk_flags = (1UL << SOCK_USE_WRITE_QUEUE),
|
|
||||||
.sk_pacing_rate = ~0U,
|
|
||||||
},
|
|
||||||
.pmtudisc = IP_PMTUDISC_WANT,
|
|
||||||
.uc_ttl = -1,
|
|
||||||
};
|
|
||||||
|
|
||||||
void ip_send_unicast_reply(struct net *net, struct sk_buff *skb,
|
|
||||||
const struct ip_options *sopt,
|
const struct ip_options *sopt,
|
||||||
__be32 daddr, __be32 saddr,
|
__be32 daddr, __be32 saddr,
|
||||||
const struct ip_reply_arg *arg,
|
const struct ip_reply_arg *arg,
|
||||||
|
@ -1533,9 +1517,8 @@ void ip_send_unicast_reply(struct net *net, struct sk_buff *skb,
|
||||||
struct ipcm_cookie ipc;
|
struct ipcm_cookie ipc;
|
||||||
struct flowi4 fl4;
|
struct flowi4 fl4;
|
||||||
struct rtable *rt = skb_rtable(skb);
|
struct rtable *rt = skb_rtable(skb);
|
||||||
|
struct net *net = sock_net(sk);
|
||||||
struct sk_buff *nskb;
|
struct sk_buff *nskb;
|
||||||
struct sock *sk;
|
|
||||||
struct inet_sock *inet;
|
|
||||||
int err;
|
int err;
|
||||||
|
|
||||||
if (__ip_options_echo(&replyopts.opt.opt, skb, sopt))
|
if (__ip_options_echo(&replyopts.opt.opt, skb, sopt))
|
||||||
|
@ -1566,15 +1549,11 @@ void ip_send_unicast_reply(struct net *net, struct sk_buff *skb,
|
||||||
if (IS_ERR(rt))
|
if (IS_ERR(rt))
|
||||||
return;
|
return;
|
||||||
|
|
||||||
inet = &get_cpu_var(unicast_sock);
|
inet_sk(sk)->tos = arg->tos;
|
||||||
|
|
||||||
inet->tos = arg->tos;
|
|
||||||
sk = &inet->sk;
|
|
||||||
sk->sk_priority = skb->priority;
|
sk->sk_priority = skb->priority;
|
||||||
sk->sk_protocol = ip_hdr(skb)->protocol;
|
sk->sk_protocol = ip_hdr(skb)->protocol;
|
||||||
sk->sk_bound_dev_if = arg->bound_dev_if;
|
sk->sk_bound_dev_if = arg->bound_dev_if;
|
||||||
sock_net_set(sk, net);
|
|
||||||
__skb_queue_head_init(&sk->sk_write_queue);
|
|
||||||
sk->sk_sndbuf = sysctl_wmem_default;
|
sk->sk_sndbuf = sysctl_wmem_default;
|
||||||
err = ip_append_data(sk, &fl4, ip_reply_glue_bits, arg->iov->iov_base,
|
err = ip_append_data(sk, &fl4, ip_reply_glue_bits, arg->iov->iov_base,
|
||||||
len, 0, &ipc, &rt, MSG_DONTWAIT);
|
len, 0, &ipc, &rt, MSG_DONTWAIT);
|
||||||
|
@ -1590,13 +1569,10 @@ void ip_send_unicast_reply(struct net *net, struct sk_buff *skb,
|
||||||
arg->csumoffset) = csum_fold(csum_add(nskb->csum,
|
arg->csumoffset) = csum_fold(csum_add(nskb->csum,
|
||||||
arg->csum));
|
arg->csum));
|
||||||
nskb->ip_summed = CHECKSUM_NONE;
|
nskb->ip_summed = CHECKSUM_NONE;
|
||||||
skb_orphan(nskb);
|
|
||||||
skb_set_queue_mapping(nskb, skb_get_queue_mapping(skb));
|
skb_set_queue_mapping(nskb, skb_get_queue_mapping(skb));
|
||||||
ip_push_pending_frames(sk, &fl4);
|
ip_push_pending_frames(sk, &fl4);
|
||||||
}
|
}
|
||||||
out:
|
out:
|
||||||
put_cpu_var(unicast_sock);
|
|
||||||
|
|
||||||
ip_rt_put(rt);
|
ip_rt_put(rt);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -683,7 +683,8 @@ static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb)
|
||||||
arg.bound_dev_if = sk->sk_bound_dev_if;
|
arg.bound_dev_if = sk->sk_bound_dev_if;
|
||||||
|
|
||||||
arg.tos = ip_hdr(skb)->tos;
|
arg.tos = ip_hdr(skb)->tos;
|
||||||
ip_send_unicast_reply(net, skb, &TCP_SKB_CB(skb)->header.h4.opt,
|
ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk),
|
||||||
|
skb, &TCP_SKB_CB(skb)->header.h4.opt,
|
||||||
ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
|
ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
|
||||||
&arg, arg.iov[0].iov_len);
|
&arg, arg.iov[0].iov_len);
|
||||||
|
|
||||||
|
@ -767,7 +768,8 @@ static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack,
|
||||||
if (oif)
|
if (oif)
|
||||||
arg.bound_dev_if = oif;
|
arg.bound_dev_if = oif;
|
||||||
arg.tos = tos;
|
arg.tos = tos;
|
||||||
ip_send_unicast_reply(net, skb, &TCP_SKB_CB(skb)->header.h4.opt,
|
ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk),
|
||||||
|
skb, &TCP_SKB_CB(skb)->header.h4.opt,
|
||||||
ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
|
ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
|
||||||
&arg, arg.iov[0].iov_len);
|
&arg, arg.iov[0].iov_len);
|
||||||
|
|
||||||
|
@ -2428,14 +2430,39 @@ struct proto tcp_prot = {
|
||||||
};
|
};
|
||||||
EXPORT_SYMBOL(tcp_prot);
|
EXPORT_SYMBOL(tcp_prot);
|
||||||
|
|
||||||
static int __net_init tcp_sk_init(struct net *net)
|
|
||||||
{
|
|
||||||
net->ipv4.sysctl_tcp_ecn = 2;
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
static void __net_exit tcp_sk_exit(struct net *net)
|
static void __net_exit tcp_sk_exit(struct net *net)
|
||||||
{
|
{
|
||||||
|
int cpu;
|
||||||
|
|
||||||
|
for_each_possible_cpu(cpu)
|
||||||
|
inet_ctl_sock_destroy(*per_cpu_ptr(net->ipv4.tcp_sk, cpu));
|
||||||
|
free_percpu(net->ipv4.tcp_sk);
|
||||||
|
}
|
||||||
|
|
||||||
|
static int __net_init tcp_sk_init(struct net *net)
|
||||||
|
{
|
||||||
|
int res, cpu;
|
||||||
|
|
||||||
|
net->ipv4.tcp_sk = alloc_percpu(struct sock *);
|
||||||
|
if (!net->ipv4.tcp_sk)
|
||||||
|
return -ENOMEM;
|
||||||
|
|
||||||
|
for_each_possible_cpu(cpu) {
|
||||||
|
struct sock *sk;
|
||||||
|
|
||||||
|
res = inet_ctl_sock_create(&sk, PF_INET, SOCK_RAW,
|
||||||
|
IPPROTO_TCP, net);
|
||||||
|
if (res)
|
||||||
|
goto fail;
|
||||||
|
*per_cpu_ptr(net->ipv4.tcp_sk, cpu) = sk;
|
||||||
|
}
|
||||||
|
net->ipv4.sysctl_tcp_ecn = 2;
|
||||||
|
return 0;
|
||||||
|
|
||||||
|
fail:
|
||||||
|
tcp_sk_exit(net);
|
||||||
|
|
||||||
|
return res;
|
||||||
}
|
}
|
||||||
|
|
||||||
static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
|
static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
|
||||||
|
|
Loading…
Reference in a new issue