soreuseport: TCP/IPv6 implementation

Motivation for soreuseport would be something like a web server
binding to port 80 running with multiple threads, where each thread
might have its own listener socket.  This could be done as an
alternative to other models: 1) have one listener thread which
dispatches completed connections to workers. 2) accept on a single
listener socket from multiple threads.  In case #1 the listener thread
can easily become the bottleneck with high connection turn-over rate.
In case #2, the proportion of connections accepted per thread tends
to be uneven under high connection load (assuming a simple event loop:
while (1) { accept(); process() }), because wakeup does not promote
fairness among the sockets.  We have seen the disproportion be as high
as a 3:1 ratio between the thread accepting the most connections and
the one accepting the fewest.  With so_reuseport the distribution is
uniform.

Signed-off-by: Tom Herbert <therbert@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
This commit is contained in:
Tom Herbert 2013-01-22 09:50:39 +00:00 committed by David S. Miller
parent ba418fa357
commit 5ba24953e9
5 changed files with 38 additions and 10 deletions

View file

@ -71,6 +71,8 @@ extern struct sock *__inet6_lookup_established(struct net *net,
extern struct sock *inet6_lookup_listener(struct net *net, extern struct sock *inet6_lookup_listener(struct net *net,
struct inet_hashinfo *hashinfo, struct inet_hashinfo *hashinfo,
const struct in6_addr *saddr,
const __be16 sport,
const struct in6_addr *daddr, const struct in6_addr *daddr,
const unsigned short hnum, const unsigned short hnum,
const int dif); const int dif);
@ -88,7 +90,8 @@ static inline struct sock *__inet6_lookup(struct net *net,
if (sk) if (sk)
return sk; return sk;
return inet6_lookup_listener(net, hashinfo, daddr, hnum, dif); return inet6_lookup_listener(net, hashinfo, saddr, sport,
daddr, hnum, dif);
} }
static inline struct sock *__inet6_lookup_skb(struct inet_hashinfo *hashinfo, static inline struct sock *__inet6_lookup_skb(struct inet_hashinfo *hashinfo,

View file

@ -152,6 +152,7 @@ nf_tproxy_get_sock_v6(struct net *net, const u8 protocol,
break; break;
case NFT_LOOKUP_LISTENER: case NFT_LOOKUP_LISTENER:
sk = inet6_lookup_listener(net, &tcp_hashinfo, sk = inet6_lookup_listener(net, &tcp_hashinfo,
saddr, sport,
daddr, ntohs(dport), daddr, ntohs(dport),
in->ifindex); in->ifindex);

View file

@ -32,6 +32,9 @@ int inet6_csk_bind_conflict(const struct sock *sk,
{ {
const struct sock *sk2; const struct sock *sk2;
const struct hlist_node *node; const struct hlist_node *node;
int reuse = sk->sk_reuse;
int reuseport = sk->sk_reuseport;
int uid = sock_i_uid((struct sock *)sk);
/* We must walk the whole port owner list in this case. -DaveM */ /* We must walk the whole port owner list in this case. -DaveM */
/* /*
@ -42,12 +45,18 @@ int inet6_csk_bind_conflict(const struct sock *sk,
if (sk != sk2 && if (sk != sk2 &&
(!sk->sk_bound_dev_if || (!sk->sk_bound_dev_if ||
!sk2->sk_bound_dev_if || !sk2->sk_bound_dev_if ||
sk->sk_bound_dev_if == sk2->sk_bound_dev_if) && sk->sk_bound_dev_if == sk2->sk_bound_dev_if)) {
(!sk->sk_reuse || !sk2->sk_reuse || if ((!reuse || !sk2->sk_reuse ||
sk2->sk_state == TCP_LISTEN) && sk2->sk_state == TCP_LISTEN) &&
ipv6_rcv_saddr_equal(sk, sk2)) (!reuseport || !sk2->sk_reuseport ||
(sk2->sk_state != TCP_TIME_WAIT &&
!uid_eq(uid,
sock_i_uid((struct sock *)sk2))))) {
if (ipv6_rcv_saddr_equal(sk, sk2))
break; break;
} }
}
}
return node != NULL; return node != NULL;
} }

View file

@ -158,25 +158,38 @@ static inline int compute_score(struct sock *sk, struct net *net,
} }
struct sock *inet6_lookup_listener(struct net *net, struct sock *inet6_lookup_listener(struct net *net,
struct inet_hashinfo *hashinfo, const struct in6_addr *daddr, struct inet_hashinfo *hashinfo, const struct in6_addr *saddr,
const __be16 sport, const struct in6_addr *daddr,
const unsigned short hnum, const int dif) const unsigned short hnum, const int dif)
{ {
struct sock *sk; struct sock *sk;
const struct hlist_nulls_node *node; const struct hlist_nulls_node *node;
struct sock *result; struct sock *result;
int score, hiscore; int score, hiscore, matches = 0, reuseport = 0;
u32 phash = 0;
unsigned int hash = inet_lhashfn(net, hnum); unsigned int hash = inet_lhashfn(net, hnum);
struct inet_listen_hashbucket *ilb = &hashinfo->listening_hash[hash]; struct inet_listen_hashbucket *ilb = &hashinfo->listening_hash[hash];
rcu_read_lock(); rcu_read_lock();
begin: begin:
result = NULL; result = NULL;
hiscore = -1; hiscore = 0;
sk_nulls_for_each(sk, node, &ilb->head) { sk_nulls_for_each(sk, node, &ilb->head) {
score = compute_score(sk, net, hnum, daddr, dif); score = compute_score(sk, net, hnum, daddr, dif);
if (score > hiscore) { if (score > hiscore) {
hiscore = score; hiscore = score;
result = sk; result = sk;
reuseport = sk->sk_reuseport;
if (reuseport) {
phash = inet6_ehashfn(net, daddr, hnum,
saddr, sport);
matches = 1;
}
} else if (score == hiscore && reuseport) {
matches++;
if (((u64)phash * matches) >> 32 == 0)
result = sk;
phash = next_pseudo_random32(phash);
} }
} }
/* /*

View file

@ -834,7 +834,8 @@ static void tcp_v6_send_reset(struct sock *sk, struct sk_buff *skb)
* no RST generated if md5 hash doesn't match. * no RST generated if md5 hash doesn't match.
*/ */
sk1 = inet6_lookup_listener(dev_net(skb_dst(skb)->dev), sk1 = inet6_lookup_listener(dev_net(skb_dst(skb)->dev),
&tcp_hashinfo, &ipv6h->daddr, &tcp_hashinfo, &ipv6h->saddr,
th->source, &ipv6h->daddr,
ntohs(th->source), inet6_iif(skb)); ntohs(th->source), inet6_iif(skb));
if (!sk1) if (!sk1)
return; return;
@ -1598,6 +1599,7 @@ static int tcp_v6_rcv(struct sk_buff *skb)
struct sock *sk2; struct sock *sk2;
sk2 = inet6_lookup_listener(dev_net(skb->dev), &tcp_hashinfo, sk2 = inet6_lookup_listener(dev_net(skb->dev), &tcp_hashinfo,
&ipv6_hdr(skb)->saddr, th->source,
&ipv6_hdr(skb)->daddr, &ipv6_hdr(skb)->daddr,
ntohs(th->dest), inet6_iif(skb)); ntohs(th->dest), inet6_iif(skb));
if (sk2 != NULL) { if (sk2 != NULL) {