soreuseport: TCP/IPv6 implementation
Motivation for soreuseport would be something like a web server binding to port 80 running with multiple threads, where each thread might have its own listener socket. This could be done as an alternative to other models: 1) have one listener thread which dispatches completed connections to workers. 2) accept on a single listener socket from multiple threads. In case #1 the listener thread can easily become the bottleneck with high connection turn-over rate. In case #2, the proportion of connections accepted per thread tends to be uneven under high connection load (assuming simple event loop: while (1) { accept(); process() }), because wakeup does not promote fairness among the sockets. We have seen the disproportion to be as high as 3:1 ratio between thread accepting most connections and the one accepting the fewest. With so_reuseport the distribution is uniform. Signed-off-by: Tom Herbert <therbert@google.com> Signed-off-by: David S. Miller <davem@davemloft.net>
This commit is contained in:
parent
ba418fa357
commit
5ba24953e9
5 changed files with 38 additions and 10 deletions
|
@ -71,6 +71,8 @@ extern struct sock *__inet6_lookup_established(struct net *net,
|
||||||
|
|
||||||
extern struct sock *inet6_lookup_listener(struct net *net,
|
extern struct sock *inet6_lookup_listener(struct net *net,
|
||||||
struct inet_hashinfo *hashinfo,
|
struct inet_hashinfo *hashinfo,
|
||||||
|
const struct in6_addr *saddr,
|
||||||
|
const __be16 sport,
|
||||||
const struct in6_addr *daddr,
|
const struct in6_addr *daddr,
|
||||||
const unsigned short hnum,
|
const unsigned short hnum,
|
||||||
const int dif);
|
const int dif);
|
||||||
|
@ -88,7 +90,8 @@ static inline struct sock *__inet6_lookup(struct net *net,
|
||||||
if (sk)
|
if (sk)
|
||||||
return sk;
|
return sk;
|
||||||
|
|
||||||
return inet6_lookup_listener(net, hashinfo, daddr, hnum, dif);
|
return inet6_lookup_listener(net, hashinfo, saddr, sport,
|
||||||
|
daddr, hnum, dif);
|
||||||
}
|
}
|
||||||
|
|
||||||
static inline struct sock *__inet6_lookup_skb(struct inet_hashinfo *hashinfo,
|
static inline struct sock *__inet6_lookup_skb(struct inet_hashinfo *hashinfo,
|
||||||
|
|
|
@ -152,6 +152,7 @@ nf_tproxy_get_sock_v6(struct net *net, const u8 protocol,
|
||||||
break;
|
break;
|
||||||
case NFT_LOOKUP_LISTENER:
|
case NFT_LOOKUP_LISTENER:
|
||||||
sk = inet6_lookup_listener(net, &tcp_hashinfo,
|
sk = inet6_lookup_listener(net, &tcp_hashinfo,
|
||||||
|
saddr, sport,
|
||||||
daddr, ntohs(dport),
|
daddr, ntohs(dport),
|
||||||
in->ifindex);
|
in->ifindex);
|
||||||
|
|
||||||
|
|
|
@ -32,6 +32,9 @@ int inet6_csk_bind_conflict(const struct sock *sk,
|
||||||
{
|
{
|
||||||
const struct sock *sk2;
|
const struct sock *sk2;
|
||||||
const struct hlist_node *node;
|
const struct hlist_node *node;
|
||||||
|
int reuse = sk->sk_reuse;
|
||||||
|
int reuseport = sk->sk_reuseport;
|
||||||
|
int uid = sock_i_uid((struct sock *)sk);
|
||||||
|
|
||||||
/* We must walk the whole port owner list in this case. -DaveM */
|
/* We must walk the whole port owner list in this case. -DaveM */
|
||||||
/*
|
/*
|
||||||
|
@ -42,11 +45,17 @@ int inet6_csk_bind_conflict(const struct sock *sk,
|
||||||
if (sk != sk2 &&
|
if (sk != sk2 &&
|
||||||
(!sk->sk_bound_dev_if ||
|
(!sk->sk_bound_dev_if ||
|
||||||
!sk2->sk_bound_dev_if ||
|
!sk2->sk_bound_dev_if ||
|
||||||
sk->sk_bound_dev_if == sk2->sk_bound_dev_if) &&
|
sk->sk_bound_dev_if == sk2->sk_bound_dev_if)) {
|
||||||
(!sk->sk_reuse || !sk2->sk_reuse ||
|
if ((!reuse || !sk2->sk_reuse ||
|
||||||
sk2->sk_state == TCP_LISTEN) &&
|
sk2->sk_state == TCP_LISTEN) &&
|
||||||
ipv6_rcv_saddr_equal(sk, sk2))
|
(!reuseport || !sk2->sk_reuseport ||
|
||||||
break;
|
(sk2->sk_state != TCP_TIME_WAIT &&
|
||||||
|
!uid_eq(uid,
|
||||||
|
sock_i_uid((struct sock *)sk2))))) {
|
||||||
|
if (ipv6_rcv_saddr_equal(sk, sk2))
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
return node != NULL;
|
return node != NULL;
|
||||||
|
|
|
@ -158,25 +158,38 @@ static inline int compute_score(struct sock *sk, struct net *net,
|
||||||
}
|
}
|
||||||
|
|
||||||
struct sock *inet6_lookup_listener(struct net *net,
|
struct sock *inet6_lookup_listener(struct net *net,
|
||||||
struct inet_hashinfo *hashinfo, const struct in6_addr *daddr,
|
struct inet_hashinfo *hashinfo, const struct in6_addr *saddr,
|
||||||
|
const __be16 sport, const struct in6_addr *daddr,
|
||||||
const unsigned short hnum, const int dif)
|
const unsigned short hnum, const int dif)
|
||||||
{
|
{
|
||||||
struct sock *sk;
|
struct sock *sk;
|
||||||
const struct hlist_nulls_node *node;
|
const struct hlist_nulls_node *node;
|
||||||
struct sock *result;
|
struct sock *result;
|
||||||
int score, hiscore;
|
int score, hiscore, matches = 0, reuseport = 0;
|
||||||
|
u32 phash = 0;
|
||||||
unsigned int hash = inet_lhashfn(net, hnum);
|
unsigned int hash = inet_lhashfn(net, hnum);
|
||||||
struct inet_listen_hashbucket *ilb = &hashinfo->listening_hash[hash];
|
struct inet_listen_hashbucket *ilb = &hashinfo->listening_hash[hash];
|
||||||
|
|
||||||
rcu_read_lock();
|
rcu_read_lock();
|
||||||
begin:
|
begin:
|
||||||
result = NULL;
|
result = NULL;
|
||||||
hiscore = -1;
|
hiscore = 0;
|
||||||
sk_nulls_for_each(sk, node, &ilb->head) {
|
sk_nulls_for_each(sk, node, &ilb->head) {
|
||||||
score = compute_score(sk, net, hnum, daddr, dif);
|
score = compute_score(sk, net, hnum, daddr, dif);
|
||||||
if (score > hiscore) {
|
if (score > hiscore) {
|
||||||
hiscore = score;
|
hiscore = score;
|
||||||
result = sk;
|
result = sk;
|
||||||
|
reuseport = sk->sk_reuseport;
|
||||||
|
if (reuseport) {
|
||||||
|
phash = inet6_ehashfn(net, daddr, hnum,
|
||||||
|
saddr, sport);
|
||||||
|
matches = 1;
|
||||||
|
}
|
||||||
|
} else if (score == hiscore && reuseport) {
|
||||||
|
matches++;
|
||||||
|
if (((u64)phash * matches) >> 32 == 0)
|
||||||
|
result = sk;
|
||||||
|
phash = next_pseudo_random32(phash);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
/*
|
/*
|
||||||
|
|
|
@ -834,7 +834,8 @@ static void tcp_v6_send_reset(struct sock *sk, struct sk_buff *skb)
|
||||||
* no RST generated if md5 hash doesn't match.
|
* no RST generated if md5 hash doesn't match.
|
||||||
*/
|
*/
|
||||||
sk1 = inet6_lookup_listener(dev_net(skb_dst(skb)->dev),
|
sk1 = inet6_lookup_listener(dev_net(skb_dst(skb)->dev),
|
||||||
&tcp_hashinfo, &ipv6h->daddr,
|
&tcp_hashinfo, &ipv6h->saddr,
|
||||||
|
th->source, &ipv6h->daddr,
|
||||||
ntohs(th->source), inet6_iif(skb));
|
ntohs(th->source), inet6_iif(skb));
|
||||||
if (!sk1)
|
if (!sk1)
|
||||||
return;
|
return;
|
||||||
|
@ -1598,6 +1599,7 @@ static int tcp_v6_rcv(struct sk_buff *skb)
|
||||||
struct sock *sk2;
|
struct sock *sk2;
|
||||||
|
|
||||||
sk2 = inet6_lookup_listener(dev_net(skb->dev), &tcp_hashinfo,
|
sk2 = inet6_lookup_listener(dev_net(skb->dev), &tcp_hashinfo,
|
||||||
|
&ipv6_hdr(skb)->saddr, th->source,
|
||||||
&ipv6_hdr(skb)->daddr,
|
&ipv6_hdr(skb)->daddr,
|
||||||
ntohs(th->dest), inet6_iif(skb));
|
ntohs(th->dest), inet6_iif(skb));
|
||||||
if (sk2 != NULL) {
|
if (sk2 != NULL) {
|
||||||
|
|
Loading…
Reference in a new issue