bpf: sockmap, add sock close() hook to remove socks
The selftests test_maps program was leaving dangling BPF sockmap
programs around because not all psock elements were removed from
the map. The elements in turn hold a reference on the BPF program
they are attached to causing BPF programs to stay open even after
test_maps has completed.
The original intent was that sk_state_change() would be called
when TCP socks went through TCP_CLOSE state. However, because
socks may be in SOCK_DEAD state or the sock may be a listening
socket the event is not always triggered.
To resolve this use the ULP infrastructure and register our own
proto close() handler. This fixes the above case.
Fixes: 174a79ff95
("bpf: sockmap with sk redirect support")
Reported-by: Prashant Bhole <bhole_prashant_q7@lab.ntt.co.jp>
Signed-off-by: John Fastabend <john.fastabend@gmail.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
This commit is contained in:
parent
b11a632c44
commit
1aa12bdf1b
2 changed files with 103 additions and 67 deletions
|
@ -1985,6 +1985,7 @@ enum hrtimer_restart tcp_pace_kick(struct hrtimer *timer);
|
||||||
|
|
||||||
enum {
|
enum {
|
||||||
TCP_ULP_TLS,
|
TCP_ULP_TLS,
|
||||||
|
TCP_ULP_BPF,
|
||||||
};
|
};
|
||||||
|
|
||||||
struct tcp_ulp_ops {
|
struct tcp_ulp_ops {
|
||||||
|
@ -2003,6 +2004,7 @@ struct tcp_ulp_ops {
|
||||||
int tcp_register_ulp(struct tcp_ulp_ops *type);
|
int tcp_register_ulp(struct tcp_ulp_ops *type);
|
||||||
void tcp_unregister_ulp(struct tcp_ulp_ops *type);
|
void tcp_unregister_ulp(struct tcp_ulp_ops *type);
|
||||||
int tcp_set_ulp(struct sock *sk, const char *name);
|
int tcp_set_ulp(struct sock *sk, const char *name);
|
||||||
|
int tcp_set_ulp_id(struct sock *sk, const int ulp);
|
||||||
void tcp_get_available_ulp(char *buf, size_t len);
|
void tcp_get_available_ulp(char *buf, size_t len);
|
||||||
void tcp_cleanup_ulp(struct sock *sk);
|
void tcp_cleanup_ulp(struct sock *sk);
|
||||||
|
|
||||||
|
|
|
@ -86,9 +86,10 @@ struct smap_psock {
|
||||||
struct work_struct tx_work;
|
struct work_struct tx_work;
|
||||||
struct work_struct gc_work;
|
struct work_struct gc_work;
|
||||||
|
|
||||||
|
struct proto *sk_proto;
|
||||||
|
void (*save_close)(struct sock *sk, long timeout);
|
||||||
void (*save_data_ready)(struct sock *sk);
|
void (*save_data_ready)(struct sock *sk);
|
||||||
void (*save_write_space)(struct sock *sk);
|
void (*save_write_space)(struct sock *sk);
|
||||||
void (*save_state_change)(struct sock *sk);
|
|
||||||
};
|
};
|
||||||
|
|
||||||
static inline struct smap_psock *smap_psock_sk(const struct sock *sk)
|
static inline struct smap_psock *smap_psock_sk(const struct sock *sk)
|
||||||
|
@ -96,12 +97,102 @@ static inline struct smap_psock *smap_psock_sk(const struct sock *sk)
|
||||||
return rcu_dereference_sk_user_data(sk);
|
return rcu_dereference_sk_user_data(sk);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static struct proto tcp_bpf_proto;
|
||||||
|
static int bpf_tcp_init(struct sock *sk)
|
||||||
|
{
|
||||||
|
struct smap_psock *psock;
|
||||||
|
|
||||||
|
rcu_read_lock();
|
||||||
|
psock = smap_psock_sk(sk);
|
||||||
|
if (unlikely(!psock)) {
|
||||||
|
rcu_read_unlock();
|
||||||
|
return -EINVAL;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (unlikely(psock->sk_proto)) {
|
||||||
|
rcu_read_unlock();
|
||||||
|
return -EBUSY;
|
||||||
|
}
|
||||||
|
|
||||||
|
psock->save_close = sk->sk_prot->close;
|
||||||
|
psock->sk_proto = sk->sk_prot;
|
||||||
|
sk->sk_prot = &tcp_bpf_proto;
|
||||||
|
rcu_read_unlock();
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
static void bpf_tcp_release(struct sock *sk)
|
||||||
|
{
|
||||||
|
struct smap_psock *psock;
|
||||||
|
|
||||||
|
rcu_read_lock();
|
||||||
|
psock = smap_psock_sk(sk);
|
||||||
|
|
||||||
|
if (likely(psock)) {
|
||||||
|
sk->sk_prot = psock->sk_proto;
|
||||||
|
psock->sk_proto = NULL;
|
||||||
|
}
|
||||||
|
rcu_read_unlock();
|
||||||
|
}
|
||||||
|
|
||||||
|
static void smap_release_sock(struct smap_psock *psock, struct sock *sock);
|
||||||
|
|
||||||
|
static void bpf_tcp_close(struct sock *sk, long timeout)
|
||||||
|
{
|
||||||
|
void (*close_fun)(struct sock *sk, long timeout);
|
||||||
|
struct smap_psock_map_entry *e, *tmp;
|
||||||
|
struct smap_psock *psock;
|
||||||
|
struct sock *osk;
|
||||||
|
|
||||||
|
rcu_read_lock();
|
||||||
|
psock = smap_psock_sk(sk);
|
||||||
|
if (unlikely(!psock)) {
|
||||||
|
rcu_read_unlock();
|
||||||
|
return sk->sk_prot->close(sk, timeout);
|
||||||
|
}
|
||||||
|
|
||||||
|
/* The psock may be destroyed anytime after exiting the RCU critial
|
||||||
|
* section so by the time we use close_fun the psock may no longer
|
||||||
|
* be valid. However, bpf_tcp_close is called with the sock lock
|
||||||
|
* held so the close hook and sk are still valid.
|
||||||
|
*/
|
||||||
|
close_fun = psock->save_close;
|
||||||
|
|
||||||
|
write_lock_bh(&sk->sk_callback_lock);
|
||||||
|
list_for_each_entry_safe(e, tmp, &psock->maps, list) {
|
||||||
|
osk = cmpxchg(e->entry, sk, NULL);
|
||||||
|
if (osk == sk) {
|
||||||
|
list_del(&e->list);
|
||||||
|
smap_release_sock(psock, sk);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
write_unlock_bh(&sk->sk_callback_lock);
|
||||||
|
rcu_read_unlock();
|
||||||
|
close_fun(sk, timeout);
|
||||||
|
}
|
||||||
|
|
||||||
enum __sk_action {
|
enum __sk_action {
|
||||||
__SK_DROP = 0,
|
__SK_DROP = 0,
|
||||||
__SK_PASS,
|
__SK_PASS,
|
||||||
__SK_REDIRECT,
|
__SK_REDIRECT,
|
||||||
};
|
};
|
||||||
|
|
||||||
|
static struct tcp_ulp_ops bpf_tcp_ulp_ops __read_mostly = {
|
||||||
|
.name = "bpf_tcp",
|
||||||
|
.uid = TCP_ULP_BPF,
|
||||||
|
.user_visible = false,
|
||||||
|
.owner = NULL,
|
||||||
|
.init = bpf_tcp_init,
|
||||||
|
.release = bpf_tcp_release,
|
||||||
|
};
|
||||||
|
|
||||||
|
static int bpf_tcp_ulp_register(void)
|
||||||
|
{
|
||||||
|
tcp_bpf_proto = tcp_prot;
|
||||||
|
tcp_bpf_proto.close = bpf_tcp_close;
|
||||||
|
return tcp_register_ulp(&bpf_tcp_ulp_ops);
|
||||||
|
}
|
||||||
|
|
||||||
static int smap_verdict_func(struct smap_psock *psock, struct sk_buff *skb)
|
static int smap_verdict_func(struct smap_psock *psock, struct sk_buff *skb)
|
||||||
{
|
{
|
||||||
struct bpf_prog *prog = READ_ONCE(psock->bpf_verdict);
|
struct bpf_prog *prog = READ_ONCE(psock->bpf_verdict);
|
||||||
|
@ -166,68 +257,6 @@ static void smap_report_sk_error(struct smap_psock *psock, int err)
|
||||||
sk->sk_error_report(sk);
|
sk->sk_error_report(sk);
|
||||||
}
|
}
|
||||||
|
|
||||||
static void smap_release_sock(struct smap_psock *psock, struct sock *sock);
|
|
||||||
|
|
||||||
/* Called with lock_sock(sk) held */
|
|
||||||
static void smap_state_change(struct sock *sk)
|
|
||||||
{
|
|
||||||
struct smap_psock_map_entry *e, *tmp;
|
|
||||||
struct smap_psock *psock;
|
|
||||||
struct socket_wq *wq;
|
|
||||||
struct sock *osk;
|
|
||||||
|
|
||||||
rcu_read_lock();
|
|
||||||
|
|
||||||
/* Allowing transitions into an established syn_recv states allows
|
|
||||||
* for early binding sockets to a smap object before the connection
|
|
||||||
* is established.
|
|
||||||
*/
|
|
||||||
switch (sk->sk_state) {
|
|
||||||
case TCP_SYN_SENT:
|
|
||||||
case TCP_SYN_RECV:
|
|
||||||
case TCP_ESTABLISHED:
|
|
||||||
break;
|
|
||||||
case TCP_CLOSE_WAIT:
|
|
||||||
case TCP_CLOSING:
|
|
||||||
case TCP_LAST_ACK:
|
|
||||||
case TCP_FIN_WAIT1:
|
|
||||||
case TCP_FIN_WAIT2:
|
|
||||||
case TCP_LISTEN:
|
|
||||||
break;
|
|
||||||
case TCP_CLOSE:
|
|
||||||
/* Only release if the map entry is in fact the sock in
|
|
||||||
* question. There is a case where the operator deletes
|
|
||||||
* the sock from the map, but the TCP sock is closed before
|
|
||||||
* the psock is detached. Use cmpxchg to verify correct
|
|
||||||
* sock is removed.
|
|
||||||
*/
|
|
||||||
psock = smap_psock_sk(sk);
|
|
||||||
if (unlikely(!psock))
|
|
||||||
break;
|
|
||||||
write_lock_bh(&sk->sk_callback_lock);
|
|
||||||
list_for_each_entry_safe(e, tmp, &psock->maps, list) {
|
|
||||||
osk = cmpxchg(e->entry, sk, NULL);
|
|
||||||
if (osk == sk) {
|
|
||||||
list_del(&e->list);
|
|
||||||
smap_release_sock(psock, sk);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
write_unlock_bh(&sk->sk_callback_lock);
|
|
||||||
break;
|
|
||||||
default:
|
|
||||||
psock = smap_psock_sk(sk);
|
|
||||||
if (unlikely(!psock))
|
|
||||||
break;
|
|
||||||
smap_report_sk_error(psock, EPIPE);
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
|
|
||||||
wq = rcu_dereference(sk->sk_wq);
|
|
||||||
if (skwq_has_sleeper(wq))
|
|
||||||
wake_up_interruptible_all(&wq->wait);
|
|
||||||
rcu_read_unlock();
|
|
||||||
}
|
|
||||||
|
|
||||||
static void smap_read_sock_strparser(struct strparser *strp,
|
static void smap_read_sock_strparser(struct strparser *strp,
|
||||||
struct sk_buff *skb)
|
struct sk_buff *skb)
|
||||||
{
|
{
|
||||||
|
@ -322,10 +351,8 @@ static void smap_stop_sock(struct smap_psock *psock, struct sock *sk)
|
||||||
return;
|
return;
|
||||||
sk->sk_data_ready = psock->save_data_ready;
|
sk->sk_data_ready = psock->save_data_ready;
|
||||||
sk->sk_write_space = psock->save_write_space;
|
sk->sk_write_space = psock->save_write_space;
|
||||||
sk->sk_state_change = psock->save_state_change;
|
|
||||||
psock->save_data_ready = NULL;
|
psock->save_data_ready = NULL;
|
||||||
psock->save_write_space = NULL;
|
psock->save_write_space = NULL;
|
||||||
psock->save_state_change = NULL;
|
|
||||||
strp_stop(&psock->strp);
|
strp_stop(&psock->strp);
|
||||||
psock->strp_enabled = false;
|
psock->strp_enabled = false;
|
||||||
}
|
}
|
||||||
|
@ -350,6 +377,7 @@ static void smap_release_sock(struct smap_psock *psock, struct sock *sock)
|
||||||
if (psock->refcnt)
|
if (psock->refcnt)
|
||||||
return;
|
return;
|
||||||
|
|
||||||
|
tcp_cleanup_ulp(sock);
|
||||||
smap_stop_sock(psock, sock);
|
smap_stop_sock(psock, sock);
|
||||||
clear_bit(SMAP_TX_RUNNING, &psock->state);
|
clear_bit(SMAP_TX_RUNNING, &psock->state);
|
||||||
rcu_assign_sk_user_data(sock, NULL);
|
rcu_assign_sk_user_data(sock, NULL);
|
||||||
|
@ -427,10 +455,8 @@ static void smap_start_sock(struct smap_psock *psock, struct sock *sk)
|
||||||
return;
|
return;
|
||||||
psock->save_data_ready = sk->sk_data_ready;
|
psock->save_data_ready = sk->sk_data_ready;
|
||||||
psock->save_write_space = sk->sk_write_space;
|
psock->save_write_space = sk->sk_write_space;
|
||||||
psock->save_state_change = sk->sk_state_change;
|
|
||||||
sk->sk_data_ready = smap_data_ready;
|
sk->sk_data_ready = smap_data_ready;
|
||||||
sk->sk_write_space = smap_write_space;
|
sk->sk_write_space = smap_write_space;
|
||||||
sk->sk_state_change = smap_state_change;
|
|
||||||
psock->strp_enabled = true;
|
psock->strp_enabled = true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -509,6 +535,10 @@ static struct bpf_map *sock_map_alloc(union bpf_attr *attr)
|
||||||
if (attr->value_size > KMALLOC_MAX_SIZE)
|
if (attr->value_size > KMALLOC_MAX_SIZE)
|
||||||
return ERR_PTR(-E2BIG);
|
return ERR_PTR(-E2BIG);
|
||||||
|
|
||||||
|
err = bpf_tcp_ulp_register();
|
||||||
|
if (err && err != -EEXIST)
|
||||||
|
return ERR_PTR(err);
|
||||||
|
|
||||||
stab = kzalloc(sizeof(*stab), GFP_USER);
|
stab = kzalloc(sizeof(*stab), GFP_USER);
|
||||||
if (!stab)
|
if (!stab)
|
||||||
return ERR_PTR(-ENOMEM);
|
return ERR_PTR(-ENOMEM);
|
||||||
|
@ -754,6 +784,10 @@ static int sock_map_ctx_update_elem(struct bpf_sock_ops_kern *skops,
|
||||||
goto out_progs;
|
goto out_progs;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
err = tcp_set_ulp_id(sock, TCP_ULP_BPF);
|
||||||
|
if (err)
|
||||||
|
goto out_progs;
|
||||||
|
|
||||||
set_bit(SMAP_TX_RUNNING, &psock->state);
|
set_bit(SMAP_TX_RUNNING, &psock->state);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
Loading…
Reference in a new issue