From 7ec318feeed10a64c0359ec4d10889cb4defa39a Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 7 Nov 2017 15:15:04 -0800 Subject: [PATCH] tcp: gso: avoid refcount_t warning from tcp_gso_segment() When a GSO skb of truesize O is segmented into 2 new skbs of truesize N1 and N2, we want to transfer socket ownership to the new fresh skbs. In order to avoid expensive atomic operations on a cache line subject to cache bouncing, we replace the sequence : refcount_add(N1, &sk->sk_wmem_alloc); refcount_add(N2, &sk->sk_wmem_alloc); // repeated by number of segments refcount_sub(O, &sk->sk_wmem_alloc); by a single refcount_add(sum_of(N) - O, &sk->sk_wmem_alloc); Problem is : In some pathological cases, sum(N) - O might be a negative number, and syzkaller bot was apparently able to trigger this trace [1] atomic_t was ok with this construct, but we need to take care of the negative delta with refcount_t [1] refcount_t: saturated; leaking memory. ------------[ cut here ]------------ WARNING: CPU: 0 PID: 8404 at lib/refcount.c:77 refcount_add_not_zero+0x198/0x200 lib/refcount.c:77 Kernel panic - not syncing: panic_on_warn set ... CPU: 0 PID: 8404 Comm: syz-executor2 Not tainted 4.14.0-rc5-mm1+ #20 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Call Trace: __dump_stack lib/dump_stack.c:16 [inline] dump_stack+0x194/0x257 lib/dump_stack.c:52 panic+0x1e4/0x41c kernel/panic.c:183 __warn+0x1c4/0x1e0 kernel/panic.c:546 report_bug+0x211/0x2d0 lib/bug.c:183 fixup_bug+0x40/0x90 arch/x86/kernel/traps.c:177 do_trap_no_signal arch/x86/kernel/traps.c:211 [inline] do_trap+0x260/0x390 arch/x86/kernel/traps.c:260 do_error_trap+0x120/0x390 arch/x86/kernel/traps.c:297 do_invalid_op+0x1b/0x20 arch/x86/kernel/traps.c:310 invalid_op+0x18/0x20 arch/x86/entry/entry_64.S:905 RIP: 0010:refcount_add_not_zero+0x198/0x200 lib/refcount.c:77 RSP: 0018:ffff8801c606e3a0 EFLAGS: 00010282 RAX: 0000000000000026 RBX: 0000000000001401 RCX: 0000000000000000 RDX: 0000000000000026 RSI: ffffc900036fc000 RDI: ffffed0038c0dc68 RBP: ffff8801c606e430 R08: 0000000000000001 R09: 0000000000000000 R10: ffff8801d97f5eba R11: 0000000000000000 R12: ffff8801d5acf73c R13: 1ffff10038c0dc75 R14: 00000000ffffffff R15: 00000000fffff72f refcount_add+0x1b/0x60 lib/refcount.c:101 tcp_gso_segment+0x10d0/0x16b0 net/ipv4/tcp_offload.c:155 tcp4_gso_segment+0xd4/0x310 net/ipv4/tcp_offload.c:51 inet_gso_segment+0x60c/0x11c0 net/ipv4/af_inet.c:1271 skb_mac_gso_segment+0x33f/0x660 net/core/dev.c:2749 __skb_gso_segment+0x35f/0x7f0 net/core/dev.c:2821 skb_gso_segment include/linux/netdevice.h:3971 [inline] validate_xmit_skb+0x4ba/0xb20 net/core/dev.c:3074 __dev_queue_xmit+0xe49/0x2070 net/core/dev.c:3497 dev_queue_xmit+0x17/0x20 net/core/dev.c:3538 neigh_hh_output include/net/neighbour.h:471 [inline] neigh_output include/net/neighbour.h:479 [inline] ip_finish_output2+0xece/0x1460 net/ipv4/ip_output.c:229 ip_finish_output+0x85e/0xd10 net/ipv4/ip_output.c:317 NF_HOOK_COND include/linux/netfilter.h:238 [inline] ip_output+0x1cc/0x860 net/ipv4/ip_output.c:405 dst_output include/net/dst.h:459 [inline] ip_local_out+0x95/0x160 net/ipv4/ip_output.c:124 ip_queue_xmit+0x8c6/0x18e0 net/ipv4/ip_output.c:504 tcp_transmit_skb+0x1ab7/0x3840 net/ipv4/tcp_output.c:1137 tcp_write_xmit+0x663/0x4de0 net/ipv4/tcp_output.c:2341 __tcp_push_pending_frames+0xa0/0x250 net/ipv4/tcp_output.c:2513 tcp_push_pending_frames include/net/tcp.h:1722 [inline] tcp_data_snd_check net/ipv4/tcp_input.c:5050 [inline] tcp_rcv_established+0x8c7/0x18a0 net/ipv4/tcp_input.c:5497 tcp_v4_do_rcv+0x2ab/0x7d0 net/ipv4/tcp_ipv4.c:1460 sk_backlog_rcv include/net/sock.h:909 [inline] __release_sock+0x124/0x360 net/core/sock.c:2264 release_sock+0xa4/0x2a0 net/core/sock.c:2776 tcp_sendmsg+0x3a/0x50 net/ipv4/tcp.c:1462 inet_sendmsg+0x11f/0x5e0 net/ipv4/af_inet.c:763 sock_sendmsg_nosec net/socket.c:632 [inline] sock_sendmsg+0xca/0x110 net/socket.c:642 ___sys_sendmsg+0x31c/0x890 net/socket.c:2048 __sys_sendmmsg+0x1e6/0x5f0 net/socket.c:2138 Fixes: 14afee4b6092 ("net: convert sock.sk_wmem_alloc from atomic_t to refcount_t") Signed-off-by: Eric Dumazet Reported-by: syzbot Signed-off-by: David S. Miller --- net/ipv4/tcp_offload.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/net/ipv4/tcp_offload.c b/net/ipv4/tcp_offload.c index 11f69bbf9307..b6a2aa1dcf56 100644 --- a/net/ipv4/tcp_offload.c +++ b/net/ipv4/tcp_offload.c @@ -149,11 +149,19 @@ struct sk_buff *tcp_gso_segment(struct sk_buff *skb, * is freed by GSO engine */ if (copy_destructor) { + int delta; + swap(gso_skb->sk, skb->sk); swap(gso_skb->destructor, skb->destructor); sum_truesize += skb->truesize; - refcount_add(sum_truesize - gso_skb->truesize, - &skb->sk->sk_wmem_alloc); + delta = sum_truesize - gso_skb->truesize; + /* In some pathological cases, delta can be negative. + * We need to either use refcount_add() or refcount_sub_and_test() + */ + if (likely(delta >= 0)) + refcount_add(delta, &skb->sk->sk_wmem_alloc); + else + WARN_ON_ONCE(refcount_sub_and_test(-delta, &skb->sk->sk_wmem_alloc)); } delta = htonl(oldlen + (skb_tail_pointer(skb) -