From 1f4f554a72be0d8c164c2f5bc6ba939a1c624fb4 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Sat, 9 Sep 2017 11:58:03 +0300 Subject: [PATCH 01/43] net: qualcomm: rmnet: Fix a double free There is a typo here so we accidentally free "skb" instead of "skbn". It leads to a double free and a leak. After discussing with Subash, it's better to just move the check before the allocation and avoid the need to free. Fixes: ceed73a2cf4a ("drivers: net: ethernet: qualcomm: rmnet: Initial implementation") Signed-off-by: Dan Carpenter Acked-by: Subash Abhinov Kasiviswanathan Signed-off-by: David S. Miller --- drivers/net/ethernet/qualcomm/rmnet/rmnet_map_data.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/drivers/net/ethernet/qualcomm/rmnet/rmnet_map_data.c b/drivers/net/ethernet/qualcomm/rmnet/rmnet_map_data.c index 557c9bf1a469..86b8c758f94e 100644 --- a/drivers/net/ethernet/qualcomm/rmnet/rmnet_map_data.c +++ b/drivers/net/ethernet/qualcomm/rmnet/rmnet_map_data.c @@ -84,6 +84,10 @@ struct sk_buff *rmnet_map_deaggregate(struct sk_buff *skb) if (((int)skb->len - (int)packet_len) < 0) return NULL; + /* Some hardware can send us empty frames. Catch them */ + if (ntohs(maph->pkt_len) == 0) + return NULL; + skbn = alloc_skb(packet_len + RMNET_MAP_DEAGGR_SPACING, GFP_ATOMIC); if (!skbn) return NULL; @@ -94,11 +98,5 @@ struct sk_buff *rmnet_map_deaggregate(struct sk_buff *skb) memcpy(skbn->data, skb->data, packet_len); skb_pull(skb, packet_len); - /* Some hardware can send us empty frames. Catch them */ - if (ntohs(maph->pkt_len) == 0) { - kfree_skb(skb); - return NULL; - } - return skbn; } From 4400081b631af69abc63cea3352680e3d85e0c39 Mon Sep 17 00:00:00 2001 From: Arkadi Sharshevsky Date: Mon, 11 Sep 2017 09:42:26 +0200 Subject: [PATCH 02/43] mlxsw: spectrum: Fix EEPROM access in case of SFP/SFP+ The current code does not handle correctly the access to the upper page in case of SFP/SFP+ EEPROM. 
In that case the offset should be local and the I2C address should be changed. Fixes: 2ea109039cd3 ("mlxsw: spectrum: Add support for access cable info via ethtool") Reported-by: Florian Klink Signed-off-by: Arkadi Sharshevsky Reviewed-by: Ido Schimmel Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- .../net/ethernet/mellanox/mlxsw/spectrum.c | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c index ed7cd6c48019..e0804599fcae 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c @@ -2545,7 +2545,9 @@ static int mlxsw_sp_flash_device(struct net_device *dev, return err; } -#define MLXSW_SP_QSFP_I2C_ADDR 0x50 +#define MLXSW_SP_I2C_ADDR_LOW 0x50 +#define MLXSW_SP_I2C_ADDR_HIGH 0x51 +#define MLXSW_SP_EEPROM_PAGE_LENGTH 256 static int mlxsw_sp_query_module_eeprom(struct mlxsw_sp_port *mlxsw_sp_port, u16 offset, u16 size, void *data, @@ -2554,12 +2556,25 @@ static int mlxsw_sp_query_module_eeprom(struct mlxsw_sp_port *mlxsw_sp_port, struct mlxsw_sp *mlxsw_sp = mlxsw_sp_port->mlxsw_sp; char eeprom_tmp[MLXSW_SP_REG_MCIA_EEPROM_SIZE]; char mcia_pl[MLXSW_REG_MCIA_LEN]; + u16 i2c_addr; int status; int err; size = min_t(u16, size, MLXSW_SP_REG_MCIA_EEPROM_SIZE); + + if (offset < MLXSW_SP_EEPROM_PAGE_LENGTH && + offset + size > MLXSW_SP_EEPROM_PAGE_LENGTH) + /* Cross pages read, read until offset 256 in low page */ + size = MLXSW_SP_EEPROM_PAGE_LENGTH - offset; + + i2c_addr = MLXSW_SP_I2C_ADDR_LOW; + if (offset >= MLXSW_SP_EEPROM_PAGE_LENGTH) { + i2c_addr = MLXSW_SP_I2C_ADDR_HIGH; + offset -= MLXSW_SP_EEPROM_PAGE_LENGTH; + } + mlxsw_reg_mcia_pack(mcia_pl, mlxsw_sp_port->mapping.module, - 0, 0, offset, size, MLXSW_SP_QSFP_I2C_ADDR); + 0, 0, offset, size, i2c_addr); err = mlxsw_reg_query(mlxsw_sp->core, MLXSW_REG(mcia), mcia_pl); if (err) From 
8195b1396ec86dddbba443c74b2188b423556c74 Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Wed, 6 Sep 2017 13:53:05 -0700 Subject: [PATCH 03/43] hv_netvsc: fix deadlock on hotplug When a virtual device is added dynamically (via host console), then the vmbus sends an offer message for the primary channel. The processing of this message for networking causes the network device to then initialize the sub channels. The problem is that setting up the sub channels needs to wait until the subsequent subchannel offers have been processed. These offers come in on the same ring buffer and work queue as where the primary offer is being processed; leading to a deadlock. This did not happen in older kernels, because the sub channel waiting logic was broken (it wasn't really waiting). The solution is to do the sub channel setup in its own work queue context that is scheduled by the primary channel setup; and then happens later. Fixes: 732e49850c5e ("netvsc: fix race on sub channel creation") Reported-by: Dexuan Cui Signed-off-by: Stephen Hemminger Signed-off-by: David S. 
Miller --- drivers/net/hyperv/hyperv_net.h | 3 + drivers/net/hyperv/netvsc.c | 3 + drivers/net/hyperv/netvsc_drv.c | 11 +-- drivers/net/hyperv/rndis_filter.c | 122 +++++++++++++++++++++--------- 4 files changed, 94 insertions(+), 45 deletions(-) diff --git a/drivers/net/hyperv/hyperv_net.h b/drivers/net/hyperv/hyperv_net.h index ec546da86683..d98cdfb1536b 100644 --- a/drivers/net/hyperv/hyperv_net.h +++ b/drivers/net/hyperv/hyperv_net.h @@ -204,6 +204,8 @@ int netvsc_recv_callback(struct net_device *net, const struct ndis_pkt_8021q_info *vlan); void netvsc_channel_cb(void *context); int netvsc_poll(struct napi_struct *napi, int budget); + +void rndis_set_subchannel(struct work_struct *w); bool rndis_filter_opened(const struct netvsc_device *nvdev); int rndis_filter_open(struct netvsc_device *nvdev); int rndis_filter_close(struct netvsc_device *nvdev); @@ -782,6 +784,7 @@ struct netvsc_device { u32 num_chn; atomic_t open_chn; + struct work_struct subchan_work; wait_queue_head_t subchan_open; struct rndis_device *extension; diff --git a/drivers/net/hyperv/netvsc.c b/drivers/net/hyperv/netvsc.c index 0062b802676f..a5511b7326af 100644 --- a/drivers/net/hyperv/netvsc.c +++ b/drivers/net/hyperv/netvsc.c @@ -81,6 +81,7 @@ static struct netvsc_device *alloc_net_device(void) init_completion(&net_device->channel_init_wait); init_waitqueue_head(&net_device->subchan_open); + INIT_WORK(&net_device->subchan_work, rndis_set_subchannel); return net_device; } @@ -557,6 +558,8 @@ void netvsc_device_remove(struct hv_device *device) = rtnl_dereference(net_device_ctx->nvdev); int i; + cancel_work_sync(&net_device->subchan_work); + netvsc_disconnect_vsp(device); RCU_INIT_POINTER(net_device_ctx->nvdev, NULL); diff --git a/drivers/net/hyperv/netvsc_drv.c b/drivers/net/hyperv/netvsc_drv.c index 165ba4b3b423..c538a4f15f3b 100644 --- a/drivers/net/hyperv/netvsc_drv.c +++ b/drivers/net/hyperv/netvsc_drv.c @@ -853,10 +853,7 @@ static int netvsc_set_channels(struct net_device *net, 
rndis_filter_device_remove(dev, nvdev); nvdev = rndis_filter_device_add(dev, &device_info); - if (!IS_ERR(nvdev)) { - netif_set_real_num_tx_queues(net, nvdev->num_chn); - netif_set_real_num_rx_queues(net, nvdev->num_chn); - } else { + if (IS_ERR(nvdev)) { ret = PTR_ERR(nvdev); device_info.num_chn = orig; nvdev = rndis_filter_device_add(dev, &device_info); @@ -1954,9 +1951,6 @@ static int netvsc_probe(struct hv_device *dev, NETIF_F_HW_VLAN_CTAG_TX | NETIF_F_HW_VLAN_CTAG_RX; net->vlan_features = net->features; - netif_set_real_num_tx_queues(net, nvdev->num_chn); - netif_set_real_num_rx_queues(net, nvdev->num_chn); - netdev_lockdep_set_classes(net); /* MTU range: 68 - 1500 or 65521 */ @@ -2012,9 +2006,10 @@ static int netvsc_remove(struct hv_device *dev) if (vf_netdev) netvsc_unregister_vf(vf_netdev); + unregister_netdevice(net); + rndis_filter_device_remove(dev, rtnl_dereference(ndev_ctx->nvdev)); - unregister_netdevice(net); rtnl_unlock(); hv_set_drvdata(dev, NULL); diff --git a/drivers/net/hyperv/rndis_filter.c b/drivers/net/hyperv/rndis_filter.c index 69c40b8fccc3..731bc7cc6f43 100644 --- a/drivers/net/hyperv/rndis_filter.c +++ b/drivers/net/hyperv/rndis_filter.c @@ -1039,8 +1039,6 @@ static void netvsc_sc_open(struct vmbus_channel *new_sc) /* Set the channel before opening.*/ nvchan->channel = new_sc; - netif_napi_add(ndev, &nvchan->napi, - netvsc_poll, NAPI_POLL_WEIGHT); ret = vmbus_open(new_sc, nvscdev->ring_size * PAGE_SIZE, nvscdev->ring_size * PAGE_SIZE, NULL, 0, @@ -1048,12 +1046,88 @@ static void netvsc_sc_open(struct vmbus_channel *new_sc) if (ret == 0) napi_enable(&nvchan->napi); else - netif_napi_del(&nvchan->napi); + netdev_notice(ndev, "sub channel open failed: %d\n", ret); atomic_inc(&nvscdev->open_chn); wake_up(&nvscdev->subchan_open); } +/* Open sub-channels after completing the handling of the device probe. + * This breaks overlap of processing the host message for the + * new primary channel with the initialization of sub-channels. 
+ */ +void rndis_set_subchannel(struct work_struct *w) +{ + struct netvsc_device *nvdev + = container_of(w, struct netvsc_device, subchan_work); + struct nvsp_message *init_packet = &nvdev->channel_init_pkt; + struct net_device_context *ndev_ctx; + struct rndis_device *rdev; + struct net_device *ndev; + struct hv_device *hv_dev; + int i, ret; + + if (!rtnl_trylock()) { + schedule_work(w); + return; + } + + rdev = nvdev->extension; + if (!rdev) + goto unlock; /* device was removed */ + + ndev = rdev->ndev; + ndev_ctx = netdev_priv(ndev); + hv_dev = ndev_ctx->device_ctx; + + memset(init_packet, 0, sizeof(struct nvsp_message)); + init_packet->hdr.msg_type = NVSP_MSG5_TYPE_SUBCHANNEL; + init_packet->msg.v5_msg.subchn_req.op = NVSP_SUBCHANNEL_ALLOCATE; + init_packet->msg.v5_msg.subchn_req.num_subchannels = + nvdev->num_chn - 1; + ret = vmbus_sendpacket(hv_dev->channel, init_packet, + sizeof(struct nvsp_message), + (unsigned long)init_packet, + VM_PKT_DATA_INBAND, + VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED); + if (ret) { + netdev_err(ndev, "sub channel allocate send failed: %d\n", ret); + goto failed; + } + + wait_for_completion(&nvdev->channel_init_wait); + if (init_packet->msg.v5_msg.subchn_comp.status != NVSP_STAT_SUCCESS) { + netdev_err(ndev, "sub channel request failed\n"); + goto failed; + } + + nvdev->num_chn = 1 + + init_packet->msg.v5_msg.subchn_comp.num_subchannels; + + /* wait for all sub channels to open */ + wait_event(nvdev->subchan_open, + atomic_read(&nvdev->open_chn) == nvdev->num_chn); + + /* ignore failues from setting rss parameters, still have channels */ + rndis_filter_set_rss_param(rdev, netvsc_hash_key); + + netif_set_real_num_tx_queues(ndev, nvdev->num_chn); + netif_set_real_num_rx_queues(ndev, nvdev->num_chn); + + rtnl_unlock(); + return; + +failed: + /* fallback to only primary channel */ + for (i = 1; i < nvdev->num_chn; i++) + netif_napi_del(&nvdev->chan_table[i].napi); + + nvdev->max_chn = 1; + nvdev->num_chn = 1; +unlock: + 
rtnl_unlock(); +} + struct netvsc_device *rndis_filter_device_add(struct hv_device *dev, struct netvsc_device_info *device_info) { @@ -1063,7 +1137,6 @@ struct netvsc_device *rndis_filter_device_add(struct hv_device *dev, struct rndis_device *rndis_device; struct ndis_offload hwcaps; struct ndis_offload_params offloads; - struct nvsp_message *init_packet; struct ndis_recv_scale_cap rsscap; u32 rsscap_size = sizeof(struct ndis_recv_scale_cap); unsigned int gso_max_size = GSO_MAX_SIZE; @@ -1215,9 +1288,7 @@ struct netvsc_device *rndis_filter_device_add(struct hv_device *dev, net_device->num_chn); atomic_set(&net_device->open_chn, 1); - - if (net_device->num_chn == 1) - return net_device; + vmbus_set_sc_create_callback(dev->channel, netvsc_sc_open); for (i = 1; i < net_device->num_chn; i++) { ret = netvsc_alloc_recv_comp_ring(net_device, i); @@ -1228,38 +1299,15 @@ struct netvsc_device *rndis_filter_device_add(struct hv_device *dev, } } - vmbus_set_sc_create_callback(dev->channel, netvsc_sc_open); + for (i = 1; i < net_device->num_chn; i++) + netif_napi_add(net, &net_device->chan_table[i].napi, + netvsc_poll, NAPI_POLL_WEIGHT); - init_packet = &net_device->channel_init_pkt; - memset(init_packet, 0, sizeof(struct nvsp_message)); - init_packet->hdr.msg_type = NVSP_MSG5_TYPE_SUBCHANNEL; - init_packet->msg.v5_msg.subchn_req.op = NVSP_SUBCHANNEL_ALLOCATE; - init_packet->msg.v5_msg.subchn_req.num_subchannels = - net_device->num_chn - 1; - ret = vmbus_sendpacket(dev->channel, init_packet, - sizeof(struct nvsp_message), - (unsigned long)init_packet, - VM_PKT_DATA_INBAND, - VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED); - if (ret) - goto out; + if (net_device->num_chn > 1) + schedule_work(&net_device->subchan_work); - wait_for_completion(&net_device->channel_init_wait); - if (init_packet->msg.v5_msg.subchn_comp.status != NVSP_STAT_SUCCESS) { - ret = -ENODEV; - goto out; - } - - net_device->num_chn = 1 + - init_packet->msg.v5_msg.subchn_comp.num_subchannels; - - /* wait for all 
sub channels to open */ - wait_event(net_device->subchan_open, - atomic_read(&net_device->open_chn) == net_device->num_chn); - - /* ignore failues from setting rss parameters, still have channels */ - rndis_filter_set_rss_param(rndis_device, netvsc_hash_key); out: + /* if unavailable, just proceed with one queue */ if (ret) { net_device->max_chn = 1; net_device->num_chn = 1; @@ -1280,10 +1328,10 @@ void rndis_filter_device_remove(struct hv_device *dev, /* Halt and release the rndis device */ rndis_filter_halt_device(rndis_dev); - kfree(rndis_dev); net_dev->extension = NULL; netvsc_device_remove(dev); + kfree(rndis_dev); } int rndis_filter_open(struct netvsc_device *nvdev) From 8f2bb1de73344dbedd4195016b782bee7bf3598f Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Wed, 6 Sep 2017 13:53:06 -0700 Subject: [PATCH 04/43] hv_netvsc: avoid unnecessary wakeups on subchannel creation Only need to wakeup the initiator after all sub-channels are opened. Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller --- drivers/net/hyperv/rndis_filter.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/hyperv/rndis_filter.c b/drivers/net/hyperv/rndis_filter.c index 731bc7cc6f43..065b204d8e17 100644 --- a/drivers/net/hyperv/rndis_filter.c +++ b/drivers/net/hyperv/rndis_filter.c @@ -1048,8 +1048,8 @@ static void netvsc_sc_open(struct vmbus_channel *new_sc) else netdev_notice(ndev, "sub channel open failed: %d\n", ret); - atomic_inc(&nvscdev->open_chn); - wake_up(&nvscdev->subchan_open); + if (atomic_inc_return(&nvscdev->open_chn) == nvscdev->num_chn) + wake_up(&nvscdev->subchan_open); } /* Open sub-channels after completing the handling of the device probe. 
From c6644d07eff6588b2dedf881279fb0d1c7783970 Mon Sep 17 00:00:00 2001 From: Kosuke Tatsukawa Date: Wed, 6 Sep 2017 22:47:59 +0000 Subject: [PATCH 05/43] net: bonding: Fix transmit load balancing in balance-alb mode if specified by sysfs Commit cbf5ecb30560 ("net: bonding: Fix transmit load balancing in balance-alb mode") tried to fix transmit dynamic load balancing in balance-alb mode, which wasn't working after commit 8b426dc54cf4 ("bonding: remove hardcoded value"). It turned out that my previous patch only fixed the case when balance-alb was specified as a bonding module parameter, and not when balance-alb mode was set using /sys/class/net/*/bonding/mode (the most common usage). In the latter case, tlb_dynamic_lb was set up according to the default mode of the bonding interface, which happens to be balance-rr. This additional patch addresses this issue by setting up tlb_dynamic_lb to 1 if "mode" is set to balance-alb through the sysfs interface. I didn't add code to change tlb_dynamic_lb back to the default value for other modes, because "mode" is usually set up only once during initialization, and it's not worthwhile to change the static variable bonding_defaults in bond_main.c to a global variable just for this purpose. Commit 8b426dc54cf4 also changes the value of tlb_dynamic_lb for balance-tlb mode if it is set up using the sysfs interface. I didn't change that behavior, because the value of tlb_dynamic_lb can be changed using the sysfs interface for balance-tlb, and I didn't like changing the default value back and forth for balance-tlb. As for balance-alb, /sys/class/net/*/bonding/tlb_dynamic_lb cannot be written to. However, I think balance-alb with tlb_dynamic_lb set to 0 is not an intended usage, so there is little use in making it writable at this moment. 
Fixes: 8b426dc54cf4 ("bonding: remove hardcoded value") Reported-by: Reinis Rozitis Signed-off-by: Kosuke Tatsukawa Cc: stable@vger.kernel.org # v4.12+ Acked-by: Nikolay Aleksandrov Acked-by: Mahesh Bandewar Signed-off-by: David S. Miller --- drivers/net/bonding/bond_options.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/net/bonding/bond_options.c b/drivers/net/bonding/bond_options.c index a12d603d41c6..5931aa2fe997 100644 --- a/drivers/net/bonding/bond_options.c +++ b/drivers/net/bonding/bond_options.c @@ -754,6 +754,9 @@ static int bond_option_mode_set(struct bonding *bond, bond->params.miimon); } + if (newval->value == BOND_MODE_ALB) + bond->params.tlb_dynamic_lb = 1; + /* don't cache arp_validate between modes */ bond->params.arp_validate = BOND_ARP_VALIDATE_NONE; bond->params.mode = newval->value; From 609320c8a22715b74b39796930c3542719f8ab62 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Thu, 7 Sep 2017 18:36:15 -0700 Subject: [PATCH 06/43] perf/bpf: fix a clang compilation issue clang does not support variable length array for structure member. It has the following error during compilation: kernel/trace/trace_syscalls.c:568:17: error: fields must have a constant size: 'variable length array in structure' extension will never be supported unsigned long args[sys_data->nb_args]; ^ The fix is to use a fixed array length instead. Reported-by: Nick Desaulniers Signed-off-by: Yonghong Song Signed-off-by: David S. Miller --- include/linux/syscalls.h | 2 ++ kernel/trace/trace_syscalls.c | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index 88951b795ee3..95606a2d556f 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -200,6 +200,8 @@ static inline int is_syscall_trace_event(struct trace_event_call *tp_event) #define SYSCALL_DEFINE5(name, ...) SYSCALL_DEFINEx(5, _##name, __VA_ARGS__) #define SYSCALL_DEFINE6(name, ...) 
SYSCALL_DEFINEx(6, _##name, __VA_ARGS__) +#define SYSCALL_DEFINE_MAXARGS 6 + #define SYSCALL_DEFINEx(x, sname, ...) \ SYSCALL_METADATA(sname, x, __VA_ARGS__) \ __SYSCALL_DEFINEx(x, sname, __VA_ARGS__) diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index 9c4eef20301c..696afe72d3b1 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c @@ -565,7 +565,7 @@ static int perf_call_bpf_enter(struct bpf_prog *prog, struct pt_regs *regs, struct syscall_tp_t { unsigned long long regs; unsigned long syscall_nr; - unsigned long args[sys_data->nb_args]; + unsigned long args[SYSCALL_DEFINE_MAXARGS]; } param; int i; From 96c5508e3012ed0984ab93821d64ac1ff3279c09 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Sun, 10 Sep 2017 09:47:02 +0200 Subject: [PATCH 07/43] xdp: implement xdp_redirect_map for generic XDP Using bpf_redirect_map is allowed for generic XDP programs, but the appropriate map lookup was never performed in xdp_do_generic_redirect(). Instead the map-index is directly used as the ifindex. For the xdp_redirect_map sample in SKB-mode '-S', this resulted in trying sending on ifindex 0 which isn't valid, resulting in getting SKB packets dropped. Thus, the reported performance numbers are wrong in commit 24251c264798 ("samples/bpf: add option for native and skb mode for redirect apps") for the 'xdp_redirect_map -S' case. Before commit 109980b894e9 ("bpf: don't select potentially stale ri->map from buggy xdp progs") it could crash the kernel. Like this commit also check that the map_owner owner is correct before dereferencing the map pointer. But make sure that this API misusage can be caught by a tracepoint. Thus, allowing userspace via tracepoints to detect misbehaving bpf_progs. Fixes: 6103aa96ec07 ("net: implement XDP_REDIRECT for xdp generic") Fixes: 24251c264798 ("samples/bpf: add option for native and skb mode for redirect apps") Signed-off-by: Jesper Dangaard Brouer Signed-off-by: David S. 
Miller --- include/trace/events/xdp.h | 4 ++-- net/core/filter.c | 38 ++++++++++++++++++++++++++------------ 2 files changed, 28 insertions(+), 14 deletions(-) diff --git a/include/trace/events/xdp.h b/include/trace/events/xdp.h index 862575ac8da9..4e16c43fba10 100644 --- a/include/trace/events/xdp.h +++ b/include/trace/events/xdp.h @@ -138,11 +138,11 @@ DEFINE_EVENT_PRINT(xdp_redirect_template, xdp_redirect_map_err, #define _trace_xdp_redirect_map(dev, xdp, fwd, map, idx) \ trace_xdp_redirect_map(dev, xdp, fwd ? fwd->ifindex : 0, \ - 0, map, idx); + 0, map, idx) #define _trace_xdp_redirect_map_err(dev, xdp, fwd, map, idx, err) \ trace_xdp_redirect_map_err(dev, xdp, fwd ? fwd->ifindex : 0, \ - err, map, idx); + err, map, idx) #endif /* _TRACE_XDP_H */ diff --git a/net/core/filter.c b/net/core/filter.c index 3a50a9b021e2..24dd33dd9f04 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -2506,21 +2506,19 @@ static int xdp_do_redirect_map(struct net_device *dev, struct xdp_buff *xdp, struct redirect_info *ri = this_cpu_ptr(&redirect_info); const struct bpf_prog *map_owner = ri->map_owner; struct bpf_map *map = ri->map; + struct net_device *fwd = NULL; u32 index = ri->ifindex; - struct net_device *fwd; int err; ri->ifindex = 0; ri->map = NULL; ri->map_owner = NULL; - /* This is really only caused by a deliberately crappy - * BPF program, normally we would never hit that case, - * so no need to inform someone via tracepoints either, - * just bail out. 
- */ - if (unlikely(map_owner != xdp_prog)) - return -EINVAL; + if (unlikely(map_owner != xdp_prog)) { + err = -EFAULT; + map = NULL; + goto err; + } fwd = __dev_map_lookup_elem(map, index); if (!fwd) { @@ -2576,13 +2574,27 @@ int xdp_do_generic_redirect(struct net_device *dev, struct sk_buff *skb, struct bpf_prog *xdp_prog) { struct redirect_info *ri = this_cpu_ptr(&redirect_info); + const struct bpf_prog *map_owner = ri->map_owner; + struct bpf_map *map = ri->map; + struct net_device *fwd = NULL; u32 index = ri->ifindex; - struct net_device *fwd; unsigned int len; int err = 0; - fwd = dev_get_by_index_rcu(dev_net(dev), index); ri->ifindex = 0; + ri->map = NULL; + ri->map_owner = NULL; + + if (map) { + if (unlikely(map_owner != xdp_prog)) { + err = -EFAULT; + map = NULL; + goto err; + } + fwd = __dev_map_lookup_elem(map, index); + } else { + fwd = dev_get_by_index_rcu(dev_net(dev), index); + } if (unlikely(!fwd)) { err = -EINVAL; goto err; @@ -2600,10 +2612,12 @@ int xdp_do_generic_redirect(struct net_device *dev, struct sk_buff *skb, } skb->dev = fwd; - _trace_xdp_redirect(dev, xdp_prog, index); + map ? _trace_xdp_redirect_map(dev, xdp_prog, fwd, map, index) + : _trace_xdp_redirect(dev, xdp_prog, index); return 0; err: - _trace_xdp_redirect_err(dev, xdp_prog, index, err); + map ? _trace_xdp_redirect_map_err(dev, xdp_prog, fwd, map, index, err) + : _trace_xdp_redirect_err(dev, xdp_prog, index, err); return err; } EXPORT_SYMBOL_GPL(xdp_do_generic_redirect); From 33e34e735fd4227b49735f1fc059dc9646abd1c6 Mon Sep 17 00:00:00 2001 From: David Lebrun Date: Sun, 10 Sep 2017 14:22:01 +0100 Subject: [PATCH 08/43] ipv6: sr: remove duplicate routing header type check As seg6_validate_srh() already checks that the Routing Header type is correct, it is not necessary to do it again in get_srh(). Fixes: 5829d70b ("ipv6: sr: fix get_srh() to comply with IPv6 standard "RFC 8200") Signed-off-by: David Lebrun Signed-off-by: David S. 
Miller --- net/ipv6/seg6_local.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/net/ipv6/seg6_local.c b/net/ipv6/seg6_local.c index 7ff54db73a48..825b8e01f947 100644 --- a/net/ipv6/seg6_local.c +++ b/net/ipv6/seg6_local.c @@ -72,10 +72,6 @@ static struct ipv6_sr_hdr *get_srh(struct sk_buff *skb) srh = (struct ipv6_sr_hdr *)(skb->data + srhoff); - /* make sure it's a Segment Routing header (Routing Type 4) */ - if (srh->type != IPV6_SRCRT_TYPE_4) - return NULL; - len = (srh->hdrlen + 1) << 3; if (!pskb_may_pull(skb, srhoff + len)) From 230cfd2dbc228a6992287d31c5d93bc6c2552024 Mon Sep 17 00:00:00 2001 From: Josh Hunt Date: Sun, 10 Sep 2017 15:48:50 -0400 Subject: [PATCH 09/43] net/sched: fix pointer check in gen_handle Fixes sparse warning about pointer in gen_handle: net/sched/cls_rsvp.h:392:40: warning: Using plain integer as NULL pointer Fixes: 8113c095672f6 ("net_sched: use void pointer for filter handle") Signed-off-by: Josh Hunt Acked-by: Cong Wang Signed-off-by: David S. Miller --- net/sched/cls_rsvp.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/sched/cls_rsvp.h b/net/sched/cls_rsvp.h index 98c05db85bcb..b1f6ed48bc72 100644 --- a/net/sched/cls_rsvp.h +++ b/net/sched/cls_rsvp.h @@ -389,7 +389,7 @@ static unsigned int gen_handle(struct tcf_proto *tp, unsigned salt) if ((data->hgenerator += 0x10000) == 0) data->hgenerator = 0x10000; h = data->hgenerator|salt; - if (rsvp_get(tp, h) == 0) + if (!rsvp_get(tp, h)) return h; } return 0; From 9c0827317f235865ae421293f8aecf6cb327a63e Mon Sep 17 00:00:00 2001 From: Nisar Sayed Date: Mon, 11 Sep 2017 17:43:11 +0000 Subject: [PATCH 10/43] smsc95xx: Configure pause time to 0xffff when tx flow control enabled Configure pause time to 0xffff when tx flow control enabled Set pause time to 0xffff in the pause frame to indicate the partner to stop sending the packets. When RX buffer frees up, the device sends pause frame with pause time zero for partner to resume transmission. 
Fixes: 2f7ca802bdae ("Add SMSC LAN9500 USB2.0 10/100 ethernet adapter driver") Signed-off-by: Nisar Sayed Signed-off-by: David S. Miller --- drivers/net/usb/smsc95xx.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/drivers/net/usb/smsc95xx.c b/drivers/net/usb/smsc95xx.c index 340c13484e5c..309b88acd3d0 100644 --- a/drivers/net/usb/smsc95xx.c +++ b/drivers/net/usb/smsc95xx.c @@ -526,7 +526,7 @@ static void smsc95xx_set_multicast(struct net_device *netdev) static int smsc95xx_phy_update_flowcontrol(struct usbnet *dev, u8 duplex, u16 lcladv, u16 rmtadv) { - u32 flow, afc_cfg = 0; + u32 flow = 0, afc_cfg; int ret = smsc95xx_read_reg(dev, AFC_CFG, &afc_cfg); if (ret < 0) @@ -537,20 +537,19 @@ static int smsc95xx_phy_update_flowcontrol(struct usbnet *dev, u8 duplex, if (cap & FLOW_CTRL_RX) flow = 0xFFFF0002; - else - flow = 0; - if (cap & FLOW_CTRL_TX) + if (cap & FLOW_CTRL_TX) { afc_cfg |= 0xF; - else + flow |= 0xFFFF0000; + } else { afc_cfg &= ~0xF; + } netif_dbg(dev, link, dev->net, "rx pause %s, tx pause %s\n", cap & FLOW_CTRL_RX ? "enabled" : "disabled", cap & FLOW_CTRL_TX ? "enabled" : "disabled"); } else { netif_dbg(dev, link, dev->net, "half duplex\n"); - flow = 0; afc_cfg |= 0xF; } From 5829e62ac17a40ab08c1b905565604a4b5fa7af6 Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Mon, 11 Sep 2017 21:56:20 +0200 Subject: [PATCH 11/43] openvswitch: Fix an error handling path in 'ovs_nla_init_match_and_action()' All other error handling paths in this function go through the 'error' label. This one should do the same. Fixes: 9cc9a5cb176c ("datapath: Avoid using stack larger than 1024.") Signed-off-by: Christophe JAILLET Acked-by: Pravin B Shelar Signed-off-by: David S. 
Miller --- net/openvswitch/datapath.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c index 76cf273a56c7..c3aec6227c91 100644 --- a/net/openvswitch/datapath.c +++ b/net/openvswitch/datapath.c @@ -1112,7 +1112,8 @@ static int ovs_nla_init_match_and_action(struct net *net, if (!a[OVS_FLOW_ATTR_KEY]) { OVS_NLERR(log, "Flow key attribute not present in set flow."); - return -EINVAL; + error = -EINVAL; + goto error; } *acts = get_flow_actions(net, a[OVS_FLOW_ATTR_ACTIONS], key, From da8ab57863ed7e912d10b179b6bdc652f635bd19 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 11 Sep 2017 15:58:38 -0700 Subject: [PATCH 12/43] tcp/dccp: remove reqsk_put() from inet_child_forget() Back in linux-4.4, I inadvertently put a call to reqsk_put() in inet_child_forget(), forgetting it could be called from two different points. In the case it is called from inet_csk_reqsk_queue_add(), we want to keep the reference on the request socket, since it is released later by the caller (tcp_v{4|6}_rcv()). This bug never showed up because atomic_dec_and_test() was not signaling the underflow, and SLAB_DESTROY_BY_RCU semantic for request sockets prevented the request from being put in quarantine. Recent conversion of socket refcount from atomic_t to refcount_t finally exposed the bug. So move the reqsk_put() to inet_csk_listen_stop() to fix this. Thanks to Shankara Pailoor for using syzkaller and providing a nice set of .config and C repro. WARNING: CPU: 2 PID: 4277 at lib/refcount.c:186 refcount_sub_and_test+0x167/0x1b0 lib/refcount.c:186 Kernel panic - not syncing: panic_on_warn set ... 
CPU: 2 PID: 4277 Comm: syz-executor0 Not tainted 4.13.0-rc7 #3 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Ubuntu-1.8.2-1ubuntu1 04/01/2014 Call Trace: __dump_stack lib/dump_stack.c:16 [inline] dump_stack+0xf7/0x1aa lib/dump_stack.c:52 panic+0x1ae/0x3a7 kernel/panic.c:180 __warn+0x1c4/0x1d9 kernel/panic.c:541 report_bug+0x211/0x2d0 lib/bug.c:183 fixup_bug+0x40/0x90 arch/x86/kernel/traps.c:190 do_trap_no_signal arch/x86/kernel/traps.c:224 [inline] do_trap+0x260/0x390 arch/x86/kernel/traps.c:273 do_error_trap+0x118/0x340 arch/x86/kernel/traps.c:310 do_invalid_op+0x1b/0x20 arch/x86/kernel/traps.c:323 invalid_op+0x18/0x20 arch/x86/entry/entry_64.S:846 RIP: 0010:refcount_sub_and_test+0x167/0x1b0 lib/refcount.c:186 RSP: 0018:ffff88006e006b60 EFLAGS: 00010286 RAX: 0000000000000026 RBX: 0000000000000000 RCX: 0000000000000000 RDX: 0000000000000026 RSI: 1ffff1000dc00d2c RDI: ffffed000dc00d60 RBP: ffff88006e006bf0 R08: 0000000000000001 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000000 R12: 1ffff1000dc00d6d R13: 00000000ffffffff R14: 0000000000000001 R15: ffff88006ce9d340 refcount_dec_and_test+0x1a/0x20 lib/refcount.c:211 reqsk_put+0x71/0x2b0 include/net/request_sock.h:123 tcp_v4_rcv+0x259e/0x2e20 net/ipv4/tcp_ipv4.c:1729 ip_local_deliver_finish+0x2e2/0xba0 net/ipv4/ip_input.c:216 NF_HOOK include/linux/netfilter.h:248 [inline] ip_local_deliver+0x1ce/0x6d0 net/ipv4/ip_input.c:257 dst_input include/net/dst.h:477 [inline] ip_rcv_finish+0x8db/0x19c0 net/ipv4/ip_input.c:397 NF_HOOK include/linux/netfilter.h:248 [inline] ip_rcv+0xc3f/0x17d0 net/ipv4/ip_input.c:488 __netif_receive_skb_core+0x1fb7/0x31f0 net/core/dev.c:4298 __netif_receive_skb+0x2c/0x1b0 net/core/dev.c:4336 process_backlog+0x1c5/0x6d0 net/core/dev.c:5102 napi_poll net/core/dev.c:5499 [inline] net_rx_action+0x6d3/0x14a0 net/core/dev.c:5565 __do_softirq+0x2cb/0xb2d kernel/softirq.c:284 do_softirq_own_stack+0x1c/0x30 arch/x86/entry/entry_64.S:898 do_softirq.part.16+0x63/0x80 
kernel/softirq.c:328 do_softirq kernel/softirq.c:176 [inline] __local_bh_enable_ip+0x84/0x90 kernel/softirq.c:181 local_bh_enable include/linux/bottom_half.h:31 [inline] rcu_read_unlock_bh include/linux/rcupdate.h:705 [inline] ip_finish_output2+0x8ad/0x1360 net/ipv4/ip_output.c:231 ip_finish_output+0x74e/0xb80 net/ipv4/ip_output.c:317 NF_HOOK_COND include/linux/netfilter.h:237 [inline] ip_output+0x1cc/0x850 net/ipv4/ip_output.c:405 dst_output include/net/dst.h:471 [inline] ip_local_out+0x95/0x160 net/ipv4/ip_output.c:124 ip_queue_xmit+0x8c6/0x1810 net/ipv4/ip_output.c:504 tcp_transmit_skb+0x1963/0x3320 net/ipv4/tcp_output.c:1123 tcp_send_ack.part.35+0x38c/0x620 net/ipv4/tcp_output.c:3575 tcp_send_ack+0x49/0x60 net/ipv4/tcp_output.c:3545 tcp_rcv_synsent_state_process net/ipv4/tcp_input.c:5795 [inline] tcp_rcv_state_process+0x4876/0x4b60 net/ipv4/tcp_input.c:5930 tcp_v4_do_rcv+0x58a/0x820 net/ipv4/tcp_ipv4.c:1483 sk_backlog_rcv include/net/sock.h:907 [inline] __release_sock+0x124/0x360 net/core/sock.c:2223 release_sock+0xa4/0x2a0 net/core/sock.c:2715 inet_wait_for_connect net/ipv4/af_inet.c:557 [inline] __inet_stream_connect+0x671/0xf00 net/ipv4/af_inet.c:643 inet_stream_connect+0x58/0xa0 net/ipv4/af_inet.c:682 SYSC_connect+0x204/0x470 net/socket.c:1628 SyS_connect+0x24/0x30 net/socket.c:1609 entry_SYSCALL_64_fastpath+0x18/0xad RIP: 0033:0x451e59 RSP: 002b:00007f474843fc08 EFLAGS: 00000216 ORIG_RAX: 000000000000002a RAX: ffffffffffffffda RBX: 0000000000718000 RCX: 0000000000451e59 RDX: 0000000000000010 RSI: 0000000020002000 RDI: 0000000000000007 RBP: 0000000000000046 R08: 0000000000000000 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000216 R12: 0000000000000000 R13: 00007ffc040a0f8f R14: 00007f47484409c0 R15: 0000000000000000 Fixes: ebb516af60e1 ("tcp/dccp: fix race at listener dismantle phase") Signed-off-by: Eric Dumazet Reported-by: Shankara Pailoor Tested-by: Shankara Pailoor Signed-off-by: David S. 
Miller --- net/ipv4/inet_connection_sock.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index 4089c013cb03..b9c64b40a83a 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -916,7 +916,6 @@ static void inet_child_forget(struct sock *sk, struct request_sock *req, tcp_sk(child)->fastopen_rsk = NULL; } inet_csk_destroy_sock(child); - reqsk_put(req); } struct sock *inet_csk_reqsk_queue_add(struct sock *sk, @@ -987,6 +986,7 @@ void inet_csk_listen_stop(struct sock *sk) sock_hold(child); inet_child_forget(sk, req, child); + reqsk_put(req); bh_unlock_sock(child); local_bh_enable(); sock_put(child); From d7fb60b9cafb982cb2e46a267646a8dfd4f2e5da Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Mon, 11 Sep 2017 16:33:30 -0700 Subject: [PATCH 13/43] net_sched: get rid of tcfa_rcu gen estimator has been rewritten in commit 1c0d32fde5bd ("net_sched: gen_estimator: complete rewrite of rate estimators"), the caller is no longer needed to wait for a grace period. So this patch gets rid of it. This also completely closes a race condition between action free path and filter chain add/remove path for the following patch. Because otherwise the nested RCU callback can't be caught by rcu_barrier(). Please see also the comments in code. Cc: Jiri Pirko Cc: Jamal Hadi Salim Cc: Eric Dumazet Signed-off-by: Cong Wang Signed-off-by: David S. 
Miller --- include/net/act_api.h | 2 -- net/sched/act_api.c | 17 ++++++++--------- 2 files changed, 8 insertions(+), 11 deletions(-) diff --git a/include/net/act_api.h b/include/net/act_api.h index 8f3d5d8b5ae0..b944e0eb93be 100644 --- a/include/net/act_api.h +++ b/include/net/act_api.h @@ -34,7 +34,6 @@ struct tc_action { struct gnet_stats_queue tcfa_qstats; struct net_rate_estimator __rcu *tcfa_rate_est; spinlock_t tcfa_lock; - struct rcu_head tcfa_rcu; struct gnet_stats_basic_cpu __percpu *cpu_bstats; struct gnet_stats_queue __percpu *cpu_qstats; struct tc_cookie *act_cookie; @@ -50,7 +49,6 @@ struct tc_action { #define tcf_qstats common.tcfa_qstats #define tcf_rate_est common.tcfa_rate_est #define tcf_lock common.tcfa_lock -#define tcf_rcu common.tcfa_rcu /* Update lastuse only if needed, to avoid dirtying a cache line. * We use a temp variable to avoid fetching jiffies twice. diff --git a/net/sched/act_api.c b/net/sched/act_api.c index a306974e2fb4..fcd7dc7b807a 100644 --- a/net/sched/act_api.c +++ b/net/sched/act_api.c @@ -53,10 +53,13 @@ static void tcf_action_goto_chain_exec(const struct tc_action *a, res->goto_tp = rcu_dereference_bh(chain->filter_chain); } -static void free_tcf(struct rcu_head *head) +/* XXX: For standalone actions, we don't need a RCU grace period either, because + * actions are always connected to filters and filters are already destroyed in + * RCU callbacks, so after a RCU grace period actions are already disconnected + * from filters. Readers later can not find us. 
+ */ +static void free_tcf(struct tc_action *p) { - struct tc_action *p = container_of(head, struct tc_action, tcfa_rcu); - free_percpu(p->cpu_bstats); free_percpu(p->cpu_qstats); @@ -76,11 +79,7 @@ static void tcf_idr_remove(struct tcf_idrinfo *idrinfo, struct tc_action *p) idr_remove_ext(&idrinfo->action_idr, p->tcfa_index); spin_unlock_bh(&idrinfo->lock); gen_kill_estimator(&p->tcfa_rate_est); - /* - * gen_estimator est_timer() might access p->tcfa_lock - * or bstats, wait a RCU grace period before freeing p - */ - call_rcu(&p->tcfa_rcu, free_tcf); + free_tcf(p); } int __tcf_idr_release(struct tc_action *p, bool bind, bool strict) @@ -259,7 +258,7 @@ void tcf_idr_cleanup(struct tc_action *a, struct nlattr *est) { if (est) gen_kill_estimator(&a->tcfa_rate_est); - call_rcu(&a->tcfa_rcu, free_tcf); + free_tcf(a); } EXPORT_SYMBOL(tcf_idr_cleanup); From e2ef75445340ca7ec2c4558f84ae6c8c5d650fc8 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Mon, 11 Sep 2017 16:33:31 -0700 Subject: [PATCH 14/43] net_sched: fix reference counting of tc filter chain This patch fixes the following ugliness of tc filter chain refcnt: a) tp proto should hold a refcnt to the chain too. This significantly simplifies the logic. b) Chain 0 is no longer special, it is created with refcnt=1 like any other chains. All the ugliness in tcf_chain_put() can be gone! c) No need to handle the flushing oddly, because block still holds chain 0, it can not be released, this guarantees block is the last user. d) The race condition with RCU callbacks is easier to handle with just a rcu_barrier(). Much easier to understand, nothing to hide. Thanks to the previous patch. Please see also the comments in code. e) Make the code understandable by humans, much less error-prone. 
Fixes: 744a4cf63e52 ("net: sched: fix use after free when tcf_chain_destroy is called multiple times") Fixes: 5bc1701881e3 ("net: sched: introduce multichain support for filters") Cc: Jiri Pirko Cc: Jamal Hadi Salim Signed-off-by: Cong Wang Acked-by: Jiri Pirko Signed-off-by: David S. Miller --- net/sched/cls_api.c | 47 +++++++++++++++++++++++++-------------------- 1 file changed, 26 insertions(+), 21 deletions(-) diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index c743f03cfebd..d29e79d98a69 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -182,7 +182,7 @@ static struct tcf_chain *tcf_chain_create(struct tcf_block *block, list_add_tail(&chain->list, &block->chain_list); chain->block = block; chain->index = chain_index; - chain->refcnt = 0; + chain->refcnt = 1; return chain; } @@ -194,21 +194,20 @@ static void tcf_chain_flush(struct tcf_chain *chain) RCU_INIT_POINTER(*chain->p_filter_chain, NULL); while ((tp = rtnl_dereference(chain->filter_chain)) != NULL) { RCU_INIT_POINTER(chain->filter_chain, tp->next); + tcf_chain_put(chain); tcf_proto_destroy(tp); } } static void tcf_chain_destroy(struct tcf_chain *chain) { - /* May be already removed from the list by the previous call. */ - if (!list_empty(&chain->list)) - list_del_init(&chain->list); + list_del(&chain->list); + kfree(chain); +} - /* There might still be a reference held when we got here from - * tcf_block_put. Wait for the user to drop reference before free. - */ - if (!chain->refcnt) - kfree(chain); +static void tcf_chain_hold(struct tcf_chain *chain) +{ + ++chain->refcnt; } struct tcf_chain *tcf_chain_get(struct tcf_block *block, u32 chain_index, @@ -217,24 +216,19 @@ struct tcf_chain *tcf_chain_get(struct tcf_block *block, u32 chain_index, struct tcf_chain *chain; list_for_each_entry(chain, &block->chain_list, list) { - if (chain->index == chain_index) - goto incref; + if (chain->index == chain_index) { + tcf_chain_hold(chain); + return chain; + } } - chain = create ? 
tcf_chain_create(block, chain_index) : NULL; -incref: - if (chain) - chain->refcnt++; - return chain; + return create ? tcf_chain_create(block, chain_index) : NULL; } EXPORT_SYMBOL(tcf_chain_get); void tcf_chain_put(struct tcf_chain *chain) { - /* Destroy unused chain, with exception of chain 0, which is the - * default one and has to be always present. - */ - if (--chain->refcnt == 0 && !chain->filter_chain && chain->index != 0) + if (--chain->refcnt == 0) tcf_chain_destroy(chain); } EXPORT_SYMBOL(tcf_chain_put); @@ -279,10 +273,19 @@ void tcf_block_put(struct tcf_block *block) if (!block) return; + /* XXX: Standalone actions are not allowed to jump to any chain, and + * bound actions should be all removed after flushing. However, + * filters are destroyed in RCU callbacks, we have to flush and wait + * for them inside the loop, otherwise we race with RCU callbacks on + * this list. + */ list_for_each_entry_safe(chain, tmp, &block->chain_list, list) { tcf_chain_flush(chain); - tcf_chain_destroy(chain); + rcu_barrier(); } + + list_for_each_entry_safe(chain, tmp, &block->chain_list, list) + tcf_chain_put(chain); kfree(block); } EXPORT_SYMBOL(tcf_block_put); @@ -360,6 +363,7 @@ static void tcf_chain_tp_insert(struct tcf_chain *chain, rcu_assign_pointer(*chain->p_filter_chain, tp); RCU_INIT_POINTER(tp->next, tcf_chain_tp_prev(chain_info)); rcu_assign_pointer(*chain_info->pprev, tp); + tcf_chain_hold(chain); } static void tcf_chain_tp_remove(struct tcf_chain *chain, @@ -371,6 +375,7 @@ static void tcf_chain_tp_remove(struct tcf_chain *chain, if (chain->p_filter_chain && tp == chain->filter_chain) RCU_INIT_POINTER(*chain->p_filter_chain, next); RCU_INIT_POINTER(*chain_info->pprev, next); + tcf_chain_put(chain); } static struct tcf_proto *tcf_chain_tp_find(struct tcf_chain *chain, From 1697c4bb5245649a23f06a144cc38c06715e1b65 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Mon, 11 Sep 2017 16:33:32 -0700 Subject: [PATCH 15/43] net_sched: carefully handle tcf_block_put() 
As pointed out by Jiri, there is still a race condition between tcf_block_put() and tcf_chain_destroy() in a RCU callback. There is no way to make it correct without proper locking or synchronization, because both operate on a shared list. Locking is hard, because the only lock we can pick here is a spinlock, however, in tc_dump_tfilter() we iterate this list with a sleeping function called (tcf_chain_dump()), which makes using a lock to protect chain_list almost impossible. Jiri suggested the idea of holding a refcnt before flushing, this works because it guarantees us there would be no parallel tcf_chain_destroy() during the loop, therefore the race condition is gone. But we have to be very careful with proper synchronization with RCU callbacks. Suggested-by: Jiri Pirko Cc: Jamal Hadi Salim Signed-off-by: Cong Wang Acked-by: Jiri Pirko Signed-off-by: David S. Miller --- net/sched/cls_api.c | 26 +++++++++++++++++++------- 1 file changed, 19 insertions(+), 7 deletions(-) diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index d29e79d98a69..0b2219adf520 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -275,15 +275,27 @@ void tcf_block_put(struct tcf_block *block) /* XXX: Standalone actions are not allowed to jump to any chain, and * bound actions should be all removed after flushing. However, - * filters are destroyed in RCU callbacks, we have to flush and wait - * for them inside the loop, otherwise we race with RCU callbacks on - * this list. + * filters are destroyed in RCU callbacks, we have to hold the chains + * first, otherwise we would always race with RCU callbacks on this list + * without proper locking. */ - list_for_each_entry_safe(chain, tmp, &block->chain_list, list) { - tcf_chain_flush(chain); - rcu_barrier(); - } + /* Wait for existing RCU callbacks to cool down. */ + rcu_barrier(); + + /* Hold a refcnt for all chains, except 0, in case they are gone. 
*/ + list_for_each_entry(chain, &block->chain_list, list) + if (chain->index) + tcf_chain_hold(chain); + + /* No race on the list, because no chain could be destroyed. */ + list_for_each_entry(chain, &block->chain_list, list) + tcf_chain_flush(chain); + + /* Wait for RCU callbacks to release the reference count. */ + rcu_barrier(); + + /* At this point, all the chains should have refcnt == 1. */ list_for_each_entry_safe(chain, tmp, &block->chain_list, list) tcf_chain_put(chain); kfree(block); From 6399ebcccffa12e65bc15eda039d37673264ebce Mon Sep 17 00:00:00 2001 From: Yuval Mintz Date: Tue, 12 Sep 2017 08:50:53 +0200 Subject: [PATCH 16/43] mlxsw: spectrum: Prevent mirred-related crash on removal When removing the offloading of mirred actions under matchall classifiers, mlxsw would find the destination port associated with the offloaded action and utilize it for undoing the configuration. Depending on the order by which ports are removed, it's possible that the destination port would get removed before the source port. In such a scenario, when actions would be flushed for the source port mlxsw would perform an illegal dereference as the destination port is no longer listed. Since the only item necessary for undoing the configuration on the destination side is the port-id and that in turn is already maintained by mlxsw on the source-port, simply stop trying to access the destination port and use the port-id directly instead. Fixes: 763b4b70af ("mlxsw: spectrum: Add support in matchall mirror TC offloading") Signed-off-by: Yuval Mintz Signed-off-by: Jiri Pirko Signed-off-by: David S. 
Miller --- .../net/ethernet/mellanox/mlxsw/spectrum.c | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c index e0804599fcae..696b99e65a5a 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c @@ -575,15 +575,14 @@ static void mlxsw_sp_span_entry_destroy(struct mlxsw_sp *mlxsw_sp, } static struct mlxsw_sp_span_entry * -mlxsw_sp_span_entry_find(struct mlxsw_sp_port *port) +mlxsw_sp_span_entry_find(struct mlxsw_sp *mlxsw_sp, u8 local_port) { - struct mlxsw_sp *mlxsw_sp = port->mlxsw_sp; int i; for (i = 0; i < mlxsw_sp->span.entries_count; i++) { struct mlxsw_sp_span_entry *curr = &mlxsw_sp->span.entries[i]; - if (curr->used && curr->local_port == port->local_port) + if (curr->used && curr->local_port == local_port) return curr; } return NULL; @@ -594,7 +593,8 @@ static struct mlxsw_sp_span_entry { struct mlxsw_sp_span_entry *span_entry; - span_entry = mlxsw_sp_span_entry_find(port); + span_entry = mlxsw_sp_span_entry_find(port->mlxsw_sp, + port->local_port); if (span_entry) { /* Already exists, just take a reference */ span_entry->ref_count++; @@ -783,12 +783,13 @@ static int mlxsw_sp_span_mirror_add(struct mlxsw_sp_port *from, } static void mlxsw_sp_span_mirror_remove(struct mlxsw_sp_port *from, - struct mlxsw_sp_port *to, + u8 destination_port, enum mlxsw_sp_span_type type) { struct mlxsw_sp_span_entry *span_entry; - span_entry = mlxsw_sp_span_entry_find(to); + span_entry = mlxsw_sp_span_entry_find(from->mlxsw_sp, + destination_port); if (!span_entry) { netdev_err(from->dev, "no span entry found\n"); return; @@ -1563,14 +1564,12 @@ static void mlxsw_sp_port_del_cls_matchall_mirror(struct mlxsw_sp_port *mlxsw_sp_port, struct mlxsw_sp_port_mall_mirror_tc_entry *mirror) { - struct mlxsw_sp *mlxsw_sp = mlxsw_sp_port->mlxsw_sp; enum mlxsw_sp_span_type span_type; - struct 
mlxsw_sp_port *to_port; - to_port = mlxsw_sp->ports[mirror->to_local_port]; span_type = mirror->ingress ? MLXSW_SP_SPAN_INGRESS : MLXSW_SP_SPAN_EGRESS; - mlxsw_sp_span_mirror_remove(mlxsw_sp_port, to_port, span_type); + mlxsw_sp_span_mirror_remove(mlxsw_sp_port, mirror->to_local_port, + span_type); } static int From 833a8b405465e935a1ff7ab086b54a3ef90437ca Mon Sep 17 00:00:00 2001 From: Haishuang Yan Date: Tue, 12 Sep 2017 17:47:56 +0800 Subject: [PATCH 17/43] ip_tunnel: fix ip tunnel lookup in collect_md mode In collect_md mode, if the tun dev is down, it still can call ip_tunnel_rcv to receive packets, and the rx statistics increase improperly. When the md tunnel is down, it's not necessary to increase RX drops for the tunnel device, packets would be received on fallback tunnel, and the RX drops on fallback device will be increased as expected. Fixes: 2e15ea390e6f ("ip_gre: Add support to collect tunnel metadata.") Cc: Pravin B Shelar Signed-off-by: Haishuang Yan Acked-by: Pravin B Shelar Signed-off-by: David S. Miller --- net/ipv4/ip_tunnel.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c index e1856bfa753d..e9805ad664ac 100644 --- a/net/ipv4/ip_tunnel.c +++ b/net/ipv4/ip_tunnel.c @@ -176,7 +176,7 @@ struct ip_tunnel *ip_tunnel_lookup(struct ip_tunnel_net *itn, return cand; t = rcu_dereference(itn->collect_md_tun); - if (t) + if (t && t->dev->flags & IFF_UP) return t; if (itn->fb_tunnel_dev && itn->fb_tunnel_dev->flags & IFF_UP) From 6c1cb4393cc7e7107e4e94a9a0744451296ca8a6 Mon Sep 17 00:00:00 2001 From: Haishuang Yan Date: Tue, 12 Sep 2017 17:47:57 +0800 Subject: [PATCH 18/43] ip6_tunnel: fix ip6 tunnel lookup in collect_md mode In collect_md mode, if the tun dev is down, it still can call __ip6_tnl_rcv to receive packets, and the rx statistics increase improperly.
When the md tunnel is down, it's not necessary to increase RX drops for the tunnel device, packets would be received on fallback tunnel, and the RX drops on fallback device will be increased as expected. Fixes: 8d79266bc48c ("ip6_tunnel: add collect_md mode to IPv6 tunnels") Cc: Alexei Starovoitov Signed-off-by: Haishuang Yan Signed-off-by: David S. Miller --- net/ipv6/ip6_tunnel.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c index 10a693a19323..ae73164559d5 100644 --- a/net/ipv6/ip6_tunnel.c +++ b/net/ipv6/ip6_tunnel.c @@ -171,7 +171,7 @@ ip6_tnl_lookup(struct net *net, const struct in6_addr *remote, const struct in6_ } t = rcu_dereference(ip6n->collect_md_tun); - if (t) + if (t && t->dev->flags & IFF_UP) return t; t = rcu_dereference(ip6n->tnls_wc[0]); From f13ad104b4e886a03e75f130daf579ef9bf33dfc Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Tue, 12 Sep 2017 15:10:05 +0300 Subject: [PATCH 19/43] net: bonding: fix tlb_dynamic_lb default value Commit 8b426dc54cf4 ("bonding: remove hardcoded value") changed the default value for tlb_dynamic_lb which led to either broken ALB mode (since tlb_dynamic_lb can be changed only in TLB) or setting TLB mode with tlb_dynamic_lb equal to 0. The first issue was recently fixed by setting tlb_dynamic_lb to 1 always when switching to ALB mode, but the default value is still wrong and we'll enter TLB mode with tlb_dynamic_lb equal to 0 if the mode is changed via netlink or sysfs. In order to restore the previous behaviour and default value simply remove the mode check around the default param initialization for tlb_dynamic_lb which will always set it to 1 as before. Fixes: 8b426dc54cf4 ("bonding: remove hardcoded value") Signed-off-by: Nikolay Aleksandrov Acked-by: Mahesh Bandewar Signed-off-by: David S.
Miller --- drivers/net/bonding/bond_main.c | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c index fc63992ab0e0..c99dc59d729b 100644 --- a/drivers/net/bonding/bond_main.c +++ b/drivers/net/bonding/bond_main.c @@ -4289,7 +4289,7 @@ static int bond_check_params(struct bond_params *params) int bond_mode = BOND_MODE_ROUNDROBIN; int xmit_hashtype = BOND_XMIT_POLICY_LAYER2; int lacp_fast = 0; - int tlb_dynamic_lb = 0; + int tlb_dynamic_lb; /* Convert string parameters. */ if (mode) { @@ -4601,16 +4601,13 @@ static int bond_check_params(struct bond_params *params) } ad_user_port_key = valptr->value; - if ((bond_mode == BOND_MODE_TLB) || (bond_mode == BOND_MODE_ALB)) { - bond_opt_initstr(&newval, "default"); - valptr = bond_opt_parse(bond_opt_get(BOND_OPT_TLB_DYNAMIC_LB), - &newval); - if (!valptr) { - pr_err("Error: No tlb_dynamic_lb default value"); - return -EINVAL; - } - tlb_dynamic_lb = valptr->value; + bond_opt_initstr(&newval, "default"); + valptr = bond_opt_parse(bond_opt_get(BOND_OPT_TLB_DYNAMIC_LB), &newval); + if (!valptr) { + pr_err("Error: No tlb_dynamic_lb default value"); + return -EINVAL; } + tlb_dynamic_lb = valptr->value; if (lp_interval == 0) { pr_warn("Warning: ip_interval must be between 1 and %d, so it was reset to %d\n", From 854426ef359c52bdf7087bc20c8d9105d075ca29 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Tue, 12 Sep 2017 14:31:48 +0200 Subject: [PATCH 20/43] w90p910_ether: include linux/interrupt.h A randconfig build caused a compile failure: drivers/net/ethernet/nuvoton/w90p910_ether.c: In function 'w90p910_ether_close': drivers/net/ethernet/nuvoton/w90p910_ether.c:580:2: error: implicit declaration of function 'free_irq'; did you mean 'free_uid'? [-Werror=implicit-function-declaration] Adding the correct include fixes the problem. Signed-off-by: Arnd Bergmann Signed-off-by: David S. 
Miller --- drivers/net/ethernet/nuvoton/w90p910_ether.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/ethernet/nuvoton/w90p910_ether.c b/drivers/net/ethernet/nuvoton/w90p910_ether.c index 89ab786da25f..4a67c55aa9f1 100644 --- a/drivers/net/ethernet/nuvoton/w90p910_ether.c +++ b/drivers/net/ethernet/nuvoton/w90p910_ether.c @@ -11,6 +11,7 @@ #include #include +#include #include #include #include From 822f8565c93949fb2d31502d595c8bc45629c9b7 Mon Sep 17 00:00:00 2001 From: Suresh Reddy Date: Wed, 13 Sep 2017 11:12:42 -0400 Subject: [PATCH 21/43] be2net: fix TSO6/GSO issue causing TX-stall on Lancer/BEx IPv6 TSO requests with extension hdrs are a problem to the Lancer and BEx chips. Workaround is to disable TSO6 feature for such packets. Also in Lancer chips, MSS less than 256 was resulting in TX stall. Fix this by disabling GSO when MSS less than 256. Signed-off-by: Suresh Reddy Signed-off-by: David S. Miller --- drivers/net/ethernet/emulex/benet/be.h | 8 ++++++++ drivers/net/ethernet/emulex/benet/be_main.c | 14 ++++++++++++++ 2 files changed, 22 insertions(+) diff --git a/drivers/net/ethernet/emulex/benet/be.h b/drivers/net/ethernet/emulex/benet/be.h index 674cf9d13b98..8984c4938881 100644 --- a/drivers/net/ethernet/emulex/benet/be.h +++ b/drivers/net/ethernet/emulex/benet/be.h @@ -930,6 +930,14 @@ static inline bool is_ipv4_pkt(struct sk_buff *skb) return skb->protocol == htons(ETH_P_IP) && ip_hdr(skb)->version == 4; } +static inline bool is_ipv6_ext_hdr(struct sk_buff *skb) +{ + if (ip_hdr(skb)->version == 6) + return ipv6_ext_hdr(ipv6_hdr(skb)->nexthdr); + else + return false; +} + #define be_error_recovering(adapter) \ (adapter->flags & BE_FLAGS_TRY_RECOVERY) diff --git a/drivers/net/ethernet/emulex/benet/be_main.c b/drivers/net/ethernet/emulex/benet/be_main.c index 319eee36649b..0e3d9f39a807 100644 --- a/drivers/net/ethernet/emulex/benet/be_main.c +++ b/drivers/net/ethernet/emulex/benet/be_main.c @@ -5089,6 +5089,20 @@ static netdev_features_t 
be_features_check(struct sk_buff *skb, struct be_adapter *adapter = netdev_priv(dev); u8 l4_hdr = 0; + if (skb_is_gso(skb)) { + /* IPv6 TSO requests with extension hdrs are a problem + * to Lancer and BE3 HW. Disable TSO6 feature. + */ + if (!skyhawk_chip(adapter) && is_ipv6_ext_hdr(skb)) + features &= ~NETIF_F_TSO6; + + /* Lancer cannot handle the packet with MSS less than 256. + * Disable the GSO support in such cases + */ + if (lancer_chip(adapter) && skb_shinfo(skb)->gso_size < 256) + features &= ~NETIF_F_GSO_MASK; + } + /* The code below restricts offload features for some tunneled and * Q-in-Q packets. * Offload features for normal (non tunnel) packets are unchanged. From 255cd50f207ae8ec7b22663246c833407744e634 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Wed, 13 Sep 2017 17:32:37 +0200 Subject: [PATCH 22/43] net: sched: fix use-after-free in tcf_action_destroy and tcf_del_walker Recent commit d7fb60b9cafb ("net_sched: get rid of tcfa_rcu") removed freeing in call_rcu, which changed already existing hard-to-hit race condition into 100% hit: [ 598.599825] BUG: unable to handle kernel NULL pointer dereference at 0000000000000030 [ 598.607782] IP: tcf_action_destroy+0xc0/0x140 Or: [ 40.858924] BUG: unable to handle kernel NULL pointer dereference at 0000000000000030 [ 40.862840] IP: tcf_generic_walker+0x534/0x820 Fix this by storing the ops and use them directly for module_put call. Fixes: a85a970af265 ("net_sched: move tc_action into tcf_common") Signed-off-by: Jiri Pirko Signed-off-by: David S. 
Miller --- net/sched/act_api.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/net/sched/act_api.c b/net/sched/act_api.c index fcd7dc7b807a..da6fa82c98a8 100644 --- a/net/sched/act_api.c +++ b/net/sched/act_api.c @@ -180,7 +180,7 @@ static int tcf_del_walker(struct tcf_idrinfo *idrinfo, struct sk_buff *skb, idr_for_each_entry_ext(idr, p, id) { ret = __tcf_idr_release(p, false, true); if (ret == ACT_P_DELETED) { - module_put(p->ops->owner); + module_put(ops->owner); n_i++; } else if (ret < 0) { goto nla_put_failure; @@ -514,13 +514,15 @@ EXPORT_SYMBOL(tcf_action_exec); int tcf_action_destroy(struct list_head *actions, int bind) { + const struct tc_action_ops *ops; struct tc_action *a, *tmp; int ret = 0; list_for_each_entry_safe(a, tmp, actions, list) { + ops = a->ops; ret = __tcf_idr_release(a, bind, true); if (ret == ACT_P_DELETED) - module_put(a->ops->owner); + module_put(ops->owner); else if (ret < 0) return ret; } From b95a2d831b815189618d18e3e89bcfa5072351a1 Mon Sep 17 00:00:00 2001 From: Pieter Jansen van Vuuren Date: Wed, 13 Sep 2017 10:15:58 -0700 Subject: [PATCH 23/43] nfp: add whitelist of supported flow dissector Previously we did not check the flow dissector against a list of allowed and supported flow key dissectors. This patch introduces such a list and correctly rejects unsupported flow keys. Fixes: 43f84b72c50d ("nfp: add metadata to each flow offload") Signed-off-by: Pieter Jansen van Vuuren Reviewed-by: Simon Horman Reviewed-by: Jakub Kicinski Signed-off-by: David S. 
Miller --- drivers/net/ethernet/netronome/nfp/flower/offload.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/drivers/net/ethernet/netronome/nfp/flower/offload.c b/drivers/net/ethernet/netronome/nfp/flower/offload.c index d396183108f7..a18b4d2b1d3e 100644 --- a/drivers/net/ethernet/netronome/nfp/flower/offload.c +++ b/drivers/net/ethernet/netronome/nfp/flower/offload.c @@ -44,6 +44,16 @@ #include "../nfp_net.h" #include "../nfp_port.h" +#define NFP_FLOWER_WHITELIST_DISSECTOR \ + (BIT(FLOW_DISSECTOR_KEY_CONTROL) | \ + BIT(FLOW_DISSECTOR_KEY_BASIC) | \ + BIT(FLOW_DISSECTOR_KEY_IPV4_ADDRS) | \ + BIT(FLOW_DISSECTOR_KEY_IPV6_ADDRS) | \ + BIT(FLOW_DISSECTOR_KEY_PORTS) | \ + BIT(FLOW_DISSECTOR_KEY_ETH_ADDRS) | \ + BIT(FLOW_DISSECTOR_KEY_VLAN) | \ + BIT(FLOW_DISSECTOR_KEY_IP)) + static int nfp_flower_xmit_flow(struct net_device *netdev, struct nfp_fl_payload *nfp_flow, u8 mtype) @@ -112,6 +122,9 @@ nfp_flower_calculate_key_layers(struct nfp_fl_key_ls *ret_key_ls, u8 key_layer; int key_size; + if (flow->dissector->used_keys & ~NFP_FLOWER_WHITELIST_DISSECTOR) + return -EOPNOTSUPP; + if (dissector_uses_key(flow->dissector, FLOW_DISSECTOR_KEY_ENC_CONTROL)) { struct flow_dissector_key_control *mask_enc_ctl = From 4cbe94f2af25bf8f4d5dea56c770937d896342bf Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 13 Sep 2017 10:15:59 -0700 Subject: [PATCH 24/43] nfp: wait for board state before talking to the NSP Board state informs us which low-level initialization stages the card has completed. We should wait for the card to be fully initialized before trying to communicate with it, not only before we configure passing traffic. Signed-off-by: Jakub Kicinski Signed-off-by: David S. 
Miller --- drivers/net/ethernet/netronome/nfp/nfp_main.c | 43 +++++++++++++++++++ .../net/ethernet/netronome/nfp/nfp_net_main.c | 23 ---------- 2 files changed, 43 insertions(+), 23 deletions(-) diff --git a/drivers/net/ethernet/netronome/nfp/nfp_main.c b/drivers/net/ethernet/netronome/nfp/nfp_main.c index f055b1774d65..424707d41fbd 100644 --- a/drivers/net/ethernet/netronome/nfp/nfp_main.c +++ b/drivers/net/ethernet/netronome/nfp/nfp_main.c @@ -74,6 +74,45 @@ static const struct pci_device_id nfp_pci_device_ids[] = { }; MODULE_DEVICE_TABLE(pci, nfp_pci_device_ids); +static bool nfp_board_ready(struct nfp_pf *pf) +{ + const char *cp; + long state; + int err; + + cp = nfp_hwinfo_lookup(pf->hwinfo, "board.state"); + if (!cp) + return false; + + err = kstrtol(cp, 0, &state); + if (err < 0) + return false; + + return state == 15; +} + +static int nfp_pf_board_state_wait(struct nfp_pf *pf) +{ + const unsigned long wait_until = jiffies + 10 * HZ; + + while (!nfp_board_ready(pf)) { + if (time_is_before_eq_jiffies(wait_until)) { + nfp_err(pf->cpp, "NFP board initialization timeout\n"); + return -EINVAL; + } + + nfp_info(pf->cpp, "waiting for board initialization\n"); + if (msleep_interruptible(500)) + return -ERESTARTSYS; + + /* Refresh cached information */ + kfree(pf->hwinfo); + pf->hwinfo = nfp_hwinfo_read(pf->cpp); + } + + return 0; +} + static int nfp_pcie_sriov_read_nfd_limit(struct nfp_pf *pf) { int err; @@ -425,6 +464,10 @@ static int nfp_pci_probe(struct pci_dev *pdev, nfp_hwinfo_lookup(pf->hwinfo, "assembly.revision"), nfp_hwinfo_lookup(pf->hwinfo, "cpld.version")); + err = nfp_pf_board_state_wait(pf); + if (err) + goto err_hwinfo_free; + err = devlink_register(devlink, &pdev->dev); if (err) goto err_hwinfo_free; diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net_main.c b/drivers/net/ethernet/netronome/nfp/nfp_net_main.c index 5abb9ba31e7d..ff373acd28f3 100644 --- a/drivers/net/ethernet/netronome/nfp/nfp_net_main.c +++ 
b/drivers/net/ethernet/netronome/nfp/nfp_net_main.c @@ -64,23 +64,6 @@ #define NFP_PF_CSR_SLICE_SIZE (32 * 1024) -static int nfp_is_ready(struct nfp_pf *pf) -{ - const char *cp; - long state; - int err; - - cp = nfp_hwinfo_lookup(pf->hwinfo, "board.state"); - if (!cp) - return 0; - - err = kstrtol(cp, 0, &state); - if (err < 0) - return 0; - - return state == 15; -} - /** * nfp_net_get_mac_addr() - Get the MAC address. * @pf: NFP PF handle @@ -725,12 +708,6 @@ int nfp_net_pci_probe(struct nfp_pf *pf) INIT_WORK(&pf->port_refresh_work, nfp_net_refresh_vnics); - /* Verify that the board has completed initialization */ - if (!nfp_is_ready(pf)) { - nfp_err(pf->cpp, "NFP is not ready for NIC operation.\n"); - return -EINVAL; - } - if (!pf->rtbl) { nfp_err(pf->cpp, "No %s, giving up.\n", pf->fw_loaded ? "symbol table" : "firmware found"); From 7dbd5b7517376c4395a9ed0b26cf6b4db80d8415 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 13 Sep 2017 10:16:00 -0700 Subject: [PATCH 25/43] nfp: wait for the NSP resource to appear on boot The control process (NSP) may take some time to complete its initialization. This is not a problem on most servers, but on very fast-booting machines it may not be ready for operation when driver probes the device. There is also a version of the flash in the wild where NSP tries to train the links as part of init. To wait for NSP initialization we should make sure its resource has already been added to the resource table. NSP adds itself there as last step of init. Signed-off-by: Jakub Kicinski Signed-off-by: David S. 
Miller --- drivers/net/ethernet/netronome/nfp/nfp_main.c | 4 ++ .../net/ethernet/netronome/nfp/nfpcore/nfp.h | 2 + .../netronome/nfp/nfpcore/nfp_resource.c | 45 +++++++++++++++++++ 3 files changed, 51 insertions(+) diff --git a/drivers/net/ethernet/netronome/nfp/nfp_main.c b/drivers/net/ethernet/netronome/nfp/nfp_main.c index 424707d41fbd..f8fa63b66739 100644 --- a/drivers/net/ethernet/netronome/nfp/nfp_main.c +++ b/drivers/net/ethernet/netronome/nfp/nfp_main.c @@ -351,6 +351,10 @@ static int nfp_nsp_init(struct pci_dev *pdev, struct nfp_pf *pf) struct nfp_nsp *nsp; int err; + err = nfp_resource_wait(pf->cpp, NFP_RESOURCE_NSP, 30); + if (err) + return err; + nsp = nfp_nsp_open(pf->cpp); if (IS_ERR(nsp)) { err = PTR_ERR(nsp); diff --git a/drivers/net/ethernet/netronome/nfp/nfpcore/nfp.h b/drivers/net/ethernet/netronome/nfp/nfpcore/nfp.h index 1a8d04a1e113..3ce51f03126f 100644 --- a/drivers/net/ethernet/netronome/nfp/nfpcore/nfp.h +++ b/drivers/net/ethernet/netronome/nfp/nfpcore/nfp.h @@ -97,6 +97,8 @@ nfp_resource_acquire(struct nfp_cpp *cpp, const char *name); void nfp_resource_release(struct nfp_resource *res); +int nfp_resource_wait(struct nfp_cpp *cpp, const char *name, unsigned int secs); + u32 nfp_resource_cpp_id(struct nfp_resource *res); const char *nfp_resource_name(struct nfp_resource *res); diff --git a/drivers/net/ethernet/netronome/nfp/nfpcore/nfp_resource.c b/drivers/net/ethernet/netronome/nfp/nfpcore/nfp_resource.c index 072612263dab..b1dd13ff282b 100644 --- a/drivers/net/ethernet/netronome/nfp/nfpcore/nfp_resource.c +++ b/drivers/net/ethernet/netronome/nfp/nfpcore/nfp_resource.c @@ -249,6 +249,51 @@ void nfp_resource_release(struct nfp_resource *res) kfree(res); } +/** + * nfp_resource_wait() - Wait for resource to appear + * @cpp: NFP CPP handle + * @name: Name of the resource + * @secs: Number of seconds to wait + * + * Wait for resource to appear in the resource table, grab and release + * its lock. 
The wait is jiffies-based, don't expect fine granularity. + * + * Return: 0 on success, errno otherwise. + */ +int nfp_resource_wait(struct nfp_cpp *cpp, const char *name, unsigned int secs) +{ + unsigned long warn_at = jiffies + NFP_MUTEX_WAIT_FIRST_WARN * HZ; + unsigned long err_at = jiffies + secs * HZ; + struct nfp_resource *res; + + while (true) { + res = nfp_resource_acquire(cpp, name); + if (!IS_ERR(res)) { + nfp_resource_release(res); + return 0; + } + + if (PTR_ERR(res) != -ENOENT) { + nfp_err(cpp, "error waiting for resource %s: %ld\n", + name, PTR_ERR(res)); + return PTR_ERR(res); + } + if (time_is_before_eq_jiffies(err_at)) { + nfp_err(cpp, "timeout waiting for resource %s\n", name); + return -ETIMEDOUT; + } + if (time_is_before_eq_jiffies(warn_at)) { + warn_at = jiffies + NFP_MUTEX_WAIT_NEXT_WARN * HZ; + nfp_info(cpp, "waiting for NFP resource %s\n", name); + } + if (msleep_interruptible(10)) { + nfp_err(cpp, "wait for resource %s interrupted\n", + name); + return -ERESTARTSYS; + } + } +} + /** * nfp_resource_cpp_id() - Return the cpp_id of a resource handle * @res: NFP Resource handle From ca558e185972d8ecd308760abf972f5d408bcff0 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 13 Sep 2017 11:16:45 -0700 Subject: [PATCH 26/43] net_sched: gen_estimator: fix scaling error in bytes/packets samples Denys reported wrong rate estimations with HTB classes. It appears the bug was added in linux-4.10, since my tests were using intervals of one second only. HTB using 4 sec default rate estimators, reported rates were 4x higher. We need to properly scale the bytes/packets samples before integrating them in EWMA.
Tested: echo 1 >/sys/module/sch_htb/parameters/htb_rate_est Setup HTB with one class with a rate/ceil of 5Gbit Generate traffic on this class tc -s -d cl sh dev eth0 classid 7002:11 class htb 7002:11 parent 7002:1 prio 5 quantum 200000 rate 5Gbit ceil 5Gbit linklayer ethernet burst 80000b/1 mpu 0b cburst 80000b/1 mpu 0b level 0 rate_handle 1 Sent 1488215421648 bytes 982969243 pkt (dropped 0, overlimits 0 requeues 0) rate 5Gbit 412814pps backlog 136260b 2p requeues 0 TCP pkts/rtx 982969327/45 bytes 1488215557414/68130 lended: 22732826 borrowed: 0 giants: 0 tokens: -1684 ctokens: -1684 Fixes: 1c0d32fde5bd ("net_sched: gen_estimator: complete rewrite of rate estimators") Signed-off-by: Eric Dumazet Reported-by: Denys Fedoryshchenko Signed-off-by: David S. Miller --- net/core/gen_estimator.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/core/gen_estimator.c b/net/core/gen_estimator.c index 0385dece1f6f..7c1ffd6f9501 100644 --- a/net/core/gen_estimator.c +++ b/net/core/gen_estimator.c @@ -83,10 +83,10 @@ static void est_timer(unsigned long arg) u64 rate, brate; est_fetch_counters(est, &b); - brate = (b.bytes - est->last_bytes) << (8 - est->ewma_log); + brate = (b.bytes - est->last_bytes) << (10 - est->ewma_log - est->intvl_log); brate -= (est->avbps >> est->ewma_log); - rate = (u64)(b.packets - est->last_packets) << (8 - est->ewma_log); + rate = (u64)(b.packets - est->last_packets) << (10 - est->ewma_log - est->intvl_log); rate -= (est->avpps >> est->ewma_log); write_seqcount_begin(&est->seq); From 6fa9c623a03c560178e0dcec23f59dbfd29b21b9 Mon Sep 17 00:00:00 2001 From: Sergei Shtylyov Date: Wed, 13 Sep 2017 22:28:53 +0300 Subject: [PATCH 27/43] MAINTAINERS: review Renesas DT bindings as well When adding myself as a reviewer for the Renesas Ethernet drivers I somehow forgot about the bindings -- I want to review them as well.
Fixes: 8e6569af3a1b ("MAINTAINERS: add myself as Renesas Ethernet drivers reviewer") Signed-off-by: Sergei Shtylyov Signed-off-by: David S. Miller --- MAINTAINERS | 2 ++ 1 file changed, 2 insertions(+) diff --git a/MAINTAINERS b/MAINTAINERS index 7f32b510fdea..c2985b7f188e 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -11379,6 +11379,8 @@ RENESAS ETHERNET DRIVERS R: Sergei Shtylyov L: netdev@vger.kernel.org L: linux-renesas-soc@vger.kernel.org +F: Documentation/devicetree/bindings/net/renesas,*.txt +F: Documentation/devicetree/bindings/net/sh_eth.txt F: drivers/net/ethernet/renesas/ F: include/linux/sh_eth.h From fa5f7b51fc3080c2b195fa87c7eca7c05e56f673 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Thu, 14 Sep 2017 02:00:54 +0300 Subject: [PATCH 28/43] sctp: potential read out of bounds in sctp_ulpevent_type_enabled() This code causes a static checker warning because Smatch doesn't trust anything that comes from skb->data. I've reviewed this code and I do think skb->data can be controlled by the user here. The sctp_event_subscribe struct has 13 __u8 fields and we want to see if ours is non-zero. sn_type can be any value in the 0-USHRT_MAX range. We're subtracting SCTP_SN_TYPE_BASE which is 1 << 15 so we could read either before the start of the struct or after the end. This is a very old bug and it's surprising that it would go undetected for so long but my theory is that it just doesn't have a big impact so it would be hard to notice. Signed-off-by: Dan Carpenter Signed-off-by: David S. 
Miller --- include/net/sctp/ulpevent.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/include/net/sctp/ulpevent.h b/include/net/sctp/ulpevent.h index 1060494ac230..b8c86ec1a8f5 100644 --- a/include/net/sctp/ulpevent.h +++ b/include/net/sctp/ulpevent.h @@ -153,8 +153,12 @@ __u16 sctp_ulpevent_get_notification_type(const struct sctp_ulpevent *event); static inline int sctp_ulpevent_type_enabled(__u16 sn_type, struct sctp_event_subscribe *mask) { + int offset = sn_type - SCTP_SN_TYPE_BASE; char *amask = (char *) mask; - return amask[sn_type - SCTP_SN_TYPE_BASE]; + + if (offset >= sizeof(struct sctp_event_subscribe)) + return 0; + return amask[offset]; } /* Given an event subscription, is this event enabled? */ From a5135676bbf18ab4caed9effd321bd126f9ee11f Mon Sep 17 00:00:00 2001 From: Tobias Klauser Date: Thu, 14 Sep 2017 13:22:25 +0200 Subject: [PATCH 29/43] tls: make tls_sw_free_resources static Make the needlessly global function tls_sw_free_resources static to fix a gcc/sparse warning. Signed-off-by: Tobias Klauser Signed-off-by: David S. Miller --- net/tls/tls_sw.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c index fa596fa71ba7..7d80040a37b6 100644 --- a/net/tls/tls_sw.c +++ b/net/tls/tls_sw.c @@ -639,7 +639,7 @@ int tls_sw_sendpage(struct sock *sk, struct page *page, return ret; } -void tls_sw_free_resources(struct sock *sk) +static void tls_sw_free_resources(struct sock *sk) { struct tls_context *tls_ctx = tls_get_ctx(sk); struct tls_sw_context *ctx = tls_sw_ctx(tls_ctx); From 23f4822207e04c5f78924fe0e5193c14ba720b4c Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Thu, 14 Sep 2017 17:01:25 +0100 Subject: [PATCH 30/43] tg3: clean up redundant initialization of tnapi tnapi is being initialized and then immediately updated and hence the initialization is redundant. Clean up the warning by moving the declaration and initialization to the inside of the for-loop.
Cleans up clang scan-build warning: warning: Value stored to 'tnapi' during its initialization is never read Signed-off-by: Colin Ian King Signed-off-by: David S. Miller --- drivers/net/ethernet/broadcom/tg3.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/broadcom/tg3.c b/drivers/net/ethernet/broadcom/tg3.c index af33dc15c55f..656e6af70f0a 100644 --- a/drivers/net/ethernet/broadcom/tg3.c +++ b/drivers/net/ethernet/broadcom/tg3.c @@ -11536,11 +11536,11 @@ static int tg3_start(struct tg3 *tp, bool reset_phy, bool test_irq, tg3_napi_enable(tp); for (i = 0; i < tp->irq_cnt; i++) { - struct tg3_napi *tnapi = &tp->napi[i]; err = tg3_request_irq(tp, i); if (err) { for (i--; i >= 0; i--) { - tnapi = &tp->napi[i]; + struct tg3_napi *tnapi = &tp->napi[i]; + free_irq(tnapi->irq_vec, tnapi); } goto out_napi_fini; From 4739df6211911e6597602222b640e5c002563df6 Mon Sep 17 00:00:00 2001 From: Himanshu Jha Date: Tue, 12 Sep 2017 16:49:22 +0530 Subject: [PATCH 31/43] qed: remove unnecessary call to memset call to memset to assign 0 value immediately after allocating memory with kzalloc is unnecessary as kzalloc allocates the memory filled with 0 value. Semantic patch used to resolve this issue: @@ expression e,e2; constant c; statement S; @@ e = kzalloc(e2, c); if(e == NULL) S - memset(e, 0, e2); Signed-off-by: Himanshu Jha Signed-off-by: Himanshu Jha Acked-by: Sudarsana Kalluru Signed-off-by: David S.
Miller --- drivers/net/ethernet/qlogic/qed/qed_dcbx.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/net/ethernet/qlogic/qed/qed_dcbx.c b/drivers/net/ethernet/qlogic/qed/qed_dcbx.c index eaca4578435d..8f6ccc0c39e5 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_dcbx.c +++ b/drivers/net/ethernet/qlogic/qed/qed_dcbx.c @@ -1244,7 +1244,6 @@ int qed_dcbx_get_config_params(struct qed_hwfn *p_hwfn, if (!dcbx_info) return -ENOMEM; - memset(dcbx_info, 0, sizeof(*dcbx_info)); rc = qed_dcbx_query_params(p_hwfn, dcbx_info, QED_DCBX_OPERATIONAL_MIB); if (rc) { kfree(dcbx_info); From ecf091171b70787f92b18eeaa4ddc74f9221fa56 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Tue, 12 Sep 2017 22:10:53 +0200 Subject: [PATCH 32/43] net: vrf: avoid gcc-4.6 warning MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When building an allmodconfig kernel with gcc-4.6, we get a rather odd warning: drivers/net/vrf.c: In function ‘vrf_ip6_input_dst’: drivers/net/vrf.c:964:3: error: initialized field with side-effects overwritten [-Werror] drivers/net/vrf.c:964:3: error: (near initialization for ‘fl6’) [-Werror] I have no idea what this warning is even trying to say, but it does seem like a false positive. Reordering the initialization in to match the structure definition gets rid of the warning, and might also avoid whatever gcc thinks is wrong here. Fixes: 9ff74384600a ("net: vrf: Handle ipv6 multicast and link-local addresses") Signed-off-by: Arnd Bergmann Signed-off-by: David S. 
Miller --- drivers/net/vrf.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/net/vrf.c b/drivers/net/vrf.c index 7e19051f3230..9b243e6f3008 100644 --- a/drivers/net/vrf.c +++ b/drivers/net/vrf.c @@ -957,12 +957,12 @@ static void vrf_ip6_input_dst(struct sk_buff *skb, struct net_device *vrf_dev, { const struct ipv6hdr *iph = ipv6_hdr(skb); struct flowi6 fl6 = { + .flowi6_iif = ifindex, + .flowi6_mark = skb->mark, + .flowi6_proto = iph->nexthdr, .daddr = iph->daddr, .saddr = iph->saddr, .flowlabel = ip6_flowinfo(iph), - .flowi6_mark = skb->mark, - .flowi6_proto = iph->nexthdr, - .flowi6_iif = ifindex, }; struct net *net = dev_net(vrf_dev); struct rt6_info *rt6; From 7095c973453e56efa0903e863b59cd89c75e62dc Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Tue, 12 Sep 2017 13:14:26 -0700 Subject: [PATCH 33/43] net: systemport: Fix 64-bit stats deadlock We can enter a deadlock situation because there is no sufficient protection when ndo_get_stats64() runs in process context to guard against RX or TX NAPI contexts running in softirq, this can lead to the following lockdep splat and actual deadlock was experienced as well with an iperf session in the background and a while loop doing ifconfig + ethtool. [ 5.780350] ================================ [ 5.784679] WARNING: inconsistent lock state [ 5.789011] 4.13.0-rc7-02179-g32fae27c725d #70 Not tainted [ 5.794561] -------------------------------- [ 5.798890] inconsistent {SOFTIRQ-ON-W} -> {IN-SOFTIRQ-W} usage. 
[ 5.804971] swapper/0/0 [HC0[0]:SC1[1]:HE0:SE0] takes: [ 5.810175] (&syncp->seq#2){+.?...}, at: [] bcm_sysport_tx_reclaim+0x30/0x54 [ 5.818327] {SOFTIRQ-ON-W} state was registered at: [ 5.823278] bcm_sysport_get_stats64+0x17c/0x258 [ 5.828053] dev_get_stats+0x38/0xac [ 5.831776] rtnl_fill_stats+0x30/0x118 [ 5.835761] rtnl_fill_ifinfo+0x538/0xe24 [ 5.839921] rtmsg_ifinfo_build_skb+0x6c/0xd8 [ 5.844430] rtmsg_ifinfo_event.part.5+0x14/0x44 [ 5.849201] rtmsg_ifinfo+0x20/0x28 [ 5.852837] register_netdevice+0x628/0x6b8 [ 5.857171] register_netdev+0x14/0x24 [ 5.861051] bcm_sysport_probe+0x30c/0x438 [ 5.865280] platform_drv_probe+0x50/0xb0 [ 5.869418] driver_probe_device+0x2e8/0x450 [ 5.873817] __driver_attach+0x104/0x120 [ 5.877871] bus_for_each_dev+0x7c/0xc0 [ 5.881834] bus_add_driver+0x1b0/0x270 [ 5.885797] driver_register+0x78/0xf4 [ 5.889675] do_one_initcall+0x54/0x190 [ 5.893646] kernel_init_freeable+0x144/0x1d0 [ 5.898135] kernel_init+0x8/0x110 [ 5.901665] ret_from_fork+0x14/0x2c [ 5.905363] irq event stamp: 24263 [ 5.908804] hardirqs last enabled at (24262): [] net_rx_action+0xc4/0x4e4 [ 5.916624] hardirqs last disabled at (24263): [] _raw_spin_lock_irqsave+0x1c/0x98 [ 5.925143] softirqs last enabled at (24258): [] irq_enter+0x84/0x98 [ 5.932524] softirqs last disabled at (24259): [] irq_exit+0x108/0x16c [ 5.939985] [ 5.939985] other info that might help us debug this: [ 5.946576] Possible unsafe locking scenario: [ 5.946576] [ 5.952556] CPU0 [ 5.955031] ---- [ 5.957506] lock(&syncp->seq#2); [ 5.960955] [ 5.963604] lock(&syncp->seq#2); [ 5.967227] [ 5.967227] *** DEADLOCK *** [ 5.967227] [ 5.973222] 1 lock held by swapper/0/0: [ 5.977092] #0: (&(&ring->lock)->rlock){..-...}, at: [] bcm_sysport_tx_reclaim+0x20/0x54 So just remove the u64_stats_update_begin()/end() pair in ndo_get_stats64() since it does not appear to be useful for anything. 
No inconsistency was observed with either ifconfig or ethtool, global TX counts equal the sum of per-queue TX counts on a 32-bit architecture. Fixes: 10377ba7673d ("net: systemport: Support 64bit statistics") Signed-off-by: Florian Fainelli Signed-off-by: David S. Miller --- drivers/net/ethernet/broadcom/bcmsysport.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/drivers/net/ethernet/broadcom/bcmsysport.c b/drivers/net/ethernet/broadcom/bcmsysport.c index a6572b51435a..c3c53f6cd9e6 100644 --- a/drivers/net/ethernet/broadcom/bcmsysport.c +++ b/drivers/net/ethernet/broadcom/bcmsysport.c @@ -1735,11 +1735,8 @@ static void bcm_sysport_get_stats64(struct net_device *dev, stats->tx_packets += tx_packets; } - /* lockless update tx_bytes and tx_packets */ - u64_stats_update_begin(&priv->syncp); stats64->tx_bytes = stats->tx_bytes; stats64->tx_packets = stats->tx_packets; - u64_stats_update_end(&priv->syncp); do { start = u64_stats_fetch_begin_irq(&priv->syncp); From 2aa70f864955bf02362e3fb3008e4208d7a17a98 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Wed, 13 Sep 2017 19:42:05 +0200 Subject: [PATCH 34/43] net: smsc911x: Quieten netif during suspend If the network interface is kept running during suspend, the net core may call net_device_ops.ndo_start_xmit() while the Ethernet device is still suspended, which may lead to a system crash. E.g. on sh73a0/kzm9g and r8a73a4/ape6evm, the external Ethernet chip is driven by a PM controlled clock. If the Ethernet registers are accessed while the clock is not running, the system will crash with an imprecise external abort. As this is a race condition with a small time window, it is not so easy to trigger at will. Using pm_test may increase your chances: # echo 0 > /sys/module/printk/parameters/console_suspend # echo platform > /sys/power/pm_test # echo mem > /sys/power/state To fix this, make sure the network interface is quietened during suspend. 
Signed-off-by: Geert Uytterhoeven Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- drivers/net/ethernet/smsc/smsc911x.c | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/smsc/smsc911x.c b/drivers/net/ethernet/smsc/smsc911x.c index 0b6a39b003a4..012fb66eed8d 100644 --- a/drivers/net/ethernet/smsc/smsc911x.c +++ b/drivers/net/ethernet/smsc/smsc911x.c @@ -2595,6 +2595,11 @@ static int smsc911x_suspend(struct device *dev) struct net_device *ndev = dev_get_drvdata(dev); struct smsc911x_data *pdata = netdev_priv(ndev); + if (netif_running(ndev)) { + netif_stop_queue(ndev); + netif_device_detach(ndev); + } + /* enable wake on LAN, energy detection and the external PME * signal. */ smsc911x_reg_write(pdata, PMT_CTRL, @@ -2628,7 +2633,15 @@ static int smsc911x_resume(struct device *dev) while (!(smsc911x_reg_read(pdata, PMT_CTRL) & PMT_CTRL_READY_) && --to) udelay(1000); - return (to == 0) ? -EIO : 0; + if (to == 0) + return -EIO; + + if (netif_running(ndev)) { + netif_device_attach(ndev); + netif_start_queue(ndev); + } + + return 0; } static const struct dev_pm_ops smsc911x_pm_ops = { From cbea8f02069533ea2ad4e5b3bfbcdb0894c20354 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Wed, 13 Sep 2017 17:11:37 -0700 Subject: [PATCH 35/43] net: ipv4: fix l3slave check for index returned in IP_PKTINFO rt_iif is only set to the actual egress device for the output path. The recent change to consider the l3slave flag when returning IP_PKTINFO works for local traffic (the correct device index is returned), but it broke the more typical use case of packets received from a remote host always returning the VRF index rather than the original ingress device. Update the fixup to consider l3slave and rt_iif actually getting set. Fixes: 1dfa76390bf05 ("net: ipv4: add check for l3slave for index returned in IP_PKTINFO") Signed-off-by: David Ahern Signed-off-by: David S. 
Miller --- net/ipv4/ip_sockglue.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c index e558e4f9597b..a599aa83fdad 100644 --- a/net/ipv4/ip_sockglue.c +++ b/net/ipv4/ip_sockglue.c @@ -1207,7 +1207,6 @@ static int do_ip_setsockopt(struct sock *sk, int level, void ipv4_pktinfo_prepare(const struct sock *sk, struct sk_buff *skb) { struct in_pktinfo *pktinfo = PKTINFO_SKB_CB(skb); - bool l3slave = ipv4_l3mdev_skb(IPCB(skb)->flags); bool prepare = (inet_sk(sk)->cmsg_flags & IP_CMSG_PKTINFO) || ipv6_sk_rxinfo(sk); @@ -1221,8 +1220,13 @@ void ipv4_pktinfo_prepare(const struct sock *sk, struct sk_buff *skb) * (e.g., process binds socket to eth0 for Tx which is * redirected to loopback in the rtable/dst). */ - if (pktinfo->ipi_ifindex == LOOPBACK_IFINDEX || l3slave) + struct rtable *rt = skb_rtable(skb); + bool l3slave = ipv4_l3mdev_skb(IPCB(skb)->flags); + + if (pktinfo->ipi_ifindex == LOOPBACK_IFINDEX) pktinfo->ipi_ifindex = inet_iif(skb); + else if (l3slave && rt && rt->rt_iif) + pktinfo->ipi_ifindex = rt->rt_iif; pktinfo->ipi_spec_dst.s_addr = fib_compute_spec_dst(skb); } else { From 8c72c65b426b47b3c166a8fef0d8927fe5e8a28d Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 13 Sep 2017 20:30:39 -0700 Subject: [PATCH 36/43] tcp: update skb->skb_mstamp more carefully liujian reported a problem in TCP_USER_TIMEOUT processing with a patch in tcp_probe_timer() : https://www.spinics.net/lists/netdev/msg454496.html After investigations, the root cause of the problem is that we update skb->skb_mstamp of skbs in write queue, even if the attempt to send a clone or copy of it failed. One reason being a routing problem. This patch prevents this, solving liujian issue. It also removes a potential RTT miscalculation, since __tcp_retransmit_skb() is not OR-ing TCP_SKB_CB(skb)->sacked with TCPCB_EVER_RETRANS if a failure happens, but skb->skb_mstamp has been changed. 
A future ACK would then lead to a very small RTT sample and min_rtt would then be lowered to this too small value. Tested: # cat user_timeout.pkt --local_ip=192.168.102.64 0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3 +0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0 +0 bind(3, ..., ...) = 0 +0 listen(3, 1) = 0 +0 `ifconfig tun0 192.168.102.64/16; ip ro add 192.0.2.1 dev tun0` +0 < S 0:0(0) win 0 +0 > S. 0:0(0) ack 1 +.1 < . 1:1(0) ack 1 win 65530 +0 accept(3, ..., ...) = 4 +0 setsockopt(4, SOL_TCP, TCP_USER_TIMEOUT, [3000], 4) = 0 +0 write(4, ..., 24) = 24 +0 > P. 1:25(24) ack 1 win 29200 +.1 < . 1:1(0) ack 25 win 65530 //change the ipaddress +1 `ifconfig tun0 192.168.0.10/16` +1 write(4, ..., 24) = 24 +1 write(4, ..., 24) = 24 +1 write(4, ..., 24) = 24 +1 write(4, ..., 24) = 24 +0 `ifconfig tun0 192.168.102.64/16` +0 < . 1:2(1) ack 25 win 65530 +0 `ifconfig tun0 192.168.0.10/16` +3 write(4, ..., 24) = -1 # ./packetdrill user_timeout.pkt Signed-off-by: Eric Dumazet Reported-by: liujian Acked-by: Neal Cardwell Acked-by: Yuchung Cheng Acked-by: Soheil Hassas Yeganeh Signed-off-by: David S. 
Miller --- net/ipv4/tcp_output.c | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 5b6690d05abb..a85a8c2948e5 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -991,6 +991,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, struct tcp_skb_cb *tcb; struct tcp_out_options opts; unsigned int tcp_options_size, tcp_header_size; + struct sk_buff *oskb = NULL; struct tcp_md5sig_key *md5; struct tcphdr *th; int err; @@ -998,12 +999,12 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, BUG_ON(!skb || !tcp_skb_pcount(skb)); tp = tcp_sk(sk); - skb->skb_mstamp = tp->tcp_mstamp; if (clone_it) { TCP_SKB_CB(skb)->tx.in_flight = TCP_SKB_CB(skb)->end_seq - tp->snd_una; tcp_rate_skb_sent(sk, skb); + oskb = skb; if (unlikely(skb_cloned(skb))) skb = pskb_copy(skb, gfp_mask); else @@ -1011,6 +1012,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, if (unlikely(!skb)) return -ENOBUFS; } + skb->skb_mstamp = tp->tcp_mstamp; inet = inet_sk(sk); tcb = TCP_SKB_CB(skb); @@ -1122,12 +1124,14 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, err = icsk->icsk_af_ops->queue_xmit(sk, skb, &inet->cork.fl); - if (likely(err <= 0)) - return err; + if (unlikely(err > 0)) { + tcp_enter_cwr(sk); + err = net_xmit_eval(err); + } + if (!err && oskb) + oskb->skb_mstamp = tp->tcp_mstamp; - tcp_enter_cwr(sk); - - return net_xmit_eval(err); + return err; } /* This routine just queues the buffer for sending. @@ -2869,10 +2873,11 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs) skb_headroom(skb) >= 0xFFFF)) { struct sk_buff *nskb; - skb->skb_mstamp = tp->tcp_mstamp; nskb = __pskb_copy(skb, MAX_TCP_HEADER, GFP_ATOMIC); err = nskb ? 
tcp_transmit_skb(sk, nskb, 0, GFP_ATOMIC) : -ENOBUFS; + if (!err) + skb->skb_mstamp = tp->tcp_mstamp; } else { err = tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC); } From 5023a6db73196695f4cc2db1a0eb37957ca27772 Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Thu, 14 Sep 2017 09:31:07 -0700 Subject: [PATCH 37/43] netvsc: increase default receive buffer size The default receive buffer size was reduced by recent change to a value which was appropriate for 10G and Windows Server 2016. But the value is too small for full performance with 40G on Azure. Increase the default back to maximum supported by host. Fixes: 8b5327975ae1 ("netvsc: allow controlling send/recv buffer size") Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller --- drivers/net/hyperv/netvsc_drv.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/hyperv/netvsc_drv.c b/drivers/net/hyperv/netvsc_drv.c index c538a4f15f3b..d4902ee5f260 100644 --- a/drivers/net/hyperv/netvsc_drv.c +++ b/drivers/net/hyperv/netvsc_drv.c @@ -49,7 +49,7 @@ #define NETVSC_MIN_TX_SECTIONS 10 #define NETVSC_DEFAULT_TX 192 /* ~1M */ #define NETVSC_MIN_RX_SECTIONS 10 /* ~64K */ -#define NETVSC_DEFAULT_RX 2048 /* ~4M */ +#define NETVSC_DEFAULT_RX 10485 /* Max ~16M */ #define LINKCHANGE_INT (2 * HZ) #define VF_TAKEOVER_INT (HZ / 10) From d25adbeb0cdb860fb39e09cdd025e9cfc954c5ab Mon Sep 17 00:00:00 2001 From: Xin Long Date: Fri, 15 Sep 2017 11:02:21 +0800 Subject: [PATCH 38/43] sctp: fix an use-after-free issue in sctp_sock_dump Commit 86fdb3448cc1 ("sctp: ensure ep is not destroyed before doing the dump") tried to fix an use-after-free issue by checking !sctp_sk(sk)->ep with holding sock and sock lock. But Paolo noticed that endpoint could be destroyed in sctp_rcv without sock lock protection. It means the use-after-free issue still could be triggered when sctp_rcv put and destroy ep after sctp_sock_dump checks !ep, although it's pretty hard to reproduce. 
I could reproduce it by mdelay in sctp_rcv while msleep in sctp_close and sctp_sock_dump long time. This patch is to add another param cb_done to sctp_for_each_transport and dump ep->assocs with holding tsp after jumping out of transport's traversal in it to avoid this issue. It can also improve sctp diag dump to make it run faster, as no need to save sk into cb->args[5] and keep calling sctp_for_each_transport any more. This patch is also to use int * instead of int for the pos argument in sctp_for_each_transport, which could make position increment only in sctp_for_each_transport and no need to keep changing cb->args[2] in sctp_sock_filter and sctp_sock_dump any more. Fixes: 86fdb3448cc1 ("sctp: ensure ep is not destroyed before doing the dump") Reported-by: Paolo Abeni Signed-off-by: Xin Long Acked-by: Marcelo Ricardo Leitner Acked-by: Neil Horman Signed-off-by: David S. Miller --- include/net/sctp/sctp.h | 3 ++- net/sctp/sctp_diag.c | 32 +++++++++----------------------- net/sctp/socket.c | 40 +++++++++++++++++++++++++--------------- 3 files changed, 36 insertions(+), 39 deletions(-) diff --git a/include/net/sctp/sctp.h b/include/net/sctp/sctp.h index 06b4f515e157..d7d8cba01469 100644 --- a/include/net/sctp/sctp.h +++ b/include/net/sctp/sctp.h @@ -127,7 +127,8 @@ int sctp_transport_lookup_process(int (*cb)(struct sctp_transport *, void *), const union sctp_addr *laddr, const union sctp_addr *paddr, void *p); int sctp_for_each_transport(int (*cb)(struct sctp_transport *, void *), - struct net *net, int pos, void *p); + int (*cb_done)(struct sctp_transport *, void *), + struct net *net, int *pos, void *p); int sctp_for_each_endpoint(int (*cb)(struct sctp_endpoint *, void *), void *p); int sctp_get_sctp_info(struct sock *sk, struct sctp_association *asoc, struct sctp_info *info); diff --git a/net/sctp/sctp_diag.c b/net/sctp/sctp_diag.c index e99518e79b52..7008a992749b 100644 --- a/net/sctp/sctp_diag.c +++ b/net/sctp/sctp_diag.c @@ -279,9 +279,11 @@ static int
sctp_tsp_dump_one(struct sctp_transport *tsp, void *p) return err; } -static int sctp_sock_dump(struct sock *sk, void *p) +static int sctp_sock_dump(struct sctp_transport *tsp, void *p) { + struct sctp_endpoint *ep = tsp->asoc->ep; struct sctp_comm_param *commp = p; + struct sock *sk = ep->base.sk; struct sk_buff *skb = commp->skb; struct netlink_callback *cb = commp->cb; const struct inet_diag_req_v2 *r = commp->r; @@ -289,9 +291,7 @@ static int sctp_sock_dump(struct sock *sk, void *p) int err = 0; lock_sock(sk); - if (!sctp_sk(sk)->ep) - goto release; - list_for_each_entry(assoc, &sctp_sk(sk)->ep->asocs, asocs) { + list_for_each_entry(assoc, &ep->asocs, asocs) { if (cb->args[4] < cb->args[1]) goto next; @@ -327,40 +327,30 @@ static int sctp_sock_dump(struct sock *sk, void *p) cb->args[4]++; } cb->args[1] = 0; - cb->args[2]++; cb->args[3] = 0; cb->args[4] = 0; release: release_sock(sk); - sock_put(sk); return err; } -static int sctp_get_sock(struct sctp_transport *tsp, void *p) +static int sctp_sock_filter(struct sctp_transport *tsp, void *p) { struct sctp_endpoint *ep = tsp->asoc->ep; struct sctp_comm_param *commp = p; struct sock *sk = ep->base.sk; - struct netlink_callback *cb = commp->cb; const struct inet_diag_req_v2 *r = commp->r; struct sctp_association *assoc = list_entry(ep->asocs.next, struct sctp_association, asocs); /* find the ep only once through the transports by this condition */ if (tsp->asoc != assoc) - goto out; + return 0; if (r->sdiag_family != AF_UNSPEC && sk->sk_family != r->sdiag_family) - goto out; - - sock_hold(sk); - cb->args[5] = (long)sk; + return 0; return 1; - -out: - cb->args[2]++; - return 0; } static int sctp_ep_dump(struct sctp_endpoint *ep, void *p) @@ -503,12 +493,8 @@ static void sctp_diag_dump(struct sk_buff *skb, struct netlink_callback *cb, if (!(idiag_states & ~(TCPF_LISTEN | TCPF_CLOSE))) goto done; -next: - cb->args[5] = 0; - sctp_for_each_transport(sctp_get_sock, net, cb->args[2], &commp); - - if (cb->args[5] && 
!sctp_sock_dump((struct sock *)cb->args[5], &commp)) - goto next; + sctp_for_each_transport(sctp_sock_filter, sctp_sock_dump, + net, (int *)&cb->args[2], &commp); done: cb->args[1] = cb->args[4]; diff --git a/net/sctp/socket.c b/net/sctp/socket.c index 1b00a1e09b93..d4730ada7f32 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -4658,29 +4658,39 @@ int sctp_transport_lookup_process(int (*cb)(struct sctp_transport *, void *), EXPORT_SYMBOL_GPL(sctp_transport_lookup_process); int sctp_for_each_transport(int (*cb)(struct sctp_transport *, void *), - struct net *net, int pos, void *p) { + int (*cb_done)(struct sctp_transport *, void *), + struct net *net, int *pos, void *p) { struct rhashtable_iter hti; - void *obj; - int err; + struct sctp_transport *tsp; + int ret; - err = sctp_transport_walk_start(&hti); - if (err) - return err; +again: + ret = sctp_transport_walk_start(&hti); + if (ret) + return ret; - obj = sctp_transport_get_idx(net, &hti, pos + 1); - for (; !IS_ERR_OR_NULL(obj); obj = sctp_transport_get_next(net, &hti)) { - struct sctp_transport *transport = obj; - - if (!sctp_transport_hold(transport)) + tsp = sctp_transport_get_idx(net, &hti, *pos + 1); + for (; !IS_ERR_OR_NULL(tsp); tsp = sctp_transport_get_next(net, &hti)) { + if (!sctp_transport_hold(tsp)) continue; - err = cb(transport, p); - sctp_transport_put(transport); - if (err) + ret = cb(tsp, p); + if (ret) break; + (*pos)++; + sctp_transport_put(tsp); } sctp_transport_walk_stop(&hti); - return err; + if (ret) { + if (cb_done && !cb_done(tsp, p)) { + (*pos)++; + sctp_transport_put(tsp); + goto again; + } + sctp_transport_put(tsp); + } + + return ret; } EXPORT_SYMBOL_GPL(sctp_for_each_transport); From 8c7c19a55e41ae69d1cd18ab56e6e9b66a679a7c Mon Sep 17 00:00:00 2001 From: Xin Long Date: Fri, 15 Sep 2017 11:02:48 +0800 Subject: [PATCH 39/43] sctp: do not mark sk dumped when inet_sctp_diag_fill returns err sctp_diag would not actually dump out sk/asoc if inet_sctp_diag_fill returns err, in 
which case it shouldn't mark sk dumped by setting cb->args[3] as 1 in sctp_sock_dump(). Otherwise, it could cause some asocs to have no parent's sk dumped in 'ss --sctp'. So this patch is to not set cb->args[3] when inet_sctp_diag_fill() returns err in sctp_sock_dump(). Fixes: 8f840e47f190 ("sctp: add the sctp_diag.c file") Signed-off-by: Xin Long Acked-by: Marcelo Ricardo Leitner Acked-by: Neil Horman Signed-off-by: David S. Miller --- net/sctp/sctp_diag.c | 1 - 1 file changed, 1 deletion(-) diff --git a/net/sctp/sctp_diag.c b/net/sctp/sctp_diag.c index 7008a992749b..22ed01a76b19 100644 --- a/net/sctp/sctp_diag.c +++ b/net/sctp/sctp_diag.c @@ -309,7 +309,6 @@ static int sctp_sock_dump(struct sctp_transport *tsp, void *p) cb->nlh->nlmsg_seq, NLM_F_MULTI, cb->nlh, commp->net_admin) < 0) { - cb->args[3] = 1; err = 1; goto release; } From e67b8a685c7c984e834e3181ef4619cd7025a136 Mon Sep 17 00:00:00 2001 From: Edward Cree Date: Fri, 15 Sep 2017 14:37:38 +0100 Subject: [PATCH 40/43] bpf/verifier: reject BPF_ALU64|BPF_END Neither ___bpf_prog_run nor the JITs accept it. Also adds a new test case. Fixes: 17a5267067f3 ("bpf: verifier (add verifier core)") Signed-off-by: Edward Cree Acked-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: David S. 
Miller --- kernel/bpf/verifier.c | 3 ++- tools/testing/selftests/bpf/test_verifier.c | 16 ++++++++++++++++ 2 files changed, 18 insertions(+), 1 deletion(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 477b6932c3c1..799b2451ef2d 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -2292,7 +2292,8 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) } } else { if (insn->src_reg != BPF_REG_0 || insn->off != 0 || - (insn->imm != 16 && insn->imm != 32 && insn->imm != 64)) { + (insn->imm != 16 && insn->imm != 32 && insn->imm != 64) || + BPF_CLASS(insn->code) == BPF_ALU64) { verbose("BPF_END uses reserved fields\n"); return -EINVAL; } diff --git a/tools/testing/selftests/bpf/test_verifier.c b/tools/testing/selftests/bpf/test_verifier.c index 8eb09950258b..26f3250bdcd2 100644 --- a/tools/testing/selftests/bpf/test_verifier.c +++ b/tools/testing/selftests/bpf/test_verifier.c @@ -6629,6 +6629,22 @@ static struct bpf_test tests[] = { .result = REJECT, .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS, }, + { + "invalid 64-bit BPF_END", + .insns = { + BPF_MOV32_IMM(BPF_REG_0, 0), + { + .code = BPF_ALU64 | BPF_END | BPF_TO_LE, + .dst_reg = BPF_REG_0, + .src_reg = 0, + .off = 0, + .imm = 32, + }, + BPF_EXIT_INSN(), + }, + .errstr = "BPF_END uses reserved fields", + .result = REJECT, + }, }; static int probe_filter_length(const struct bpf_insn *fp) From fc22579917eb7e13433448a342f1cb1592920940 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 15 Sep 2017 16:47:42 -0700 Subject: [PATCH 41/43] tcp: fix data delivery rate Now skb->skb_mstamp is updated later, we also need to call tcp_rate_skb_sent() after the update is done. Fixes: 8c72c65b426b ("tcp: update skb->skb_mstamp more carefully") Signed-off-by: Eric Dumazet Acked-by: Soheil Hassas Yeganeh Signed-off-by: David S.
Miller --- net/ipv4/tcp_output.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index a85a8c2948e5..1c839c99114c 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -1002,8 +1002,6 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, if (clone_it) { TCP_SKB_CB(skb)->tx.in_flight = TCP_SKB_CB(skb)->end_seq - tp->snd_una; - tcp_rate_skb_sent(sk, skb); - oskb = skb; if (unlikely(skb_cloned(skb))) skb = pskb_copy(skb, gfp_mask); @@ -1128,9 +1126,10 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, tcp_enter_cwr(sk); err = net_xmit_eval(err); } - if (!err && oskb) + if (!err && oskb) { oskb->skb_mstamp = tp->tcp_mstamp; - + tcp_rate_skb_sent(sk, oskb); + } return err; } From 2130c0281608a109653272902e4d00b45bf00571 Mon Sep 17 00:00:00 2001 From: Pavel Machek Date: Sat, 16 Sep 2017 16:28:02 +0200 Subject: [PATCH 42/43] Documentation: link in networking docs Fix link in filter.txt. Acked-by: Pavel Machek Signed-off-by: David S. Miller --- Documentation/networking/filter.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/networking/filter.txt b/Documentation/networking/filter.txt index e5e33bac2068..789b74dbe1d9 100644 --- a/Documentation/networking/filter.txt +++ b/Documentation/networking/filter.txt @@ -45,7 +45,7 @@ in many more places. There's xt_bpf for netfilter, cls_bpf in the kernel qdisc layer, SECCOMP-BPF (SECure COMPuting [1]), and lots of other places such as team driver, PTP code, etc where BPF is being used. 
- [1] Documentation/prctl/seccomp_filter.txt + [1] Documentation/userspace-api/seccomp_filter.rst Original BPF paper: From 8e29f97979c300406c21994986bdfcdb67fe4ff7 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Fri, 15 Sep 2017 15:31:07 +0200 Subject: [PATCH 43/43] mlxsw: spectrum_router: Only handle IPv4 and IPv6 events The driver doesn't support events from address families other than IPv4 and IPv6, so ignore them. Otherwise, we risk queueing a work item before it's initialized. This can happen in case a VRF is configured when MROUTE_MULTIPLE_TABLES is enabled, as the VRF driver will try to add an l3mdev rule for the IPMR family. Fixes: 65e65ec137f4 ("mlxsw: spectrum_router: Don't ignore IPv6 notifications") Signed-off-by: Ido Schimmel Reported-by: Andreas Rammhold Reported-by: Florian Klink Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c index f0fb898533fb..2cfb3f5d092d 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c @@ -4868,7 +4868,8 @@ static int mlxsw_sp_router_fib_event(struct notifier_block *nb, struct fib_notifier_info *info = ptr; struct mlxsw_sp_router *router; - if (!net_eq(info->net, &init_net)) + if (!net_eq(info->net, &init_net) || + (info->family != AF_INET && info->family != AF_INET6)) return NOTIFY_DONE; fib_work = kzalloc(sizeof(*fib_work), GFP_ATOMIC);