From f30bf2a5cac6c60ab366c4bc6db913597bf4d6ab Mon Sep 17 00:00:00 2001
From: Tommi Rantala <tt.rantala@gmail.com>
Date: Thu, 7 May 2015 15:12:21 +0300
Subject: [PATCH 01/44] ipvs: fix memory leak in ip_vs_ctl.c

Fix memory leak introduced in commit a0840e2e165a ("IPVS: netns,
ip_vs_ctl local vars moved to ipvs struct."):

unreferenced object 0xffff88005785b800 (size 2048):
  comm "(-localed)", pid 1434, jiffies 4294755650 (age 1421.089s)
  hex dump (first 32 bytes):
    bb 89 0b 83 ff ff ff ff b0 78 f0 4e 00 88 ff ff  .........x.N....
    04 00 00 00 a4 01 00 00 00 00 00 00 00 00 00 00  ................
  backtrace:
    [<ffffffff8262ea8e>] kmemleak_alloc+0x4e/0xb0
    [<ffffffff811fba74>] __kmalloc_track_caller+0x244/0x430
    [<ffffffff811b88a0>] kmemdup+0x20/0x50
    [<ffffffff823276b7>] ip_vs_control_net_init+0x1f7/0x510
    [<ffffffff8231d630>] __ip_vs_init+0x100/0x250
    [<ffffffff822363a1>] ops_init+0x41/0x190
    [<ffffffff82236583>] setup_net+0x93/0x150
    [<ffffffff82236cc2>] copy_net_ns+0x82/0x140
    [<ffffffff810ab13d>] create_new_namespaces+0xfd/0x190
    [<ffffffff810ab49a>] unshare_nsproxy_namespaces+0x5a/0xc0
    [<ffffffff810833e3>] SyS_unshare+0x173/0x310
    [<ffffffff8265cbd7>] system_call_fastpath+0x12/0x6f
    [<ffffffffffffffff>] 0xffffffffffffffff

Fixes: a0840e2e165a ("IPVS: netns, ip_vs_ctl local vars moved to ipvs struct.")
Signed-off-by: Tommi Rantala <tt.rantala@gmail.com>
Acked-by: Julian Anastasov <ja@ssi.bg>
Signed-off-by: Simon Horman <horms@verge.net.au>
---
 net/netfilter/ipvs/ip_vs_ctl.c | 3 +++
 1 file changed, 3 insertions(+)
diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c
index 49532672f66d..285eae3a1454 100644
--- a/net/netfilter/ipvs/ip_vs_ctl.c
+++ b/net/netfilter/ipvs/ip_vs_ctl.c
@@ -3823,6 +3823,9 @@ static void __net_exit ip_vs_control_net_cleanup_sysctl(struct net *net)
 	cancel_work_sync(&ipvs->defense_work.work);
 	unregister_net_sysctl_table(ipvs->sysctl_hdr);
 	ip_vs_stop_estimator(net, &ipvs->tot_stats);
+
+	if (!net_eq(net, &init_net))
+		kfree(ipvs->sysctl_tbl);
 }
 
 #else

From 47b4e1fc4972cc43a19121bc2608a60aef3bf216 Mon Sep 17 00:00:00 2001
From: Janusz Dziedzic <janusz.dziedzic@tieto.com>
Date: Mon, 11 May 2015 11:31:15 +0200
Subject: [PATCH 02/44] mac80211: move WEP tailroom size check

Remove checking tailroom when adding IV as it uses only
headroom, and move the check to the ICV generation that
actually needs the tailroom.

In other case I hit such warning and datapath don't work,
when testing:
- IBSS + WEP
- ath9k with hw crypt enabled
- IPv6 data (ping6)

WARNING: CPU: 3 PID: 13301 at net/mac80211/wep.c:102 ieee80211_wep_add_iv+0x129/0x190 [mac80211]()
[...]
Call Trace:
[<ffffffff817bf491>] dump_stack+0x45/0x57
[<ffffffff8107746a>] warn_slowpath_common+0x8a/0xc0
[<ffffffff8107755a>] warn_slowpath_null+0x1a/0x20
[<ffffffffc09ae109>] ieee80211_wep_add_iv+0x129/0x190 [mac80211]
[<ffffffffc09ae7ab>] ieee80211_crypto_wep_encrypt+0x6b/0xd0 [mac80211]
[<ffffffffc09d3fb1>] invoke_tx_handlers+0xc51/0xf30 [mac80211]
[...]

Cc: stable@vger.kernel.org
Signed-off-by: Janusz Dziedzic <janusz.dziedzic@tieto.com>
Signed-off-by: Johannes Berg <johannes.berg@intel.com>
---
 net/mac80211/wep.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/net/mac80211/wep.c b/net/mac80211/wep.c
index a4220e92f0cc..efa3f48f1ec5 100644
--- a/net/mac80211/wep.c
+++ b/net/mac80211/wep.c
@@ -98,8 +98,7 @@ static u8 *ieee80211_wep_add_iv(struct ieee80211_local *local,
 
 	hdr->frame_control |= cpu_to_le16(IEEE80211_FCTL_PROTECTED);
 
-	if (WARN_ON(skb_tailroom(skb) < IEEE80211_WEP_ICV_LEN ||
-		    skb_headroom(skb) < IEEE80211_WEP_IV_LEN))
+	if (WARN_ON(skb_headroom(skb) < IEEE80211_WEP_IV_LEN))
 		return NULL;
 
 	hdrlen = ieee80211_hdrlen(hdr->frame_control);
@@ -167,6 +166,9 @@ int ieee80211_wep_encrypt(struct ieee80211_local *local,
 	size_t len;
 	u8 rc4key[3 + WLAN_KEY_LEN_WEP104];
 
+	if (WARN_ON(skb_tailroom(skb) < IEEE80211_WEP_ICV_LEN))
+		return -1;
+
 	iv = ieee80211_wep_add_iv(local, skb, keylen, keyidx);
 	if (!iv)
 		return -1;

From ca79f232054abd079648fdb4400c71a1310f7bc8 Mon Sep 17 00:00:00 2001
From: Wen-chien Jesse Sung <jesse.sung@canonical.com>
Date: Wed, 13 May 2015 11:39:24 +0800
Subject: [PATCH 03/44] Bluetooth: ath3k: Add a new ID 0cf3:e006 to ath3k list

Device info in /sys/kernel/debug/usb/devices:

T:  Bus=01 Lev=01 Prnt=01 Port=05 Cnt=02 Dev#=  3 Spd=12   MxCh= 0
D:  Ver= 1.10 Cls=e0(wlcon) Sub=01 Prot=01 MxPS=64 #Cfgs=  1
P:  Vendor=0cf3 ProdID=e006 Rev= 0.01
C:* #Ifs= 2 Cfg#= 1 Atr=e0 MxPwr=100mA
I:* If#= 0 Alt= 0 #EPs= 3 Cls=e0(wlcon) Sub=01 Prot=01 Driver=btusb
E:  Ad=81(I) Atr=03(Int.) MxPS=  16 Ivl=1ms
E:  Ad=82(I) Atr=02(Bulk) MxPS=  64 Ivl=0ms
E:  Ad=02(O) Atr=02(Bulk) MxPS=  64 Ivl=0ms
I:* If#= 1 Alt= 0 #EPs= 2 Cls=e0(wlcon) Sub=01 Prot=01 Driver=btusb
E:  Ad=83(I) Atr=01(Isoc) MxPS=   0 Ivl=1ms
E:  Ad=03(O) Atr=01(Isoc) MxPS=   0 Ivl=1ms
I:  If#= 1 Alt= 1 #EPs= 2 Cls=e0(wlcon) Sub=01 Prot=01 Driver=btusb
E:  Ad=83(I) Atr=01(Isoc) MxPS=   9 Ivl=1ms
E:  Ad=03(O) Atr=01(Isoc) MxPS=   9 Ivl=1ms
I:  If#= 1 Alt= 2 #EPs= 2 Cls=e0(wlcon) Sub=01 Prot=01 Driver=btusb
E:  Ad=83(I) Atr=01(Isoc) MxPS=  17 Ivl=1ms
E:  Ad=03(O) Atr=01(Isoc) MxPS=  17 Ivl=1ms
I:  If#= 1 Alt= 3 #EPs= 2 Cls=e0(wlcon) Sub=01 Prot=01 Driver=btusb
E:  Ad=83(I) Atr=01(Isoc) MxPS=  25 Ivl=1ms
E:  Ad=03(O) Atr=01(Isoc) MxPS=  25 Ivl=1ms
I:  If#= 1 Alt= 4 #EPs= 2 Cls=e0(wlcon) Sub=01 Prot=01 Driver=btusb
E:  Ad=83(I) Atr=01(Isoc) MxPS=  33 Ivl=1ms
E:  Ad=03(O) Atr=01(Isoc) MxPS=  33 Ivl=1ms
I:  If#= 1 Alt= 5 #EPs= 2 Cls=e0(wlcon) Sub=01 Prot=01 Driver=btusb
E:  Ad=83(I) Atr=01(Isoc) MxPS=  49 Ivl=1ms
E:  Ad=03(O) Atr=01(Isoc) MxPS=  49 Ivl=1ms

Signed-off-by: Wen-chien Jesse Sung <jesse.sung@canonical.com>
Signed-off-by: Marcel Holtmann <marcel@holtmann.org>
---
 drivers/bluetooth/ath3k.c | 2 ++
 drivers/bluetooth/btusb.c | 1 +
 2 files changed, 3 insertions(+)

diff --git a/drivers/bluetooth/ath3k.c b/drivers/bluetooth/ath3k.c
index 288547a3c566..9a05153e1224 100644
--- a/drivers/bluetooth/ath3k.c
+++ b/drivers/bluetooth/ath3k.c
@@ -104,6 +104,7 @@ static const struct usb_device_id ath3k_table[] = {
 	{ USB_DEVICE(0x0cf3, 0xe003) },
 	{ USB_DEVICE(0x0CF3, 0xE004) },
 	{ USB_DEVICE(0x0CF3, 0xE005) },
+	{ USB_DEVICE(0x0CF3, 0xE006) },
 	{ USB_DEVICE(0x13d3, 0x3362) },
 	{ USB_DEVICE(0x13d3, 0x3375) },
 	{ USB_DEVICE(0x13d3, 0x3393) },
@@ -158,6 +159,7 @@ static const struct usb_device_id ath3k_blist_tbl[] = {
 	{ USB_DEVICE(0x0CF3, 0x817a), .driver_info = BTUSB_ATH3012 },
 	{ USB_DEVICE(0x0cf3, 0xe004), .driver_info = BTUSB_ATH3012 },
 	{ USB_DEVICE(0x0cf3, 0xe005), .driver_info = BTUSB_ATH3012 },
+	{ USB_DEVICE(0x0cf3, 0xe006), .driver_info = BTUSB_ATH3012 },
 	{ USB_DEVICE(0x0cf3, 0xe003), .driver_info = BTUSB_ATH3012 },
 	{ USB_DEVICE(0x13d3, 0x3362), .driver_info = BTUSB_ATH3012 },
 	{ USB_DEVICE(0x13d3, 0x3375), .driver_info = BTUSB_ATH3012 },
diff --git a/drivers/bluetooth/btusb.c b/drivers/bluetooth/btusb.c
index d21f3b4176d3..13d2cae3e13d 100644
--- a/drivers/bluetooth/btusb.c
+++ b/drivers/bluetooth/btusb.c
@@ -202,6 +202,7 @@ static const struct usb_device_id blacklist_table[] = {
 	{ USB_DEVICE(0x0cf3, 0xe003), .driver_info = BTUSB_ATH3012 },
 	{ USB_DEVICE(0x0cf3, 0xe004), .driver_info = BTUSB_ATH3012 },
 	{ USB_DEVICE(0x0cf3, 0xe005), .driver_info = BTUSB_ATH3012 },
+	{ USB_DEVICE(0x0cf3, 0xe006), .driver_info = BTUSB_ATH3012 },
 	{ USB_DEVICE(0x13d3, 0x3362), .driver_info = BTUSB_ATH3012 },
 	{ USB_DEVICE(0x13d3, 0x3375), .driver_info = BTUSB_ATH3012 },
 	{ USB_DEVICE(0x13d3, 0x3393), .driver_info = BTUSB_ATH3012 },

From 2054111b107d9449393d96dce6b66731bbfea9ad Mon Sep 17 00:00:00 2001
From: Wen-chien Jesse Sung <jesse.sung@canonical.com>
Date: Wed, 13 May 2015 11:39:25 +0800
Subject: [PATCH 04/44] Bluetooth: btusb: Add support for 0cf3:e007

Device 0cf3:e007 is one of the QCA ROME family.

T:  Bus=01 Lev=01 Prnt=01 Port=05 Cnt=02 Dev#=  3 Spd=12   MxCh= 0
D:  Ver= 2.01 Cls=e0(wlcon) Sub=01 Prot=01 MxPS=64 #Cfgs=  1
P:  Vendor=0cf3 ProdID=e007 Rev= 0.01
C:* #Ifs= 2 Cfg#= 1 Atr=e0 MxPwr=100mA
I:* If#= 0 Alt= 0 #EPs= 3 Cls=e0(wlcon) Sub=01 Prot=01 Driver=btusb
E:  Ad=81(I) Atr=03(Int.) MxPS=  16 Ivl=1ms
E:  Ad=82(I) Atr=02(Bulk) MxPS=  64 Ivl=0ms
E:  Ad=02(O) Atr=02(Bulk) MxPS=  64 Ivl=0ms
I:* If#= 1 Alt= 0 #EPs= 2 Cls=e0(wlcon) Sub=01 Prot=01 Driver=btusb
E:  Ad=83(I) Atr=01(Isoc) MxPS=   0 Ivl=1ms
E:  Ad=03(O) Atr=01(Isoc) MxPS=   0 Ivl=1ms
I:  If#= 1 Alt= 1 #EPs= 2 Cls=e0(wlcon) Sub=01 Prot=01 Driver=btusb
E:  Ad=83(I) Atr=01(Isoc) MxPS=   9 Ivl=1ms
E:  Ad=03(O) Atr=01(Isoc) MxPS=   9 Ivl=1ms
I:  If#= 1 Alt= 2 #EPs= 2 Cls=e0(wlcon) Sub=01 Prot=01 Driver=btusb
E:  Ad=83(I) Atr=01(Isoc) MxPS=  17 Ivl=1ms
E:  Ad=03(O) Atr=01(Isoc) MxPS=  17 Ivl=1ms
I:  If#= 1 Alt= 3 #EPs= 2 Cls=e0(wlcon) Sub=01 Prot=01 Driver=btusb
E:  Ad=83(I) Atr=01(Isoc) MxPS=  25 Ivl=1ms
E:  Ad=03(O) Atr=01(Isoc) MxPS=  25 Ivl=1ms
I:  If#= 1 Alt= 4 #EPs= 2 Cls=e0(wlcon) Sub=01 Prot=01 Driver=btusb
E:  Ad=83(I) Atr=01(Isoc) MxPS=  33 Ivl=1ms
E:  Ad=03(O) Atr=01(Isoc) MxPS=  33 Ivl=1ms
I:  If#= 1 Alt= 5 #EPs= 2 Cls=e0(wlcon) Sub=01 Prot=01 Driver=btusb
E:  Ad=83(I) Atr=01(Isoc) MxPS=  49 Ivl=1ms
E:  Ad=03(O) Atr=01(Isoc) MxPS=  49 Ivl=1ms

Signed-off-by: Wen-chien Jesse Sung <jesse.sung@canonical.com>
Signed-off-by: Marcel Holtmann <marcel@holtmann.org>
---
 drivers/bluetooth/btusb.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/bluetooth/btusb.c b/drivers/bluetooth/btusb.c
index 13d2cae3e13d..7da7efc2cd67 100644
--- a/drivers/bluetooth/btusb.c
+++ b/drivers/bluetooth/btusb.c
@@ -219,6 +219,7 @@ static const struct usb_device_id blacklist_table[] = {
 	{ USB_DEVICE(0x0489, 0xe03c), .driver_info = BTUSB_ATH3012 },
 
 	/* QCA ROME chipset */
+	{ USB_DEVICE(0x0cf3, 0xe007), .driver_info = BTUSB_QCA_ROME },
 	{ USB_DEVICE(0x0cf3, 0xe300), .driver_info = BTUSB_QCA_ROME },
 	{ USB_DEVICE(0x0cf3, 0xe360), .driver_info = BTUSB_QCA_ROME },
 

From ec0810d2ac1c932dad48f45da67e3adc5c5449a1 Mon Sep 17 00:00:00 2001
From: Dmitry Tunin <hanipouspilot@gmail.com>
Date: Sat, 2 May 2015 13:36:58 +0300
Subject: [PATCH 05/44] Bluetooth: ath3k: add support of 04ca:300f AR3012
 device

BugLink: https://bugs.launchpad.net/bugs/1449730

T:  Bus=01 Lev=01 Prnt=01 Port=04 Cnt=02 Dev#=  3 Spd=12  MxCh= 0
D:  Ver= 1.10 Cls=e0(wlcon) Sub=01 Prot=01 MxPS=64 #Cfgs=  1
P:  Vendor=04ca ProdID=300f Rev=00.01
C:  #Ifs= 2 Cfg#= 1 Atr=e0 MxPwr=100mA
I:  If#= 0 Alt= 0 #EPs= 3 Cls=e0(wlcon) Sub=01 Prot=01 Driver=btusb
I:  If#= 1 Alt= 0 #EPs= 2 Cls=e0(wlcon) Sub=01 Prot=01 Driver=btusb

Signed-off-by: Dmitry Tunin <hanipouspilot@gmail.com>
Signed-off-by: Marcel Holtmann <marcel@holtmann.org>
---
 drivers/bluetooth/ath3k.c | 2 ++
 drivers/bluetooth/btusb.c | 1 +
 2 files changed, 3 insertions(+)

diff --git a/drivers/bluetooth/ath3k.c b/drivers/bluetooth/ath3k.c
index 9a05153e1224..8c81af6dbe06 100644
--- a/drivers/bluetooth/ath3k.c
+++ b/drivers/bluetooth/ath3k.c
@@ -88,6 +88,7 @@ static const struct usb_device_id ath3k_table[] = {
 	{ USB_DEVICE(0x04CA, 0x3007) },
 	{ USB_DEVICE(0x04CA, 0x3008) },
 	{ USB_DEVICE(0x04CA, 0x300b) },
+	{ USB_DEVICE(0x04CA, 0x300f) },
 	{ USB_DEVICE(0x04CA, 0x3010) },
 	{ USB_DEVICE(0x0930, 0x0219) },
 	{ USB_DEVICE(0x0930, 0x0220) },
@@ -144,6 +145,7 @@ static const struct usb_device_id ath3k_blist_tbl[] = {
 	{ USB_DEVICE(0x04ca, 0x3007), .driver_info = BTUSB_ATH3012 },
 	{ USB_DEVICE(0x04ca, 0x3008), .driver_info = BTUSB_ATH3012 },
 	{ USB_DEVICE(0x04ca, 0x300b), .driver_info = BTUSB_ATH3012 },
+	{ USB_DEVICE(0x04ca, 0x300f), .driver_info = BTUSB_ATH3012 },
 	{ USB_DEVICE(0x04ca, 0x3010), .driver_info = BTUSB_ATH3012 },
 	{ USB_DEVICE(0x0930, 0x0219), .driver_info = BTUSB_ATH3012 },
 	{ USB_DEVICE(0x0930, 0x0220), .driver_info = BTUSB_ATH3012 },
diff --git a/drivers/bluetooth/btusb.c b/drivers/bluetooth/btusb.c
index 7da7efc2cd67..3c10d4dfe9a7 100644
--- a/drivers/bluetooth/btusb.c
+++ b/drivers/bluetooth/btusb.c
@@ -186,6 +186,7 @@ static const struct usb_device_id blacklist_table[] = {
 	{ USB_DEVICE(0x04ca, 0x3007), .driver_info = BTUSB_ATH3012 },
 	{ USB_DEVICE(0x04ca, 0x3008), .driver_info = BTUSB_ATH3012 },
 	{ USB_DEVICE(0x04ca, 0x300b), .driver_info = BTUSB_ATH3012 },
+	{ USB_DEVICE(0x04ca, 0x300f), .driver_info = BTUSB_ATH3012 },
 	{ USB_DEVICE(0x04ca, 0x3010), .driver_info = BTUSB_ATH3012 },
 	{ USB_DEVICE(0x0930, 0x0219), .driver_info = BTUSB_ATH3012 },
 	{ USB_DEVICE(0x0930, 0x0220), .driver_info = BTUSB_ATH3012 },

From be346ffaad9bc354075fba5cd009fc4519abdd64 Mon Sep 17 00:00:00 2001
From: Vlad Yasevich <vyasevich@gmail.com>
Date: Tue, 12 May 2015 20:53:14 -0400
Subject: [PATCH 06/44] vlan: Correctly propagate promisc|allmulti flags in
 notifier.

Currently vlan notifier handler will try to update all vlans
for a device when that device comes up.  A problem occurs,
however, when the vlan device was set to promiscuous, but not
by the user (ex: a bridge).  In that case, dev->gflags are
not updated.  What results is that the lower device ends
up with an extra promiscuity count.  Here are the
backtraces that prove this:
[62852.052179]  [<ffffffff814fe248>] __dev_set_promiscuity+0x38/0x1e0
[62852.052186]  [<ffffffff8160bcbb>] ? _raw_spin_unlock_bh+0x1b/0x40
[62852.052188]  [<ffffffff814fe4be>] ? dev_set_rx_mode+0x2e/0x40
[62852.052190]  [<ffffffff814fe694>] dev_set_promiscuity+0x24/0x50
[62852.052194]  [<ffffffffa0324795>] vlan_dev_open+0xd5/0x1f0 [8021q]
[62852.052196]  [<ffffffff814fe58f>] __dev_open+0xbf/0x140
[62852.052198]  [<ffffffff814fe88d>] __dev_change_flags+0x9d/0x170
[62852.052200]  [<ffffffff814fe989>] dev_change_flags+0x29/0x60

The above comes from the setting the vlan device to IFF_UP state.

[62852.053569]  [<ffffffff814fe248>] __dev_set_promiscuity+0x38/0x1e0
[62852.053571]  [<ffffffffa032459b>] ? vlan_dev_set_rx_mode+0x2b/0x30
[8021q]
[62852.053573]  [<ffffffff814fe8d5>] __dev_change_flags+0xe5/0x170
[62852.053645]  [<ffffffff814fe989>] dev_change_flags+0x29/0x60
[62852.053647]  [<ffffffffa032334a>] vlan_device_event+0x18a/0x690
[8021q]
[62852.053649]  [<ffffffff8161036c>] notifier_call_chain+0x4c/0x70
[62852.053651]  [<ffffffff8109d456>] raw_notifier_call_chain+0x16/0x20
[62852.053653]  [<ffffffff814f744d>] call_netdevice_notifiers+0x2d/0x60
[62852.053654]  [<ffffffff814fe1a3>] __dev_notify_flags+0x33/0xa0
[62852.053656]  [<ffffffff814fe9b2>] dev_change_flags+0x52/0x60
[62852.053657]  [<ffffffff8150cd57>] do_setlink+0x397/0xa40

And this one comes from the notification code.  What we end
up with is a vlan with promiscuity count of 1 and and a physical
device with a promiscuity count of 2.  They should both have
a count 1.

To resolve this issue, vlan code can use dev_get_flags() api
which correctly masks promiscuity and allmulti flags.

Signed-off-by: Vlad Yasevich <vyasevic@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/8021q/vlan.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/8021q/vlan.c b/net/8021q/vlan.c
index 98a30a5b8664..59555f0f8fc8 100644
--- a/net/8021q/vlan.c
+++ b/net/8021q/vlan.c
@@ -443,7 +443,7 @@ static int vlan_device_event(struct notifier_block *unused, unsigned long event,
 	case NETDEV_UP:
 		/* Put all VLANs for this dev in the up state too.  */
 		vlan_group_for_each_dev(grp, i, vlandev) {
-			flgs = vlandev->flags;
+			flgs = dev_get_flags(vlandev);
 			if (flgs & IFF_UP)
 				continue;
 

From 177d0506a911eb60b38b172215df5325ed94fa64 Mon Sep 17 00:00:00 2001
From: Wesley Kuo <wesley.kuo@intel.com>
Date: Wed, 13 May 2015 10:33:15 +0800
Subject: [PATCH 07/44] Bluetooth: Fix remote name event return directly.

This patch fixes hci_remote_name_evt dose not resolve name during
discovery status is RESOLVING. Before simultaneous dual mode scan enabled,
hci_check_pending_name will set discovery status to STOPPED eventually.

Signed-off-by: Wesley Kuo <wesley.kuo@intel.com>
Signed-off-by: Marcel Holtmann <marcel@holtmann.org>
---
 net/bluetooth/hci_core.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c
index 4663c3dad3f5..c4802f3bd4c5 100644
--- a/net/bluetooth/hci_core.c
+++ b/net/bluetooth/hci_core.c
@@ -2854,9 +2854,11 @@ static void le_scan_disable_work_complete(struct hci_dev *hdev, u8 status,
 			 * state. If we were running both LE and BR/EDR inquiry
 			 * simultaneously, and BR/EDR inquiry is already
 			 * finished, stop discovery, otherwise BR/EDR inquiry
-			 * will stop discovery when finished.
+			 * will stop discovery when finished. If we will resolve
+			 * remote device name, do not change discovery state.
 			 */
-			if (!test_bit(HCI_INQUIRY, &hdev->flags))
+			if (!test_bit(HCI_INQUIRY, &hdev->flags) &&
+			    hdev->discovery.state != DISCOVERY_RESOLVING)
 				hci_discovery_set_state(hdev,
 							DISCOVERY_STOPPED);
 		} else {

From 91dd93f956b9ea9ecf47fd4b9acd2d2e7f980303 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Tue, 12 May 2015 17:24:50 -0700
Subject: [PATCH 08/44] netlink: move nl_table in read_mostly section

netlink sockets creation and deletion heavily modify nl_table_users
and nl_table_lock.

If nl_table is sharing one cache line with one of them, netlink
performance is really bad on SMP.

ffffffff81ff5f00 B nl_table
ffffffff81ff5f0c b nl_table_users

Putting nl_table in read_mostly section increased performance
of my open/delete netlink sockets test by about 80 %

This came up while diagnosing a getaddrinfo() problem.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/netlink/af_netlink.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c
index daa0b818174b..dbe885901b34 100644
--- a/net/netlink/af_netlink.c
+++ b/net/netlink/af_netlink.c
@@ -89,7 +89,7 @@ static inline int netlink_is_kernel(struct sock *sk)
 	return nlk_sk(sk)->flags & NETLINK_KERNEL_SOCKET;
 }
 
-struct netlink_table *nl_table;
+struct netlink_table *nl_table __read_mostly;
 EXPORT_SYMBOL_GPL(nl_table);
 
 static DECLARE_WAIT_QUEUE_HEAD(nl_table_wait);

From e87a468eb97da35d8dc00e8fa9828b4de4ab69d0 Mon Sep 17 00:00:00 2001
From: Vlad Yasevich <vyasevich@gmail.com>
Date: Thu, 14 May 2015 20:34:08 -0400
Subject: [PATCH 09/44] ipv6: Fix udp checksums with raw sockets

It was reported that trancerout6 would cause
a kernel to crash when trying to compute checksums
on raw UDP packets.  The cause was the check in
__ip6_append_data that would attempt to use
partial checksums on the packet.  However,
raw sockets do not initialize partial checksum
fields so partial checksums can't be used.

Solve this the same way IPv4 does it.  raw sockets
pass transhdrlen value of 0 to ip_append_data which
causes the checksum to be computed in software.  Use
the same check in ip6_append_data (check transhdrlen).

Reported-by: Wolfgang Walter <linux@stwm.de>
CC: Wolfgang Walter <linux@stwm.de>
CC: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: Vladislav Yasevich <vyasevic@redhat.com>
Acked-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/ip6_output.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index c21777565c58..bc09cb97b840 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -1300,8 +1300,10 @@ static int __ip6_append_data(struct sock *sk,
 
 	/* If this is the first and only packet and device
 	 * supports checksum offloading, let's use it.
+	 * Use transhdrlen, same as IPv4, because partial
+	 * sums only work when transhdrlen is set.
 	 */
-	if (!skb && sk->sk_protocol == IPPROTO_UDP &&
+	if (transhdrlen && sk->sk_protocol == IPPROTO_UDP &&
 	    length + fragheaderlen < mtu &&
 	    rt->dst.dev->features & NETIF_F_V6_CSUM &&
 	    !exthdrlen)

From c1c52db16e26a26b545821abae303310a074350f Mon Sep 17 00:00:00 2001
From: Bjorn Helgaas <bhelgaas@google.com>
Date: Thu, 14 May 2015 18:17:08 -0500
Subject: [PATCH 10/44] net/mlx4: Avoid 'may be used uninitialized' warnings

With a cross-compiler based on gcc-4.9, I see warnings like the following:

  drivers/net/ethernet/mellanox/mlx4/resource_tracker.c: In function 'mlx4_SW2HW_CQ_wrapper':
  drivers/net/ethernet/mellanox/mlx4/resource_tracker.c:3048:10: error: 'cq' may be used uninitialized in this function [-Werror=maybe-uninitialized]
    cq->mtt = mtt;

I think the warning is spurious because we only use cq when
cq_res_start_move_to() returns zero, and it always initializes *cq in that
case.  The srq case is similar.  But maybe gcc isn't smart enough to figure
that out.

Initialize cq and srq explicitly to avoid the warnings.

Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/mellanox/mlx4/resource_tracker.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx4/resource_tracker.c b/drivers/net/ethernet/mellanox/mlx4/resource_tracker.c
index 92fce1b98558..bafe2180cf0c 100644
--- a/drivers/net/ethernet/mellanox/mlx4/resource_tracker.c
+++ b/drivers/net/ethernet/mellanox/mlx4/resource_tracker.c
@@ -3187,7 +3187,7 @@ int mlx4_SW2HW_CQ_wrapper(struct mlx4_dev *dev, int slave,
 	int cqn = vhcr->in_modifier;
 	struct mlx4_cq_context *cqc = inbox->buf;
 	int mtt_base = cq_get_mtt_addr(cqc) / dev->caps.mtt_entry_sz;
-	struct res_cq *cq;
+	struct res_cq *cq = NULL;
 	struct res_mtt *mtt;
 
 	err = cq_res_start_move_to(dev, slave, cqn, RES_CQ_HW, &cq);
@@ -3223,7 +3223,7 @@ int mlx4_HW2SW_CQ_wrapper(struct mlx4_dev *dev, int slave,
 {
 	int err;
 	int cqn = vhcr->in_modifier;
-	struct res_cq *cq;
+	struct res_cq *cq = NULL;
 
 	err = cq_res_start_move_to(dev, slave, cqn, RES_CQ_ALLOCATED, &cq);
 	if (err)
@@ -3362,7 +3362,7 @@ int mlx4_SW2HW_SRQ_wrapper(struct mlx4_dev *dev, int slave,
 	int err;
 	int srqn = vhcr->in_modifier;
 	struct res_mtt *mtt;
-	struct res_srq *srq;
+	struct res_srq *srq = NULL;
 	struct mlx4_srq_context *srqc = inbox->buf;
 	int mtt_base = srq_get_mtt_addr(srqc) / dev->caps.mtt_entry_sz;
 
@@ -3406,7 +3406,7 @@ int mlx4_HW2SW_SRQ_wrapper(struct mlx4_dev *dev, int slave,
 {
 	int err;
 	int srqn = vhcr->in_modifier;
-	struct res_srq *srq;
+	struct res_srq *srq = NULL;
 
 	err = srq_res_start_move_to(dev, slave, srqn, RES_SRQ_ALLOCATED, &srq);
 	if (err)

From eea39946a1f36e8a5a47c86e7ecfca6076868505 Mon Sep 17 00:00:00 2001
From: Roopa Prabhu <roopa@cumulusnetworks.com>
Date: Wed, 13 May 2015 21:17:41 -0700
Subject: [PATCH 11/44] rename RTNH_F_EXTERNAL to RTNH_F_OFFLOAD

RTNH_F_EXTERNAL today is printed as "offload" in iproute2 output.

This patch renames the flag to be consistent with what the user sees.

Signed-off-by: Roopa Prabhu <roopa@cumulusnetworks.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/uapi/linux/rtnetlink.h | 2 +-
 net/ipv4/fib_trie.c            | 2 +-
 net/switchdev/switchdev.c      | 6 +++---
 3 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/include/uapi/linux/rtnetlink.h b/include/uapi/linux/rtnetlink.h
index 974db03f7b1a..17fb02f488da 100644
--- a/include/uapi/linux/rtnetlink.h
+++ b/include/uapi/linux/rtnetlink.h
@@ -337,7 +337,7 @@ struct rtnexthop {
 #define RTNH_F_DEAD		1	/* Nexthop is dead (used by multipath)	*/
 #define RTNH_F_PERVASIVE	2	/* Do recursive gateway lookup	*/
 #define RTNH_F_ONLINK		4	/* Gateway is forced on link	*/
-#define RTNH_F_EXTERNAL		8	/* Route installed externally	*/
+#define RTNH_F_OFFLOAD		8	/* offloaded route */
 
 /* Macros to handle hexthops */
 
diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
index e13fcc602da2..64c2076ced54 100644
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -1764,7 +1764,7 @@ void fib_table_flush_external(struct fib_table *tb)
 			/* record local slen */
 			slen = fa->fa_slen;
 
-			if (!fi || !(fi->fib_flags & RTNH_F_EXTERNAL))
+			if (!fi || !(fi->fib_flags & RTNH_F_OFFLOAD))
 				continue;
 
 			netdev_switch_fib_ipv4_del(n->key,
diff --git a/net/switchdev/switchdev.c b/net/switchdev/switchdev.c
index 46568b85c333..055453d48668 100644
--- a/net/switchdev/switchdev.c
+++ b/net/switchdev/switchdev.c
@@ -338,7 +338,7 @@ int netdev_switch_fib_ipv4_add(u32 dst, int dst_len, struct fib_info *fi,
 					      fi, tos, type, nlflags,
 					      tb_id);
 		if (!err)
-			fi->fib_flags |= RTNH_F_EXTERNAL;
+			fi->fib_flags |= RTNH_F_OFFLOAD;
 	}
 
 	return err;
@@ -364,7 +364,7 @@ int netdev_switch_fib_ipv4_del(u32 dst, int dst_len, struct fib_info *fi,
 	const struct swdev_ops *ops;
 	int err = 0;
 
-	if (!(fi->fib_flags & RTNH_F_EXTERNAL))
+	if (!(fi->fib_flags & RTNH_F_OFFLOAD))
 		return 0;
 
 	dev = netdev_switch_get_dev_by_nhs(fi);
@@ -376,7 +376,7 @@ int netdev_switch_fib_ipv4_del(u32 dst, int dst_len, struct fib_info *fi,
 		err = ops->swdev_fib_ipv4_del(dev, htonl(dst), dst_len,
 					      fi, tos, type, tb_id);
 		if (!err)
-			fi->fib_flags &= ~RTNH_F_EXTERNAL;
+			fi->fib_flags &= ~RTNH_F_OFFLOAD;
 	}
 
 	return err;

From 86b5e7de07aa1abc87ecc40d2aca6070348e4469 Mon Sep 17 00:00:00 2001
From: Nathan Sullivan <nathan.sullivan@ni.com>
Date: Wed, 13 May 2015 17:01:36 -0500
Subject: [PATCH 12/44] net: macb: Add better comment for RXUBR handling

Describe the handler for RXUBR better with a new comment.

Signed-off-by: Nathan Sullivan <nathan.sullivan@ni.com>
Reviewied-by: Josh Cartwright <joshc@ni.com>
Reviewied-by: Ben Shelton <ben.shelton@ni.com>
Acked-by: Nicolas Ferre <nicolas.ferre@atmel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/cadence/macb.c | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/drivers/net/ethernet/cadence/macb.c b/drivers/net/ethernet/cadence/macb.c
index 61aa570aad9a..5f10dfc631d7 100644
--- a/drivers/net/ethernet/cadence/macb.c
+++ b/drivers/net/ethernet/cadence/macb.c
@@ -1037,6 +1037,12 @@ static irqreturn_t macb_interrupt(int irq, void *dev_id)
 		 * add that if/when we get our hands on a full-blown MII PHY.
 		 */
 
+		/* There is a hardware issue under heavy load where DMA can
+		 * stop, this causes endless "used buffer descriptor read"
+		 * interrupts but it can be cleared by re-enabling RX. See
+		 * the at91 manual, section 41.3.1 or the Zynq manual
+		 * section 16.7.4 for details.
+		 */
 		if (status & MACB_BIT(RXUBR)) {
 			ctrl = macb_readl(bp, NCR);
 			macb_writel(bp, NCR, ctrl & ~MACB_BIT(RE));

From 595ca5880b37d4aa3c292d75531577175d36b225 Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Thu, 7 May 2015 14:15:58 +0200
Subject: [PATCH 13/44] netfilter: avoid build error if TPROXY/SOCKET=y &&
 NF_DEFRAG_IPV6=m

With TPROXY=y but DEFRAG_IPV6=m we get build failure:

net/built-in.o: In function `tproxy_tg_init':
net/netfilter/xt_TPROXY.c:588: undefined reference to `nf_defrag_ipv6_enable'

If DEFRAG_IPV6 is modular, TPROXY must be too.
(or both must be builtin).

This enforces =m for both.

Reported-and-tested-by: Liu Hua <liusdu@126.com>
Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/Kconfig | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig
index f70e34a68f70..a0f3e6a3c7d1 100644
--- a/net/netfilter/Kconfig
+++ b/net/netfilter/Kconfig
@@ -863,6 +863,7 @@ config NETFILTER_XT_TARGET_TPROXY
 	depends on NETFILTER_XTABLES
 	depends on NETFILTER_ADVANCED
 	depends on (IPV6 || IPV6=n)
+	depends on (IP6_NF_IPTABLES || IP6_NF_IPTABLES=n)
 	depends on IP_NF_MANGLE
 	select NF_DEFRAG_IPV4
 	select NF_DEFRAG_IPV6 if IP6_NF_IPTABLES
@@ -1356,6 +1357,7 @@ config NETFILTER_XT_MATCH_SOCKET
 	depends on NETFILTER_ADVANCED
 	depends on !NF_CONNTRACK || NF_CONNTRACK
 	depends on (IPV6 || IPV6=n)
+	depends on (IP6_NF_IPTABLES || IP6_NF_IPTABLES=n)
 	select NF_DEFRAG_IPV4
 	select NF_DEFRAG_IPV6 if IP6_NF_IPTABLES
 	help

From b3cad287d13b5f6695c6b4aab72969cd64bf0171 Mon Sep 17 00:00:00 2001
From: Jesper Dangaard Brouer <brouer@redhat.com>
Date: Thu, 7 May 2015 14:54:16 +0200
Subject: [PATCH 14/44] conntrack: RFC5961 challenge ACK confuse conntrack
 LAST-ACK transition

In compliance with RFC5961, the network stack send challenge ACK in
response to spurious SYN packets, since commit 0c228e833c88 ("tcp:
Restore RFC5961-compliant behavior for SYN packets").

This pose a problem for netfilter conntrack in state LAST_ACK, because
this challenge ACK is (falsely) seen as ACKing last FIN, causing a
false state transition (into TIME_WAIT).

The challenge ACK is hard to distinguish from real last ACK.  Thus,
solution introduce a flag that tracks the potential for seeing a
challenge ACK, in case a SYN packet is let through and current state
is LAST_ACK.

When conntrack transition LAST_ACK to TIME_WAIT happens, this flag is
used for determining if we are expecting a challenge ACK.

Scapy based reproducer script avail here:
 https://github.com/netoptimizer/network-testing/blob/master/scapy/tcp_hacks_3WHS_LAST_ACK.py

Fixes: 0c228e833c88 ("tcp: Restore RFC5961-compliant behavior for SYN packets")
Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
Acked-by: Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 .../uapi/linux/netfilter/nf_conntrack_tcp.h   |  3 ++
 net/netfilter/nf_conntrack_proto_tcp.c        | 35 +++++++++++++++++--
 2 files changed, 35 insertions(+), 3 deletions(-)

diff --git a/include/uapi/linux/netfilter/nf_conntrack_tcp.h b/include/uapi/linux/netfilter/nf_conntrack_tcp.h
index 9993a421201c..ef9f80f0f529 100644
--- a/include/uapi/linux/netfilter/nf_conntrack_tcp.h
+++ b/include/uapi/linux/netfilter/nf_conntrack_tcp.h
@@ -42,6 +42,9 @@ enum tcp_conntrack {
 /* The field td_maxack has been set */
 #define IP_CT_TCP_FLAG_MAXACK_SET		0x20
 
+/* Marks possibility for expected RFC5961 challenge ACK */
+#define IP_CT_EXP_CHALLENGE_ACK 		0x40
+
 struct nf_ct_tcp_flags {
 	__u8 flags;
 	__u8 mask;
diff --git a/net/netfilter/nf_conntrack_proto_tcp.c b/net/netfilter/nf_conntrack_proto_tcp.c
index 5caa0c41bf26..70383de72054 100644
--- a/net/netfilter/nf_conntrack_proto_tcp.c
+++ b/net/netfilter/nf_conntrack_proto_tcp.c
@@ -202,7 +202,7 @@ static const u8 tcp_conntracks[2][6][TCP_CONNTRACK_MAX] = {
  *	sES -> sES	:-)
  *	sFW -> sCW	Normal close request answered by ACK.
  *	sCW -> sCW
- *	sLA -> sTW	Last ACK detected.
+ *	sLA -> sTW	Last ACK detected (RFC5961 challenged)
  *	sTW -> sTW	Retransmitted last ACK. Remain in the same state.
  *	sCL -> sCL
  */
@@ -261,7 +261,7 @@ static const u8 tcp_conntracks[2][6][TCP_CONNTRACK_MAX] = {
  *	sES -> sES	:-)
  *	sFW -> sCW	Normal close request answered by ACK.
  *	sCW -> sCW
- *	sLA -> sTW	Last ACK detected.
+ *	sLA -> sTW	Last ACK detected (RFC5961 challenged)
  *	sTW -> sTW	Retransmitted last ACK.
  *	sCL -> sCL
  */
@@ -906,6 +906,7 @@ static int tcp_packet(struct nf_conn *ct,
 					1 : ct->proto.tcp.last_win;
 			ct->proto.tcp.seen[ct->proto.tcp.last_dir].td_scale =
 				ct->proto.tcp.last_wscale;
+			ct->proto.tcp.last_flags &= ~IP_CT_EXP_CHALLENGE_ACK;
 			ct->proto.tcp.seen[ct->proto.tcp.last_dir].flags =
 				ct->proto.tcp.last_flags;
 			memset(&ct->proto.tcp.seen[dir], 0,
@@ -923,7 +924,9 @@ static int tcp_packet(struct nf_conn *ct,
 		 * may be in sync but we are not. In that case, we annotate
 		 * the TCP options and let the packet go through. If it is a
 		 * valid SYN packet, the server will reply with a SYN/ACK, and
-		 * then we'll get in sync. Otherwise, the server ignores it. */
+		 * then we'll get in sync. Otherwise, the server potentially
+		 * responds with a challenge ACK if implementing RFC5961.
+		 */
 		if (index == TCP_SYN_SET && dir == IP_CT_DIR_ORIGINAL) {
 			struct ip_ct_tcp_state seen = {};
 
@@ -939,6 +942,13 @@ static int tcp_packet(struct nf_conn *ct,
 				ct->proto.tcp.last_flags |=
 					IP_CT_TCP_FLAG_SACK_PERM;
 			}
+			/* Mark the potential for RFC5961 challenge ACK,
+			 * this pose a special problem for LAST_ACK state
+			 * as ACK is intrepretated as ACKing last FIN.
+			 */
+			if (old_state == TCP_CONNTRACK_LAST_ACK)
+				ct->proto.tcp.last_flags |=
+					IP_CT_EXP_CHALLENGE_ACK;
 		}
 		spin_unlock_bh(&ct->lock);
 		if (LOG_INVALID(net, IPPROTO_TCP))
@@ -970,6 +980,25 @@ static int tcp_packet(struct nf_conn *ct,
 			nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL,
 				  "nf_ct_tcp: invalid state ");
 		return -NF_ACCEPT;
+	case TCP_CONNTRACK_TIME_WAIT:
+		/* RFC5961 compliance cause stack to send "challenge-ACK"
+		 * e.g. in response to spurious SYNs.  Conntrack MUST
+		 * not believe this ACK is acking last FIN.
+		 */
+		if (old_state == TCP_CONNTRACK_LAST_ACK &&
+		    index == TCP_ACK_SET &&
+		    ct->proto.tcp.last_dir != dir &&
+		    ct->proto.tcp.last_index == TCP_SYN_SET &&
+		    (ct->proto.tcp.last_flags & IP_CT_EXP_CHALLENGE_ACK)) {
+			/* Detected RFC5961 challenge ACK */
+			ct->proto.tcp.last_flags &= ~IP_CT_EXP_CHALLENGE_ACK;
+			spin_unlock_bh(&ct->lock);
+			if (LOG_INVALID(net, IPPROTO_TCP))
+				nf_log_packet(net, pf, 0, skb, NULL, NULL, NULL,
+				      "nf_ct_tcp: challenge-ACK ignored ");
+			return NF_ACCEPT; /* Don't change state */
+		}
+		break;
 	case TCP_CONNTRACK_CLOSE:
 		if (index == TCP_RST_SET
 		    && (ct->proto.tcp.seen[!dir].flags & IP_CT_TCP_FLAG_MAXACK_SET)

From 960bd2c26421d321e890f1936938196ead41976f Mon Sep 17 00:00:00 2001
From: Mirek Kratochvil <exa.exa@gmail.com>
Date: Fri, 15 May 2015 21:15:29 +0200
Subject: [PATCH 15/44] netfilter: nf_tables: fix bogus warning in
 nft_data_uninit()

The values 0x00000000-0xfffffeff are reserved for userspace datatype. When,
deleting set elements with maps, a bogus warning is triggered.

WARNING: CPU: 0 PID: 11133 at net/netfilter/nf_tables_api.c:4481 nft_data_uninit+0x35/0x40 [nf_tables]()

This fixes the check accordingly to enum definition in
include/linux/netfilter/nf_tables.h

Fixes: https://bugzilla.netfilter.org/show_bug.cgi?id=1013
Signed-off-by: Mirek Kratochvil <exa.exa@gmail.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/nf_tables_api.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index ad9d11fb29fd..34ded09317e7 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -4472,9 +4472,9 @@ EXPORT_SYMBOL_GPL(nft_data_init);
  */
 void nft_data_uninit(const struct nft_data *data, enum nft_data_types type)
 {
-	switch (type) {
-	case NFT_DATA_VALUE:
+	if (type < NFT_DATA_VERDICT)
 		return;
+	switch (type) {
 	case NFT_DATA_VERDICT:
 		return nft_verdict_uninit(data);
 	default:

From 1f9993f6825f9cb93f75f794b66e7428dfc72467 Mon Sep 17 00:00:00 2001
From: Ying Xue <ying.xue@windriver.com>
Date: Fri, 15 May 2015 12:53:21 +0800
Subject: [PATCH 16/44] rocker: fix a neigh entry leak issue

Once we get a neighbour through looking up arp cache or creating a
new one in rocker_port_ipv4_resolve(), the neighbour's refcount is
already taken. But as we don't put the refcount again after it's
used, this makes the neighbour entry leaked.

Suggested-by: Eric Dumazet <edumazet@google.com>
Acked-by: Jiri Pirko <jiri@resnulli.us>
Acked-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Ying Xue <ying.xue@windriver.com>
Acked-by: Jiri Pirko <jiri@resnulli.us>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/rocker/rocker.c | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/drivers/net/ethernet/rocker/rocker.c b/drivers/net/ethernet/rocker/rocker.c
index ec251531bd9f..cf98cc9bbc8d 100644
--- a/drivers/net/ethernet/rocker/rocker.c
+++ b/drivers/net/ethernet/rocker/rocker.c
@@ -2921,10 +2921,11 @@ static int rocker_port_ipv4_resolve(struct rocker_port *rocker_port,
 	struct neighbour *n = __ipv4_neigh_lookup(dev, (__force u32)ip_addr);
 	int err = 0;
 
-	if (!n)
+	if (!n) {
 		n = neigh_create(&arp_tbl, &ip_addr, dev);
-	if (!n)
-		return -ENOMEM;
+		if (IS_ERR(n))
+			return IS_ERR(n);
+	}
 
 	/* If the neigh is already resolved, then go ahead and
 	 * install the entry, otherwise start the ARP process to
@@ -2936,6 +2937,7 @@ static int rocker_port_ipv4_resolve(struct rocker_port *rocker_port,
 	else
 		neigh_event_send(n, NULL);
 
+	neigh_release(n);
 	return err;
 }
 

From 7e14069651591c81046ffaec13c3dac8cb70f5fb Mon Sep 17 00:00:00 2001
From: Florian Fainelli <f.fainelli@gmail.com>
Date: Fri, 15 May 2015 16:30:41 -0700
Subject: [PATCH 17/44] net: phy: Allow EEE for all RGMII variants

RGMII interfaces come in multiple flavors: RGMII with transmit or
receive internal delay, no delays at all, or delays in both direction.

This change extends the initial check for PHY_INTERFACE_MODE_RGMII to
cover all of these variants since EEE should be allowed for any of these
modes, since it is a property of the RGMII, hence Gigabit PHY capability
more than the RGMII electrical interface and its delays.

Fixes: a59a4d192166 ("phy: add the EEE support and the way to access to the MMD registers")
Signed-off-by: Florian Fainelli <f.fainelli@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/phy/phy.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/drivers/net/phy/phy.c b/drivers/net/phy/phy.c
index 52cd8db2c57d..757f28a4284c 100644
--- a/drivers/net/phy/phy.c
+++ b/drivers/net/phy/phy.c
@@ -1053,13 +1053,14 @@ int phy_init_eee(struct phy_device *phydev, bool clk_stop_enable)
 {
 	/* According to 802.3az,the EEE is supported only in full duplex-mode.
 	 * Also EEE feature is active when core is operating with MII, GMII
-	 * or RGMII. Internal PHYs are also allowed to proceed and should
-	 * return an error if they do not support EEE.
+	 * or RGMII (all kinds). Internal PHYs are also allowed to proceed and
+	 * should return an error if they do not support EEE.
 	 */
 	if ((phydev->duplex == DUPLEX_FULL) &&
 	    ((phydev->interface == PHY_INTERFACE_MODE_MII) ||
 	    (phydev->interface == PHY_INTERFACE_MODE_GMII) ||
-	    (phydev->interface == PHY_INTERFACE_MODE_RGMII) ||
+	    (phydev->interface >= PHY_INTERFACE_MODE_RGMII &&
+	     phydev->interface <= PHY_INTERFACE_MODE_RGMII_TXID) ||
 	     phy_is_internal(phydev))) {
 		int eee_lp, eee_cap, eee_adv;
 		u32 lp, cap, adv;

From c0bb07df7d981e4091432754e30c9c720e2c0c78 Mon Sep 17 00:00:00 2001
From: Herbert Xu <herbert@gondor.apana.org.au>
Date: Sat, 16 May 2015 21:50:28 +0800
Subject: [PATCH 18/44] netlink: Reset portid after netlink_insert failure

The commit c5adde9468b0714a051eac7f9666f23eb10b61f7 ("netlink:
eliminate nl_sk_hash_lock") breaks the autobind retry mechanism
because it doesn't reset portid after a failed netlink_insert.

This means that should autobind fail the first time around, then
the socket will be stuck in limbo as it can never be bound again
since it already has a non-zero portid.

Fixes: c5adde9468b0 ("netlink: eliminate nl_sk_hash_lock")
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/netlink/af_netlink.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c
index dbe885901b34..bf6e76643f78 100644
--- a/net/netlink/af_netlink.c
+++ b/net/netlink/af_netlink.c
@@ -1081,6 +1081,7 @@ static int netlink_insert(struct sock *sk, u32 portid)
 	if (err) {
 		if (err == -EEXIST)
 			err = -EADDRINUSE;
+		nlk_sk(sk)->portid = 0;
 		sock_put(sk);
 	}
 

From db9683fb412d4af33f66b9fe3d8dace1c6d113c9 Mon Sep 17 00:00:00 2001
From: Tim Beale <tim.beale@alliedtelesis.co.nz>
Date: Wed, 13 May 2015 13:55:04 +1200
Subject: [PATCH 19/44] net: phy: Make sure PHY_RESUMING state change is always
 processed

If phy_start_aneg() was called while the phydev is in the PHY_RESUMING
state, then its state would immediately transition to PHY_AN (or
PHY_FORCING). This meant the phy_state_machine() never processed the
PHY_RESUMING state change, which meant interrupts weren't enabled for the
PHY. If the PHY used low-power mode (i.e. using BMCR_PDOWN), then the
physical link wouldn't get powered up again.

There seems no point for phy_start_aneg() to make the PHY_RESUMING -->
PHY_AN transition, as the state machine will do this anyway. I'm not sure
about the case where autoneg is disabled, as my patch will change
behaviour so that the PHY goes to PHY_NOLINK instead of PHY_FORCING. An
alternative solution would be to move the phy_config_interrupt() and
phy_resume() work out of the state machine and into phy_start().

The background behind this: we're running linux v3.16.7 and from user-space
we want to enable the eth port (i.e. do a SIOCSIFFLAGS ioctl with the
IFF_UP flag) and immediately afterward set the interface's speed/duplex.
Enabling the interface calls .ndo_open() then phy_start() and the PHY
transitions PHY_HALTED --> PHY_RESUMING. Setting the speed/duplex ends up
calling phy_ethtool_sset(), which calls phy_start_aneg() (meanwhile the
phy_state_machine() hasn't processed the PHY_RESUMING state change yet).

Signed-off-by: Tim Beale <tim.beale@alliedtelesis.co.nz>
Reviewed-by: Florian Fainelli <f.fainelli@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/phy/phy.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/phy/phy.c b/drivers/net/phy/phy.c
index 757f28a4284c..710696d1af97 100644
--- a/drivers/net/phy/phy.c
+++ b/drivers/net/phy/phy.c
@@ -465,7 +465,7 @@ int phy_start_aneg(struct phy_device *phydev)
 	if (err < 0)
 		goto out_unlock;
 
-	if (phydev->state != PHY_HALTED) {
+	if (phydev->state != PHY_HALTED && phydev->state != PHY_RESUMING) {
 		if (AUTONEG_ENABLE == phydev->autoneg) {
 			phydev->state = PHY_AN;
 			phydev->link_timeout = PHY_AN_TIMEOUT;

From 07ee0722bf941960fb3888f9c9b5839473372fd1 Mon Sep 17 00:00:00 2001
From: Herbert Xu <herbert@gondor.apana.org.au>
Date: Fri, 15 May 2015 11:30:47 +0800
Subject: [PATCH 20/44] rhashtable: Add cap on number of elements in hash table

We currently have no limit on the number of elements in a hash table.
This is a problem because some users (tipc) set a ceiling on the
maximum table size and when that is reached the hash table may
degenerate.  Others may encounter OOM when growing and if we allow
insertions when that happens the hash table perofrmance may also
suffer.

This patch adds a new paramater insecure_max_entries which becomes
the cap on the table.  If unset it defaults to max_size * 2.  If
it is also zero it means that there is no cap on the number of
elements in the table.  However, the table will grow whenever the
utilisation hits 100% and if that growth fails, you will get ENOMEM
on insertion.

As allowing oversubscription is potentially dangerous, the name
contains the word insecure.

Note that the cap is not a hard limit.  This is done for performance
reasons as enforcing a hard limit will result in use of atomic ops
that are heavier than the ones we currently use.

The reasoning is that we're only guarding against a gross over-
subscription of the table, rather than a small breach of the limit.

Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/rhashtable.h | 19 +++++++++++++++++++
 lib/rhashtable.c           | 11 +++++++++++
 2 files changed, 30 insertions(+)

diff --git a/include/linux/rhashtable.h b/include/linux/rhashtable.h
index dbcbcc59aa92..843ceca9a21e 100644
--- a/include/linux/rhashtable.h
+++ b/include/linux/rhashtable.h
@@ -17,6 +17,7 @@
 #ifndef _LINUX_RHASHTABLE_H
 #define _LINUX_RHASHTABLE_H
 
+#include <linux/atomic.h>
 #include <linux/compiler.h>
 #include <linux/errno.h>
 #include <linux/jhash.h>
@@ -100,6 +101,7 @@ struct rhashtable;
  * @key_len: Length of key
  * @key_offset: Offset of key in struct to be hashed
  * @head_offset: Offset of rhash_head in struct to be hashed
+ * @insecure_max_entries: Maximum number of entries (may be exceeded)
  * @max_size: Maximum size while expanding
  * @min_size: Minimum size while shrinking
  * @nulls_base: Base value to generate nulls marker
@@ -115,6 +117,7 @@ struct rhashtable_params {
 	size_t			key_len;
 	size_t			key_offset;
 	size_t			head_offset;
+	unsigned int		insecure_max_entries;
 	unsigned int		max_size;
 	unsigned int		min_size;
 	u32			nulls_base;
@@ -286,6 +289,18 @@ static inline bool rht_grow_above_100(const struct rhashtable *ht,
 		(!ht->p.max_size || tbl->size < ht->p.max_size);
 }
 
+/**
+ * rht_grow_above_max - returns true if table is above maximum
+ * @ht:		hash table
+ * @tbl:	current table
+ */
+static inline bool rht_grow_above_max(const struct rhashtable *ht,
+				      const struct bucket_table *tbl)
+{
+	return ht->p.insecure_max_entries &&
+	       atomic_read(&ht->nelems) >= ht->p.insecure_max_entries;
+}
+
 /* The bucket lock is selected based on the hash and protects mutations
  * on a group of hash buckets.
  *
@@ -589,6 +604,10 @@ static inline int __rhashtable_insert_fast(
 		goto out;
 	}
 
+	err = -E2BIG;
+	if (unlikely(rht_grow_above_max(ht, tbl)))
+		goto out;
+
 	if (unlikely(rht_grow_above_100(ht, tbl))) {
 slow_path:
 		spin_unlock_bh(lock);
diff --git a/lib/rhashtable.c b/lib/rhashtable.c
index b28df4019ade..4396434e4715 100644
--- a/lib/rhashtable.c
+++ b/lib/rhashtable.c
@@ -14,6 +14,7 @@
  * published by the Free Software Foundation.
  */
 
+#include <linux/atomic.h>
 #include <linux/kernel.h>
 #include <linux/init.h>
 #include <linux/log2.h>
@@ -446,6 +447,10 @@ int rhashtable_insert_slow(struct rhashtable *ht, const void *key,
 	if (key && rhashtable_lookup_fast(ht, key, ht->p))
 		goto exit;
 
+	err = -E2BIG;
+	if (unlikely(rht_grow_above_max(ht, tbl)))
+		goto exit;
+
 	err = -EAGAIN;
 	if (rhashtable_check_elasticity(ht, tbl, hash) ||
 	    rht_grow_above_100(ht, tbl))
@@ -738,6 +743,12 @@ int rhashtable_init(struct rhashtable *ht,
 	if (params->max_size)
 		ht->p.max_size = rounddown_pow_of_two(params->max_size);
 
+	if (params->insecure_max_entries)
+		ht->p.insecure_max_entries =
+			rounddown_pow_of_two(params->insecure_max_entries);
+	else
+		ht->p.insecure_max_entries = ht->p.max_size * 2;
+
 	ht->p.min_size = max(ht->p.min_size, HASH_MIN_SIZE);
 
 	/* The maximum (not average) chain length grows with the

From ed2a80ab7b76f11af0b2c6255709c4ebf164b667 Mon Sep 17 00:00:00 2001
From: Nicolas Dichtel <nicolas.dichtel@6wind.com>
Date: Wed, 13 May 2015 14:19:42 +0200
Subject: [PATCH 21/44] rtnl/bond: don't send rtnl msg for unregistered iface

Before the patch, the command 'ip link add bond2 type bond mode 802.3ad'
causes the kernel to send a rtnl message for the bond2 interface, with an
ifindex 0.

'ip monitor' shows:
0: bond2: <BROADCAST,MULTICAST,MASTER> mtu 1500 state DOWN group default
    link/ether 00:00:00:00:00:00 brd ff:ff:ff:ff:ff:ff
9: bond2@NONE: <BROADCAST,MULTICAST,MASTER> mtu 1500 qdisc noop state DOWN group default
    link/ether ea:3e:1f:53:92:7b brd ff:ff:ff:ff:ff:ff
[snip]

The patch fixes the spotted bug by checking in bond driver if the interface
is registered before calling the notifier chain.
It also adds a check in rtmsg_ifinfo() to prevent this kind of bug in the
future.

Fixes: d4261e565000 ("bonding: create netlink event when bonding option is changed")
CC: Jiri Pirko <jiri@resnulli.us>
Reported-by: Julien Meunier <julien.meunier@6wind.com>
Signed-off-by: Nicolas Dichtel <nicolas.dichtel@6wind.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/bonding/bond_options.c | 2 +-
 net/core/rtnetlink.c               | 3 +++
 2 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/drivers/net/bonding/bond_options.c b/drivers/net/bonding/bond_options.c
index 4df28943d222..e8d3c1d35453 100644
--- a/drivers/net/bonding/bond_options.c
+++ b/drivers/net/bonding/bond_options.c
@@ -624,7 +624,7 @@ int __bond_opt_set(struct bonding *bond,
 out:
 	if (ret)
 		bond_opt_error_interpret(bond, opt, ret, val);
-	else
+	else if (bond->dev->reg_state == NETREG_REGISTERED)
 		call_netdevice_notifiers(NETDEV_CHANGEINFODATA, bond->dev);
 
 	return ret;
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index 666e0928ba40..8de36824018d 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -2416,6 +2416,9 @@ void rtmsg_ifinfo(int type, struct net_device *dev, unsigned int change,
 {
 	struct sk_buff *skb;
 
+	if (dev->reg_state != NETREG_REGISTERED)
+		return;
+
 	skb = rtmsg_ifinfo_build_skb(type, dev, change, flags);
 	if (skb)
 		rtmsg_ifinfo_send(skb, dev, flags);

From 21858cd02dabcf290564cbf4769b101eba54d7bb Mon Sep 17 00:00:00 2001
From: Florent Fourcot <florent.fourcot@enst-bretagne.fr>
Date: Sat, 16 May 2015 00:24:59 +0200
Subject: [PATCH 22/44] tcp/ipv6: fix flow label setting in TIME_WAIT state

commit 1d13a96c74fc ("ipv6: tcp: fix flowlabel value in ACK messages
send from TIME_WAIT") added the flow label in the last TCP packets.
Unfortunately, it was not casted properly.

This patch replace the buggy shift with be32_to_cpu/cpu_to_be32.

Fixes: 1d13a96c74fc ("ipv6: tcp: fix flowlabel value in ACK messages")
Reported-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: Florent Fourcot <florent.fourcot@enst-bretagne.fr>
Acked-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_minisocks.c | 2 +-
 net/ipv6/tcp_ipv6.c      | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index e5d7649136fc..b5732a54f2ad 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -300,7 +300,7 @@ void tcp_time_wait(struct sock *sk, int state, int timeo)
 			tw->tw_v6_daddr = sk->sk_v6_daddr;
 			tw->tw_v6_rcv_saddr = sk->sk_v6_rcv_saddr;
 			tw->tw_tclass = np->tclass;
-			tw->tw_flowlabel = np->flow_label >> 12;
+			tw->tw_flowlabel = be32_to_cpu(np->flow_label & IPV6_FLOWLABEL_MASK);
 			tw->tw_ipv6only = sk->sk_ipv6only;
 		}
 #endif
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index b6575d665568..3adffb300238 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -914,7 +914,7 @@ static void tcp_v6_timewait_ack(struct sock *sk, struct sk_buff *skb)
 			tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
 			tcp_time_stamp + tcptw->tw_ts_offset,
 			tcptw->tw_ts_recent, tw->tw_bound_dev_if, tcp_twsk_md5_key(tcptw),
-			tw->tw_tclass, (tw->tw_flowlabel << 12));
+			tw->tw_tclass, cpu_to_be32(tw->tw_flowlabel));
 
 	inet_twsk_put(tw);
 }

From 13c3ed6a92724d8c8cb148a14b0ae190ddfe7413 Mon Sep 17 00:00:00 2001
From: "John W. Linville" <linville@tuxdriver.com>
Date: Mon, 18 May 2015 13:51:24 -0400
Subject: [PATCH 23/44] vxlan: correct typo in call to
 unregister_netdevice_queue

By inspection, this appears to be a typo.  The gating comparison
involves vxlan->dev rather than dev.  In fact, dev is the iterator in
the preceding loop above but it is actually constant in the 2nd loop.

Use of dev seems to be a bad cut-n-paste from the prior call to
unregister_netdevice_queue.  Change dev to vxlan->dev, since that is
what is actually being checked.

Signed-off-by: John W. Linville <linville@tuxdriver.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/vxlan.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c
index 27a5f954f8e9..21a0fbf1ed94 100644
--- a/drivers/net/vxlan.c
+++ b/drivers/net/vxlan.c
@@ -2961,7 +2961,7 @@ static void __net_exit vxlan_exit_net(struct net *net)
 		 * to the list by the previous loop.
 		 */
 		if (!net_eq(dev_net(vxlan->dev), net))
-			unregister_netdevice_queue(dev, &list);
+			unregister_netdevice_queue(vxlan->dev, &list);
 	}
 
 	unregister_netdevice_many(&list);

From b5d721d761fccf491f92eefc611e318561683d0a Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Mon, 18 May 2015 17:06:14 -0700
Subject: [PATCH 24/44] inet: properly align icsk_ca_priv

tcp_illinois and upcoming tcp_cdg require 64bit alignment of
icsk_ca_priv

x86 does not care, but other architectures might.

Fixes: 05cbc0db03e82 ("ipv4: Create probe timer for tcp PMTU as per RFC4821")
Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Fan Du <fan.du@intel.com>
Acked-by: Fan Du <fan.du@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/net/inet_connection_sock.h | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h
index 48a815823587..497bc14cdb85 100644
--- a/include/net/inet_connection_sock.h
+++ b/include/net/inet_connection_sock.h
@@ -129,9 +129,10 @@ struct inet_connection_sock {
 
 		u32		  probe_timestamp;
 	} icsk_mtup;
-	u32			  icsk_ca_priv[16];
 	u32			  icsk_user_timeout;
-#define ICSK_CA_PRIV_SIZE	(16 * sizeof(u32))
+
+	u64			  icsk_ca_priv[64 / sizeof(u64)];
+#define ICSK_CA_PRIV_SIZE      (8 * sizeof(u64))
 };
 
 #define ICSK_TIME_RETRANS	1	/* Retransmit timer */

From 33b4b015e1a1ca7a8fdce40af5e71642a8ea355c Mon Sep 17 00:00:00 2001
From: Henning Rogge <hrogge@gmail.com>
Date: Mon, 18 May 2015 21:08:49 +0200
Subject: [PATCH 25/44] net/ipv6/udp: Fix ipv6 multicast socket filter
 regression

Commit <5cf3d46192fc> ("udp: Simplify__udp*_lib_mcast_deliver")
simplified the filter for incoming IPv6 multicast but removed
the check of the local socket address and the UDP destination
address.

This patch restores the filter to prevent sockets bound to a IPv6
multicast IP to receive other UDP traffic link unicast.

Signed-off-by: Henning Rogge <hrogge@gmail.com>
Fixes: 5cf3d46192fc ("udp: Simplify__udp*_lib_mcast_deliver")
Cc: "David S. Miller" <davem@davemloft.net>
Acked-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/udp.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index 3477c919fcc8..c2ec41617a35 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -731,7 +731,9 @@ static bool __udp_v6_is_mcast_sock(struct net *net, struct sock *sk,
 	    (inet->inet_dport && inet->inet_dport != rmt_port) ||
 	    (!ipv6_addr_any(&sk->sk_v6_daddr) &&
 		    !ipv6_addr_equal(&sk->sk_v6_daddr, rmt_addr)) ||
-	    (sk->sk_bound_dev_if && sk->sk_bound_dev_if != dif))
+	    (sk->sk_bound_dev_if && sk->sk_bound_dev_if != dif) ||
+	    (!ipv6_addr_any(&sk->sk_v6_rcv_saddr) &&
+		    !ipv6_addr_equal(&sk->sk_v6_rcv_saddr, loc_addr)))
 		return false;
 	if (!inet6_mc_check(sk, loc_addr, rmt_addr))
 		return false;

From da34ac7626b571d262f92b93f11eb32dd58d9c4e Mon Sep 17 00:00:00 2001
From: Yuchung Cheng <ycheng@google.com>
Date: Mon, 18 May 2015 12:31:44 -0700
Subject: [PATCH 26/44] tcp: only undo on partial ACKs in CA_Loss

Undo based on TCP timestamps should only happen on ACKs that advance
SND_UNA, according to the Eifel algorithm in RFC 3522:

Section 3.2:

  (4) If the value of the Timestamp Echo Reply field of the
      acceptable ACK's Timestamps option is smaller than the
      value of RetransmitTS, then proceed to step (5),

Section Terminology:
   We use the term 'acceptable ACK' as defined in [RFC793].  That is an
   ACK that acknowledges previously unacknowledged data.

This is because upon receiving an out-of-order packet, the receiver
returns the last timestamp that advances RCV_NXT, not the current
timestamp of the packet in the DUPACK. Without checking the flag,
the DUPACK will cause tcp_packet_delayed() to return true and
tcp_try_undo_loss() will revert cwnd reduction.

Note that we check the condition in CA_Recovery already by only
calling tcp_try_undo_partial() if FLAG_SND_UNA_ADVANCED is set or
tcp_try_undo_recovery() if snd_una crosses high_seq.

Signed-off-by: Yuchung Cheng <ycheng@google.com>
Signed-off-by: Neal Cardwell <ncardwell@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_input.c | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index bc790ea9960f..9faf775a8c4a 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -2698,11 +2698,16 @@ static void tcp_process_loss(struct sock *sk, int flag, bool is_dupack)
 	struct tcp_sock *tp = tcp_sk(sk);
 	bool recovered = !before(tp->snd_una, tp->high_seq);
 
+	if ((flag & FLAG_SND_UNA_ADVANCED) &&
+	    tcp_try_undo_loss(sk, false))
+		return;
+
 	if (tp->frto) { /* F-RTO RFC5682 sec 3.1 (sack enhanced version). */
 		/* Step 3.b. A timeout is spurious if not all data are
 		 * lost, i.e., never-retransmitted data are (s)acked.
 		 */
-		if (tcp_try_undo_loss(sk, flag & FLAG_ORIG_SACK_ACKED))
+		if ((flag & FLAG_ORIG_SACK_ACKED) &&
+		    tcp_try_undo_loss(sk, true))
 			return;
 
 		if (after(tp->snd_nxt, tp->high_seq) &&
@@ -2732,8 +2737,6 @@ static void tcp_process_loss(struct sock *sk, int flag, bool is_dupack)
 		else if (flag & FLAG_SND_UNA_ADVANCED)
 			tcp_reset_reno_sack(tp);
 	}
-	if (tcp_try_undo_loss(sk, false))
-		return;
 	tcp_xmit_retransmit_queue(sk);
 }
 

From b7b0ed910cd8450db6d98cd4361c644bb1c88412 Mon Sep 17 00:00:00 2001
From: Yuchung Cheng <ycheng@google.com>
Date: Mon, 18 May 2015 12:31:45 -0700
Subject: [PATCH 27/44] tcp: don't over-send F-RTO probes

After sending the new data packets to probe (step 2), F-RTO may
incorrectly send more probes if the next ACK advances SND_UNA and
does not sack new packet. However F-RTO RFC 5682 probes at most
once. This bug may cause sender to always send new data instead of
repairing holes, inducing longer HoL blocking on the receiver for
the application.

Signed-off-by: Yuchung Cheng <ycheng@google.com>
Signed-off-by: Neal Cardwell <ncardwell@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/tcp_input.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 9faf775a8c4a..243d674b3ef5 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -2710,9 +2710,9 @@ static void tcp_process_loss(struct sock *sk, int flag, bool is_dupack)
 		    tcp_try_undo_loss(sk, true))
 			return;
 
-		if (after(tp->snd_nxt, tp->high_seq) &&
-		    (flag & FLAG_DATA_SACKED || is_dupack)) {
-			tp->frto = 0; /* Loss was real: 2nd part of step 3.a */
+		if (after(tp->snd_nxt, tp->high_seq)) {
+			if (flag & FLAG_DATA_SACKED || is_dupack)
+				tp->frto = 0; /* Step 3.a. loss was real */
 		} else if (flag & FLAG_SND_UNA_ADVANCED && !recovered) {
 			tp->high_seq = tp->snd_nxt;
 			__tcp_push_pending_frames(sk, tcp_current_mss(sk),

From 3bfe049807c240344b407e3cfb74544927359817 Mon Sep 17 00:00:00 2001
From: Francesco Ruggeri <fruggeri@aristanetworks.com>
Date: Sun, 17 May 2015 14:30:31 -0700
Subject: [PATCH 28/44] netfilter: nfnetlink_{log,queue}: Register pernet in
 first place

nfnetlink_{log,queue}_init() register the netlink callback nf*_rcv_nl_event
before registering the pernet_subsys, but the callback relies on data
structures allocated by pernet init functions.

When nfnetlink_{log,queue} is loaded, if a netlink message is received after
the netlink callback is registered but before the pernet_subsys is registered,
the kernel will panic in the sequence

nfulnl_rcv_nl_event
  nfnl_log_pernet
    net_generic
      BUG_ON(id == 0)  where id is nfnl_log_net_id.

The panic can be easily reproduced in 4.0.3 by:

while true ;do modprobe nfnetlink_log ; rmmod nfnetlink_log ; done &
while true ;do ip netns add dummy ; ip netns del dummy ; done &

This patch moves register_pernet_subsys to earlier in nfnetlink_log_init.

Notice that the BUG_ON hit in 4.0.3 was recently removed in 2591ffd308
["netns: remove BUG_ONs from net_generic()"].

Signed-off-by: Francesco Ruggeri <fruggeri@arista.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/nfnetlink_log.c        | 19 ++++++++++---------
 net/netfilter/nfnetlink_queue_core.c | 18 +++++++++---------
 2 files changed, 19 insertions(+), 18 deletions(-)

diff --git a/net/netfilter/nfnetlink_log.c b/net/netfilter/nfnetlink_log.c
index 3ad91266c821..4ef1fae8445e 100644
--- a/net/netfilter/nfnetlink_log.c
+++ b/net/netfilter/nfnetlink_log.c
@@ -1073,7 +1073,13 @@ static struct pernet_operations nfnl_log_net_ops = {
 
 static int __init nfnetlink_log_init(void)
 {
-	int status = -ENOMEM;
+	int status;
+
+	status = register_pernet_subsys(&nfnl_log_net_ops);
+	if (status < 0) {
+		pr_err("failed to register pernet ops\n");
+		goto out;
+	}
 
 	netlink_register_notifier(&nfulnl_rtnl_notifier);
 	status = nfnetlink_subsys_register(&nfulnl_subsys);
@@ -1088,28 +1094,23 @@ static int __init nfnetlink_log_init(void)
 		goto cleanup_subsys;
 	}
 
-	status = register_pernet_subsys(&nfnl_log_net_ops);
-	if (status < 0) {
-		pr_err("failed to register pernet ops\n");
-		goto cleanup_logger;
-	}
 	return status;
 
-cleanup_logger:
-	nf_log_unregister(&nfulnl_logger);
 cleanup_subsys:
 	nfnetlink_subsys_unregister(&nfulnl_subsys);
 cleanup_netlink_notifier:
 	netlink_unregister_notifier(&nfulnl_rtnl_notifier);
+	unregister_pernet_subsys(&nfnl_log_net_ops);
+out:
 	return status;
 }
 
 static void __exit nfnetlink_log_fini(void)
 {
-	unregister_pernet_subsys(&nfnl_log_net_ops);
 	nf_log_unregister(&nfulnl_logger);
 	nfnetlink_subsys_unregister(&nfulnl_subsys);
 	netlink_unregister_notifier(&nfulnl_rtnl_notifier);
+	unregister_pernet_subsys(&nfnl_log_net_ops);
 }
 
 MODULE_DESCRIPTION("netfilter userspace logging");
diff --git a/net/netfilter/nfnetlink_queue_core.c b/net/netfilter/nfnetlink_queue_core.c
index 0b98c7420239..11c7682fa0ea 100644
--- a/net/netfilter/nfnetlink_queue_core.c
+++ b/net/netfilter/nfnetlink_queue_core.c
@@ -1317,7 +1317,13 @@ static struct pernet_operations nfnl_queue_net_ops = {
 
 static int __init nfnetlink_queue_init(void)
 {
-	int status = -ENOMEM;
+	int status;
+
+	status = register_pernet_subsys(&nfnl_queue_net_ops);
+	if (status < 0) {
+		pr_err("nf_queue: failed to register pernet ops\n");
+		goto out;
+	}
 
 	netlink_register_notifier(&nfqnl_rtnl_notifier);
 	status = nfnetlink_subsys_register(&nfqnl_subsys);
@@ -1326,19 +1332,13 @@ static int __init nfnetlink_queue_init(void)
 		goto cleanup_netlink_notifier;
 	}
 
-	status = register_pernet_subsys(&nfnl_queue_net_ops);
-	if (status < 0) {
-		pr_err("nf_queue: failed to register pernet ops\n");
-		goto cleanup_subsys;
-	}
 	register_netdevice_notifier(&nfqnl_dev_notifier);
 	nf_register_queue_handler(&nfqh);
 	return status;
 
-cleanup_subsys:
-	nfnetlink_subsys_unregister(&nfqnl_subsys);
 cleanup_netlink_notifier:
 	netlink_unregister_notifier(&nfqnl_rtnl_notifier);
+out:
 	return status;
 }
 
@@ -1346,9 +1346,9 @@ static void __exit nfnetlink_queue_fini(void)
 {
 	nf_unregister_queue_handler();
 	unregister_netdevice_notifier(&nfqnl_dev_notifier);
-	unregister_pernet_subsys(&nfnl_queue_net_ops);
 	nfnetlink_subsys_unregister(&nfqnl_subsys);
 	netlink_unregister_notifier(&nfqnl_rtnl_notifier);
+	unregister_pernet_subsys(&nfnl_queue_net_ops);
 
 	rcu_barrier(); /* Wait for completion of call_rcu()'s */
 }

From 1086bbe97a074844188c6c988fa0b1a98c3ccbb9 Mon Sep 17 00:00:00 2001
From: Dave Jones <davej@codemonkey.org.uk>
Date: Tue, 19 May 2015 20:55:17 -0400
Subject: [PATCH 29/44] netfilter: ensure number of counters is >0 in
 do_replace()

After improving setsockopt() coverage in trinity, I started triggering
vmalloc failures pretty reliably from this code path:

warn_alloc_failed+0xe9/0x140
__vmalloc_node_range+0x1be/0x270
vzalloc+0x4b/0x50
__do_replace+0x52/0x260 [ip_tables]
do_ipt_set_ctl+0x15d/0x1d0 [ip_tables]
nf_setsockopt+0x65/0x90
ip_setsockopt+0x61/0xa0
raw_setsockopt+0x16/0x60
sock_common_setsockopt+0x14/0x20
SyS_setsockopt+0x71/0xd0

It turns out we don't validate that the num_counters field in the
struct we pass in from userspace is initialized.

The same problem also exists in ebtables, arptables, ipv6, and the
compat variants.

Signed-off-by: Dave Jones <davej@codemonkey.org.uk>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/bridge/netfilter/ebtables.c | 4 ++++
 net/ipv4/netfilter/arp_tables.c | 6 ++++++
 net/ipv4/netfilter/ip_tables.c  | 6 ++++++
 net/ipv6/netfilter/ip6_tables.c | 6 ++++++
 4 files changed, 22 insertions(+)

diff --git a/net/bridge/netfilter/ebtables.c b/net/bridge/netfilter/ebtables.c
index 91180a7fc943..24c7c96bf5f8 100644
--- a/net/bridge/netfilter/ebtables.c
+++ b/net/bridge/netfilter/ebtables.c
@@ -1117,6 +1117,8 @@ static int do_replace(struct net *net, const void __user *user,
 		return -ENOMEM;
 	if (tmp.num_counters >= INT_MAX / sizeof(struct ebt_counter))
 		return -ENOMEM;
+	if (tmp.num_counters == 0)
+		return -EINVAL;
 
 	tmp.name[sizeof(tmp.name) - 1] = 0;
 
@@ -2159,6 +2161,8 @@ static int compat_copy_ebt_replace_from_user(struct ebt_replace *repl,
 		return -ENOMEM;
 	if (tmp.num_counters >= INT_MAX / sizeof(struct ebt_counter))
 		return -ENOMEM;
+	if (tmp.num_counters == 0)
+		return -EINVAL;
 
 	memcpy(repl, &tmp, offsetof(struct ebt_replace, hook_entry));
 
diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c
index 13bfe84bf3ca..a61200754f4b 100644
--- a/net/ipv4/netfilter/arp_tables.c
+++ b/net/ipv4/netfilter/arp_tables.c
@@ -1075,6 +1075,9 @@ static int do_replace(struct net *net, const void __user *user,
 	/* overflow check */
 	if (tmp.num_counters >= INT_MAX / sizeof(struct xt_counters))
 		return -ENOMEM;
+	if (tmp.num_counters == 0)
+		return -EINVAL;
+
 	tmp.name[sizeof(tmp.name)-1] = 0;
 
 	newinfo = xt_alloc_table_info(tmp.size);
@@ -1499,6 +1502,9 @@ static int compat_do_replace(struct net *net, void __user *user,
 		return -ENOMEM;
 	if (tmp.num_counters >= INT_MAX / sizeof(struct xt_counters))
 		return -ENOMEM;
+	if (tmp.num_counters == 0)
+		return -EINVAL;
+
 	tmp.name[sizeof(tmp.name)-1] = 0;
 
 	newinfo = xt_alloc_table_info(tmp.size);
diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c
index c69db7fa25ee..2d0e265fef6e 100644
--- a/net/ipv4/netfilter/ip_tables.c
+++ b/net/ipv4/netfilter/ip_tables.c
@@ -1262,6 +1262,9 @@ do_replace(struct net *net, const void __user *user, unsigned int len)
 	/* overflow check */
 	if (tmp.num_counters >= INT_MAX / sizeof(struct xt_counters))
 		return -ENOMEM;
+	if (tmp.num_counters == 0)
+		return -EINVAL;
+
 	tmp.name[sizeof(tmp.name)-1] = 0;
 
 	newinfo = xt_alloc_table_info(tmp.size);
@@ -1809,6 +1812,9 @@ compat_do_replace(struct net *net, void __user *user, unsigned int len)
 		return -ENOMEM;
 	if (tmp.num_counters >= INT_MAX / sizeof(struct xt_counters))
 		return -ENOMEM;
+	if (tmp.num_counters == 0)
+		return -EINVAL;
+
 	tmp.name[sizeof(tmp.name)-1] = 0;
 
 	newinfo = xt_alloc_table_info(tmp.size);
diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c
index 1a732a1d3c8e..62f5b0d0bc9b 100644
--- a/net/ipv6/netfilter/ip6_tables.c
+++ b/net/ipv6/netfilter/ip6_tables.c
@@ -1275,6 +1275,9 @@ do_replace(struct net *net, const void __user *user, unsigned int len)
 	/* overflow check */
 	if (tmp.num_counters >= INT_MAX / sizeof(struct xt_counters))
 		return -ENOMEM;
+	if (tmp.num_counters == 0)
+		return -EINVAL;
+
 	tmp.name[sizeof(tmp.name)-1] = 0;
 
 	newinfo = xt_alloc_table_info(tmp.size);
@@ -1822,6 +1825,9 @@ compat_do_replace(struct net *net, void __user *user, unsigned int len)
 		return -ENOMEM;
 	if (tmp.num_counters >= INT_MAX / sizeof(struct xt_counters))
 		return -ENOMEM;
+	if (tmp.num_counters == 0)
+		return -EINVAL;
+
 	tmp.name[sizeof(tmp.name)-1] = 0;
 
 	newinfo = xt_alloc_table_info(tmp.size);

From faecbb45ebefb20260ad4a631e011e93c896cb73 Mon Sep 17 00:00:00 2001
From: Florian Westphal <fw@strlen.de>
Date: Wed, 20 May 2015 13:42:25 +0200
Subject: [PATCH 30/44] Revert "netfilter: bridge: query conntrack about skb
 dnat"

This reverts commit c055d5b03bb4cb69d349d787c9787c0383abd8b2.

There are two issues:
'dnat_took_place' made me think that this is related to
-j DNAT/MASQUERADE.

But thats only one part of the story.  This is also relevant for SNAT
when we undo snat translation in reverse/reply direction.

Furthermore, I originally wanted to do this mainly to avoid
storing ipv6 addresses once we make DNAT/REDIRECT work
for ipv6 on bridges.

However, I forgot about SNPT/DNPT which is stateless.

So we can't escape storing address for ipv6 anyway. Might as
well do it for ipv4 too.

Reported-and-tested-by: Bernhard Thaler <bernhard.thaler@wvnet.at>
Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/linux/skbuff.h    |  1 +
 net/bridge/br_netfilter.c | 27 +++++++++------------------
 2 files changed, 10 insertions(+), 18 deletions(-)

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 66e374d62f64..f15154a879c7 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -176,6 +176,7 @@ struct nf_bridge_info {
 	struct net_device	*physindev;
 	struct net_device	*physoutdev;
 	char			neigh_header[8];
+	__be32			ipv4_daddr;
 };
 #endif
 
diff --git a/net/bridge/br_netfilter.c b/net/bridge/br_netfilter.c
index ab55e2472beb..60ddfbeb47f5 100644
--- a/net/bridge/br_netfilter.c
+++ b/net/bridge/br_netfilter.c
@@ -37,10 +37,6 @@
 #include <net/route.h>
 #include <net/netfilter/br_netfilter.h>
 
-#if IS_ENABLED(CONFIG_NF_CONNTRACK)
-#include <net/netfilter/nf_conntrack.h>
-#endif
-
 #include <asm/uaccess.h>
 #include "br_private.h"
 #ifdef CONFIG_SYSCTL
@@ -350,24 +346,15 @@ static int br_nf_pre_routing_finish_bridge(struct sock *sk, struct sk_buff *skb)
 	return 0;
 }
 
-static bool dnat_took_place(const struct sk_buff *skb)
+static bool daddr_was_changed(const struct sk_buff *skb,
+			      const struct nf_bridge_info *nf_bridge)
 {
-#if IS_ENABLED(CONFIG_NF_CONNTRACK)
-	enum ip_conntrack_info ctinfo;
-	struct nf_conn *ct;
-
-	ct = nf_ct_get(skb, &ctinfo);
-	if (!ct || nf_ct_is_untracked(ct))
-		return false;
-
-	return test_bit(IPS_DST_NAT_BIT, &ct->status);
-#else
-	return false;
-#endif
+	return ip_hdr(skb)->daddr != nf_bridge->ipv4_daddr;
 }
 
 /* This requires some explaining. If DNAT has taken place,
  * we will need to fix up the destination Ethernet address.
+ * This is also true when SNAT takes place (for the reply direction).
  *
  * There are two cases to consider:
  * 1. The packet was DNAT'ed to a device in the same bridge
@@ -421,7 +408,7 @@ static int br_nf_pre_routing_finish(struct sock *sk, struct sk_buff *skb)
 		nf_bridge->pkt_otherhost = false;
 	}
 	nf_bridge->mask ^= BRNF_NF_BRIDGE_PREROUTING;
-	if (dnat_took_place(skb)) {
+	if (daddr_was_changed(skb, nf_bridge)) {
 		if ((err = ip_route_input(skb, iph->daddr, iph->saddr, iph->tos, dev))) {
 			struct in_device *in_dev = __in_dev_get_rcu(dev);
 
@@ -632,6 +619,7 @@ static unsigned int br_nf_pre_routing(const struct nf_hook_ops *ops,
 				      struct sk_buff *skb,
 				      const struct nf_hook_state *state)
 {
+	struct nf_bridge_info *nf_bridge;
 	struct net_bridge_port *p;
 	struct net_bridge *br;
 	__u32 len = nf_bridge_encap_header_len(skb);
@@ -669,6 +657,9 @@ static unsigned int br_nf_pre_routing(const struct nf_hook_ops *ops,
 	if (!setup_pre_routing(skb))
 		return NF_DROP;
 
+	nf_bridge = nf_bridge_info_get(skb);
+	nf_bridge->ipv4_daddr = ip_hdr(skb)->daddr;
+
 	skb->protocol = htons(ETH_P_IP);
 
 	NF_HOOK(NFPROTO_IPV4, NF_INET_PRE_ROUTING, state->sk, skb,

From 35f1b4e96b9258a3668872b1139c51e5a23eb876 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Michal=20Kube=C4=8Dek?= <mkubecek@suse.cz>
Date: Mon, 18 May 2015 20:53:55 +0200
Subject: [PATCH 31/44] ipv6: do not delete previously existing ECMP routes if
 add fails

If adding a nexthop of an IPv6 multipath route fails, comment in
ip6_route_multipath() says we are going to delete all nexthops already
added. However, current implementation deletes even the routes it
hasn't even tried to add yet. For example, running

  ip route add 1234:5678::/64 \
      nexthop via fe80::aa dev dummy1 \
      nexthop via fe80::bb dev dummy1 \
      nexthop via fe80::cc dev dummy1

twice results in removing all routes first command added.

Limit the second (delete) run to nexthops that succeeded in the first
(add) run.

Fixes: 51ebd3181572 ("ipv6: add support of equal cost multipath (ECMP)")
Signed-off-by: Michal Kubecek <mkubecek@suse.cz>
Acked-by: Nicolas Dichtel <nicolas.dichtel@6wind.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/route.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index d3588885f097..3821a3517478 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -2504,9 +2504,9 @@ static int ip6_route_multipath(struct fib6_config *cfg, int add)
 	int attrlen;
 	int err = 0, last_err = 0;
 
+	remaining = cfg->fc_mp_len;
 beginning:
 	rtnh = (struct rtnexthop *)cfg->fc_mp;
-	remaining = cfg->fc_mp_len;
 
 	/* Parse a Multipath Entry */
 	while (rtnh_ok(rtnh, remaining)) {
@@ -2536,6 +2536,7 @@ static int ip6_route_multipath(struct fib6_config *cfg, int add)
 				 * next hops that have been already added.
 				 */
 				add = 0;
+				remaining = cfg->fc_mp_len - remaining;
 				goto beginning;
 			}
 		}

From 27596472473a02cfef2908a6bcda7e55264ba6b7 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Michal=20Kube=C4=8Dek?= <mkubecek@suse.cz>
Date: Mon, 18 May 2015 20:54:00 +0200
Subject: [PATCH 32/44] ipv6: fix ECMP route replacement

When replacing an IPv6 multipath route with "ip route replace", i.e.
NLM_F_CREATE | NLM_F_REPLACE, fib6_add_rt2node() replaces only first
matching route without fixing its siblings, resulting in corrupted
siblings linked list; removing one of the siblings can then end in an
infinite loop.

IPv6 ECMP implementation is a bit different from IPv4 so that route
replacement cannot work in exactly the same way. This should be a
reasonable approximation:

1. If the new route is ECMP-able and there is a matching ECMP-able one
already, replace it and all its siblings (if any).

2. If the new route is ECMP-able and no matching ECMP-able route exists,
replace first matching non-ECMP-able (if any) or just add the new one.

3. If the new route is not ECMP-able, replace first matching
non-ECMP-able route (if any) or add the new route.

We also need to remove the NLM_F_REPLACE flag after replacing old
route(s) by first nexthop of an ECMP route so that each subsequent
nexthop does not replace previous one.

Fixes: 51ebd3181572 ("ipv6: add support of equal cost multipath (ECMP)")
Signed-off-by: Michal Kubecek <mkubecek@suse.cz>
Acked-by: Nicolas Dichtel <nicolas.dichtel@6wind.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv6/ip6_fib.c | 39 +++++++++++++++++++++++++++++++++++++--
 net/ipv6/route.c   | 11 +++++++----
 2 files changed, 44 insertions(+), 6 deletions(-)

diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c
index 96dbffff5a24..bde57b113009 100644
--- a/net/ipv6/ip6_fib.c
+++ b/net/ipv6/ip6_fib.c
@@ -693,6 +693,7 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct rt6_info *rt,
 {
 	struct rt6_info *iter = NULL;
 	struct rt6_info **ins;
+	struct rt6_info **fallback_ins = NULL;
 	int replace = (info->nlh &&
 		       (info->nlh->nlmsg_flags & NLM_F_REPLACE));
 	int add = (!info->nlh ||
@@ -716,8 +717,13 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct rt6_info *rt,
 			    (info->nlh->nlmsg_flags & NLM_F_EXCL))
 				return -EEXIST;
 			if (replace) {
-				found++;
-				break;
+				if (rt_can_ecmp == rt6_qualify_for_ecmp(iter)) {
+					found++;
+					break;
+				}
+				if (rt_can_ecmp)
+					fallback_ins = fallback_ins ?: ins;
+				goto next_iter;
 			}
 
 			if (iter->dst.dev == rt->dst.dev &&
@@ -753,9 +759,17 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct rt6_info *rt,
 		if (iter->rt6i_metric > rt->rt6i_metric)
 			break;
 
+next_iter:
 		ins = &iter->dst.rt6_next;
 	}
 
+	if (fallback_ins && !found) {
+		/* No ECMP-able route found, replace first non-ECMP one */
+		ins = fallback_ins;
+		iter = *ins;
+		found++;
+	}
+
 	/* Reset round-robin state, if necessary */
 	if (ins == &fn->leaf)
 		fn->rr_ptr = NULL;
@@ -815,6 +829,8 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct rt6_info *rt,
 		}
 
 	} else {
+		int nsiblings;
+
 		if (!found) {
 			if (add)
 				goto add;
@@ -835,8 +851,27 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct rt6_info *rt,
 			info->nl_net->ipv6.rt6_stats->fib_route_nodes++;
 			fn->fn_flags |= RTN_RTINFO;
 		}
+		nsiblings = iter->rt6i_nsiblings;
 		fib6_purge_rt(iter, fn, info->nl_net);
 		rt6_release(iter);
+
+		if (nsiblings) {
+			/* Replacing an ECMP route, remove all siblings */
+			ins = &rt->dst.rt6_next;
+			iter = *ins;
+			while (iter) {
+				if (rt6_qualify_for_ecmp(iter)) {
+					*ins = iter->dst.rt6_next;
+					fib6_purge_rt(iter, fn, info->nl_net);
+					rt6_release(iter);
+					nsiblings--;
+				} else {
+					ins = &iter->dst.rt6_next;
+				}
+				iter = *ins;
+			}
+			WARN_ON(nsiblings != 0);
+		}
 	}
 
 	return 0;
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 3821a3517478..c73ae5039e46 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -2541,11 +2541,14 @@ static int ip6_route_multipath(struct fib6_config *cfg, int add)
 			}
 		}
 		/* Because each route is added like a single route we remove
-		 * this flag after the first nexthop (if there is a collision,
-		 * we have already fail to add the first nexthop:
-		 * fib6_add_rt2node() has reject it).
+		 * these flags after the first nexthop: if there is a collision,
+		 * we have already failed to add the first nexthop:
+		 * fib6_add_rt2node() has rejected it; when replacing, old
+		 * nexthops have been replaced by first new, the rest should
+		 * be added to it.
 		 */
-		cfg->fc_nlinfo.nlh->nlmsg_flags &= ~NLM_F_EXCL;
+		cfg->fc_nlinfo.nlh->nlmsg_flags &= ~(NLM_F_EXCL |
+						     NLM_F_REPLACE);
 		rtnh = rtnh_next(rtnh, &remaining);
 	}
 

From c15e10e71ce3b4ee78d85d80102a9621cde1edbd Mon Sep 17 00:00:00 2001
From: Tim Beale <tim.beale@alliedtelesis.co.nz>
Date: Mon, 18 May 2015 15:38:38 +1200
Subject: [PATCH 33/44] net: phy: Make sure phy_start() always re-enables the
 phy interrupts

This is an alternative way of fixing:
 commit db9683fb412d ("net: phy: Make sure PHY_RESUMING state change
                      is always processed")

When the PHY state transitions from PHY_HALTED to PHY_RESUMING, there are
two things we need to do:
1). Re-enable interrupts (and power up the physical link, if powered down)
2). Update the PHY state and net-device based on the link status.

There's no strict reason why #1 has to be done from within the main
phy_state_machine() function. There is a risk that other changes to the
PHY (e.g. setting speed/duplex, which calls phy_start_aneg()) could cause
a subsequent state transition before phy_state_machine() has processed
the PHY_RESUMING state change. This would leave the PHY with interrupts
disabled and/or still in the BMCR_PDOWN/low-power mode.

Moving enabling the interrupts and phy_resume() into phy_start() will
guarantee this work always gets done. As the PHY is already in the HALTED
state and interrupts are disabled, it shouldn't conflict with any work
being done in phy_state_machine(). The downside of this change is that if
the PHY_RESUMING state is ever entered from anywhere else, it'll also have
to repeat this work.

Signed-off-by: Tim Beale <tim.beale@alliedtelesis.co.nz>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/phy/phy.c | 29 ++++++++++++++++-------------
 1 file changed, 16 insertions(+), 13 deletions(-)

diff --git a/drivers/net/phy/phy.c b/drivers/net/phy/phy.c
index 710696d1af97..47cd578052fc 100644
--- a/drivers/net/phy/phy.c
+++ b/drivers/net/phy/phy.c
@@ -465,7 +465,7 @@ int phy_start_aneg(struct phy_device *phydev)
 	if (err < 0)
 		goto out_unlock;
 
-	if (phydev->state != PHY_HALTED && phydev->state != PHY_RESUMING) {
+	if (phydev->state != PHY_HALTED) {
 		if (AUTONEG_ENABLE == phydev->autoneg) {
 			phydev->state = PHY_AN;
 			phydev->link_timeout = PHY_AN_TIMEOUT;
@@ -742,6 +742,9 @@ EXPORT_SYMBOL(phy_stop);
  */
 void phy_start(struct phy_device *phydev)
 {
+	bool do_resume = false;
+	int err = 0;
+
 	mutex_lock(&phydev->lock);
 
 	switch (phydev->state) {
@@ -752,11 +755,22 @@ void phy_start(struct phy_device *phydev)
 		phydev->state = PHY_UP;
 		break;
 	case PHY_HALTED:
+		/* make sure interrupts are re-enabled for the PHY */
+		err = phy_enable_interrupts(phydev);
+		if (err < 0)
+			break;
+
 		phydev->state = PHY_RESUMING;
+		do_resume = true;
+		break;
 	default:
 		break;
 	}
 	mutex_unlock(&phydev->lock);
+
+	/* if phy was suspended, bring the physical link up again */
+	if (do_resume)
+		phy_resume(phydev);
 }
 EXPORT_SYMBOL(phy_start);
 
@@ -769,7 +783,7 @@ void phy_state_machine(struct work_struct *work)
 	struct delayed_work *dwork = to_delayed_work(work);
 	struct phy_device *phydev =
 			container_of(dwork, struct phy_device, state_queue);
-	bool needs_aneg = false, do_suspend = false, do_resume = false;
+	bool needs_aneg = false, do_suspend = false;
 	int err = 0;
 
 	mutex_lock(&phydev->lock);
@@ -888,14 +902,6 @@ void phy_state_machine(struct work_struct *work)
 		}
 		break;
 	case PHY_RESUMING:
-		err = phy_clear_interrupt(phydev);
-		if (err)
-			break;
-
-		err = phy_config_interrupt(phydev, PHY_INTERRUPT_ENABLED);
-		if (err)
-			break;
-
 		if (AUTONEG_ENABLE == phydev->autoneg) {
 			err = phy_aneg_done(phydev);
 			if (err < 0)
@@ -933,7 +939,6 @@ void phy_state_machine(struct work_struct *work)
 			}
 			phydev->adjust_link(phydev->attached_dev);
 		}
-		do_resume = true;
 		break;
 	}
 
@@ -943,8 +948,6 @@ void phy_state_machine(struct work_struct *work)
 		err = phy_start_aneg(phydev);
 	else if (do_suspend)
 		phy_suspend(phydev);
-	else if (do_resume)
-		phy_resume(phydev);
 
 	if (err < 0)
 		phy_error(phydev);

From c78e1746d3ad7d548bdf3fe491898cc453911a49 Mon Sep 17 00:00:00 2001
From: Daniel Borkmann <daniel@iogearbox.net>
Date: Wed, 20 May 2015 17:13:33 +0200
Subject: [PATCH 34/44] net: sched: fix call_rcu() race on classifier module
 unloads

Vijay reported that a loop as simple as ...

  while true; do
    tc qdisc add dev foo root handle 1: prio
    tc filter add dev foo parent 1: u32 match u32 0 0  flowid 1
    tc qdisc del dev foo root
    rmmod cls_u32
  done

... will panic the kernel. Moreover, he bisected the change
apparently introducing it to 78fd1d0ab072 ("netlink: Re-add
locking to netlink_lookup() and seq walker").

The removal of synchronize_net() from the netlink socket
triggering the qdisc to be removed, seems to have uncovered
an RCU resp. module reference count race from the tc API.
Given that RCU conversion was done after e341694e3eb5 ("netlink:
Convert netlink_lookup() to use RCU protected hash table")
which added the synchronize_net() originally, occasion of
hitting the bug was less likely (not impossible though):

When qdiscs that i) support attaching classifiers and,
ii) have at least one of them attached, get deleted, they
invoke tcf_destroy_chain(), and thus call into ->destroy()
handler from a classifier module.

After RCU conversion, all classifier that have an internal
prio list, unlink them and initiate freeing via call_rcu()
deferral.

Meanhile, tcf_destroy() releases already reference to the
tp->ops->owner module before the queued RCU callback handler
has been invoked.

Subsequent rmmod on the classifier module is then not prevented
since all module references are already dropped.

By the time, the kernel invokes the RCU callback handler from
the module, that function address is then invalid.

One way to fix it would be to add an rcu_barrier() to
unregister_tcf_proto_ops() to wait for all pending call_rcu()s
to complete.

synchronize_rcu() is not appropriate as under heavy RCU
callback load, registered call_rcu()s could be deferred
longer than a grace period. In case we don't have any pending
call_rcu()s, the barrier is allowed to return immediately.

Since we came here via unregister_tcf_proto_ops(), there
are no users of a given classifier anymore. Further nested
call_rcu()s pointing into the module space are not being
done anywhere.

Only cls_bpf_delete_prog() may schedule a work item, to
unlock pages eventually, but that is not in the range/context
of cls_bpf anymore.

Fixes: 25d8c0d55f24 ("net: rcu-ify tcf_proto")
Fixes: 9888faefe132 ("net: sched: cls_basic use RCU")
Reported-by: Vijay Subramanian <subramanian.vijay@gmail.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Cc: John Fastabend <john.r.fastabend@intel.com>
Cc: Eric Dumazet <edumazet@google.com>
Cc: Thomas Graf <tgraf@suug.ch>
Cc: Jamal Hadi Salim <jhs@mojatatu.com>
Cc: Alexei Starovoitov <ast@plumgrid.com>
Tested-by: Vijay Subramanian <subramanian.vijay@gmail.com>
Acked-by: Alexei Starovoitov <ast@plumgrid.com>
Acked-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/sched/cls_api.c | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c
index b6ef9a04de06..a75864d93142 100644
--- a/net/sched/cls_api.c
+++ b/net/sched/cls_api.c
@@ -81,6 +81,11 @@ int unregister_tcf_proto_ops(struct tcf_proto_ops *ops)
 	struct tcf_proto_ops *t;
 	int rc = -ENOENT;
 
+	/* Wait for outstanding call_rcu()s, if any, from a
+	 * tcf_proto_ops's destroy() handler.
+	 */
+	rcu_barrier();
+
 	write_lock(&cls_mod_lock);
 	list_for_each_entry(t, &tcf_proto_base, head) {
 		if (t == ops) {

From d654976cbf852ee20612ee10dbe57cdacda9f452 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Thu, 21 May 2015 21:51:19 -0700
Subject: [PATCH 35/44] tcp: fix a potential deadlock in tcp_get_info()

Taking socket spinlock in tcp_get_info() can deadlock, as
inet_diag_dump_icsk() holds the &hashinfo->ehash_locks[i],
while packet processing can use the reverse locking order.

We could avoid this locking for TCP_LISTEN states, but lockdep would
certainly get confused as all TCP sockets share same lockdep classes.

[  523.722504] ======================================================
[  523.728706] [ INFO: possible circular locking dependency detected ]
[  523.734990] 4.1.0-dbg-DEV #1676 Not tainted
[  523.739202] -------------------------------------------------------
[  523.745474] ss/18032 is trying to acquire lock:
[  523.750002]  (slock-AF_INET){+.-...}, at: [<ffffffff81669d44>] tcp_get_info+0x2c4/0x360
[  523.758129]
[  523.758129] but task is already holding lock:
[  523.763968]  (&(&hashinfo->ehash_locks[i])->rlock){+.-...}, at: [<ffffffff816bcb75>] inet_diag_dump_icsk+0x1d5/0x6c0
[  523.774661]
[  523.774661] which lock already depends on the new lock.
[  523.774661]
[  523.782850]
[  523.782850] the existing dependency chain (in reverse order) is:
[  523.790326]
-> #1 (&(&hashinfo->ehash_locks[i])->rlock){+.-...}:
[  523.796599]        [<ffffffff811126bb>] lock_acquire+0xbb/0x270
[  523.802565]        [<ffffffff816f5868>] _raw_spin_lock+0x38/0x50
[  523.808628]        [<ffffffff81665af8>] __inet_hash_nolisten+0x78/0x110
[  523.815273]        [<ffffffff816819db>] tcp_v4_syn_recv_sock+0x24b/0x350
[  523.822067]        [<ffffffff81684d41>] tcp_check_req+0x3c1/0x500
[  523.828199]        [<ffffffff81682d09>] tcp_v4_do_rcv+0x239/0x3d0
[  523.834331]        [<ffffffff816842fe>] tcp_v4_rcv+0xa8e/0xc10
[  523.840202]        [<ffffffff81658fa3>] ip_local_deliver_finish+0x133/0x3e0
[  523.847214]        [<ffffffff81659a9a>] ip_local_deliver+0xaa/0xc0
[  523.853440]        [<ffffffff816593b8>] ip_rcv_finish+0x168/0x5c0
[  523.859624]        [<ffffffff81659db7>] ip_rcv+0x307/0x420

Lets use u64_sync infrastructure instead. As a bonus, 64bit
arches get optimized, as these are nop for them.

Fixes: 0df48c26d841 ("tcp: add tcpi_bytes_acked to tcp_info")
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/tcp.h     |  2 ++
 net/ipv4/tcp.c          | 11 +++++++----
 net/ipv4/tcp_fastopen.c |  4 ++++
 net/ipv4/tcp_input.c    |  4 ++++
 4 files changed, 17 insertions(+), 4 deletions(-)

diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index 3b2911502a8c..e8bbf403618f 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -158,6 +158,8 @@ struct tcp_sock {
 				 * sum(delta(snd_una)), or how many bytes
 				 * were acked.
 				 */
+	struct u64_stats_sync syncp; /* protects 64bit vars (cf tcp_get_info()) */
+
  	u32	snd_una;	/* First byte we want an ack for	*/
  	u32	snd_sml;	/* Last byte of the most recently transmitted small packet */
 	u32	rcv_tstamp;	/* timestamp of last received ACK (for keepalives) */
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 46efa03d2b11..f1377f2a0472 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -402,6 +402,7 @@ void tcp_init_sock(struct sock *sk)
 	tp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
 	tp->snd_cwnd_clamp = ~0;
 	tp->mss_cache = TCP_MSS_DEFAULT;
+	u64_stats_init(&tp->syncp);
 
 	tp->reordering = sysctl_tcp_reordering;
 	tcp_enable_early_retrans(tp);
@@ -2598,6 +2599,7 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info)
 	const struct tcp_sock *tp = tcp_sk(sk);
 	const struct inet_connection_sock *icsk = inet_csk(sk);
 	u32 now = tcp_time_stamp;
+	unsigned int start;
 	u32 rate;
 
 	memset(info, 0, sizeof(*info));
@@ -2665,10 +2667,11 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info)
 	rate = READ_ONCE(sk->sk_max_pacing_rate);
 	info->tcpi_max_pacing_rate = rate != ~0U ? rate : ~0ULL;
 
-	spin_lock_bh(&sk->sk_lock.slock);
-	info->tcpi_bytes_acked = tp->bytes_acked;
-	info->tcpi_bytes_received = tp->bytes_received;
-	spin_unlock_bh(&sk->sk_lock.slock);
+	do {
+		start = u64_stats_fetch_begin_irq(&tp->syncp);
+		info->tcpi_bytes_acked = tp->bytes_acked;
+		info->tcpi_bytes_received = tp->bytes_received;
+	} while (u64_stats_fetch_retry_irq(&tp->syncp, start));
 }
 EXPORT_SYMBOL_GPL(tcp_get_info);
 
diff --git a/net/ipv4/tcp_fastopen.c b/net/ipv4/tcp_fastopen.c
index 3c673d5e6cff..46b087a27503 100644
--- a/net/ipv4/tcp_fastopen.c
+++ b/net/ipv4/tcp_fastopen.c
@@ -206,6 +206,10 @@ static bool tcp_fastopen_create_child(struct sock *sk,
 			skb_set_owner_r(skb2, child);
 			__skb_queue_tail(&child->sk_receive_queue, skb2);
 			tp->syn_data_acked = 1;
+
+			/* u64_stats_update_begin(&tp->syncp) not needed here,
+			 * as we certainly are not changing upper 32bit value (0)
+			 */
 			tp->bytes_received = end_seq - TCP_SKB_CB(skb)->seq - 1;
 		} else {
 			end_seq = TCP_SKB_CB(skb)->seq + 1;
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 243d674b3ef5..c9ab964189a0 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -3286,7 +3286,9 @@ static void tcp_snd_una_update(struct tcp_sock *tp, u32 ack)
 {
 	u32 delta = ack - tp->snd_una;
 
+	u64_stats_update_begin(&tp->syncp);
 	tp->bytes_acked += delta;
+	u64_stats_update_end(&tp->syncp);
 	tp->snd_una = ack;
 }
 
@@ -3295,7 +3297,9 @@ static void tcp_rcv_nxt_update(struct tcp_sock *tp, u32 seq)
 {
 	u32 delta = seq - tp->rcv_nxt;
 
+	u64_stats_update_begin(&tp->syncp);
 	tp->bytes_received += delta;
+	u64_stats_update_end(&tp->syncp);
 	tp->rcv_nxt = seq;
 }
 

From 381c759d9916c42959515ad34a6d467e24a88e93 Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Fri, 22 May 2015 04:58:12 -0500
Subject: [PATCH 36/44] ipv4: Avoid crashing in ip_error

ip_error does not check if in_dev is NULL before dereferencing it.

IThe following sequence of calls is possible:
CPU A                          CPU B
ip_rcv_finish
    ip_route_input_noref()
        ip_route_input_slow()
                               inetdev_destroy()
    dst_input()

With the result that a network device can be destroyed while processing
an input packet.

A crash was triggered with only unicast packets in flight, and
forwarding enabled on the only network device.   The error condition
was created by the removal of the network device.

As such it is likely the that error code was -EHOSTUNREACH, and the
action taken by ip_error (if in_dev had been accessible) would have
been to not increment any counters and to have tried and likely failed
to send an icmp error as the network device is going away.

Therefore handle this weird case by just dropping the packet if
!in_dev.  It will result in dropping the packet sooner, and will not
result in an actual change of behavior.

Fixes: 251da4130115b ("ipv4: Cache ip_error() routes even when not forwarding.")
Reported-by: Vittorio Gambaletta <linuxbugs@vittgam.net>
Tested-by: Vittorio Gambaletta <linuxbugs@vittgam.net>
Signed-off-by: Vittorio Gambaletta <linuxbugs@vittgam.net>
Signed-off-by: "Eric W. Biederman" <ebiederm@xmission.com>
Acked-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/route.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index bff62fc87b8e..f45f2a12f37b 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -902,6 +902,10 @@ static int ip_error(struct sk_buff *skb)
 	bool send;
 	int code;
 
+	/* IP on this device is disabled. */
+	if (!in_dev)
+		goto out;
+
 	net = dev_net(rt->dst.dev);
 	if (!IN_DEV_FORWARD(in_dev)) {
 		switch (rt->dst.error) {

From 44f6731d8b68fa02f5ed65eaceac41f8c3c9279e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Bj=C3=B8rn=20Mork?= <bjorn@mork.no>
Date: Fri, 22 May 2015 13:15:22 +0200
Subject: [PATCH 37/44] cdc_ncm: Fix tx_bytes statistics
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The tx_curr_frame_payload field is u32. When we try to calculate a
small negative delta based on it, we end up with a positive integer
close to 2^32 instead.  So the tx_bytes pointer increases by about
2^32 for every transmitted frame.

Fix by calculating the delta as a signed long.

Cc: Ben Hutchings <ben.hutchings@codethink.co.uk>
Reported-by: Florian Bruhin <me@the-compiler.org>
Fixes: 7a1e890e2168 ("usbnet: Fix tx_bytes statistic running backward in cdc_ncm")
Signed-off-by: Bjørn Mork <bjorn@mork.no>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/usb/cdc_ncm.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/usb/cdc_ncm.c b/drivers/net/usb/cdc_ncm.c
index c3e4da9e79ca..8067b8fbb0ee 100644
--- a/drivers/net/usb/cdc_ncm.c
+++ b/drivers/net/usb/cdc_ncm.c
@@ -1182,7 +1182,7 @@ cdc_ncm_fill_tx_frame(struct usbnet *dev, struct sk_buff *skb, __le32 sign)
 	 * payload data instead.
 	 */
 	usbnet_set_skb_tx_stats(skb_out, n,
-				ctx->tx_curr_frame_payload - skb_out->len);
+				(long)ctx->tx_curr_frame_payload - skb_out->len);
 
 	return skb_out;
 

From d4e64c2909231222ceba0999d921e7ac8908f656 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Michal=20Kube=C4=8Dek?= <mkubecek@suse.cz>
Date: Fri, 22 May 2015 13:40:09 +0200
Subject: [PATCH 38/44] ipv4: fill in table id when replacing a route

When replacing an IPv4 route, tb_id member of the new fib_alias
structure is not set in the replace code path so that the new route is
ignored.

Fixes: 0ddcf43d5d4a ("ipv4: FIB Local/MAIN table collapse")
Signed-off-by: Michal Kubecek <mkubecek@suse.cz>
Acked-by: Alexander Duyck <alexander.h.duyck@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/ipv4/fib_trie.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
index 64c2076ced54..09b62e17dd8c 100644
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -1164,6 +1164,7 @@ int fib_table_insert(struct fib_table *tb, struct fib_config *cfg)
 			state = fa->fa_state;
 			new_fa->fa_state = state & ~FA_S_ACCESSED;
 			new_fa->fa_slen = fa->fa_slen;
+			new_fa->tb_id = tb->tb_id;
 
 			err = netdev_switch_fib_ipv4_add(key, plen, fi,
 							 new_fa->fa_tos,

From 6db99596d11a99f59e8f8f9353b22ef76e31ad6a Mon Sep 17 00:00:00 2001
From: Nathan Sullivan <nathan.sullivan@ni.com>
Date: Fri, 22 May 2015 09:22:09 -0500
Subject: [PATCH 39/44] net: macb: Document zynq gem dt binding

Signed-off-by: Nathan Sullivan <nathan.sullivan@ni.com>
Acked-by: Nicolas Ferre <nicolas.ferre@atmel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 Documentation/devicetree/bindings/net/cdns-emac.txt | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/Documentation/devicetree/bindings/net/cdns-emac.txt b/Documentation/devicetree/bindings/net/cdns-emac.txt
index abd67c13d344..4451ee973223 100644
--- a/Documentation/devicetree/bindings/net/cdns-emac.txt
+++ b/Documentation/devicetree/bindings/net/cdns-emac.txt
@@ -3,7 +3,8 @@
 Required properties:
 - compatible: Should be "cdns,[<chip>-]{emac}"
   Use "cdns,at91rm9200-emac" Atmel at91rm9200 SoC.
-  or the generic form: "cdns,emac".
+  Use "cdns,zynq-gem" Xilinx Zynq-7xxx SoC.
+  Or the generic form: "cdns,emac".
 - reg: Address and length of the register set for the device
 - interrupts: Should contain macb interrupt
 - phy-mode: see ethernet.txt file in the same directory.

From 222ca8e0c183309081bdb0fd5827fc2a922be3ad Mon Sep 17 00:00:00 2001
From: Nathan Sullivan <nathan.sullivan@ni.com>
Date: Fri, 22 May 2015 09:22:10 -0500
Subject: [PATCH 40/44] net: macb: Disable half duplex gigabit on Zynq

According to the Zynq TRM, gigabit half duplex is not supported.  Add a
new cap and compatible string so Zynq can avoid advertising that mode.

Signed-off-by: Nathan Sullivan <nathan.sullivan@ni.com>
Acked-by: Nicolas Ferre <nicolas.ferre@atmel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 drivers/net/ethernet/cadence/macb.c | 12 ++++++++++++
 drivers/net/ethernet/cadence/macb.h |  1 +
 2 files changed, 13 insertions(+)

diff --git a/drivers/net/ethernet/cadence/macb.c b/drivers/net/ethernet/cadence/macb.c
index 5f10dfc631d7..fc646a41d548 100644
--- a/drivers/net/ethernet/cadence/macb.c
+++ b/drivers/net/ethernet/cadence/macb.c
@@ -350,6 +350,9 @@ static int macb_mii_probe(struct net_device *dev)
 	else
 		phydev->supported &= PHY_BASIC_FEATURES;
 
+	if (bp->caps & MACB_CAPS_NO_GIGABIT_HALF)
+		phydev->supported &= ~SUPPORTED_1000baseT_Half;
+
 	phydev->advertising = phydev->supported;
 
 	bp->link = 0;
@@ -2699,6 +2702,14 @@ static const struct macb_config emac_config = {
 	.init = at91ether_init,
 };
 
+static const struct macb_config zynq_config = {
+	.caps = MACB_CAPS_SG_DISABLED | MACB_CAPS_GIGABIT_MODE_AVAILABLE |
+		MACB_CAPS_NO_GIGABIT_HALF,
+	.dma_burst_length = 16,
+	.clk_init = macb_clk_init,
+	.init = macb_init,
+};
+
 static const struct of_device_id macb_dt_ids[] = {
 	{ .compatible = "cdns,at32ap7000-macb" },
 	{ .compatible = "cdns,at91sam9260-macb", .data = &at91sam9260_config },
@@ -2709,6 +2720,7 @@ static const struct of_device_id macb_dt_ids[] = {
 	{ .compatible = "atmel,sama5d4-gem", .data = &sama5d4_config },
 	{ .compatible = "cdns,at91rm9200-emac", .data = &emac_config },
 	{ .compatible = "cdns,emac", .data = &emac_config },
+	{ .compatible = "cdns,zynq-gem", .data = &zynq_config },
 	{ /* sentinel */ }
 };
 MODULE_DEVICE_TABLE(of, macb_dt_ids);
diff --git a/drivers/net/ethernet/cadence/macb.h b/drivers/net/ethernet/cadence/macb.h
index eb7d76f7bf6a..24b1d9bcd865 100644
--- a/drivers/net/ethernet/cadence/macb.h
+++ b/drivers/net/ethernet/cadence/macb.h
@@ -393,6 +393,7 @@
 #define MACB_CAPS_ISR_CLEAR_ON_WRITE		0x00000001
 #define MACB_CAPS_USRIO_HAS_CLKEN		0x00000002
 #define MACB_CAPS_USRIO_DEFAULT_IS_MII		0x00000004
+#define MACB_CAPS_NO_GIGABIT_HALF		0x00000008
 #define MACB_CAPS_FIFO_MODE			0x10000000
 #define MACB_CAPS_GIGABIT_MODE_AVAILABLE	0x20000000
 #define MACB_CAPS_SG_DISABLED			0x40000000

From 9eeb5161397a54580a630c99841b5badb07cf246 Mon Sep 17 00:00:00 2001
From: Nathan Sullivan <nathan.sullivan@ni.com>
Date: Fri, 22 May 2015 09:22:11 -0500
Subject: [PATCH 41/44] ARM: zynq: DT: Use the zynq binding with macb

Use the new zynq binding for macb ethernet, since it will disable half
duplex gigabit like the Zynq TRM says to do.

Signed-off-by: Nathan Sullivan <nathan.sullivan@ni.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 arch/arm/boot/dts/zynq-7000.dtsi | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/arm/boot/dts/zynq-7000.dtsi b/arch/arm/boot/dts/zynq-7000.dtsi
index a5cd2eda3edf..9ea54b3dba09 100644
--- a/arch/arm/boot/dts/zynq-7000.dtsi
+++ b/arch/arm/boot/dts/zynq-7000.dtsi
@@ -193,7 +193,7 @@
 		};
 
 		gem0: ethernet@e000b000 {
-			compatible = "cdns,gem";
+			compatible = "cdns,zynq-gem";
 			reg = <0xe000b000 0x1000>;
 			status = "disabled";
 			interrupts = <0 22 4>;
@@ -204,7 +204,7 @@
 		};
 
 		gem1: ethernet@e000c000 {
-			compatible = "cdns,gem";
+			compatible = "cdns,zynq-gem";
 			reg = <0xe000c000 0x1000>;
 			status = "disabled";
 			interrupts = <0 45 4>;

From 47cc84ce0c2fe75c99ea5963c4b5704dd78ead54 Mon Sep 17 00:00:00 2001
From: Thadeu Lima de Souza Cascardo <cascardo@redhat.com>
Date: Fri, 22 May 2015 12:18:59 -0300
Subject: [PATCH 42/44] bridge: fix parsing of MLDv2 reports

When more than a multicast address is present in a MLDv2 report, all but
the first address is ignored, because the code breaks out of the loop if
there has not been an error adding that address.

This has caused failures when two guests connected through the bridge
tried to communicate using IPv6. Neighbor discoveries would not be
transmitted to the other guest when both used a link-local address and a
static address.

This only happens when there is a MLDv2 querier in the network.

The fix will only break out of the loop when there is a failure adding a
multicast address.

The mdb before the patch:

dev ovirtmgmt port vnet0 grp ff02::1:ff7d:6603 temp
dev ovirtmgmt port vnet1 grp ff02::1:ff7d:6604 temp
dev ovirtmgmt port bond0.86 grp ff02::2 temp

After the patch:

dev ovirtmgmt port vnet0 grp ff02::1:ff7d:6603 temp
dev ovirtmgmt port vnet1 grp ff02::1:ff7d:6604 temp
dev ovirtmgmt port bond0.86 grp ff02::fb temp
dev ovirtmgmt port bond0.86 grp ff02::2 temp
dev ovirtmgmt port bond0.86 grp ff02::d temp
dev ovirtmgmt port vnet0 grp ff02::1:ff00:76 temp
dev ovirtmgmt port bond0.86 grp ff02::16 temp
dev ovirtmgmt port vnet1 grp ff02::1:ff00:77 temp
dev ovirtmgmt port bond0.86 grp ff02::1:ff00:def temp
dev ovirtmgmt port bond0.86 grp ff02::1:ffa1:40bf temp

Fixes: 08b202b67264 ("bridge br_multicast: IPv6 MLD support.")
Reported-by: Rik Theys <Rik.Theys@esat.kuleuven.be>
Signed-off-by: Thadeu Lima de Souza Cascardo <cascardo@redhat.com>
Tested-by: Rik Theys <Rik.Theys@esat.kuleuven.be>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/bridge/br_multicast.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c
index 4b6722f8f179..a3abe6ed111e 100644
--- a/net/bridge/br_multicast.c
+++ b/net/bridge/br_multicast.c
@@ -1072,7 +1072,7 @@ static int br_ip6_multicast_mld2_report(struct net_bridge *br,
 
 		err = br_ip6_multicast_add_group(br, port, &grec->grec_mca,
 						 vid);
-		if (!err)
+		if (err)
 			break;
 	}
 

From f96dee13b8e10f00840124255bed1d8b4c6afd6f Mon Sep 17 00:00:00 2001
From: Arun Parameswaran <aparames@broadcom.com>
Date: Wed, 20 May 2015 14:35:30 -0700
Subject: [PATCH 43/44] net: core: 'ethtool' issue with querying phy settings

When trying to configure the settings for PHY1, using commands
like 'ethtool -s eth0 phyad 1 speed 100', the 'ethtool' seems to
modify other settings apart from the speed of the PHY1, in the
above case.

The ethtool seems to query the settings for PHY0, and use this
as the base to apply the new settings to the PHY1. This is
causing the other settings of the PHY 1 to be wrongly
configured.

The issue is caused by the '_ethtool_get_settings()' API, which
gets called because of the 'ETHTOOL_GSET' command, is clearing
the 'cmd' pointer (of type 'struct ethtool_cmd') by calling
memset. This clears all the parameters (if any) passed for the
'ETHTOOL_GSET' cmd. So the driver's callback is always invoked
with 'cmd->phy_address' as '0'.

The '_ethtool_get_settings()' is called from other files in the
'net/core'. So the fix is applied to the 'ethtool_get_settings()'
which is only called in the context of the 'ethtool'.

Signed-off-by: Arun Parameswaran <aparames@broadcom.com>
Reviewed-by: Ray Jui <rjui@broadcom.com>
Reviewed-by: Scott Branden <sbranden@broadcom.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/core/ethtool.c | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/net/core/ethtool.c b/net/core/ethtool.c
index 1d00b8922902..1347e11f5cc9 100644
--- a/net/core/ethtool.c
+++ b/net/core/ethtool.c
@@ -359,7 +359,15 @@ static int ethtool_get_settings(struct net_device *dev, void __user *useraddr)
 	int err;
 	struct ethtool_cmd cmd;
 
-	err = __ethtool_get_settings(dev, &cmd);
+	if (!dev->ethtool_ops->get_settings)
+		return -EOPNOTSUPP;
+
+	if (copy_from_user(&cmd, useraddr, sizeof(cmd)))
+		return -EFAULT;
+
+	cmd.cmd = ETHTOOL_GSET;
+
+	err = dev->ethtool_ops->get_settings(dev, &cmd);
 	if (err < 0)
 		return err;
 

From 93a33a584e2a49a217118148125944fd02d47b54 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <edumazet@google.com>
Date: Thu, 21 May 2015 13:28:29 -0700
Subject: [PATCH 44/44] bridge: fix lockdep splat

Following lockdep splat was reported :

[   29.382286] ===============================
[   29.382315] [ INFO: suspicious RCU usage. ]
[   29.382344] 4.1.0-0.rc0.git11.1.fc23.x86_64 #1 Not tainted
[   29.382380] -------------------------------
[   29.382409] net/bridge/br_private.h:626 suspicious
rcu_dereference_check() usage!
[   29.382455]
               other info that might help us debug this:

[   29.382507]
               rcu_scheduler_active = 1, debug_locks = 0
[   29.382549] 2 locks held by swapper/0/0:
[   29.382576]  #0:  (((&p->forward_delay_timer))){+.-...}, at:
[<ffffffff81139f75>] call_timer_fn+0x5/0x4f0
[   29.382660]  #1:  (&(&br->lock)->rlock){+.-...}, at:
[<ffffffffa0450dc1>] br_forward_delay_timer_expired+0x31/0x140
[bridge]
[   29.382754]
               stack backtrace:
[   29.382787] CPU: 0 PID: 0 Comm: swapper/0 Not tainted
4.1.0-0.rc0.git11.1.fc23.x86_64 #1
[   29.382838] Hardware name: LENOVO 422916G/LENOVO, BIOS A1KT53AUS 04/07/2015
[   29.382882]  0000000000000000 3ebfc20364115825 ffff880666603c48
ffffffff81892d4b
[   29.382943]  0000000000000000 ffffffff81e124e0 ffff880666603c78
ffffffff8110bcd7
[   29.383004]  ffff8800785c9d00 ffff88065485ac58 ffff880c62002800
ffff880c5fc88ac0
[   29.383065] Call Trace:
[   29.383084]  <IRQ>  [<ffffffff81892d4b>] dump_stack+0x4c/0x65
[   29.383130]  [<ffffffff8110bcd7>] lockdep_rcu_suspicious+0xe7/0x120
[   29.383178]  [<ffffffffa04520f9>] br_fill_ifinfo+0x4a9/0x6a0 [bridge]
[   29.383225]  [<ffffffffa045266b>] br_ifinfo_notify+0x11b/0x4b0 [bridge]
[   29.383271]  [<ffffffffa0450d90>] ? br_hold_timer_expired+0x70/0x70 [bridge]
[   29.383320]  [<ffffffffa0450de8>]
br_forward_delay_timer_expired+0x58/0x140 [bridge]
[   29.383371]  [<ffffffffa0450d90>] ? br_hold_timer_expired+0x70/0x70 [bridge]
[   29.383416]  [<ffffffff8113a033>] call_timer_fn+0xc3/0x4f0
[   29.383454]  [<ffffffff81139f75>] ? call_timer_fn+0x5/0x4f0
[   29.383493]  [<ffffffff8110a90f>] ? lock_release_holdtime.part.29+0xf/0x200
[   29.383541]  [<ffffffffa0450d90>] ? br_hold_timer_expired+0x70/0x70 [bridge]
[   29.383587]  [<ffffffff8113a6a4>] run_timer_softirq+0x244/0x490
[   29.383629]  [<ffffffff810b68cc>] __do_softirq+0xec/0x670
[   29.383666]  [<ffffffff810b70d5>] irq_exit+0x145/0x150
[   29.383703]  [<ffffffff8189f506>] smp_apic_timer_interrupt+0x46/0x60
[   29.383744]  [<ffffffff8189d523>] apic_timer_interrupt+0x73/0x80
[   29.383782]  <EOI>  [<ffffffff816f131f>] ? cpuidle_enter_state+0x5f/0x2f0
[   29.383832]  [<ffffffff816f131b>] ? cpuidle_enter_state+0x5b/0x2f0

Problem here is that br_forward_delay_timer_expired() is a timer
handler, calling br_ifinfo_notify() which assumes either rcu_read_lock()
or RTNL are held.

Simplest fix seems to add rcu read lock section.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Reported-by: Josh Boyer <jwboyer@fedoraproject.org>
Reported-by: Dominick Grift <dac.override@gmail.com>
Cc: Vlad Yasevich <vyasevich@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 net/bridge/br_stp_timer.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/net/bridge/br_stp_timer.c b/net/bridge/br_stp_timer.c
index 4fcaa67750fd..7caf7fae2d5b 100644
--- a/net/bridge/br_stp_timer.c
+++ b/net/bridge/br_stp_timer.c
@@ -97,7 +97,9 @@ static void br_forward_delay_timer_expired(unsigned long arg)
 		netif_carrier_on(br->dev);
 	}
 	br_log_state(p);
+	rcu_read_lock();
 	br_ifinfo_notify(RTM_NEWLINK, p);
+	rcu_read_unlock();
 	spin_unlock(&br->lock);
 }