myri10ge: improve parity error detection and recovery
Improve myri10ge parity error detection and recovery:

1) Don't restore PCI config space to a rebooted NIC until AFTER the host is quiescent.
2) Let myri10ge_close() know the NIC is dead, so it won't waste time waiting for a dead NIC to respond to MXGEFW_CMD_ETHERNET_DOWN.
3) When the NIC is quiet (link down, or otherwise idle link), use a PCI config space read to detect a rebooted NIC. Otherwise we might never notice that a NIC rebooted.

Signed-off-by: Andrew Gallatin <gallatin@myri.com>
Signed-off-by: Brice Goglin <brice@myri.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
This commit is contained in:
parent
c9145a2df0
commit
d02342151c
1 changed files with 46 additions and 17 deletions
|
@ -75,7 +75,7 @@
|
|||
#include "myri10ge_mcp.h"
|
||||
#include "myri10ge_mcp_gen_header.h"
|
||||
|
||||
#define MYRI10GE_VERSION_STR "1.5.0-1.418"
|
||||
#define MYRI10GE_VERSION_STR "1.5.0-1.432"
|
||||
|
||||
MODULE_DESCRIPTION("Myricom 10G driver (10GbE)");
|
||||
MODULE_AUTHOR("Maintainer: help@myri.com");
|
||||
|
@ -188,6 +188,7 @@ struct myri10ge_slice_state {
|
|||
dma_addr_t fw_stats_bus;
|
||||
int watchdog_tx_done;
|
||||
int watchdog_tx_req;
|
||||
int watchdog_rx_done;
|
||||
#ifdef CONFIG_MYRI10GE_DCA
|
||||
int cached_dca_tag;
|
||||
int cpu;
|
||||
|
@ -256,6 +257,7 @@ struct myri10ge_priv {
|
|||
u32 link_changes;
|
||||
u32 msg_enable;
|
||||
unsigned int board_number;
|
||||
int rebooted;
|
||||
};
|
||||
|
||||
static char *myri10ge_fw_unaligned = "myri10ge_ethp_z8e.dat";
|
||||
|
@ -2552,17 +2554,22 @@ static int myri10ge_close(struct net_device *dev)
|
|||
netif_carrier_off(dev);
|
||||
|
||||
netif_tx_stop_all_queues(dev);
|
||||
old_down_cnt = mgp->down_cnt;
|
||||
mb();
|
||||
status = myri10ge_send_cmd(mgp, MXGEFW_CMD_ETHERNET_DOWN, &cmd, 0);
|
||||
if (status)
|
||||
printk(KERN_ERR "myri10ge: %s: Couldn't bring down link\n",
|
||||
dev->name);
|
||||
|
||||
wait_event_timeout(mgp->down_wq, old_down_cnt != mgp->down_cnt, HZ);
|
||||
if (old_down_cnt == mgp->down_cnt)
|
||||
printk(KERN_ERR "myri10ge: %s never got down irq\n", dev->name);
|
||||
if (mgp->rebooted == 0) {
|
||||
old_down_cnt = mgp->down_cnt;
|
||||
mb();
|
||||
status =
|
||||
myri10ge_send_cmd(mgp, MXGEFW_CMD_ETHERNET_DOWN, &cmd, 0);
|
||||
if (status)
|
||||
printk(KERN_ERR
|
||||
"myri10ge: %s: Couldn't bring down link\n",
|
||||
dev->name);
|
||||
|
||||
wait_event_timeout(mgp->down_wq, old_down_cnt != mgp->down_cnt,
|
||||
HZ);
|
||||
if (old_down_cnt == mgp->down_cnt)
|
||||
printk(KERN_ERR "myri10ge: %s never got down irq\n",
|
||||
dev->name);
|
||||
}
|
||||
netif_tx_disable(dev);
|
||||
myri10ge_free_irq(mgp);
|
||||
for (i = 0; i < mgp->num_slices; i++)
|
||||
|
@ -3427,12 +3434,13 @@ static void myri10ge_watchdog(struct work_struct *work)
|
|||
container_of(work, struct myri10ge_priv, watchdog_work);
|
||||
struct myri10ge_tx_buf *tx;
|
||||
u32 reboot;
|
||||
int status;
|
||||
int status, rebooted;
|
||||
int i;
|
||||
u16 cmd, vendor;
|
||||
|
||||
mgp->watchdog_resets++;
|
||||
pci_read_config_word(mgp->pdev, PCI_COMMAND, &cmd);
|
||||
rebooted = 0;
|
||||
if ((cmd & PCI_COMMAND_MASTER) == 0) {
|
||||
/* Bus master DMA disabled? Check to see
|
||||
* if the card rebooted due to a parity error
|
||||
|
@ -3444,9 +3452,12 @@ static void myri10ge_watchdog(struct work_struct *work)
|
|||
myri10ge_reset_recover ? " " : " not");
|
||||
if (myri10ge_reset_recover == 0)
|
||||
return;
|
||||
|
||||
rtnl_lock();
|
||||
mgp->rebooted = 1;
|
||||
rebooted = 1;
|
||||
myri10ge_close(mgp->dev);
|
||||
myri10ge_reset_recover--;
|
||||
|
||||
mgp->rebooted = 0;
|
||||
/*
|
||||
* A rebooted nic will come back with config space as
|
||||
* it was after power was applied to PCIe bus.
|
||||
|
@ -3494,8 +3505,10 @@ static void myri10ge_watchdog(struct work_struct *work)
|
|||
}
|
||||
}
|
||||
|
||||
rtnl_lock();
|
||||
myri10ge_close(mgp->dev);
|
||||
if (!rebooted) {
|
||||
rtnl_lock();
|
||||
myri10ge_close(mgp->dev);
|
||||
}
|
||||
status = myri10ge_load_firmware(mgp, 1);
|
||||
if (status != 0)
|
||||
printk(KERN_ERR "myri10ge: %s: failed to load firmware\n",
|
||||
|
@ -3516,12 +3529,14 @@ static void myri10ge_watchdog_timer(unsigned long arg)
|
|||
{
|
||||
struct myri10ge_priv *mgp;
|
||||
struct myri10ge_slice_state *ss;
|
||||
int i, reset_needed;
|
||||
int i, reset_needed, busy_slice_cnt;
|
||||
u32 rx_pause_cnt;
|
||||
u16 cmd;
|
||||
|
||||
mgp = (struct myri10ge_priv *)arg;
|
||||
|
||||
rx_pause_cnt = ntohl(mgp->ss[0].fw_stats->dropped_pause);
|
||||
busy_slice_cnt = 0;
|
||||
for (i = 0, reset_needed = 0;
|
||||
i < mgp->num_slices && reset_needed == 0; ++i) {
|
||||
|
||||
|
@ -3559,8 +3574,22 @@ static void myri10ge_watchdog_timer(unsigned long arg)
|
|||
reset_needed = 1;
|
||||
}
|
||||
}
|
||||
if (ss->watchdog_tx_done != ss->tx.done ||
|
||||
ss->watchdog_rx_done != ss->rx_done.cnt) {
|
||||
busy_slice_cnt++;
|
||||
}
|
||||
ss->watchdog_tx_done = ss->tx.done;
|
||||
ss->watchdog_tx_req = ss->tx.req;
|
||||
ss->watchdog_rx_done = ss->rx_done.cnt;
|
||||
}
|
||||
/* if we've sent or received no traffic, poll the NIC to
|
||||
* ensure it is still there. Otherwise, we risk not noticing
|
||||
* an error in a timely fashion */
|
||||
if (busy_slice_cnt == 0) {
|
||||
pci_read_config_word(mgp->pdev, PCI_COMMAND, &cmd);
|
||||
if ((cmd & PCI_COMMAND_MASTER) == 0) {
|
||||
reset_needed = 1;
|
||||
}
|
||||
}
|
||||
mgp->watchdog_pause = rx_pause_cnt;
|
||||
|
||||
|
|
Loading…
Reference in a new issue