myri10ge: improve parity error detection and recovery

Improve myri10ge parity error detection and recovery:
1) Don't restore PCI config space to a rebooted NIC until AFTER the
   host is quiescent.
2) Let myri10ge_close() know the NIC is dead, so it won't waste time
   waiting for a dead nic to respond to MXGEFW_CMD_ETHERNET_DOWN
3) When the NIC is quiet (link down, or otherwise idle link) use
   a pci config space read to detect a rebooted NIC.  Otherwise
   we might never notice that a NIC rebooted

Signed-off-by: Andrew Gallatin <gallatin@myri.com>
Signed-off-by: Brice Goglin <brice@myri.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
diff --git a/drivers/net/myri10ge/myri10ge.c b/drivers/net/myri10ge/myri10ge.c
index 1a34f7e..75deef3 100644
--- a/drivers/net/myri10ge/myri10ge.c
+++ b/drivers/net/myri10ge/myri10ge.c
@@ -75,7 +75,7 @@
 #include "myri10ge_mcp.h"
 #include "myri10ge_mcp_gen_header.h"
 
-#define MYRI10GE_VERSION_STR "1.5.0-1.418"
+#define MYRI10GE_VERSION_STR "1.5.0-1.432"
 
 MODULE_DESCRIPTION("Myricom 10G driver (10GbE)");
 MODULE_AUTHOR("Maintainer: help@myri.com");
@@ -188,6 +188,7 @@
 	dma_addr_t fw_stats_bus;
 	int watchdog_tx_done;
 	int watchdog_tx_req;
+	int watchdog_rx_done;
 #ifdef CONFIG_MYRI10GE_DCA
 	int cached_dca_tag;
 	int cpu;
@@ -256,6 +257,7 @@
 	u32 link_changes;
 	u32 msg_enable;
 	unsigned int board_number;
+	int rebooted;
 };
 
 static char *myri10ge_fw_unaligned = "myri10ge_ethp_z8e.dat";
@@ -2552,17 +2554,22 @@
 	netif_carrier_off(dev);
 
 	netif_tx_stop_all_queues(dev);
-	old_down_cnt = mgp->down_cnt;
-	mb();
-	status = myri10ge_send_cmd(mgp, MXGEFW_CMD_ETHERNET_DOWN, &cmd, 0);
-	if (status)
-		printk(KERN_ERR "myri10ge: %s: Couldn't bring down link\n",
-		       dev->name);
+	if (mgp->rebooted == 0) {
+		old_down_cnt = mgp->down_cnt;
+		mb();
+		status =
+		    myri10ge_send_cmd(mgp, MXGEFW_CMD_ETHERNET_DOWN, &cmd, 0);
+		if (status)
+			printk(KERN_ERR
+			       "myri10ge: %s: Couldn't bring down link\n",
+			       dev->name);
 
-	wait_event_timeout(mgp->down_wq, old_down_cnt != mgp->down_cnt, HZ);
-	if (old_down_cnt == mgp->down_cnt)
-		printk(KERN_ERR "myri10ge: %s never got down irq\n", dev->name);
-
+		wait_event_timeout(mgp->down_wq, old_down_cnt != mgp->down_cnt,
+				   HZ);
+		if (old_down_cnt == mgp->down_cnt)
+			printk(KERN_ERR "myri10ge: %s never got down irq\n",
+			       dev->name);
+	}
 	netif_tx_disable(dev);
 	myri10ge_free_irq(mgp);
 	for (i = 0; i < mgp->num_slices; i++)
@@ -3427,12 +3434,13 @@
 	    container_of(work, struct myri10ge_priv, watchdog_work);
 	struct myri10ge_tx_buf *tx;
 	u32 reboot;
-	int status;
+	int status, rebooted;
 	int i;
 	u16 cmd, vendor;
 
 	mgp->watchdog_resets++;
 	pci_read_config_word(mgp->pdev, PCI_COMMAND, &cmd);
+	rebooted = 0;
 	if ((cmd & PCI_COMMAND_MASTER) == 0) {
 		/* Bus master DMA disabled?  Check to see
 		 * if the card rebooted due to a parity error
@@ -3444,9 +3452,12 @@
 		       myri10ge_reset_recover ? " " : " not");
 		if (myri10ge_reset_recover == 0)
 			return;
-
+		rtnl_lock();
+		mgp->rebooted = 1;
+		rebooted = 1;
+		myri10ge_close(mgp->dev);
 		myri10ge_reset_recover--;
-
+		mgp->rebooted = 0;
 		/*
 		 * A rebooted nic will come back with config space as
 		 * it was after power was applied to PCIe bus.
@@ -3494,8 +3505,10 @@
 		}
 	}
 
-	rtnl_lock();
-	myri10ge_close(mgp->dev);
+	if (!rebooted) {
+		rtnl_lock();
+		myri10ge_close(mgp->dev);
+	}
 	status = myri10ge_load_firmware(mgp, 1);
 	if (status != 0)
 		printk(KERN_ERR "myri10ge: %s: failed to load firmware\n",
@@ -3516,12 +3529,14 @@
 {
 	struct myri10ge_priv *mgp;
 	struct myri10ge_slice_state *ss;
-	int i, reset_needed;
+	int i, reset_needed, busy_slice_cnt;
 	u32 rx_pause_cnt;
+	u16 cmd;
 
 	mgp = (struct myri10ge_priv *)arg;
 
 	rx_pause_cnt = ntohl(mgp->ss[0].fw_stats->dropped_pause);
+	busy_slice_cnt = 0;
 	for (i = 0, reset_needed = 0;
 	     i < mgp->num_slices && reset_needed == 0; ++i) {
 
@@ -3559,8 +3574,22 @@
 				reset_needed = 1;
 			}
 		}
+		if (ss->watchdog_tx_done != ss->tx.done ||
+		    ss->watchdog_rx_done != ss->rx_done.cnt) {
+			busy_slice_cnt++;
+		}
 		ss->watchdog_tx_done = ss->tx.done;
 		ss->watchdog_tx_req = ss->tx.req;
+		ss->watchdog_rx_done = ss->rx_done.cnt;
+	}
+	/* if we've sent or received no traffic, poll the NIC to
+	 * ensure it is still there.  Otherwise, we risk not noticing
+	 * an error in a timely fashion */
+	if (busy_slice_cnt == 0) {
+		pci_read_config_word(mgp->pdev, PCI_COMMAND, &cmd);
+		if ((cmd & PCI_COMMAND_MASTER) == 0) {
+			reset_needed = 1;
+		}
 	}
 	mgp->watchdog_pause = rx_pause_cnt;