forcedeth: ring access

This patch modifys ring access by using pointers. This avoids computing
the current index and avoids accessing the base address of the rings.

Signed-Off-By: Ayaz Abdulla <aabdulla@nvidia.com>

Signed-off-by: Jeff Garzik <jeff@garzik.org>
diff --git a/drivers/net/forcedeth.c b/drivers/net/forcedeth.c
index 3ef1aec..bea826b 100644
--- a/drivers/net/forcedeth.c
+++ b/drivers/net/forcedeth.c
@@ -691,6 +691,12 @@
 	{ 0,0 }
 };
 
+struct nv_skb_map {
+	struct sk_buff *skb;
+	dma_addr_t dma;
+	unsigned int dma_len;
+};
+
 /*
  * SMP locking:
  * All hardware access under dev->priv->lock, except the performance
@@ -741,10 +747,12 @@
 	/* rx specific fields.
 	 * Locking: Within irq hander or disable_irq+spin_lock(&np->lock);
 	 */
+	union ring_type get_rx, put_rx, first_rx, last_rx;
+	struct nv_skb_map *get_rx_ctx, *put_rx_ctx;
+	struct nv_skb_map *first_rx_ctx, *last_rx_ctx;
+	struct nv_skb_map *rx_skb;
+
 	union ring_type rx_ring;
-	unsigned int cur_rx, refill_rx;
-	struct sk_buff **rx_skbuff;
-	dma_addr_t *rx_dma;
 	unsigned int rx_buf_sz;
 	unsigned int pkt_limit;
 	struct timer_list oom_kick;
@@ -761,11 +769,12 @@
 	/*
 	 * tx specific fields.
 	 */
+	union ring_type get_tx, put_tx, first_tx, last_tx;
+	struct nv_skb_map *get_tx_ctx, *put_tx_ctx;
+	struct nv_skb_map *first_tx_ctx, *last_tx_ctx;
+	struct nv_skb_map *tx_skb;
+
 	union ring_type tx_ring;
-	unsigned int next_tx, nic_tx;
-	struct sk_buff **tx_skbuff;
-	dma_addr_t *tx_dma;
-	unsigned int *tx_dma_len;
 	u32 tx_flags;
 	int tx_ring_size;
 	int tx_limit_start;
@@ -921,16 +930,10 @@
 			pci_free_consistent(np->pci_dev, sizeof(struct ring_desc_ex) * (np->rx_ring_size + np->tx_ring_size),
 					    np->rx_ring.ex, np->ring_addr);
 	}
-	if (np->rx_skbuff)
-		kfree(np->rx_skbuff);
-	if (np->rx_dma)
-		kfree(np->rx_dma);
-	if (np->tx_skbuff)
-		kfree(np->tx_skbuff);
-	if (np->tx_dma)
-		kfree(np->tx_dma);
-	if (np->tx_dma_len)
-		kfree(np->tx_dma_len);
+	if (np->rx_skb)
+		kfree(np->rx_skb);
+	if (np->tx_skb)
+		kfree(np->tx_skb);
 }
 
 static int using_multi_irqs(struct net_device *dev)
@@ -1304,43 +1307,60 @@
 static int nv_alloc_rx(struct net_device *dev)
 {
 	struct fe_priv *np = netdev_priv(dev);
-	unsigned int refill_rx = np->refill_rx;
-	int nr;
+	union ring_type less_rx;
 
-	while (np->cur_rx != refill_rx) {
+	if (np->desc_ver == DESC_VER_1 || np->desc_ver == DESC_VER_2) {
+		less_rx.orig = np->get_rx.orig;
+		if (less_rx.orig-- == np->first_rx.orig)
+			less_rx.orig = np->last_rx.orig;
+	} else {
+		less_rx.ex = np->get_rx.ex;
+		if (less_rx.ex-- == np->first_rx.ex)
+			less_rx.ex = np->last_rx.ex;
+	}
+
+	while (1) {
 		struct sk_buff *skb;
 
-		nr = refill_rx % np->rx_ring_size;
-		if (np->rx_skbuff[nr] == NULL) {
+		if (np->desc_ver == DESC_VER_1 || np->desc_ver == DESC_VER_2) {
+			if (np->put_rx.orig == less_rx.orig)
+				break;
+		} else {
+			if (np->put_rx.ex == less_rx.ex)
+				break;
+		}
+
+		if (np->put_rx_ctx->skb == NULL) {
 
 			skb = dev_alloc_skb(np->rx_buf_sz + NV_RX_ALLOC_PAD);
 			if (!skb)
-				break;
+				return 1;
 
 			skb->dev = dev;
-			np->rx_skbuff[nr] = skb;
+			np->put_rx_ctx->skb = skb;
 		} else {
-			skb = np->rx_skbuff[nr];
+			skb = np->put_rx_ctx->skb;
 		}
-		np->rx_dma[nr] = pci_map_single(np->pci_dev, skb->data,
+		np->put_rx_ctx->dma = pci_map_single(np->pci_dev, skb->data,
 					skb->end-skb->data, PCI_DMA_FROMDEVICE);
+		np->put_rx_ctx->dma_len = skb->end-skb->data;
 		if (np->desc_ver == DESC_VER_1 || np->desc_ver == DESC_VER_2) {
-			np->rx_ring.orig[nr].buf = cpu_to_le32(np->rx_dma[nr]);
+			np->put_rx.orig->buf = cpu_to_le32(np->put_rx_ctx->dma);
 			wmb();
-			np->rx_ring.orig[nr].flaglen = cpu_to_le32(np->rx_buf_sz | NV_RX_AVAIL);
+			np->put_rx.orig->flaglen = cpu_to_le32(np->rx_buf_sz | NV_RX_AVAIL);
+			if (np->put_rx.orig++ == np->last_rx.orig)
+				np->put_rx.orig = np->first_rx.orig;
 		} else {
-			np->rx_ring.ex[nr].bufhigh = cpu_to_le64(np->rx_dma[nr]) >> 32;
-			np->rx_ring.ex[nr].buflow = cpu_to_le64(np->rx_dma[nr]) & 0x0FFFFFFFF;
+			np->put_rx.ex->bufhigh = cpu_to_le64(np->put_rx_ctx->dma) >> 32;
+			np->put_rx.ex->buflow = cpu_to_le64(np->put_rx_ctx->dma) & 0x0FFFFFFFF;
 			wmb();
-			np->rx_ring.ex[nr].flaglen = cpu_to_le32(np->rx_buf_sz | NV_RX2_AVAIL);
+			np->put_rx.ex->flaglen = cpu_to_le32(np->rx_buf_sz | NV_RX2_AVAIL);
+			if (np->put_rx.ex++ == np->last_rx.ex)
+				np->put_rx.ex = np->first_rx.ex;
 		}
-		dprintk(KERN_DEBUG "%s: nv_alloc_rx: Packet %d marked as Available\n",
-					dev->name, refill_rx);
-		refill_rx++;
+		if (np->put_rx_ctx++ == np->last_rx_ctx)
+			np->put_rx_ctx = np->first_rx_ctx;
 	}
-	np->refill_rx = refill_rx;
-	if (np->cur_rx - refill_rx == np->rx_ring_size)
-		return 1;
 	return 0;
 }
 
@@ -1388,29 +1408,53 @@
 {
 	struct fe_priv *np = netdev_priv(dev);
 	int i;
+	np->get_rx = np->put_rx = np->first_rx = np->rx_ring;
+	if (np->desc_ver == DESC_VER_1 || np->desc_ver == DESC_VER_2)
+		np->last_rx.orig = &np->rx_ring.orig[np->rx_ring_size-1];
+	else
+		np->last_rx.ex = &np->rx_ring.ex[np->rx_ring_size-1];
+	np->get_rx_ctx = np->put_rx_ctx = np->first_rx_ctx = np->rx_skb;
+	np->last_rx_ctx = &np->rx_skb[np->rx_ring_size-1];
 
-	np->cur_rx = np->rx_ring_size;
-	np->refill_rx = 0;
-	for (i = 0; i < np->rx_ring_size; i++)
-		if (np->desc_ver == DESC_VER_1 || np->desc_ver == DESC_VER_2)
+	for (i = 0; i < np->rx_ring_size; i++) {
+		if (np->desc_ver == DESC_VER_1 || np->desc_ver == DESC_VER_2) {
 			np->rx_ring.orig[i].flaglen = 0;
-	        else
+			np->rx_ring.orig[i].buf = 0;
+		} else {
 			np->rx_ring.ex[i].flaglen = 0;
+			np->rx_ring.ex[i].txvlan = 0;
+			np->rx_ring.ex[i].bufhigh = 0;
+			np->rx_ring.ex[i].buflow = 0;
+		}
+		np->rx_skb[i].skb = NULL;
+		np->rx_skb[i].dma = 0;
+	}
 }
 
 static void nv_init_tx(struct net_device *dev)
 {
 	struct fe_priv *np = netdev_priv(dev);
 	int i;
+	np->get_tx = np->put_tx = np->first_tx = np->tx_ring;
+	if (np->desc_ver == DESC_VER_1 || np->desc_ver == DESC_VER_2)
+		np->last_tx.orig = &np->tx_ring.orig[np->tx_ring_size-1];
+	else
+		np->last_tx.ex = &np->tx_ring.ex[np->tx_ring_size-1];
+	np->get_tx_ctx = np->put_tx_ctx = np->first_tx_ctx = np->tx_skb;
+	np->last_tx_ctx = &np->tx_skb[np->tx_ring_size-1];
 
-	np->next_tx = np->nic_tx = 0;
 	for (i = 0; i < np->tx_ring_size; i++) {
-		if (np->desc_ver == DESC_VER_1 || np->desc_ver == DESC_VER_2)
+		if (np->desc_ver == DESC_VER_1 || np->desc_ver == DESC_VER_2) {
 			np->tx_ring.orig[i].flaglen = 0;
-	        else
+			np->tx_ring.orig[i].buf = 0;
+		} else {
 			np->tx_ring.ex[i].flaglen = 0;
-		np->tx_skbuff[i] = NULL;
-		np->tx_dma[i] = 0;
+			np->tx_ring.ex[i].txvlan = 0;
+			np->tx_ring.ex[i].bufhigh = 0;
+			np->tx_ring.ex[i].buflow = 0;
+		}
+		np->tx_skb[i].skb = NULL;
+		np->tx_skb[i].dma = 0;
 	}
 }
 
@@ -1421,23 +1465,19 @@
 	return nv_alloc_rx(dev);
 }
 
-static int nv_release_txskb(struct net_device *dev, unsigned int skbnr)
+static int nv_release_txskb(struct net_device *dev, struct nv_skb_map* tx_skb)
 {
 	struct fe_priv *np = netdev_priv(dev);
 
-	dprintk(KERN_INFO "%s: nv_release_txskb for skbnr %d\n",
-		dev->name, skbnr);
-
-	if (np->tx_dma[skbnr]) {
-		pci_unmap_page(np->pci_dev, np->tx_dma[skbnr],
-			       np->tx_dma_len[skbnr],
+	if (tx_skb->dma) {
+		pci_unmap_page(np->pci_dev, tx_skb->dma,
+			       tx_skb->dma_len,
 			       PCI_DMA_TODEVICE);
-		np->tx_dma[skbnr] = 0;
+		tx_skb->dma = 0;
 	}
-
-	if (np->tx_skbuff[skbnr]) {
-		dev_kfree_skb_any(np->tx_skbuff[skbnr]);
-		np->tx_skbuff[skbnr] = NULL;
+	if (tx_skb->skb) {
+		dev_kfree_skb_any(tx_skb->skb);
+		tx_skb->skb = NULL;
 		return 1;
 	} else {
 		return 0;
@@ -1450,11 +1490,16 @@
 	unsigned int i;
 
 	for (i = 0; i < np->tx_ring_size; i++) {
-		if (np->desc_ver == DESC_VER_1 || np->desc_ver == DESC_VER_2)
+		if (np->desc_ver == DESC_VER_1 || np->desc_ver == DESC_VER_2) {
 			np->tx_ring.orig[i].flaglen = 0;
-		else
+			np->tx_ring.orig[i].buf = 0;
+		} else {
 			np->tx_ring.ex[i].flaglen = 0;
-		if (nv_release_txskb(dev, i))
+			np->tx_ring.ex[i].txvlan = 0;
+			np->tx_ring.ex[i].bufhigh = 0;
+			np->tx_ring.ex[i].buflow = 0;
+		}
+		if (nv_release_txskb(dev, &np->tx_skb[i]))
 			np->stats.tx_dropped++;
 	}
 }
@@ -1463,18 +1508,24 @@
 {
 	struct fe_priv *np = netdev_priv(dev);
 	int i;
+
 	for (i = 0; i < np->rx_ring_size; i++) {
-		if (np->desc_ver == DESC_VER_1 || np->desc_ver == DESC_VER_2)
+		if (np->desc_ver == DESC_VER_1 || np->desc_ver == DESC_VER_2) {
 			np->rx_ring.orig[i].flaglen = 0;
-		else
+			np->rx_ring.orig[i].buf = 0;
+		} else {
 			np->rx_ring.ex[i].flaglen = 0;
+			np->rx_ring.ex[i].txvlan = 0;
+			np->rx_ring.ex[i].bufhigh = 0;
+			np->rx_ring.ex[i].buflow = 0;
+		}
 		wmb();
-		if (np->rx_skbuff[i]) {
-			pci_unmap_single(np->pci_dev, np->rx_dma[i],
-						np->rx_skbuff[i]->end-np->rx_skbuff[i]->data,
+		if (np->rx_skb[i].skb) {
+			pci_unmap_single(np->pci_dev, np->rx_skb[i].dma,
+						np->rx_skb[i].skb->end-np->rx_skb[i].skb->data,
 						PCI_DMA_FROMDEVICE);
-			dev_kfree_skb(np->rx_skbuff[i]);
-			np->rx_skbuff[i] = NULL;
+			dev_kfree_skb(np->rx_skb[i].skb);
+			np->rx_skb[i].skb = NULL;
 		}
 	}
 }
@@ -1485,6 +1536,11 @@
 	nv_drain_rx(dev);
 }
 
+static inline u32 nv_get_empty_tx_slots(struct fe_priv *np)
+{
+	return (u32)(np->tx_ring_size - ((np->tx_ring_size + (np->put_tx_ctx - np->get_tx_ctx)) % np->tx_ring_size));
+}
+
 /*
  * nv_start_xmit: dev->hard_start_xmit function
  * Called with netif_tx_lock held.
@@ -1495,14 +1551,17 @@
 	u32 tx_flags = 0;
 	u32 tx_flags_extra = (np->desc_ver == DESC_VER_1 ? NV_TX_LASTPACKET : NV_TX2_LASTPACKET);
 	unsigned int fragments = skb_shinfo(skb)->nr_frags;
-	unsigned int nr = (np->next_tx - 1) % np->tx_ring_size;
-	unsigned int start_nr = np->next_tx % np->tx_ring_size;
 	unsigned int i;
 	u32 offset = 0;
 	u32 bcnt;
 	u32 size = skb->len-skb->data_len;
 	u32 entries = (size >> NV_TX2_TSO_MAX_SHIFT) + ((size & (NV_TX2_TSO_MAX_SIZE-1)) ? 1 : 0);
+	u32 empty_slots;
 	u32 tx_flags_vlan = 0;
+	union ring_type put_tx;
+	union ring_type start_tx;
+	union ring_type prev_tx;
+	struct nv_skb_map* prev_tx_ctx;
 
 	/* add fragments to entries count */
 	for (i = 0; i < fragments; i++) {
@@ -1512,32 +1571,46 @@
 
 	spin_lock_irq(&np->lock);
 
-	if ((np->next_tx - np->nic_tx + entries - 1) > np->tx_limit_stop) {
+	empty_slots = nv_get_empty_tx_slots(np);
+	if ((empty_slots - np->tx_limit_stop) <= entries) {
 		spin_unlock_irq(&np->lock);
 		netif_stop_queue(dev);
 		return NETDEV_TX_BUSY;
 	}
 
+	if (np->desc_ver == DESC_VER_1 || np->desc_ver == DESC_VER_2)
+		start_tx.orig = put_tx.orig = np->put_tx.orig;
+	else
+		start_tx.ex = put_tx.ex = np->put_tx.ex;
+
 	/* setup the header buffer */
 	do {
+		prev_tx = put_tx;
+		prev_tx_ctx = np->put_tx_ctx;
 		bcnt = (size > NV_TX2_TSO_MAX_SIZE) ? NV_TX2_TSO_MAX_SIZE : size;
-		nr = (nr + 1) % np->tx_ring_size;
-
-		np->tx_dma[nr] = pci_map_single(np->pci_dev, skb->data + offset, bcnt,
+		np->put_tx_ctx->dma = pci_map_single(np->pci_dev, skb->data + offset, bcnt,
 						PCI_DMA_TODEVICE);
-		np->tx_dma_len[nr] = bcnt;
-
+		np->put_tx_ctx->dma_len = bcnt;
 		if (np->desc_ver == DESC_VER_1 || np->desc_ver == DESC_VER_2) {
-			np->tx_ring.orig[nr].buf = cpu_to_le32(np->tx_dma[nr]);
-			np->tx_ring.orig[nr].flaglen = cpu_to_le32((bcnt-1) | tx_flags);
+			put_tx.orig->buf = cpu_to_le32(np->put_tx_ctx->dma);
+			put_tx.orig->flaglen = cpu_to_le32((bcnt-1) | tx_flags);
 		} else {
-			np->tx_ring.ex[nr].bufhigh = cpu_to_le64(np->tx_dma[nr]) >> 32;
-			np->tx_ring.ex[nr].buflow = cpu_to_le64(np->tx_dma[nr]) & 0x0FFFFFFFF;
-			np->tx_ring.ex[nr].flaglen = cpu_to_le32((bcnt-1) | tx_flags);
+			put_tx.ex->bufhigh = cpu_to_le64(np->put_tx_ctx->dma) >> 32;
+			put_tx.ex->buflow = cpu_to_le64(np->put_tx_ctx->dma) & 0x0FFFFFFFF;
+			put_tx.ex->flaglen = cpu_to_le32((bcnt-1) | tx_flags);
 		}
 		tx_flags = np->tx_flags;
 		offset += bcnt;
 		size -= bcnt;
+		if (np->desc_ver == DESC_VER_1 || np->desc_ver == DESC_VER_2) {
+			if (put_tx.orig++ == np->last_tx.orig)
+				put_tx.orig = np->first_tx.orig;
+		} else {
+			if (put_tx.ex++ == np->last_tx.ex)
+				put_tx.ex = np->first_tx.ex;
+		}
+		if (np->put_tx_ctx++ == np->last_tx_ctx)
+			np->put_tx_ctx = np->first_tx_ctx;
 	} while (size);
 
 	/* setup the fragments */
@@ -1547,34 +1620,43 @@
 		offset = 0;
 
 		do {
+			prev_tx = put_tx;
+			prev_tx_ctx = np->put_tx_ctx;
 			bcnt = (size > NV_TX2_TSO_MAX_SIZE) ? NV_TX2_TSO_MAX_SIZE : size;
-			nr = (nr + 1) % np->tx_ring_size;
-
-			np->tx_dma[nr] = pci_map_page(np->pci_dev, frag->page, frag->page_offset+offset, bcnt,
-						      PCI_DMA_TODEVICE);
-			np->tx_dma_len[nr] = bcnt;
+			np->put_tx_ctx->dma = pci_map_page(np->pci_dev, frag->page, frag->page_offset+offset, bcnt,
+							   PCI_DMA_TODEVICE);
+			np->put_tx_ctx->dma_len = bcnt;
 
 			if (np->desc_ver == DESC_VER_1 || np->desc_ver == DESC_VER_2) {
-				np->tx_ring.orig[nr].buf = cpu_to_le32(np->tx_dma[nr]);
-				np->tx_ring.orig[nr].flaglen = cpu_to_le32((bcnt-1) | tx_flags);
+				put_tx.orig->buf = cpu_to_le32(np->put_tx_ctx->dma);
+				put_tx.orig->flaglen = cpu_to_le32((bcnt-1) | tx_flags);
 			} else {
-				np->tx_ring.ex[nr].bufhigh = cpu_to_le64(np->tx_dma[nr]) >> 32;
-				np->tx_ring.ex[nr].buflow = cpu_to_le64(np->tx_dma[nr]) & 0x0FFFFFFFF;
-				np->tx_ring.ex[nr].flaglen = cpu_to_le32((bcnt-1) | tx_flags);
+				put_tx.ex->bufhigh = cpu_to_le64(np->put_tx_ctx->dma) >> 32;
+				put_tx.ex->buflow = cpu_to_le64(np->put_tx_ctx->dma) & 0x0FFFFFFFF;
+				put_tx.ex->flaglen = cpu_to_le32((bcnt-1) | tx_flags);
 			}
 			offset += bcnt;
 			size -= bcnt;
+			if (np->desc_ver == DESC_VER_1 || np->desc_ver == DESC_VER_2) {
+				if (put_tx.orig++ == np->last_tx.orig)
+					put_tx.orig = np->first_tx.orig;
+			} else {
+				if (put_tx.ex++ == np->last_tx.ex)
+					put_tx.ex = np->first_tx.ex;
+			}
+			if (np->put_tx_ctx++ == np->last_tx_ctx)
+				np->put_tx_ctx = np->first_tx_ctx;
 		} while (size);
 	}
 
 	/* set last fragment flag  */
-	if (np->desc_ver == DESC_VER_1 || np->desc_ver == DESC_VER_2) {
-		np->tx_ring.orig[nr].flaglen |= cpu_to_le32(tx_flags_extra);
-	} else {
-		np->tx_ring.ex[nr].flaglen |= cpu_to_le32(tx_flags_extra);
-	}
+	if (np->desc_ver == DESC_VER_1 || np->desc_ver == DESC_VER_2)
+		prev_tx.orig->flaglen |= cpu_to_le32(tx_flags_extra);
+	else
+		prev_tx.ex->flaglen |= cpu_to_le32(tx_flags_extra);
 
-	np->tx_skbuff[nr] = skb;
+	/* save skb in this slot's context area */
+	prev_tx_ctx->skb = skb;
 
 	if (skb_is_gso(skb))
 		tx_flags_extra = NV_TX2_TSO | (skb_shinfo(skb)->gso_size << NV_TX2_TSO_SHIFT);
@@ -1589,14 +1671,17 @@
 
 	/* set tx flags */
 	if (np->desc_ver == DESC_VER_1 || np->desc_ver == DESC_VER_2) {
-		np->tx_ring.orig[start_nr].flaglen |= cpu_to_le32(tx_flags | tx_flags_extra);
+		start_tx.orig->flaglen |= cpu_to_le32(tx_flags | tx_flags_extra);
+		np->put_tx.orig = put_tx.orig;
 	} else {
-		np->tx_ring.ex[start_nr].txvlan = cpu_to_le32(tx_flags_vlan);
-		np->tx_ring.ex[start_nr].flaglen |= cpu_to_le32(tx_flags | tx_flags_extra);
+		start_tx.ex->txvlan = cpu_to_le32(tx_flags_vlan);
+		start_tx.ex->flaglen |= cpu_to_le32(tx_flags | tx_flags_extra);
+		np->put_tx.ex = put_tx.ex;
 	}
 
-	dprintk(KERN_DEBUG "%s: nv_start_xmit: packet %d (entries %d) queued for transmission. tx_flags_extra: %x\n",
-		dev->name, np->next_tx, entries, tx_flags_extra);
+
+	dprintk(KERN_DEBUG "%s: nv_start_xmit: entries %d queued for transmission. tx_flags_extra: %x\n",
+		dev->name, entries, tx_flags_extra);
 	{
 		int j;
 		for (j=0; j<64; j++) {
@@ -1607,8 +1692,6 @@
 		dprintk("\n");
 	}
 
-	np->next_tx += entries;
-
 	dev->trans_start = jiffies;
 	spin_unlock_irq(&np->lock);
 	writel(NVREG_TXRXCTL_KICK|np->txrxctl_bits, get_hwbase(dev) + NvRegTxRxControl);
@@ -1625,24 +1708,26 @@
 {
 	struct fe_priv *np = netdev_priv(dev);
 	u32 flags;
-	unsigned int i;
 	struct sk_buff *skb;
 
-	while (np->nic_tx != np->next_tx) {
-		i = np->nic_tx % np->tx_ring_size;
+ 	while (1) {
+		if (np->desc_ver == DESC_VER_1 || np->desc_ver == DESC_VER_2) {
+			if (np->get_tx.orig == np->put_tx.orig)
+				break;
+			flags = le32_to_cpu(np->get_tx.orig->flaglen);
+		} else {
+			if (np->get_tx.ex == np->put_tx.ex)
+				break;
+			flags = le32_to_cpu(np->get_tx.ex->flaglen);
+		}
 
-		if (np->desc_ver == DESC_VER_1 || np->desc_ver == DESC_VER_2)
-			flags = le32_to_cpu(np->tx_ring.orig[i].flaglen);
-		else
-			flags = le32_to_cpu(np->tx_ring.ex[i].flaglen);
-
-		dprintk(KERN_DEBUG "%s: nv_tx_done: looking at packet %d, flags 0x%x.\n",
-					dev->name, np->nic_tx, flags);
+		dprintk(KERN_DEBUG "%s: nv_tx_done: flags 0x%x.\n",
+					dev->name, flags);
 		if (flags & NV_TX_VALID)
 			break;
 		if (np->desc_ver == DESC_VER_1) {
 			if (flags & NV_TX_LASTPACKET) {
-				skb = np->tx_skbuff[i];
+				skb = np->get_tx_ctx->skb;
 				if (flags & (NV_TX_RETRYERROR|NV_TX_CARRIERLOST|NV_TX_LATECOLLISION|
 					     NV_TX_UNDERFLOW|NV_TX_ERROR)) {
 					if (flags & NV_TX_UNDERFLOW)
@@ -1657,7 +1742,7 @@
 			}
 		} else {
 			if (flags & NV_TX2_LASTPACKET) {
-				skb = np->tx_skbuff[i];
+				skb = np->get_tx_ctx->skb;
 				if (flags & (NV_TX2_RETRYERROR|NV_TX2_CARRIERLOST|NV_TX2_LATECOLLISION|
 					     NV_TX2_UNDERFLOW|NV_TX2_ERROR)) {
 					if (flags & NV_TX2_UNDERFLOW)
@@ -1671,10 +1756,18 @@
 				}
 			}
 		}
-		nv_release_txskb(dev, i);
-		np->nic_tx++;
+		nv_release_txskb(dev, np->get_tx_ctx);
+		if (np->desc_ver == DESC_VER_1 || np->desc_ver == DESC_VER_2) {
+			if (np->get_tx.orig++ == np->last_tx.orig)
+				np->get_tx.orig = np->first_tx.orig;
+		} else {
+			if (np->get_tx.ex++ == np->last_tx.ex)
+				np->get_tx.ex = np->first_tx.ex;
+		}
+		if (np->get_tx_ctx++ == np->last_tx_ctx)
+			np->get_tx_ctx = np->first_tx_ctx;
 	}
-	if (np->next_tx - np->nic_tx < np->tx_limit_start)
+	if (nv_get_empty_tx_slots(np) > np->tx_limit_start)
 		netif_wake_queue(dev);
 }
 
@@ -1698,9 +1791,8 @@
 	{
 		int i;
 
-		printk(KERN_INFO "%s: Ring at %lx: next %d nic %d\n",
-				dev->name, (unsigned long)np->ring_addr,
-				np->next_tx, np->nic_tx);
+		printk(KERN_INFO "%s: Ring at %lx\n",
+		       dev->name, (unsigned long)np->ring_addr);
 		printk(KERN_INFO "%s: Dumping tx registers\n", dev->name);
 		for (i=0;i<=np->register_size;i+= 32) {
 			printk(KERN_INFO "%3x: %08x %08x %08x %08x %08x %08x %08x %08x\n",
@@ -1751,10 +1843,10 @@
 	nv_tx_done(dev);
 
 	/* 3) if there are dead entries: clear everything */
-	if (np->next_tx != np->nic_tx) {
+	if (np->get_tx_ctx != np->put_tx_ctx) {
 		printk(KERN_DEBUG "%s: tx_timeout: dead entries!\n", dev->name);
 		nv_drain_tx(dev);
-		np->next_tx = np->nic_tx = 0;
+		nv_init_tx(dev);
 		setup_hw_rings(dev, NV_SETUP_TX_RING);
 		netif_wake_queue(dev);
 	}
@@ -1827,22 +1919,22 @@
  	for (count = 0; count < limit; ++count) {
 		struct sk_buff *skb;
 		int len;
-		int i;
-		if (np->cur_rx - np->refill_rx >= np->rx_ring_size)
-			break;	/* we scanned the whole ring - do not continue */
 
-		i = np->cur_rx % np->rx_ring_size;
 		if (np->desc_ver == DESC_VER_1 || np->desc_ver == DESC_VER_2) {
-			flags = le32_to_cpu(np->rx_ring.orig[i].flaglen);
-			len = nv_descr_getlength(&np->rx_ring.orig[i], np->desc_ver);
+			if (np->get_rx.orig == np->put_rx.orig)
+				break;	/* we scanned the whole ring - do not continue */
+			flags = le32_to_cpu(np->get_rx.orig->flaglen);
+			len = nv_descr_getlength(np->get_rx.orig, np->desc_ver);
 		} else {
-			flags = le32_to_cpu(np->rx_ring.ex[i].flaglen);
-			len = nv_descr_getlength_ex(&np->rx_ring.ex[i], np->desc_ver);
-			vlanflags = le32_to_cpu(np->rx_ring.ex[i].buflow);
+			if (np->get_rx.ex == np->put_rx.ex)
+				break;	/* we scanned the whole ring - do not continue */
+			flags = le32_to_cpu(np->get_rx.ex->flaglen);
+			len = nv_descr_getlength_ex(np->get_rx.ex, np->desc_ver);
+			vlanflags = le32_to_cpu(np->get_rx.ex->buflow);
 		}
 
-		dprintk(KERN_DEBUG "%s: nv_rx_process: looking at packet %d, flags 0x%x.\n",
-					dev->name, np->cur_rx, flags);
+		dprintk(KERN_DEBUG "%s: nv_rx_process: flags 0x%x.\n",
+					dev->name, flags);
 
 		if (flags & NV_RX_AVAIL)
 			break;	/* still owned by hardware, */
@@ -1852,8 +1944,8 @@
 		 * TODO: check if a prefetch of the first cacheline improves
 		 * the performance.
 		 */
-		pci_unmap_single(np->pci_dev, np->rx_dma[i],
-				np->rx_skbuff[i]->end-np->rx_skbuff[i]->data,
+		pci_unmap_single(np->pci_dev, np->get_rx_ctx->dma,
+				np->get_rx_ctx->dma_len,
 				PCI_DMA_FROMDEVICE);
 
 		{
@@ -1862,7 +1954,7 @@
 			for (j=0; j<64; j++) {
 				if ((j%16) == 0)
 					dprintk("\n%03x:", j);
-				dprintk(" %02x", ((unsigned char*)np->rx_skbuff[i]->data)[j]);
+				dprintk(" %02x", ((unsigned char*)np->get_rx_ctx->skb->data)[j]);
 			}
 			dprintk("\n");
 		}
@@ -1892,7 +1984,7 @@
 					goto next_pkt;
 				}
 				if (flags & NV_RX_ERROR4) {
-					len = nv_getlen(dev, np->rx_skbuff[i]->data, len);
+					len = nv_getlen(dev, np->get_rx_ctx->skb->data, len);
 					if (len < 0) {
 						np->stats.rx_errors++;
 						goto next_pkt;
@@ -1925,7 +2017,7 @@
 					goto next_pkt;
 				}
 				if (flags & NV_RX2_ERROR4) {
-					len = nv_getlen(dev, np->rx_skbuff[i]->data, len);
+					len = nv_getlen(dev, np->get_rx_ctx->skb->data, len);
 					if (len < 0) {
 						np->stats.rx_errors++;
 						goto next_pkt;
@@ -1944,20 +2036,20 @@
 				    flags == NV_RX2_CHECKSUMOK2 ||
 				    flags == NV_RX2_CHECKSUMOK3) {
 					dprintk(KERN_DEBUG "%s: hw checksum hit!.\n", dev->name);
-					np->rx_skbuff[i]->ip_summed = CHECKSUM_UNNECESSARY;
+					np->get_rx_ctx->skb->ip_summed = CHECKSUM_UNNECESSARY;
 				} else {
 					dprintk(KERN_DEBUG "%s: hwchecksum miss!.\n", dev->name);
 				}
 			}
 		}
 		/* got a valid packet - forward it to the network core */
-		skb = np->rx_skbuff[i];
-		np->rx_skbuff[i] = NULL;
+		skb = np->get_rx_ctx->skb;
+		np->get_rx_ctx->skb = NULL;
 
 		skb_put(skb, len);
 		skb->protocol = eth_type_trans(skb, dev);
-		dprintk(KERN_DEBUG "%s: nv_rx_process: packet %d with %d bytes, proto %d accepted.\n",
-					dev->name, np->cur_rx, len, skb->protocol);
+		dprintk(KERN_DEBUG "%s: nv_rx_process: %d bytes, proto %d accepted.\n",
+					dev->name, len, skb->protocol);
 #ifdef CONFIG_FORCEDETH_NAPI
 		if (np->vlangrp && (vlanflags & NV_RX3_VLAN_TAG_PRESENT))
 			vlan_hwaccel_receive_skb(skb, np->vlangrp,
@@ -1975,7 +2067,15 @@
 		np->stats.rx_packets++;
 		np->stats.rx_bytes += len;
 next_pkt:
-		np->cur_rx++;
+		if (np->desc_ver == DESC_VER_1 || np->desc_ver == DESC_VER_2) {
+			if (np->get_rx.orig++ == np->last_rx.orig)
+				np->get_rx.orig = np->first_rx.orig;
+		} else {
+			if (np->get_rx.ex++ == np->last_rx.ex)
+				np->get_rx.ex = np->first_rx.ex;
+		}
+		if (np->get_rx_ctx++ == np->last_rx_ctx)
+			np->get_rx_ctx = np->first_rx_ctx;
 	}
 
 	return count;
@@ -3463,7 +3563,7 @@
 {
 	struct fe_priv *np = netdev_priv(dev);
 	u8 __iomem *base = get_hwbase(dev);
-	u8 *rxtx_ring, *rx_skbuff, *tx_skbuff, *rx_dma, *tx_dma, *tx_dma_len;
+	u8 *rxtx_ring, *rx_skbuff, *tx_skbuff;
 	dma_addr_t ring_addr;
 
 	if (ring->rx_pending < RX_RING_MIN ||
@@ -3489,12 +3589,9 @@
 					    sizeof(struct ring_desc_ex) * (ring->rx_pending + ring->tx_pending),
 					    &ring_addr);
 	}
-	rx_skbuff = kmalloc(sizeof(struct sk_buff*) * ring->rx_pending, GFP_KERNEL);
-	rx_dma = kmalloc(sizeof(dma_addr_t) * ring->rx_pending, GFP_KERNEL);
-	tx_skbuff = kmalloc(sizeof(struct sk_buff*) * ring->tx_pending, GFP_KERNEL);
-	tx_dma = kmalloc(sizeof(dma_addr_t) * ring->tx_pending, GFP_KERNEL);
-	tx_dma_len = kmalloc(sizeof(unsigned int) * ring->tx_pending, GFP_KERNEL);
-	if (!rxtx_ring || !rx_skbuff || !rx_dma || !tx_skbuff || !tx_dma || !tx_dma_len) {
+	rx_skbuff = kmalloc(sizeof(struct nv_skb_map) * ring->rx_pending, GFP_KERNEL);
+	tx_skbuff = kmalloc(sizeof(struct nv_skb_map) * ring->tx_pending, GFP_KERNEL);
+	if (!rxtx_ring || !rx_skbuff || !tx_skbuff) {
 		/* fall back to old rings */
 		if (np->desc_ver == DESC_VER_1 || np->desc_ver == DESC_VER_2) {
 			if (rxtx_ring)
@@ -3507,14 +3604,8 @@
 		}
 		if (rx_skbuff)
 			kfree(rx_skbuff);
-		if (rx_dma)
-			kfree(rx_dma);
 		if (tx_skbuff)
 			kfree(tx_skbuff);
-		if (tx_dma)
-			kfree(tx_dma);
-		if (tx_dma_len)
-			kfree(tx_dma_len);
 		goto exit;
 	}
 
@@ -3536,8 +3627,8 @@
 	/* set new values */
 	np->rx_ring_size = ring->rx_pending;
 	np->tx_ring_size = ring->tx_pending;
-	np->tx_limit_stop = ring->tx_pending - TX_LIMIT_DIFFERENCE;
-	np->tx_limit_start = ring->tx_pending - TX_LIMIT_DIFFERENCE - 1;
+	np->tx_limit_stop = TX_LIMIT_DIFFERENCE;
+	np->tx_limit_start = TX_LIMIT_DIFFERENCE;
 	if (np->desc_ver == DESC_VER_1 || np->desc_ver == DESC_VER_2) {
 		np->rx_ring.orig = (struct ring_desc*)rxtx_ring;
 		np->tx_ring.orig = &np->rx_ring.orig[np->rx_ring_size];
@@ -3545,18 +3636,12 @@
 		np->rx_ring.ex = (struct ring_desc_ex*)rxtx_ring;
 		np->tx_ring.ex = &np->rx_ring.ex[np->rx_ring_size];
 	}
-	np->rx_skbuff = (struct sk_buff**)rx_skbuff;
-	np->rx_dma = (dma_addr_t*)rx_dma;
-	np->tx_skbuff = (struct sk_buff**)tx_skbuff;
-	np->tx_dma = (dma_addr_t*)tx_dma;
-	np->tx_dma_len = (unsigned int*)tx_dma_len;
+	np->rx_skb = (struct nv_skb_map*)rx_skbuff;
+	np->tx_skb = (struct nv_skb_map*)tx_skbuff;
 	np->ring_addr = ring_addr;
 
-	memset(np->rx_skbuff, 0, sizeof(struct sk_buff*) * np->rx_ring_size);
-	memset(np->rx_dma, 0, sizeof(dma_addr_t) * np->rx_ring_size);
-	memset(np->tx_skbuff, 0, sizeof(struct sk_buff*) * np->tx_ring_size);
-	memset(np->tx_dma, 0, sizeof(dma_addr_t) * np->tx_ring_size);
-	memset(np->tx_dma_len, 0, sizeof(unsigned int) * np->tx_ring_size);
+	memset(np->rx_skb, 0, sizeof(struct nv_skb_map) * np->rx_ring_size);
+	memset(np->tx_skb, 0, sizeof(struct nv_skb_map) * np->tx_ring_size);
 
 	if (netif_running(dev)) {
 		/* reinit driver view of the queues */
@@ -3953,7 +4038,7 @@
 			dprintk(KERN_DEBUG "%s: loopback len mismatch %d vs %d\n",
 				dev->name, len, pkt_len);
 		} else {
-			rx_skb = np->rx_skbuff[0];
+			rx_skb = np->rx_skb[0].skb;
 			for (i = 0; i < pkt_len; i++) {
 				if (rx_skb->data[i] != (u8)(i & 0xff)) {
 					ret = 0;
@@ -4508,8 +4593,8 @@
 
 	np->rx_ring_size = RX_RING_DEFAULT;
 	np->tx_ring_size = TX_RING_DEFAULT;
-	np->tx_limit_stop = np->tx_ring_size - TX_LIMIT_DIFFERENCE;
-	np->tx_limit_start = np->tx_ring_size - TX_LIMIT_DIFFERENCE - 1;
+	np->tx_limit_stop = TX_LIMIT_DIFFERENCE;
+	np->tx_limit_start = TX_LIMIT_DIFFERENCE;
 
 	if (np->desc_ver == DESC_VER_1 || np->desc_ver == DESC_VER_2) {
 		np->rx_ring.orig = pci_alloc_consistent(pci_dev,
@@ -4526,18 +4611,12 @@
 			goto out_unmap;
 		np->tx_ring.ex = &np->rx_ring.ex[np->rx_ring_size];
 	}
-	np->rx_skbuff = kmalloc(sizeof(struct sk_buff*) * np->rx_ring_size, GFP_KERNEL);
-	np->rx_dma = kmalloc(sizeof(dma_addr_t) * np->rx_ring_size, GFP_KERNEL);
-	np->tx_skbuff = kmalloc(sizeof(struct sk_buff*) * np->tx_ring_size, GFP_KERNEL);
-	np->tx_dma = kmalloc(sizeof(dma_addr_t) * np->tx_ring_size, GFP_KERNEL);
-	np->tx_dma_len = kmalloc(sizeof(unsigned int) * np->tx_ring_size, GFP_KERNEL);
-	if (!np->rx_skbuff || !np->rx_dma || !np->tx_skbuff || !np->tx_dma || !np->tx_dma_len)
+	np->rx_skb = kmalloc(sizeof(struct nv_skb_map) * np->rx_ring_size, GFP_KERNEL);
+	np->tx_skb = kmalloc(sizeof(struct nv_skb_map) * np->tx_ring_size, GFP_KERNEL);
+	if (!np->rx_skb || !np->tx_skb)
 		goto out_freering;
-	memset(np->rx_skbuff, 0, sizeof(struct sk_buff*) * np->rx_ring_size);
-	memset(np->rx_dma, 0, sizeof(dma_addr_t) * np->rx_ring_size);
-	memset(np->tx_skbuff, 0, sizeof(struct sk_buff*) * np->tx_ring_size);
-	memset(np->tx_dma, 0, sizeof(dma_addr_t) * np->tx_ring_size);
-	memset(np->tx_dma_len, 0, sizeof(unsigned int) * np->tx_ring_size);
+	memset(np->rx_skb, 0, sizeof(struct nv_skb_map) * np->rx_ring_size);
+	memset(np->tx_skb, 0, sizeof(struct nv_skb_map) * np->tx_ring_size);
 
 	dev->open = nv_open;
 	dev->stop = nv_close;