[SCSI] iscsi: convert xmit path to iscsi chunks

Convert xmit to iscsi chunks.

from michaelc@cs.wisc.edu:

Bug fixes, more digest integration, sg chaining conversion and other
sg wrapper changes, coding style sync up, and removal of io fields,
like pdu_sent, that are not needed.

Signed-off-by: Olaf Kirch <olaf.kirch@oracle.com>
Signed-off-by: Mike Christie <michaelc@cs.wisc.edu>
Signed-off-by: James Bottomley <James.Bottomley@HansenPartnership.com>
diff --git a/drivers/scsi/iscsi_tcp.c b/drivers/scsi/iscsi_tcp.c
index 9b41852..7212fe9 100644
--- a/drivers/scsi/iscsi_tcp.c
+++ b/drivers/scsi/iscsi_tcp.c
@@ -68,56 +68,10 @@
 module_param_named(max_lun, iscsi_max_lun, uint, S_IRUGO);
 
 static int iscsi_tcp_hdr_recv_done(struct iscsi_tcp_conn *tcp_conn,
-				   struct iscsi_chunk *chunk);
-
-static inline void
-iscsi_buf_init_iov(struct iscsi_buf *ibuf, char *vbuf, int size)
-{
-	ibuf->sg.page = virt_to_page(vbuf);
-	ibuf->sg.offset = offset_in_page(vbuf);
-	ibuf->sg.length = size;
-	ibuf->sent = 0;
-	ibuf->use_sendmsg = 1;
-}
-
-static inline void
-iscsi_buf_init_sg(struct iscsi_buf *ibuf, struct scatterlist *sg)
-{
-	ibuf->sg.page = sg->page;
-	ibuf->sg.offset = sg->offset;
-	ibuf->sg.length = sg->length;
-	/*
-	 * Fastpath: sg element fits into single page
-	 */
-	if (sg->length + sg->offset <= PAGE_SIZE && !PageSlab(sg->page))
-		ibuf->use_sendmsg = 0;
-	else
-		ibuf->use_sendmsg = 1;
-	ibuf->sent = 0;
-}
-
-static inline int
-iscsi_buf_left(struct iscsi_buf *ibuf)
-{
-	int rc;
-
-	rc = ibuf->sg.length - ibuf->sent;
-	BUG_ON(rc < 0);
-	return rc;
-}
-
-static inline void
-iscsi_hdr_digest(struct iscsi_conn *conn, struct iscsi_buf *buf,
-		 u8* crc)
-{
-	struct iscsi_tcp_conn *tcp_conn = conn->dd_data;
-
-	crypto_hash_digest(&tcp_conn->tx_hash, &buf->sg, buf->sg.length, crc);
-	buf->sg.length += ISCSI_DIGEST_SIZE;
-}
+				   struct iscsi_segment *segment);
 
 /*
- * Scatterlist handling: inside the iscsi_chunk, we
+ * Scatterlist handling: inside the iscsi_segment, we
  * remember an index into the scatterlist, and set data/size
  * to the current scatterlist entry. For highmem pages, we
  * kmap as needed.
@@ -130,60 +84,72 @@
  */
 
 /**
- * iscsi_tcp_chunk_init_sg - init indicated scatterlist entry
- * @chunk: the buffer object
- * @idx: index into scatterlist
+ * iscsi_tcp_segment_init_sg - init indicated scatterlist entry
+ * @segment: the buffer object
+ * @sg: scatterlist
  * @offset: byte offset into that sg entry
  *
- * This function sets up the chunk so that subsequent
+ * This function sets up the segment so that subsequent
  * data is copied to the indicated sg entry, at the given
  * offset.
  */
 static inline void
-iscsi_tcp_chunk_init_sg(struct iscsi_chunk *chunk,
-			unsigned int idx, unsigned int offset)
+iscsi_tcp_segment_init_sg(struct iscsi_segment *segment,
+			  struct scatterlist *sg, unsigned int offset)
 {
-	struct scatterlist *sg;
-
-	BUG_ON(chunk->sg == NULL);
-
-	sg = &chunk->sg[idx];
-	chunk->sg_index = idx;
-	chunk->sg_offset = offset;
-	chunk->size = min(sg->length - offset, chunk->total_size);
-	chunk->data = NULL;
+	segment->sg = sg;
+	segment->sg_offset = offset;
+	segment->size = min(sg->length - offset,
+			    segment->total_size - segment->total_copied);
+	segment->data = NULL;
 }
 
 /**
- * iscsi_tcp_chunk_map - map the current S/G page
- * @chunk: iscsi chunk
+ * iscsi_tcp_segment_map - map the current S/G page
+ * @segment: iscsi_segment
+ * @recv: 1 if called from recv path
  *
  * We only need to possibly kmap data if scatter lists are being used,
  * because the iscsi passthrough and internal IO paths will never use high
  * mem pages.
  */
 static inline void
-iscsi_tcp_chunk_map(struct iscsi_chunk *chunk)
+iscsi_tcp_segment_map(struct iscsi_segment *segment, int recv)
 {
 	struct scatterlist *sg;
 
-	if (chunk->data != NULL || !chunk->sg)
+	if (segment->data != NULL || !segment->sg)
 		return;
 
-	sg = &chunk->sg[chunk->sg_index];
-	BUG_ON(chunk->sg_mapped);
+	sg = segment->sg;
+	BUG_ON(segment->sg_mapped);
 	BUG_ON(sg->length == 0);
-	chunk->sg_mapped = kmap_atomic(sg->page, KM_SOFTIRQ0);
-	chunk->data = chunk->sg_mapped + sg->offset + chunk->sg_offset;
+
+	/*
+	 * If the page count is greater than one it is ok to send
+	 * to the network layer's zero copy send path. If not we
+	 * have to go the slow sendmsg path. We always map for the
+	 * recv path.
+	 */
+	if (page_count(sg_page(sg)) >= 1 && !recv)
+		return;
+
+	debug_tcp("iscsi_tcp_segment_map %s %p\n", recv ? "recv" : "xmit",
+		  segment);
+	segment->sg_mapped = kmap_atomic(sg_page(sg), KM_SOFTIRQ0);
+	segment->data = segment->sg_mapped + sg->offset + segment->sg_offset;
 }
 
 static inline void
-iscsi_tcp_chunk_unmap(struct iscsi_chunk *chunk)
+iscsi_tcp_segment_unmap(struct iscsi_segment *segment)
 {
-	if (chunk->sg_mapped) {
-		kunmap_atomic(chunk->sg_mapped, KM_SOFTIRQ0);
-		chunk->sg_mapped = NULL;
-		chunk->data = NULL;
+	debug_tcp("iscsi_tcp_segment_unmap %p\n", segment);
+
+	if (segment->sg_mapped) {
+		debug_tcp("iscsi_tcp_segment_unmap valid\n");
+		kunmap_atomic(segment->sg_mapped, KM_SOFTIRQ0);
+		segment->sg_mapped = NULL;
+		segment->data = NULL;
 	}
 }
 
@@ -191,23 +157,24 @@
  * Splice the digest buffer into the buffer
  */
 static inline void
-iscsi_tcp_chunk_splice_digest(struct iscsi_chunk *chunk, void *digest)
+iscsi_tcp_segment_splice_digest(struct iscsi_segment *segment, void *digest)
 {
-	chunk->data = digest;
-	chunk->digest_len = ISCSI_DIGEST_SIZE;
-	chunk->total_size += ISCSI_DIGEST_SIZE;
-	chunk->size = ISCSI_DIGEST_SIZE;
-	chunk->copied = 0;
-	chunk->sg = NULL;
-	chunk->sg_index = 0;
-	chunk->hash = NULL;
+	segment->data = digest;
+	segment->digest_len = ISCSI_DIGEST_SIZE;
+	segment->total_size += ISCSI_DIGEST_SIZE;
+	segment->size = ISCSI_DIGEST_SIZE;
+	segment->copied = 0;
+	segment->sg = NULL;
+	segment->hash = NULL;
 }
 
 /**
- * iscsi_tcp_chunk_done - check whether the chunk is complete
- * @chunk: iscsi chunk to check
+ * iscsi_tcp_segment_done - check whether the segment is complete
+ * @segment: iscsi segment to check
+ * @recv: set to one of this is called from the recv path
+ * @copied: number of bytes copied
  *
- * Check if we're done receiving this chunk. If the receive
+ * Check if we're done receiving this segment. If the receive
  * buffer is full but we expect more data, move on to the
  * next entry in the scatterlist.
  *
@@ -217,62 +184,145 @@
  * This function must be re-entrant.
  */
 static inline int
-iscsi_tcp_chunk_done(struct iscsi_chunk *chunk)
+iscsi_tcp_segment_done(struct iscsi_segment *segment, int recv, unsigned copied)
 {
 	static unsigned char padbuf[ISCSI_PAD_LEN];
+	struct scatterlist sg;
 	unsigned int pad;
 
-	if (chunk->copied < chunk->size) {
-		iscsi_tcp_chunk_map(chunk);
+	debug_tcp("copied %u %u size %u %s\n", segment->copied, copied,
+		  segment->size, recv ? "recv" : "xmit");
+	if (segment->hash && copied) {
+		/*
+		 * If a segment is kmapd we must unmap it before sending
+		 * to the crypto layer since that will try to kmap it again.
+		 */
+		iscsi_tcp_segment_unmap(segment);
+
+		if (!segment->data) {
+			sg_init_table(&sg, 1);
+			sg_set_page(&sg, sg_page(segment->sg), copied,
+				    segment->copied + segment->sg_offset +
+							segment->sg->offset);
+		} else
+			sg_init_one(&sg, segment->data + segment->copied,
+				    copied);
+		crypto_hash_update(segment->hash, &sg, copied);
+	}
+
+	segment->copied += copied;
+	if (segment->copied < segment->size) {
+		iscsi_tcp_segment_map(segment, recv);
 		return 0;
 	}
 
-	chunk->total_copied += chunk->copied;
-	chunk->copied = 0;
-	chunk->size = 0;
+	segment->total_copied += segment->copied;
+	segment->copied = 0;
+	segment->size = 0;
 
 	/* Unmap the current scatterlist page, if there is one. */
-	iscsi_tcp_chunk_unmap(chunk);
+	iscsi_tcp_segment_unmap(segment);
 
 	/* Do we have more scatterlist entries? */
-	if (chunk->total_copied < chunk->total_size) {
+	debug_tcp("total copied %u total size %u\n", segment->total_copied,
+		   segment->total_size);
+	if (segment->total_copied < segment->total_size) {
 		/* Proceed to the next entry in the scatterlist. */
-		iscsi_tcp_chunk_init_sg(chunk, chunk->sg_index + 1, 0);
-		iscsi_tcp_chunk_map(chunk);
-		BUG_ON(chunk->size == 0);
+		iscsi_tcp_segment_init_sg(segment, sg_next(segment->sg),
+					  0);
+		iscsi_tcp_segment_map(segment, recv);
+		BUG_ON(segment->size == 0);
 		return 0;
 	}
 
 	/* Do we need to handle padding? */
-	pad = iscsi_padding(chunk->total_copied);
+	pad = iscsi_padding(segment->total_copied);
 	if (pad != 0) {
 		debug_tcp("consume %d pad bytes\n", pad);
-		chunk->total_size += pad;
-		chunk->size = pad;
-		chunk->data = padbuf;
+		segment->total_size += pad;
+		segment->size = pad;
+		segment->data = padbuf;
 		return 0;
 	}
 
 	/*
-	 * Set us up for receiving the data digest. hdr digest
+	 * Set us up for transferring the data digest. hdr digest
 	 * is completely handled in hdr done function.
 	 */
-	if (chunk->hash) {
-		if (chunk->digest_len == 0) {
-			crypto_hash_final(chunk->hash, chunk->digest);
-			iscsi_tcp_chunk_splice_digest(chunk,
-						      chunk->recv_digest);
-			return 0;
-		}
+	if (segment->hash) {
+		crypto_hash_final(segment->hash, segment->digest);
+		iscsi_tcp_segment_splice_digest(segment,
+				 recv ? segment->recv_digest : segment->digest);
+		return 0;
 	}
 
 	return 1;
 }
 
 /**
- * iscsi_tcp_chunk_recv - copy data to chunk
+ * iscsi_tcp_xmit_segment - transmit segment
  * @tcp_conn: the iSCSI TCP connection
- * @chunk: the buffer to copy to
+ * @segment: the buffer to transmnit
+ *
+ * This function transmits as much of the buffer as
+ * the network layer will accept, and returns the number of
+ * bytes transmitted.
+ *
+ * If CRC hashing is enabled, the function will compute the
+ * hash as it goes. When the entire segment has been transmitted,
+ * it will retrieve the hash value and send it as well.
+ */
+static int
+iscsi_tcp_xmit_segment(struct iscsi_tcp_conn *tcp_conn,
+		       struct iscsi_segment *segment)
+{
+	struct socket *sk = tcp_conn->sock;
+	unsigned int copied = 0;
+	int r = 0;
+
+	while (!iscsi_tcp_segment_done(segment, 0, r)) {
+		struct scatterlist *sg;
+		unsigned int offset, copy;
+		int flags = 0;
+
+		r = 0;
+		offset = segment->copied;
+		copy = segment->size - offset;
+
+		if (segment->total_copied + segment->size < segment->total_size)
+			flags |= MSG_MORE;
+
+		/* Use sendpage if we can; else fall back to sendmsg */
+		if (!segment->data) {
+			sg = segment->sg;
+			offset += segment->sg_offset + sg->offset;
+			r = tcp_conn->sendpage(sk, sg_page(sg), offset, copy,
+					       flags);
+		} else {
+			struct msghdr msg = { .msg_flags = flags };
+			struct kvec iov = {
+				.iov_base = segment->data + offset,
+				.iov_len = copy
+			};
+
+			r = kernel_sendmsg(sk, &msg, &iov, 1, copy);
+		}
+
+		if (r < 0) {
+			iscsi_tcp_segment_unmap(segment);
+			if (copied || r == -EAGAIN)
+				break;
+			return r;
+		}
+		copied += r;
+	}
+	return copied;
+}
+
+/**
+ * iscsi_tcp_segment_recv - copy data to segment
+ * @tcp_conn: the iSCSI TCP connection
+ * @segment: the buffer to copy to
  * @ptr: data pointer
  * @len: amount of data available
  *
@@ -287,29 +337,24 @@
  * just way we do for network layer checksums.
  */
 static int
-iscsi_tcp_chunk_recv(struct iscsi_tcp_conn *tcp_conn,
-		     struct iscsi_chunk *chunk, const void *ptr,
-		     unsigned int len)
+iscsi_tcp_segment_recv(struct iscsi_tcp_conn *tcp_conn,
+		       struct iscsi_segment *segment, const void *ptr,
+		       unsigned int len)
 {
-	struct scatterlist sg;
-	unsigned int copy, copied = 0;
+	unsigned int copy = 0, copied = 0;
 
-	while (!iscsi_tcp_chunk_done(chunk)) {
-		if (copied == len)
-			goto out;
-
-		copy = min(len - copied, chunk->size - chunk->copied);
-		memcpy(chunk->data + chunk->copied, ptr + copied, copy);
-
-		if (chunk->hash) {
-			sg_init_one(&sg, ptr + copied, copy);
-			crypto_hash_update(chunk->hash, &sg, copy);
+	while (!iscsi_tcp_segment_done(segment, 1, copy)) {
+		if (copied == len) {
+			debug_tcp("iscsi_tcp_segment_recv copied %d bytes\n",
+				  len);
+			break;
 		}
-		chunk->copied += copy;
+
+		copy = min(len - copied, segment->size - segment->copied);
+		debug_tcp("iscsi_tcp_segment_recv copying %d\n", copy);
+		memcpy(segment->data + segment->copied, ptr + copied, copy);
 		copied += copy;
 	}
-
-out:
 	return copied;
 }
 
@@ -325,12 +370,13 @@
 
 static inline int
 iscsi_tcp_dgst_verify(struct iscsi_tcp_conn *tcp_conn,
-		      struct iscsi_chunk *chunk)
+		      struct iscsi_segment *segment)
 {
-	if (!chunk->digest_len)
+	if (!segment->digest_len)
 		return 1;
 
-	if (memcmp(chunk->recv_digest, chunk->digest, chunk->digest_len)) {
+	if (memcmp(segment->recv_digest, segment->digest,
+		   segment->digest_len)) {
 		debug_scsi("digest mismatch\n");
 		return 0;
 	}
@@ -339,55 +385,59 @@
 }
 
 /*
- * Helper function to set up chunk buffer
+ * Helper function to set up segment buffer
  */
 static inline void
-__iscsi_chunk_init(struct iscsi_chunk *chunk, size_t size,
-		   iscsi_chunk_done_fn_t *done, struct hash_desc *hash)
+__iscsi_segment_init(struct iscsi_segment *segment, size_t size,
+		     iscsi_segment_done_fn_t *done, struct hash_desc *hash)
 {
-	memset(chunk, 0, sizeof(*chunk));
-	chunk->total_size = size;
-	chunk->done = done;
+	memset(segment, 0, sizeof(*segment));
+	segment->total_size = size;
+	segment->done = done;
 
 	if (hash) {
-		chunk->hash = hash;
+		segment->hash = hash;
 		crypto_hash_init(hash);
 	}
 }
 
 static inline void
-iscsi_chunk_init_linear(struct iscsi_chunk *chunk, void *data, size_t size,
-			iscsi_chunk_done_fn_t *done, struct hash_desc *hash)
+iscsi_segment_init_linear(struct iscsi_segment *segment, void *data,
+			  size_t size, iscsi_segment_done_fn_t *done,
+			  struct hash_desc *hash)
 {
-	__iscsi_chunk_init(chunk, size, done, hash);
-	chunk->data = data;
-	chunk->size = size;
+	__iscsi_segment_init(segment, size, done, hash);
+	segment->data = data;
+	segment->size = size;
 }
 
 static inline int
-iscsi_chunk_seek_sg(struct iscsi_chunk *chunk,
-		    struct scatterlist *sg, unsigned int sg_count,
-		    unsigned int offset, size_t size,
-		    iscsi_chunk_done_fn_t *done, struct hash_desc *hash)
+iscsi_segment_seek_sg(struct iscsi_segment *segment,
+		      struct scatterlist *sg_list, unsigned int sg_count,
+		      unsigned int offset, size_t size,
+		      iscsi_segment_done_fn_t *done, struct hash_desc *hash)
 {
+	struct scatterlist *sg;
 	unsigned int i;
 
-	__iscsi_chunk_init(chunk, size, done, hash);
-	for (i = 0; i < sg_count; ++i) {
-		if (offset < sg[i].length) {
-			chunk->sg = sg;
-			chunk->sg_count = sg_count;
-			iscsi_tcp_chunk_init_sg(chunk, i, offset);
+	debug_scsi("iscsi_segment_seek_sg offset %u size %llu\n",
+		  offset, size);
+	__iscsi_segment_init(segment, size, done, hash);
+	for_each_sg(sg_list, sg, sg_count, i) {
+		debug_scsi("sg %d, len %u offset %u\n", i, sg->length,
+			   sg->offset);
+		if (offset < sg->length) {
+			iscsi_tcp_segment_init_sg(segment, sg, offset);
 			return 0;
 		}
-		offset -= sg[i].length;
+		offset -= sg->length;
 	}
 
 	return ISCSI_ERR_DATA_OFFSET;
 }
 
 /**
- * iscsi_tcp_hdr_recv_prep - prep chunk for hdr reception
+ * iscsi_tcp_hdr_recv_prep - prep segment for hdr reception
  * @tcp_conn: iscsi connection to prep for
  *
  * This function always passes NULL for the hash argument, because when this
@@ -399,7 +449,7 @@
 {
 	debug_tcp("iscsi_tcp_hdr_recv_prep(%p%s)\n", tcp_conn,
 		  tcp_conn->iscsi_conn->hdrdgst_en ? ", digest enabled" : "");
-	iscsi_chunk_init_linear(&tcp_conn->in.chunk,
+	iscsi_segment_init_linear(&tcp_conn->in.segment,
 				tcp_conn->in.hdr_buf, sizeof(struct iscsi_hdr),
 				iscsi_tcp_hdr_recv_done, NULL);
 }
@@ -409,12 +459,12 @@
  */
 static int
 iscsi_tcp_data_recv_done(struct iscsi_tcp_conn *tcp_conn,
-			 struct iscsi_chunk *chunk)
+			 struct iscsi_segment *segment)
 {
 	struct iscsi_conn *conn = tcp_conn->iscsi_conn;
 	int rc = 0;
 
-	if (!iscsi_tcp_dgst_verify(tcp_conn, chunk))
+	if (!iscsi_tcp_dgst_verify(tcp_conn, segment))
 		return ISCSI_ERR_DATA_DGST;
 
 	rc = iscsi_complete_pdu(conn, tcp_conn->in.hdr,
@@ -435,7 +485,7 @@
 	if (conn->datadgst_en)
 		rx_hash = &tcp_conn->rx_hash;
 
-	iscsi_chunk_init_linear(&tcp_conn->in.chunk,
+	iscsi_segment_init_linear(&tcp_conn->in.segment,
 				conn->data, tcp_conn->in.datalen,
 				iscsi_tcp_data_recv_done, rx_hash);
 }
@@ -448,7 +498,6 @@
 {
 	struct iscsi_tcp_cmd_task *tcp_ctask = ctask->dd_data;
 	struct iscsi_r2t_info *r2t;
-	struct scsi_cmnd *sc;
 
 	/* flush ctask's r2t queues */
 	while (__kfifo_get(tcp_ctask->r2tqueue, (void*)&r2t, sizeof(void*))) {
@@ -457,12 +506,12 @@
 		debug_scsi("iscsi_tcp_cleanup_ctask pending r2t dropped\n");
 	}
 
-	sc = ctask->sc;
-	if (unlikely(!sc))
-		return;
-
-	tcp_ctask->xmstate = XMSTATE_IDLE;
-	tcp_ctask->r2t = NULL;
+	r2t = tcp_ctask->r2t;
+	if (r2t != NULL) {
+		__kfifo_put(tcp_ctask->r2tpool.queue, (void*)&r2t,
+			    sizeof(void*));
+		tcp_ctask->r2t = NULL;
+	}
 }
 
 /**
@@ -481,11 +530,6 @@
 	int datasn = be32_to_cpu(rhdr->datasn);
 
 	iscsi_update_cmdsn(session, (struct iscsi_nopin*)rhdr);
-	/*
-	 * setup Data-In byte counter (gets decremented..)
-	 */
-	ctask->data_count = tcp_conn->in.datalen;
-
 	if (tcp_conn->in.datalen == 0)
 		return 0;
 
@@ -543,9 +587,6 @@
 			struct iscsi_r2t_info *r2t)
 {
 	struct iscsi_data *hdr;
-	struct scsi_cmnd *sc = ctask->sc;
-	int i, sg_count = 0;
-	struct scatterlist *sg;
 
 	hdr = &r2t->dtask.hdr;
 	memset(hdr, 0, sizeof(struct iscsi_data));
@@ -569,34 +610,6 @@
 	conn->dataout_pdus_cnt++;
 
 	r2t->sent = 0;
-
-	iscsi_buf_init_iov(&r2t->headbuf, (char*)hdr,
-			   sizeof(struct iscsi_hdr));
-
-	sg = scsi_sglist(sc);
-	r2t->sg = NULL;
-	for (i = 0; i < scsi_sg_count(sc); i++, sg += 1) {
-		/* FIXME: prefetch ? */
-		if (sg_count + sg->length > r2t->data_offset) {
-			int page_offset;
-
-			/* sg page found! */
-
-			/* offset within this page */
-			page_offset = r2t->data_offset - sg_count;
-
-			/* fill in this buffer */
-			iscsi_buf_init_sg(&r2t->sendbuf, sg);
-			r2t->sendbuf.sg.offset += page_offset;
-			r2t->sendbuf.sg.length -= page_offset;
-
-			/* xmit logic will continue with next one */
-			r2t->sg = sg + 1;
-			break;
-		}
-		sg_count += sg->length;
-	}
-	BUG_ON(r2t->sg == NULL);
 }
 
 /**
@@ -670,7 +683,6 @@
 
 	tcp_ctask->exp_datasn = r2tsn + 1;
 	__kfifo_put(tcp_ctask->r2tqueue, (void*)&r2t, sizeof(void*));
-	tcp_ctask->xmstate |= XMSTATE_SOL_HDR_INIT;
 	conn->r2t_pdus_cnt++;
 
 	iscsi_requeue_ctask(ctask);
@@ -684,13 +696,13 @@
  */
 static int
 iscsi_tcp_process_data_in(struct iscsi_tcp_conn *tcp_conn,
-			  struct iscsi_chunk *chunk)
+			  struct iscsi_segment *segment)
 {
 	struct iscsi_conn *conn = tcp_conn->iscsi_conn;
 	struct iscsi_hdr *hdr = tcp_conn->in.hdr;
 	int rc;
 
-	if (!iscsi_tcp_dgst_verify(tcp_conn, chunk))
+	if (!iscsi_tcp_dgst_verify(tcp_conn, segment))
 		return ISCSI_ERR_DATA_DGST;
 
 	/* check for non-exceptional status */
@@ -762,7 +774,7 @@
 			/*
 			 * Setup copy of Data-In into the Scsi_Cmnd
 			 * Scatterlist case:
-			 * We set up the iscsi_chunk to point to the next
+			 * We set up the iscsi_segment to point to the next
 			 * scatterlist entry to copy to. As we go along,
 			 * we move on to the next scatterlist entry and
 			 * update the digest per-entry.
@@ -774,13 +786,13 @@
 				  "datalen=%d)\n", tcp_conn,
 				  tcp_ctask->data_offset,
 				  tcp_conn->in.datalen);
-			return iscsi_chunk_seek_sg(&tcp_conn->in.chunk,
-						scsi_sglist(ctask->sc),
-						scsi_sg_count(ctask->sc),
-						tcp_ctask->data_offset,
-						tcp_conn->in.datalen,
-						iscsi_tcp_process_data_in,
-						rx_hash);
+			return iscsi_segment_seek_sg(&tcp_conn->in.segment,
+						     scsi_sglist(ctask->sc),
+						     scsi_sg_count(ctask->sc),
+						     tcp_ctask->data_offset,
+						     tcp_conn->in.datalen,
+						     iscsi_tcp_process_data_in,
+						     rx_hash);
 		}
 		/* fall through */
 	case ISCSI_OP_SCSI_CMD_RSP:
@@ -846,17 +858,6 @@
 	return rc;
 }
 
-static inline void
-partial_sg_digest_update(struct hash_desc *desc, struct scatterlist *sg,
-			 int offset, int length)
-{
-	struct scatterlist temp;
-
-	sg_init_table(&temp, 1);
-	sg_set_page(&temp, sg_page(sg), length, offset);
-	crypto_hash_update(desc, &temp, length);
-}
-
 /**
  * iscsi_tcp_hdr_recv_done - process PDU header
  *
@@ -866,7 +867,7 @@
  */
 static int
 iscsi_tcp_hdr_recv_done(struct iscsi_tcp_conn *tcp_conn,
-			struct iscsi_chunk *chunk)
+			struct iscsi_segment *segment)
 {
 	struct iscsi_conn *conn = tcp_conn->iscsi_conn;
 	struct iscsi_hdr *hdr;
@@ -876,7 +877,7 @@
 	 * may need to go back to the caller for more.
 	 */
 	hdr = (struct iscsi_hdr *) tcp_conn->in.hdr_buf;
-	if (chunk->copied == sizeof(struct iscsi_hdr) && hdr->hlength) {
+	if (segment->copied == sizeof(struct iscsi_hdr) && hdr->hlength) {
 		/* Bump the header length - the caller will
 		 * just loop around and get the AHS for us, and
 		 * call again. */
@@ -886,8 +887,8 @@
 		if (sizeof(*hdr) + ahslen > sizeof(tcp_conn->in.hdr_buf))
 			return ISCSI_ERR_AHSLEN;
 
-		chunk->total_size += ahslen;
-		chunk->size += ahslen;
+		segment->total_size += ahslen;
+		segment->size += ahslen;
 		return 0;
 	}
 
@@ -895,16 +896,16 @@
 	 * header digests; if so, set up the recv_digest buffer
 	 * and go back for more. */
 	if (conn->hdrdgst_en) {
-		if (chunk->digest_len == 0) {
-			iscsi_tcp_chunk_splice_digest(chunk,
-						      chunk->recv_digest);
+		if (segment->digest_len == 0) {
+			iscsi_tcp_segment_splice_digest(segment,
+							segment->recv_digest);
 			return 0;
 		}
 		iscsi_tcp_dgst_header(&tcp_conn->rx_hash, hdr,
-				      chunk->total_copied - ISCSI_DIGEST_SIZE,
-				      chunk->digest);
+				      segment->total_copied - ISCSI_DIGEST_SIZE,
+				      segment->digest);
 
-		if (!iscsi_tcp_dgst_verify(tcp_conn, chunk))
+		if (!iscsi_tcp_dgst_verify(tcp_conn, segment))
 			return ISCSI_ERR_HDR_DGST;
 	}
 
@@ -925,7 +926,7 @@
 {
 	struct iscsi_conn *conn = rd_desc->arg.data;
 	struct iscsi_tcp_conn *tcp_conn = conn->dd_data;
-	struct iscsi_chunk *chunk = &tcp_conn->in.chunk;
+	struct iscsi_segment *segment = &tcp_conn->in.segment;
 	struct skb_seq_state seq;
 	unsigned int consumed = 0;
 	int rc = 0;
@@ -943,27 +944,31 @@
 		const u8 *ptr;
 
 		avail = skb_seq_read(consumed, &ptr, &seq);
-		if (avail == 0)
+		if (avail == 0) {
+			debug_tcp("no more data avail. Consumed %d\n",
+				  consumed);
 			break;
-		BUG_ON(chunk->copied >= chunk->size);
+		}
+		BUG_ON(segment->copied >= segment->size);
 
 		debug_tcp("skb %p ptr=%p avail=%u\n", skb, ptr, avail);
-		rc = iscsi_tcp_chunk_recv(tcp_conn, chunk, ptr, avail);
+		rc = iscsi_tcp_segment_recv(tcp_conn, segment, ptr, avail);
 		BUG_ON(rc == 0);
 		consumed += rc;
 
-		if (chunk->total_copied >= chunk->total_size) {
-			rc = chunk->done(tcp_conn, chunk);
+		if (segment->total_copied >= segment->total_size) {
+			debug_tcp("segment done\n");
+			rc = segment->done(tcp_conn, segment);
 			if (rc != 0) {
 				skb_abort_seq_read(&seq);
 				goto error;
 			}
 
 			/* The done() functions sets up the
-			 * next chunk. */
+			 * next segment. */
 		}
 	}
-
+	skb_abort_seq_read(&seq);
 	conn->rxdata_octets += consumed;
 	return consumed;
 
@@ -996,7 +1001,7 @@
 
 	/* If we had to (atomically) map a highmem page,
 	 * unmap it now. */
-	iscsi_tcp_chunk_unmap(&tcp_conn->in.chunk);
+	iscsi_tcp_segment_unmap(&tcp_conn->in.segment);
 }
 
 static void
@@ -1076,121 +1081,173 @@
 }
 
 /**
- * iscsi_send - generic send routine
- * @sk: kernel's socket
- * @buf: buffer to write from
- * @size: actual size to write
- * @flags: socket's flags
- */
-static inline int
-iscsi_send(struct iscsi_conn *conn, struct iscsi_buf *buf, int size, int flags)
+ * iscsi_xmit - TCP transmit
+ **/
+static int
+iscsi_xmit(struct iscsi_conn *conn)
 {
 	struct iscsi_tcp_conn *tcp_conn = conn->dd_data;
-	struct socket *sk = tcp_conn->sock;
-	int offset = buf->sg.offset + buf->sent, res;
+	struct iscsi_segment *segment = &tcp_conn->out.segment;
+	unsigned int consumed = 0;
+	int rc = 0;
 
-	/*
-	 * if we got use_sg=0 or are sending something we kmallocd
-	 * then we did not have to do kmap (kmap returns page_address)
-	 *
-	 * if we got use_sg > 0, but had to drop down, we do not
-	 * set clustering so this should only happen for that
-	 * slab case.
+	while (1) {
+		rc = iscsi_tcp_xmit_segment(tcp_conn, segment);
+		if (rc < 0)
+			goto error;
+		if (rc == 0)
+			break;
+
+		consumed += rc;
+
+		if (segment->total_copied >= segment->total_size) {
+			if (segment->done != NULL) {
+				rc = segment->done(tcp_conn, segment);
+				if (rc < 0)
+					goto error;
+			}
+		}
+	}
+
+	debug_tcp("xmit %d bytes\n", consumed);
+
+	conn->txdata_octets += consumed;
+	return consumed;
+
+error:
+	/* Transmit error. We could initiate error recovery
+	 * here. */
+	debug_tcp("Error sending PDU, errno=%d\n", rc);
+	iscsi_conn_failure(conn, ISCSI_ERR_CONN_FAILED);
+	return rc;
+}
+
+/**
+ * iscsi_tcp_xmit_qlen - return the number of bytes queued for xmit
+ */
+static inline int
+iscsi_tcp_xmit_qlen(struct iscsi_conn *conn)
+{
+	struct iscsi_tcp_conn *tcp_conn = conn->dd_data;
+	struct iscsi_segment *segment = &tcp_conn->out.segment;
+
+	return segment->total_copied - segment->total_size;
+}
+
+static inline int
+iscsi_tcp_flush(struct iscsi_conn *conn)
+{
+	int rc;
+
+	while (iscsi_tcp_xmit_qlen(conn)) {
+		rc = iscsi_xmit(conn);
+		if (rc == 0)
+			return -EAGAIN;
+		if (rc < 0)
+			return rc;
+	}
+
+	return 0;
+}
+
+/*
+ * This is called when we're done sending the header.
+ * Simply copy the data_segment to the send segment, and return.
+ */
+static int
+iscsi_tcp_send_hdr_done(struct iscsi_tcp_conn *tcp_conn,
+			struct iscsi_segment *segment)
+{
+	tcp_conn->out.segment = tcp_conn->out.data_segment;
+	debug_tcp("Header done. Next segment size %u total_size %u\n",
+		  tcp_conn->out.segment.size, tcp_conn->out.segment.total_size);
+	return 0;
+}
+
+static void
+iscsi_tcp_send_hdr_prep(struct iscsi_conn *conn, void *hdr, size_t hdrlen)
+{
+	struct iscsi_tcp_conn *tcp_conn = conn->dd_data;
+
+	debug_tcp("%s(%p%s)\n", __FUNCTION__, tcp_conn,
+			conn->hdrdgst_en? ", digest enabled" : "");
+
+	/* Clear the data segment - needs to be filled in by the
+	 * caller using iscsi_tcp_send_data_prep() */
+	memset(&tcp_conn->out.data_segment, 0, sizeof(struct iscsi_segment));
+
+	/* If header digest is enabled, compute the CRC and
+	 * place the digest into the same buffer. We make
+	 * sure that both iscsi_tcp_ctask and mtask have
+	 * sufficient room.
 	 */
-	if (buf->use_sendmsg)
-		res = sock_no_sendpage(sk, buf->sg.page, offset, size, flags);
-	else
-		res = tcp_conn->sendpage(sk, buf->sg.page, offset, size, flags);
-
-	if (res >= 0) {
-		conn->txdata_octets += res;
-		buf->sent += res;
-		return res;
+	if (conn->hdrdgst_en) {
+		iscsi_tcp_dgst_header(&tcp_conn->tx_hash, hdr, hdrlen,
+				      hdr + hdrlen);
+		hdrlen += ISCSI_DIGEST_SIZE;
 	}
 
-	tcp_conn->sendpage_failures_cnt++;
-	if (res == -EAGAIN)
-		res = -ENOBUFS;
-	else
-		iscsi_conn_failure(conn, ISCSI_ERR_CONN_FAILED);
-	return res;
+	/* Remember header pointer for later, when we need
+	 * to decide whether there's a payload to go along
+	 * with the header. */
+	tcp_conn->out.hdr = hdr;
+
+	iscsi_segment_init_linear(&tcp_conn->out.segment, hdr, hdrlen,
+				iscsi_tcp_send_hdr_done, NULL);
 }
 
-/**
- * iscsi_sendhdr - send PDU Header via tcp_sendpage()
- * @conn: iscsi connection
- * @buf: buffer to write from
- * @datalen: lenght of data to be sent after the header
- *
- * Notes:
- *	(Tx, Fast Path)
- **/
-static inline int
-iscsi_sendhdr(struct iscsi_conn *conn, struct iscsi_buf *buf, int datalen)
+/*
+ * Prepare the send buffer for the payload data.
+ * Padding and checksumming will all be taken care
+ * of by the iscsi_segment routines.
+ */
+static int
+iscsi_tcp_send_data_prep(struct iscsi_conn *conn, struct scatterlist *sg,
+			 unsigned int count, unsigned int offset,
+			 unsigned int len)
 {
-	int flags = 0; /* MSG_DONTWAIT; */
-	int res, size;
+	struct iscsi_tcp_conn *tcp_conn = conn->dd_data;
+	struct hash_desc *tx_hash = NULL;
+	unsigned int hdr_spec_len;
 
-	size = buf->sg.length - buf->sent;
-	BUG_ON(buf->sent + size > buf->sg.length);
-	if (buf->sent + size != buf->sg.length || datalen)
-		flags |= MSG_MORE;
+	debug_tcp("%s(%p, offset=%d, datalen=%d%s)\n", __FUNCTION__,
+			tcp_conn, offset, len,
+			conn->datadgst_en? ", digest enabled" : "");
 
-	res = iscsi_send(conn, buf, size, flags);
-	debug_tcp("sendhdr %d bytes, sent %d res %d\n", size, buf->sent, res);
-	if (res >= 0) {
-		if (size != res)
-			return -EAGAIN;
-		return 0;
-	}
+	/* Make sure the datalen matches what the caller
+	   said he would send. */
+	hdr_spec_len = ntoh24(tcp_conn->out.hdr->dlength);
+	WARN_ON(iscsi_padded(len) != iscsi_padded(hdr_spec_len));
 
-	return res;
+	if (conn->datadgst_en)
+		tx_hash = &tcp_conn->tx_hash;
+
+	return iscsi_segment_seek_sg(&tcp_conn->out.data_segment,
+				   sg, count, offset, len,
+				   NULL, tx_hash);
 }
 
-/**
- * iscsi_sendpage - send one page of iSCSI Data-Out.
- * @conn: iscsi connection
- * @buf: buffer to write from
- * @count: remaining data
- * @sent: number of bytes sent
- *
- * Notes:
- *	(Tx, Fast Path)
- **/
-static inline int
-iscsi_sendpage(struct iscsi_conn *conn, struct iscsi_buf *buf,
-	       int *count, int *sent)
+static void
+iscsi_tcp_send_linear_data_prepare(struct iscsi_conn *conn, void *data,
+				   size_t len)
 {
-	int flags = 0; /* MSG_DONTWAIT; */
-	int res, size;
+	struct iscsi_tcp_conn *tcp_conn = conn->dd_data;
+	struct hash_desc *tx_hash = NULL;
+	unsigned int hdr_spec_len;
 
-	size = buf->sg.length - buf->sent;
-	BUG_ON(buf->sent + size > buf->sg.length);
-	if (size > *count)
-		size = *count;
-	if (buf->sent + size != buf->sg.length || *count != size)
-		flags |= MSG_MORE;
+	debug_tcp("%s(%p, datalen=%d%s)\n", __FUNCTION__, tcp_conn, len,
+		  conn->datadgst_en? ", digest enabled" : "");
 
-	res = iscsi_send(conn, buf, size, flags);
-	debug_tcp("sendpage: %d bytes, sent %d left %d sent %d res %d\n",
-		  size, buf->sent, *count, *sent, res);
-	if (res >= 0) {
-		*count -= res;
-		*sent += res;
-		if (size != res)
-			return -EAGAIN;
-		return 0;
-	}
+	/* Make sure the datalen matches what the caller
+	   said he would send. */
+	hdr_spec_len = ntoh24(tcp_conn->out.hdr->dlength);
+	WARN_ON(iscsi_padded(len) != iscsi_padded(hdr_spec_len));
 
-	return res;
-}
+	if (conn->datadgst_en)
+		tx_hash = &tcp_conn->tx_hash;
 
-static inline void
-iscsi_data_digest_init(struct iscsi_tcp_conn *tcp_conn,
-		      struct iscsi_tcp_cmd_task *tcp_ctask)
-{
-	crypto_hash_init(&tcp_conn->tx_hash);
-	tcp_ctask->digest_count = 4;
+	iscsi_segment_init_linear(&tcp_conn->out.data_segment,
+				data, len, NULL, tx_hash);
 }
 
 /**
@@ -1206,12 +1263,17 @@
  *
  *	Called under connection lock.
  **/
-static void
+static int
 iscsi_solicit_data_cont(struct iscsi_conn *conn, struct iscsi_cmd_task *ctask,
-			struct iscsi_r2t_info *r2t, int left)
+			struct iscsi_r2t_info *r2t)
 {
 	struct iscsi_data *hdr;
-	int new_offset;
+	int new_offset, left;
+
+	BUG_ON(r2t->data_length - r2t->sent < 0);
+	left = r2t->data_length - r2t->sent;
+	if (left == 0)
+		return 0;
 
 	hdr = &r2t->dtask.hdr;
 	memset(hdr, 0, sizeof(struct iscsi_data));
@@ -1232,43 +1294,46 @@
 		r2t->data_count = left;
 		hdr->flags = ISCSI_FLAG_CMD_FINAL;
 	}
+
 	conn->dataout_pdus_cnt++;
-
-	iscsi_buf_init_iov(&r2t->headbuf, (char*)hdr,
-			   sizeof(struct iscsi_hdr));
-
-	if (iscsi_buf_left(&r2t->sendbuf))
-		return;
-
-	iscsi_buf_init_sg(&r2t->sendbuf, r2t->sg);
-	r2t->sg += 1;
-}
-
-static void iscsi_set_padding(struct iscsi_tcp_cmd_task *tcp_ctask,
-			      unsigned long len)
-{
-	tcp_ctask->pad_count = len & (ISCSI_PAD_LEN - 1);
-	if (!tcp_ctask->pad_count)
-		return;
-
-	tcp_ctask->pad_count = ISCSI_PAD_LEN - tcp_ctask->pad_count;
-	debug_scsi("write padding %d bytes\n", tcp_ctask->pad_count);
-	tcp_ctask->xmstate |= XMSTATE_W_PAD;
+	return 1;
 }
 
 /**
- * iscsi_tcp_cmd_init - Initialize iSCSI SCSI_READ or SCSI_WRITE commands
+ * iscsi_tcp_ctask - Initialize iSCSI SCSI_READ or SCSI_WRITE commands
  * @conn: iscsi connection
  * @ctask: scsi command task
  * @sc: scsi command
  **/
-static void
-iscsi_tcp_cmd_init(struct iscsi_cmd_task *ctask)
+static int
+iscsi_tcp_ctask_init(struct iscsi_cmd_task *ctask)
 {
 	struct iscsi_tcp_cmd_task *tcp_ctask = ctask->dd_data;
+	struct iscsi_conn *conn = ctask->conn;
+	struct scsi_cmnd *sc = ctask->sc;
+	int err;
 
 	BUG_ON(__kfifo_len(tcp_ctask->r2tqueue));
-	tcp_ctask->xmstate = XMSTATE_CMD_HDR_INIT;
+	tcp_ctask->sent = 0;
+	tcp_ctask->exp_datasn = 0;
+
+	/* Prepare PDU, optionally w/ immediate data */
+	debug_scsi("ctask deq [cid %d itt 0x%x imm %d unsol %d]\n",
+		    conn->id, ctask->itt, ctask->imm_count,
+		    ctask->unsol_count);
+	iscsi_tcp_send_hdr_prep(conn, ctask->hdr, ctask->hdr_len);
+
+	if (!ctask->imm_count)
+		return 0;
+
+	/* If we have immediate data, attach a payload */
+	err = iscsi_tcp_send_data_prep(conn, scsi_sglist(sc), scsi_sg_count(sc),
+				       0, ctask->imm_count);
+	if (err)
+		return err;
+	tcp_ctask->sent += ctask->imm_count;
+	ctask->imm_count = 0;
+	return 0;
 }
 
 /**
@@ -1280,71 +1345,17 @@
  *	The function can return -EAGAIN in which case caller must
  *	call it again later, or recover. '0' return code means successful
  *	xmit.
- *
- *	Management xmit state machine consists of these states:
- *		XMSTATE_IMM_HDR_INIT	- calculate digest of PDU Header
- *		XMSTATE_IMM_HDR 	- PDU Header xmit in progress
- *		XMSTATE_IMM_DATA 	- PDU Data xmit in progress
- *		XMSTATE_IDLE		- management PDU is done
  **/
 static int
 iscsi_tcp_mtask_xmit(struct iscsi_conn *conn, struct iscsi_mgmt_task *mtask)
 {
-	struct iscsi_tcp_mgmt_task *tcp_mtask = mtask->dd_data;
 	int rc;
 
-	debug_scsi("mtask deq [cid %d state %x itt 0x%x]\n",
-		conn->id, tcp_mtask->xmstate, mtask->itt);
+	/* Flush any pending data first. */
+	rc = iscsi_tcp_flush(conn);
+	if (rc < 0)
+		return rc;
 
-	if (tcp_mtask->xmstate & XMSTATE_IMM_HDR_INIT) {
-		iscsi_buf_init_iov(&tcp_mtask->headbuf, (char*)mtask->hdr,
-				   sizeof(struct iscsi_hdr));
-
-		if (mtask->data_count) {
-			tcp_mtask->xmstate |= XMSTATE_IMM_DATA;
-			iscsi_buf_init_iov(&tcp_mtask->sendbuf,
-					   (char*)mtask->data,
-					   mtask->data_count);
-		}
-
-		if (conn->c_stage != ISCSI_CONN_INITIAL_STAGE &&
-		    conn->stop_stage != STOP_CONN_RECOVER &&
-		    conn->hdrdgst_en)
-			iscsi_hdr_digest(conn, &tcp_mtask->headbuf,
-					(u8*)tcp_mtask->hdrext);
-
-		tcp_mtask->sent = 0;
-		tcp_mtask->xmstate &= ~XMSTATE_IMM_HDR_INIT;
-		tcp_mtask->xmstate |= XMSTATE_IMM_HDR;
-	}
-
-	if (tcp_mtask->xmstate & XMSTATE_IMM_HDR) {
-		rc = iscsi_sendhdr(conn, &tcp_mtask->headbuf,
-				   mtask->data_count);
-		if (rc)
-			return rc;
-		tcp_mtask->xmstate &= ~XMSTATE_IMM_HDR;
-	}
-
-	if (tcp_mtask->xmstate & XMSTATE_IMM_DATA) {
-		BUG_ON(!mtask->data_count);
-		tcp_mtask->xmstate &= ~XMSTATE_IMM_DATA;
-		/* FIXME: implement.
-		 * Virtual buffer could be spreaded across multiple pages...
-		 */
-		do {
-			int rc;
-
-			rc = iscsi_sendpage(conn, &tcp_mtask->sendbuf,
-					&mtask->data_count, &tcp_mtask->sent);
-			if (rc) {
-				tcp_mtask->xmstate |= XMSTATE_IMM_DATA;
-				return rc;
-			}
-		} while (mtask->data_count);
-	}
-
-	BUG_ON(tcp_mtask->xmstate != XMSTATE_IDLE);
 	if (mtask->hdr->itt == RESERVED_ITT) {
 		struct iscsi_session *session = conn->session;
 
@@ -1352,411 +1363,112 @@
 		iscsi_free_mgmt_task(conn, mtask);
 		spin_unlock_bh(&session->lock);
 	}
+
 	return 0;
 }
 
-static int
-iscsi_send_cmd_hdr(struct iscsi_conn *conn, struct iscsi_cmd_task *ctask)
-{
-	struct scsi_cmnd *sc = ctask->sc;
-	struct iscsi_tcp_cmd_task *tcp_ctask = ctask->dd_data;
-	int rc = 0;
-
-	if (tcp_ctask->xmstate & XMSTATE_CMD_HDR_INIT) {
-		tcp_ctask->sent = 0;
-		tcp_ctask->sg_count = 0;
-		tcp_ctask->exp_datasn = 0;
-
-		if (sc->sc_data_direction == DMA_TO_DEVICE) {
-			struct scatterlist *sg = scsi_sglist(sc);
-
-			iscsi_buf_init_sg(&tcp_ctask->sendbuf, sg);
-			tcp_ctask->sg = sg + 1;
-			tcp_ctask->bad_sg = sg + scsi_sg_count(sc);
-
-			debug_scsi("cmd [itt 0x%x total %d imm_data %d "
-				   "unsol count %d, unsol offset %d]\n",
-				   ctask->itt, scsi_bufflen(sc),
-				   ctask->imm_count, ctask->unsol_count,
-				   ctask->unsol_offset);
-		}
-
-		iscsi_buf_init_iov(&tcp_ctask->headbuf, (char*)ctask->hdr,
-				  ctask->hdr_len);
-
-		if (conn->hdrdgst_en)
-			iscsi_hdr_digest(conn, &tcp_ctask->headbuf,
-					 iscsi_next_hdr(ctask));
-		tcp_ctask->xmstate &= ~XMSTATE_CMD_HDR_INIT;
-		tcp_ctask->xmstate |= XMSTATE_CMD_HDR_XMIT;
-	}
-
-	if (tcp_ctask->xmstate & XMSTATE_CMD_HDR_XMIT) {
-		rc = iscsi_sendhdr(conn, &tcp_ctask->headbuf, ctask->imm_count);
-		if (rc)
-			return rc;
-		tcp_ctask->xmstate &= ~XMSTATE_CMD_HDR_XMIT;
-
-		if (sc->sc_data_direction != DMA_TO_DEVICE)
-			return 0;
-
-		if (ctask->imm_count) {
-			tcp_ctask->xmstate |= XMSTATE_IMM_DATA;
-			iscsi_set_padding(tcp_ctask, ctask->imm_count);
-
-			if (ctask->conn->datadgst_en) {
-				iscsi_data_digest_init(ctask->conn->dd_data,
-						       tcp_ctask);
-				tcp_ctask->immdigest = 0;
-			}
-		}
-
-		if (ctask->unsol_count)
-			tcp_ctask->xmstate |=
-					XMSTATE_UNS_HDR | XMSTATE_UNS_INIT;
-	}
-	return rc;
-}
-
-static int
-iscsi_send_padding(struct iscsi_conn *conn, struct iscsi_cmd_task *ctask)
-{
-	struct iscsi_tcp_cmd_task *tcp_ctask = ctask->dd_data;
-	struct iscsi_tcp_conn *tcp_conn = conn->dd_data;
-	int sent = 0, rc;
-
-	if (tcp_ctask->xmstate & XMSTATE_W_PAD) {
-		iscsi_buf_init_iov(&tcp_ctask->sendbuf, (char*)&tcp_ctask->pad,
-				   tcp_ctask->pad_count);
-		if (conn->datadgst_en)
-			crypto_hash_update(&tcp_conn->tx_hash,
-					   &tcp_ctask->sendbuf.sg,
-					   tcp_ctask->sendbuf.sg.length);
-	} else if (!(tcp_ctask->xmstate & XMSTATE_W_RESEND_PAD))
-		return 0;
-
-	tcp_ctask->xmstate &= ~XMSTATE_W_PAD;
-	tcp_ctask->xmstate &= ~XMSTATE_W_RESEND_PAD;
-	debug_scsi("sending %d pad bytes for itt 0x%x\n",
-		   tcp_ctask->pad_count, ctask->itt);
-	rc = iscsi_sendpage(conn, &tcp_ctask->sendbuf, &tcp_ctask->pad_count,
-			   &sent);
-	if (rc) {
-		debug_scsi("padding send failed %d\n", rc);
-		tcp_ctask->xmstate |= XMSTATE_W_RESEND_PAD;
-	}
-	return rc;
-}
-
-static int
-iscsi_send_digest(struct iscsi_conn *conn, struct iscsi_cmd_task *ctask,
-			struct iscsi_buf *buf, uint32_t *digest)
-{
-	struct iscsi_tcp_cmd_task *tcp_ctask;
-	struct iscsi_tcp_conn *tcp_conn;
-	int rc, sent = 0;
-
-	if (!conn->datadgst_en)
-		return 0;
-
-	tcp_ctask = ctask->dd_data;
-	tcp_conn = conn->dd_data;
-
-	if (!(tcp_ctask->xmstate & XMSTATE_W_RESEND_DATA_DIGEST)) {
-		crypto_hash_final(&tcp_conn->tx_hash, (u8*)digest);
-		iscsi_buf_init_iov(buf, (char*)digest, 4);
-	}
-	tcp_ctask->xmstate &= ~XMSTATE_W_RESEND_DATA_DIGEST;
-
-	rc = iscsi_sendpage(conn, buf, &tcp_ctask->digest_count, &sent);
-	if (!rc)
-		debug_scsi("sent digest 0x%x for itt 0x%x\n", *digest,
-			  ctask->itt);
-	else {
-		debug_scsi("sending digest 0x%x failed for itt 0x%x!\n",
-			  *digest, ctask->itt);
-		tcp_ctask->xmstate |= XMSTATE_W_RESEND_DATA_DIGEST;
-	}
-	return rc;
-}
-
-static int
-iscsi_send_data(struct iscsi_cmd_task *ctask, struct iscsi_buf *sendbuf,
-		struct scatterlist **sg, int *sent, int *count,
-		struct iscsi_buf *digestbuf, uint32_t *digest)
-{
-	struct iscsi_tcp_cmd_task *tcp_ctask = ctask->dd_data;
-	struct iscsi_conn *conn = ctask->conn;
-	struct iscsi_tcp_conn *tcp_conn = conn->dd_data;
-	int rc, buf_sent, offset;
-
-	while (*count) {
-		buf_sent = 0;
-		offset = sendbuf->sent;
-
-		rc = iscsi_sendpage(conn, sendbuf, count, &buf_sent);
-		*sent = *sent + buf_sent;
-		if (buf_sent && conn->datadgst_en)
-			partial_sg_digest_update(&tcp_conn->tx_hash,
-				&sendbuf->sg, sendbuf->sg.offset + offset,
-				buf_sent);
-		if (!iscsi_buf_left(sendbuf) && *sg != tcp_ctask->bad_sg) {
-			iscsi_buf_init_sg(sendbuf, *sg);
-			*sg = *sg + 1;
-		}
-
-		if (rc)
-			return rc;
-	}
-
-	rc = iscsi_send_padding(conn, ctask);
-	if (rc)
-		return rc;
-
-	return iscsi_send_digest(conn, ctask, digestbuf, digest);
-}
-
-static int
-iscsi_send_unsol_hdr(struct iscsi_conn *conn, struct iscsi_cmd_task *ctask)
-{
-	struct iscsi_tcp_cmd_task *tcp_ctask = ctask->dd_data;
-	struct iscsi_data_task *dtask;
-	int rc;
-
-	tcp_ctask->xmstate |= XMSTATE_UNS_DATA;
-	if (tcp_ctask->xmstate & XMSTATE_UNS_INIT) {
-		dtask = &tcp_ctask->unsol_dtask;
-
-		iscsi_prep_unsolicit_data_pdu(ctask, &dtask->hdr);
-		iscsi_buf_init_iov(&tcp_ctask->headbuf, (char*)&dtask->hdr,
-				   sizeof(struct iscsi_hdr));
-		if (conn->hdrdgst_en)
-			iscsi_hdr_digest(conn, &tcp_ctask->headbuf,
-					(u8*)dtask->hdrext);
-
-		tcp_ctask->xmstate &= ~XMSTATE_UNS_INIT;
-		iscsi_set_padding(tcp_ctask, ctask->data_count);
-	}
-
-	rc = iscsi_sendhdr(conn, &tcp_ctask->headbuf, ctask->data_count);
-	if (rc) {
-		tcp_ctask->xmstate &= ~XMSTATE_UNS_DATA;
-		tcp_ctask->xmstate |= XMSTATE_UNS_HDR;
-		return rc;
-	}
-
-	if (conn->datadgst_en) {
-		dtask = &tcp_ctask->unsol_dtask;
-		iscsi_data_digest_init(ctask->conn->dd_data, tcp_ctask);
-		dtask->digest = 0;
-	}
-
-	debug_scsi("uns dout [itt 0x%x dlen %d sent %d]\n",
-		   ctask->itt, ctask->unsol_count, tcp_ctask->sent);
-	return 0;
-}
-
-static int
-iscsi_send_unsol_pdu(struct iscsi_conn *conn, struct iscsi_cmd_task *ctask)
-{
-	struct iscsi_tcp_cmd_task *tcp_ctask = ctask->dd_data;
-	int rc;
-
-	if (tcp_ctask->xmstate & XMSTATE_UNS_HDR) {
-		BUG_ON(!ctask->unsol_count);
-		tcp_ctask->xmstate &= ~XMSTATE_UNS_HDR;
-send_hdr:
-		rc = iscsi_send_unsol_hdr(conn, ctask);
-		if (rc)
-			return rc;
-	}
-
-	if (tcp_ctask->xmstate & XMSTATE_UNS_DATA) {
-		struct iscsi_data_task *dtask = &tcp_ctask->unsol_dtask;
-		int start = tcp_ctask->sent;
-
-		rc = iscsi_send_data(ctask, &tcp_ctask->sendbuf, &tcp_ctask->sg,
-				     &tcp_ctask->sent, &ctask->data_count,
-				     &dtask->digestbuf, &dtask->digest);
-		ctask->unsol_count -= tcp_ctask->sent - start;
-		if (rc)
-			return rc;
-		tcp_ctask->xmstate &= ~XMSTATE_UNS_DATA;
-		/*
-		 * Done with the Data-Out. Next, check if we need
-		 * to send another unsolicited Data-Out.
-		 */
-		if (ctask->unsol_count) {
-			debug_scsi("sending more uns\n");
-			tcp_ctask->xmstate |= XMSTATE_UNS_INIT;
-			goto send_hdr;
-		}
-	}
-	return 0;
-}
-
-static int iscsi_send_sol_pdu(struct iscsi_conn *conn,
-			      struct iscsi_cmd_task *ctask)
-{
-	struct iscsi_tcp_cmd_task *tcp_ctask = ctask->dd_data;
-	struct iscsi_session *session = conn->session;
-	struct iscsi_r2t_info *r2t;
-	struct iscsi_data_task *dtask;
-	int left, rc;
-
-	if (tcp_ctask->xmstate & XMSTATE_SOL_HDR_INIT) {
-		if (!tcp_ctask->r2t) {
-			spin_lock_bh(&session->lock);
-			__kfifo_get(tcp_ctask->r2tqueue, (void*)&tcp_ctask->r2t,
-				    sizeof(void*));
-			spin_unlock_bh(&session->lock);
-		}
-send_hdr:
-		r2t = tcp_ctask->r2t;
-		dtask = &r2t->dtask;
-
-		if (conn->hdrdgst_en)
-			iscsi_hdr_digest(conn, &r2t->headbuf,
-					(u8*)dtask->hdrext);
-		tcp_ctask->xmstate &= ~XMSTATE_SOL_HDR_INIT;
-		tcp_ctask->xmstate |= XMSTATE_SOL_HDR;
-	}
-
-	if (tcp_ctask->xmstate & XMSTATE_SOL_HDR) {
-		r2t = tcp_ctask->r2t;
-		dtask = &r2t->dtask;
-
-		rc = iscsi_sendhdr(conn, &r2t->headbuf, r2t->data_count);
-		if (rc)
-			return rc;
-		tcp_ctask->xmstate &= ~XMSTATE_SOL_HDR;
-		tcp_ctask->xmstate |= XMSTATE_SOL_DATA;
-
-		if (conn->datadgst_en) {
-			iscsi_data_digest_init(conn->dd_data, tcp_ctask);
-			dtask->digest = 0;
-		}
-
-		iscsi_set_padding(tcp_ctask, r2t->data_count);
-		debug_scsi("sol dout [dsn %d itt 0x%x dlen %d sent %d]\n",
-			r2t->solicit_datasn - 1, ctask->itt, r2t->data_count,
-			r2t->sent);
-	}
-
-	if (tcp_ctask->xmstate & XMSTATE_SOL_DATA) {
-		r2t = tcp_ctask->r2t;
-		dtask = &r2t->dtask;
-
-		rc = iscsi_send_data(ctask, &r2t->sendbuf, &r2t->sg,
-				     &r2t->sent, &r2t->data_count,
-				     &dtask->digestbuf, &dtask->digest);
-		if (rc)
-			return rc;
-		tcp_ctask->xmstate &= ~XMSTATE_SOL_DATA;
-
-		/*
-		 * Done with this Data-Out. Next, check if we have
-		 * to send another Data-Out for this R2T.
-		 */
-		BUG_ON(r2t->data_length - r2t->sent < 0);
-		left = r2t->data_length - r2t->sent;
-		if (left) {
-			iscsi_solicit_data_cont(conn, ctask, r2t, left);
-			goto send_hdr;
-		}
-
-		/*
-		 * Done with this R2T. Check if there are more
-		 * outstanding R2Ts ready to be processed.
-		 */
-		spin_lock_bh(&session->lock);
-		tcp_ctask->r2t = NULL;
-		__kfifo_put(tcp_ctask->r2tpool.queue, (void*)&r2t,
-			    sizeof(void*));
-		if (__kfifo_get(tcp_ctask->r2tqueue, (void*)&r2t,
-				sizeof(void*))) {
-			tcp_ctask->r2t = r2t;
-			spin_unlock_bh(&session->lock);
-			goto send_hdr;
-		}
-		spin_unlock_bh(&session->lock);
-	}
-	return 0;
-}
-
-/**
+/*
  * iscsi_tcp_ctask_xmit - xmit normal PDU task
  * @conn: iscsi connection
  * @ctask: iscsi command task
  *
- * Notes:
- *	The function can return -EAGAIN in which case caller must
- *	call it again later, or recover. '0' return code means successful
- *	xmit.
- *	The function is devided to logical helpers (above) for the different
- *	xmit stages.
- *
- *iscsi_send_cmd_hdr()
- *	XMSTATE_CMD_HDR_INIT - prepare Header and Data buffers Calculate
- *	                       Header Digest
- *	XMSTATE_CMD_HDR_XMIT - Transmit header in progress
- *
- *iscsi_send_padding
- *	XMSTATE_W_PAD        - Prepare and send pading
- *	XMSTATE_W_RESEND_PAD - retry send pading
- *
- *iscsi_send_digest
- *	XMSTATE_W_RESEND_DATA_DIGEST - Finalize and send Data Digest
- *	XMSTATE_W_RESEND_DATA_DIGEST - retry sending digest
- *
- *iscsi_send_unsol_hdr
- *	XMSTATE_UNS_INIT     - prepare un-solicit data header and digest
- *	XMSTATE_UNS_HDR      - send un-solicit header
- *
- *iscsi_send_unsol_pdu
- *	XMSTATE_UNS_DATA     - send un-solicit data in progress
- *
- *iscsi_send_sol_pdu
- *	XMSTATE_SOL_HDR_INIT - solicit data header and digest initialize
- *	XMSTATE_SOL_HDR      - send solicit header
- *	XMSTATE_SOL_DATA     - send solicit data
- *
- *iscsi_tcp_ctask_xmit
- *	XMSTATE_IMM_DATA     - xmit managment data (??)
- **/
+ * We're expected to return 0 when everything was transmitted succesfully,
+ * -EAGAIN if there's still data in the queue, or != 0 for any other kind
+ * of error.
+ */
 static int
 iscsi_tcp_ctask_xmit(struct iscsi_conn *conn, struct iscsi_cmd_task *ctask)
 {
 	struct iscsi_tcp_cmd_task *tcp_ctask = ctask->dd_data;
+	struct scsi_cmnd *sc = ctask->sc;
 	int rc = 0;
 
-	debug_scsi("ctask deq [cid %d xmstate %x itt 0x%x]\n",
-		conn->id, tcp_ctask->xmstate, ctask->itt);
-
-	rc = iscsi_send_cmd_hdr(conn, ctask);
-	if (rc)
+flush:
+	/* Flush any pending data first. */
+	rc = iscsi_tcp_flush(conn);
+	if (rc < 0)
 		return rc;
-	if (ctask->sc->sc_data_direction != DMA_TO_DEVICE)
+
+	/* Are we done already? */
+	if (sc->sc_data_direction != DMA_TO_DEVICE)
 		return 0;
 
-	if (tcp_ctask->xmstate & XMSTATE_IMM_DATA) {
-		rc = iscsi_send_data(ctask, &tcp_ctask->sendbuf, &tcp_ctask->sg,
-				     &tcp_ctask->sent, &ctask->imm_count,
-				     &tcp_ctask->immbuf, &tcp_ctask->immdigest);
+	if (ctask->unsol_count != 0) {
+		struct iscsi_data *hdr = &tcp_ctask->unsol_dtask.hdr;
+
+		/* Prepare a header for the unsolicited PDU.
+		 * The amount of data we want to send will be
+		 * in ctask->data_count.
+		 * FIXME: return the data count instead.
+		 */
+		iscsi_prep_unsolicit_data_pdu(ctask, hdr);
+
+		debug_tcp("unsol dout [itt 0x%x doff %d dlen %d]\n",
+				ctask->itt, tcp_ctask->sent, ctask->data_count);
+
+		iscsi_tcp_send_hdr_prep(conn, hdr, sizeof(*hdr));
+		rc = iscsi_tcp_send_data_prep(conn, scsi_sglist(sc),
+					      scsi_sg_count(sc),
+					      tcp_ctask->sent,
+					      ctask->data_count);
 		if (rc)
-			return rc;
-		tcp_ctask->xmstate &= ~XMSTATE_IMM_DATA;
+			goto fail;
+		tcp_ctask->sent += ctask->data_count;
+		ctask->unsol_count -= ctask->data_count;
+		goto flush;
+	} else {
+		struct iscsi_session *session = conn->session;
+		struct iscsi_r2t_info *r2t;
+
+		/* All unsolicited PDUs sent. Check for solicited PDUs.
+		 */
+		spin_lock_bh(&session->lock);
+		r2t = tcp_ctask->r2t;
+		if (r2t != NULL) {
+			/* Continue with this R2T? */
+			if (!iscsi_solicit_data_cont(conn, ctask, r2t)) {
+				debug_scsi("  done with r2t %p\n", r2t);
+
+				__kfifo_put(tcp_ctask->r2tpool.queue,
+					    (void*)&r2t, sizeof(void*));
+				tcp_ctask->r2t = r2t = NULL;
+			}
+		}
+
+		if (r2t == NULL) {
+			__kfifo_get(tcp_ctask->r2tqueue, (void*)&tcp_ctask->r2t,
+				    sizeof(void*));
+			r2t = tcp_ctask->r2t;
+		}
+		spin_unlock_bh(&session->lock);
+
+		/* Waiting for more R2Ts to arrive. */
+		if (r2t == NULL) {
+			debug_tcp("no R2Ts yet\n");
+			return 0;
+		}
+
+		debug_scsi("sol dout %p [dsn %d itt 0x%x doff %d dlen %d]\n",
+			r2t, r2t->solicit_datasn - 1, ctask->itt,
+			r2t->data_offset + r2t->sent, r2t->data_count);
+
+		iscsi_tcp_send_hdr_prep(conn, &r2t->dtask.hdr,
+					sizeof(struct iscsi_hdr));
+
+		rc = iscsi_tcp_send_data_prep(conn, scsi_sglist(sc),
+					      scsi_sg_count(sc),
+					      r2t->data_offset + r2t->sent,
+					      r2t->data_count);
+		if (rc)
+			goto fail;
+		tcp_ctask->sent += r2t->data_count;
+		r2t->sent += r2t->data_count;
+		goto flush;
 	}
-
-	rc = iscsi_send_unsol_pdu(conn, ctask);
-	if (rc)
-		return rc;
-
-	rc = iscsi_send_sol_pdu(conn, ctask);
-	if (rc)
-		return rc;
-
-	return rc;
+	return 0;
+fail:
+	iscsi_conn_failure(conn, rc);
+	return -EIO;
 }
 
 static struct iscsi_cls_conn *
@@ -1970,10 +1682,17 @@
 
 /* called with host lock */
 static void
-iscsi_tcp_mgmt_init(struct iscsi_conn *conn, struct iscsi_mgmt_task *mtask)
+iscsi_tcp_mtask_init(struct iscsi_conn *conn, struct iscsi_mgmt_task *mtask)
 {
-	struct iscsi_tcp_mgmt_task *tcp_mtask = mtask->dd_data;
-	tcp_mtask->xmstate = XMSTATE_IMM_HDR_INIT;
+	debug_scsi("mtask deq [cid %d itt 0x%x]\n", conn->id, mtask->itt);
+
+	/* Prepare PDU, optionally w/ immediate data */
+	iscsi_tcp_send_hdr_prep(conn, mtask->hdr, sizeof(*mtask->hdr));
+
+	/* If we have immediate data, attach a payload */
+	if (mtask->data_count)
+		iscsi_tcp_send_linear_data_prepare(conn, mtask->data,
+						   mtask->data_count);
 }
 
 static int
@@ -2177,7 +1896,7 @@
 		struct iscsi_mgmt_task *mtask = session->mgmt_cmds[cmd_i];
 		struct iscsi_tcp_mgmt_task *tcp_mtask = mtask->dd_data;
 
-		mtask->hdr = &tcp_mtask->hdr;
+		mtask->hdr = (struct iscsi_hdr *) &tcp_mtask->hdr;
 	}
 
 	if (iscsi_r2tpool_alloc(class_to_transport_session(cls_session)))
@@ -2274,8 +1993,8 @@
 	/* IO */
 	.send_pdu		= iscsi_conn_send_pdu,
 	.get_stats		= iscsi_conn_get_stats,
-	.init_cmd_task		= iscsi_tcp_cmd_init,
-	.init_mgmt_task		= iscsi_tcp_mgmt_init,
+	.init_cmd_task		= iscsi_tcp_ctask_init,
+	.init_mgmt_task		= iscsi_tcp_mtask_init,
 	.xmit_cmd_task		= iscsi_tcp_ctask_xmit,
 	.xmit_mgmt_task		= iscsi_tcp_mtask_xmit,
 	.cleanup_cmd_task	= iscsi_tcp_cleanup_ctask,