ext4: Use end_io callback to avoid direct I/O fallback to buffered I/O Currently the DIO VFS code passes create = 0 when writing to the middle of file. It does this to avoid block allocation for holes, so as not to expose stale data out when there is a parallel buffered read (which does not hold the i_mutex lock). Direct I/O writes into holes falls back to buffered IO for this reason. Since preallocated extents are treated as holes when doing a get_block() look up (buffer is not mapped), direct IO over fallocate also falls back to buffered IO. Thus ext4 actually silently falls back to buffered IO in above two cases, which is undesirable. To fix this, this patch creates unitialized extents when a direct I/O write into holes in sparse files, and registering an end_io callback which converts the uninitialized extent to an initialized extent after the I/O is completed. Singed-Off-By: Mingming Cao <cmm@us.ibm.com> Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>

commit: 4c0425ff68b1b87b802ffeda7b6a46ff7da7241c [log] [tgz]
author: Mingming Cao <cmm@us.ibm.com> Mon Sep 28 15:48:41 2009 -0400
committer: Theodore Ts'o <tytso@mit.edu> Mon Sep 28 15:48:41 2009 -0400
tree: a8718f5f4574af8e15fd876b24f4aec88d62451b
parent: 0031462b5b392f90d17f1d75abb795883c44e969 [diff]
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 2b4293a..ccb4dbf 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h

@@ -999,6 +999,9 @@
 
 	unsigned int s_log_groups_per_flex;
 	struct flex_groups *s_flex_groups;
+
+	/* workqueue for dio unwritten */
+	struct workqueue_struct *dio_unwritten_wq;
 };
 
 static inline struct ext4_sb_info *EXT4_SB(struct super_block *sb)

diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index da4f2ec..5633af6 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c

@@ -37,6 +37,7 @@
 #include <linux/namei.h>
 #include <linux/uio.h>
 #include <linux/bio.h>
+#include <linux/workqueue.h>
 
 #include "ext4_jbd2.h"
 #include "xattr.h"
@@ -3356,6 +3357,8 @@
 }
 
 /*
+ * O_DIRECT for ext3 (or indirect map) based files
+ *
  * If the O_DIRECT write will extend the file then add this inode to the
  * orphan list.  So recovery will truncate it back to the original size
  * if the machine crashes during the write.
@@ -3364,7 +3367,7 @@
  * crashes then stale disk data _may_ be exposed inside the file. But current
  * VFS code falls back into buffered path in that case so we are safe.
  */
-static ssize_t ext4_direct_IO(int rw, struct kiocb *iocb,
+static ssize_t ext4_ind_direct_IO(int rw, struct kiocb *iocb,
 			      const struct iovec *iov, loff_t offset,
 			      unsigned long nr_segs)
 {
@@ -3438,6 +3441,198 @@
 	return ret;
 }
 
+/* Maximum number of blocks we map for direct IO at once. */
+
+static int ext4_get_block_dio_write(struct inode *inode, sector_t iblock,
+		   struct buffer_head *bh_result, int create)
+{
+	handle_t *handle = NULL;
+	int ret = 0;
+	unsigned max_blocks = bh_result->b_size >> inode->i_blkbits;
+	int dio_credits;
+
+	/*
+	 * DIO VFS code passes create = 0 flag for write to
+	 * the middle of file. It does this to avoid block
+	 * allocation for holes, to prevent expose stale data
+	 * out when there is parallel buffered read (which does
+	 * not hold the i_mutex lock) while direct IO write has
+	 * not completed. DIO request on holes finally falls back
+	 * to buffered IO for this reason.
+	 *
+	 * For ext4 extent based file, since we support fallocate,
+	 * new allocated extent as uninitialized, for holes, we
+	 * could fallocate blocks for holes, thus parallel
+	 * buffered IO read will zero out the page when read on
+	 * a hole while parallel DIO write to the hole has not completed.
+	 *
+	 * when we come here, we know it's a direct IO write to
+	 * to the middle of file (<i_size)
+	 * so it's safe to override the create flag from VFS.
+	 */
+	create = EXT4_GET_BLOCKS_DIO_CREATE_EXT;
+
+	if (max_blocks > DIO_MAX_BLOCKS)
+		max_blocks = DIO_MAX_BLOCKS;
+	dio_credits = ext4_chunk_trans_blocks(inode, max_blocks);
+	handle = ext4_journal_start(inode, dio_credits);
+	if (IS_ERR(handle)) {
+		ret = PTR_ERR(handle);
+		goto out;
+	}
+	ret = ext4_get_blocks(handle, inode, iblock, max_blocks, bh_result,
+			      create);
+	if (ret > 0) {
+		bh_result->b_size = (ret << inode->i_blkbits);
+		ret = 0;
+	}
+	ext4_journal_stop(handle);
+out:
+	return ret;
+}
+
+#define		DIO_AIO		0x1
+
+static void ext4_free_io_end(ext4_io_end_t *io)
+{
+	kfree(io);
+}
+
+/*
+ * IO write completion for unwritten extents.
+ *
+ * check a range of space and convert unwritten extents to written.
+ */
+static void ext4_end_dio_unwritten(struct work_struct *work)
+{
+	ext4_io_end_t *io = container_of(work, ext4_io_end_t, work);
+	struct inode *inode = io->inode;
+	loff_t offset = io->offset;
+	size_t size = io->size;
+	int ret = 0;
+	int aio = io->flag & DIO_AIO;
+
+	if (aio)
+		mutex_lock(&inode->i_mutex);
+	if (offset + size <= i_size_read(inode))
+		ret = ext4_convert_unwritten_extents(inode, offset, size);
+
+	if (ret < 0)
+		printk(KERN_EMERG "%s: failed to convert unwritten"
+			"extents to written extents, error is %d\n",
+                       __func__, ret);
+
+	ext4_free_io_end(io);
+	if (aio)
+		mutex_unlock(&inode->i_mutex);
+}
+
+static ext4_io_end_t *ext4_init_io_end (struct inode *inode, unsigned int flag)
+{
+	ext4_io_end_t *io = NULL;
+
+	io = kmalloc(sizeof(*io), GFP_NOFS);
+
+	if (io) {
+		io->inode = inode;
+		io->flag = flag;
+		io->offset = 0;
+		io->size = 0;
+		io->error = 0;
+		INIT_WORK(&io->work, ext4_end_dio_unwritten);
+	}
+
+	return io;
+}
+
+static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
+			    ssize_t size, void *private)
+{
+        ext4_io_end_t *io_end = iocb->private;
+	struct workqueue_struct *wq;
+
+	/* if not hole or unwritten extents, just simple return */
+	if (!io_end || !size || !iocb->private)
+		return;
+	io_end->offset = offset;
+	io_end->size = size;
+	wq = EXT4_SB(io_end->inode->i_sb)->dio_unwritten_wq;
+
+	/* We need to convert unwritten extents to written */
+	queue_work(wq, &io_end->work);
+
+        if (is_sync_kiocb(iocb))
+		flush_workqueue(wq);
+
+	iocb->private = NULL;
+}
+/*
+ * For ext4 extent files, ext4 will do direct-io write to holes,
+ * preallocated extents, and those write extend the file, no need to
+ * fall back to buffered IO.
+ *
+ * For holes, we fallocate those blocks, mark them as unintialized
+ * If those blocks were preallocated, we mark sure they are splited, but
+ * still keep the range to write as unintialized.
+ *
+ * When end_io call back function called at the last IO complete time,
+ * those extents will be converted to written extents.
+ *
+ * If the O_DIRECT write will extend the file then add this inode to the
+ * orphan list.  So recovery will truncate it back to the original size
+ * if the machine crashes during the write.
+ *
+ */
+static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
+			      const struct iovec *iov, loff_t offset,
+			      unsigned long nr_segs)
+{
+	struct file *file = iocb->ki_filp;
+	struct inode *inode = file->f_mapping->host;
+	ssize_t ret;
+	size_t count = iov_length(iov, nr_segs);
+
+	loff_t final_size = offset + count;
+	if (rw == WRITE && final_size <= inode->i_size) {
+		/*
+ 		 * For DIO we fallocate blocks for holes, we fallocate blocks
+ 		 * The fallocated extent for hole is marked as uninitialized
+ 		 * to prevent paralel buffered read to expose the stale data
+ 		 * before DIO complete the data IO.
+ 		 * as for previously fallocated extents, ext4 get_block
+ 		 * will just simply mark the buffer mapped but still
+ 		 * keep the extents uninitialized.
+ 		 *
+ 		 * At the end of IO, the ext4 end_io callback function
+ 		 * will convert those unwritten extents to written,
+ 		 *
+ 		 */
+		iocb->private = ext4_init_io_end(inode, !is_sync_kiocb(iocb));
+		if (!iocb->private)
+			return -ENOMEM;
+		ret = blockdev_direct_IO(rw, iocb, inode,
+					 inode->i_sb->s_bdev, iov,
+					 offset, nr_segs,
+					 ext4_get_block_dio_write,
+					 ext4_end_io_dio);
+		return ret;
+	}
+	return ext4_ind_direct_IO(rw, iocb, iov, offset, nr_segs);
+}
+
+static ssize_t ext4_direct_IO(int rw, struct kiocb *iocb,
+			      const struct iovec *iov, loff_t offset,
+			      unsigned long nr_segs)
+{
+	struct file *file = iocb->ki_filp;
+	struct inode *inode = file->f_mapping->host;
+
+	if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)
+		return ext4_ext_direct_IO(rw, iocb, iov, offset, nr_segs);
+
+	return ext4_ind_direct_IO(rw, iocb, iov, offset, nr_segs);
+}
+
 /*
  * Pages can be marked dirty completely asynchronously from ext4's journalling
  * activity.  By filemap_sync_pte(), try_to_unmap_one(), etc.  We cannot do

diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 1681773..1a03ea9 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c

@@ -580,6 +580,9 @@
 	struct ext4_super_block *es = sbi->s_es;
 	int i, err;
 
+	flush_workqueue(sbi->dio_unwritten_wq);
+	destroy_workqueue(sbi->dio_unwritten_wq);
+
 	lock_super(sb);
 	lock_kernel();
 	if (sb->s_dirt)
@@ -2801,6 +2804,12 @@
 			clear_opt(sbi->s_mount_opt, NOBH);
 		}
 	}
+	EXT4_SB(sb)->dio_unwritten_wq = create_workqueue("ext4-dio-unwritten");
+	if (!EXT4_SB(sb)->dio_unwritten_wq) {
+		printk(KERN_ERR "EXT4-fs: failed to create DIO workqueue\n");
+		goto failed_mount_wq;
+	}
+
 	/*
 	 * The jbd2_journal_load will have done any necessary log recovery,
 	 * so we can safely mount the rest of the filesystem now.
@@ -2913,6 +2922,8 @@
 
 failed_mount4:
 	ext4_msg(sb, KERN_ERR, "mount failed");
+	destroy_workqueue(EXT4_SB(sb)->dio_unwritten_wq);
+failed_mount_wq:
 	ext4_release_system_zone(sb);
 	if (sbi->s_journal) {
 		jbd2_journal_destroy(sbi->s_journal);
commit	4c0425ff68b1b87b802ffeda7b6a46ff7da7241c	[log] [tgz]
author	Mingming Cao <cmm@us.ibm.com>	Mon Sep 28 15:48:41 2009 -0400
committer	Theodore Ts'o <tytso@mit.edu>	Mon Sep 28 15:48:41 2009 -0400
tree	a8718f5f4574af8e15fd876b24f4aec88d62451b
parent	0031462b5b392f90d17f1d75abb795883c44e969 [diff]