ceph: writeback congestion control

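Set the bdi congestion bit when the amount of write data in flight exceeds
an adjustable threshold (congestion_kb), and clear it again once the amount
in flight drops below three quarters of that threshold.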

Signed-off-by: Yehuda Sadeh <yehuda@hq.newdream.net>
Signed-off-by: Sage Weil <sage@newdream.net>
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index d0cdceb..a6850a1 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -47,6 +47,12 @@
  * accounting is preserved.
  */
 
+/* Writeback congestion thresholds, in pages.  ON converts congestion_kb (KB)
+ * to pages; OFF is ON minus a quarter, so the set and clear points differ. */
+#define CONGESTION_ON_THRESH(congestion_kb) ((congestion_kb) >> (PAGE_SHIFT-10))
+#define CONGESTION_OFF_THRESH(congestion_kb)				\
+	(CONGESTION_ON_THRESH(congestion_kb) -				\
+	 (CONGESTION_ON_THRESH(congestion_kb) >> 2))
 
 /*
  * Dirty a page.  Optimistically adjust accounting, on the assumption
@@ -377,6 +383,7 @@
 {
 	struct inode *inode;
 	struct ceph_inode_info *ci;
+	struct ceph_client *client;
 	struct ceph_osd_client *osdc;
 	loff_t page_off = page->index << PAGE_CACHE_SHIFT;
 	int len = PAGE_CACHE_SIZE;
@@ -384,6 +391,7 @@
 	int err = 0;
 	struct ceph_snap_context *snapc;
 	u64 snap_size = 0;
+	long writeback_stat;
 
 	dout("writepage %p idx %lu\n", page, page->index);
 
@@ -393,7 +401,8 @@
 	}
 	inode = page->mapping->host;
 	ci = ceph_inode(inode);
-	osdc = &ceph_inode_to_client(inode)->osdc;
+	client = ceph_inode_to_client(inode);
+	osdc = &client->osdc;
 
 	/* verify this is a writeable snap context */
 	snapc = (void *)page->private;
@@ -420,6 +429,11 @@
 	dout("writepage %p page %p index %lu on %llu~%u\n",
 	     inode, page, page->index, page_off, len);
 
+	writeback_stat = atomic_long_inc_return(&client->writeback_count);
+	if (writeback_stat >
+	    CONGESTION_ON_THRESH(client->mount_args->congestion_kb))
+		set_bdi_congested(&client->backing_dev_info, BLK_RW_ASYNC);
+
 	set_page_writeback(page);
 	err = ceph_osdc_writepages(osdc, ceph_vino(inode),
 				   &ci->i_layout, snapc,
@@ -499,6 +513,8 @@
 	struct writeback_control *wbc = req->r_wbc;
 	__s32 rc = -EIO;
 	u64 bytes = 0;
+	struct ceph_client *client = ceph_inode_to_client(inode);
+	long writeback_stat;
 
 	/* parse reply */
 	replyhead = msg->front.iov_base;
@@ -524,6 +540,13 @@
 		BUG_ON(!page);
 		WARN_ON(!PageUptodate(page));
 
+		writeback_stat =
+			atomic_long_dec_return(&client->writeback_count);
+		if (writeback_stat <
+		    CONGESTION_OFF_THRESH(client->mount_args->congestion_kb))
+			clear_bdi_congested(&client->backing_dev_info,
+					    BLK_RW_ASYNC);
+
 		if (i >= wrote) {
 			dout("inode %p skipping page %p\n", inode, page);
 			wbc->pages_skipped++;
@@ -666,6 +689,7 @@
 		u64 offset, len;
 		struct ceph_osd_request_head *reqhead;
 		struct ceph_osd_op *op;
+		long writeback_stat;
 
 		next = 0;
 		locked_pages = 0;
@@ -773,6 +797,12 @@
 				first = i;
 			dout("%p will write page %p idx %lu\n",
 			     inode, page, page->index);
+
+			writeback_stat = atomic_long_inc_return(&client->writeback_count);
+			if (writeback_stat > CONGESTION_ON_THRESH(client->mount_args->congestion_kb)) {
+				set_bdi_congested(&client->backing_dev_info, BLK_RW_ASYNC);
+			}
+
 			set_page_writeback(page);
 			req->r_pages[locked_pages] = page;
 			locked_pages++;
@@ -998,7 +1028,8 @@
 			  struct page *page, void *fsdata)
 {
 	struct inode *inode = file->f_dentry->d_inode;
-	struct ceph_mds_client *mdsc = &ceph_inode_to_client(inode)->mdsc;
+	struct ceph_client *client = ceph_inode_to_client(inode);
+	struct ceph_mds_client *mdsc = &client->mdsc;
 	unsigned from = pos & (PAGE_CACHE_SIZE - 1);
 	int check_cap = 0;