ceph: writeback congestion control
Set the bdi congestion bit when the amount of write data in flight exceeds
an adjustable threshold.
Signed-off-by: Yehuda Sadeh <yehuda@hq.newdream.net>
Signed-off-by: Sage Weil <sage@newdream.net>
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index d0cdceb..a6850a1 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -47,6 +47,12 @@
* accounting is preserved.
*/
+/*
+ * Writeback congestion thresholds, expressed in pages.
+ *
+ * congestion_kb is given in kilobytes; shifting right by (PAGE_SHIFT-10)
+ * converts it to a page count so it can be compared against the per-page
+ * writeback counter.  The OFF threshold is the ON threshold minus a
+ * quarter of it, leaving a gap between the point where congestion is set
+ * and the point where it is cleared.
+ *
+ * Note: CONGESTION_OFF_THRESH evaluates its argument twice.
+ */
+#define CONGESTION_ON_THRESH(congestion_kb) ((congestion_kb) >> (PAGE_SHIFT-10))
+#define CONGESTION_OFF_THRESH(congestion_kb) \
+	(CONGESTION_ON_THRESH(congestion_kb) - \
+	 (CONGESTION_ON_THRESH(congestion_kb) >> 2))
+
+
/*
* Dirty a page. Optimistically adjust accounting, on the assumption
@@ -377,6 +383,7 @@
{
struct inode *inode;
struct ceph_inode_info *ci;
+ struct ceph_client *client;
struct ceph_osd_client *osdc;
loff_t page_off = page->index << PAGE_CACHE_SHIFT;
int len = PAGE_CACHE_SIZE;
@@ -384,6 +391,7 @@
int err = 0;
struct ceph_snap_context *snapc;
u64 snap_size = 0;
+ long writeback_stat;
dout("writepage %p idx %lu\n", page, page->index);
@@ -393,7 +401,8 @@
}
inode = page->mapping->host;
ci = ceph_inode(inode);
- osdc = &ceph_inode_to_client(inode)->osdc;
+ client = ceph_inode_to_client(inode);
+ osdc = &client->osdc;
/* verify this is a writeable snap context */
snapc = (void *)page->private;
@@ -420,6 +429,11 @@
dout("writepage %p page %p index %lu on %llu~%u\n",
inode, page, page->index, page_off, len);
+ writeback_stat = atomic_long_inc_return(&client->writeback_count);
+ if (writeback_stat >
+ CONGESTION_ON_THRESH(client->mount_args->congestion_kb))
+ set_bdi_congested(&client->backing_dev_info, BLK_RW_ASYNC);
+
set_page_writeback(page);
err = ceph_osdc_writepages(osdc, ceph_vino(inode),
&ci->i_layout, snapc,
@@ -499,6 +513,8 @@
struct writeback_control *wbc = req->r_wbc;
__s32 rc = -EIO;
u64 bytes = 0;
+ struct ceph_client *client = ceph_inode_to_client(inode);
+ long writeback_stat;
/* parse reply */
replyhead = msg->front.iov_base;
@@ -524,6 +540,13 @@
BUG_ON(!page);
WARN_ON(!PageUptodate(page));
+ writeback_stat =
+ atomic_long_dec_return(&client->writeback_count);
+ if (writeback_stat <
+ CONGESTION_OFF_THRESH(client->mount_args->congestion_kb))
+ clear_bdi_congested(&client->backing_dev_info,
+ BLK_RW_ASYNC);
+
if (i >= wrote) {
dout("inode %p skipping page %p\n", inode, page);
wbc->pages_skipped++;
@@ -666,6 +689,7 @@
u64 offset, len;
struct ceph_osd_request_head *reqhead;
struct ceph_osd_op *op;
+ long writeback_stat;
next = 0;
locked_pages = 0;
@@ -773,6 +797,12 @@
first = i;
dout("%p will write page %p idx %lu\n",
inode, page, page->index);
+
+ writeback_stat = atomic_long_inc_return(&client->writeback_count);
+ if (writeback_stat > CONGESTION_ON_THRESH(client->mount_args->congestion_kb)) {
+ set_bdi_congested(&client->backing_dev_info, BLK_RW_ASYNC);
+ }
+
set_page_writeback(page);
req->r_pages[locked_pages] = page;
locked_pages++;
@@ -998,7 +1028,8 @@
struct page *page, void *fsdata)
{
struct inode *inode = file->f_dentry->d_inode;
- struct ceph_mds_client *mdsc = &ceph_inode_to_client(inode)->mdsc;
+ struct ceph_client *client = ceph_inode_to_client(inode);
+ struct ceph_mds_client *mdsc = &client->mdsc;
unsigned from = pos & (PAGE_CACHE_SIZE - 1);
int check_cap = 0;
diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c
index 441484a..22d3b47 100644
--- a/fs/ceph/debugfs.c
+++ b/fs/ceph/debugfs.c
@@ -320,6 +320,30 @@
DEFINE_SHOW_FUNC(dentry_lru_show)
DEFINE_SHOW_FUNC(caps_show)
+/*
+ * debugfs write handler for the writeback congestion threshold.
+ *
+ * @data: the struct ceph_client registered with the debugfs file
+ * @val:  new threshold, in KB
+ *
+ * The threshold is stored in an int, so values that do not fit are
+ * rejected instead of being silently truncated (or turned negative) by
+ * the cast.
+ *
+ * Returns 0 on success, -EINVAL if @val is out of range.
+ */
+static int congestion_kb_set(void *data, u64 val)
+{
+	struct ceph_client *client = data;
+
+	if (val > INT_MAX)
+		return -EINVAL;
+
+	if (client)
+		client->mount_args->congestion_kb = (int)val;
+
+	return 0;
+}
+
+/*
+ * debugfs read handler: report the current writeback congestion
+ * threshold (in KB) of the client passed as @data through @val.
+ * Leaves @val untouched when no client is supplied.  Always returns 0.
+ */
+static int congestion_kb_get(void *data, u64 *val)
+{
+	struct ceph_client *cl = data;
+
+	if (!cl)
+		return 0;
+
+	*val = (u64)cl->mount_args->congestion_kb;
+	return 0;
+}
+
+
+DEFINE_SIMPLE_ATTRIBUTE(congestion_kb_fops, congestion_kb_get,
+ congestion_kb_set, "%llu\n");
+
int __init ceph_debugfs_init(void)
{
ceph_debugfs_dir = debugfs_create_dir("ceph", NULL);
@@ -409,6 +433,14 @@
if (!client->debugfs_caps)
goto out;
+ client->debugfs_congestion_kb = debugfs_create_file("writeback_congestion_kb",
+ 0600,
+ client->debugfs_dir,
+ client,
+ &congestion_kb_fops);
+ if (!client->debugfs_congestion_kb)
+ goto out;
+
sprintf(name, "../../bdi/%s", dev_name(client->sb->s_bdi->dev));
client->debugfs_bdi = debugfs_create_symlink("bdi", client->debugfs_dir,
name);
@@ -431,6 +463,7 @@
debugfs_remove(client->osdc.debugfs_file);
debugfs_remove(client->mdsc.debugfs_file);
debugfs_remove(client->monc.debugfs_file);
+ debugfs_remove(client->debugfs_congestion_kb);
debugfs_remove(client->debugfs_dir);
}
diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index 6d02a16..b9cb8ce 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -150,6 +150,35 @@
inode_init_once(&ci->vfs_inode);
}
+/*
+ * Compute the default writeback congestion threshold, in KB.
+ */
+static int default_congestion_kb(void)
+{
+	/*
+	 * Same heuristic as NFS: scale the congestion size with the
+	 * square root of available memory so that larger machines can
+	 * have larger/more transfers in flight.  With 4KB pages:
+	 *
+	 *   64MB:    8192k
+	 *  128MB:   11585k
+	 *  256MB:   16384k
+	 *  512MB:   23170k
+	 *    1GB:   32768k
+	 *    2GB:   46340k
+	 *    4GB:   65536k
+	 *    8GB:   92681k
+	 *   16GB:  131072k
+	 *
+	 * The default is capped at 256M.
+	 */
+	int kb = (16*int_sqrt(totalram_pages)) << (PAGE_SHIFT-10);
+
+	return kb > 256*1024 ? 256*1024 : kb;
+}
+
static int __init init_caches(void)
{
ceph_inode_cachep = kmem_cache_create("ceph_inode_info",
@@ -267,6 +296,7 @@
Opt_caps_wanted_delay_min,
Opt_caps_wanted_delay_max,
Opt_readdir_max_entries,
+ Opt_congestion_kb,
Opt_last_int,
/* int args above */
Opt_snapdirname,
@@ -295,6 +325,7 @@
{Opt_caps_wanted_delay_min, "caps_wanted_delay_min=%d"},
{Opt_caps_wanted_delay_max, "caps_wanted_delay_max=%d"},
{Opt_readdir_max_entries, "readdir_max_entries=%d"},
+ {Opt_congestion_kb, "write_congestion_kb=%d"},
/* int args above */
{Opt_snapdirname, "snapdirname=%s"},
{Opt_name, "name=%s"},
@@ -342,6 +373,7 @@
args->snapdir_name = kstrdup(CEPH_SNAPDIRNAME_DEFAULT, GFP_KERNEL);
args->cap_release_safety = CEPH_CAPS_PER_RELEASE * 4;
args->max_readdir = 1024;
+ args->congestion_kb = default_congestion_kb();
/* ip1[:port1][,ip2[:port2]...]:/subdir/in/fs */
err = -EINVAL;
@@ -445,6 +477,9 @@
case Opt_readdir_max_entries:
args->max_readdir = intval;
break;
+ case Opt_congestion_kb:
+ args->congestion_kb = intval;
+ break;
case Opt_noshare:
args->flags |= CEPH_OPT_NOSHARE;
@@ -516,6 +551,7 @@
client->msgr = NULL;
client->mount_err = 0;
+ atomic_long_set(&client->writeback_count, 0);
err = bdi_init(&client->backing_dev_info);
if (err < 0)
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index 2304bd2..62d9ae48 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -59,6 +59,7 @@
int wsize;
int rsize; /* max readahead */
int max_readdir; /* max readdir size */
+ int congestion_kb; /* max writeback in flight, in KB */
int osd_timeout;
char *snapdir_name; /* default ".snap" */
char *name;
@@ -136,6 +137,7 @@
struct workqueue_struct *wb_wq;
struct workqueue_struct *pg_inv_wq;
struct workqueue_struct *trunc_wq;
+ atomic_long_t writeback_count;
struct backing_dev_info backing_dev_info;
@@ -143,6 +145,7 @@
struct dentry *debugfs_monmap;
struct dentry *debugfs_mdsmap, *debugfs_osdmap;
struct dentry *debugfs_dir, *debugfs_dentry_lru, *debugfs_caps;
+ struct dentry *debugfs_congestion_kb;
struct dentry *debugfs_bdi;
#endif
};