/*
 * background writeback - scan btree for dirty data and write it to the backing
 * device
 *
 * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com>
 * Copyright 2012 Google, Inc.
 */

#include "bcache.h"
#include "btree.h"
#include "debug.h"

#include <trace/events/bcache.h>

static struct workqueue_struct *dirty_wq;

static void read_dirty(struct closure *);

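/*
 * Per-extent writeback state: allocated in read_dirty() with its bio_vecs
 * placed inline after the bio, and freed by dirty_io_destructor() once the
 * write to the backing device has completed.
 */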
struct dirty_io {
	struct closure		cl;
	struct cached_dev	*dc;
	struct bio		bio;
};

/* Rate limiting */

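/*
 * Proportional-derivative controller for the writeback rate: the target is
 * writeback_percent of the cache, scaled by this backing device's share of
 * cached_dev_sectors, and the error is how far the current dirty count
 * (plus a smoothed derivative term) sits from that target.  The resulting
 * rate is clamped to [1, NSEC_PER_MSEC] and consumed by writeback_delay().
 * Called under dc->writeback_lock, from update_writeback_rate().
 */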
static void __update_writeback_rate(struct cached_dev *dc)
{
	struct cache_set *c = dc->disk.c;
	uint64_t cache_sectors = c->nbuckets * c->sb.bucket_size;
	uint64_t cache_dirty_target =
		div_u64(cache_sectors * dc->writeback_percent, 100);

	int64_t target = div64_u64(cache_dirty_target * bdev_sectors(dc->bdev),
				   c->cached_dev_sectors);

	/* PD controller */

	int change = 0;
	int64_t error;
	int64_t dirty = atomic_long_read(&dc->disk.sectors_dirty);
	int64_t derivative = dirty - dc->disk.sectors_dirty_last;

	dc->disk.sectors_dirty_last = dirty;

	derivative *= dc->writeback_rate_d_term;
	derivative = clamp(derivative, -dirty, dirty);

	derivative = ewma_add(dc->disk.sectors_dirty_derivative, derivative,
			      dc->writeback_rate_d_smooth, 0);

	/* Avoid divide by zero */
	if (!target)
		goto out;

	error = div64_s64((dirty + derivative - target) << 8, target);

	change = div_s64((dc->writeback_rate.rate * error) >> 8,
			 dc->writeback_rate_p_term_inverse);

	/* Don't increase writeback rate if the device isn't keeping up */
	if (change > 0 &&
	    time_after64(local_clock(),
			 dc->writeback_rate.next + 10 * NSEC_PER_MSEC))
		change = 0;

	dc->writeback_rate.rate =
		clamp_t(int64_t, dc->writeback_rate.rate + change,
			1, NSEC_PER_MSEC);
out:
	dc->writeback_rate_derivative = derivative;
	dc->writeback_rate_change = change;
	dc->writeback_rate_target = target;

	schedule_delayed_work(&dc->writeback_rate_update,
			      dc->writeback_rate_update_seconds * HZ);
}

static void update_writeback_rate(struct work_struct *work)
{
	struct cached_dev *dc = container_of(to_delayed_work(work),
					     struct cached_dev,
					     writeback_rate_update);

	down_read(&dc->writeback_lock);

	if (atomic_read(&dc->has_dirty) &&
	    dc->writeback_percent)
		__update_writeback_rate(dc);

	up_read(&dc->writeback_lock);
}

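/*
 * Convert the number of sectors just written back into a delay (in jiffies)
 * against the rate chosen by the PD controller.  No throttling while the
 * device is detaching, or when writeback_percent is 0 - in that case
 * writeback runs unthrottled at idle IO priority (see dirty_init()).
 */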
static unsigned writeback_delay(struct cached_dev *dc, unsigned sectors)
{
	if (atomic_read(&dc->disk.detaching) ||
	    !dc->writeback_percent)
		return 0;

	return bch_next_delay(&dc->writeback_rate, sectors * 10000000ULL);
}

/* Background writeback */

static bool dirty_pred(struct keybuf *buf, struct bkey *k)
{
	return KEY_DIRTY(k);
}

static void dirty_init(struct keybuf_key *w)
{
	struct dirty_io *io = w->private;
	struct bio *bio = &io->bio;

	bio_init(bio);
	if (!io->dc->writeback_percent)
		bio_set_prio(bio, IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0));

	bio->bi_size		= KEY_SIZE(&w->key) << 9;
	bio->bi_max_vecs	= DIV_ROUND_UP(KEY_SIZE(&w->key), PAGE_SECTORS);
	bio->bi_private		= w;
	bio->bi_io_vec		= bio->bi_inline_vecs;
	bch_bio_map(bio, NULL);
}

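/*
 * Scan the btree for dirty keys belonging to this backing device and pull
 * them into dc->writeback_keys, then hand off to read_dirty() to write them
 * out.  After a complete pass over the device's keyspace the next scan is
 * delayed by writeback_delay seconds; if that pass found nothing dirty at
 * all, has_dirty is cleared and the ref taken in bch_writeback_add() is
 * dropped.
 */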
static void refill_dirty(struct closure *cl)
{
	struct cached_dev *dc = container_of(cl, struct cached_dev,
					     writeback.cl);
	struct keybuf *buf = &dc->writeback_keys;
	bool searched_from_start = false;
	struct bkey end = MAX_KEY;
	SET_KEY_INODE(&end, dc->disk.id);

	if (!atomic_read(&dc->disk.detaching) &&
	    !dc->writeback_running)
		closure_return(cl);

	down_write(&dc->writeback_lock);

	if (!atomic_read(&dc->has_dirty)) {
		SET_BDEV_STATE(&dc->sb, BDEV_STATE_CLEAN);
		bch_write_bdev_super(dc, NULL);

		up_write(&dc->writeback_lock);
		closure_return(cl);
	}

	if (bkey_cmp(&buf->last_scanned, &end) >= 0) {
		buf->last_scanned = KEY(dc->disk.id, 0, 0);
		searched_from_start = true;
	}

	bch_refill_keybuf(dc->disk.c, buf, &end);

	if (bkey_cmp(&buf->last_scanned, &end) >= 0 && searched_from_start) {
		/* Searched the entire btree - delay awhile */

		if (RB_EMPTY_ROOT(&buf->keys)) {
			atomic_set(&dc->has_dirty, 0);
			cached_dev_put(dc);
		}

		if (!atomic_read(&dc->disk.detaching))
			closure_delay(&dc->writeback, dc->writeback_delay * HZ);
	}

	up_write(&dc->writeback_lock);

	ratelimit_reset(&dc->writeback_rate);

	/* Punt to workqueue only so we don't recurse and blow the stack */
	continue_at(cl, read_dirty, dirty_wq);
}

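/*
 * Kick off the writeback closure for this device; a no-op if writeback is
 * already running, since closure_trylock() will fail in that case.
 */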
void bch_writeback_queue(struct cached_dev *dc)
{
	if (closure_trylock(&dc->writeback.cl, &dc->disk.cl)) {
		if (!atomic_read(&dc->disk.detaching))
			closure_delay(&dc->writeback, dc->writeback_delay * HZ);

		continue_at(&dc->writeback.cl, refill_dirty, dirty_wq);
	}
}

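/*
 * Called when dirty data is added to the cache: account the new dirty
 * sectors, and on the clean -> dirty transition take a ref on the device,
 * mark the backing superblock BDEV_STATE_DIRTY, queue writeback and start
 * the periodic rate updates.
 */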
void bch_writeback_add(struct cached_dev *dc, unsigned sectors)
{
	atomic_long_add(sectors, &dc->disk.sectors_dirty);

	if (!atomic_read(&dc->has_dirty) &&
	    !atomic_xchg(&dc->has_dirty, 1)) {
		atomic_inc(&dc->count);

		if (BDEV_STATE(&dc->sb) != BDEV_STATE_DIRTY) {
			SET_BDEV_STATE(&dc->sb, BDEV_STATE_DIRTY);
			/* XXX: should do this synchronously */
			bch_write_bdev_super(dc, NULL);
		}

		bch_writeback_queue(dc);

		if (dc->writeback_percent)
			schedule_delayed_work(&dc->writeback_rate_update,
				      dc->writeback_rate_update_seconds * HZ);
	}
}

/* Background writeback - IO loop */

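/*
 * The IO loop is a pipeline of closures bounced through dirty_wq:
 * read_dirty() reads a dirty extent from the cache device, write_dirty()
 * writes it out to the backing device, and write_dirty_finish() updates
 * the btree to mark the key clean before releasing the keybuf entry.
 */
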
static void dirty_io_destructor(struct closure *cl)
{
	struct dirty_io *io = container_of(cl, struct dirty_io, cl);
	kfree(io);
}

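/*
 * The write to the backing device has completed: free the pages allocated
 * in read_dirty() and, if the key is still marked dirty (i.e. neither IO
 * leg failed - see dirty_endio()), insert a clean copy of it with
 * BTREE_REPLACE, so that a foreground write which raced with writeback
 * shows up as an insert collision instead of being marked clean.
 */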
static void write_dirty_finish(struct closure *cl)
{
	struct dirty_io *io = container_of(cl, struct dirty_io, cl);
	struct keybuf_key *w = io->bio.bi_private;
	struct cached_dev *dc = io->dc;
	struct bio_vec *bv = bio_iovec_idx(&io->bio, io->bio.bi_vcnt);

	while (bv-- != io->bio.bi_io_vec)
		__free_page(bv->bv_page);

	/* This is kind of a dumb way of signalling errors. */
	if (KEY_DIRTY(&w->key)) {
		unsigned i;
		struct btree_op op;
		bch_btree_op_init_stack(&op);

		op.type = BTREE_REPLACE;
		bkey_copy(&op.replace, &w->key);

		SET_KEY_DIRTY(&w->key, false);
		bch_keylist_add(&op.keys, &w->key);

		for (i = 0; i < KEY_PTRS(&w->key); i++)
			atomic_inc(&PTR_BUCKET(dc->disk.c, &w->key, i)->pin);

		bch_btree_insert(&op, dc->disk.c);
		closure_sync(&op.cl);

		if (op.insert_collision)
			trace_bcache_writeback_collision(&w->key);

		atomic_long_inc(op.insert_collision
				? &dc->disk.c->writeback_keys_failed
				: &dc->disk.c->writeback_keys_done);
	}

	bch_keybuf_del(&dc->writeback_keys, w);
	atomic_dec_bug(&dc->in_flight);

	closure_wake_up(&dc->writeback_wait);

	closure_return_with_destructor(cl, dirty_io_destructor);
}

static void dirty_endio(struct bio *bio, int error)
{
	struct keybuf_key *w = bio->bi_private;
	struct dirty_io *io = w->private;

	if (error)
		SET_KEY_DIRTY(&w->key, false);

	closure_put(&io->cl);
}

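/*
 * The read from the cache device finished: reinitialize the bio, keeping
 * the pages that already hold the data, and submit it as a write to the
 * backing device at KEY_START of the key.
 */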
static void write_dirty(struct closure *cl)
{
	struct dirty_io *io = container_of(cl, struct dirty_io, cl);
	struct keybuf_key *w = io->bio.bi_private;

	dirty_init(w);
	io->bio.bi_rw		= WRITE;
	io->bio.bi_sector	= KEY_START(&w->key);
	io->bio.bi_bdev		= io->dc->bdev;
	io->bio.bi_end_io	= dirty_endio;

	closure_bio_submit(&io->bio, cl, &io->dc->disk);

	continue_at(cl, write_dirty_finish, dirty_wq);
}

static void read_dirty_endio(struct bio *bio, int error)
{
	struct keybuf_key *w = bio->bi_private;
	struct dirty_io *io = w->private;

	bch_count_io_errors(PTR_CACHE(io->dc->disk.c, &w->key, 0),
			    error, "reading dirty data from cache");

	dirty_endio(bio, error);
}

static void read_dirty_submit(struct closure *cl)
{
	struct dirty_io *io = container_of(cl, struct dirty_io, cl);

	closure_bio_submit(&io->bio, cl, &io->dc->disk);

	continue_at(cl, write_dirty, dirty_wq);
}

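/*
 * Main writeback loop: for each key in the keybuf, allocate a dirty_io,
 * read the extent from the cache and (via read_dirty_submit() and
 * write_dirty()) write it back out.  Sequential extents with a small
 * enough delay are issued back to back; otherwise the loop waits out the
 * delay computed by writeback_delay(), and at most 64 IOs are kept in
 * flight at a time.
 */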
static void read_dirty(struct closure *cl)
{
	struct cached_dev *dc = container_of(cl, struct cached_dev,
					     writeback.cl);
	unsigned delay = writeback_delay(dc, 0);
	struct keybuf_key *w;
	struct dirty_io *io;

	/*
	 * XXX: if we error, background writeback just spins. Should use some
	 * mempools.
	 */

	while (1) {
		w = bch_keybuf_next(&dc->writeback_keys);
		if (!w)
			break;

		BUG_ON(ptr_stale(dc->disk.c, &w->key, 0));

		if (delay > 0 &&
		    (KEY_START(&w->key) != dc->last_read ||
		     jiffies_to_msecs(delay) > 50)) {
			w->private = NULL;

			closure_delay(&dc->writeback, delay);
			continue_at(cl, read_dirty, dirty_wq);
		}

		dc->last_read	= KEY_OFFSET(&w->key);

		io = kzalloc(sizeof(struct dirty_io) + sizeof(struct bio_vec)
			     * DIV_ROUND_UP(KEY_SIZE(&w->key), PAGE_SECTORS),
			     GFP_KERNEL);
		if (!io)
			goto err;

		w->private	= io;
		io->dc		= dc;

		dirty_init(w);
		io->bio.bi_sector	= PTR_OFFSET(&w->key, 0);
		io->bio.bi_bdev		= PTR_CACHE(dc->disk.c,
						    &w->key, 0)->bdev;
		io->bio.bi_rw		= READ;
		io->bio.bi_end_io	= read_dirty_endio;

		if (bch_bio_alloc_pages(&io->bio, GFP_KERNEL))
			goto err_free;

		trace_bcache_writeback(&w->key);

		closure_call(&io->cl, read_dirty_submit, NULL, &dc->disk.cl);

		delay = writeback_delay(dc, KEY_SIZE(&w->key));

		atomic_inc(&dc->in_flight);

		if (!closure_wait_event(&dc->writeback_wait, cl,
					atomic_read(&dc->in_flight) < 64))
			continue_at(cl, read_dirty, dirty_wq);
	}

	if (0) {
err_free:
		kfree(w->private);
err:
		bch_keybuf_del(&dc->writeback_keys, w);
	}

	refill_dirty(cl);
}

/* Init */

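/*
 * Recursively walk the btree, summing the sizes of this device's dirty
 * keys into disk.sectors_dirty so the dirty-sector count starts out
 * accurate instead of at zero (see bch_sectors_dirty_init()).
 */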
static int bch_btree_sectors_dirty_init(struct btree *b, struct btree_op *op,
					struct cached_dev *dc)
{
	struct bkey *k;
	struct btree_iter iter;

	bch_btree_iter_init(b, &iter, &KEY(dc->disk.id, 0, 0));
	while ((k = bch_btree_iter_next_filter(&iter, b, bch_ptr_bad)))
		if (!b->level) {
			if (KEY_INODE(k) > dc->disk.id)
				break;

			if (KEY_DIRTY(k))
				atomic_long_add(KEY_SIZE(k),
						&dc->disk.sectors_dirty);
		} else {
			btree(sectors_dirty_init, k, b, op, dc);
			if (KEY_INODE(k) > dc->disk.id)
				break;

			cond_resched();
		}

	return 0;
}

void bch_sectors_dirty_init(struct cached_dev *dc)
{
	struct btree_op op;

	bch_btree_op_init_stack(&op);
	btree_root(sectors_dirty_init, dc->disk.c, &op, dc);
}

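/*
 * Set up per-device writeback state and defaults: a 10% dirty target, a
 * 30 second scan delay and rate-update interval, and the initial PD
 * controller terms.  The periodic rate update work is scheduled here.
 */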
void bch_cached_dev_writeback_init(struct cached_dev *dc)
{
	closure_init_unlocked(&dc->writeback);
	init_rwsem(&dc->writeback_lock);

	bch_keybuf_init(&dc->writeback_keys, dirty_pred);

	dc->writeback_metadata		= true;
	dc->writeback_running		= true;
	dc->writeback_percent		= 10;
	dc->writeback_delay		= 30;
	dc->writeback_rate.rate		= 1024;

	dc->writeback_rate_update_seconds = 30;
	dc->writeback_rate_d_term	= 16;
	dc->writeback_rate_p_term_inverse = 64;
	dc->writeback_rate_d_smooth	= 8;

	INIT_DELAYED_WORK(&dc->writeback_rate_update, update_writeback_rate);
	schedule_delayed_work(&dc->writeback_rate_update,
			      dc->writeback_rate_update_seconds * HZ);
}

void bch_writeback_exit(void)
{
	if (dirty_wq)
		destroy_workqueue(dirty_wq);
}

int __init bch_writeback_init(void)
{
	dirty_wq = create_singlethread_workqueue("bcache_writeback");
	if (!dirty_wq)
		return -ENOMEM;

	return 0;
}