blob: 1ab122a7576497ea5430e9bf9daacca78d009683 [file] [log] [blame]
Joe Thornberc6b4fcb2013-03-01 22:45:51 +00001/*
2 * Copyright (C) 2012 Red Hat. All rights reserved.
3 *
4 * This file is released under the GPL.
5 */
6
7#include "dm.h"
8#include "dm-bio-prison.h"
Darrick J. Wongb844fe62013-04-05 15:36:32 +01009#include "dm-bio-record.h"
Joe Thornberc6b4fcb2013-03-01 22:45:51 +000010#include "dm-cache-metadata.h"
11
12#include <linux/dm-io.h>
13#include <linux/dm-kcopyd.h>
14#include <linux/init.h>
15#include <linux/mempool.h>
16#include <linux/module.h>
17#include <linux/slab.h>
18#include <linux/vmalloc.h>
19
20#define DM_MSG_PREFIX "cache"
21
22DECLARE_DM_KCOPYD_THROTTLE_WITH_MODULE_PARM(cache_copy_throttle,
23 "A percentage of time allocated for copying to and/or from cache");
24
25/*----------------------------------------------------------------*/
26
27/*
28 * Glossary:
29 *
30 * oblock: index of an origin block
31 * cblock: index of a cache block
32 * promotion: movement of a block from origin to cache
33 * demotion: movement of a block from cache to origin
34 * migration: movement of a block between the origin and cache device,
35 * either direction
36 */
37
38/*----------------------------------------------------------------*/
39
40static size_t bitset_size_in_bytes(unsigned nr_entries)
41{
42 return sizeof(unsigned long) * dm_div_up(nr_entries, BITS_PER_LONG);
43}
44
45static unsigned long *alloc_bitset(unsigned nr_entries)
46{
47 size_t s = bitset_size_in_bytes(nr_entries);
48 return vzalloc(s);
49}
50
51static void clear_bitset(void *bitset, unsigned nr_entries)
52{
53 size_t s = bitset_size_in_bytes(nr_entries);
54 memset(bitset, 0, s);
55}
56
57static void free_bitset(unsigned long *bits)
58{
59 vfree(bits);
60}
61
62/*----------------------------------------------------------------*/
63
64#define PRISON_CELLS 1024
65#define MIGRATION_POOL_SIZE 128
66#define COMMIT_PERIOD HZ
67#define MIGRATION_COUNT_WINDOW 10
68
69/*
70 * The block size of the device holding cache data must be >= 32KB
71 */
72#define DATA_DEV_BLOCK_SIZE_MIN_SECTORS (32 * 1024 >> SECTOR_SHIFT)
73
74/*
75 * FIXME: the cache is read/write for the time being.
76 */
77enum cache_mode {
78 CM_WRITE, /* metadata may be changed */
79 CM_READ_ONLY, /* metadata may not be changed */
80};
81
82struct cache_features {
83 enum cache_mode mode;
84 bool write_through:1;
85};
86
87struct cache_stats {
88 atomic_t read_hit;
89 atomic_t read_miss;
90 atomic_t write_hit;
91 atomic_t write_miss;
92 atomic_t demotion;
93 atomic_t promotion;
94 atomic_t copies_avoided;
95 atomic_t cache_cell_clash;
96 atomic_t commit_count;
97 atomic_t discard_count;
98};
99
100struct cache {
101 struct dm_target *ti;
102 struct dm_target_callbacks callbacks;
103
104 /*
105 * Metadata is written to this device.
106 */
107 struct dm_dev *metadata_dev;
108
109 /*
110 * The slower of the two data devices. Typically a spindle.
111 */
112 struct dm_dev *origin_dev;
113
114 /*
115 * The faster of the two data devices. Typically an SSD.
116 */
117 struct dm_dev *cache_dev;
118
119 /*
120 * Cache features such as write-through.
121 */
122 struct cache_features features;
123
124 /*
125 * Size of the origin device in _complete_ blocks and native sectors.
126 */
127 dm_oblock_t origin_blocks;
128 sector_t origin_sectors;
129
130 /*
131 * Size of the cache device in blocks.
132 */
133 dm_cblock_t cache_size;
134
135 /*
136 * Fields for converting from sectors to blocks.
137 */
138 uint32_t sectors_per_block;
139 int sectors_per_block_shift;
140
141 struct dm_cache_metadata *cmd;
142
143 spinlock_t lock;
144 struct bio_list deferred_bios;
145 struct bio_list deferred_flush_bios;
Joe Thornbere2e74d62013-03-20 17:21:27 +0000146 struct bio_list deferred_writethrough_bios;
Joe Thornberc6b4fcb2013-03-01 22:45:51 +0000147 struct list_head quiesced_migrations;
148 struct list_head completed_migrations;
149 struct list_head need_commit_migrations;
150 sector_t migration_threshold;
151 atomic_t nr_migrations;
152 wait_queue_head_t migration_wait;
153
154 /*
155 * cache_size entries, dirty if set
156 */
157 dm_cblock_t nr_dirty;
158 unsigned long *dirty_bitset;
159
160 /*
161 * origin_blocks entries, discarded if set.
162 */
Joe Thornber414dd672013-03-20 17:21:25 +0000163 uint32_t discard_block_size; /* a power of 2 times sectors per block */
Joe Thornberc6b4fcb2013-03-01 22:45:51 +0000164 dm_dblock_t discard_nr_blocks;
165 unsigned long *discard_bitset;
166
167 struct dm_kcopyd_client *copier;
168 struct workqueue_struct *wq;
169 struct work_struct worker;
170
171 struct delayed_work waker;
172 unsigned long last_commit_jiffies;
173
174 struct dm_bio_prison *prison;
175 struct dm_deferred_set *all_io_ds;
176
177 mempool_t *migration_pool;
178 struct dm_cache_migration *next_migration;
179
180 struct dm_cache_policy *policy;
181 unsigned policy_nr_args;
182
183 bool need_tick_bio:1;
184 bool sized:1;
185 bool quiescing:1;
186 bool commit_requested:1;
187 bool loaded_mappings:1;
188 bool loaded_discards:1;
189
190 struct cache_stats stats;
191
192 /*
193 * Rather than reconstructing the table line for the status we just
194 * save it and regurgitate.
195 */
196 unsigned nr_ctr_args;
197 const char **ctr_args;
198};
199
200struct per_bio_data {
201 bool tick:1;
202 unsigned req_nr:2;
203 struct dm_deferred_entry *all_io_entry;
Joe Thornbere2e74d62013-03-20 17:21:27 +0000204
205 /* writethrough fields */
206 struct cache *cache;
207 dm_cblock_t cblock;
208 bio_end_io_t *saved_bi_end_io;
Darrick J. Wongb844fe62013-04-05 15:36:32 +0100209 struct dm_bio_details bio_details;
Joe Thornberc6b4fcb2013-03-01 22:45:51 +0000210};
211
212struct dm_cache_migration {
213 struct list_head list;
214 struct cache *cache;
215
216 unsigned long start_jiffies;
217 dm_oblock_t old_oblock;
218 dm_oblock_t new_oblock;
219 dm_cblock_t cblock;
220
221 bool err:1;
222 bool writeback:1;
223 bool demote:1;
224 bool promote:1;
225
226 struct dm_bio_prison_cell *old_ocell;
227 struct dm_bio_prison_cell *new_ocell;
228};
229
230/*
231 * Processing a bio in the worker thread may require these memory
232 * allocations. We prealloc to avoid deadlocks (the same worker thread
233 * frees them back to the mempool).
234 */
235struct prealloc {
236 struct dm_cache_migration *mg;
237 struct dm_bio_prison_cell *cell1;
238 struct dm_bio_prison_cell *cell2;
239};
240
241static void wake_worker(struct cache *cache)
242{
243 queue_work(cache->wq, &cache->worker);
244}
245
246/*----------------------------------------------------------------*/
247
248static struct dm_bio_prison_cell *alloc_prison_cell(struct cache *cache)
249{
250 /* FIXME: change to use a local slab. */
251 return dm_bio_prison_alloc_cell(cache->prison, GFP_NOWAIT);
252}
253
254static void free_prison_cell(struct cache *cache, struct dm_bio_prison_cell *cell)
255{
256 dm_bio_prison_free_cell(cache->prison, cell);
257}
258
259static int prealloc_data_structs(struct cache *cache, struct prealloc *p)
260{
261 if (!p->mg) {
262 p->mg = mempool_alloc(cache->migration_pool, GFP_NOWAIT);
263 if (!p->mg)
264 return -ENOMEM;
265 }
266
267 if (!p->cell1) {
268 p->cell1 = alloc_prison_cell(cache);
269 if (!p->cell1)
270 return -ENOMEM;
271 }
272
273 if (!p->cell2) {
274 p->cell2 = alloc_prison_cell(cache);
275 if (!p->cell2)
276 return -ENOMEM;
277 }
278
279 return 0;
280}
281
282static void prealloc_free_structs(struct cache *cache, struct prealloc *p)
283{
284 if (p->cell2)
285 free_prison_cell(cache, p->cell2);
286
287 if (p->cell1)
288 free_prison_cell(cache, p->cell1);
289
290 if (p->mg)
291 mempool_free(p->mg, cache->migration_pool);
292}
293
294static struct dm_cache_migration *prealloc_get_migration(struct prealloc *p)
295{
296 struct dm_cache_migration *mg = p->mg;
297
298 BUG_ON(!mg);
299 p->mg = NULL;
300
301 return mg;
302}
303
304/*
305 * You must have a cell within the prealloc struct to return. If not this
306 * function will BUG() rather than returning NULL.
307 */
308static struct dm_bio_prison_cell *prealloc_get_cell(struct prealloc *p)
309{
310 struct dm_bio_prison_cell *r = NULL;
311
312 if (p->cell1) {
313 r = p->cell1;
314 p->cell1 = NULL;
315
316 } else if (p->cell2) {
317 r = p->cell2;
318 p->cell2 = NULL;
319 } else
320 BUG();
321
322 return r;
323}
324
325/*
326 * You can't have more than two cells in a prealloc struct. BUG() will be
327 * called if you try and overfill.
328 */
329static void prealloc_put_cell(struct prealloc *p, struct dm_bio_prison_cell *cell)
330{
331 if (!p->cell2)
332 p->cell2 = cell;
333
334 else if (!p->cell1)
335 p->cell1 = cell;
336
337 else
338 BUG();
339}
340
341/*----------------------------------------------------------------*/
342
343static void build_key(dm_oblock_t oblock, struct dm_cell_key *key)
344{
345 key->virtual = 0;
346 key->dev = 0;
347 key->block = from_oblock(oblock);
348}
349
350/*
351 * The caller hands in a preallocated cell, and a free function for it.
352 * The cell will be freed if there's an error, or if it wasn't used because
353 * a cell with that key already exists.
354 */
355typedef void (*cell_free_fn)(void *context, struct dm_bio_prison_cell *cell);
356
357static int bio_detain(struct cache *cache, dm_oblock_t oblock,
358 struct bio *bio, struct dm_bio_prison_cell *cell_prealloc,
359 cell_free_fn free_fn, void *free_context,
360 struct dm_bio_prison_cell **cell_result)
361{
362 int r;
363 struct dm_cell_key key;
364
365 build_key(oblock, &key);
366 r = dm_bio_detain(cache->prison, &key, bio, cell_prealloc, cell_result);
367 if (r)
368 free_fn(free_context, cell_prealloc);
369
370 return r;
371}
372
373static int get_cell(struct cache *cache,
374 dm_oblock_t oblock,
375 struct prealloc *structs,
376 struct dm_bio_prison_cell **cell_result)
377{
378 int r;
379 struct dm_cell_key key;
380 struct dm_bio_prison_cell *cell_prealloc;
381
382 cell_prealloc = prealloc_get_cell(structs);
383
384 build_key(oblock, &key);
385 r = dm_get_cell(cache->prison, &key, cell_prealloc, cell_result);
386 if (r)
387 prealloc_put_cell(structs, cell_prealloc);
388
389 return r;
390}
391
392 /*----------------------------------------------------------------*/
393
394static bool is_dirty(struct cache *cache, dm_cblock_t b)
395{
396 return test_bit(from_cblock(b), cache->dirty_bitset);
397}
398
399static void set_dirty(struct cache *cache, dm_oblock_t oblock, dm_cblock_t cblock)
400{
401 if (!test_and_set_bit(from_cblock(cblock), cache->dirty_bitset)) {
402 cache->nr_dirty = to_cblock(from_cblock(cache->nr_dirty) + 1);
403 policy_set_dirty(cache->policy, oblock);
404 }
405}
406
407static void clear_dirty(struct cache *cache, dm_oblock_t oblock, dm_cblock_t cblock)
408{
409 if (test_and_clear_bit(from_cblock(cblock), cache->dirty_bitset)) {
410 policy_clear_dirty(cache->policy, oblock);
411 cache->nr_dirty = to_cblock(from_cblock(cache->nr_dirty) - 1);
412 if (!from_cblock(cache->nr_dirty))
413 dm_table_event(cache->ti->table);
414 }
415}
416
417/*----------------------------------------------------------------*/
418static bool block_size_is_power_of_two(struct cache *cache)
419{
420 return cache->sectors_per_block_shift >= 0;
421}
422
Joe Thornber414dd672013-03-20 17:21:25 +0000423static dm_block_t block_div(dm_block_t b, uint32_t n)
424{
425 do_div(b, n);
426
427 return b;
428}
429
Joe Thornberc6b4fcb2013-03-01 22:45:51 +0000430static dm_dblock_t oblock_to_dblock(struct cache *cache, dm_oblock_t oblock)
431{
Joe Thornber414dd672013-03-20 17:21:25 +0000432 uint32_t discard_blocks = cache->discard_block_size;
Joe Thornberc6b4fcb2013-03-01 22:45:51 +0000433 dm_block_t b = from_oblock(oblock);
434
435 if (!block_size_is_power_of_two(cache))
Joe Thornber414dd672013-03-20 17:21:25 +0000436 discard_blocks = discard_blocks / cache->sectors_per_block;
Joe Thornberc6b4fcb2013-03-01 22:45:51 +0000437 else
438 discard_blocks >>= cache->sectors_per_block_shift;
439
Joe Thornber414dd672013-03-20 17:21:25 +0000440 b = block_div(b, discard_blocks);
Joe Thornberc6b4fcb2013-03-01 22:45:51 +0000441
442 return to_dblock(b);
443}
444
445static void set_discard(struct cache *cache, dm_dblock_t b)
446{
447 unsigned long flags;
448
449 atomic_inc(&cache->stats.discard_count);
450
451 spin_lock_irqsave(&cache->lock, flags);
452 set_bit(from_dblock(b), cache->discard_bitset);
453 spin_unlock_irqrestore(&cache->lock, flags);
454}
455
456static void clear_discard(struct cache *cache, dm_dblock_t b)
457{
458 unsigned long flags;
459
460 spin_lock_irqsave(&cache->lock, flags);
461 clear_bit(from_dblock(b), cache->discard_bitset);
462 spin_unlock_irqrestore(&cache->lock, flags);
463}
464
465static bool is_discarded(struct cache *cache, dm_dblock_t b)
466{
467 int r;
468 unsigned long flags;
469
470 spin_lock_irqsave(&cache->lock, flags);
471 r = test_bit(from_dblock(b), cache->discard_bitset);
472 spin_unlock_irqrestore(&cache->lock, flags);
473
474 return r;
475}
476
477static bool is_discarded_oblock(struct cache *cache, dm_oblock_t b)
478{
479 int r;
480 unsigned long flags;
481
482 spin_lock_irqsave(&cache->lock, flags);
483 r = test_bit(from_dblock(oblock_to_dblock(cache, b)),
484 cache->discard_bitset);
485 spin_unlock_irqrestore(&cache->lock, flags);
486
487 return r;
488}
489
490/*----------------------------------------------------------------*/
491
492static void load_stats(struct cache *cache)
493{
494 struct dm_cache_statistics stats;
495
496 dm_cache_metadata_get_stats(cache->cmd, &stats);
497 atomic_set(&cache->stats.read_hit, stats.read_hits);
498 atomic_set(&cache->stats.read_miss, stats.read_misses);
499 atomic_set(&cache->stats.write_hit, stats.write_hits);
500 atomic_set(&cache->stats.write_miss, stats.write_misses);
501}
502
503static void save_stats(struct cache *cache)
504{
505 struct dm_cache_statistics stats;
506
507 stats.read_hits = atomic_read(&cache->stats.read_hit);
508 stats.read_misses = atomic_read(&cache->stats.read_miss);
509 stats.write_hits = atomic_read(&cache->stats.write_hit);
510 stats.write_misses = atomic_read(&cache->stats.write_miss);
511
512 dm_cache_metadata_set_stats(cache->cmd, &stats);
513}
514
515/*----------------------------------------------------------------
516 * Per bio data
517 *--------------------------------------------------------------*/
518static struct per_bio_data *get_per_bio_data(struct bio *bio)
519{
520 struct per_bio_data *pb = dm_per_bio_data(bio, sizeof(struct per_bio_data));
521 BUG_ON(!pb);
522 return pb;
523}
524
525static struct per_bio_data *init_per_bio_data(struct bio *bio)
526{
527 struct per_bio_data *pb = get_per_bio_data(bio);
528
529 pb->tick = false;
530 pb->req_nr = dm_bio_get_target_bio_nr(bio);
531 pb->all_io_entry = NULL;
532
533 return pb;
534}
535
536/*----------------------------------------------------------------
537 * Remapping
538 *--------------------------------------------------------------*/
539static void remap_to_origin(struct cache *cache, struct bio *bio)
540{
541 bio->bi_bdev = cache->origin_dev->bdev;
542}
543
544static void remap_to_cache(struct cache *cache, struct bio *bio,
545 dm_cblock_t cblock)
546{
547 sector_t bi_sector = bio->bi_sector;
548
549 bio->bi_bdev = cache->cache_dev->bdev;
550 if (!block_size_is_power_of_two(cache))
551 bio->bi_sector = (from_cblock(cblock) * cache->sectors_per_block) +
552 sector_div(bi_sector, cache->sectors_per_block);
553 else
554 bio->bi_sector = (from_cblock(cblock) << cache->sectors_per_block_shift) |
555 (bi_sector & (cache->sectors_per_block - 1));
556}
557
558static void check_if_tick_bio_needed(struct cache *cache, struct bio *bio)
559{
560 unsigned long flags;
561 struct per_bio_data *pb = get_per_bio_data(bio);
562
563 spin_lock_irqsave(&cache->lock, flags);
564 if (cache->need_tick_bio &&
565 !(bio->bi_rw & (REQ_FUA | REQ_FLUSH | REQ_DISCARD))) {
566 pb->tick = true;
567 cache->need_tick_bio = false;
568 }
569 spin_unlock_irqrestore(&cache->lock, flags);
570}
571
572static void remap_to_origin_clear_discard(struct cache *cache, struct bio *bio,
573 dm_oblock_t oblock)
574{
575 check_if_tick_bio_needed(cache, bio);
576 remap_to_origin(cache, bio);
577 if (bio_data_dir(bio) == WRITE)
578 clear_discard(cache, oblock_to_dblock(cache, oblock));
579}
580
581static void remap_to_cache_dirty(struct cache *cache, struct bio *bio,
582 dm_oblock_t oblock, dm_cblock_t cblock)
583{
584 remap_to_cache(cache, bio, cblock);
585 if (bio_data_dir(bio) == WRITE) {
586 set_dirty(cache, oblock, cblock);
587 clear_discard(cache, oblock_to_dblock(cache, oblock));
588 }
589}
590
591static dm_oblock_t get_bio_block(struct cache *cache, struct bio *bio)
592{
593 sector_t block_nr = bio->bi_sector;
594
595 if (!block_size_is_power_of_two(cache))
596 (void) sector_div(block_nr, cache->sectors_per_block);
597 else
598 block_nr >>= cache->sectors_per_block_shift;
599
600 return to_oblock(block_nr);
601}
602
603static int bio_triggers_commit(struct cache *cache, struct bio *bio)
604{
605 return bio->bi_rw & (REQ_FLUSH | REQ_FUA);
606}
607
608static void issue(struct cache *cache, struct bio *bio)
609{
610 unsigned long flags;
611
612 if (!bio_triggers_commit(cache, bio)) {
613 generic_make_request(bio);
614 return;
615 }
616
617 /*
618 * Batch together any bios that trigger commits and then issue a
619 * single commit for them in do_worker().
620 */
621 spin_lock_irqsave(&cache->lock, flags);
622 cache->commit_requested = true;
623 bio_list_add(&cache->deferred_flush_bios, bio);
624 spin_unlock_irqrestore(&cache->lock, flags);
625}
626
Joe Thornbere2e74d62013-03-20 17:21:27 +0000627static void defer_writethrough_bio(struct cache *cache, struct bio *bio)
628{
629 unsigned long flags;
630
631 spin_lock_irqsave(&cache->lock, flags);
632 bio_list_add(&cache->deferred_writethrough_bios, bio);
633 spin_unlock_irqrestore(&cache->lock, flags);
634
635 wake_worker(cache);
636}
637
638static void writethrough_endio(struct bio *bio, int err)
639{
640 struct per_bio_data *pb = get_per_bio_data(bio);
641 bio->bi_end_io = pb->saved_bi_end_io;
642
643 if (err) {
644 bio_endio(bio, err);
645 return;
646 }
647
Darrick J. Wongb844fe62013-04-05 15:36:32 +0100648 dm_bio_restore(&pb->bio_details, bio);
Joe Thornbere2e74d62013-03-20 17:21:27 +0000649 remap_to_cache(pb->cache, bio, pb->cblock);
650
651 /*
652 * We can't issue this bio directly, since we're in interrupt
653 * context. So it get's put on a bio list for processing by the
654 * worker thread.
655 */
656 defer_writethrough_bio(pb->cache, bio);
657}
658
659/*
660 * When running in writethrough mode we need to send writes to clean blocks
661 * to both the cache and origin devices. In future we'd like to clone the
662 * bio and send them in parallel, but for now we're doing them in
663 * series as this is easier.
664 */
665static void remap_to_origin_then_cache(struct cache *cache, struct bio *bio,
666 dm_oblock_t oblock, dm_cblock_t cblock)
667{
668 struct per_bio_data *pb = get_per_bio_data(bio);
669
670 pb->cache = cache;
671 pb->cblock = cblock;
672 pb->saved_bi_end_io = bio->bi_end_io;
Darrick J. Wongb844fe62013-04-05 15:36:32 +0100673 dm_bio_record(&pb->bio_details, bio);
Joe Thornbere2e74d62013-03-20 17:21:27 +0000674 bio->bi_end_io = writethrough_endio;
675
676 remap_to_origin_clear_discard(pb->cache, bio, oblock);
677}
678
Joe Thornberc6b4fcb2013-03-01 22:45:51 +0000679/*----------------------------------------------------------------
680 * Migration processing
681 *
682 * Migration covers moving data from the origin device to the cache, or
683 * vice versa.
684 *--------------------------------------------------------------*/
685static void free_migration(struct dm_cache_migration *mg)
686{
687 mempool_free(mg, mg->cache->migration_pool);
688}
689
690static void inc_nr_migrations(struct cache *cache)
691{
692 atomic_inc(&cache->nr_migrations);
693}
694
695static void dec_nr_migrations(struct cache *cache)
696{
697 atomic_dec(&cache->nr_migrations);
698
699 /*
700 * Wake the worker in case we're suspending the target.
701 */
702 wake_up(&cache->migration_wait);
703}
704
705static void __cell_defer(struct cache *cache, struct dm_bio_prison_cell *cell,
706 bool holder)
707{
708 (holder ? dm_cell_release : dm_cell_release_no_holder)
709 (cache->prison, cell, &cache->deferred_bios);
710 free_prison_cell(cache, cell);
711}
712
713static void cell_defer(struct cache *cache, struct dm_bio_prison_cell *cell,
714 bool holder)
715{
716 unsigned long flags;
717
718 spin_lock_irqsave(&cache->lock, flags);
719 __cell_defer(cache, cell, holder);
720 spin_unlock_irqrestore(&cache->lock, flags);
721
722 wake_worker(cache);
723}
724
725static void cleanup_migration(struct dm_cache_migration *mg)
726{
727 dec_nr_migrations(mg->cache);
728 free_migration(mg);
729}
730
731static void migration_failure(struct dm_cache_migration *mg)
732{
733 struct cache *cache = mg->cache;
734
735 if (mg->writeback) {
736 DMWARN_LIMIT("writeback failed; couldn't copy block");
737 set_dirty(cache, mg->old_oblock, mg->cblock);
738 cell_defer(cache, mg->old_ocell, false);
739
740 } else if (mg->demote) {
741 DMWARN_LIMIT("demotion failed; couldn't copy block");
742 policy_force_mapping(cache->policy, mg->new_oblock, mg->old_oblock);
743
744 cell_defer(cache, mg->old_ocell, mg->promote ? 0 : 1);
745 if (mg->promote)
746 cell_defer(cache, mg->new_ocell, 1);
747 } else {
748 DMWARN_LIMIT("promotion failed; couldn't copy block");
749 policy_remove_mapping(cache->policy, mg->new_oblock);
750 cell_defer(cache, mg->new_ocell, 1);
751 }
752
753 cleanup_migration(mg);
754}
755
756static void migration_success_pre_commit(struct dm_cache_migration *mg)
757{
758 unsigned long flags;
759 struct cache *cache = mg->cache;
760
761 if (mg->writeback) {
762 cell_defer(cache, mg->old_ocell, false);
763 clear_dirty(cache, mg->old_oblock, mg->cblock);
764 cleanup_migration(mg);
765 return;
766
767 } else if (mg->demote) {
768 if (dm_cache_remove_mapping(cache->cmd, mg->cblock)) {
769 DMWARN_LIMIT("demotion failed; couldn't update on disk metadata");
770 policy_force_mapping(cache->policy, mg->new_oblock,
771 mg->old_oblock);
772 if (mg->promote)
773 cell_defer(cache, mg->new_ocell, true);
774 cleanup_migration(mg);
775 return;
776 }
777 } else {
778 if (dm_cache_insert_mapping(cache->cmd, mg->cblock, mg->new_oblock)) {
779 DMWARN_LIMIT("promotion failed; couldn't update on disk metadata");
780 policy_remove_mapping(cache->policy, mg->new_oblock);
781 cleanup_migration(mg);
782 return;
783 }
784 }
785
786 spin_lock_irqsave(&cache->lock, flags);
787 list_add_tail(&mg->list, &cache->need_commit_migrations);
788 cache->commit_requested = true;
789 spin_unlock_irqrestore(&cache->lock, flags);
790}
791
792static void migration_success_post_commit(struct dm_cache_migration *mg)
793{
794 unsigned long flags;
795 struct cache *cache = mg->cache;
796
797 if (mg->writeback) {
798 DMWARN("writeback unexpectedly triggered commit");
799 return;
800
801 } else if (mg->demote) {
802 cell_defer(cache, mg->old_ocell, mg->promote ? 0 : 1);
803
804 if (mg->promote) {
805 mg->demote = false;
806
807 spin_lock_irqsave(&cache->lock, flags);
808 list_add_tail(&mg->list, &cache->quiesced_migrations);
809 spin_unlock_irqrestore(&cache->lock, flags);
810
811 } else
812 cleanup_migration(mg);
813
814 } else {
815 cell_defer(cache, mg->new_ocell, true);
816 clear_dirty(cache, mg->new_oblock, mg->cblock);
817 cleanup_migration(mg);
818 }
819}
820
821static void copy_complete(int read_err, unsigned long write_err, void *context)
822{
823 unsigned long flags;
824 struct dm_cache_migration *mg = (struct dm_cache_migration *) context;
825 struct cache *cache = mg->cache;
826
827 if (read_err || write_err)
828 mg->err = true;
829
830 spin_lock_irqsave(&cache->lock, flags);
831 list_add_tail(&mg->list, &cache->completed_migrations);
832 spin_unlock_irqrestore(&cache->lock, flags);
833
834 wake_worker(cache);
835}
836
837static void issue_copy_real(struct dm_cache_migration *mg)
838{
839 int r;
840 struct dm_io_region o_region, c_region;
841 struct cache *cache = mg->cache;
842
843 o_region.bdev = cache->origin_dev->bdev;
844 o_region.count = cache->sectors_per_block;
845
846 c_region.bdev = cache->cache_dev->bdev;
847 c_region.sector = from_cblock(mg->cblock) * cache->sectors_per_block;
848 c_region.count = cache->sectors_per_block;
849
850 if (mg->writeback || mg->demote) {
851 /* demote */
852 o_region.sector = from_oblock(mg->old_oblock) * cache->sectors_per_block;
853 r = dm_kcopyd_copy(cache->copier, &c_region, 1, &o_region, 0, copy_complete, mg);
854 } else {
855 /* promote */
856 o_region.sector = from_oblock(mg->new_oblock) * cache->sectors_per_block;
857 r = dm_kcopyd_copy(cache->copier, &o_region, 1, &c_region, 0, copy_complete, mg);
858 }
859
860 if (r < 0)
861 migration_failure(mg);
862}
863
864static void avoid_copy(struct dm_cache_migration *mg)
865{
866 atomic_inc(&mg->cache->stats.copies_avoided);
867 migration_success_pre_commit(mg);
868}
869
870static void issue_copy(struct dm_cache_migration *mg)
871{
872 bool avoid;
873 struct cache *cache = mg->cache;
874
875 if (mg->writeback || mg->demote)
876 avoid = !is_dirty(cache, mg->cblock) ||
877 is_discarded_oblock(cache, mg->old_oblock);
878 else
879 avoid = is_discarded_oblock(cache, mg->new_oblock);
880
881 avoid ? avoid_copy(mg) : issue_copy_real(mg);
882}
883
884static void complete_migration(struct dm_cache_migration *mg)
885{
886 if (mg->err)
887 migration_failure(mg);
888 else
889 migration_success_pre_commit(mg);
890}
891
892static void process_migrations(struct cache *cache, struct list_head *head,
893 void (*fn)(struct dm_cache_migration *))
894{
895 unsigned long flags;
896 struct list_head list;
897 struct dm_cache_migration *mg, *tmp;
898
899 INIT_LIST_HEAD(&list);
900 spin_lock_irqsave(&cache->lock, flags);
901 list_splice_init(head, &list);
902 spin_unlock_irqrestore(&cache->lock, flags);
903
904 list_for_each_entry_safe(mg, tmp, &list, list)
905 fn(mg);
906}
907
908static void __queue_quiesced_migration(struct dm_cache_migration *mg)
909{
910 list_add_tail(&mg->list, &mg->cache->quiesced_migrations);
911}
912
913static void queue_quiesced_migration(struct dm_cache_migration *mg)
914{
915 unsigned long flags;
916 struct cache *cache = mg->cache;
917
918 spin_lock_irqsave(&cache->lock, flags);
919 __queue_quiesced_migration(mg);
920 spin_unlock_irqrestore(&cache->lock, flags);
921
922 wake_worker(cache);
923}
924
925static void queue_quiesced_migrations(struct cache *cache, struct list_head *work)
926{
927 unsigned long flags;
928 struct dm_cache_migration *mg, *tmp;
929
930 spin_lock_irqsave(&cache->lock, flags);
931 list_for_each_entry_safe(mg, tmp, work, list)
932 __queue_quiesced_migration(mg);
933 spin_unlock_irqrestore(&cache->lock, flags);
934
935 wake_worker(cache);
936}
937
938static void check_for_quiesced_migrations(struct cache *cache,
939 struct per_bio_data *pb)
940{
941 struct list_head work;
942
943 if (!pb->all_io_entry)
944 return;
945
946 INIT_LIST_HEAD(&work);
947 if (pb->all_io_entry)
948 dm_deferred_entry_dec(pb->all_io_entry, &work);
949
950 if (!list_empty(&work))
951 queue_quiesced_migrations(cache, &work);
952}
953
954static void quiesce_migration(struct dm_cache_migration *mg)
955{
956 if (!dm_deferred_set_add_work(mg->cache->all_io_ds, &mg->list))
957 queue_quiesced_migration(mg);
958}
959
960static void promote(struct cache *cache, struct prealloc *structs,
961 dm_oblock_t oblock, dm_cblock_t cblock,
962 struct dm_bio_prison_cell *cell)
963{
964 struct dm_cache_migration *mg = prealloc_get_migration(structs);
965
966 mg->err = false;
967 mg->writeback = false;
968 mg->demote = false;
969 mg->promote = true;
970 mg->cache = cache;
971 mg->new_oblock = oblock;
972 mg->cblock = cblock;
973 mg->old_ocell = NULL;
974 mg->new_ocell = cell;
975 mg->start_jiffies = jiffies;
976
977 inc_nr_migrations(cache);
978 quiesce_migration(mg);
979}
980
981static void writeback(struct cache *cache, struct prealloc *structs,
982 dm_oblock_t oblock, dm_cblock_t cblock,
983 struct dm_bio_prison_cell *cell)
984{
985 struct dm_cache_migration *mg = prealloc_get_migration(structs);
986
987 mg->err = false;
988 mg->writeback = true;
989 mg->demote = false;
990 mg->promote = false;
991 mg->cache = cache;
992 mg->old_oblock = oblock;
993 mg->cblock = cblock;
994 mg->old_ocell = cell;
995 mg->new_ocell = NULL;
996 mg->start_jiffies = jiffies;
997
998 inc_nr_migrations(cache);
999 quiesce_migration(mg);
1000}
1001
1002static void demote_then_promote(struct cache *cache, struct prealloc *structs,
1003 dm_oblock_t old_oblock, dm_oblock_t new_oblock,
1004 dm_cblock_t cblock,
1005 struct dm_bio_prison_cell *old_ocell,
1006 struct dm_bio_prison_cell *new_ocell)
1007{
1008 struct dm_cache_migration *mg = prealloc_get_migration(structs);
1009
1010 mg->err = false;
1011 mg->writeback = false;
1012 mg->demote = true;
1013 mg->promote = true;
1014 mg->cache = cache;
1015 mg->old_oblock = old_oblock;
1016 mg->new_oblock = new_oblock;
1017 mg->cblock = cblock;
1018 mg->old_ocell = old_ocell;
1019 mg->new_ocell = new_ocell;
1020 mg->start_jiffies = jiffies;
1021
1022 inc_nr_migrations(cache);
1023 quiesce_migration(mg);
1024}
1025
1026/*----------------------------------------------------------------
1027 * bio processing
1028 *--------------------------------------------------------------*/
1029static void defer_bio(struct cache *cache, struct bio *bio)
1030{
1031 unsigned long flags;
1032
1033 spin_lock_irqsave(&cache->lock, flags);
1034 bio_list_add(&cache->deferred_bios, bio);
1035 spin_unlock_irqrestore(&cache->lock, flags);
1036
1037 wake_worker(cache);
1038}
1039
1040static void process_flush_bio(struct cache *cache, struct bio *bio)
1041{
1042 struct per_bio_data *pb = get_per_bio_data(bio);
1043
1044 BUG_ON(bio->bi_size);
1045 if (!pb->req_nr)
1046 remap_to_origin(cache, bio);
1047 else
1048 remap_to_cache(cache, bio, 0);
1049
1050 issue(cache, bio);
1051}
1052
1053/*
1054 * People generally discard large parts of a device, eg, the whole device
1055 * when formatting. Splitting these large discards up into cache block
1056 * sized ios and then quiescing (always neccessary for discard) takes too
1057 * long.
1058 *
1059 * We keep it simple, and allow any size of discard to come in, and just
1060 * mark off blocks on the discard bitset. No passdown occurs!
1061 *
1062 * To implement passdown we need to change the bio_prison such that a cell
1063 * can have a key that spans many blocks.
1064 */
1065static void process_discard_bio(struct cache *cache, struct bio *bio)
1066{
1067 dm_block_t start_block = dm_sector_div_up(bio->bi_sector,
1068 cache->discard_block_size);
1069 dm_block_t end_block = bio->bi_sector + bio_sectors(bio);
1070 dm_block_t b;
1071
Joe Thornber414dd672013-03-20 17:21:25 +00001072 end_block = block_div(end_block, cache->discard_block_size);
Joe Thornberc6b4fcb2013-03-01 22:45:51 +00001073
1074 for (b = start_block; b < end_block; b++)
1075 set_discard(cache, to_dblock(b));
1076
1077 bio_endio(bio, 0);
1078}
1079
1080static bool spare_migration_bandwidth(struct cache *cache)
1081{
1082 sector_t current_volume = (atomic_read(&cache->nr_migrations) + 1) *
1083 cache->sectors_per_block;
1084 return current_volume < cache->migration_threshold;
1085}
1086
1087static bool is_writethrough_io(struct cache *cache, struct bio *bio,
1088 dm_cblock_t cblock)
1089{
1090 return bio_data_dir(bio) == WRITE &&
1091 cache->features.write_through && !is_dirty(cache, cblock);
1092}
1093
1094static void inc_hit_counter(struct cache *cache, struct bio *bio)
1095{
1096 atomic_inc(bio_data_dir(bio) == READ ?
1097 &cache->stats.read_hit : &cache->stats.write_hit);
1098}
1099
1100static void inc_miss_counter(struct cache *cache, struct bio *bio)
1101{
1102 atomic_inc(bio_data_dir(bio) == READ ?
1103 &cache->stats.read_miss : &cache->stats.write_miss);
1104}
1105
1106static void process_bio(struct cache *cache, struct prealloc *structs,
1107 struct bio *bio)
1108{
1109 int r;
1110 bool release_cell = true;
1111 dm_oblock_t block = get_bio_block(cache, bio);
1112 struct dm_bio_prison_cell *cell_prealloc, *old_ocell, *new_ocell;
1113 struct policy_result lookup_result;
1114 struct per_bio_data *pb = get_per_bio_data(bio);
1115 bool discarded_block = is_discarded_oblock(cache, block);
1116 bool can_migrate = discarded_block || spare_migration_bandwidth(cache);
1117
1118 /*
1119 * Check to see if that block is currently migrating.
1120 */
1121 cell_prealloc = prealloc_get_cell(structs);
1122 r = bio_detain(cache, block, bio, cell_prealloc,
1123 (cell_free_fn) prealloc_put_cell,
1124 structs, &new_ocell);
1125 if (r > 0)
1126 return;
1127
1128 r = policy_map(cache->policy, block, true, can_migrate, discarded_block,
1129 bio, &lookup_result);
1130
1131 if (r == -EWOULDBLOCK)
1132 /* migration has been denied */
1133 lookup_result.op = POLICY_MISS;
1134
1135 switch (lookup_result.op) {
1136 case POLICY_HIT:
1137 inc_hit_counter(cache, bio);
1138 pb->all_io_entry = dm_deferred_entry_inc(cache->all_io_ds);
1139
Joe Thornbere2e74d62013-03-20 17:21:27 +00001140 if (is_writethrough_io(cache, bio, lookup_result.cblock))
1141 remap_to_origin_then_cache(cache, bio, block, lookup_result.cblock);
1142 else
Joe Thornberc6b4fcb2013-03-01 22:45:51 +00001143 remap_to_cache_dirty(cache, bio, block, lookup_result.cblock);
1144
1145 issue(cache, bio);
1146 break;
1147
1148 case POLICY_MISS:
1149 inc_miss_counter(cache, bio);
1150 pb->all_io_entry = dm_deferred_entry_inc(cache->all_io_ds);
Joe Thornbere2e74d62013-03-20 17:21:27 +00001151 remap_to_origin_clear_discard(cache, bio, block);
1152 issue(cache, bio);
Joe Thornberc6b4fcb2013-03-01 22:45:51 +00001153 break;
1154
1155 case POLICY_NEW:
1156 atomic_inc(&cache->stats.promotion);
1157 promote(cache, structs, block, lookup_result.cblock, new_ocell);
1158 release_cell = false;
1159 break;
1160
1161 case POLICY_REPLACE:
1162 cell_prealloc = prealloc_get_cell(structs);
1163 r = bio_detain(cache, lookup_result.old_oblock, bio, cell_prealloc,
1164 (cell_free_fn) prealloc_put_cell,
1165 structs, &old_ocell);
1166 if (r > 0) {
1167 /*
1168 * We have to be careful to avoid lock inversion of
1169 * the cells. So we back off, and wait for the
1170 * old_ocell to become free.
1171 */
1172 policy_force_mapping(cache->policy, block,
1173 lookup_result.old_oblock);
1174 atomic_inc(&cache->stats.cache_cell_clash);
1175 break;
1176 }
1177 atomic_inc(&cache->stats.demotion);
1178 atomic_inc(&cache->stats.promotion);
1179
1180 demote_then_promote(cache, structs, lookup_result.old_oblock,
1181 block, lookup_result.cblock,
1182 old_ocell, new_ocell);
1183 release_cell = false;
1184 break;
1185
1186 default:
1187 DMERR_LIMIT("%s: erroring bio, unknown policy op: %u", __func__,
1188 (unsigned) lookup_result.op);
1189 bio_io_error(bio);
1190 }
1191
1192 if (release_cell)
1193 cell_defer(cache, new_ocell, false);
1194}
1195
1196static int need_commit_due_to_time(struct cache *cache)
1197{
1198 return jiffies < cache->last_commit_jiffies ||
1199 jiffies > cache->last_commit_jiffies + COMMIT_PERIOD;
1200}
1201
1202static int commit_if_needed(struct cache *cache)
1203{
1204 if (dm_cache_changed_this_transaction(cache->cmd) &&
1205 (cache->commit_requested || need_commit_due_to_time(cache))) {
1206 atomic_inc(&cache->stats.commit_count);
1207 cache->last_commit_jiffies = jiffies;
1208 cache->commit_requested = false;
1209 return dm_cache_commit(cache->cmd, false);
1210 }
1211
1212 return 0;
1213}
1214
1215static void process_deferred_bios(struct cache *cache)
1216{
1217 unsigned long flags;
1218 struct bio_list bios;
1219 struct bio *bio;
1220 struct prealloc structs;
1221
1222 memset(&structs, 0, sizeof(structs));
1223 bio_list_init(&bios);
1224
1225 spin_lock_irqsave(&cache->lock, flags);
1226 bio_list_merge(&bios, &cache->deferred_bios);
1227 bio_list_init(&cache->deferred_bios);
1228 spin_unlock_irqrestore(&cache->lock, flags);
1229
1230 while (!bio_list_empty(&bios)) {
1231 /*
1232 * If we've got no free migration structs, and processing
1233 * this bio might require one, we pause until there are some
1234 * prepared mappings to process.
1235 */
1236 if (prealloc_data_structs(cache, &structs)) {
1237 spin_lock_irqsave(&cache->lock, flags);
1238 bio_list_merge(&cache->deferred_bios, &bios);
1239 spin_unlock_irqrestore(&cache->lock, flags);
1240 break;
1241 }
1242
1243 bio = bio_list_pop(&bios);
1244
1245 if (bio->bi_rw & REQ_FLUSH)
1246 process_flush_bio(cache, bio);
1247 else if (bio->bi_rw & REQ_DISCARD)
1248 process_discard_bio(cache, bio);
1249 else
1250 process_bio(cache, &structs, bio);
1251 }
1252
1253 prealloc_free_structs(cache, &structs);
1254}
1255
1256static void process_deferred_flush_bios(struct cache *cache, bool submit_bios)
1257{
1258 unsigned long flags;
1259 struct bio_list bios;
1260 struct bio *bio;
1261
1262 bio_list_init(&bios);
1263
1264 spin_lock_irqsave(&cache->lock, flags);
1265 bio_list_merge(&bios, &cache->deferred_flush_bios);
1266 bio_list_init(&cache->deferred_flush_bios);
1267 spin_unlock_irqrestore(&cache->lock, flags);
1268
1269 while ((bio = bio_list_pop(&bios)))
1270 submit_bios ? generic_make_request(bio) : bio_io_error(bio);
1271}
1272
Joe Thornbere2e74d62013-03-20 17:21:27 +00001273static void process_deferred_writethrough_bios(struct cache *cache)
1274{
1275 unsigned long flags;
1276 struct bio_list bios;
1277 struct bio *bio;
1278
1279 bio_list_init(&bios);
1280
1281 spin_lock_irqsave(&cache->lock, flags);
1282 bio_list_merge(&bios, &cache->deferred_writethrough_bios);
1283 bio_list_init(&cache->deferred_writethrough_bios);
1284 spin_unlock_irqrestore(&cache->lock, flags);
1285
1286 while ((bio = bio_list_pop(&bios)))
1287 generic_make_request(bio);
1288}
1289
Joe Thornberc6b4fcb2013-03-01 22:45:51 +00001290static void writeback_some_dirty_blocks(struct cache *cache)
1291{
1292 int r = 0;
1293 dm_oblock_t oblock;
1294 dm_cblock_t cblock;
1295 struct prealloc structs;
1296 struct dm_bio_prison_cell *old_ocell;
1297
1298 memset(&structs, 0, sizeof(structs));
1299
1300 while (spare_migration_bandwidth(cache)) {
1301 if (prealloc_data_structs(cache, &structs))
1302 break;
1303
1304 r = policy_writeback_work(cache->policy, &oblock, &cblock);
1305 if (r)
1306 break;
1307
1308 r = get_cell(cache, oblock, &structs, &old_ocell);
1309 if (r) {
1310 policy_set_dirty(cache->policy, oblock);
1311 break;
1312 }
1313
1314 writeback(cache, &structs, oblock, cblock, old_ocell);
1315 }
1316
1317 prealloc_free_structs(cache, &structs);
1318}
1319
1320/*----------------------------------------------------------------
1321 * Main worker loop
1322 *--------------------------------------------------------------*/
1323static void start_quiescing(struct cache *cache)
1324{
1325 unsigned long flags;
1326
1327 spin_lock_irqsave(&cache->lock, flags);
1328 cache->quiescing = 1;
1329 spin_unlock_irqrestore(&cache->lock, flags);
1330}
1331
1332static void stop_quiescing(struct cache *cache)
1333{
1334 unsigned long flags;
1335
1336 spin_lock_irqsave(&cache->lock, flags);
1337 cache->quiescing = 0;
1338 spin_unlock_irqrestore(&cache->lock, flags);
1339}
1340
1341static bool is_quiescing(struct cache *cache)
1342{
1343 int r;
1344 unsigned long flags;
1345
1346 spin_lock_irqsave(&cache->lock, flags);
1347 r = cache->quiescing;
1348 spin_unlock_irqrestore(&cache->lock, flags);
1349
1350 return r;
1351}
1352
1353static void wait_for_migrations(struct cache *cache)
1354{
1355 wait_event(cache->migration_wait, !atomic_read(&cache->nr_migrations));
1356}
1357
1358static void stop_worker(struct cache *cache)
1359{
1360 cancel_delayed_work(&cache->waker);
1361 flush_workqueue(cache->wq);
1362}
1363
1364static void requeue_deferred_io(struct cache *cache)
1365{
1366 struct bio *bio;
1367 struct bio_list bios;
1368
1369 bio_list_init(&bios);
1370 bio_list_merge(&bios, &cache->deferred_bios);
1371 bio_list_init(&cache->deferred_bios);
1372
1373 while ((bio = bio_list_pop(&bios)))
1374 bio_endio(bio, DM_ENDIO_REQUEUE);
1375}
1376
1377static int more_work(struct cache *cache)
1378{
1379 if (is_quiescing(cache))
1380 return !list_empty(&cache->quiesced_migrations) ||
1381 !list_empty(&cache->completed_migrations) ||
1382 !list_empty(&cache->need_commit_migrations);
1383 else
1384 return !bio_list_empty(&cache->deferred_bios) ||
1385 !bio_list_empty(&cache->deferred_flush_bios) ||
Joe Thornbere2e74d62013-03-20 17:21:27 +00001386 !bio_list_empty(&cache->deferred_writethrough_bios) ||
Joe Thornberc6b4fcb2013-03-01 22:45:51 +00001387 !list_empty(&cache->quiesced_migrations) ||
1388 !list_empty(&cache->completed_migrations) ||
1389 !list_empty(&cache->need_commit_migrations);
1390}
1391
1392static void do_worker(struct work_struct *ws)
1393{
1394 struct cache *cache = container_of(ws, struct cache, worker);
1395
1396 do {
1397 if (!is_quiescing(cache))
1398 process_deferred_bios(cache);
1399
1400 process_migrations(cache, &cache->quiesced_migrations, issue_copy);
1401 process_migrations(cache, &cache->completed_migrations, complete_migration);
1402
1403 writeback_some_dirty_blocks(cache);
1404
Joe Thornbere2e74d62013-03-20 17:21:27 +00001405 process_deferred_writethrough_bios(cache);
1406
Joe Thornberc6b4fcb2013-03-01 22:45:51 +00001407 if (commit_if_needed(cache)) {
1408 process_deferred_flush_bios(cache, false);
1409
1410 /*
1411 * FIXME: rollback metadata or just go into a
1412 * failure mode and error everything
1413 */
1414 } else {
1415 process_deferred_flush_bios(cache, true);
1416 process_migrations(cache, &cache->need_commit_migrations,
1417 migration_success_post_commit);
1418 }
1419 } while (more_work(cache));
1420}
1421
1422/*
1423 * We want to commit periodically so that not too much
1424 * unwritten metadata builds up.
1425 */
1426static void do_waker(struct work_struct *ws)
1427{
1428 struct cache *cache = container_of(to_delayed_work(ws), struct cache, waker);
1429 wake_worker(cache);
1430 queue_delayed_work(cache->wq, &cache->waker, COMMIT_PERIOD);
1431}
1432
1433/*----------------------------------------------------------------*/
1434
1435static int is_congested(struct dm_dev *dev, int bdi_bits)
1436{
1437 struct request_queue *q = bdev_get_queue(dev->bdev);
1438 return bdi_congested(&q->backing_dev_info, bdi_bits);
1439}
1440
1441static int cache_is_congested(struct dm_target_callbacks *cb, int bdi_bits)
1442{
1443 struct cache *cache = container_of(cb, struct cache, callbacks);
1444
1445 return is_congested(cache->origin_dev, bdi_bits) ||
1446 is_congested(cache->cache_dev, bdi_bits);
1447}
1448
1449/*----------------------------------------------------------------
1450 * Target methods
1451 *--------------------------------------------------------------*/
1452
1453/*
1454 * This function gets called on the error paths of the constructor, so we
1455 * have to cope with a partially initialised struct.
1456 */
1457static void destroy(struct cache *cache)
1458{
1459 unsigned i;
1460
1461 if (cache->next_migration)
1462 mempool_free(cache->next_migration, cache->migration_pool);
1463
1464 if (cache->migration_pool)
1465 mempool_destroy(cache->migration_pool);
1466
1467 if (cache->all_io_ds)
1468 dm_deferred_set_destroy(cache->all_io_ds);
1469
1470 if (cache->prison)
1471 dm_bio_prison_destroy(cache->prison);
1472
1473 if (cache->wq)
1474 destroy_workqueue(cache->wq);
1475
1476 if (cache->dirty_bitset)
1477 free_bitset(cache->dirty_bitset);
1478
1479 if (cache->discard_bitset)
1480 free_bitset(cache->discard_bitset);
1481
1482 if (cache->copier)
1483 dm_kcopyd_client_destroy(cache->copier);
1484
1485 if (cache->cmd)
1486 dm_cache_metadata_close(cache->cmd);
1487
1488 if (cache->metadata_dev)
1489 dm_put_device(cache->ti, cache->metadata_dev);
1490
1491 if (cache->origin_dev)
1492 dm_put_device(cache->ti, cache->origin_dev);
1493
1494 if (cache->cache_dev)
1495 dm_put_device(cache->ti, cache->cache_dev);
1496
1497 if (cache->policy)
1498 dm_cache_policy_destroy(cache->policy);
1499
1500 for (i = 0; i < cache->nr_ctr_args ; i++)
1501 kfree(cache->ctr_args[i]);
1502 kfree(cache->ctr_args);
1503
1504 kfree(cache);
1505}
1506
1507static void cache_dtr(struct dm_target *ti)
1508{
1509 struct cache *cache = ti->private;
1510
1511 destroy(cache);
1512}
1513
1514static sector_t get_dev_size(struct dm_dev *dev)
1515{
1516 return i_size_read(dev->bdev->bd_inode) >> SECTOR_SHIFT;
1517}
1518
1519/*----------------------------------------------------------------*/
1520
1521/*
1522 * Construct a cache device mapping.
1523 *
1524 * cache <metadata dev> <cache dev> <origin dev> <block size>
1525 * <#feature args> [<feature arg>]*
1526 * <policy> <#policy args> [<policy arg>]*
1527 *
1528 * metadata dev : fast device holding the persistent metadata
1529 * cache dev : fast device holding cached data blocks
1530 * origin dev : slow device holding original data blocks
1531 * block size : cache unit size in sectors
1532 *
1533 * #feature args : number of feature arguments passed
1534 * feature args : writethrough. (The default is writeback.)
1535 *
1536 * policy : the replacement policy to use
1537 * #policy args : an even number of policy arguments corresponding
1538 * to key/value pairs passed to the policy
1539 * policy args : key/value pairs passed to the policy
1540 * E.g. 'sequential_threshold 1024'
1541 * See cache-policies.txt for details.
1542 *
1543 * Optional feature arguments are:
1544 * writethrough : write through caching that prohibits cache block
1545 * content from being different from origin block content.
1546 * Without this argument, the default behaviour is to write
1547 * back cache block contents later for performance reasons,
1548 * so they may differ from the corresponding origin blocks.
1549 */
1550struct cache_args {
1551 struct dm_target *ti;
1552
1553 struct dm_dev *metadata_dev;
1554
1555 struct dm_dev *cache_dev;
1556 sector_t cache_sectors;
1557
1558 struct dm_dev *origin_dev;
1559 sector_t origin_sectors;
1560
1561 uint32_t block_size;
1562
1563 const char *policy_name;
1564 int policy_argc;
1565 const char **policy_argv;
1566
1567 struct cache_features features;
1568};
1569
1570static void destroy_cache_args(struct cache_args *ca)
1571{
1572 if (ca->metadata_dev)
1573 dm_put_device(ca->ti, ca->metadata_dev);
1574
1575 if (ca->cache_dev)
1576 dm_put_device(ca->ti, ca->cache_dev);
1577
1578 if (ca->origin_dev)
1579 dm_put_device(ca->ti, ca->origin_dev);
1580
1581 kfree(ca);
1582}
1583
1584static bool at_least_one_arg(struct dm_arg_set *as, char **error)
1585{
1586 if (!as->argc) {
1587 *error = "Insufficient args";
1588 return false;
1589 }
1590
1591 return true;
1592}
1593
1594static int parse_metadata_dev(struct cache_args *ca, struct dm_arg_set *as,
1595 char **error)
1596{
1597 int r;
1598 sector_t metadata_dev_size;
1599 char b[BDEVNAME_SIZE];
1600
1601 if (!at_least_one_arg(as, error))
1602 return -EINVAL;
1603
1604 r = dm_get_device(ca->ti, dm_shift_arg(as), FMODE_READ | FMODE_WRITE,
1605 &ca->metadata_dev);
1606 if (r) {
1607 *error = "Error opening metadata device";
1608 return r;
1609 }
1610
1611 metadata_dev_size = get_dev_size(ca->metadata_dev);
1612 if (metadata_dev_size > DM_CACHE_METADATA_MAX_SECTORS_WARNING)
1613 DMWARN("Metadata device %s is larger than %u sectors: excess space will not be used.",
1614 bdevname(ca->metadata_dev->bdev, b), THIN_METADATA_MAX_SECTORS);
1615
1616 return 0;
1617}
1618
1619static int parse_cache_dev(struct cache_args *ca, struct dm_arg_set *as,
1620 char **error)
1621{
1622 int r;
1623
1624 if (!at_least_one_arg(as, error))
1625 return -EINVAL;
1626
1627 r = dm_get_device(ca->ti, dm_shift_arg(as), FMODE_READ | FMODE_WRITE,
1628 &ca->cache_dev);
1629 if (r) {
1630 *error = "Error opening cache device";
1631 return r;
1632 }
1633 ca->cache_sectors = get_dev_size(ca->cache_dev);
1634
1635 return 0;
1636}
1637
1638static int parse_origin_dev(struct cache_args *ca, struct dm_arg_set *as,
1639 char **error)
1640{
1641 int r;
1642
1643 if (!at_least_one_arg(as, error))
1644 return -EINVAL;
1645
1646 r = dm_get_device(ca->ti, dm_shift_arg(as), FMODE_READ | FMODE_WRITE,
1647 &ca->origin_dev);
1648 if (r) {
1649 *error = "Error opening origin device";
1650 return r;
1651 }
1652
1653 ca->origin_sectors = get_dev_size(ca->origin_dev);
1654 if (ca->ti->len > ca->origin_sectors) {
1655 *error = "Device size larger than cached device";
1656 return -EINVAL;
1657 }
1658
1659 return 0;
1660}
1661
1662static int parse_block_size(struct cache_args *ca, struct dm_arg_set *as,
1663 char **error)
1664{
1665 unsigned long tmp;
1666
1667 if (!at_least_one_arg(as, error))
1668 return -EINVAL;
1669
1670 if (kstrtoul(dm_shift_arg(as), 10, &tmp) || !tmp ||
1671 tmp < DATA_DEV_BLOCK_SIZE_MIN_SECTORS ||
1672 tmp & (DATA_DEV_BLOCK_SIZE_MIN_SECTORS - 1)) {
1673 *error = "Invalid data block size";
1674 return -EINVAL;
1675 }
1676
1677 if (tmp > ca->cache_sectors) {
1678 *error = "Data block size is larger than the cache device";
1679 return -EINVAL;
1680 }
1681
1682 ca->block_size = tmp;
1683
1684 return 0;
1685}
1686
1687static void init_features(struct cache_features *cf)
1688{
1689 cf->mode = CM_WRITE;
1690 cf->write_through = false;
1691}
1692
1693static int parse_features(struct cache_args *ca, struct dm_arg_set *as,
1694 char **error)
1695{
1696 static struct dm_arg _args[] = {
1697 {0, 1, "Invalid number of cache feature arguments"},
1698 };
1699
1700 int r;
1701 unsigned argc;
1702 const char *arg;
1703 struct cache_features *cf = &ca->features;
1704
1705 init_features(cf);
1706
1707 r = dm_read_arg_group(_args, as, &argc, error);
1708 if (r)
1709 return -EINVAL;
1710
1711 while (argc--) {
1712 arg = dm_shift_arg(as);
1713
1714 if (!strcasecmp(arg, "writeback"))
1715 cf->write_through = false;
1716
1717 else if (!strcasecmp(arg, "writethrough"))
1718 cf->write_through = true;
1719
1720 else {
1721 *error = "Unrecognised cache feature requested";
1722 return -EINVAL;
1723 }
1724 }
1725
1726 return 0;
1727}
1728
1729static int parse_policy(struct cache_args *ca, struct dm_arg_set *as,
1730 char **error)
1731{
1732 static struct dm_arg _args[] = {
1733 {0, 1024, "Invalid number of policy arguments"},
1734 };
1735
1736 int r;
1737
1738 if (!at_least_one_arg(as, error))
1739 return -EINVAL;
1740
1741 ca->policy_name = dm_shift_arg(as);
1742
1743 r = dm_read_arg_group(_args, as, &ca->policy_argc, error);
1744 if (r)
1745 return -EINVAL;
1746
1747 ca->policy_argv = (const char **)as->argv;
1748 dm_consume_args(as, ca->policy_argc);
1749
1750 return 0;
1751}
1752
1753static int parse_cache_args(struct cache_args *ca, int argc, char **argv,
1754 char **error)
1755{
1756 int r;
1757 struct dm_arg_set as;
1758
1759 as.argc = argc;
1760 as.argv = argv;
1761
1762 r = parse_metadata_dev(ca, &as, error);
1763 if (r)
1764 return r;
1765
1766 r = parse_cache_dev(ca, &as, error);
1767 if (r)
1768 return r;
1769
1770 r = parse_origin_dev(ca, &as, error);
1771 if (r)
1772 return r;
1773
1774 r = parse_block_size(ca, &as, error);
1775 if (r)
1776 return r;
1777
1778 r = parse_features(ca, &as, error);
1779 if (r)
1780 return r;
1781
1782 r = parse_policy(ca, &as, error);
1783 if (r)
1784 return r;
1785
1786 return 0;
1787}
1788
1789/*----------------------------------------------------------------*/
1790
1791static struct kmem_cache *migration_cache;
1792
1793static int set_config_values(struct dm_cache_policy *p, int argc, const char **argv)
1794{
1795 int r = 0;
1796
1797 if (argc & 1) {
1798 DMWARN("Odd number of policy arguments given but they should be <key> <value> pairs.");
1799 return -EINVAL;
1800 }
1801
1802 while (argc) {
1803 r = policy_set_config_value(p, argv[0], argv[1]);
1804 if (r) {
1805 DMWARN("policy_set_config_value failed: key = '%s', value = '%s'",
1806 argv[0], argv[1]);
1807 return r;
1808 }
1809
1810 argc -= 2;
1811 argv += 2;
1812 }
1813
1814 return r;
1815}
1816
1817static int create_cache_policy(struct cache *cache, struct cache_args *ca,
1818 char **error)
1819{
1820 int r;
1821
1822 cache->policy = dm_cache_policy_create(ca->policy_name,
1823 cache->cache_size,
1824 cache->origin_sectors,
1825 cache->sectors_per_block);
1826 if (!cache->policy) {
1827 *error = "Error creating cache's policy";
1828 return -ENOMEM;
1829 }
1830
1831 r = set_config_values(cache->policy, ca->policy_argc, ca->policy_argv);
Heinz Mauelshagenb9784402013-03-20 17:21:26 +00001832 if (r) {
1833 *error = "Error setting cache policy's config values";
Joe Thornberc6b4fcb2013-03-01 22:45:51 +00001834 dm_cache_policy_destroy(cache->policy);
Heinz Mauelshagenb9784402013-03-20 17:21:26 +00001835 cache->policy = NULL;
1836 }
Joe Thornberc6b4fcb2013-03-01 22:45:51 +00001837
1838 return r;
1839}
1840
1841/*
1842 * We want the discard block size to be a power of two, at least the size
1843 * of the cache block size, and have no more than 2^14 discard blocks
1844 * across the origin.
1845 */
1846#define MAX_DISCARD_BLOCKS (1 << 14)
1847
1848static bool too_many_discard_blocks(sector_t discard_block_size,
1849 sector_t origin_size)
1850{
1851 (void) sector_div(origin_size, discard_block_size);
1852
1853 return origin_size > MAX_DISCARD_BLOCKS;
1854}
1855
1856static sector_t calculate_discard_block_size(sector_t cache_block_size,
1857 sector_t origin_size)
1858{
1859 sector_t discard_block_size;
1860
1861 discard_block_size = roundup_pow_of_two(cache_block_size);
1862
1863 if (origin_size)
1864 while (too_many_discard_blocks(discard_block_size, origin_size))
1865 discard_block_size *= 2;
1866
1867 return discard_block_size;
1868}
1869
1870#define DEFAULT_MIGRATION_THRESHOLD (2048 * 100)
1871
Joe Thornberc6b4fcb2013-03-01 22:45:51 +00001872static int cache_create(struct cache_args *ca, struct cache **result)
1873{
1874 int r = 0;
1875 char **error = &ca->ti->error;
1876 struct cache *cache;
1877 struct dm_target *ti = ca->ti;
1878 dm_block_t origin_blocks;
1879 struct dm_cache_metadata *cmd;
1880 bool may_format = ca->features.mode == CM_WRITE;
1881
1882 cache = kzalloc(sizeof(*cache), GFP_KERNEL);
1883 if (!cache)
1884 return -ENOMEM;
1885
1886 cache->ti = ca->ti;
1887 ti->private = cache;
1888 ti->per_bio_data_size = sizeof(struct per_bio_data);
1889 ti->num_flush_bios = 2;
1890 ti->flush_supported = true;
1891
1892 ti->num_discard_bios = 1;
1893 ti->discards_supported = true;
1894 ti->discard_zeroes_data_unsupported = true;
1895
1896 memcpy(&cache->features, &ca->features, sizeof(cache->features));
1897
Joe Thornberc6b4fcb2013-03-01 22:45:51 +00001898 cache->callbacks.congested_fn = cache_is_congested;
1899 dm_table_add_target_callbacks(ti->table, &cache->callbacks);
1900
1901 cache->metadata_dev = ca->metadata_dev;
1902 cache->origin_dev = ca->origin_dev;
1903 cache->cache_dev = ca->cache_dev;
1904
1905 ca->metadata_dev = ca->origin_dev = ca->cache_dev = NULL;
1906
1907 /* FIXME: factor out this whole section */
1908 origin_blocks = cache->origin_sectors = ca->origin_sectors;
Joe Thornber414dd672013-03-20 17:21:25 +00001909 origin_blocks = block_div(origin_blocks, ca->block_size);
Joe Thornberc6b4fcb2013-03-01 22:45:51 +00001910 cache->origin_blocks = to_oblock(origin_blocks);
1911
1912 cache->sectors_per_block = ca->block_size;
1913 if (dm_set_target_max_io_len(ti, cache->sectors_per_block)) {
1914 r = -EINVAL;
1915 goto bad;
1916 }
1917
1918 if (ca->block_size & (ca->block_size - 1)) {
1919 dm_block_t cache_size = ca->cache_sectors;
1920
1921 cache->sectors_per_block_shift = -1;
Joe Thornber414dd672013-03-20 17:21:25 +00001922 cache_size = block_div(cache_size, ca->block_size);
Joe Thornberc6b4fcb2013-03-01 22:45:51 +00001923 cache->cache_size = to_cblock(cache_size);
1924 } else {
1925 cache->sectors_per_block_shift = __ffs(ca->block_size);
1926 cache->cache_size = to_cblock(ca->cache_sectors >> cache->sectors_per_block_shift);
1927 }
1928
1929 r = create_cache_policy(cache, ca, error);
1930 if (r)
1931 goto bad;
1932 cache->policy_nr_args = ca->policy_argc;
1933
1934 cmd = dm_cache_metadata_open(cache->metadata_dev->bdev,
1935 ca->block_size, may_format,
1936 dm_cache_policy_get_hint_size(cache->policy));
1937 if (IS_ERR(cmd)) {
1938 *error = "Error creating metadata object";
1939 r = PTR_ERR(cmd);
1940 goto bad;
1941 }
1942 cache->cmd = cmd;
1943
	spin_lock_init(&cache->lock);
	bio_list_init(&cache->deferred_bios);
	bio_list_init(&cache->deferred_flush_bios);
	bio_list_init(&cache->deferred_writethrough_bios);
	INIT_LIST_HEAD(&cache->quiesced_migrations);
	INIT_LIST_HEAD(&cache->completed_migrations);
	INIT_LIST_HEAD(&cache->need_commit_migrations);
	cache->migration_threshold = DEFAULT_MIGRATION_THRESHOLD;
	atomic_set(&cache->nr_migrations, 0);
	init_waitqueue_head(&cache->migration_wait);

	cache->nr_dirty = 0;
	cache->dirty_bitset = alloc_bitset(from_cblock(cache->cache_size));
	if (!cache->dirty_bitset) {
		*error = "could not allocate dirty bitset";
		r = -ENOMEM;
		goto bad;
	}
	clear_bitset(cache->dirty_bitset, from_cblock(cache->cache_size));

	cache->discard_block_size =
		calculate_discard_block_size(cache->sectors_per_block,
					     cache->origin_sectors);
	cache->discard_nr_blocks = oblock_to_dblock(cache, cache->origin_blocks);
	cache->discard_bitset = alloc_bitset(from_dblock(cache->discard_nr_blocks));
	if (!cache->discard_bitset) {
		*error = "could not allocate discard bitset";
		r = -ENOMEM;
		goto bad;
	}
	clear_bitset(cache->discard_bitset, from_dblock(cache->discard_nr_blocks));

	cache->copier = dm_kcopyd_client_create(&dm_kcopyd_throttle);
	if (IS_ERR(cache->copier)) {
		*error = "could not create kcopyd client";
		r = PTR_ERR(cache->copier);
		goto bad;
	}

	cache->wq = alloc_ordered_workqueue("dm-" DM_MSG_PREFIX, WQ_MEM_RECLAIM);
	if (!cache->wq) {
		*error = "could not create workqueue for metadata object";
		r = -ENOMEM;
		goto bad;
	}
	INIT_WORK(&cache->worker, do_worker);
	INIT_DELAYED_WORK(&cache->waker, do_waker);
	cache->last_commit_jiffies = jiffies;

	cache->prison = dm_bio_prison_create(PRISON_CELLS);
	if (!cache->prison) {
		*error = "could not create bio prison";
		r = -ENOMEM;
		goto bad;
	}

	cache->all_io_ds = dm_deferred_set_create();
	if (!cache->all_io_ds) {
		*error = "could not create all_io deferred set";
		r = -ENOMEM;
		goto bad;
	}

	cache->migration_pool = mempool_create_slab_pool(MIGRATION_POOL_SIZE,
							 migration_cache);
	if (!cache->migration_pool) {
		*error = "Error creating cache's migration mempool";
		r = -ENOMEM;
		goto bad;
	}

	cache->next_migration = NULL;

	cache->need_tick_bio = true;
	cache->sized = false;
	cache->quiescing = false;
	cache->commit_requested = false;
	cache->loaded_mappings = false;
	cache->loaded_discards = false;

	load_stats(cache);

	atomic_set(&cache->stats.demotion, 0);
	atomic_set(&cache->stats.promotion, 0);
	atomic_set(&cache->stats.copies_avoided, 0);
	atomic_set(&cache->stats.cache_cell_clash, 0);
	atomic_set(&cache->stats.commit_count, 0);
	atomic_set(&cache->stats.discard_count, 0);

	*result = cache;
	return 0;

bad:
	destroy(cache);
	return r;
}

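/*
 * Keep a copy of the constructor argument strings so cache_status() can
 * replay them when reporting the table line (STATUSTYPE_TABLE).
 */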
static int copy_ctr_args(struct cache *cache, int argc, const char **argv)
{
	unsigned i;
	const char **copy;

	copy = kcalloc(argc, sizeof(*copy), GFP_KERNEL);
	if (!copy)
		return -ENOMEM;
	for (i = 0; i < argc; i++) {
		copy[i] = kstrdup(argv[i], GFP_KERNEL);
		if (!copy[i]) {
			while (i--)
				kfree(copy[i]);
			kfree(copy);
			return -ENOMEM;
		}
	}

	cache->nr_ctr_args = argc;
	cache->ctr_args = copy;

	return 0;
}

static int cache_ctr(struct dm_target *ti, unsigned argc, char **argv)
{
	int r = -EINVAL;
	struct cache_args *ca;
	struct cache *cache = NULL;

	ca = kzalloc(sizeof(*ca), GFP_KERNEL);
	if (!ca) {
		ti->error = "Error allocating memory for cache";
		return -ENOMEM;
	}
	ca->ti = ti;

	r = parse_cache_args(ca, argc, argv, &ti->error);
	if (r)
		goto out;

	r = cache_create(ca, &cache);
	if (r)
		goto out;

	r = copy_ctr_args(cache, argc - 3, (const char **)argv + 3);
	if (r) {
		destroy(cache);
		goto out;
	}

	ti->private = cache;

out:
	destroy_cache_args(ca);
	return r;
}
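/*
 * Constructor usage (illustrative sketch; the authoritative description is
 * Documentation/device-mapper/cache.txt):
 *
 *   cache <metadata dev> <cache dev> <origin dev> <block size>
 *         <#feature args> [<feature arg>]*
 *         <policy> <#policy args> [<policy arg>]*
 *
 * A hypothetical dmsetup table line might look like:
 *
 *   0 41943040 cache /dev/mapper/meta /dev/mapper/ssd /dev/mapper/origin \
 *       512 1 writethrough default 0
 *
 * copy_ctr_args() above skips the first three arguments (the three devices),
 * which is why it is passed argc - 3 / argv + 3.
 */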

static int cache_map(struct dm_target *ti, struct bio *bio)
{
	struct cache *cache = ti->private;

	int r;
	dm_oblock_t block = get_bio_block(cache, bio);
	bool can_migrate = false;
	bool discarded_block;
	struct dm_bio_prison_cell *cell;
	struct policy_result lookup_result;
	struct per_bio_data *pb;

	if (from_oblock(block) > from_oblock(cache->origin_blocks)) {
		/*
		 * This can only occur if the io goes to a partial block at
		 * the end of the origin device. We don't cache these.
		 * Just remap to the origin and carry on.
		 */
		remap_to_origin_clear_discard(cache, bio, block);
		return DM_MAPIO_REMAPPED;
	}

	pb = init_per_bio_data(bio);

	if (bio->bi_rw & (REQ_FLUSH | REQ_FUA | REQ_DISCARD)) {
		defer_bio(cache, bio);
		return DM_MAPIO_SUBMITTED;
	}

	/*
	 * Check to see if that block is currently migrating.
	 */
	cell = alloc_prison_cell(cache);
	if (!cell) {
		defer_bio(cache, bio);
		return DM_MAPIO_SUBMITTED;
	}

	r = bio_detain(cache, block, bio, cell,
		       (cell_free_fn) free_prison_cell,
		       cache, &cell);
	if (r) {
		if (r < 0)
			defer_bio(cache, bio);

		return DM_MAPIO_SUBMITTED;
	}

	discarded_block = is_discarded_oblock(cache, block);

	r = policy_map(cache->policy, block, false, can_migrate, discarded_block,
		       bio, &lookup_result);
	if (r == -EWOULDBLOCK) {
		cell_defer(cache, cell, true);
		return DM_MAPIO_SUBMITTED;

	} else if (r) {
		DMERR_LIMIT("Unexpected return from cache replacement policy: %d", r);
		bio_io_error(bio);
		return DM_MAPIO_SUBMITTED;
	}

	switch (lookup_result.op) {
	case POLICY_HIT:
		inc_hit_counter(cache, bio);
		pb->all_io_entry = dm_deferred_entry_inc(cache->all_io_ds);

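		/*
		 * Writethrough handling (a sketch of the intent, inferred from
		 * remap_to_origin_then_cache() and the deferred_writethrough_bios
		 * list initialised in cache_create()): the write is sent to the
		 * origin first and then re-issued to the cache block once the
		 * origin I/O completes, so the cache copy is never newer than
		 * the origin.
		 */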
		if (is_writethrough_io(cache, bio, lookup_result.cblock))
			remap_to_origin_then_cache(cache, bio, block, lookup_result.cblock);
		else
			remap_to_cache_dirty(cache, bio, block, lookup_result.cblock);

		cell_defer(cache, cell, false);
		break;

	case POLICY_MISS:
		inc_miss_counter(cache, bio);
		pb->all_io_entry = dm_deferred_entry_inc(cache->all_io_ds);

		if (pb->req_nr != 0) {
			/*
			 * This is a duplicate writethrough io that is no
			 * longer needed because the block has been demoted.
			 */
			bio_endio(bio, 0);
			cell_defer(cache, cell, false);
			return DM_MAPIO_SUBMITTED;
		} else {
			remap_to_origin_clear_discard(cache, bio, block);
			cell_defer(cache, cell, false);
		}
		break;

	default:
		DMERR_LIMIT("%s: erroring bio: unknown policy op: %u", __func__,
			    (unsigned) lookup_result.op);
		bio_io_error(bio);
		return DM_MAPIO_SUBMITTED;
	}

	return DM_MAPIO_REMAPPED;
}

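/*
 * pb->tick appears to be set on at most one in-flight bio at a time, gated by
 * cache->need_tick_bio: when that bio completes we advance the policy's
 * notion of time and re-arm need_tick_bio so a later bio can carry the flag.
 */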
static int cache_end_io(struct dm_target *ti, struct bio *bio, int error)
{
	struct cache *cache = ti->private;
	unsigned long flags;
	struct per_bio_data *pb = get_per_bio_data(bio);

	if (pb->tick) {
		policy_tick(cache->policy);

		spin_lock_irqsave(&cache->lock, flags);
		cache->need_tick_bio = true;
		spin_unlock_irqrestore(&cache->lock, flags);
	}

	check_for_quiesced_migrations(cache, pb);

	return 0;
}

static int write_dirty_bitset(struct cache *cache)
{
	unsigned i, r;

	for (i = 0; i < from_cblock(cache->cache_size); i++) {
		r = dm_cache_set_dirty(cache->cmd, to_cblock(i),
				       is_dirty(cache, to_cblock(i)));
		if (r)
			return r;
	}

	return 0;
}

static int write_discard_bitset(struct cache *cache)
{
	unsigned i, r;

	r = dm_cache_discard_bitset_resize(cache->cmd, cache->discard_block_size,
					   cache->discard_nr_blocks);
	if (r) {
		DMERR("could not resize on-disk discard bitset");
		return r;
	}

	for (i = 0; i < from_dblock(cache->discard_nr_blocks); i++) {
		r = dm_cache_set_discard(cache->cmd, to_dblock(i),
					 is_discarded(cache, to_dblock(i)));
		if (r)
			return r;
	}

	return 0;
}

static int save_hint(void *context, dm_cblock_t cblock, dm_oblock_t oblock,
		     uint32_t hint)
{
	struct cache *cache = context;
	return dm_cache_save_hint(cache->cmd, cblock, hint);
}

static int write_hints(struct cache *cache)
{
	int r;

	r = dm_cache_begin_hints(cache->cmd, cache->policy);
	if (r) {
		DMERR("dm_cache_begin_hints failed");
		return r;
	}

	r = policy_walk_mappings(cache->policy, save_hint, cache);
	if (r)
		DMERR("policy_walk_mappings failed");

	return r;
}

/*
 * returns true on success
 */
static bool sync_metadata(struct cache *cache)
{
	int r1, r2, r3, r4;

	r1 = write_dirty_bitset(cache);
	if (r1)
		DMERR("could not write dirty bitset");

	r2 = write_discard_bitset(cache);
	if (r2)
		DMERR("could not write discard bitset");

	save_stats(cache);

	r3 = write_hints(cache);
	if (r3)
		DMERR("could not write hints");

	/*
	 * If writing the above metadata failed, we still commit, but don't
	 * set the clean shutdown flag. This will effectively force every
	 * dirty bit to be set on reload.
	 */
	r4 = dm_cache_commit(cache->cmd, !r1 && !r2 && !r3);
	if (r4)
		DMERR("could not write cache metadata. Data loss may occur.");

	return !r1 && !r2 && !r3 && !r4;
}

static void cache_postsuspend(struct dm_target *ti)
{
	struct cache *cache = ti->private;

	start_quiescing(cache);
	wait_for_migrations(cache);
	stop_worker(cache);
	requeue_deferred_io(cache);
	stop_quiescing(cache);

	(void) sync_metadata(cache);
}

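/*
 * Callbacks handed to the metadata layer from cache_preresume(): one call per
 * stored mapping / discard bit, used to rebuild the in-core policy state and
 * bitsets after a table load.
 */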
static int load_mapping(void *context, dm_oblock_t oblock, dm_cblock_t cblock,
			bool dirty, uint32_t hint, bool hint_valid)
{
	int r;
	struct cache *cache = context;

	r = policy_load_mapping(cache->policy, oblock, cblock, hint, hint_valid);
	if (r)
		return r;

	if (dirty)
		set_dirty(cache, oblock, cblock);
	else
		clear_dirty(cache, oblock, cblock);

	return 0;
}

static int load_discard(void *context, sector_t discard_block_size,
			dm_dblock_t dblock, bool discard)
{
	struct cache *cache = context;

	/* FIXME: handle mis-matched block size */

	if (discard)
		set_discard(cache, dblock);
	else
		clear_discard(cache, dblock);

	return 0;
}

static int cache_preresume(struct dm_target *ti)
{
	int r = 0;
	struct cache *cache = ti->private;
	sector_t actual_cache_size = get_dev_size(cache->cache_dev);
	(void) sector_div(actual_cache_size, cache->sectors_per_block);

	/*
	 * Check to see if the cache has resized.
	 */
	if (from_cblock(cache->cache_size) != actual_cache_size || !cache->sized) {
		cache->cache_size = to_cblock(actual_cache_size);

		r = dm_cache_resize(cache->cmd, cache->cache_size);
		if (r) {
			DMERR("could not resize cache metadata");
			return r;
		}

		cache->sized = true;
	}

	if (!cache->loaded_mappings) {
		r = dm_cache_load_mappings(cache->cmd, cache->policy,
					   load_mapping, cache);
		if (r) {
			DMERR("could not load cache mappings");
			return r;
		}

		cache->loaded_mappings = true;
	}

	if (!cache->loaded_discards) {
		r = dm_cache_load_discards(cache->cmd, load_discard, cache);
		if (r) {
			DMERR("could not load origin discards");
			return r;
		}

		cache->loaded_discards = true;
	}

	return r;
}

static void cache_resume(struct dm_target *ti)
{
	struct cache *cache = ti->private;

	cache->need_tick_bio = true;
	do_waker(&cache->waker.work);
}

/*
 * Status format:
 *
 * <#used metadata blocks>/<#total metadata blocks>
 * <#read hits> <#read misses> <#write hits> <#write misses>
 * <#demotions> <#promotions> <#blocks in cache> <#dirty>
 * <#features> <features>*
 * <#core args> <core args>
 * <#policy args> <policy args>*
 */
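/*
 * For example (purely illustrative values), a device running in writethrough
 * mode might report:
 *
 *   23/4096 156 11243 489 72 5 29 1024 3 1 writethrough 2 migration_threshold 204800
 *
 * followed by whatever key/value pairs the policy emits.
 */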
static void cache_status(struct dm_target *ti, status_type_t type,
			 unsigned status_flags, char *result, unsigned maxlen)
{
	int r = 0;
	unsigned i;
	ssize_t sz = 0;
	dm_block_t nr_free_blocks_metadata = 0;
	dm_block_t nr_blocks_metadata = 0;
	char buf[BDEVNAME_SIZE];
	struct cache *cache = ti->private;
	dm_cblock_t residency;

	switch (type) {
	case STATUSTYPE_INFO:
		/* Commit to ensure statistics aren't out-of-date */
		if (!(status_flags & DM_STATUS_NOFLUSH_FLAG) && !dm_suspended(ti)) {
			r = dm_cache_commit(cache->cmd, false);
			if (r)
				DMERR("could not commit metadata for accurate status");
		}

		r = dm_cache_get_free_metadata_block_count(cache->cmd,
							   &nr_free_blocks_metadata);
		if (r) {
			DMERR("could not get metadata free block count");
			goto err;
		}

		r = dm_cache_get_metadata_dev_size(cache->cmd, &nr_blocks_metadata);
		if (r) {
			DMERR("could not get metadata device size");
			goto err;
		}

		residency = policy_residency(cache->policy);

		DMEMIT("%llu/%llu %u %u %u %u %u %u %llu %u ",
		       (unsigned long long)(nr_blocks_metadata - nr_free_blocks_metadata),
		       (unsigned long long)nr_blocks_metadata,
		       (unsigned) atomic_read(&cache->stats.read_hit),
		       (unsigned) atomic_read(&cache->stats.read_miss),
		       (unsigned) atomic_read(&cache->stats.write_hit),
		       (unsigned) atomic_read(&cache->stats.write_miss),
		       (unsigned) atomic_read(&cache->stats.demotion),
		       (unsigned) atomic_read(&cache->stats.promotion),
		       (unsigned long long) from_cblock(residency),
		       cache->nr_dirty);

		if (cache->features.write_through)
			DMEMIT("1 writethrough ");
		else
			DMEMIT("0 ");

		DMEMIT("2 migration_threshold %llu ", (unsigned long long) cache->migration_threshold);
		if (sz < maxlen) {
			r = policy_emit_config_values(cache->policy, result + sz, maxlen - sz);
			if (r)
				DMERR("policy_emit_config_values returned %d", r);
		}

		break;

	case STATUSTYPE_TABLE:
		format_dev_t(buf, cache->metadata_dev->bdev->bd_dev);
		DMEMIT("%s ", buf);
		format_dev_t(buf, cache->cache_dev->bdev->bd_dev);
		DMEMIT("%s ", buf);
		format_dev_t(buf, cache->origin_dev->bdev->bd_dev);
		DMEMIT("%s", buf);

		for (i = 0; i < cache->nr_ctr_args - 1; i++)
			DMEMIT(" %s", cache->ctr_args[i]);
		if (cache->nr_ctr_args)
			DMEMIT(" %s", cache->ctr_args[cache->nr_ctr_args - 1]);
	}

	return;

err:
	DMEMIT("Error");
}

#define NOT_CORE_OPTION 1

static int process_config_option(struct cache *cache, char **argv)
{
	unsigned long tmp;

	if (!strcasecmp(argv[0], "migration_threshold")) {
		if (kstrtoul(argv[1], 10, &tmp))
			return -EINVAL;

		cache->migration_threshold = tmp;
		return 0;
	}

	return NOT_CORE_OPTION;
}

/*
 * Supports <key> <value>.
 *
 * The key migration_threshold is supported by the cache target core.
 */
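/*
 * For example, the migration threshold could be raised at runtime with
 * something like (hypothetical device name):
 *
 *   dmsetup message my-cache 0 migration_threshold 409600
 *
 * Any other key is passed through to the policy via
 * policy_set_config_value().
 */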
static int cache_message(struct dm_target *ti, unsigned argc, char **argv)
{
	int r;
	struct cache *cache = ti->private;

	if (argc != 2)
		return -EINVAL;

	r = process_config_option(cache, argv);
	if (r == NOT_CORE_OPTION)
		return policy_set_config_value(cache->policy, argv[0], argv[1]);

	return r;
}

static int cache_iterate_devices(struct dm_target *ti,
				 iterate_devices_callout_fn fn, void *data)
{
	int r = 0;
	struct cache *cache = ti->private;

	r = fn(ti, cache->cache_dev, 0, get_dev_size(cache->cache_dev), data);
	if (!r)
		r = fn(ti, cache->origin_dev, 0, ti->len, data);

	return r;
}

/*
 * We assume I/O is going to the origin (which is the volume
 * more likely to have restrictions e.g. by being striped).
 * (Looking up the exact location of the data would be expensive
 * and could always be out of date by the time the bio is submitted.)
 */
static int cache_bvec_merge(struct dm_target *ti,
			    struct bvec_merge_data *bvm,
			    struct bio_vec *biovec, int max_size)
{
	struct cache *cache = ti->private;
	struct request_queue *q = bdev_get_queue(cache->origin_dev->bdev);

	if (!q->merge_bvec_fn)
		return max_size;

	bvm->bi_bdev = cache->origin_dev->bdev;
	return min(max_size, q->merge_bvec_fn(q, bvm, biovec));
}

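/*
 * discard_block_size is in 512-byte sectors, so the granularity set below is
 * the discard block size in bytes, and max_discard_sectors caps a single
 * discard at 1024 discard blocks (a sketch of the intent; the FIXME inside
 * the function notes these limits may still clash with the cache device's
 * own limits).
 */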
static void set_discard_limits(struct cache *cache, struct queue_limits *limits)
{
	/*
	 * FIXME: these limits may be incompatible with the cache device
	 */
	limits->max_discard_sectors = cache->discard_block_size * 1024;
	limits->discard_granularity = cache->discard_block_size << SECTOR_SHIFT;
}

static void cache_io_hints(struct dm_target *ti, struct queue_limits *limits)
{
	struct cache *cache = ti->private;

	blk_limits_io_min(limits, 0);
	blk_limits_io_opt(limits, cache->sectors_per_block << SECTOR_SHIFT);
	set_discard_limits(cache, limits);
}

/*----------------------------------------------------------------*/

static struct target_type cache_target = {
	.name = "cache",
	.version = {1, 1, 0},
	.module = THIS_MODULE,
	.ctr = cache_ctr,
	.dtr = cache_dtr,
	.map = cache_map,
	.end_io = cache_end_io,
	.postsuspend = cache_postsuspend,
	.preresume = cache_preresume,
	.resume = cache_resume,
	.status = cache_status,
	.message = cache_message,
	.iterate_devices = cache_iterate_devices,
	.merge = cache_bvec_merge,
	.io_hints = cache_io_hints,
};
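/*
 * Once the module is loaded, dm_register_target() below makes the target
 * available by name; it should then appear as "cache v1.1.0" in the output
 * of `dmsetup targets` (assuming the standard dmsetup userspace tool).
 */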

static int __init dm_cache_init(void)
{
	int r;

	r = dm_register_target(&cache_target);
	if (r) {
		DMERR("cache target registration failed: %d", r);
		return r;
	}

	migration_cache = KMEM_CACHE(dm_cache_migration, 0);
	if (!migration_cache) {
		dm_unregister_target(&cache_target);
		return -ENOMEM;
	}

	return 0;
}

static void __exit dm_cache_exit(void)
{
	dm_unregister_target(&cache_target);
	kmem_cache_destroy(migration_cache);
}

module_init(dm_cache_init);
module_exit(dm_cache_exit);

MODULE_DESCRIPTION(DM_NAME " cache target");
MODULE_AUTHOR("Joe Thornber <ejt@redhat.com>");
MODULE_LICENSE("GPL");