blob: 484ae6c8f43a20027c88daa3ba485b26cc88a950 [file] [log] [blame]
/*
 * bcache setup/teardown code, and some metadata io - read a superblock and
 * figure out what to do with it.
 *
 * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com>
 * Copyright 2012 Google, Inc.
 */
8
9#include "bcache.h"
10#include "btree.h"
11#include "debug.h"
12#include "request.h"
13
14#include <linux/buffer_head.h>
15#include <linux/debugfs.h>
16#include <linux/genhd.h>
17#include <linux/module.h>
18#include <linux/random.h>
19#include <linux/reboot.h>
20#include <linux/sysfs.h>
21
22MODULE_LICENSE("GPL");
23MODULE_AUTHOR("Kent Overstreet <kent.overstreet@gmail.com>");
24
/*
 * 16 byte magic value; read_super() compares a superblock's magic field
 * against it to decide whether the device holds a bcache superblock.
 */
static const char bcache_magic[] = {
	0xc6, 0x85, 0x73, 0xf6,
	0x4e, 0x1a, 0x45, 0xca,
	0x82, 0x65, 0xf5, 0x7f,
	0x48, 0xba, 0x6d, 0x81,
};
29
/*
 * 16 byte value copied over a uuid entry's uuid when that entry is
 * invalidated (see bcache_device_detach() and bch_cached_dev_attach()).
 */
static const char invalid_uuid[] = {
	0xa0, 0x3e, 0xf8, 0xed,
	0x3e, 0xe1, 0xb8, 0x78,
	0xc8, 0x50, 0xfc, 0x5e,
	0xcb, 0x16, 0xcd, 0x99,
};
34
/*
 * NULL terminated table of cache mode names.
 *
 * "default" stands for -1; struct cached_dev's cache mode skips past that
 * first entry.
 */
const char * const bch_cache_modes[] = {
	[0] = "default",
	[1] = "writethrough",
	[2] = "writeback",
	[3] = "writearound",
	[4] = "none",
	[5] = NULL,
};
44
/*
 * Old layout of a uuid entry. uuid_read() converts an array of these, in
 * place, to struct uuid_entry when the journal set is older than
 * BCACHE_JSET_VERSION_UUIDv1.
 */
struct uuid_entry_v0 {
	uint8_t		uuid[16];
	uint8_t		label[32];
	uint32_t	first_reg, last_reg;
	uint32_t	invalidated;
	uint32_t	pad;
};
53
54static struct kobject *bcache_kobj;
55struct mutex bch_register_lock;
56LIST_HEAD(bch_cache_sets);
57static LIST_HEAD(uncached_devices);
58
59static int bcache_major, bcache_minor;
60static wait_queue_head_t unregister_wait;
61struct workqueue_struct *bcache_wq;
62
63#define BTREE_MAX_PAGES (256 * 1024 / PAGE_SIZE)
64
65static void bio_split_pool_free(struct bio_split_pool *p)
66{
67 if (p->bio_split)
68 bioset_free(p->bio_split);
69
70}
71
72static int bio_split_pool_init(struct bio_split_pool *p)
73{
74 p->bio_split = bioset_create(4, 0);
75 if (!p->bio_split)
76 return -ENOMEM;
77
78 p->bio_split_hook = mempool_create_kmalloc_pool(4,
79 sizeof(struct bio_split_hook));
80 if (!p->bio_split_hook)
81 return -ENOMEM;
82
83 return 0;
84}
85
86/* Superblock */
87
88static const char *read_super(struct cache_sb *sb, struct block_device *bdev,
89 struct page **res)
90{
91 const char *err;
92 struct cache_sb *s;
93 struct buffer_head *bh = __bread(bdev, 1, SB_SIZE);
94 unsigned i;
95
96 if (!bh)
97 return "IO error";
98
99 s = (struct cache_sb *) bh->b_data;
100
101 sb->offset = le64_to_cpu(s->offset);
102 sb->version = le64_to_cpu(s->version);
103
104 memcpy(sb->magic, s->magic, 16);
105 memcpy(sb->uuid, s->uuid, 16);
106 memcpy(sb->set_uuid, s->set_uuid, 16);
107 memcpy(sb->label, s->label, SB_LABEL_SIZE);
108
109 sb->flags = le64_to_cpu(s->flags);
110 sb->seq = le64_to_cpu(s->seq);
111
112 sb->nbuckets = le64_to_cpu(s->nbuckets);
113 sb->block_size = le16_to_cpu(s->block_size);
114 sb->bucket_size = le16_to_cpu(s->bucket_size);
115
116 sb->nr_in_set = le16_to_cpu(s->nr_in_set);
117 sb->nr_this_dev = le16_to_cpu(s->nr_this_dev);
118 sb->last_mount = le32_to_cpu(s->last_mount);
119
120 sb->first_bucket = le16_to_cpu(s->first_bucket);
121 sb->keys = le16_to_cpu(s->keys);
122
123 for (i = 0; i < SB_JOURNAL_BUCKETS; i++)
124 sb->d[i] = le64_to_cpu(s->d[i]);
125
126 pr_debug("read sb version %llu, flags %llu, seq %llu, journal size %u",
127 sb->version, sb->flags, sb->seq, sb->keys);
128
129 err = "Not a bcache superblock";
130 if (sb->offset != SB_SECTOR)
131 goto err;
132
133 if (memcmp(sb->magic, bcache_magic, 16))
134 goto err;
135
136 err = "Too many journal buckets";
137 if (sb->keys > SB_JOURNAL_BUCKETS)
138 goto err;
139
140 err = "Bad checksum";
141 if (s->csum != csum_set(s))
142 goto err;
143
144 err = "Bad UUID";
145 if (is_zero(sb->uuid, 16))
146 goto err;
147
148 err = "Unsupported superblock version";
149 if (sb->version > BCACHE_SB_VERSION)
150 goto err;
151
152 err = "Bad block/bucket size";
153 if (!is_power_of_2(sb->block_size) || sb->block_size > PAGE_SECTORS ||
154 !is_power_of_2(sb->bucket_size) || sb->bucket_size < PAGE_SECTORS)
155 goto err;
156
157 err = "Too many buckets";
158 if (sb->nbuckets > LONG_MAX)
159 goto err;
160
161 err = "Not enough buckets";
162 if (sb->nbuckets < 1 << 7)
163 goto err;
164
165 err = "Invalid superblock: device too small";
166 if (get_capacity(bdev->bd_disk) < sb->bucket_size * sb->nbuckets)
167 goto err;
168
169 if (sb->version == CACHE_BACKING_DEV)
170 goto out;
171
172 err = "Bad UUID";
173 if (is_zero(sb->set_uuid, 16))
174 goto err;
175
176 err = "Bad cache device number in set";
177 if (!sb->nr_in_set ||
178 sb->nr_in_set <= sb->nr_this_dev ||
179 sb->nr_in_set > MAX_CACHES_PER_SET)
180 goto err;
181
182 err = "Journal buckets not sequential";
183 for (i = 0; i < sb->keys; i++)
184 if (sb->d[i] != sb->first_bucket + i)
185 goto err;
186
187 err = "Too many journal buckets";
188 if (sb->first_bucket + sb->keys > sb->nbuckets)
189 goto err;
190
191 err = "Invalid superblock: first bucket comes before end of super";
192 if (sb->first_bucket * sb->bucket_size < 16)
193 goto err;
194out:
195 sb->last_mount = get_seconds();
196 err = NULL;
197
198 get_page(bh->b_page);
199 *res = bh->b_page;
200err:
201 put_bh(bh);
202 return err;
203}
204
205static void write_bdev_super_endio(struct bio *bio, int error)
206{
207 struct cached_dev *dc = bio->bi_private;
208 /* XXX: error checking */
209
210 closure_put(&dc->sb_write.cl);
211}
212
213static void __write_super(struct cache_sb *sb, struct bio *bio)
214{
215 struct cache_sb *out = page_address(bio->bi_io_vec[0].bv_page);
216 unsigned i;
217
218 bio->bi_sector = SB_SECTOR;
219 bio->bi_rw = REQ_SYNC|REQ_META;
220 bio->bi_size = SB_SIZE;
221 bio_map(bio, NULL);
222
223 out->offset = cpu_to_le64(sb->offset);
224 out->version = cpu_to_le64(sb->version);
225
226 memcpy(out->uuid, sb->uuid, 16);
227 memcpy(out->set_uuid, sb->set_uuid, 16);
228 memcpy(out->label, sb->label, SB_LABEL_SIZE);
229
230 out->flags = cpu_to_le64(sb->flags);
231 out->seq = cpu_to_le64(sb->seq);
232
233 out->last_mount = cpu_to_le32(sb->last_mount);
234 out->first_bucket = cpu_to_le16(sb->first_bucket);
235 out->keys = cpu_to_le16(sb->keys);
236
237 for (i = 0; i < sb->keys; i++)
238 out->d[i] = cpu_to_le64(sb->d[i]);
239
240 out->csum = csum_set(out);
241
242 pr_debug("ver %llu, flags %llu, seq %llu",
243 sb->version, sb->flags, sb->seq);
244
245 submit_bio(REQ_WRITE, bio);
246}
247
248void bch_write_bdev_super(struct cached_dev *dc, struct closure *parent)
249{
250 struct closure *cl = &dc->sb_write.cl;
251 struct bio *bio = &dc->sb_bio;
252
253 closure_lock(&dc->sb_write, parent);
254
255 bio_reset(bio);
256 bio->bi_bdev = dc->bdev;
257 bio->bi_end_io = write_bdev_super_endio;
258 bio->bi_private = dc;
259
260 closure_get(cl);
261 __write_super(&dc->sb, bio);
262
263 closure_return(cl);
264}
265
266static void write_super_endio(struct bio *bio, int error)
267{
268 struct cache *ca = bio->bi_private;
269
270 bch_count_io_errors(ca, error, "writing superblock");
271 closure_put(&ca->set->sb_write.cl);
272}
273
274void bcache_write_super(struct cache_set *c)
275{
276 struct closure *cl = &c->sb_write.cl;
277 struct cache *ca;
278 unsigned i;
279
280 closure_lock(&c->sb_write, &c->cl);
281
282 c->sb.seq++;
283
284 for_each_cache(ca, c, i) {
285 struct bio *bio = &ca->sb_bio;
286
287 ca->sb.version = BCACHE_SB_VERSION;
288 ca->sb.seq = c->sb.seq;
289 ca->sb.last_mount = c->sb.last_mount;
290
291 SET_CACHE_SYNC(&ca->sb, CACHE_SYNC(&c->sb));
292
293 bio_reset(bio);
294 bio->bi_bdev = ca->bdev;
295 bio->bi_end_io = write_super_endio;
296 bio->bi_private = ca;
297
298 closure_get(cl);
299 __write_super(&ca->sb, bio);
300 }
301
302 closure_return(cl);
303}
304
305/* UUID io */
306
307static void uuid_endio(struct bio *bio, int error)
308{
309 struct closure *cl = bio->bi_private;
310 struct cache_set *c = container_of(cl, struct cache_set, uuid_write.cl);
311
312 cache_set_err_on(error, c, "accessing uuids");
313 bch_bbio_free(bio, c);
314 closure_put(cl);
315}
316
317static void uuid_io(struct cache_set *c, unsigned long rw,
318 struct bkey *k, struct closure *parent)
319{
320 struct closure *cl = &c->uuid_write.cl;
321 struct uuid_entry *u;
322 unsigned i;
323
324 BUG_ON(!parent);
325 closure_lock(&c->uuid_write, parent);
326
327 for (i = 0; i < KEY_PTRS(k); i++) {
328 struct bio *bio = bch_bbio_alloc(c);
329
330 bio->bi_rw = REQ_SYNC|REQ_META|rw;
331 bio->bi_size = KEY_SIZE(k) << 9;
332
333 bio->bi_end_io = uuid_endio;
334 bio->bi_private = cl;
335 bio_map(bio, c->uuids);
336
337 bch_submit_bbio(bio, c, k, i);
338
339 if (!(rw & WRITE))
340 break;
341 }
342
343 pr_debug("%s UUIDs at %s", rw & REQ_WRITE ? "wrote" : "read",
344 pkey(&c->uuid_bucket));
345
346 for (u = c->uuids; u < c->uuids + c->nr_uuids; u++)
347 if (!is_zero(u->uuid, 16))
348 pr_debug("Slot %zi: %pU: %s: 1st: %u last: %u inv: %u",
349 u - c->uuids, u->uuid, u->label,
350 u->first_reg, u->last_reg, u->invalidated);
351
352 closure_return(cl);
353}
354
355static char *uuid_read(struct cache_set *c, struct jset *j, struct closure *cl)
356{
357 struct bkey *k = &j->uuid_bucket;
358
359 if (__bch_ptr_invalid(c, 1, k))
360 return "bad uuid pointer";
361
362 bkey_copy(&c->uuid_bucket, k);
363 uuid_io(c, READ_SYNC, k, cl);
364
365 if (j->version < BCACHE_JSET_VERSION_UUIDv1) {
366 struct uuid_entry_v0 *u0 = (void *) c->uuids;
367 struct uuid_entry *u1 = (void *) c->uuids;
368 int i;
369
370 closure_sync(cl);
371
372 /*
373 * Since the new uuid entry is bigger than the old, we have to
374 * convert starting at the highest memory address and work down
375 * in order to do it in place
376 */
377
378 for (i = c->nr_uuids - 1;
379 i >= 0;
380 --i) {
381 memcpy(u1[i].uuid, u0[i].uuid, 16);
382 memcpy(u1[i].label, u0[i].label, 32);
383
384 u1[i].first_reg = u0[i].first_reg;
385 u1[i].last_reg = u0[i].last_reg;
386 u1[i].invalidated = u0[i].invalidated;
387
388 u1[i].flags = 0;
389 u1[i].sectors = 0;
390 }
391 }
392
393 return NULL;
394}
395
396static int __uuid_write(struct cache_set *c)
397{
398 BKEY_PADDED(key) k;
399 struct closure cl;
400 closure_init_stack(&cl);
401
402 lockdep_assert_held(&bch_register_lock);
403
404 if (bch_bucket_alloc_set(c, WATERMARK_METADATA, &k.key, 1, &cl))
405 return 1;
406
407 SET_KEY_SIZE(&k.key, c->sb.bucket_size);
408 uuid_io(c, REQ_WRITE, &k.key, &cl);
409 closure_sync(&cl);
410
411 bkey_copy(&c->uuid_bucket, &k.key);
412 __bkey_put(c, &k.key);
413 return 0;
414}
415
416int bch_uuid_write(struct cache_set *c)
417{
418 int ret = __uuid_write(c);
419
420 if (!ret)
421 bch_journal_meta(c, NULL);
422
423 return ret;
424}
425
426static struct uuid_entry *uuid_find(struct cache_set *c, const char *uuid)
427{
428 struct uuid_entry *u;
429
430 for (u = c->uuids;
431 u < c->uuids + c->nr_uuids; u++)
432 if (!memcmp(u->uuid, uuid, 16))
433 return u;
434
435 return NULL;
436}
437
/*
 * Return the first uuid slot whose uuid is all zero bytes, or NULL if there
 * is none.
 */
static struct uuid_entry *uuid_find_empty(struct cache_set *c)
{
	static const char zero_uuid[16] = { 0 };

	return uuid_find(c, zero_uuid);
}
443
/*
 * Bucket priorities/gens:
 *
 * For each bucket, we store on disk its
 *   8 bit gen
 *  16 bit priority
 *
 * See alloc.c for an explanation of the gen. The priority is used to implement
 * lru (and in the future other) cache replacement policies; for most purposes
 * it's just an opaque integer.
 *
 * The gens and the priorities don't have a whole lot to do with each other, and
 * it's actually the gens that must be written out at specific times - it's no
 * big deal if the priorities don't get written, if we lose them we just reuse
 * buckets in suboptimal order.
 *
 * On disk they're stored in a packed array, and in as many buckets as are
 * required to fit them all. The buckets we use to store them form a list; the
 * journal header points to the first bucket, the first bucket points to the
 * second bucket, et cetera.
 *
 * This code is used by the allocation code; periodically (whenever it runs out
 * of buckets to allocate from) the allocation code will invalidate some
 * buckets, but it can't use those buckets until their new gens are safely on
 * disk.
 */
470
471static void prio_endio(struct bio *bio, int error)
472{
473 struct cache *ca = bio->bi_private;
474
475 cache_set_err_on(error, ca->set, "accessing priorities");
476 bch_bbio_free(bio, ca->set);
477 closure_put(&ca->prio);
478}
479
480static void prio_io(struct cache *ca, uint64_t bucket, unsigned long rw)
481{
482 struct closure *cl = &ca->prio;
483 struct bio *bio = bch_bbio_alloc(ca->set);
484
485 closure_init_stack(cl);
486
487 bio->bi_sector = bucket * ca->sb.bucket_size;
488 bio->bi_bdev = ca->bdev;
489 bio->bi_rw = REQ_SYNC|REQ_META|rw;
490 bio->bi_size = bucket_bytes(ca);
491
492 bio->bi_end_io = prio_endio;
493 bio->bi_private = ca;
494 bio_map(bio, ca->disk_buckets);
495
496 closure_bio_submit(bio, &ca->prio, ca);
497 closure_sync(cl);
498}
499
/*
 * Expands to a format string followed by its three arguments: the number of
 * used entries in the free, free_inc and unused fifos of cache @c. Meant to
 * be pasted into a printf style argument list.
 *
 * @c is parenthesized so that an expression argument (e.g. "p + 1") expands
 * correctly.
 */
#define buckets_free(c)	"free %zu, free_inc %zu, unused %zu",		\
	fifo_used(&(c)->free), fifo_used(&(c)->free_inc),		\
	fifo_used(&(c)->unused)
502
503void bch_prio_write(struct cache *ca)
504{
505 int i;
506 struct bucket *b;
507 struct closure cl;
508
509 closure_init_stack(&cl);
510
511 lockdep_assert_held(&ca->set->bucket_lock);
512
513 for (b = ca->buckets;
514 b < ca->buckets + ca->sb.nbuckets; b++)
515 b->disk_gen = b->gen;
516
517 ca->disk_buckets->seq++;
518
519 atomic_long_add(ca->sb.bucket_size * prio_buckets(ca),
520 &ca->meta_sectors_written);
521
522 pr_debug("free %zu, free_inc %zu, unused %zu", fifo_used(&ca->free),
523 fifo_used(&ca->free_inc), fifo_used(&ca->unused));
524 blktrace_msg(ca, "Starting priorities: " buckets_free(ca));
525
526 for (i = prio_buckets(ca) - 1; i >= 0; --i) {
527 long bucket;
528 struct prio_set *p = ca->disk_buckets;
Kent Overstreetb1a67b02013-03-25 11:46:44 -0700529 struct bucket_disk *d = p->data;
530 struct bucket_disk *end = d + prios_per_bucket(ca);
Kent Overstreetcafe5632013-03-23 16:11:31 -0700531
532 for (b = ca->buckets + i * prios_per_bucket(ca);
533 b < ca->buckets + ca->sb.nbuckets && d < end;
534 b++, d++) {
535 d->prio = cpu_to_le16(b->prio);
536 d->gen = b->gen;
537 }
538
539 p->next_bucket = ca->prio_buckets[i + 1];
540 p->magic = pset_magic(ca);
541 p->csum = crc64(&p->magic, bucket_bytes(ca) - 8);
542
543 bucket = bch_bucket_alloc(ca, WATERMARK_PRIO, &cl);
544 BUG_ON(bucket == -1);
545
546 mutex_unlock(&ca->set->bucket_lock);
547 prio_io(ca, bucket, REQ_WRITE);
548 mutex_lock(&ca->set->bucket_lock);
549
550 ca->prio_buckets[i] = bucket;
551 atomic_dec_bug(&ca->buckets[bucket].pin);
552 }
553
554 mutex_unlock(&ca->set->bucket_lock);
555
556 bch_journal_meta(ca->set, &cl);
557 closure_sync(&cl);
558
559 mutex_lock(&ca->set->bucket_lock);
560
561 ca->need_save_prio = 0;
562
563 /*
564 * Don't want the old priorities to get garbage collected until after we
565 * finish writing the new ones, and they're journalled
566 */
567 for (i = 0; i < prio_buckets(ca); i++)
568 ca->prio_last_buckets[i] = ca->prio_buckets[i];
569}
570
571static void prio_read(struct cache *ca, uint64_t bucket)
572{
573 struct prio_set *p = ca->disk_buckets;
574 struct bucket_disk *d = p->data + prios_per_bucket(ca), *end = d;
575 struct bucket *b;
576 unsigned bucket_nr = 0;
577
578 for (b = ca->buckets;
579 b < ca->buckets + ca->sb.nbuckets;
580 b++, d++) {
581 if (d == end) {
582 ca->prio_buckets[bucket_nr] = bucket;
583 ca->prio_last_buckets[bucket_nr] = bucket;
584 bucket_nr++;
585
586 prio_io(ca, bucket, READ_SYNC);
587
588 if (p->csum != crc64(&p->magic, bucket_bytes(ca) - 8))
589 pr_warn("bad csum reading priorities");
590
591 if (p->magic != pset_magic(ca))
592 pr_warn("bad magic reading priorities");
593
594 bucket = p->next_bucket;
595 d = p->data;
596 }
597
598 b->prio = le16_to_cpu(d->prio);
599 b->gen = b->disk_gen = b->last_gc = b->gc_gen = d->gen;
600 }
601}
602
603/* Bcache device */
604
605static int open_dev(struct block_device *b, fmode_t mode)
606{
607 struct bcache_device *d = b->bd_disk->private_data;
608 if (atomic_read(&d->closing))
609 return -ENXIO;
610
611 closure_get(&d->cl);
612 return 0;
613}
614
615static int release_dev(struct gendisk *b, fmode_t mode)
616{
617 struct bcache_device *d = b->private_data;
618 closure_put(&d->cl);
619 return 0;
620}
621
622static int ioctl_dev(struct block_device *b, fmode_t mode,
623 unsigned int cmd, unsigned long arg)
624{
625 struct bcache_device *d = b->bd_disk->private_data;
626 return d->ioctl(d, mode, cmd, arg);
627}
628
629static const struct block_device_operations bcache_ops = {
630 .open = open_dev,
631 .release = release_dev,
632 .ioctl = ioctl_dev,
633 .owner = THIS_MODULE,
634};
635
636void bcache_device_stop(struct bcache_device *d)
637{
638 if (!atomic_xchg(&d->closing, 1))
639 closure_queue(&d->cl);
640}
641
642static void bcache_device_detach(struct bcache_device *d)
643{
644 lockdep_assert_held(&bch_register_lock);
645
646 if (atomic_read(&d->detaching)) {
647 struct uuid_entry *u = d->c->uuids + d->id;
648
649 SET_UUID_FLASH_ONLY(u, 0);
650 memcpy(u->uuid, invalid_uuid, 16);
651 u->invalidated = cpu_to_le32(get_seconds());
652 bch_uuid_write(d->c);
653
654 atomic_set(&d->detaching, 0);
655 }
656
657 d->c->devices[d->id] = NULL;
658 closure_put(&d->c->caching);
659 d->c = NULL;
660}
661
662static void bcache_device_attach(struct bcache_device *d, struct cache_set *c,
663 unsigned id)
664{
665 BUG_ON(test_bit(CACHE_SET_STOPPING, &c->flags));
666
667 d->id = id;
668 d->c = c;
669 c->devices[id] = d;
670
671 closure_get(&c->caching);
672}
673
674static void bcache_device_link(struct bcache_device *d, struct cache_set *c,
675 const char *name)
676{
677 snprintf(d->name, BCACHEDEVNAME_SIZE,
678 "%s%u", name, d->id);
679
680 WARN(sysfs_create_link(&d->kobj, &c->kobj, "cache") ||
681 sysfs_create_link(&c->kobj, &d->kobj, d->name),
682 "Couldn't create device <-> cache set symlinks");
683}
684
685static void bcache_device_free(struct bcache_device *d)
686{
687 lockdep_assert_held(&bch_register_lock);
688
689 pr_info("%s stopped", d->disk->disk_name);
690
691 if (d->c)
692 bcache_device_detach(d);
693
694 if (d->disk)
695 del_gendisk(d->disk);
696 if (d->disk && d->disk->queue)
697 blk_cleanup_queue(d->disk->queue);
698 if (d->disk)
699 put_disk(d->disk);
700
701 bio_split_pool_free(&d->bio_split_hook);
702 if (d->unaligned_bvec)
703 mempool_destroy(d->unaligned_bvec);
704 if (d->bio_split)
705 bioset_free(d->bio_split);
706
707 closure_debug_destroy(&d->cl);
708}
709
710static int bcache_device_init(struct bcache_device *d, unsigned block_size)
711{
712 struct request_queue *q;
713
714 if (!(d->bio_split = bioset_create(4, offsetof(struct bbio, bio))) ||
715 !(d->unaligned_bvec = mempool_create_kmalloc_pool(1,
716 sizeof(struct bio_vec) * BIO_MAX_PAGES)) ||
717 bio_split_pool_init(&d->bio_split_hook))
718
719 return -ENOMEM;
720
721 d->disk = alloc_disk(1);
722 if (!d->disk)
723 return -ENOMEM;
724
725 snprintf(d->disk->disk_name, DISK_NAME_LEN, "bcache%i", bcache_minor);
726
727 d->disk->major = bcache_major;
728 d->disk->first_minor = bcache_minor++;
729 d->disk->fops = &bcache_ops;
730 d->disk->private_data = d;
731
732 q = blk_alloc_queue(GFP_KERNEL);
733 if (!q)
734 return -ENOMEM;
735
736 blk_queue_make_request(q, NULL);
737 d->disk->queue = q;
738 q->queuedata = d;
739 q->backing_dev_info.congested_data = d;
740 q->limits.max_hw_sectors = UINT_MAX;
741 q->limits.max_sectors = UINT_MAX;
742 q->limits.max_segment_size = UINT_MAX;
743 q->limits.max_segments = BIO_MAX_PAGES;
744 q->limits.max_discard_sectors = UINT_MAX;
745 q->limits.io_min = block_size;
746 q->limits.logical_block_size = block_size;
747 q->limits.physical_block_size = block_size;
748 set_bit(QUEUE_FLAG_NONROT, &d->disk->queue->queue_flags);
749 set_bit(QUEUE_FLAG_DISCARD, &d->disk->queue->queue_flags);
750
751 return 0;
752}
753
754/* Cached device */
755
756static void calc_cached_dev_sectors(struct cache_set *c)
757{
758 uint64_t sectors = 0;
759 struct cached_dev *dc;
760
761 list_for_each_entry(dc, &c->cached_devs, list)
762 sectors += bdev_sectors(dc->bdev);
763
764 c->cached_dev_sectors = sectors;
765}
766
767void bch_cached_dev_run(struct cached_dev *dc)
768{
769 struct bcache_device *d = &dc->disk;
770
771 if (atomic_xchg(&dc->running, 1))
772 return;
773
774 if (!d->c &&
775 BDEV_STATE(&dc->sb) != BDEV_STATE_NONE) {
776 struct closure cl;
777 closure_init_stack(&cl);
778
779 SET_BDEV_STATE(&dc->sb, BDEV_STATE_STALE);
780 bch_write_bdev_super(dc, &cl);
781 closure_sync(&cl);
782 }
783
784 add_disk(d->disk);
785#if 0
786 char *env[] = { "SYMLINK=label" , NULL };
787 kobject_uevent_env(&disk_to_dev(d->disk)->kobj, KOBJ_CHANGE, env);
788#endif
789 if (sysfs_create_link(&d->kobj, &disk_to_dev(d->disk)->kobj, "dev") ||
790 sysfs_create_link(&disk_to_dev(d->disk)->kobj, &d->kobj, "bcache"))
791 pr_debug("error creating sysfs link");
792}
793
794static void cached_dev_detach_finish(struct work_struct *w)
795{
796 struct cached_dev *dc = container_of(w, struct cached_dev, detach);
797 char buf[BDEVNAME_SIZE];
798 struct closure cl;
799 closure_init_stack(&cl);
800
801 BUG_ON(!atomic_read(&dc->disk.detaching));
802 BUG_ON(atomic_read(&dc->count));
803
804 sysfs_remove_link(&dc->disk.c->kobj, dc->disk.name);
805 sysfs_remove_link(&dc->disk.kobj, "cache");
806
807 mutex_lock(&bch_register_lock);
808
809 memset(&dc->sb.set_uuid, 0, 16);
810 SET_BDEV_STATE(&dc->sb, BDEV_STATE_NONE);
811
812 bch_write_bdev_super(dc, &cl);
813 closure_sync(&cl);
814
815 bcache_device_detach(&dc->disk);
816 list_move(&dc->list, &uncached_devices);
817
818 mutex_unlock(&bch_register_lock);
819
820 pr_info("Caching disabled for %s", bdevname(dc->bdev, buf));
821
822 /* Drop ref we took in cached_dev_detach() */
823 closure_put(&dc->disk.cl);
824}
825
826void bch_cached_dev_detach(struct cached_dev *dc)
827{
828 lockdep_assert_held(&bch_register_lock);
829
830 if (atomic_read(&dc->disk.closing))
831 return;
832
833 if (atomic_xchg(&dc->disk.detaching, 1))
834 return;
835
836 /*
837 * Block the device from being closed and freed until we're finished
838 * detaching
839 */
840 closure_get(&dc->disk.cl);
841
842 bch_writeback_queue(dc);
843 cached_dev_put(dc);
844}
845
846int bch_cached_dev_attach(struct cached_dev *dc, struct cache_set *c)
847{
848 uint32_t rtime = cpu_to_le32(get_seconds());
849 struct uuid_entry *u;
850 char buf[BDEVNAME_SIZE];
851
852 bdevname(dc->bdev, buf);
853
854 if (memcmp(dc->sb.set_uuid, c->sb.set_uuid, 16))
855 return -ENOENT;
856
857 if (dc->disk.c) {
858 pr_err("Can't attach %s: already attached", buf);
859 return -EINVAL;
860 }
861
862 if (test_bit(CACHE_SET_STOPPING, &c->flags)) {
863 pr_err("Can't attach %s: shutting down", buf);
864 return -EINVAL;
865 }
866
867 if (dc->sb.block_size < c->sb.block_size) {
868 /* Will die */
Kent Overstreetb1a67b02013-03-25 11:46:44 -0700869 pr_err("Couldn't attach %s: block size less than set's block size",
870 buf);
Kent Overstreetcafe5632013-03-23 16:11:31 -0700871 return -EINVAL;
872 }
873
874 u = uuid_find(c, dc->sb.uuid);
875
876 if (u &&
877 (BDEV_STATE(&dc->sb) == BDEV_STATE_STALE ||
878 BDEV_STATE(&dc->sb) == BDEV_STATE_NONE)) {
879 memcpy(u->uuid, invalid_uuid, 16);
880 u->invalidated = cpu_to_le32(get_seconds());
881 u = NULL;
882 }
883
884 if (!u) {
885 if (BDEV_STATE(&dc->sb) == BDEV_STATE_DIRTY) {
886 pr_err("Couldn't find uuid for %s in set", buf);
887 return -ENOENT;
888 }
889
890 u = uuid_find_empty(c);
891 if (!u) {
892 pr_err("Not caching %s, no room for UUID", buf);
893 return -EINVAL;
894 }
895 }
896
897 /* Deadlocks since we're called via sysfs...
898 sysfs_remove_file(&dc->kobj, &sysfs_attach);
899 */
900
901 if (is_zero(u->uuid, 16)) {
902 struct closure cl;
903 closure_init_stack(&cl);
904
905 memcpy(u->uuid, dc->sb.uuid, 16);
906 memcpy(u->label, dc->sb.label, SB_LABEL_SIZE);
907 u->first_reg = u->last_reg = rtime;
908 bch_uuid_write(c);
909
910 memcpy(dc->sb.set_uuid, c->sb.set_uuid, 16);
911 SET_BDEV_STATE(&dc->sb, BDEV_STATE_CLEAN);
912
913 bch_write_bdev_super(dc, &cl);
914 closure_sync(&cl);
915 } else {
916 u->last_reg = rtime;
917 bch_uuid_write(c);
918 }
919
920 bcache_device_attach(&dc->disk, c, u - c->uuids);
921 bcache_device_link(&dc->disk, c, "bdev");
922 list_move(&dc->list, &c->cached_devs);
923 calc_cached_dev_sectors(c);
924
925 smp_wmb();
926 /*
927 * dc->c must be set before dc->count != 0 - paired with the mb in
928 * cached_dev_get()
929 */
930 atomic_set(&dc->count, 1);
931
932 if (BDEV_STATE(&dc->sb) == BDEV_STATE_DIRTY) {
933 atomic_set(&dc->has_dirty, 1);
934 atomic_inc(&dc->count);
935 bch_writeback_queue(dc);
936 }
937
938 bch_cached_dev_run(dc);
939
940 pr_info("Caching %s as %s on set %pU",
941 bdevname(dc->bdev, buf), dc->disk.disk->disk_name,
942 dc->disk.c->sb.set_uuid);
943 return 0;
944}
945
946void bch_cached_dev_release(struct kobject *kobj)
947{
948 struct cached_dev *dc = container_of(kobj, struct cached_dev,
949 disk.kobj);
950 kfree(dc);
951 module_put(THIS_MODULE);
952}
953
954static void cached_dev_free(struct closure *cl)
955{
956 struct cached_dev *dc = container_of(cl, struct cached_dev, disk.cl);
957
958 cancel_delayed_work_sync(&dc->writeback_rate_update);
959
960 mutex_lock(&bch_register_lock);
961
962 bcache_device_free(&dc->disk);
963 list_del(&dc->list);
964
965 mutex_unlock(&bch_register_lock);
966
967 if (!IS_ERR_OR_NULL(dc->bdev)) {
968 blk_sync_queue(bdev_get_queue(dc->bdev));
969 blkdev_put(dc->bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
970 }
971
972 wake_up(&unregister_wait);
973
974 kobject_put(&dc->disk.kobj);
975}
976
977static void cached_dev_flush(struct closure *cl)
978{
979 struct cached_dev *dc = container_of(cl, struct cached_dev, disk.cl);
980 struct bcache_device *d = &dc->disk;
981
982 bch_cache_accounting_destroy(&dc->accounting);
983 kobject_del(&d->kobj);
984
985 continue_at(cl, cached_dev_free, system_wq);
986}
987
988static int cached_dev_init(struct cached_dev *dc, unsigned block_size)
989{
990 int err;
991 struct io *io;
992
993 closure_init(&dc->disk.cl, NULL);
994 set_closure_fn(&dc->disk.cl, cached_dev_flush, system_wq);
995
996 __module_get(THIS_MODULE);
997 INIT_LIST_HEAD(&dc->list);
998 kobject_init(&dc->disk.kobj, &bch_cached_dev_ktype);
999
1000 bch_cache_accounting_init(&dc->accounting, &dc->disk.cl);
1001
1002 err = bcache_device_init(&dc->disk, block_size);
1003 if (err)
1004 goto err;
1005
1006 spin_lock_init(&dc->io_lock);
1007 closure_init_unlocked(&dc->sb_write);
1008 INIT_WORK(&dc->detach, cached_dev_detach_finish);
1009
1010 dc->sequential_merge = true;
1011 dc->sequential_cutoff = 4 << 20;
1012
1013 INIT_LIST_HEAD(&dc->io_lru);
1014 dc->sb_bio.bi_max_vecs = 1;
1015 dc->sb_bio.bi_io_vec = dc->sb_bio.bi_inline_vecs;
1016
1017 for (io = dc->io; io < dc->io + RECENT_IO; io++) {
1018 list_add(&io->lru, &dc->io_lru);
1019 hlist_add_head(&io->hash, dc->io_hash + RECENT_IO);
1020 }
1021
1022 bch_writeback_init_cached_dev(dc);
1023 return 0;
1024err:
1025 bcache_device_stop(&dc->disk);
1026 return err;
1027}
1028
1029/* Cached device - bcache superblock */
1030
1031static const char *register_bdev(struct cache_sb *sb, struct page *sb_page,
1032 struct block_device *bdev,
1033 struct cached_dev *dc)
1034{
1035 char name[BDEVNAME_SIZE];
1036 const char *err = "cannot allocate memory";
1037 struct gendisk *g;
1038 struct cache_set *c;
1039
1040 if (!dc || cached_dev_init(dc, sb->block_size << 9) != 0)
1041 return err;
1042
1043 memcpy(&dc->sb, sb, sizeof(struct cache_sb));
1044 dc->sb_bio.bi_io_vec[0].bv_page = sb_page;
1045 dc->bdev = bdev;
1046 dc->bdev->bd_holder = dc;
1047
1048 g = dc->disk.disk;
1049
1050 set_capacity(g, dc->bdev->bd_part->nr_sects - 16);
1051
1052 bch_cached_dev_request_init(dc);
1053
1054 err = "error creating kobject";
1055 if (kobject_add(&dc->disk.kobj, &part_to_dev(bdev->bd_part)->kobj,
1056 "bcache"))
1057 goto err;
1058 if (bch_cache_accounting_add_kobjs(&dc->accounting, &dc->disk.kobj))
1059 goto err;
1060
1061 list_add(&dc->list, &uncached_devices);
1062 list_for_each_entry(c, &bch_cache_sets, list)
1063 bch_cached_dev_attach(dc, c);
1064
1065 if (BDEV_STATE(&dc->sb) == BDEV_STATE_NONE ||
1066 BDEV_STATE(&dc->sb) == BDEV_STATE_STALE)
1067 bch_cached_dev_run(dc);
1068
1069 return NULL;
1070err:
1071 kobject_put(&dc->disk.kobj);
1072 pr_notice("error opening %s: %s", bdevname(bdev, name), err);
1073 /*
1074 * Return NULL instead of an error because kobject_put() cleans
1075 * everything up
1076 */
1077 return NULL;
1078}
1079
1080/* Flash only volumes */
1081
1082void bch_flash_dev_release(struct kobject *kobj)
1083{
1084 struct bcache_device *d = container_of(kobj, struct bcache_device,
1085 kobj);
1086 kfree(d);
1087}
1088
1089static void flash_dev_free(struct closure *cl)
1090{
1091 struct bcache_device *d = container_of(cl, struct bcache_device, cl);
1092 bcache_device_free(d);
1093 kobject_put(&d->kobj);
1094}
1095
1096static void flash_dev_flush(struct closure *cl)
1097{
1098 struct bcache_device *d = container_of(cl, struct bcache_device, cl);
1099
1100 sysfs_remove_link(&d->c->kobj, d->name);
1101 sysfs_remove_link(&d->kobj, "cache");
1102 kobject_del(&d->kobj);
1103 continue_at(cl, flash_dev_free, system_wq);
1104}
1105
1106static int flash_dev_run(struct cache_set *c, struct uuid_entry *u)
1107{
1108 struct bcache_device *d = kzalloc(sizeof(struct bcache_device),
1109 GFP_KERNEL);
1110 if (!d)
1111 return -ENOMEM;
1112
1113 closure_init(&d->cl, NULL);
1114 set_closure_fn(&d->cl, flash_dev_flush, system_wq);
1115
1116 kobject_init(&d->kobj, &bch_flash_dev_ktype);
1117
1118 if (bcache_device_init(d, block_bytes(c)))
1119 goto err;
1120
1121 bcache_device_attach(d, c, u - c->uuids);
1122 set_capacity(d->disk, u->sectors);
1123 bch_flash_dev_request_init(d);
1124 add_disk(d->disk);
1125
1126 if (kobject_add(&d->kobj, &disk_to_dev(d->disk)->kobj, "bcache"))
1127 goto err;
1128
1129 bcache_device_link(d, c, "volume");
1130
1131 return 0;
1132err:
1133 kobject_put(&d->kobj);
1134 return -ENOMEM;
1135}
1136
1137static int flash_devs_run(struct cache_set *c)
1138{
1139 int ret = 0;
1140 struct uuid_entry *u;
1141
1142 for (u = c->uuids;
1143 u < c->uuids + c->nr_uuids && !ret;
1144 u++)
1145 if (UUID_FLASH_ONLY(u))
1146 ret = flash_dev_run(c, u);
1147
1148 return ret;
1149}
1150
1151int bch_flash_dev_create(struct cache_set *c, uint64_t size)
1152{
1153 struct uuid_entry *u;
1154
1155 if (test_bit(CACHE_SET_STOPPING, &c->flags))
1156 return -EINTR;
1157
1158 u = uuid_find_empty(c);
1159 if (!u) {
1160 pr_err("Can't create volume, no room for UUID");
1161 return -EINVAL;
1162 }
1163
1164 get_random_bytes(u->uuid, 16);
1165 memset(u->label, 0, 32);
1166 u->first_reg = u->last_reg = cpu_to_le32(get_seconds());
1167
1168 SET_UUID_FLASH_ONLY(u, 1);
1169 u->sectors = size >> 9;
1170
1171 bch_uuid_write(c);
1172
1173 return flash_dev_run(c, u);
1174}
1175
1176/* Cache set */
1177
1178__printf(2, 3)
1179bool bch_cache_set_error(struct cache_set *c, const char *fmt, ...)
1180{
1181 va_list args;
1182
1183 if (test_bit(CACHE_SET_STOPPING, &c->flags))
1184 return false;
1185
1186 /* XXX: we can be called from atomic context
1187 acquire_console_sem();
1188 */
1189
1190 printk(KERN_ERR "bcache: error on %pU: ", c->sb.set_uuid);
1191
1192 va_start(args, fmt);
1193 vprintk(fmt, args);
1194 va_end(args);
1195
1196 printk(", disabling caching\n");
1197
1198 bch_cache_set_unregister(c);
1199 return true;
1200}
1201
1202void bch_cache_set_release(struct kobject *kobj)
1203{
1204 struct cache_set *c = container_of(kobj, struct cache_set, kobj);
1205 kfree(c);
1206 module_put(THIS_MODULE);
1207}
1208
1209static void cache_set_free(struct closure *cl)
1210{
1211 struct cache_set *c = container_of(cl, struct cache_set, cl);
1212 struct cache *ca;
1213 unsigned i;
1214
1215 if (!IS_ERR_OR_NULL(c->debug))
1216 debugfs_remove(c->debug);
1217
1218 bch_open_buckets_free(c);
1219 bch_btree_cache_free(c);
1220 bch_journal_free(c);
1221
1222 for_each_cache(ca, c, i)
1223 if (ca)
1224 kobject_put(&ca->kobj);
1225
1226 free_pages((unsigned long) c->uuids, ilog2(bucket_pages(c)));
1227 free_pages((unsigned long) c->sort, ilog2(bucket_pages(c)));
1228
1229 kfree(c->fill_iter);
1230 if (c->bio_split)
1231 bioset_free(c->bio_split);
1232 if (c->bio_meta)
1233 mempool_destroy(c->bio_meta);
1234 if (c->search)
1235 mempool_destroy(c->search);
1236 kfree(c->devices);
1237
1238 mutex_lock(&bch_register_lock);
1239 list_del(&c->list);
1240 mutex_unlock(&bch_register_lock);
1241
1242 pr_info("Cache set %pU unregistered", c->sb.set_uuid);
1243 wake_up(&unregister_wait);
1244
1245 closure_debug_destroy(&c->cl);
1246 kobject_put(&c->kobj);
1247}
1248
1249static void cache_set_flush(struct closure *cl)
1250{
1251 struct cache_set *c = container_of(cl, struct cache_set, caching);
1252 struct btree *b;
1253
1254 /* Shut down allocator threads */
1255 set_bit(CACHE_SET_STOPPING_2, &c->flags);
1256 wake_up(&c->alloc_wait);
1257
1258 bch_cache_accounting_destroy(&c->accounting);
1259
1260 kobject_put(&c->internal);
1261 kobject_del(&c->kobj);
1262
1263 if (!IS_ERR_OR_NULL(c->root))
1264 list_add(&c->root->list, &c->btree_cache);
1265
1266 /* Should skip this if we're unregistering because of an error */
1267 list_for_each_entry(b, &c->btree_cache, list)
1268 if (btree_node_dirty(b))
1269 bch_btree_write(b, true, NULL);
1270
1271 closure_return(cl);
1272}
1273
1274static void __cache_set_unregister(struct closure *cl)
1275{
1276 struct cache_set *c = container_of(cl, struct cache_set, caching);
1277 struct cached_dev *dc, *t;
1278 size_t i;
1279
1280 mutex_lock(&bch_register_lock);
1281
1282 if (test_bit(CACHE_SET_UNREGISTERING, &c->flags))
1283 list_for_each_entry_safe(dc, t, &c->cached_devs, list)
1284 bch_cached_dev_detach(dc);
1285
1286 for (i = 0; i < c->nr_uuids; i++)
1287 if (c->devices[i] && UUID_FLASH_ONLY(&c->uuids[i]))
1288 bcache_device_stop(c->devices[i]);
1289
1290 mutex_unlock(&bch_register_lock);
1291
1292 continue_at(cl, cache_set_flush, system_wq);
1293}
1294
1295void bch_cache_set_stop(struct cache_set *c)
1296{
1297 if (!test_and_set_bit(CACHE_SET_STOPPING, &c->flags))
1298 closure_queue(&c->caching);
1299}
1300
1301void bch_cache_set_unregister(struct cache_set *c)
1302{
1303 set_bit(CACHE_SET_UNREGISTERING, &c->flags);
1304 bch_cache_set_stop(c);
1305}
1306
/*
 * Allocate one bucket's worth of zeroed pages for @c (anything
 * bucket_pages() accepts), using allocation flags @gfp.  Evaluates to a
 * void pointer, NULL on failure.  Pair with free_pages() using the same
 * order, ilog2(bucket_pages(c)).
 */
#define alloc_bucket_pages(gfp, c)				\
	((void *) __get_free_pages(__GFP_ZERO | (gfp),		\
				   ilog2(bucket_pages(c))))
1309
1310struct cache_set *bch_cache_set_alloc(struct cache_sb *sb)
1311{
1312 int iter_size;
1313 struct cache_set *c = kzalloc(sizeof(struct cache_set), GFP_KERNEL);
1314 if (!c)
1315 return NULL;
1316
1317 __module_get(THIS_MODULE);
1318 closure_init(&c->cl, NULL);
1319 set_closure_fn(&c->cl, cache_set_free, system_wq);
1320
1321 closure_init(&c->caching, &c->cl);
1322 set_closure_fn(&c->caching, __cache_set_unregister, system_wq);
1323
1324 /* Maybe create continue_at_noreturn() and use it here? */
1325 closure_set_stopped(&c->cl);
1326 closure_put(&c->cl);
1327
1328 kobject_init(&c->kobj, &bch_cache_set_ktype);
1329 kobject_init(&c->internal, &bch_cache_set_internal_ktype);
1330
1331 bch_cache_accounting_init(&c->accounting, &c->cl);
1332
1333 memcpy(c->sb.set_uuid, sb->set_uuid, 16);
1334 c->sb.block_size = sb->block_size;
1335 c->sb.bucket_size = sb->bucket_size;
1336 c->sb.nr_in_set = sb->nr_in_set;
1337 c->sb.last_mount = sb->last_mount;
1338 c->bucket_bits = ilog2(sb->bucket_size);
1339 c->block_bits = ilog2(sb->block_size);
1340 c->nr_uuids = bucket_bytes(c) / sizeof(struct uuid_entry);
1341
1342 c->btree_pages = c->sb.bucket_size / PAGE_SECTORS;
1343 if (c->btree_pages > BTREE_MAX_PAGES)
1344 c->btree_pages = max_t(int, c->btree_pages / 4,
1345 BTREE_MAX_PAGES);
1346
1347 init_waitqueue_head(&c->alloc_wait);
1348 mutex_init(&c->bucket_lock);
1349 mutex_init(&c->fill_lock);
1350 mutex_init(&c->sort_lock);
1351 spin_lock_init(&c->sort_time_lock);
1352 closure_init_unlocked(&c->sb_write);
1353 closure_init_unlocked(&c->uuid_write);
1354 spin_lock_init(&c->btree_read_time_lock);
1355 bch_moving_init_cache_set(c);
1356
1357 INIT_LIST_HEAD(&c->list);
1358 INIT_LIST_HEAD(&c->cached_devs);
1359 INIT_LIST_HEAD(&c->btree_cache);
1360 INIT_LIST_HEAD(&c->btree_cache_freeable);
1361 INIT_LIST_HEAD(&c->btree_cache_freed);
1362 INIT_LIST_HEAD(&c->data_buckets);
1363
1364 c->search = mempool_create_slab_pool(32, bch_search_cache);
1365 if (!c->search)
1366 goto err;
1367
1368 iter_size = (sb->bucket_size / sb->block_size + 1) *
1369 sizeof(struct btree_iter_set);
1370
1371 if (!(c->devices = kzalloc(c->nr_uuids * sizeof(void *), GFP_KERNEL)) ||
1372 !(c->bio_meta = mempool_create_kmalloc_pool(2,
1373 sizeof(struct bbio) + sizeof(struct bio_vec) *
1374 bucket_pages(c))) ||
1375 !(c->bio_split = bioset_create(4, offsetof(struct bbio, bio))) ||
1376 !(c->fill_iter = kmalloc(iter_size, GFP_KERNEL)) ||
1377 !(c->sort = alloc_bucket_pages(GFP_KERNEL, c)) ||
1378 !(c->uuids = alloc_bucket_pages(GFP_KERNEL, c)) ||
1379 bch_journal_alloc(c) ||
1380 bch_btree_cache_alloc(c) ||
1381 bch_open_buckets_alloc(c))
1382 goto err;
1383
1384 c->fill_iter->size = sb->bucket_size / sb->block_size;
1385
1386 c->congested_read_threshold_us = 2000;
1387 c->congested_write_threshold_us = 20000;
1388 c->error_limit = 8 << IO_ERROR_SHIFT;
1389
1390 return c;
1391err:
1392 bch_cache_set_unregister(c);
1393 return NULL;
1394}
1395
1396static void run_cache_set(struct cache_set *c)
1397{
1398 const char *err = "cannot allocate memory";
1399 struct cached_dev *dc, *t;
1400 struct cache *ca;
1401 unsigned i;
1402
1403 struct btree_op op;
1404 bch_btree_op_init_stack(&op);
1405 op.lock = SHRT_MAX;
1406
1407 for_each_cache(ca, c, i)
1408 c->nbuckets += ca->sb.nbuckets;
1409
1410 if (CACHE_SYNC(&c->sb)) {
1411 LIST_HEAD(journal);
1412 struct bkey *k;
1413 struct jset *j;
1414
1415 err = "cannot allocate memory for journal";
1416 if (bch_journal_read(c, &journal, &op))
1417 goto err;
1418
1419 pr_debug("btree_journal_read() done");
1420
1421 err = "no journal entries found";
1422 if (list_empty(&journal))
1423 goto err;
1424
1425 j = &list_entry(journal.prev, struct journal_replay, list)->j;
1426
1427 err = "IO error reading priorities";
1428 for_each_cache(ca, c, i)
1429 prio_read(ca, j->prio_bucket[ca->sb.nr_this_dev]);
1430
1431 /*
1432 * If prio_read() fails it'll call cache_set_error and we'll
1433 * tear everything down right away, but if we perhaps checked
1434 * sooner we could avoid journal replay.
1435 */
1436
1437 k = &j->btree_root;
1438
1439 err = "bad btree root";
1440 if (__bch_ptr_invalid(c, j->btree_level + 1, k))
1441 goto err;
1442
1443 err = "error reading btree root";
1444 c->root = bch_btree_node_get(c, k, j->btree_level, &op);
1445 if (IS_ERR_OR_NULL(c->root))
1446 goto err;
1447
1448 list_del_init(&c->root->list);
1449 rw_unlock(true, c->root);
1450
1451 err = uuid_read(c, j, &op.cl);
1452 if (err)
1453 goto err;
1454
1455 err = "error in recovery";
1456 if (bch_btree_check(c, &op))
1457 goto err;
1458
1459 bch_journal_mark(c, &journal);
1460 bch_btree_gc_finish(c);
1461 pr_debug("btree_check() done");
1462
1463 /*
1464 * bcache_journal_next() can't happen sooner, or
1465 * btree_gc_finish() will give spurious errors about last_gc >
1466 * gc_gen - this is a hack but oh well.
1467 */
1468 bch_journal_next(&c->journal);
1469
1470 for_each_cache(ca, c, i)
1471 closure_call(&ca->alloc, bch_allocator_thread,
1472 system_wq, &c->cl);
1473
1474 /*
1475 * First place it's safe to allocate: btree_check() and
1476 * btree_gc_finish() have to run before we have buckets to
1477 * allocate, and bch_bucket_alloc_set() might cause a journal
1478 * entry to be written so bcache_journal_next() has to be called
1479 * first.
1480 *
1481 * If the uuids were in the old format we have to rewrite them
1482 * before the next journal entry is written:
1483 */
1484 if (j->version < BCACHE_JSET_VERSION_UUID)
1485 __uuid_write(c);
1486
1487 bch_journal_replay(c, &journal, &op);
1488 } else {
1489 pr_notice("invalidating existing data");
1490 /* Don't want invalidate_buckets() to queue a gc yet */
1491 closure_lock(&c->gc, NULL);
1492
1493 for_each_cache(ca, c, i) {
1494 unsigned j;
1495
1496 ca->sb.keys = clamp_t(int, ca->sb.nbuckets >> 7,
1497 2, SB_JOURNAL_BUCKETS);
1498
1499 for (j = 0; j < ca->sb.keys; j++)
1500 ca->sb.d[j] = ca->sb.first_bucket + j;
1501 }
1502
1503 bch_btree_gc_finish(c);
1504
1505 for_each_cache(ca, c, i)
1506 closure_call(&ca->alloc, bch_allocator_thread,
1507 ca->alloc_workqueue, &c->cl);
1508
1509 mutex_lock(&c->bucket_lock);
1510 for_each_cache(ca, c, i)
1511 bch_prio_write(ca);
1512 mutex_unlock(&c->bucket_lock);
1513
1514 wake_up(&c->alloc_wait);
1515
1516 err = "cannot allocate new UUID bucket";
1517 if (__uuid_write(c))
1518 goto err_unlock_gc;
1519
1520 err = "cannot allocate new btree root";
1521 c->root = bch_btree_node_alloc(c, 0, &op.cl);
1522 if (IS_ERR_OR_NULL(c->root))
1523 goto err_unlock_gc;
1524
1525 bkey_copy_key(&c->root->key, &MAX_KEY);
1526 bch_btree_write(c->root, true, &op);
1527
1528 bch_btree_set_root(c->root);
1529 rw_unlock(true, c->root);
1530
1531 /*
1532 * We don't want to write the first journal entry until
1533 * everything is set up - fortunately journal entries won't be
1534 * written until the SET_CACHE_SYNC() here:
1535 */
1536 SET_CACHE_SYNC(&c->sb, true);
1537
1538 bch_journal_next(&c->journal);
1539 bch_journal_meta(c, &op.cl);
1540
1541 /* Unlock */
1542 closure_set_stopped(&c->gc.cl);
1543 closure_put(&c->gc.cl);
1544 }
1545
1546 closure_sync(&op.cl);
1547 c->sb.last_mount = get_seconds();
1548 bcache_write_super(c);
1549
1550 list_for_each_entry_safe(dc, t, &uncached_devices, list)
1551 bch_cached_dev_attach(dc, c);
1552
1553 flash_devs_run(c);
1554
1555 return;
1556err_unlock_gc:
1557 closure_set_stopped(&c->gc.cl);
1558 closure_put(&c->gc.cl);
1559err:
1560 closure_sync(&op.cl);
1561 /* XXX: test this, it's broken */
1562 bch_cache_set_error(c, err);
1563}
1564
1565static bool can_attach_cache(struct cache *ca, struct cache_set *c)
1566{
1567 return ca->sb.block_size == c->sb.block_size &&
1568 ca->sb.bucket_size == c->sb.block_size &&
1569 ca->sb.nr_in_set == c->sb.nr_in_set;
1570}
1571
1572static const char *register_cache_set(struct cache *ca)
1573{
1574 char buf[12];
1575 const char *err = "cannot allocate memory";
1576 struct cache_set *c;
1577
1578 list_for_each_entry(c, &bch_cache_sets, list)
1579 if (!memcmp(c->sb.set_uuid, ca->sb.set_uuid, 16)) {
1580 if (c->cache[ca->sb.nr_this_dev])
1581 return "duplicate cache set member";
1582
1583 if (!can_attach_cache(ca, c))
1584 return "cache sb does not match set";
1585
1586 if (!CACHE_SYNC(&ca->sb))
1587 SET_CACHE_SYNC(&c->sb, false);
1588
1589 goto found;
1590 }
1591
1592 c = bch_cache_set_alloc(&ca->sb);
1593 if (!c)
1594 return err;
1595
1596 err = "error creating kobject";
1597 if (kobject_add(&c->kobj, bcache_kobj, "%pU", c->sb.set_uuid) ||
1598 kobject_add(&c->internal, &c->kobj, "internal"))
1599 goto err;
1600
1601 if (bch_cache_accounting_add_kobjs(&c->accounting, &c->kobj))
1602 goto err;
1603
1604 bch_debug_init_cache_set(c);
1605
1606 list_add(&c->list, &bch_cache_sets);
1607found:
1608 sprintf(buf, "cache%i", ca->sb.nr_this_dev);
1609 if (sysfs_create_link(&ca->kobj, &c->kobj, "set") ||
1610 sysfs_create_link(&c->kobj, &ca->kobj, buf))
1611 goto err;
1612
1613 if (ca->sb.seq > c->sb.seq) {
1614 c->sb.version = ca->sb.version;
1615 memcpy(c->sb.set_uuid, ca->sb.set_uuid, 16);
1616 c->sb.flags = ca->sb.flags;
1617 c->sb.seq = ca->sb.seq;
1618 pr_debug("set version = %llu", c->sb.version);
1619 }
1620
1621 ca->set = c;
1622 ca->set->cache[ca->sb.nr_this_dev] = ca;
1623 c->cache_by_alloc[c->caches_loaded++] = ca;
1624
1625 if (c->caches_loaded == c->sb.nr_in_set)
1626 run_cache_set(c);
1627
1628 return NULL;
1629err:
1630 bch_cache_set_unregister(c);
1631 return err;
1632}
1633
1634/* Cache device */
1635
1636void bch_cache_release(struct kobject *kobj)
1637{
1638 struct cache *ca = container_of(kobj, struct cache, kobj);
1639
1640 if (ca->set)
1641 ca->set->cache[ca->sb.nr_this_dev] = NULL;
1642
1643 bch_cache_allocator_exit(ca);
1644
1645 bio_split_pool_free(&ca->bio_split_hook);
1646
1647 if (ca->alloc_workqueue)
1648 destroy_workqueue(ca->alloc_workqueue);
1649
1650 free_pages((unsigned long) ca->disk_buckets, ilog2(bucket_pages(ca)));
1651 kfree(ca->prio_buckets);
1652 vfree(ca->buckets);
1653
1654 free_heap(&ca->heap);
1655 free_fifo(&ca->unused);
1656 free_fifo(&ca->free_inc);
1657 free_fifo(&ca->free);
1658
1659 if (ca->sb_bio.bi_inline_vecs[0].bv_page)
1660 put_page(ca->sb_bio.bi_io_vec[0].bv_page);
1661
1662 if (!IS_ERR_OR_NULL(ca->bdev)) {
1663 blk_sync_queue(bdev_get_queue(ca->bdev));
1664 blkdev_put(ca->bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
1665 }
1666
1667 kfree(ca);
1668 module_put(THIS_MODULE);
1669}
1670
1671static int cache_alloc(struct cache_sb *sb, struct cache *ca)
1672{
1673 size_t free;
1674 struct bucket *b;
1675
1676 if (!ca)
1677 return -ENOMEM;
1678
1679 __module_get(THIS_MODULE);
1680 kobject_init(&ca->kobj, &bch_cache_ktype);
1681
1682 memcpy(&ca->sb, sb, sizeof(struct cache_sb));
1683
1684 INIT_LIST_HEAD(&ca->discards);
1685
1686 bio_init(&ca->sb_bio);
1687 ca->sb_bio.bi_max_vecs = 1;
1688 ca->sb_bio.bi_io_vec = ca->sb_bio.bi_inline_vecs;
1689
1690 bio_init(&ca->journal.bio);
1691 ca->journal.bio.bi_max_vecs = 8;
1692 ca->journal.bio.bi_io_vec = ca->journal.bio.bi_inline_vecs;
1693
1694 free = roundup_pow_of_two(ca->sb.nbuckets) >> 9;
1695 free = max_t(size_t, free, (prio_buckets(ca) + 8) * 2);
1696
1697 if (!init_fifo(&ca->free, free, GFP_KERNEL) ||
1698 !init_fifo(&ca->free_inc, free << 2, GFP_KERNEL) ||
1699 !init_fifo(&ca->unused, free << 2, GFP_KERNEL) ||
1700 !init_heap(&ca->heap, free << 3, GFP_KERNEL) ||
1701 !(ca->buckets = vmalloc(sizeof(struct bucket) *
1702 ca->sb.nbuckets)) ||
1703 !(ca->prio_buckets = kzalloc(sizeof(uint64_t) * prio_buckets(ca) *
1704 2, GFP_KERNEL)) ||
1705 !(ca->disk_buckets = alloc_bucket_pages(GFP_KERNEL, ca)) ||
1706 !(ca->alloc_workqueue = alloc_workqueue("bch_allocator", 0, 1)) ||
1707 bio_split_pool_init(&ca->bio_split_hook))
1708 goto err;
1709
1710 ca->prio_last_buckets = ca->prio_buckets + prio_buckets(ca);
1711
1712 memset(ca->buckets, 0, ca->sb.nbuckets * sizeof(struct bucket));
1713 for_each_bucket(b, ca)
1714 atomic_set(&b->pin, 0);
1715
1716 if (bch_cache_allocator_init(ca))
1717 goto err;
1718
1719 return 0;
1720err:
1721 kobject_put(&ca->kobj);
1722 return -ENOMEM;
1723}
1724
1725static const char *register_cache(struct cache_sb *sb, struct page *sb_page,
1726 struct block_device *bdev, struct cache *ca)
1727{
1728 char name[BDEVNAME_SIZE];
1729 const char *err = "cannot allocate memory";
1730
1731 if (cache_alloc(sb, ca) != 0)
1732 return err;
1733
1734 ca->sb_bio.bi_io_vec[0].bv_page = sb_page;
1735 ca->bdev = bdev;
1736 ca->bdev->bd_holder = ca;
1737
1738 if (blk_queue_discard(bdev_get_queue(ca->bdev)))
1739 ca->discard = CACHE_DISCARD(&ca->sb);
1740
1741 err = "error creating kobject";
1742 if (kobject_add(&ca->kobj, &part_to_dev(bdev->bd_part)->kobj, "bcache"))
1743 goto err;
1744
1745 err = register_cache_set(ca);
1746 if (err)
1747 goto err;
1748
1749 pr_info("registered cache device %s", bdevname(bdev, name));
1750
1751 return NULL;
1752err:
1753 kobject_put(&ca->kobj);
1754 pr_info("error opening %s: %s", bdevname(bdev, name), err);
1755 /* Return NULL instead of an error because kobject_put() cleans
1756 * everything up
1757 */
1758 return NULL;
1759}
1760
1761/* Global interfaces/init */
1762
1763static ssize_t register_bcache(struct kobject *, struct kobj_attribute *,
1764 const char *, size_t);
1765
1766kobj_attribute_write(register, register_bcache);
1767kobj_attribute_write(register_quiet, register_bcache);
1768
1769static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr,
1770 const char *buffer, size_t size)
1771{
1772 ssize_t ret = size;
1773 const char *err = "cannot allocate memory";
1774 char *path = NULL;
1775 struct cache_sb *sb = NULL;
1776 struct block_device *bdev = NULL;
1777 struct page *sb_page = NULL;
1778
1779 if (!try_module_get(THIS_MODULE))
1780 return -EBUSY;
1781
1782 mutex_lock(&bch_register_lock);
1783
1784 if (!(path = kstrndup(buffer, size, GFP_KERNEL)) ||
1785 !(sb = kmalloc(sizeof(struct cache_sb), GFP_KERNEL)))
1786 goto err;
1787
1788 err = "failed to open device";
1789 bdev = blkdev_get_by_path(strim(path),
1790 FMODE_READ|FMODE_WRITE|FMODE_EXCL,
1791 sb);
1792 if (bdev == ERR_PTR(-EBUSY))
1793 err = "device busy";
1794
1795 if (IS_ERR(bdev) ||
1796 set_blocksize(bdev, 4096))
1797 goto err;
1798
1799 err = read_super(sb, bdev, &sb_page);
1800 if (err)
1801 goto err_close;
1802
1803 if (sb->version == CACHE_BACKING_DEV) {
1804 struct cached_dev *dc = kzalloc(sizeof(*dc), GFP_KERNEL);
1805
1806 err = register_bdev(sb, sb_page, bdev, dc);
1807 } else {
1808 struct cache *ca = kzalloc(sizeof(*ca), GFP_KERNEL);
1809
1810 err = register_cache(sb, sb_page, bdev, ca);
1811 }
1812
1813 if (err) {
1814 /* register_(bdev|cache) will only return an error if they
1815 * didn't get far enough to create the kobject - if they did,
1816 * the kobject destructor will do this cleanup.
1817 */
1818 put_page(sb_page);
1819err_close:
1820 blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
1821err:
1822 if (attr != &ksysfs_register_quiet)
1823 pr_info("error opening %s: %s", path, err);
1824 ret = -EINVAL;
1825 }
1826
1827 kfree(sb);
1828 kfree(path);
1829 mutex_unlock(&bch_register_lock);
1830 module_put(THIS_MODULE);
1831 return ret;
1832}
1833
1834static int bcache_reboot(struct notifier_block *n, unsigned long code, void *x)
1835{
1836 if (code == SYS_DOWN ||
1837 code == SYS_HALT ||
1838 code == SYS_POWER_OFF) {
1839 DEFINE_WAIT(wait);
1840 unsigned long start = jiffies;
1841 bool stopped = false;
1842
1843 struct cache_set *c, *tc;
1844 struct cached_dev *dc, *tdc;
1845
1846 mutex_lock(&bch_register_lock);
1847
1848 if (list_empty(&bch_cache_sets) &&
1849 list_empty(&uncached_devices))
1850 goto out;
1851
1852 pr_info("Stopping all devices:");
1853
1854 list_for_each_entry_safe(c, tc, &bch_cache_sets, list)
1855 bch_cache_set_stop(c);
1856
1857 list_for_each_entry_safe(dc, tdc, &uncached_devices, list)
1858 bcache_device_stop(&dc->disk);
1859
1860 /* What's a condition variable? */
1861 while (1) {
1862 long timeout = start + 2 * HZ - jiffies;
1863
1864 stopped = list_empty(&bch_cache_sets) &&
1865 list_empty(&uncached_devices);
1866
1867 if (timeout < 0 || stopped)
1868 break;
1869
1870 prepare_to_wait(&unregister_wait, &wait,
1871 TASK_UNINTERRUPTIBLE);
1872
1873 mutex_unlock(&bch_register_lock);
1874 schedule_timeout(timeout);
1875 mutex_lock(&bch_register_lock);
1876 }
1877
1878 finish_wait(&unregister_wait, &wait);
1879
1880 if (stopped)
1881 pr_info("All devices stopped");
1882 else
1883 pr_notice("Timeout waiting for devices to be closed");
1884out:
1885 mutex_unlock(&bch_register_lock);
1886 }
1887
1888 return NOTIFY_DONE;
1889}
1890
1891static struct notifier_block reboot = {
1892 .notifier_call = bcache_reboot,
1893 .priority = INT_MAX, /* before any real devices */
1894};
1895
1896static void bcache_exit(void)
1897{
1898 bch_debug_exit();
1899 bch_writeback_exit();
1900 bch_request_exit();
1901 bch_btree_exit();
1902 if (bcache_kobj)
1903 kobject_put(bcache_kobj);
1904 if (bcache_wq)
1905 destroy_workqueue(bcache_wq);
1906 unregister_blkdev(bcache_major, "bcache");
1907 unregister_reboot_notifier(&reboot);
1908}
1909
1910static int __init bcache_init(void)
1911{
1912 static const struct attribute *files[] = {
1913 &ksysfs_register.attr,
1914 &ksysfs_register_quiet.attr,
1915 NULL
1916 };
1917
1918 mutex_init(&bch_register_lock);
1919 init_waitqueue_head(&unregister_wait);
1920 register_reboot_notifier(&reboot);
Kent Overstreet07e86cc2013-03-25 11:46:43 -07001921 closure_debug_init();
Kent Overstreetcafe5632013-03-23 16:11:31 -07001922
1923 bcache_major = register_blkdev(0, "bcache");
1924 if (bcache_major < 0)
1925 return bcache_major;
1926
1927 if (!(bcache_wq = create_workqueue("bcache")) ||
1928 !(bcache_kobj = kobject_create_and_add("bcache", fs_kobj)) ||
1929 sysfs_create_files(bcache_kobj, files) ||
1930 bch_btree_init() ||
1931 bch_request_init() ||
1932 bch_writeback_init() ||
1933 bch_debug_init(bcache_kobj))
1934 goto err;
1935
1936 return 0;
1937err:
1938 bcache_exit();
1939 return -ENOMEM;
1940}
1941
1942module_exit(bcache_exit);
1943module_init(bcache_init);