blob: e83bddcca34e4524ad11918a7b7d2680275838b4 [file] [log] [blame]
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001/*
2 rbd.c -- Export ceph rados objects as a Linux block device
3
4
5 based on drivers/block/osdblk.c:
6
7 Copyright 2009 Red Hat, Inc.
8
9 This program is free software; you can redistribute it and/or modify
10 it under the terms of the GNU General Public License as published by
11 the Free Software Foundation.
12
13 This program is distributed in the hope that it will be useful,
14 but WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 GNU General Public License for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with this program; see the file COPYING. If not, write to
20 the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
21
22
23
Yehuda Sadehdfc56062010-11-19 14:51:04 -080024 For usage instructions, please refer to:
Yehuda Sadeh602adf42010-08-12 16:11:25 -070025
Yehuda Sadehdfc56062010-11-19 14:51:04 -080026 Documentation/ABI/testing/sysfs-bus-rbd
Yehuda Sadeh602adf42010-08-12 16:11:25 -070027
28 */
29
30#include <linux/ceph/libceph.h>
31#include <linux/ceph/osd_client.h>
32#include <linux/ceph/mon_client.h>
33#include <linux/ceph/decode.h>
Yehuda Sadeh59c2be12011-03-21 15:10:11 -070034#include <linux/parser.h>
Yehuda Sadeh602adf42010-08-12 16:11:25 -070035
36#include <linux/kernel.h>
37#include <linux/device.h>
38#include <linux/module.h>
39#include <linux/fs.h>
40#include <linux/blkdev.h>
41
42#include "rbd_types.h"
43
Alex Elderaafb2302012-09-06 16:00:54 -050044#define RBD_DEBUG /* Activate rbd_assert() calls */
45
Alex Elder593a9e72012-02-07 12:03:37 -060046/*
47 * The basic unit of block I/O is a sector. It is interpreted in a
48 * number of contexts in Linux (blk, bio, genhd), but the default is
49 * universally 512 bytes. These symbols are just slightly more
50 * meaningful than the bare numbers they represent.
51 */
52#define SECTOR_SHIFT 9
53#define SECTOR_SIZE (1ULL << SECTOR_SHIFT)
54
Alex Elderdf111be2012-08-09 10:33:26 -070055/* It might be useful to have this defined elsewhere too */
56
57#define U64_MAX ((u64) (~0ULL))
58
Alex Elderf0f8cef2012-01-29 13:57:44 -060059#define RBD_DRV_NAME "rbd"
60#define RBD_DRV_NAME_LONG "rbd (rados block device)"
Yehuda Sadeh602adf42010-08-12 16:11:25 -070061
62#define RBD_MINORS_PER_MAJOR 256 /* max minors per blkdev */
63
Alex Elderd4b125e2012-07-03 16:01:19 -050064#define RBD_SNAP_DEV_NAME_PREFIX "snap_"
65#define RBD_MAX_SNAP_NAME_LEN \
66 (NAME_MAX - (sizeof (RBD_SNAP_DEV_NAME_PREFIX) - 1))
67
Alex Elder35d489f2012-07-03 16:01:19 -050068#define RBD_MAX_SNAP_COUNT 510 /* allows max snapc to fit in 4KB */
Yehuda Sadeh602adf42010-08-12 16:11:25 -070069#define RBD_MAX_OPT_LEN 1024
70
71#define RBD_SNAP_HEAD_NAME "-"
72
Alex Elder1e130192012-07-03 16:01:19 -050073#define RBD_IMAGE_ID_LEN_MAX 64
74#define RBD_OBJ_PREFIX_LEN_MAX 64
Alex Elder589d30e2012-07-10 20:30:11 -050075
Alex Elderd8891402012-10-09 13:50:17 -070076/* Feature bits */
77
78#define RBD_FEATURE_LAYERING 1
79
80/* Features supported by this (client software) implementation. */
81
82#define RBD_FEATURES_ALL (0)
83
Alex Elder81a89792012-02-02 08:13:30 -060084/*
85 * An RBD device name will be "rbd#", where the "rbd" comes from
86 * RBD_DRV_NAME above, and # is a unique integer identifier.
87 * MAX_INT_FORMAT_WIDTH is used in ensuring DEV_NAME_LEN is big
88 * enough to hold all possible device names.
89 */
Yehuda Sadeh602adf42010-08-12 16:11:25 -070090#define DEV_NAME_LEN 32
Alex Elder81a89792012-02-02 08:13:30 -060091#define MAX_INT_FORMAT_WIDTH ((5 * sizeof (int)) / 2 + 1)
Yehuda Sadeh602adf42010-08-12 16:11:25 -070092
Alex Eldercc0538b2012-08-10 13:12:07 -070093#define RBD_READ_ONLY_DEFAULT false
Yehuda Sadeh59c2be12011-03-21 15:10:11 -070094
Yehuda Sadeh602adf42010-08-12 16:11:25 -070095/*
96 * block device image metadata (in-memory version)
97 */
/*
 * block device image metadata (in-memory version)
 */
struct rbd_image_header {
	/* These four fields never change for a given rbd image */
	char *object_prefix;	/* prefix used to build object names */
	u64 features;		/* feature bits (RBD_FEATURE_*) */
	__u8 obj_order;		/* object size is (1 << obj_order) bytes */
	__u8 crypt_type;	/* on-disk encryption type */
	__u8 comp_type;		/* on-disk compression type */

	/* The remaining fields need to be updated occasionally */
	u64 image_size;		/* mapped image size, in bytes */
	struct ceph_snap_context *snapc;	/* snapshot context (refcounted) */
	char *snap_names;	/* concatenated NUL-terminated snap names */
	u64 *snap_sizes;	/* per-snapshot image sizes, parallel to snapc */

	u64 obj_version;	/* version of the header object this came from */
};
114
/* Options parsed from the "add" sysfs interface (see parse_rbd_opts_token()) */
struct rbd_options {
	bool	read_only;	/* map the device read-only */
};
118
119/*
Alex Elderf0f8cef2012-01-29 13:57:44 -0600120 * an instance of the client. multiple devices may share an rbd client.
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700121 */
/*
 * an instance of the client. multiple devices may share an rbd client.
 */
struct rbd_client {
	struct ceph_client	*client;	/* underlying ceph cluster client */
	struct kref		kref;		/* shared by all devices using this client */
	struct list_head	node;		/* entry on rbd_client_list */
};
127
128/*
Alex Elderf0f8cef2012-01-29 13:57:44 -0600129 * a request completion status
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700130 */
/*
 * a request completion status
 */
struct rbd_req_status {
	int done;	/* nonzero once this sub-request has completed */
	int rc;		/* completion result code */
	u64 bytes;	/* bytes transferred */
};
136
137/*
138 * a collection of requests
139 */
140struct rbd_req_coll {
141 int total;
142 int num_done;
143 struct kref kref;
144 struct rbd_req_status status[0];
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700145};
146
Alex Elderf0f8cef2012-01-29 13:57:44 -0600147/*
148 * a single io request
149 */
/*
 * a single io request
 */
struct rbd_request {
	struct request		*rq;		/* blk layer request */
	struct bio		*bio;		/* cloned bio */
	struct page		**pages;	/* list of used pages */
	u64			len;		/* length of this request, bytes */
	int			coll_index;	/* slot in coll->status[] */
	struct rbd_req_coll	*coll;		/* parent collection, if any */
};
158
/* In-memory record of one image snapshot (also a sysfs device node) */
struct rbd_snap {
	struct device		dev;		/* sysfs representation */
	const char		*name;		/* snapshot name */
	u64			size;		/* image size at this snapshot */
	struct list_head	node;		/* entry on rbd_dev->snaps */
	u64			id;		/* rados snapshot id */
	u64			features;	/* feature bits at this snapshot */
};
167
/* What the device is currently mapped to (head or one snapshot) */
struct rbd_mapping {
	u64			size;		/* size of the mapped view, bytes */
	u64			features;	/* features of the mapped view */
	bool			read_only;	/* writes refused if set */
};
173
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700174/*
175 * a single device
176 */
/*
 * a single device
 */
struct rbd_device {
	int			dev_id;		/* blkdev unique id */

	int			major;		/* blkdev assigned major */
	struct gendisk		*disk;		/* blkdev's gendisk and rq */

	u32			image_format;	/* Either 1 or 2 */
	struct rbd_client	*rbd_client;	/* shared ceph client handle */

	char			name[DEV_NAME_LEN]; /* blkdev name, e.g. rbd3 */

	spinlock_t		lock;		/* queue lock */

	struct rbd_image_header	header;		/* in-memory image metadata */
	bool			exists;		/* mapped image/snap still exists */
	char			*image_id;	/* image id (format 2) */
	size_t			image_id_len;
	char			*image_name;	/* user-visible image name */
	size_t			image_name_len;
	char			*header_name;	/* name of the header object */
	char			*pool_name;	/* pool the image lives in */
	u64			pool_id;

	char			*snap_name;	/* mapped snapshot name, or "-" */
	u64			snap_id;	/* mapped snap id; CEPH_NOSNAP for head */

	struct ceph_osd_event	*watch_event;	/* header watch notification */
	struct ceph_osd_request	*watch_request;

	/* protects updating the header */
	struct rw_semaphore	header_rwsem;

	struct rbd_mapping	mapping;	/* current mapping (size/features/ro) */

	struct list_head	node;		/* entry on rbd_dev_list */

	/* list of snapshots */
	struct list_head	snaps;

	/* sysfs related */
	struct device		dev;
};
219
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700220static DEFINE_MUTEX(ctl_mutex); /* Serialize open/close/setup/teardown */
Alex Eldere124a82f2012-01-29 13:57:44 -0600221
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700222static LIST_HEAD(rbd_dev_list); /* devices */
Alex Eldere124a82f2012-01-29 13:57:44 -0600223static DEFINE_SPINLOCK(rbd_dev_list_lock);
224
Alex Elder432b8582012-01-29 13:57:44 -0600225static LIST_HEAD(rbd_client_list); /* clients */
226static DEFINE_SPINLOCK(rbd_client_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700227
Alex Elder304f6802012-08-31 17:29:52 -0500228static int rbd_dev_snaps_update(struct rbd_device *rbd_dev);
229static int rbd_dev_snaps_register(struct rbd_device *rbd_dev);
230
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800231static void rbd_dev_release(struct device *dev);
Alex Elder41f38c22012-10-25 23:34:40 -0500232static void rbd_remove_snap_dev(struct rbd_snap *snap);
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800233
Alex Elderf0f8cef2012-01-29 13:57:44 -0600234static ssize_t rbd_add(struct bus_type *bus, const char *buf,
235 size_t count);
236static ssize_t rbd_remove(struct bus_type *bus, const char *buf,
237 size_t count);
238
/*
 * The rbd bus exposes write-only "add" and "remove" control files in
 * sysfs; writing to them maps or unmaps an rbd device.
 */
static struct bus_attribute rbd_bus_attrs[] = {
	__ATTR(add, S_IWUSR, NULL, rbd_add),
	__ATTR(remove, S_IWUSR, NULL, rbd_remove),
	__ATTR_NULL
};

static struct bus_type rbd_bus_type = {
	.name		= "rbd",
	.bus_attrs	= rbd_bus_attrs,
};
249
/*
 * Release callback for the root device.  The root device is static,
 * so there is nothing to free; an empty callback silences the driver
 * core warning about devices without a release method.
 */
static void rbd_root_dev_release(struct device *dev)
{
}

/* Parent device for all rbd devices registered in sysfs */
static struct device rbd_root_dev = {
	.init_name =    "rbd",
	.release =      rbd_root_dev_release,
};
258
#ifdef RBD_DEBUG
/*
 * Assert that an expression holds; report and BUG() otherwise.
 *
 * Wrapped in do { } while (0) so the macro behaves as a single
 * statement: the original bare "if (...) { ... }" form made
 *	if (x) rbd_assert(y); else ...
 * bind the "else" to the macro's hidden "if" (dangling-else hazard).
 */
#define rbd_assert(expr)						\
	do {								\
		if (unlikely(!(expr))) {				\
			printk(KERN_ERR "\nAssertion failure in %s() "	\
						"at line %d:\n\n"	\
					"\trbd_assert(%s);\n\n",	\
					__func__, __LINE__, #expr);	\
			BUG();						\
		}							\
	} while (0)
#else /* !RBD_DEBUG */
#  define rbd_assert(expr)	((void) 0)
#endif /* !RBD_DEBUG */
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800271
/* Take a reference on the rbd device's embedded struct device */
static struct device *rbd_get_dev(struct rbd_device *rbd_dev)
{
	return get_device(&rbd_dev->dev);
}

/* Drop a reference taken with rbd_get_dev() */
static void rbd_put_dev(struct rbd_device *rbd_dev)
{
	put_device(&rbd_dev->dev);
}
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700281
Alex Elder117973f2012-08-31 17:29:55 -0500282static int rbd_dev_refresh(struct rbd_device *rbd_dev, u64 *hver);
283static int rbd_dev_v2_refresh(struct rbd_device *rbd_dev, u64 *hver);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700284
/*
 * Block device open callback.
 *
 * Refuses writable opens of a read-only mapping, pins the device
 * while it is open, and propagates the mapping's read-only state to
 * the block layer.
 *
 * Returns 0 on success or -EROFS for a write open of an RO mapping.
 */
static int rbd_open(struct block_device *bdev, fmode_t mode)
{
	struct rbd_device *rbd_dev = bdev->bd_disk->private_data;

	if ((mode & FMODE_WRITE) && rbd_dev->mapping.read_only)
		return -EROFS;

	rbd_get_dev(rbd_dev);
	set_device_ro(bdev, rbd_dev->mapping.read_only);

	return 0;
}
297
/*
 * Block device release callback: drop the reference taken in
 * rbd_open().  Always succeeds.
 */
static int rbd_release(struct gendisk *disk, fmode_t mode)
{
	struct rbd_device *rbd_dev = disk->private_data;

	rbd_put_dev(rbd_dev);

	return 0;
}
306
/* Block device operations exposed to the block layer */
static const struct block_device_operations rbd_bd_ops = {
	.owner			= THIS_MODULE,
	.open			= rbd_open,
	.release		= rbd_release,
};
312
313/*
314 * Initialize an rbd client instance.
Alex Elder43ae4702012-07-03 16:01:18 -0500315 * We own *ceph_opts.
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700316 */
Alex Elderf8c38922012-08-10 13:12:07 -0700317static struct rbd_client *rbd_client_create(struct ceph_options *ceph_opts)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700318{
319 struct rbd_client *rbdc;
320 int ret = -ENOMEM;
321
322 dout("rbd_client_create\n");
323 rbdc = kmalloc(sizeof(struct rbd_client), GFP_KERNEL);
324 if (!rbdc)
325 goto out_opt;
326
327 kref_init(&rbdc->kref);
328 INIT_LIST_HEAD(&rbdc->node);
329
Alex Elderbc534d82012-01-29 13:57:44 -0600330 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
331
Alex Elder43ae4702012-07-03 16:01:18 -0500332 rbdc->client = ceph_create_client(ceph_opts, rbdc, 0, 0);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700333 if (IS_ERR(rbdc->client))
Alex Elderbc534d82012-01-29 13:57:44 -0600334 goto out_mutex;
Alex Elder43ae4702012-07-03 16:01:18 -0500335 ceph_opts = NULL; /* Now rbdc->client is responsible for ceph_opts */
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700336
337 ret = ceph_open_session(rbdc->client);
338 if (ret < 0)
339 goto out_err;
340
Alex Elder432b8582012-01-29 13:57:44 -0600341 spin_lock(&rbd_client_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700342 list_add_tail(&rbdc->node, &rbd_client_list);
Alex Elder432b8582012-01-29 13:57:44 -0600343 spin_unlock(&rbd_client_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700344
Alex Elderbc534d82012-01-29 13:57:44 -0600345 mutex_unlock(&ctl_mutex);
346
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700347 dout("rbd_client_create created %p\n", rbdc);
348 return rbdc;
349
350out_err:
351 ceph_destroy_client(rbdc->client);
Alex Elderbc534d82012-01-29 13:57:44 -0600352out_mutex:
353 mutex_unlock(&ctl_mutex);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700354 kfree(rbdc);
355out_opt:
Alex Elder43ae4702012-07-03 16:01:18 -0500356 if (ceph_opts)
357 ceph_destroy_options(ceph_opts);
Vasiliy Kulikov28f259b2010-09-26 12:59:37 +0400358 return ERR_PTR(ret);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700359}
360
361/*
Alex Elder1f7ba332012-08-10 13:12:07 -0700362 * Find a ceph client with specific addr and configuration. If
363 * found, bump its reference count.
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700364 */
/*
 * Find a ceph client with specific addr and configuration.  If
 * found, bump its reference count.
 *
 * Returns NULL when no matching client exists, or when the caller
 * asked for an unshared client (CEPH_OPT_NOSHARE).
 */
static struct rbd_client *rbd_client_find(struct ceph_options *ceph_opts)
{
	struct rbd_client *client_node;
	bool found = false;

	if (ceph_opts->flags & CEPH_OPT_NOSHARE)
		return NULL;

	spin_lock(&rbd_client_list_lock);
	list_for_each_entry(client_node, &rbd_client_list, node) {
		if (!ceph_compare_options(ceph_opts, client_node->client)) {
			/* Take a ref before dropping the list lock */
			kref_get(&client_node->kref);
			found = true;
			break;
		}
	}
	spin_unlock(&rbd_client_list_lock);

	return found ? client_node : NULL;
}
385
386/*
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700387 * mount options
388 */
/*
 * mount options
 *
 * Token values below Opt_last_int take an integer argument, those
 * between Opt_last_int and Opt_last_string take a string argument,
 * and those between Opt_last_string and Opt_last_bool are Boolean
 * flags.  parse_rbd_opts_token() relies on this ordering.
 */
enum {
	Opt_last_int,
	/* int args above */
	Opt_last_string,
	/* string args above */
	Opt_read_only,
	Opt_read_write,
	/* Boolean args above */
	Opt_last_bool,
};

static match_table_t rbd_opts_tokens = {
	/* int args above */
	/* string args above */
	{Opt_read_only, "read_only"},
	{Opt_read_only, "ro"},		/* Alternate spelling */
	{Opt_read_write, "read_write"},
	{Opt_read_write, "rw"},		/* Alternate spelling */
	/* Boolean args above */
	{-1, NULL}
};
410
/*
 * Parse a single mount option token into *private (a struct
 * rbd_options).  Called once per option by ceph_parse_options().
 *
 * Returns 0 on success, -EINVAL for an unrecognized token, or the
 * error from match_int() for a malformed integer argument.
 */
static int parse_rbd_opts_token(char *c, void *private)
{
	struct rbd_options *rbd_opts = private;
	substring_t argstr[MAX_OPT_ARGS];
	int token, intval, ret;

	token = match_token(c, rbd_opts_tokens, argstr);
	if (token < 0)
		return -EINVAL;

	/* Validate the argument form based on the token's range */
	if (token < Opt_last_int) {
		ret = match_int(&argstr[0], &intval);
		if (ret < 0) {
			pr_err("bad mount option arg (not int) "
			       "at '%s'\n", c);
			return ret;
		}
		dout("got int token %d val %d\n", token, intval);
	} else if (token > Opt_last_int && token < Opt_last_string) {
		dout("got string token %d val %s\n", token,
		     argstr[0].from);
	} else if (token > Opt_last_string && token < Opt_last_bool) {
		dout("got Boolean token %d\n", token);
	} else {
		dout("got token %d\n", token);
	}

	switch (token) {
	case Opt_read_only:
		rbd_opts->read_only = true;
		break;
	case Opt_read_write:
		rbd_opts->read_only = false;
		break;
	default:
		/* Every token in rbd_opts_tokens must be handled above */
		rbd_assert(false);
		break;
	}
	return 0;
}
451
452/*
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700453 * Get a ceph client with specific addr and configuration, if one does
454 * not exist create it.
455 */
/*
 * Get a ceph client with specific addr and configuration, if one does
 * not exist create it.
 *
 * Consumes ceph_opts either way: an existing client means the options
 * are destroyed here; otherwise rbd_client_create() takes ownership.
 * On success rbd_dev->rbd_client is set and 0 is returned.
 */
static int rbd_get_client(struct rbd_device *rbd_dev,
				struct ceph_options *ceph_opts)
{
	struct rbd_client *rbdc;

	rbdc = rbd_client_find(ceph_opts);
	if (rbdc) {
		/* using an existing client */
		ceph_destroy_options(ceph_opts);
	} else {
		rbdc = rbd_client_create(ceph_opts);
		if (IS_ERR(rbdc))
			return PTR_ERR(rbdc);
	}
	rbd_dev->rbd_client = rbdc;

	return 0;
}
474
475/*
476 * Destroy ceph client
Alex Elderd23a4b32012-01-29 13:57:43 -0600477 *
 * Takes rbd_client_list_lock itself while unlinking the client.
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700479 */
/*
 * Destroy ceph client.  Called via kref_put() when the last device
 * reference is dropped; unlinks the client from rbd_client_list
 * (taking rbd_client_list_lock itself) and tears down the ceph client.
 */
static void rbd_client_release(struct kref *kref)
{
	struct rbd_client *rbdc = container_of(kref, struct rbd_client, kref);

	dout("rbd_release_client %p\n", rbdc);
	spin_lock(&rbd_client_list_lock);
	list_del(&rbdc->node);
	spin_unlock(&rbd_client_list_lock);

	ceph_destroy_client(rbdc->client);
	kfree(rbdc);
}
492
493/*
494 * Drop reference to ceph client node. If it's not referenced anymore, release
495 * it.
496 */
/*
 * Drop reference to ceph client node. If it's not referenced anymore, release
 * it.
 */
static void rbd_put_client(struct rbd_device *rbd_dev)
{
	kref_put(&rbd_dev->rbd_client->kref, rbd_client_release);
	rbd_dev->rbd_client = NULL;	/* guard against reuse after drop */
}
502
Yehuda Sadeh1fec7092011-05-13 13:52:56 -0700503/*
504 * Destroy requests collection
505 */
/*
 * Destroy requests collection.  kref release callback; frees the
 * collection (including its trailing status array, allocated in one
 * block).
 */
static void rbd_coll_release(struct kref *kref)
{
	struct rbd_req_coll *coll =
		container_of(kref, struct rbd_req_coll, kref);

	dout("rbd_coll_release %p\n", coll);
	kfree(coll);
}
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700514
Alex Eldera30b71b2012-07-10 20:30:11 -0500515static bool rbd_image_format_valid(u32 image_format)
516{
517 return image_format == 1 || image_format == 2;
518}
519
/*
 * Sanity-check an on-disk (format 1) image header before trusting
 * any of its contents.  Returns true only if the magic text, object
 * order, snapshot count, and snapshot-name length are all plausible.
 */
static bool rbd_dev_ondisk_valid(struct rbd_image_header_ondisk *ondisk)
{
	size_t size;
	u32 snap_count;

	/* The header has to start with the magic rbd header text */
	if (memcmp(&ondisk->text, RBD_HEADER_TEXT, sizeof (RBD_HEADER_TEXT)))
		return false;

	/* The bio layer requires at least sector-sized I/O */

	if (ondisk->options.order < SECTOR_SHIFT)
		return false;

	/* If we use u64 in a few spots we may be able to loosen this */

	if (ondisk->options.order > 8 * sizeof (int) - 1)
		return false;

	/*
	 * The size of a snapshot header has to fit in a size_t, and
	 * that limits the number of snapshots.
	 */
	snap_count = le32_to_cpu(ondisk->snap_count);
	size = SIZE_MAX - sizeof (struct ceph_snap_context);
	if (snap_count > size / sizeof (__le64))
		return false;

	/*
	 * Not only that, but the size of the entire the snapshot
	 * header must also be representable in a size_t.
	 */
	size -= snap_count * sizeof (__le64);
	if ((u64) size < le64_to_cpu(ondisk->snap_names_len))
		return false;

	return true;
}
558
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700559/*
560 * Create a new header structure, translate header format from the on-disk
561 * header.
562 */
563static int rbd_header_from_disk(struct rbd_image_header *header,
Alex Elder4156d992012-08-02 11:29:46 -0500564 struct rbd_image_header_ondisk *ondisk)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700565{
Alex Elderccece232012-07-10 20:30:10 -0500566 u32 snap_count;
Alex Elder58c17b02012-08-23 23:22:06 -0500567 size_t len;
Alex Elderd2bb24e2012-07-26 23:37:14 -0500568 size_t size;
Alex Elder621901d2012-08-23 23:22:06 -0500569 u32 i;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700570
Alex Elder6a523252012-07-19 17:12:59 -0500571 memset(header, 0, sizeof (*header));
572
Alex Elder103a1502012-08-02 11:29:45 -0500573 snap_count = le32_to_cpu(ondisk->snap_count);
574
Alex Elder58c17b02012-08-23 23:22:06 -0500575 len = strnlen(ondisk->object_prefix, sizeof (ondisk->object_prefix));
576 header->object_prefix = kmalloc(len + 1, GFP_KERNEL);
Alex Elder6a523252012-07-19 17:12:59 -0500577 if (!header->object_prefix)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700578 return -ENOMEM;
Alex Elder58c17b02012-08-23 23:22:06 -0500579 memcpy(header->object_prefix, ondisk->object_prefix, len);
580 header->object_prefix[len] = '\0';
Alex Elder00f1f362012-02-07 12:03:36 -0600581
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700582 if (snap_count) {
Alex Elderf785cc12012-08-23 23:22:06 -0500583 u64 snap_names_len = le64_to_cpu(ondisk->snap_names_len);
584
Alex Elder621901d2012-08-23 23:22:06 -0500585 /* Save a copy of the snapshot names */
586
Alex Elderf785cc12012-08-23 23:22:06 -0500587 if (snap_names_len > (u64) SIZE_MAX)
588 return -EIO;
589 header->snap_names = kmalloc(snap_names_len, GFP_KERNEL);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700590 if (!header->snap_names)
Alex Elder6a523252012-07-19 17:12:59 -0500591 goto out_err;
Alex Elderf785cc12012-08-23 23:22:06 -0500592 /*
593 * Note that rbd_dev_v1_header_read() guarantees
594 * the ondisk buffer we're working with has
595 * snap_names_len bytes beyond the end of the
596 * snapshot id array, this memcpy() is safe.
597 */
598 memcpy(header->snap_names, &ondisk->snaps[snap_count],
599 snap_names_len);
Alex Elder6a523252012-07-19 17:12:59 -0500600
Alex Elder621901d2012-08-23 23:22:06 -0500601 /* Record each snapshot's size */
602
Alex Elderd2bb24e2012-07-26 23:37:14 -0500603 size = snap_count * sizeof (*header->snap_sizes);
604 header->snap_sizes = kmalloc(size, GFP_KERNEL);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700605 if (!header->snap_sizes)
Alex Elder6a523252012-07-19 17:12:59 -0500606 goto out_err;
Alex Elder621901d2012-08-23 23:22:06 -0500607 for (i = 0; i < snap_count; i++)
608 header->snap_sizes[i] =
609 le64_to_cpu(ondisk->snaps[i].image_size);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700610 } else {
Alex Elderccece232012-07-10 20:30:10 -0500611 WARN_ON(ondisk->snap_names_len);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700612 header->snap_names = NULL;
613 header->snap_sizes = NULL;
614 }
Alex Elder849b4262012-07-09 21:04:24 -0500615
Alex Elder34b13182012-07-13 20:35:12 -0500616 header->features = 0; /* No features support in v1 images */
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700617 header->obj_order = ondisk->options.order;
618 header->crypt_type = ondisk->options.crypt_type;
619 header->comp_type = ondisk->options.comp_type;
Alex Elder6a523252012-07-19 17:12:59 -0500620
Alex Elder621901d2012-08-23 23:22:06 -0500621 /* Allocate and fill in the snapshot context */
622
Alex Elderf84344f2012-08-31 17:29:51 -0500623 header->image_size = le64_to_cpu(ondisk->image_size);
Alex Elder6a523252012-07-19 17:12:59 -0500624 size = sizeof (struct ceph_snap_context);
625 size += snap_count * sizeof (header->snapc->snaps[0]);
626 header->snapc = kzalloc(size, GFP_KERNEL);
627 if (!header->snapc)
628 goto out_err;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700629
630 atomic_set(&header->snapc->nref, 1);
Alex Elder505cbb92012-07-19 08:49:18 -0500631 header->snapc->seq = le64_to_cpu(ondisk->snap_seq);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700632 header->snapc->num_snaps = snap_count;
Alex Elder621901d2012-08-23 23:22:06 -0500633 for (i = 0; i < snap_count; i++)
634 header->snapc->snaps[i] =
635 le64_to_cpu(ondisk->snaps[i].id);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700636
637 return 0;
638
Alex Elder6a523252012-07-19 17:12:59 -0500639out_err:
Alex Elder849b4262012-07-09 21:04:24 -0500640 kfree(header->snap_sizes);
Alex Elderccece232012-07-10 20:30:10 -0500641 header->snap_sizes = NULL;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700642 kfree(header->snap_names);
Alex Elderccece232012-07-10 20:30:10 -0500643 header->snap_names = NULL;
Alex Elder6a523252012-07-19 17:12:59 -0500644 kfree(header->object_prefix);
645 header->object_prefix = NULL;
Alex Elderccece232012-07-10 20:30:10 -0500646
Alex Elder00f1f362012-02-07 12:03:36 -0600647 return -ENOMEM;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700648}
649
/*
 * Look up a snapshot by name on the device's snapshot list.  On a
 * match, records the snapshot's id, size, and features in the
 * device's mapping state and returns 0; returns -ENOENT otherwise.
 */
static int snap_by_name(struct rbd_device *rbd_dev, const char *snap_name)
{

	struct rbd_snap *snap;

	list_for_each_entry(snap, &rbd_dev->snaps, node) {
		if (!strcmp(snap_name, snap->name)) {
			rbd_dev->snap_id = snap->id;
			rbd_dev->mapping.size = snap->size;
			rbd_dev->mapping.features = snap->features;

			return 0;
		}
	}

	return -ENOENT;
}
667
/*
 * Set up the device mapping for either the image head (snap_name is
 * RBD_SNAP_HEAD_NAME) or a named snapshot.  Snapshot mappings are
 * forced read-only.  Takes over the snap_name pointer on success.
 *
 * Returns 0 on success or -ENOENT if the snapshot is unknown.
 */
static int rbd_dev_set_mapping(struct rbd_device *rbd_dev, char *snap_name)
{
	int ret;

	if (!memcmp(snap_name, RBD_SNAP_HEAD_NAME,
		    sizeof (RBD_SNAP_HEAD_NAME))) {
		/* Mapping the base image, not a snapshot */
		rbd_dev->snap_id = CEPH_NOSNAP;
		rbd_dev->mapping.size = rbd_dev->header.image_size;
		rbd_dev->mapping.features = rbd_dev->header.features;
		ret = 0;
	} else {
		ret = snap_by_name(rbd_dev, snap_name);
		if (ret < 0)
			goto done;
		/* Snapshots are immutable */
		rbd_dev->mapping.read_only = true;
	}
	rbd_dev->snap_name = snap_name;
	rbd_dev->exists = true;
done:
	return ret;
}
689
690static void rbd_header_free(struct rbd_image_header *header)
691{
Alex Elder849b4262012-07-09 21:04:24 -0500692 kfree(header->object_prefix);
Alex Elderd78fd7a2012-07-26 23:37:14 -0500693 header->object_prefix = NULL;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700694 kfree(header->snap_sizes);
Alex Elderd78fd7a2012-07-26 23:37:14 -0500695 header->snap_sizes = NULL;
Alex Elder849b4262012-07-09 21:04:24 -0500696 kfree(header->snap_names);
Alex Elderd78fd7a2012-07-26 23:37:14 -0500697 header->snap_names = NULL;
Josh Durgind1d25642011-12-05 14:03:05 -0800698 ceph_put_snap_context(header->snapc);
Alex Elderd78fd7a2012-07-26 23:37:14 -0500699 header->snapc = NULL;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700700}
701
/*
 * Build the rados object name for the segment containing the given
 * image offset: "<object_prefix>.<segment number in %012llx>".
 *
 * Returns a kmalloc'd string the caller must free, or NULL on
 * allocation failure or if the formatted name would not fit.
 */
static char *rbd_segment_name(struct rbd_device *rbd_dev, u64 offset)
{
	char *name;
	u64 segment;
	int ret;

	/* GFP_NOIO: may be called on the I/O path */
	name = kmalloc(RBD_MAX_SEG_NAME_LEN + 1, GFP_NOIO);
	if (!name)
		return NULL;
	segment = offset >> rbd_dev->header.obj_order;
	ret = snprintf(name, RBD_MAX_SEG_NAME_LEN, "%s.%012llx",
			rbd_dev->header.object_prefix, segment);
	if (ret < 0 || ret >= RBD_MAX_SEG_NAME_LEN) {
		pr_err("error formatting segment name for #%llu (%d)\n",
			segment, ret);
		kfree(name);
		name = NULL;
	}

	return name;
}
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700723
Alex Elder65ccfe22012-08-09 10:33:26 -0700724static u64 rbd_segment_offset(struct rbd_device *rbd_dev, u64 offset)
725{
726 u64 segment_size = (u64) 1 << rbd_dev->header.obj_order;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700727
Alex Elder65ccfe22012-08-09 10:33:26 -0700728 return offset & (segment_size - 1);
729}
730
/*
 * Number of bytes of a request starting at the given image offset
 * that fall within that offset's segment; a request spanning a
 * segment boundary is clipped at the boundary.
 */
static u64 rbd_segment_length(struct rbd_device *rbd_dev,
				u64 offset, u64 length)
{
	u64 segment_size = (u64) 1 << rbd_dev->header.obj_order;

	offset &= segment_size - 1;

	/* offset + length below must not wrap around */
	rbd_assert(length <= U64_MAX - offset);
	if (offset + length > segment_size)
		length = segment_size - offset;

	return length;
}
744
/*
 * Number of segments the byte range [ofs, ofs+len) touches, or
 * -ERANGE if ofs + len would overflow a u64.  A zero-length range
 * touches no segments.
 *
 * NOTE(review): the u64 segment difference is returned as int —
 * presumably ranges are always small enough in practice, but a huge
 * range with a small obj_order could truncate; confirm with callers.
 */
static int rbd_get_num_segments(struct rbd_image_header *header,
				u64 ofs, u64 len)
{
	u64 start_seg;
	u64 end_seg;

	if (!len)
		return 0;
	if (len - 1 > U64_MAX - ofs)
		return -ERANGE;

	start_seg = ofs >> header->obj_order;
	end_seg = (ofs + len - 1) >> header->obj_order;

	return end_seg - start_seg + 1;
}
761
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700762/*
Josh Durgin029bcbd2011-07-22 11:35:23 -0700763 * returns the size of an object in the image
764 */
765static u64 rbd_obj_bytes(struct rbd_image_header *header)
766{
767 return 1 << header->obj_order;
768}
769
770/*
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700771 * bio helpers
772 */
773
774static void bio_chain_put(struct bio *chain)
775{
776 struct bio *tmp;
777
778 while (chain) {
779 tmp = chain;
780 chain = chain->bi_next;
781 bio_put(tmp);
782 }
783}
784
785/*
786 * zeros a bio chain, starting at specific offset
787 */
788static void zero_bio_chain(struct bio *chain, int start_ofs)
789{
790 struct bio_vec *bv;
791 unsigned long flags;
792 void *buf;
793 int i;
794 int pos = 0;
795
796 while (chain) {
797 bio_for_each_segment(bv, chain, i) {
798 if (pos + bv->bv_len > start_ofs) {
799 int remainder = max(start_ofs - pos, 0);
800 buf = bvec_kmap_irq(bv, &flags);
801 memset(buf + remainder, 0,
802 bv->bv_len - remainder);
Dan Carpenter85b5aaa2010-10-11 21:15:11 +0200803 bvec_kunmap_irq(buf, &flags);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700804 }
805 pos += bv->bv_len;
806 }
807
808 chain = chain->bi_next;
809 }
810}
811
/*
 * Clone a portion of a bio, starting at the given byte offset
 * and continuing for the number of bytes indicated.
 *
 * Returns a new bio covering exactly [offset, offset + len) of
 * bio_src's data, or NULL on invalid arguments or allocation
 * failure.  The clone shares bio_src's pages (BIO_CLONED is set);
 * only the bio_vec array entries are copied and trimmed.
 */
static struct bio *bio_clone_range(struct bio *bio_src,
					unsigned int offset,
					unsigned int len,
					gfp_t gfpmask)
{
	struct bio_vec *bv;
	unsigned int resid;
	unsigned short idx;
	unsigned int voff;
	unsigned short end_idx;
	unsigned short vcnt;
	struct bio *bio;

	/* Handle the easy case for the caller */

	if (!offset && len == bio_src->bi_size)
		return bio_clone(bio_src, gfpmask);

	/* Reject empty, oversized, or out-of-range clones */
	if (WARN_ON_ONCE(!len))
		return NULL;
	if (WARN_ON_ONCE(len > bio_src->bi_size))
		return NULL;
	if (WARN_ON_ONCE(offset > bio_src->bi_size - len))
		return NULL;

	/* Find first affected segment... */

	resid = offset;
	__bio_for_each_segment(bv, bio_src, idx, 0) {
		if (resid < bv->bv_len)
			break;
		resid -= bv->bv_len;
	}
	voff = resid;	/* byte offset of the range within segment idx */

	/* ...and the last affected segment */

	resid += len;
	__bio_for_each_segment(bv, bio_src, end_idx, idx) {
		if (resid <= bv->bv_len)
			break;
		resid -= bv->bv_len;
	}
	vcnt = end_idx - idx + 1;	/* number of segments in the clone */

	/* Build the clone */

	bio = bio_alloc(gfpmask, (unsigned int) vcnt);
	if (!bio)
		return NULL;	/* ENOMEM */

	bio->bi_bdev = bio_src->bi_bdev;
	bio->bi_sector = bio_src->bi_sector + (offset >> SECTOR_SHIFT);
	bio->bi_rw = bio_src->bi_rw;
	bio->bi_flags |= 1 << BIO_CLONED;

	/*
	 * Copy over our part of the bio_vec, then update the first
	 * and last (or only) entries.  After the loops above, voff is
	 * the clone's starting offset in the first segment and resid
	 * is the number of bytes it uses in the last one.
	 */
	memcpy(&bio->bi_io_vec[0], &bio_src->bi_io_vec[idx],
			vcnt * sizeof (struct bio_vec));
	bio->bi_io_vec[0].bv_offset += voff;
	if (vcnt > 1) {
		bio->bi_io_vec[0].bv_len -= voff;
		bio->bi_io_vec[vcnt - 1].bv_len = resid;
	} else {
		bio->bi_io_vec[0].bv_len = len;
	}

	bio->bi_vcnt = vcnt;
	bio->bi_size = len;
	bio->bi_idx = 0;

	return bio;
}
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700892
/*
 * Clone a portion of a bio chain, starting at the given byte offset
 * into the first bio in the source chain and continuing for the
 * number of bytes indicated.  The result is another bio chain of
 * exactly the given length, or a null pointer on error.
 *
 * The bio_src and offset parameters are both in-out.  On entry they
 * refer to the first source bio and the offset into that bio where
 * the start of data to be cloned is located.
 *
 * On return, bio_src is updated to refer to the bio in the source
 * chain that contains first un-cloned byte, and *offset will
 * contain the offset of that byte within that bio.
 */
static struct bio *bio_chain_clone_range(struct bio **bio_src,
					 unsigned int *offset,
					 unsigned int len,
					 gfp_t gfpmask)
{
	struct bio *bi = *bio_src;
	unsigned int off = *offset;
	struct bio *chain = NULL;
	struct bio **end;	/* where the next clone gets linked in */

	/* Build up a chain of clone bios up to the limit */

	if (!bi || off >= bi->bi_size || !len)
		return NULL;		/* Nothing to clone */

	end = &chain;
	while (len) {
		unsigned int bi_size;
		struct bio *bio;

		if (!bi)
			goto out_err;	/* EINVAL; ran out of bio's */
		/* Clone at most the rest of this bio, at most len bytes */
		bi_size = min_t(unsigned int, bi->bi_size - off, len);
		bio = bio_clone_range(bi, off, bi_size, gfpmask);
		if (!bio)
			goto out_err;	/* ENOMEM */

		*end = bio;
		end = &bio->bi_next;

		off += bi_size;
		if (off == bi->bi_size) {
			/* Consumed this source bio; move on to the next */
			bi = bi->bi_next;
			off = 0;
		}
		len -= bi_size;
	}
	*bio_src = bi;
	*offset = off;

	return chain;
out_err:
	bio_chain_put(chain);

	return NULL;
}
953
954/*
955 * helpers for osd request op vectors.
956 */
Alex Elder57cfc102012-06-26 12:57:03 -0700957static struct ceph_osd_req_op *rbd_create_rw_ops(int num_ops,
958 int opcode, u32 payload_len)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700959{
Alex Elder57cfc102012-06-26 12:57:03 -0700960 struct ceph_osd_req_op *ops;
961
962 ops = kzalloc(sizeof (*ops) * (num_ops + 1), GFP_NOIO);
963 if (!ops)
964 return NULL;
965
966 ops[0].op = opcode;
967
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700968 /*
969 * op extent offset and length will be set later on
970 * in calc_raw_layout()
971 */
Alex Elder57cfc102012-06-26 12:57:03 -0700972 ops[0].payload_len = payload_len;
973
974 return ops;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700975}
976
/* Free an op vector allocated by rbd_create_rw_ops() (NULL is OK). */
static void rbd_destroy_ops(struct ceph_osd_req_op *ops)
{
	kfree(ops);
}
981
/*
 * Record the completion status of one sub-request (@index) in a
 * request collection, then complete -- in order -- any contiguous
 * run of finished sub-requests against the original block request.
 *
 * Sub-requests may finish out of order, but __blk_end_request() must
 * see them in order, so only the prefix starting at coll->num_done
 * whose slots are all done is ended here.  Each completed slot drops
 * one reference on the collection (released via rbd_coll_release
 * once the count reaches zero).
 *
 * With no collection (@coll == NULL) the whole block request is
 * completed at once; with no request (@rq == NULL) this is a no-op.
 */
static void rbd_coll_end_req_index(struct request *rq,
				   struct rbd_req_coll *coll,
				   int index,
				   int ret, u64 len)
{
	struct request_queue *q;
	int min, max, i;

	dout("rbd_coll_end_req_index %p index %d ret %d len %llu\n",
	     coll, index, ret, (unsigned long long) len);

	if (!rq)
		return;

	if (!coll) {
		blk_end_request(rq, ret, len);
		return;
	}

	q = rq->q;

	/* The queue lock serializes updates to the collection state */
	spin_lock_irq(q->queue_lock);
	coll->status[index].done = 1;
	coll->status[index].rc = ret;
	coll->status[index].bytes = len;
	max = min = coll->num_done;
	/* Extend [min, max) over the contiguous run of finished slots */
	while (max < coll->total && coll->status[max].done)
		max++;

	for (i = min; i<max; i++) {
		__blk_end_request(rq, coll->status[i].rc,
				  coll->status[i].bytes);
		coll->num_done++;
		kref_put(&coll->kref, rbd_coll_release);
	}
	spin_unlock_irq(q->queue_lock);
}
1019
1020static void rbd_coll_end_req(struct rbd_request *req,
1021 int ret, u64 len)
1022{
1023 rbd_coll_end_req_index(req->rq, req->coll, req->coll_index, ret, len);
1024}
1025
/*
 * Send ceph osd request
 *
 * Allocates and submits a single OSD request for @object_name,
 * carrying either a bio chain (@bio) or a page vector (@pages,
 * @num_pages) as its data.  If @rbd_cb is non-NULL the request
 * completes asynchronously through that callback; otherwise this
 * waits for completion, drops the request, and returns its result.
 *
 * @coll/@coll_index identify this sub-request's slot in a request
 * collection (coll may be NULL for stand-alone requests).  If
 * @linger_req is non-NULL the request is registered to linger and is
 * returned through it.  On synchronous completion, @ver (if
 * non-NULL) receives the object's reassert version.
 */
static int rbd_do_request(struct request *rq,
			  struct rbd_device *rbd_dev,
			  struct ceph_snap_context *snapc,
			  u64 snapid,
			  const char *object_name, u64 ofs, u64 len,
			  struct bio *bio,
			  struct page **pages,
			  int num_pages,
			  int flags,
			  struct ceph_osd_req_op *ops,
			  struct rbd_req_coll *coll,
			  int coll_index,
			  void (*rbd_cb)(struct ceph_osd_request *req,
					 struct ceph_msg *msg),
			  struct ceph_osd_request **linger_req,
			  u64 *ver)
{
	struct ceph_osd_request *req;
	struct ceph_file_layout *layout;
	int ret;
	u64 bno;
	struct timespec mtime = CURRENT_TIME;
	struct rbd_request *req_data;
	struct ceph_osd_request_head *reqhead;
	struct ceph_osd_client *osdc;

	req_data = kzalloc(sizeof(*req_data), GFP_NOIO);
	if (!req_data) {
		/* Report the failure through the collection slot, if any */
		if (coll)
			rbd_coll_end_req_index(rq, coll, coll_index,
					       -ENOMEM, len);
		return -ENOMEM;
	}

	if (coll) {
		req_data->coll = coll;
		req_data->coll_index = coll_index;
	}

	dout("rbd_do_request object_name=%s ofs=%llu len=%llu coll=%p[%d]\n",
		object_name, (unsigned long long) ofs,
		(unsigned long long) len, coll, coll_index);

	osdc = &rbd_dev->rbd_client->client->osdc;
	req = ceph_osdc_alloc_request(osdc, flags, snapc, ops,
					false, GFP_NOIO, pages, bio);
	if (!req) {
		ret = -ENOMEM;
		goto done_pages;
	}

	req->r_callback = rbd_cb;

	/* Bookkeeping needed by the completion callback */
	req_data->rq = rq;
	req_data->bio = bio;
	req_data->pages = pages;
	req_data->len = len;

	req->r_priv = req_data;

	reqhead = req->r_request->front.iov_base;
	reqhead->snapid = cpu_to_le64(CEPH_NOSNAP);

	/*
	 * NOTE(review): if object_name were ever as long as
	 * sizeof(req->r_oid), strncpy() would leave r_oid without a
	 * NUL terminator and the strlen() below would read past it.
	 * Callers currently pass bounded segment/header names --
	 * confirm before reusing this with other names.
	 */
	strncpy(req->r_oid, object_name, sizeof(req->r_oid));
	req->r_oid_len = strlen(req->r_oid);

	/* One object per stripe unit; all I/O targets a single object */
	layout = &req->r_file_layout;
	memset(layout, 0, sizeof(*layout));
	layout->fl_stripe_unit = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
	layout->fl_stripe_count = cpu_to_le32(1);
	layout->fl_object_size = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
	layout->fl_pg_pool = cpu_to_le32((int) rbd_dev->pool_id);
	ret = ceph_calc_raw_layout(osdc, layout, snapid, ofs, &len, &bno,
				   req, ops);
	rbd_assert(ret == 0);

	ceph_osdc_build_request(req, ofs, &len,
				ops,
				snapc,
				&mtime,
				req->r_oid, req->r_oid_len);

	if (linger_req) {
		ceph_osdc_set_request_linger(osdc, req);
		*linger_req = req;
	}

	ret = ceph_osdc_start_request(osdc, req, false);
	if (ret < 0)
		goto done_err;

	if (!rbd_cb) {
		/* Synchronous: wait here and drop the request ourselves */
		ret = ceph_osdc_wait_request(osdc, req);
		if (ver)
			*ver = le64_to_cpu(req->r_reassert_version.version);
		dout("reassert_ver=%llu\n",
			(unsigned long long)
			le64_to_cpu(req->r_reassert_version.version));
		ceph_osdc_put_request(req);
	}
	return ret;

done_err:
	bio_chain_put(req_data->bio);
	ceph_osdc_put_request(req);
done_pages:
	rbd_coll_end_req(req_data, ret, len);
	kfree(req_data);
	return ret;
}
1139
/*
 * Ceph osd op callback
 *
 * Completion handler for asynchronous requests (see rbd_do_op()).
 * Decodes the reply, patches up reads that came back short or hit a
 * nonexistent object by zero-filling the bio chain, completes the
 * request's collection slot, and releases the request and its
 * bookkeeping.
 */
static void rbd_req_cb(struct ceph_osd_request *req, struct ceph_msg *msg)
{
	struct rbd_request *req_data = req->r_priv;
	struct ceph_osd_reply_head *replyhead;
	struct ceph_osd_op *op;
	__s32 rc;
	u64 bytes;
	int read_op;

	/* parse reply */
	replyhead = msg->front.iov_base;
	WARN_ON(le32_to_cpu(replyhead->num_ops) == 0);
	op = (void *)(replyhead + 1);	/* first op follows the header */
	rc = le32_to_cpu(replyhead->result);
	bytes = le64_to_cpu(op->extent.length);
	read_op = (le16_to_cpu(op->op) == CEPH_OSD_OP_READ);

	dout("rbd_req_cb bytes=%llu readop=%d rc=%d\n",
		(unsigned long long) bytes, read_op, (int) rc);

	if (rc == -ENOENT && read_op) {
		/* Reading an object that doesn't exist yields zeroes */
		zero_bio_chain(req_data->bio, 0);
		rc = 0;
	} else if (rc == 0 && read_op && bytes < req_data->len) {
		/* Short read: zero the tail and report the full length */
		zero_bio_chain(req_data->bio, bytes);
		bytes = req_data->len;
	}

	rbd_coll_end_req(req_data, rc, bytes);

	if (req_data->bio)
		bio_chain_put(req_data->bio);

	ceph_osdc_put_request(req);
	kfree(req_data);
}
1179
/*
 * Minimal completion callback: just drop the request reference.
 * Used for fire-and-forget requests (e.g. notify acks).
 */
static void rbd_simple_req_cb(struct ceph_osd_request *req, struct ceph_msg *msg)
{
	ceph_osdc_put_request(req);
}
1184
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001185/*
1186 * Do a synchronous ceph osd operation
1187 */
Alex Elder0ce1a792012-07-03 16:01:18 -05001188static int rbd_req_sync_op(struct rbd_device *rbd_dev,
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001189 struct ceph_snap_context *snapc,
1190 u64 snapid,
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001191 int flags,
Alex Elder913d2fd2012-06-26 12:57:03 -07001192 struct ceph_osd_req_op *ops,
Alex Elderaded07e2012-07-03 16:01:18 -05001193 const char *object_name,
Alex Elderf8d4de62012-07-03 16:01:19 -05001194 u64 ofs, u64 inbound_size,
1195 char *inbound,
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001196 struct ceph_osd_request **linger_req,
1197 u64 *ver)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001198{
1199 int ret;
1200 struct page **pages;
1201 int num_pages;
Alex Elder913d2fd2012-06-26 12:57:03 -07001202
Alex Elderaafb2302012-09-06 16:00:54 -05001203 rbd_assert(ops != NULL);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001204
Alex Elderf8d4de62012-07-03 16:01:19 -05001205 num_pages = calc_pages_for(ofs, inbound_size);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001206 pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL);
Dan Carpenterb8d06382010-10-11 21:14:23 +02001207 if (IS_ERR(pages))
1208 return PTR_ERR(pages);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001209
Alex Elder0ce1a792012-07-03 16:01:18 -05001210 ret = rbd_do_request(NULL, rbd_dev, snapc, snapid,
Alex Elderf8d4de62012-07-03 16:01:19 -05001211 object_name, ofs, inbound_size, NULL,
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001212 pages, num_pages,
1213 flags,
1214 ops,
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001215 NULL, 0,
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001216 NULL,
1217 linger_req, ver);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001218 if (ret < 0)
Alex Elder913d2fd2012-06-26 12:57:03 -07001219 goto done;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001220
Alex Elderf8d4de62012-07-03 16:01:19 -05001221 if ((flags & CEPH_OSD_FLAG_READ) && inbound)
1222 ret = ceph_copy_from_page_vector(pages, inbound, ofs, ret);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001223
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001224done:
1225 ceph_release_page_vector(pages, num_pages);
1226 return ret;
1227}
1228
1229/*
1230 * Do an asynchronous ceph osd operation
1231 */
1232static int rbd_do_op(struct request *rq,
Alex Elder0ce1a792012-07-03 16:01:18 -05001233 struct rbd_device *rbd_dev,
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001234 struct ceph_snap_context *snapc,
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001235 u64 ofs, u64 len,
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001236 struct bio *bio,
1237 struct rbd_req_coll *coll,
1238 int coll_index)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001239{
1240 char *seg_name;
1241 u64 seg_ofs;
1242 u64 seg_len;
1243 int ret;
1244 struct ceph_osd_req_op *ops;
1245 u32 payload_len;
Alex Elderff2e4bb2012-10-10 18:59:29 -07001246 int opcode;
1247 int flags;
Alex Elder46342462012-10-10 18:59:29 -07001248 u64 snapid;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001249
Alex Elder65ccfe22012-08-09 10:33:26 -07001250 seg_name = rbd_segment_name(rbd_dev, ofs);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001251 if (!seg_name)
1252 return -ENOMEM;
Alex Elder65ccfe22012-08-09 10:33:26 -07001253 seg_len = rbd_segment_length(rbd_dev, ofs, len);
1254 seg_ofs = rbd_segment_offset(rbd_dev, ofs);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001255
Alex Elderff2e4bb2012-10-10 18:59:29 -07001256 if (rq_data_dir(rq) == WRITE) {
1257 opcode = CEPH_OSD_OP_WRITE;
1258 flags = CEPH_OSD_FLAG_WRITE|CEPH_OSD_FLAG_ONDISK;
Alex Elder46342462012-10-10 18:59:29 -07001259 snapid = CEPH_NOSNAP;
Alex Elderff2e4bb2012-10-10 18:59:29 -07001260 payload_len = seg_len;
1261 } else {
1262 opcode = CEPH_OSD_OP_READ;
1263 flags = CEPH_OSD_FLAG_READ;
Alex Elder46342462012-10-10 18:59:29 -07001264 snapc = NULL;
Alex Elder971f8392012-10-25 23:34:41 -05001265 snapid = rbd_dev->snap_id;
Alex Elderff2e4bb2012-10-10 18:59:29 -07001266 payload_len = 0;
1267 }
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001268
Alex Elder57cfc102012-06-26 12:57:03 -07001269 ret = -ENOMEM;
1270 ops = rbd_create_rw_ops(1, opcode, payload_len);
1271 if (!ops)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001272 goto done;
1273
1274 /* we've taken care of segment sizes earlier when we
1275 cloned the bios. We should never have a segment
1276 truncated at this point */
Alex Elderaafb2302012-09-06 16:00:54 -05001277 rbd_assert(seg_len == len);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001278
1279 ret = rbd_do_request(rq, rbd_dev, snapc, snapid,
1280 seg_name, seg_ofs, seg_len,
1281 bio,
1282 NULL, 0,
1283 flags,
1284 ops,
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001285 coll, coll_index,
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001286 rbd_req_cb, 0, NULL);
Sage Weil11f77002011-05-12 16:13:54 -07001287
1288 rbd_destroy_ops(ops);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001289done:
1290 kfree(seg_name);
1291 return ret;
1292}
1293
1294/*
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001295 * Request sync osd read
1296 */
Alex Elder0ce1a792012-07-03 16:01:18 -05001297static int rbd_req_sync_read(struct rbd_device *rbd_dev,
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001298 u64 snapid,
Alex Elderaded07e2012-07-03 16:01:18 -05001299 const char *object_name,
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001300 u64 ofs, u64 len,
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001301 char *buf,
1302 u64 *ver)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001303{
Alex Elder913d2fd2012-06-26 12:57:03 -07001304 struct ceph_osd_req_op *ops;
1305 int ret;
1306
1307 ops = rbd_create_rw_ops(1, CEPH_OSD_OP_READ, 0);
1308 if (!ops)
1309 return -ENOMEM;
1310
1311 ret = rbd_req_sync_op(rbd_dev, NULL,
Josh Durginb06e6a62011-11-21 18:16:52 -08001312 snapid,
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001313 CEPH_OSD_FLAG_READ,
Alex Elder913d2fd2012-06-26 12:57:03 -07001314 ops, object_name, ofs, len, buf, NULL, ver);
1315 rbd_destroy_ops(ops);
1316
1317 return ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001318}
1319
1320/*
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001321 * Request sync osd watch
1322 */
Alex Elder0ce1a792012-07-03 16:01:18 -05001323static int rbd_req_sync_notify_ack(struct rbd_device *rbd_dev,
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001324 u64 ver,
Alex Elder7f0a24d2012-07-25 09:32:40 -05001325 u64 notify_id)
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001326{
1327 struct ceph_osd_req_op *ops;
Sage Weil11f77002011-05-12 16:13:54 -07001328 int ret;
1329
Alex Elder57cfc102012-06-26 12:57:03 -07001330 ops = rbd_create_rw_ops(1, CEPH_OSD_OP_NOTIFY_ACK, 0);
1331 if (!ops)
1332 return -ENOMEM;
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001333
Josh Durgina71b8912011-12-05 18:10:44 -08001334 ops[0].watch.ver = cpu_to_le64(ver);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001335 ops[0].watch.cookie = notify_id;
1336 ops[0].watch.flag = 0;
1337
Alex Elder0ce1a792012-07-03 16:01:18 -05001338 ret = rbd_do_request(NULL, rbd_dev, NULL, CEPH_NOSNAP,
Alex Elder7f0a24d2012-07-25 09:32:40 -05001339 rbd_dev->header_name, 0, 0, NULL,
Alex Elderad4f2322012-07-03 16:01:19 -05001340 NULL, 0,
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001341 CEPH_OSD_FLAG_READ,
1342 ops,
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001343 NULL, 0,
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001344 rbd_simple_req_cb, 0, NULL);
1345
1346 rbd_destroy_ops(ops);
1347 return ret;
1348}
1349
1350static void rbd_watch_cb(u64 ver, u64 notify_id, u8 opcode, void *data)
1351{
Alex Elder0ce1a792012-07-03 16:01:18 -05001352 struct rbd_device *rbd_dev = (struct rbd_device *)data;
Josh Durgina71b8912011-12-05 18:10:44 -08001353 u64 hver;
Sage Weil13143d22011-05-12 16:08:30 -07001354 int rc;
1355
Alex Elder0ce1a792012-07-03 16:01:18 -05001356 if (!rbd_dev)
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001357 return;
1358
Alex Elderbd919d42012-07-13 20:35:11 -05001359 dout("rbd_watch_cb %s notify_id=%llu opcode=%u\n",
1360 rbd_dev->header_name, (unsigned long long) notify_id,
1361 (unsigned int) opcode);
Alex Elder117973f2012-08-31 17:29:55 -05001362 rc = rbd_dev_refresh(rbd_dev, &hver);
Sage Weil13143d22011-05-12 16:08:30 -07001363 if (rc)
Alex Elderf0f8cef2012-01-29 13:57:44 -06001364 pr_warning(RBD_DRV_NAME "%d got notification but failed to "
Alex Elder0ce1a792012-07-03 16:01:18 -05001365 " update snaps: %d\n", rbd_dev->major, rc);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001366
Alex Elder7f0a24d2012-07-25 09:32:40 -05001367 rbd_req_sync_notify_ack(rbd_dev, hver, notify_id);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001368}
1369
1370/*
1371 * Request sync osd watch
1372 */
Alex Elder0e6f3222012-07-25 09:32:40 -05001373static int rbd_req_sync_watch(struct rbd_device *rbd_dev)
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001374{
1375 struct ceph_osd_req_op *ops;
Alex Elder0ce1a792012-07-03 16:01:18 -05001376 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
Alex Elder57cfc102012-06-26 12:57:03 -07001377 int ret;
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001378
Alex Elder57cfc102012-06-26 12:57:03 -07001379 ops = rbd_create_rw_ops(1, CEPH_OSD_OP_WATCH, 0);
1380 if (!ops)
1381 return -ENOMEM;
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001382
1383 ret = ceph_osdc_create_event(osdc, rbd_watch_cb, 0,
Alex Elder0ce1a792012-07-03 16:01:18 -05001384 (void *)rbd_dev, &rbd_dev->watch_event);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001385 if (ret < 0)
1386 goto fail;
1387
Alex Elder0e6f3222012-07-25 09:32:40 -05001388 ops[0].watch.ver = cpu_to_le64(rbd_dev->header.obj_version);
Alex Elder0ce1a792012-07-03 16:01:18 -05001389 ops[0].watch.cookie = cpu_to_le64(rbd_dev->watch_event->cookie);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001390 ops[0].watch.flag = 1;
1391
Alex Elder0ce1a792012-07-03 16:01:18 -05001392 ret = rbd_req_sync_op(rbd_dev, NULL,
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001393 CEPH_NOSNAP,
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001394 CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
1395 ops,
Alex Elder0e6f3222012-07-25 09:32:40 -05001396 rbd_dev->header_name,
1397 0, 0, NULL,
Alex Elder0ce1a792012-07-03 16:01:18 -05001398 &rbd_dev->watch_request, NULL);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001399
1400 if (ret < 0)
1401 goto fail_event;
1402
1403 rbd_destroy_ops(ops);
1404 return 0;
1405
1406fail_event:
Alex Elder0ce1a792012-07-03 16:01:18 -05001407 ceph_osdc_cancel_event(rbd_dev->watch_event);
1408 rbd_dev->watch_event = NULL;
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001409fail:
1410 rbd_destroy_ops(ops);
1411 return ret;
1412}
1413
Yehuda Sadeh79e3057c2011-07-12 16:56:57 -07001414/*
1415 * Request sync osd unwatch
1416 */
Alex Elder070c6332012-07-25 09:32:41 -05001417static int rbd_req_sync_unwatch(struct rbd_device *rbd_dev)
Yehuda Sadeh79e3057c2011-07-12 16:56:57 -07001418{
1419 struct ceph_osd_req_op *ops;
Alex Elder57cfc102012-06-26 12:57:03 -07001420 int ret;
Yehuda Sadeh79e3057c2011-07-12 16:56:57 -07001421
Alex Elder57cfc102012-06-26 12:57:03 -07001422 ops = rbd_create_rw_ops(1, CEPH_OSD_OP_WATCH, 0);
1423 if (!ops)
1424 return -ENOMEM;
Yehuda Sadeh79e3057c2011-07-12 16:56:57 -07001425
1426 ops[0].watch.ver = 0;
Alex Elder0ce1a792012-07-03 16:01:18 -05001427 ops[0].watch.cookie = cpu_to_le64(rbd_dev->watch_event->cookie);
Yehuda Sadeh79e3057c2011-07-12 16:56:57 -07001428 ops[0].watch.flag = 0;
1429
Alex Elder0ce1a792012-07-03 16:01:18 -05001430 ret = rbd_req_sync_op(rbd_dev, NULL,
Yehuda Sadeh79e3057c2011-07-12 16:56:57 -07001431 CEPH_NOSNAP,
Yehuda Sadeh79e3057c2011-07-12 16:56:57 -07001432 CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
1433 ops,
Alex Elder070c6332012-07-25 09:32:41 -05001434 rbd_dev->header_name,
1435 0, 0, NULL, NULL, NULL);
1436
Yehuda Sadeh79e3057c2011-07-12 16:56:57 -07001437
1438 rbd_destroy_ops(ops);
Alex Elder0ce1a792012-07-03 16:01:18 -05001439 ceph_osdc_cancel_event(rbd_dev->watch_event);
1440 rbd_dev->watch_event = NULL;
Yehuda Sadeh79e3057c2011-07-12 16:56:57 -07001441 return ret;
1442}
1443
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001444/*
Alex Elder3cb4a682012-06-26 12:57:03 -07001445 * Synchronous osd object method call
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001446 */
Alex Elder0ce1a792012-07-03 16:01:18 -05001447static int rbd_req_sync_exec(struct rbd_device *rbd_dev,
Alex Elderaded07e2012-07-03 16:01:18 -05001448 const char *object_name,
1449 const char *class_name,
1450 const char *method_name,
Alex Elder3cb4a682012-06-26 12:57:03 -07001451 const char *outbound,
1452 size_t outbound_size,
Alex Elderf8d4de62012-07-03 16:01:19 -05001453 char *inbound,
1454 size_t inbound_size,
Alex Elder3cb4a682012-06-26 12:57:03 -07001455 int flags,
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001456 u64 *ver)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001457{
1458 struct ceph_osd_req_op *ops;
Alex Elderaded07e2012-07-03 16:01:18 -05001459 int class_name_len = strlen(class_name);
1460 int method_name_len = strlen(method_name);
Alex Elder3cb4a682012-06-26 12:57:03 -07001461 int payload_size;
Alex Elder57cfc102012-06-26 12:57:03 -07001462 int ret;
1463
Alex Elder3cb4a682012-06-26 12:57:03 -07001464 /*
1465 * Any input parameters required by the method we're calling
1466 * will be sent along with the class and method names as
1467 * part of the message payload. That data and its size are
1468 * supplied via the indata and indata_len fields (named from
1469 * the perspective of the server side) in the OSD request
1470 * operation.
1471 */
1472 payload_size = class_name_len + method_name_len + outbound_size;
1473 ops = rbd_create_rw_ops(1, CEPH_OSD_OP_CALL, payload_size);
Alex Elder57cfc102012-06-26 12:57:03 -07001474 if (!ops)
1475 return -ENOMEM;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001476
Alex Elderaded07e2012-07-03 16:01:18 -05001477 ops[0].cls.class_name = class_name;
1478 ops[0].cls.class_len = (__u8) class_name_len;
1479 ops[0].cls.method_name = method_name;
1480 ops[0].cls.method_len = (__u8) method_name_len;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001481 ops[0].cls.argc = 0;
Alex Elder3cb4a682012-06-26 12:57:03 -07001482 ops[0].cls.indata = outbound;
1483 ops[0].cls.indata_len = outbound_size;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001484
Alex Elder0ce1a792012-07-03 16:01:18 -05001485 ret = rbd_req_sync_op(rbd_dev, NULL,
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001486 CEPH_NOSNAP,
Alex Elder3cb4a682012-06-26 12:57:03 -07001487 flags, ops,
Alex Elderf8d4de62012-07-03 16:01:19 -05001488 object_name, 0, inbound_size, inbound,
1489 NULL, ver);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001490
1491 rbd_destroy_ops(ops);
1492
1493 dout("cls_exec returned %d\n", ret);
1494 return ret;
1495}
1496
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001497static struct rbd_req_coll *rbd_alloc_coll(int num_reqs)
1498{
1499 struct rbd_req_coll *coll =
1500 kzalloc(sizeof(struct rbd_req_coll) +
1501 sizeof(struct rbd_req_status) * num_reqs,
1502 GFP_ATOMIC);
1503
1504 if (!coll)
1505 return NULL;
1506 coll->total = num_reqs;
1507 kref_init(&coll->kref);
1508 return coll;
1509}
1510
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001511/*
1512 * block device queue callback
1513 */
1514static void rbd_rq_fn(struct request_queue *q)
1515{
1516 struct rbd_device *rbd_dev = q->queuedata;
1517 struct request *rq;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001518
Alex Elder00f1f362012-02-07 12:03:36 -06001519 while ((rq = blk_fetch_request(q))) {
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001520 struct bio *bio;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001521 bool do_write;
Alex Elderbd919d42012-07-13 20:35:11 -05001522 unsigned int size;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001523 u64 ofs;
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001524 int num_segs, cur_seg = 0;
1525 struct rbd_req_coll *coll;
Josh Durgind1d25642011-12-05 14:03:05 -08001526 struct ceph_snap_context *snapc;
Alex Elderf7760da2012-10-20 22:17:27 -05001527 unsigned int bio_offset;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001528
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001529 dout("fetched request\n");
1530
1531 /* filter out block requests we don't understand */
1532 if ((rq->cmd_type != REQ_TYPE_FS)) {
1533 __blk_end_request_all(rq, 0);
Alex Elder00f1f362012-02-07 12:03:36 -06001534 continue;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001535 }
1536
1537 /* deduce our operation (read, write) */
1538 do_write = (rq_data_dir(rq) == WRITE);
Alex Elderf84344f2012-08-31 17:29:51 -05001539 if (do_write && rbd_dev->mapping.read_only) {
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001540 __blk_end_request_all(rq, -EROFS);
Alex Elder00f1f362012-02-07 12:03:36 -06001541 continue;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001542 }
1543
1544 spin_unlock_irq(q->queue_lock);
1545
Josh Durgind1d25642011-12-05 14:03:05 -08001546 down_read(&rbd_dev->header_rwsem);
Josh Durgine88a36e2011-11-21 18:14:25 -08001547
Alex Elderdaba5fd2012-10-26 17:25:23 -05001548 if (!rbd_dev->exists) {
1549 rbd_assert(rbd_dev->snap_id != CEPH_NOSNAP);
Josh Durgine88a36e2011-11-21 18:14:25 -08001550 up_read(&rbd_dev->header_rwsem);
Josh Durgind1d25642011-12-05 14:03:05 -08001551 dout("request for non-existent snapshot");
1552 spin_lock_irq(q->queue_lock);
1553 __blk_end_request_all(rq, -ENXIO);
1554 continue;
Josh Durgine88a36e2011-11-21 18:14:25 -08001555 }
1556
Josh Durgind1d25642011-12-05 14:03:05 -08001557 snapc = ceph_get_snap_context(rbd_dev->header.snapc);
1558
1559 up_read(&rbd_dev->header_rwsem);
1560
Alex Elderf7760da2012-10-20 22:17:27 -05001561 size = blk_rq_bytes(rq);
1562 ofs = blk_rq_pos(rq) * SECTOR_SIZE;
1563 bio = rq->bio;
1564
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001565 dout("%s 0x%x bytes at 0x%llx\n",
1566 do_write ? "write" : "read",
Alex Elderbd919d42012-07-13 20:35:11 -05001567 size, (unsigned long long) blk_rq_pos(rq) * SECTOR_SIZE);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001568
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001569 num_segs = rbd_get_num_segments(&rbd_dev->header, ofs, size);
Alex Elderdf111be2012-08-09 10:33:26 -07001570 if (num_segs <= 0) {
1571 spin_lock_irq(q->queue_lock);
1572 __blk_end_request_all(rq, num_segs);
1573 ceph_put_snap_context(snapc);
1574 continue;
1575 }
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001576 coll = rbd_alloc_coll(num_segs);
1577 if (!coll) {
1578 spin_lock_irq(q->queue_lock);
1579 __blk_end_request_all(rq, -ENOMEM);
Josh Durgind1d25642011-12-05 14:03:05 -08001580 ceph_put_snap_context(snapc);
Alex Elder00f1f362012-02-07 12:03:36 -06001581 continue;
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001582 }
1583
Alex Elderf7760da2012-10-20 22:17:27 -05001584 bio_offset = 0;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001585 do {
Alex Elderf7760da2012-10-20 22:17:27 -05001586 u64 limit = rbd_segment_length(rbd_dev, ofs, size);
1587 unsigned int chain_size;
1588 struct bio *bio_chain;
1589
1590 BUG_ON(limit > (u64) UINT_MAX);
1591 chain_size = (unsigned int) limit;
Alex Elderbd919d42012-07-13 20:35:11 -05001592 dout("rq->bio->bi_vcnt=%hu\n", rq->bio->bi_vcnt);
Alex Elderf7760da2012-10-20 22:17:27 -05001593
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001594 kref_get(&coll->kref);
Alex Elderf7760da2012-10-20 22:17:27 -05001595
1596 /* Pass a cloned bio chain via an osd request */
1597
1598 bio_chain = bio_chain_clone_range(&bio,
1599 &bio_offset, chain_size,
1600 GFP_ATOMIC);
1601 if (bio_chain)
Alex Elder46342462012-10-10 18:59:29 -07001602 (void) rbd_do_op(rq, rbd_dev, snapc,
Alex Elderf7760da2012-10-20 22:17:27 -05001603 ofs, chain_size,
1604 bio_chain, coll, cur_seg);
Alex Elder46342462012-10-10 18:59:29 -07001605 else
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001606 rbd_coll_end_req_index(rq, coll, cur_seg,
Alex Elderf7760da2012-10-20 22:17:27 -05001607 -ENOMEM, chain_size);
1608 size -= chain_size;
1609 ofs += chain_size;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001610
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001611 cur_seg++;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001612 } while (size > 0);
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001613 kref_put(&coll->kref, rbd_coll_release);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001614
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001615 spin_lock_irq(q->queue_lock);
Josh Durgind1d25642011-12-05 14:03:05 -08001616
1617 ceph_put_snap_context(snapc);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001618 }
1619}
1620
/*
 * a queue callback. Makes sure that we don't create a bio that spans across
 * multiple osd objects. One exception would be with a single page bios,
 * which we handle later at bio_chain_clone_range()
 *
 * Returns the number of bytes of @bvec that may be merged into the
 * bio described by @bmd (possibly 0, or the whole bvec length).
 */
static int rbd_merge_bvec(struct request_queue *q, struct bvec_merge_data *bmd,
			  struct bio_vec *bvec)
{
	struct rbd_device *rbd_dev = q->queuedata;
	sector_t sector_offset;
	sector_t sectors_per_obj;
	sector_t obj_sector_offset;
	int ret;

	/*
	 * Find how far into its rbd object the partition-relative
	 * bio start sector is to offset relative to the enclosing
	 * device.
	 */
	sector_offset = get_start_sect(bmd->bi_bdev) + bmd->bi_sector;
	/* obj_order is log2 of the object size, so this is a power of two */
	sectors_per_obj = 1 << (rbd_dev->header.obj_order - SECTOR_SHIFT);
	obj_sector_offset = sector_offset & (sectors_per_obj - 1);

	/*
	 * Compute the number of bytes from that offset to the end
	 * of the object.  Account for what's already used by the bio.
	 */
	ret = (int) (sectors_per_obj - obj_sector_offset) << SECTOR_SHIFT;
	if (ret > bmd->bi_size)
		ret -= bmd->bi_size;
	else
		ret = 0;

	/*
	 * Don't send back more than was asked for.  And if the bio
	 * was empty, let the whole thing through because:  "Note
	 * that a block device *must* allow a single page to be
	 * added to an empty bio."
	 */
	rbd_assert(bvec->bv_len <= PAGE_SIZE);
	if (ret > (int) bvec->bv_len || !bmd->bi_size)
		ret = (int) bvec->bv_len;

	return ret;
}
1666
1667static void rbd_free_disk(struct rbd_device *rbd_dev)
1668{
1669 struct gendisk *disk = rbd_dev->disk;
1670
1671 if (!disk)
1672 return;
1673
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001674 if (disk->flags & GENHD_FL_UP)
1675 del_gendisk(disk);
1676 if (disk->queue)
1677 blk_cleanup_queue(disk->queue);
1678 put_disk(disk);
1679}
1680
1681/*
Alex Elder4156d992012-08-02 11:29:46 -05001682 * Read the complete header for the given rbd device.
1683 *
1684 * Returns a pointer to a dynamically-allocated buffer containing
1685 * the complete and validated header. Caller can pass the address
1686 * of a variable that will be filled in with the version of the
1687 * header object at the time it was read.
1688 *
1689 * Returns a pointer-coded errno if a failure occurs.
1690 */
1691static struct rbd_image_header_ondisk *
1692rbd_dev_v1_header_read(struct rbd_device *rbd_dev, u64 *version)
1693{
1694 struct rbd_image_header_ondisk *ondisk = NULL;
1695 u32 snap_count = 0;
1696 u64 names_size = 0;
1697 u32 want_count;
1698 int ret;
1699
1700 /*
1701 * The complete header will include an array of its 64-bit
1702 * snapshot ids, followed by the names of those snapshots as
1703 * a contiguous block of NUL-terminated strings. Note that
1704 * the number of snapshots could change by the time we read
1705 * it in, in which case we re-read it.
1706 */
1707 do {
1708 size_t size;
1709
1710 kfree(ondisk);
1711
1712 size = sizeof (*ondisk);
1713 size += snap_count * sizeof (struct rbd_image_snap_ondisk);
1714 size += names_size;
1715 ondisk = kmalloc(size, GFP_KERNEL);
1716 if (!ondisk)
1717 return ERR_PTR(-ENOMEM);
1718
1719 ret = rbd_req_sync_read(rbd_dev, CEPH_NOSNAP,
1720 rbd_dev->header_name,
1721 0, size,
1722 (char *) ondisk, version);
1723
1724 if (ret < 0)
1725 goto out_err;
1726 if (WARN_ON((size_t) ret < size)) {
1727 ret = -ENXIO;
1728 pr_warning("short header read for image %s"
1729 " (want %zd got %d)\n",
1730 rbd_dev->image_name, size, ret);
1731 goto out_err;
1732 }
1733 if (!rbd_dev_ondisk_valid(ondisk)) {
1734 ret = -ENXIO;
1735 pr_warning("invalid header for image %s\n",
1736 rbd_dev->image_name);
1737 goto out_err;
1738 }
1739
1740 names_size = le64_to_cpu(ondisk->snap_names_len);
1741 want_count = snap_count;
1742 snap_count = le32_to_cpu(ondisk->snap_count);
1743 } while (snap_count != want_count);
1744
1745 return ondisk;
1746
1747out_err:
1748 kfree(ondisk);
1749
1750 return ERR_PTR(ret);
1751}
1752
1753/*
1754 * reload the ondisk the header
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001755 */
1756static int rbd_read_header(struct rbd_device *rbd_dev,
1757 struct rbd_image_header *header)
1758{
Alex Elder4156d992012-08-02 11:29:46 -05001759 struct rbd_image_header_ondisk *ondisk;
1760 u64 ver = 0;
1761 int ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001762
Alex Elder4156d992012-08-02 11:29:46 -05001763 ondisk = rbd_dev_v1_header_read(rbd_dev, &ver);
1764 if (IS_ERR(ondisk))
1765 return PTR_ERR(ondisk);
1766 ret = rbd_header_from_disk(header, ondisk);
1767 if (ret >= 0)
1768 header->obj_version = ver;
1769 kfree(ondisk);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001770
Alex Elder4156d992012-08-02 11:29:46 -05001771 return ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001772}
1773
Alex Elder41f38c22012-10-25 23:34:40 -05001774static void rbd_remove_all_snaps(struct rbd_device *rbd_dev)
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001775{
1776 struct rbd_snap *snap;
Alex Eldera0593292012-07-19 09:09:27 -05001777 struct rbd_snap *next;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001778
Alex Eldera0593292012-07-19 09:09:27 -05001779 list_for_each_entry_safe(snap, next, &rbd_dev->snaps, node)
Alex Elder41f38c22012-10-25 23:34:40 -05001780 rbd_remove_snap_dev(snap);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001781}
1782
Alex Elder94785542012-10-09 13:50:17 -07001783static void rbd_update_mapping_size(struct rbd_device *rbd_dev)
1784{
1785 sector_t size;
1786
Alex Elder971f8392012-10-25 23:34:41 -05001787 if (rbd_dev->snap_id != CEPH_NOSNAP)
Alex Elder94785542012-10-09 13:50:17 -07001788 return;
1789
1790 size = (sector_t) rbd_dev->header.image_size / SECTOR_SIZE;
1791 dout("setting size to %llu sectors", (unsigned long long) size);
1792 rbd_dev->mapping.size = (u64) size;
1793 set_capacity(rbd_dev->disk, size);
1794}
1795
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001796/*
1797 * only read the first part of the ondisk header, without the snaps info
1798 */
Alex Elder117973f2012-08-31 17:29:55 -05001799static int rbd_dev_v1_refresh(struct rbd_device *rbd_dev, u64 *hver)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001800{
1801 int ret;
1802 struct rbd_image_header h;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001803
1804 ret = rbd_read_header(rbd_dev, &h);
1805 if (ret < 0)
1806 return ret;
1807
Josh Durgina51aa0c2011-12-05 10:35:04 -08001808 down_write(&rbd_dev->header_rwsem);
1809
Alex Elder94785542012-10-09 13:50:17 -07001810 /* Update image size, and check for resize of mapped image */
1811 rbd_dev->header.image_size = h.image_size;
1812 rbd_update_mapping_size(rbd_dev);
Sage Weil9db4b3e2011-04-19 22:49:06 -07001813
Alex Elder849b4262012-07-09 21:04:24 -05001814 /* rbd_dev->header.object_prefix shouldn't change */
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001815 kfree(rbd_dev->header.snap_sizes);
Alex Elder849b4262012-07-09 21:04:24 -05001816 kfree(rbd_dev->header.snap_names);
Josh Durgind1d25642011-12-05 14:03:05 -08001817 /* osd requests may still refer to snapc */
1818 ceph_put_snap_context(rbd_dev->header.snapc);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001819
Alex Elderb8136232012-07-25 09:32:41 -05001820 if (hver)
1821 *hver = h.obj_version;
Josh Durgina71b8912011-12-05 18:10:44 -08001822 rbd_dev->header.obj_version = h.obj_version;
Josh Durgin93a24e02011-12-05 10:41:28 -08001823 rbd_dev->header.image_size = h.image_size;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001824 rbd_dev->header.snapc = h.snapc;
1825 rbd_dev->header.snap_names = h.snap_names;
1826 rbd_dev->header.snap_sizes = h.snap_sizes;
Alex Elder849b4262012-07-09 21:04:24 -05001827 /* Free the extra copy of the object prefix */
1828 WARN_ON(strcmp(rbd_dev->header.object_prefix, h.object_prefix));
1829 kfree(h.object_prefix);
1830
Alex Elder304f6802012-08-31 17:29:52 -05001831 ret = rbd_dev_snaps_update(rbd_dev);
1832 if (!ret)
1833 ret = rbd_dev_snaps_register(rbd_dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001834
Josh Durginc6666012011-11-21 17:11:12 -08001835 up_write(&rbd_dev->header_rwsem);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001836
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001837 return ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001838}
1839
Alex Elder117973f2012-08-31 17:29:55 -05001840static int rbd_dev_refresh(struct rbd_device *rbd_dev, u64 *hver)
Alex Elder1fe5e992012-07-25 09:32:41 -05001841{
1842 int ret;
1843
Alex Elder117973f2012-08-31 17:29:55 -05001844 rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
Alex Elder1fe5e992012-07-25 09:32:41 -05001845 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
Alex Elder117973f2012-08-31 17:29:55 -05001846 if (rbd_dev->image_format == 1)
1847 ret = rbd_dev_v1_refresh(rbd_dev, hver);
1848 else
1849 ret = rbd_dev_v2_refresh(rbd_dev, hver);
Alex Elder1fe5e992012-07-25 09:32:41 -05001850 mutex_unlock(&ctl_mutex);
1851
1852 return ret;
1853}
1854
/*
 * Allocate and set up the gendisk and request queue for an rbd
 * device, and record them in rbd_dev->disk.  The disk is not yet
 * added to the system (no add_disk() here).  Returns 0 or -ENOMEM.
 */
static int rbd_init_disk(struct rbd_device *rbd_dev)
{
	struct gendisk *disk;
	struct request_queue *q;
	u64 segment_size;

	/* create gendisk info */
	disk = alloc_disk(RBD_MINORS_PER_MAJOR);
	if (!disk)
		return -ENOMEM;

	snprintf(disk->disk_name, sizeof(disk->disk_name), RBD_DRV_NAME "%d",
		 rbd_dev->dev_id);
	disk->major = rbd_dev->major;
	disk->first_minor = 0;
	disk->fops = &rbd_bd_ops;
	disk->private_data = rbd_dev;

	/* init rq */
	q = blk_init_queue(rbd_rq_fn, &rbd_dev->lock);
	if (!q)
		goto out_disk;

	/* We use the default size, but let's be explicit about it. */
	blk_queue_physical_block_size(q, SECTOR_SIZE);

	/* set io sizes to object size */
	segment_size = rbd_obj_bytes(&rbd_dev->header);
	blk_queue_max_hw_sectors(q, segment_size / SECTOR_SIZE);
	blk_queue_max_segment_size(q, segment_size);
	blk_queue_io_min(q, segment_size);
	blk_queue_io_opt(q, segment_size);

	/* Keep bios from spanning osd object boundaries */
	blk_queue_merge_bvec(q, rbd_merge_bvec);
	disk->queue = q;

	q->queuedata = rbd_dev;

	rbd_dev->disk = disk;

	/* mapping.size is bytes; capacity is in 512-byte sectors */
	set_capacity(rbd_dev->disk, rbd_dev->mapping.size / SECTOR_SIZE);

	return 0;
out_disk:
	put_disk(disk);

	return -ENOMEM;
}
1903
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001904/*
1905 sysfs
1906*/
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001907
Alex Elder593a9e72012-02-07 12:03:37 -06001908static struct rbd_device *dev_to_rbd_dev(struct device *dev)
1909{
1910 return container_of(dev, struct rbd_device, dev);
1911}
1912
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001913static ssize_t rbd_size_show(struct device *dev,
1914 struct device_attribute *attr, char *buf)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001915{
Alex Elder593a9e72012-02-07 12:03:37 -06001916 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Josh Durgina51aa0c2011-12-05 10:35:04 -08001917 sector_t size;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001918
Josh Durgina51aa0c2011-12-05 10:35:04 -08001919 down_read(&rbd_dev->header_rwsem);
1920 size = get_capacity(rbd_dev->disk);
1921 up_read(&rbd_dev->header_rwsem);
1922
1923 return sprintf(buf, "%llu\n", (unsigned long long) size * SECTOR_SIZE);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001924}
1925
Alex Elder34b13182012-07-13 20:35:12 -05001926/*
1927 * Note this shows the features for whatever's mapped, which is not
1928 * necessarily the base image.
1929 */
1930static ssize_t rbd_features_show(struct device *dev,
1931 struct device_attribute *attr, char *buf)
1932{
1933 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
1934
1935 return sprintf(buf, "0x%016llx\n",
1936 (unsigned long long) rbd_dev->mapping.features);
1937}
1938
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001939static ssize_t rbd_major_show(struct device *dev,
1940 struct device_attribute *attr, char *buf)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001941{
Alex Elder593a9e72012-02-07 12:03:37 -06001942 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001943
1944 return sprintf(buf, "%d\n", rbd_dev->major);
1945}
1946
1947static ssize_t rbd_client_id_show(struct device *dev,
1948 struct device_attribute *attr, char *buf)
1949{
Alex Elder593a9e72012-02-07 12:03:37 -06001950 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001951
Alex Elder1dbb4392012-01-24 10:08:37 -06001952 return sprintf(buf, "client%lld\n",
1953 ceph_client_id(rbd_dev->rbd_client->client));
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001954}
1955
1956static ssize_t rbd_pool_show(struct device *dev,
1957 struct device_attribute *attr, char *buf)
1958{
Alex Elder593a9e72012-02-07 12:03:37 -06001959 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001960
1961 return sprintf(buf, "%s\n", rbd_dev->pool_name);
1962}
1963
Alex Elder9bb2f332012-07-12 10:46:35 -05001964static ssize_t rbd_pool_id_show(struct device *dev,
1965 struct device_attribute *attr, char *buf)
1966{
1967 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
1968
Alex Elder86992092012-10-25 23:34:41 -05001969 return sprintf(buf, "%llu\n", (unsigned long long) rbd_dev->pool_id);
Alex Elder9bb2f332012-07-12 10:46:35 -05001970}
1971
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001972static ssize_t rbd_name_show(struct device *dev,
1973 struct device_attribute *attr, char *buf)
1974{
Alex Elder593a9e72012-02-07 12:03:37 -06001975 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001976
Alex Elder0bed54d2012-07-03 16:01:18 -05001977 return sprintf(buf, "%s\n", rbd_dev->image_name);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001978}
1979
Alex Elder589d30e2012-07-10 20:30:11 -05001980static ssize_t rbd_image_id_show(struct device *dev,
1981 struct device_attribute *attr, char *buf)
1982{
1983 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
1984
1985 return sprintf(buf, "%s\n", rbd_dev->image_id);
1986}
1987
Alex Elder34b13182012-07-13 20:35:12 -05001988/*
1989 * Shows the name of the currently-mapped snapshot (or
1990 * RBD_SNAP_HEAD_NAME for the base image).
1991 */
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001992static ssize_t rbd_snap_show(struct device *dev,
1993 struct device_attribute *attr,
1994 char *buf)
1995{
Alex Elder593a9e72012-02-07 12:03:37 -06001996 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001997
Alex Elder971f8392012-10-25 23:34:41 -05001998 return sprintf(buf, "%s\n", rbd_dev->snap_name);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001999}
2000
2001static ssize_t rbd_image_refresh(struct device *dev,
2002 struct device_attribute *attr,
2003 const char *buf,
2004 size_t size)
2005{
Alex Elder593a9e72012-02-07 12:03:37 -06002006 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Alex Elderb8136232012-07-25 09:32:41 -05002007 int ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002008
Alex Elder117973f2012-08-31 17:29:55 -05002009 ret = rbd_dev_refresh(rbd_dev, NULL);
Alex Elderb8136232012-07-25 09:32:41 -05002010
2011 return ret < 0 ? ret : size;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002012}
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002013
/* Per-device sysfs attributes (visible under /sys/bus/rbd/devices/<id>/) */
static DEVICE_ATTR(size, S_IRUGO, rbd_size_show, NULL);
static DEVICE_ATTR(features, S_IRUGO, rbd_features_show, NULL);
static DEVICE_ATTR(major, S_IRUGO, rbd_major_show, NULL);
static DEVICE_ATTR(client_id, S_IRUGO, rbd_client_id_show, NULL);
static DEVICE_ATTR(pool, S_IRUGO, rbd_pool_show, NULL);
static DEVICE_ATTR(pool_id, S_IRUGO, rbd_pool_id_show, NULL);
static DEVICE_ATTR(name, S_IRUGO, rbd_name_show, NULL);
static DEVICE_ATTR(image_id, S_IRUGO, rbd_image_id_show, NULL);
/* "refresh" is write-only: writing to it re-reads the image header */
static DEVICE_ATTR(refresh, S_IWUSR, NULL, rbd_image_refresh);
static DEVICE_ATTR(current_snap, S_IRUGO, rbd_snap_show, NULL);

static struct attribute *rbd_attrs[] = {
	&dev_attr_size.attr,
	&dev_attr_features.attr,
	&dev_attr_major.attr,
	&dev_attr_client_id.attr,
	&dev_attr_pool.attr,
	&dev_attr_pool_id.attr,
	&dev_attr_name.attr,
	&dev_attr_image_id.attr,
	&dev_attr_current_snap.attr,
	&dev_attr_refresh.attr,
	NULL
};

static struct attribute_group rbd_attr_group = {
	.attrs = rbd_attrs,
};

static const struct attribute_group *rbd_attr_groups[] = {
	&rbd_attr_group,
	NULL
};

/* Intentionally empty: rbd_device lifetime is managed elsewhere */
static void rbd_sysfs_dev_release(struct device *dev)
{
}

static struct device_type rbd_device_type = {
	.name		= "rbd",
	.groups		= rbd_attr_groups,
	.release	= rbd_sysfs_dev_release,
};
2057
2058
2059/*
2060 sysfs - snapshots
2061*/
2062
2063static ssize_t rbd_snap_size_show(struct device *dev,
2064 struct device_attribute *attr,
2065 char *buf)
2066{
2067 struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
2068
Josh Durgin3591538f2011-12-05 18:25:13 -08002069 return sprintf(buf, "%llu\n", (unsigned long long)snap->size);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002070}
2071
2072static ssize_t rbd_snap_id_show(struct device *dev,
2073 struct device_attribute *attr,
2074 char *buf)
2075{
2076 struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
2077
Josh Durgin3591538f2011-12-05 18:25:13 -08002078 return sprintf(buf, "%llu\n", (unsigned long long)snap->id);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002079}
2080
Alex Elder34b13182012-07-13 20:35:12 -05002081static ssize_t rbd_snap_features_show(struct device *dev,
2082 struct device_attribute *attr,
2083 char *buf)
2084{
2085 struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
2086
2087 return sprintf(buf, "0x%016llx\n",
2088 (unsigned long long) snap->features);
2089}
2090
/* Per-snapshot sysfs attributes */
static DEVICE_ATTR(snap_size, S_IRUGO, rbd_snap_size_show, NULL);
static DEVICE_ATTR(snap_id, S_IRUGO, rbd_snap_id_show, NULL);
static DEVICE_ATTR(snap_features, S_IRUGO, rbd_snap_features_show, NULL);

static struct attribute *rbd_snap_attrs[] = {
	&dev_attr_snap_size.attr,
	&dev_attr_snap_id.attr,
	&dev_attr_snap_features.attr,
	NULL,
};

static struct attribute_group rbd_snap_attr_group = {
	.attrs = rbd_snap_attrs,
};

/* Device release: frees the rbd_snap (and its name) when the last
 * reference to the embedded struct device is dropped. */
static void rbd_snap_dev_release(struct device *dev)
{
	struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
	kfree(snap->name);
	kfree(snap);
}

static const struct attribute_group *rbd_snap_attr_groups[] = {
	&rbd_snap_attr_group,
	NULL
};

static struct device_type rbd_snap_device_type = {
	.groups		= rbd_snap_attr_groups,
	.release	= rbd_snap_dev_release,
};
2122
/*
 * Report whether this snapshot's device has been registered with the
 * driver core.  We use "device type has been set" as the indicator;
 * the assertion cross-checks it against device_is_registered().
 */
static bool rbd_snap_registered(struct rbd_snap *snap)
{
	bool ret = snap->dev.type == &rbd_snap_device_type;
	bool reg = device_is_registered(&snap->dev);

	/* !ret ^ reg is true iff ret == reg: both views must agree */
	rbd_assert(!ret ^ reg);

	return ret;
}
2132
Alex Elder41f38c22012-10-25 23:34:40 -05002133static void rbd_remove_snap_dev(struct rbd_snap *snap)
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002134{
2135 list_del(&snap->node);
Alex Elder304f6802012-08-31 17:29:52 -05002136 if (device_is_registered(&snap->dev))
2137 device_unregister(&snap->dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002138}
2139
Alex Elder14e70852012-07-19 09:09:27 -05002140static int rbd_register_snap_dev(struct rbd_snap *snap,
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002141 struct device *parent)
2142{
2143 struct device *dev = &snap->dev;
2144 int ret;
2145
2146 dev->type = &rbd_snap_device_type;
2147 dev->parent = parent;
2148 dev->release = rbd_snap_dev_release;
Alex Elderd4b125e2012-07-03 16:01:19 -05002149 dev_set_name(dev, "%s%s", RBD_SNAP_DEV_NAME_PREFIX, snap->name);
Alex Elder304f6802012-08-31 17:29:52 -05002150 dout("%s: registering device for snapshot %s\n", __func__, snap->name);
2151
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002152 ret = device_register(dev);
2153
2154 return ret;
2155}
2156
Alex Elder4e891e02012-07-10 20:30:10 -05002157static struct rbd_snap *__rbd_add_snap_dev(struct rbd_device *rbd_dev,
Alex Elderc8d18422012-07-10 20:30:11 -05002158 const char *snap_name,
Alex Elder34b13182012-07-13 20:35:12 -05002159 u64 snap_id, u64 snap_size,
2160 u64 snap_features)
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002161{
Alex Elder4e891e02012-07-10 20:30:10 -05002162 struct rbd_snap *snap;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002163 int ret;
Alex Elder4e891e02012-07-10 20:30:10 -05002164
2165 snap = kzalloc(sizeof (*snap), GFP_KERNEL);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002166 if (!snap)
Alex Elder4e891e02012-07-10 20:30:10 -05002167 return ERR_PTR(-ENOMEM);
2168
2169 ret = -ENOMEM;
Alex Elderc8d18422012-07-10 20:30:11 -05002170 snap->name = kstrdup(snap_name, GFP_KERNEL);
Alex Elder4e891e02012-07-10 20:30:10 -05002171 if (!snap->name)
2172 goto err;
2173
Alex Elderc8d18422012-07-10 20:30:11 -05002174 snap->id = snap_id;
2175 snap->size = snap_size;
Alex Elder34b13182012-07-13 20:35:12 -05002176 snap->features = snap_features;
Alex Elder4e891e02012-07-10 20:30:10 -05002177
2178 return snap;
2179
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002180err:
2181 kfree(snap->name);
2182 kfree(snap);
Alex Elder4e891e02012-07-10 20:30:10 -05002183
2184 return ERR_PTR(ret);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002185}
2186
Alex Eldercd892122012-07-03 16:01:19 -05002187static char *rbd_dev_v1_snap_info(struct rbd_device *rbd_dev, u32 which,
2188 u64 *snap_size, u64 *snap_features)
2189{
2190 char *snap_name;
2191
2192 rbd_assert(which < rbd_dev->header.snapc->num_snaps);
2193
2194 *snap_size = rbd_dev->header.snap_sizes[which];
2195 *snap_features = 0; /* No features for v1 */
2196
2197 /* Skip over names until we find the one we are looking for */
2198
2199 snap_name = rbd_dev->header.snap_names;
2200 while (which--)
2201 snap_name += strlen(snap_name) + 1;
2202
2203 return snap_name;
2204}
2205
/*
 * Get the size and object order for an image snapshot, or if
 * snap_id is CEPH_NOSNAP, gets this information for the base
 * image.
 *
 * Issues the "get_size" class method against the image's header
 * object; on success fills in *order and *snap_size.  Returns 0
 * or a negative errno.
 */
static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id,
				u8 *order, u64 *snap_size)
{
	__le64 snapid = cpu_to_le64(snap_id);
	int ret;
	/* Wire format of the get_size reply: order byte + le64 size */
	struct {
		u8 order;
		__le64 size;
	} __attribute__ ((packed)) size_buf = { 0 };

	ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name,
				"rbd", "get_size",
				(char *) &snapid, sizeof (snapid),
				(char *) &size_buf, sizeof (size_buf),
				CEPH_OSD_FLAG_READ, NULL);
	dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
	if (ret < 0)
		return ret;

	*order = size_buf.order;
	*snap_size = le64_to_cpu(size_buf.size);

	dout("  snap_id 0x%016llx order = %u, snap_size = %llu\n",
		(unsigned long long) snap_id, (unsigned int) *order,
		(unsigned long long) *snap_size);

	return 0;
}
2239
2240static int rbd_dev_v2_image_size(struct rbd_device *rbd_dev)
2241{
2242 return _rbd_dev_v2_snap_size(rbd_dev, CEPH_NOSNAP,
2243 &rbd_dev->header.obj_order,
2244 &rbd_dev->header.image_size);
2245}
2246
/*
 * Fetch the object name prefix for a format-2 image via the
 * "get_object_prefix" class method and store the decoded string
 * in rbd_dev->header.object_prefix (allocated; caller's header
 * teardown frees it).  Returns 0 or a negative errno.
 */
static int rbd_dev_v2_object_prefix(struct rbd_device *rbd_dev)
{
	void *reply_buf;
	int ret;
	void *p;

	reply_buf = kzalloc(RBD_OBJ_PREFIX_LEN_MAX, GFP_KERNEL);
	if (!reply_buf)
		return -ENOMEM;

	ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name,
				"rbd", "get_object_prefix",
				NULL, 0,
				reply_buf, RBD_OBJ_PREFIX_LEN_MAX,
				CEPH_OSD_FLAG_READ, NULL);
	dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
	if (ret < 0)
		goto out;
	ret = 0;    /* rbd_req_sync_exec() can return positive */

	/* Decode the length-prefixed string into a fresh allocation */
	p = reply_buf;
	rbd_dev->header.object_prefix = ceph_extract_encoded_string(&p,
						p + RBD_OBJ_PREFIX_LEN_MAX,
						NULL, GFP_NOIO);

	if (IS_ERR(rbd_dev->header.object_prefix)) {
		ret = PTR_ERR(rbd_dev->header.object_prefix);
		rbd_dev->header.object_prefix = NULL;
	} else {
		dout("  object_prefix = %s\n", rbd_dev->header.object_prefix);
	}

out:
	kfree(reply_buf);

	return ret;
}
2284
/*
 * Fetch the feature bits for the given snapshot (or the base image
 * for CEPH_NOSNAP) via the "get_features" class method.  Fails with
 * -ENOTSUPP if the image uses incompatible features this driver
 * does not implement.
 */
static int _rbd_dev_v2_snap_features(struct rbd_device *rbd_dev, u64 snap_id,
				u64 *snap_features)
{
	__le64 snapid = cpu_to_le64(snap_id);
	/* Wire format of the get_features reply */
	struct {
		__le64 features;
		__le64 incompat;
	} features_buf = { 0 };
	u64 incompat;
	int ret;

	ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name,
				"rbd", "get_features",
				(char *) &snapid, sizeof (snapid),
				(char *) &features_buf, sizeof (features_buf),
				CEPH_OSD_FLAG_READ, NULL);
	dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
	if (ret < 0)
		return ret;

	/* Refuse images requiring features we don't support */
	incompat = le64_to_cpu(features_buf.incompat);
	if (incompat & ~RBD_FEATURES_ALL)
		return -ENOTSUPP;

	*snap_features = le64_to_cpu(features_buf.features);

	dout("  snap_id 0x%016llx features = 0x%016llx incompat = 0x%016llx\n",
		(unsigned long long) snap_id,
		(unsigned long long) *snap_features,
		(unsigned long long) le64_to_cpu(features_buf.incompat));

	return 0;
}
2318
2319static int rbd_dev_v2_features(struct rbd_device *rbd_dev)
2320{
2321 return _rbd_dev_v2_snap_features(rbd_dev, CEPH_NOSNAP,
2322 &rbd_dev->header.features);
2323}
2324
Alex Elder6e14b1a2012-07-03 16:01:19 -05002325static int rbd_dev_v2_snap_context(struct rbd_device *rbd_dev, u64 *ver)
Alex Elder35d489f2012-07-03 16:01:19 -05002326{
2327 size_t size;
2328 int ret;
2329 void *reply_buf;
2330 void *p;
2331 void *end;
2332 u64 seq;
2333 u32 snap_count;
2334 struct ceph_snap_context *snapc;
2335 u32 i;
2336
2337 /*
2338 * We'll need room for the seq value (maximum snapshot id),
2339 * snapshot count, and array of that many snapshot ids.
2340 * For now we have a fixed upper limit on the number we're
2341 * prepared to receive.
2342 */
2343 size = sizeof (__le64) + sizeof (__le32) +
2344 RBD_MAX_SNAP_COUNT * sizeof (__le64);
2345 reply_buf = kzalloc(size, GFP_KERNEL);
2346 if (!reply_buf)
2347 return -ENOMEM;
2348
2349 ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name,
2350 "rbd", "get_snapcontext",
2351 NULL, 0,
2352 reply_buf, size,
Alex Elder6e14b1a2012-07-03 16:01:19 -05002353 CEPH_OSD_FLAG_READ, ver);
Alex Elder35d489f2012-07-03 16:01:19 -05002354 dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
2355 if (ret < 0)
2356 goto out;
2357
2358 ret = -ERANGE;
2359 p = reply_buf;
2360 end = (char *) reply_buf + size;
2361 ceph_decode_64_safe(&p, end, seq, out);
2362 ceph_decode_32_safe(&p, end, snap_count, out);
2363
2364 /*
2365 * Make sure the reported number of snapshot ids wouldn't go
2366 * beyond the end of our buffer. But before checking that,
2367 * make sure the computed size of the snapshot context we
2368 * allocate is representable in a size_t.
2369 */
2370 if (snap_count > (SIZE_MAX - sizeof (struct ceph_snap_context))
2371 / sizeof (u64)) {
2372 ret = -EINVAL;
2373 goto out;
2374 }
2375 if (!ceph_has_room(&p, end, snap_count * sizeof (__le64)))
2376 goto out;
2377
2378 size = sizeof (struct ceph_snap_context) +
2379 snap_count * sizeof (snapc->snaps[0]);
2380 snapc = kmalloc(size, GFP_KERNEL);
2381 if (!snapc) {
2382 ret = -ENOMEM;
2383 goto out;
2384 }
2385
2386 atomic_set(&snapc->nref, 1);
2387 snapc->seq = seq;
2388 snapc->num_snaps = snap_count;
2389 for (i = 0; i < snap_count; i++)
2390 snapc->snaps[i] = ceph_decode_64(&p);
2391
2392 rbd_dev->header.snapc = snapc;
2393
2394 dout(" snap context seq = %llu, snap_count = %u\n",
2395 (unsigned long long) seq, (unsigned int) snap_count);
2396
2397out:
2398 kfree(reply_buf);
2399
2400 return 0;
2401}
2402
/*
 * Fetch the name of the snapshot at position "which" in the image's
 * snapshot context using the "get_snapshot_name" class method.
 * Returns a dynamically-allocated, '\0'-terminated copy of the name
 * (the caller is responsible for freeing it), or a pointer-coded
 * negative errno on failure.
 */
static char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev, u32 which)
{
	size_t size;
	void *reply_buf;
	__le64 snap_id;
	int ret;
	void *p;
	void *end;
	size_t snap_name_len;
	char *snap_name;

	/* Reply is a length-prefixed string; size for the maximum name */
	size = sizeof (__le32) + RBD_MAX_SNAP_NAME_LEN;
	reply_buf = kmalloc(size, GFP_KERNEL);
	if (!reply_buf)
		return ERR_PTR(-ENOMEM);

	/* The method's input is the snapshot id, little-endian encoded */
	snap_id = cpu_to_le64(rbd_dev->header.snapc->snaps[which]);
	ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name,
				"rbd", "get_snapshot_name",
				(char *) &snap_id, sizeof (snap_id),
				reply_buf, size,
				CEPH_OSD_FLAG_READ, NULL);
	dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
	if (ret < 0)
		goto out;

	p = reply_buf;
	end = (char *) reply_buf + size;
	snap_name_len = 0;
	/* Allocates and returns a copy of the encoded string */
	snap_name = ceph_extract_encoded_string(&p, end, &snap_name_len,
				GFP_KERNEL);
	if (IS_ERR(snap_name)) {
		ret = PTR_ERR(snap_name);
		goto out;
	} else {
		dout(" snap_id 0x%016llx snap_name = %s\n",
			(unsigned long long) le64_to_cpu(snap_id), snap_name);
	}
	kfree(reply_buf);

	return snap_name;
out:
	kfree(reply_buf);

	return ERR_PTR(ret);
}
2449
2450static char *rbd_dev_v2_snap_info(struct rbd_device *rbd_dev, u32 which,
2451 u64 *snap_size, u64 *snap_features)
2452{
2453 __le64 snap_id;
2454 u8 order;
2455 int ret;
2456
2457 snap_id = rbd_dev->header.snapc->snaps[which];
2458 ret = _rbd_dev_v2_snap_size(rbd_dev, snap_id, &order, snap_size);
2459 if (ret)
2460 return ERR_PTR(ret);
2461 ret = _rbd_dev_v2_snap_features(rbd_dev, snap_id, snap_features);
2462 if (ret)
2463 return ERR_PTR(ret);
2464
2465 return rbd_dev_v2_snap_name(rbd_dev, which);
2466}
2467
2468static char *rbd_dev_snap_info(struct rbd_device *rbd_dev, u32 which,
2469 u64 *snap_size, u64 *snap_features)
2470{
2471 if (rbd_dev->image_format == 1)
2472 return rbd_dev_v1_snap_info(rbd_dev, which,
2473 snap_size, snap_features);
2474 if (rbd_dev->image_format == 2)
2475 return rbd_dev_v2_snap_info(rbd_dev, which,
2476 snap_size, snap_features);
2477 return ERR_PTR(-EINVAL);
2478}
2479
Alex Elder117973f2012-08-31 17:29:55 -05002480static int rbd_dev_v2_refresh(struct rbd_device *rbd_dev, u64 *hver)
2481{
2482 int ret;
2483 __u8 obj_order;
2484
2485 down_write(&rbd_dev->header_rwsem);
2486
2487 /* Grab old order first, to see if it changes */
2488
2489 obj_order = rbd_dev->header.obj_order,
2490 ret = rbd_dev_v2_image_size(rbd_dev);
2491 if (ret)
2492 goto out;
2493 if (rbd_dev->header.obj_order != obj_order) {
2494 ret = -EIO;
2495 goto out;
2496 }
2497 rbd_update_mapping_size(rbd_dev);
2498
2499 ret = rbd_dev_v2_snap_context(rbd_dev, hver);
2500 dout("rbd_dev_v2_snap_context returned %d\n", ret);
2501 if (ret)
2502 goto out;
2503 ret = rbd_dev_snaps_update(rbd_dev);
2504 dout("rbd_dev_snaps_update returned %d\n", ret);
2505 if (ret)
2506 goto out;
2507 ret = rbd_dev_snaps_register(rbd_dev);
2508 dout("rbd_dev_snaps_register returned %d\n", ret);
2509out:
2510 up_write(&rbd_dev->header_rwsem);
2511
2512 return ret;
2513}
2514
Alex Elder9d475de2012-07-03 16:01:19 -05002515/*
Alex Elder35938152012-08-02 11:29:46 -05002516 * Scan the rbd device's current snapshot list and compare it to the
2517 * newly-received snapshot context. Remove any existing snapshots
2518 * not present in the new snapshot context. Add a new snapshot for
 * any snapshots in the snapshot context not in the current list.
2520 * And verify there are no changes to snapshots we already know
2521 * about.
2522 *
2523 * Assumes the snapshots in the snapshot context are sorted by
2524 * snapshot id, highest id first. (Snapshots in the rbd_dev's list
2525 * are also maintained in that order.)
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002526 */
static int rbd_dev_snaps_update(struct rbd_device *rbd_dev)
{
	struct ceph_snap_context *snapc = rbd_dev->header.snapc;
	const u32 snap_count = snapc->num_snaps;
	struct list_head *head = &rbd_dev->snaps;
	struct list_head *links = head->next;
	u32 index = 0;

	dout("%s: snap count is %u\n", __func__, (unsigned int) snap_count);
	/*
	 * Merge walk: "index" scans the received snapshot context
	 * while "links" scans the existing rbd_snap list; both are
	 * maintained in the same id order (see comment above).
	 */
	while (index < snap_count || links != head) {
		u64 snap_id;
		struct rbd_snap *snap;
		char *snap_name;
		u64 snap_size = 0;
		u64 snap_features = 0;

		/* CEPH_NOSNAP marks an exhausted snapshot context */
		snap_id = index < snap_count ? snapc->snaps[index]
					     : CEPH_NOSNAP;
		snap = links != head ? list_entry(links, struct rbd_snap, node)
				     : NULL;
		rbd_assert(!snap || snap->id != CEPH_NOSNAP);

		/* snap is non-NULL here: snap_id == CEPH_NOSNAP implies
		 * the loop condition held because links != head */
		if (snap_id == CEPH_NOSNAP || (snap && snap->id > snap_id)) {
			struct list_head *next = links->next;

			/* Existing snapshot not in the new snap context */

			/* The mapped snapshot has disappeared from under us */
			if (rbd_dev->snap_id == snap->id)
				rbd_dev->exists = false;
			rbd_remove_snap_dev(snap);
			dout("%ssnap id %llu has been removed\n",
				rbd_dev->snap_id == snap->id ? "mapped " : "",
				(unsigned long long) snap->id);

			/* Done with this list entry; advance */

			links = next;
			continue;
		}

		snap_name = rbd_dev_snap_info(rbd_dev, index,
					&snap_size, &snap_features);
		if (IS_ERR(snap_name))
			return PTR_ERR(snap_name);

		/* NOTE(review): prints snap_count, not index — looks like
		 * the entry number was intended; confirm before relying on
		 * this trace output */
		dout("entry %u: snap_id = %llu\n", (unsigned int) snap_count,
			(unsigned long long) snap_id);
		if (!snap || (snap_id != CEPH_NOSNAP && snap->id < snap_id)) {
			struct rbd_snap *new_snap;

			/* We haven't seen this snapshot before */

			new_snap = __rbd_add_snap_dev(rbd_dev, snap_name,
					snap_id, snap_size, snap_features);
			if (IS_ERR(new_snap)) {
				int err = PTR_ERR(new_snap);

				dout(" failed to add dev, error %d\n", err);

				return err;
			}

			/* New goes before existing, or at end of list */

			dout(" added dev%s\n", snap ? "" : " at end\n");
			if (snap)
				list_add_tail(&new_snap->node, &snap->node);
			else
				list_add_tail(&new_snap->node, head);
		} else {
			/* Already have this one */

			dout(" already present\n");

			/* NOTE(review): snap_name is not released on this
			 * path — verify ownership of the name returned by
			 * rbd_dev_snap_info() (format 1 vs 2 may differ) */
			rbd_assert(snap->size == snap_size);
			rbd_assert(!strcmp(snap->name, snap_name));
			rbd_assert(snap->features == snap_features);

			/* Done with this list entry; advance */

			links = links->next;
		}

		/* Advance to the next entry in the snapshot context */

		index++;
	}
	dout("%s: done\n", __func__);

	return 0;
}
2618
Alex Elder304f6802012-08-31 17:29:52 -05002619/*
2620 * Scan the list of snapshots and register the devices for any that
2621 * have not already been registered.
2622 */
2623static int rbd_dev_snaps_register(struct rbd_device *rbd_dev)
2624{
2625 struct rbd_snap *snap;
2626 int ret = 0;
2627
2628 dout("%s called\n", __func__);
Alex Elder86ff77b2012-08-31 17:29:53 -05002629 if (WARN_ON(!device_is_registered(&rbd_dev->dev)))
2630 return -EIO;
Alex Elder304f6802012-08-31 17:29:52 -05002631
2632 list_for_each_entry(snap, &rbd_dev->snaps, node) {
2633 if (!rbd_snap_registered(snap)) {
2634 ret = rbd_register_snap_dev(snap, &rbd_dev->dev);
2635 if (ret < 0)
2636 break;
2637 }
2638 }
2639 dout("%s: returning %d\n", __func__, ret);
2640
2641 return ret;
2642}
2643
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002644static int rbd_bus_add_dev(struct rbd_device *rbd_dev)
2645{
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002646 struct device *dev;
Alex Eldercd789ab2012-08-30 00:16:38 -05002647 int ret;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002648
2649 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002650
Alex Eldercd789ab2012-08-30 00:16:38 -05002651 dev = &rbd_dev->dev;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002652 dev->bus = &rbd_bus_type;
2653 dev->type = &rbd_device_type;
2654 dev->parent = &rbd_root_dev;
2655 dev->release = rbd_dev_release;
Alex Elderde71a292012-07-03 16:01:19 -05002656 dev_set_name(dev, "%d", rbd_dev->dev_id);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002657 ret = device_register(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002658
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002659 mutex_unlock(&ctl_mutex);
Alex Eldercd789ab2012-08-30 00:16:38 -05002660
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002661 return ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002662}
2663
/* Tear down the sysfs device registered by rbd_bus_add_dev(). */
static void rbd_bus_del_dev(struct rbd_device *rbd_dev)
{
	device_unregister(&rbd_dev->dev);
}
2668
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07002669static int rbd_init_watch_dev(struct rbd_device *rbd_dev)
2670{
2671 int ret, rc;
2672
2673 do {
Alex Elder0e6f3222012-07-25 09:32:40 -05002674 ret = rbd_req_sync_watch(rbd_dev);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07002675 if (ret == -ERANGE) {
Alex Elder117973f2012-08-31 17:29:55 -05002676 rc = rbd_dev_refresh(rbd_dev, NULL);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07002677 if (rc < 0)
2678 return rc;
2679 }
2680 } while (ret == -ERANGE);
2681
2682 return ret;
2683}
2684
/* Highest device id handed out; managed by rbd_dev_id_get()/rbd_dev_id_put() */
static atomic64_t rbd_dev_id_max = ATOMIC64_INIT(0);
Alex Elder1ddbe942012-01-29 13:57:44 -06002686
2687/*
Alex Elder499afd52012-02-02 08:13:29 -06002688 * Get a unique rbd identifier for the given new rbd_dev, and add
2689 * the rbd_dev to the global list. The minimum rbd id is 1.
Alex Elder1ddbe942012-01-29 13:57:44 -06002690 */
static void rbd_dev_id_get(struct rbd_device *rbd_dev)
{
	/* Ids start at 1: first inc_return on the zero-initialized max */
	rbd_dev->dev_id = atomic64_inc_return(&rbd_dev_id_max);

	spin_lock(&rbd_dev_list_lock);
	list_add_tail(&rbd_dev->node, &rbd_dev_list);
	spin_unlock(&rbd_dev_list_lock);
	dout("rbd_dev %p given dev id %llu\n", rbd_dev,
		(unsigned long long) rbd_dev->dev_id);
}
Alex Elderb7f23c32012-01-29 13:57:43 -06002701
Alex Elder1ddbe942012-01-29 13:57:44 -06002702/*
Alex Elder499afd52012-02-02 08:13:29 -06002703 * Remove an rbd_dev from the global list, and record that its
2704 * identifier is no longer in use.
Alex Elder1ddbe942012-01-29 13:57:44 -06002705 */
static void rbd_dev_id_put(struct rbd_device *rbd_dev)
{
	struct list_head *tmp;
	/* NOTE(review): narrows the 64-bit dev_id to int — assumes ids
	 * never exceed INT_MAX; confirm */
	int rbd_id = rbd_dev->dev_id;
	int max_id;

	rbd_assert(rbd_id > 0);

	dout("rbd_dev %p released dev id %llu\n", rbd_dev,
		(unsigned long long) rbd_dev->dev_id);
	spin_lock(&rbd_dev_list_lock);
	list_del_init(&rbd_dev->node);

	/*
	 * If the id being "put" is not the current maximum, there
	 * is nothing special we need to do.
	 */
	if (rbd_id != atomic64_read(&rbd_dev_id_max)) {
		spin_unlock(&rbd_dev_list_lock);
		return;
	}

	/*
	 * We need to update the current maximum id.  Search the
	 * list to find out what it is.  We're more likely to find
	 * the maximum at the end, so search the list backward.
	 */
	max_id = 0;
	list_for_each_prev(tmp, &rbd_dev_list) {
		struct rbd_device *rbd_dev;

		rbd_dev = list_entry(tmp, struct rbd_device, node);
		if (rbd_dev->dev_id > max_id)
			max_id = rbd_dev->dev_id;
	}
	spin_unlock(&rbd_dev_list_lock);

	/*
	 * The max id could have been updated by rbd_dev_id_get(), in
	 * which case it now accurately reflects the new maximum.
	 * Be careful not to overwrite the maximum value in that
	 * case.
	 */
	atomic64_cmpxchg(&rbd_dev_id_max, rbd_id, max_id);
	dout(" max dev id has been reset\n");
}
2752
Alex Eldera725f65e2012-02-02 08:13:30 -06002753/*
Alex Eldere28fff262012-02-02 08:13:30 -06002754 * Skips over white space at *buf, and updates *buf to point to the
2755 * first found non-space character (if any). Returns the length of
Alex Elder593a9e72012-02-07 12:03:37 -06002756 * the token (string of non-white space characters) found. Note
2757 * that *buf must be terminated with '\0'.
Alex Eldere28fff262012-02-02 08:13:30 -06002758 */
/*
 * Advance *buf past any leading white space and return the length
 * of the token (run of non-space characters) now at *buf.  *buf
 * must be '\0'-terminated; it is left pointing at the token start
 * (or the terminating NUL if no token remains).
 */
static inline size_t next_token(const char **buf)
{
	/* Characters for which isspace() is nonzero in "C"/"POSIX" locales */
	static const char delims[] = " \f\n\r\t\v";
	const char *start;

	start = *buf + strspn(*buf, delims);
	*buf = start;

	return strcspn(start, delims);
}
2771
2772/*
2773 * Finds the next token in *buf, and if the provided token buffer is
2774 * big enough, copies the found token into it. The result, if
Alex Elder593a9e72012-02-07 12:03:37 -06002775 * copied, is guaranteed to be terminated with '\0'. Note that *buf
2776 * must be terminated with '\0' on entry.
Alex Eldere28fff262012-02-02 08:13:30 -06002777 *
2778 * Returns the length of the token found (not including the '\0').
2779 * Return value will be 0 if no token is found, and it will be >=
2780 * token_size if the token would not fit.
2781 *
Alex Elder593a9e72012-02-07 12:03:37 -06002782 * The *buf pointer will be updated to point beyond the end of the
Alex Eldere28fff262012-02-02 08:13:30 -06002783 * found token. Note that this occurs even if the token buffer is
2784 * too small to hold it.
2785 */
/*
 * Copy the next token from *buf into "token", provided it fits
 * along with a terminating '\0'.  *buf is advanced past the token
 * whether or not the copy happens.  Returns the token length; a
 * return value >= token_size means the token did not fit and
 * "token" was left untouched.
 */
static inline size_t copy_token(const char **buf,
				char *token,
				size_t token_size)
{
	size_t len = next_token(buf);

	if (len < token_size) {
		memcpy(token, *buf, len);
		token[len] = '\0';
	}
	*buf += len;

	return len;
}
2801
2802/*
Alex Elderea3352f2012-07-09 21:04:23 -05002803 * Finds the next token in *buf, dynamically allocates a buffer big
2804 * enough to hold a copy of it, and copies the token into the new
2805 * buffer. The copy is guaranteed to be terminated with '\0'. Note
2806 * that a duplicate buffer is created even for a zero-length token.
2807 *
2808 * Returns a pointer to the newly-allocated duplicate, or a null
2809 * pointer if memory for the duplicate was not available. If
2810 * the lenp argument is a non-null pointer, the length of the token
2811 * (not including the '\0') is returned in *lenp.
2812 *
2813 * If successful, the *buf pointer will be updated to point beyond
2814 * the end of the found token.
2815 *
2816 * Note: uses GFP_KERNEL for allocation.
2817 */
2818static inline char *dup_token(const char **buf, size_t *lenp)
2819{
2820 char *dup;
2821 size_t len;
2822
2823 len = next_token(buf);
2824 dup = kmalloc(len + 1, GFP_KERNEL);
2825 if (!dup)
2826 return NULL;
2827
2828 memcpy(dup, *buf, len);
2829 *(dup + len) = '\0';
2830 *buf += len;
2831
2832 if (lenp)
2833 *lenp = len;
2834
2835 return dup;
2836}
2837
2838/*
Alex Elder3feeb8942012-08-31 17:29:52 -05002839 * This fills in the pool_name, image_name, image_name_len, rbd_dev,
2840 * rbd_md_name, and name fields of the given rbd_dev, based on the
2841 * list of monitor addresses and other options provided via
2842 * /sys/bus/rbd/add. Returns a pointer to a dynamically-allocated
2843 * copy of the snapshot name to map if successful, or a
2844 * pointer-coded error otherwise.
Alex Elderd22f76e2012-07-12 10:46:35 -05002845 *
2846 * Note: rbd_dev is assumed to have been initially zero-filled.
Alex Eldera725f65e2012-02-02 08:13:30 -06002847 */
static char *rbd_add_parse_args(struct rbd_device *rbd_dev,
				const char *buf,
				const char **mon_addrs,
				size_t *mon_addrs_size,
				char *options,
				size_t options_size)
{
	size_t len;
	char *err_ptr = ERR_PTR(-EINVAL);	/* default for bad input */
	char *snap_name;

	/* The first four tokens are required */

	/* Token 1: monitor address list (returned by reference, not copied) */
	len = next_token(&buf);
	if (!len)
		return err_ptr;
	*mon_addrs_size = len + 1;
	*mon_addrs = buf;

	buf += len;

	/* Token 2: option string, copied into the caller's buffer */
	len = copy_token(&buf, options, options_size);
	if (!len || len >= options_size)
		return err_ptr;

	/* From here on, failures are allocation failures */
	err_ptr = ERR_PTR(-ENOMEM);
	/* Token 3: pool name (dup'd; freed in out_err on later failure) */
	rbd_dev->pool_name = dup_token(&buf, NULL);
	if (!rbd_dev->pool_name)
		goto out_err;

	/* Token 4: image name */
	rbd_dev->image_name = dup_token(&buf, &rbd_dev->image_name_len);
	if (!rbd_dev->image_name)
		goto out_err;

	/* Snapshot name is optional; default is to use "head" */

	len = next_token(&buf);
	if (len > RBD_MAX_SNAP_NAME_LEN) {
		err_ptr = ERR_PTR(-ENAMETOOLONG);
		goto out_err;
	}
	if (!len) {
		buf = RBD_SNAP_HEAD_NAME; /* No snapshot supplied */
		len = sizeof (RBD_SNAP_HEAD_NAME) - 1;
	}
	/* Returned copy is owned by the caller */
	snap_name = kmalloc(len + 1, GFP_KERNEL);
	if (!snap_name)
		goto out_err;
	memcpy(snap_name, buf, len);
	*(snap_name + len) = '\0';

	return snap_name;

out_err:
	/* Undo any fields we filled in before failing */
	kfree(rbd_dev->image_name);
	rbd_dev->image_name = NULL;
	rbd_dev->image_name_len = 0;
	kfree(rbd_dev->pool_name);
	rbd_dev->pool_name = NULL;

	return err_ptr;
}
2910
Alex Elder589d30e2012-07-10 20:30:11 -05002911/*
2912 * An rbd format 2 image has a unique identifier, distinct from the
2913 * name given to it by the user. Internally, that identifier is
2914 * what's used to specify the names of objects related to the image.
2915 *
2916 * A special "rbd id" object is used to map an rbd image name to its
2917 * id. If that object doesn't exist, then there is no v2 rbd image
2918 * with the supplied name.
2919 *
2920 * This function will record the given rbd_dev's image_id field if
2921 * it can be determined, and in that case will return 0. If any
2922 * errors occur a negative errno will be returned and the rbd_dev's
2923 * image_id field will be unchanged (and should be NULL).
2924 */
static int rbd_dev_image_id(struct rbd_device *rbd_dev)
{
	int ret;
	size_t size;
	char *object_name;
	void *response;
	void *p;

	/*
	 * First, see if the format 2 image id file exists, and if
	 * so, get the image's persistent id from it.
	 */
	/* Id object is named by prefixing the user-supplied image name */
	size = sizeof (RBD_ID_PREFIX) + rbd_dev->image_name_len;
	object_name = kmalloc(size, GFP_NOIO);
	if (!object_name)
		return -ENOMEM;
	sprintf(object_name, "%s%s", RBD_ID_PREFIX, rbd_dev->image_name);
	dout("rbd id object name is %s\n", object_name);

	/* Response will be an encoded string, which includes a length */

	size = sizeof (__le32) + RBD_IMAGE_ID_LEN_MAX;
	response = kzalloc(size, GFP_NOIO);
	if (!response) {
		ret = -ENOMEM;
		goto out;
	}

	ret = rbd_req_sync_exec(rbd_dev, object_name,
				"rbd", "get_id",
				NULL, 0,
				response, RBD_IMAGE_ID_LEN_MAX,
				CEPH_OSD_FLAG_READ, NULL);
	dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
	if (ret < 0)
		goto out;
	ret = 0;	/* rbd_req_sync_exec() can return positive */

	/* Decode the length-prefixed id into a fresh allocation */
	p = response;
	rbd_dev->image_id = ceph_extract_encoded_string(&p,
				p + RBD_IMAGE_ID_LEN_MAX,
				&rbd_dev->image_id_len,
				GFP_NOIO);
	if (IS_ERR(rbd_dev->image_id)) {
		ret = PTR_ERR(rbd_dev->image_id);
		rbd_dev->image_id = NULL;	/* keep field NULL on failure */
	} else {
		dout("image_id is %s\n", rbd_dev->image_id);
	}
out:
	kfree(response);
	kfree(object_name);

	return ret;
}
2980
/*
 * Probe a format 1 ("v1") image: such images have no separate id
 * object, so an empty image id is recorded, the header object name
 * is derived from the user-supplied image name, and the in-core
 * header is populated from the on-disk header object.  Returns 0
 * on success or a negative errno (all partially-set fields are
 * cleaned up on failure).
 */
static int rbd_dev_v1_probe(struct rbd_device *rbd_dev)
{
	int ret;
	size_t size;

	/* Version 1 images have no id; empty string is used */

	rbd_dev->image_id = kstrdup("", GFP_KERNEL);
	if (!rbd_dev->image_id)
		return -ENOMEM;
	rbd_dev->image_id_len = 0;

	/* Record the header object name for this rbd image. */

	size = rbd_dev->image_name_len + sizeof (RBD_SUFFIX);
	rbd_dev->header_name = kmalloc(size, GFP_KERNEL);
	if (!rbd_dev->header_name) {
		ret = -ENOMEM;
		goto out_err;
	}
	sprintf(rbd_dev->header_name, "%s%s", rbd_dev->image_name, RBD_SUFFIX);

	/* Populate rbd image metadata */

	ret = rbd_read_header(rbd_dev, &rbd_dev->header);
	if (ret < 0)
		goto out_err;
	rbd_dev->image_format = 1;

	dout("discovered version 1 image, header name is %s\n",
		rbd_dev->header_name);

	return 0;

out_err:
	kfree(rbd_dev->header_name);
	rbd_dev->header_name = NULL;
	kfree(rbd_dev->image_id);
	rbd_dev->image_id = NULL;

	return ret;
}
3023
/*
 * Probe a format 2 ("v2") image.  The image id has already been
 * filled in by the caller; this records the header object name and
 * populates the in-core header (size/order, object prefix,
 * features, and snapshot context) from the server.  Returns 0 on
 * success or a negative errno (partially-set fields are cleaned up
 * on failure).
 */
static int rbd_dev_v2_probe(struct rbd_device *rbd_dev)
{
	size_t size;
	int ret;
	u64 ver = 0;

	/*
	 * Image id was filled in by the caller.  Record the header
	 * object name for this rbd image.
	 */
	size = sizeof (RBD_HEADER_PREFIX) + rbd_dev->image_id_len;
	rbd_dev->header_name = kmalloc(size, GFP_KERNEL);
	if (!rbd_dev->header_name)
		return -ENOMEM;
	sprintf(rbd_dev->header_name, "%s%s",
			RBD_HEADER_PREFIX, rbd_dev->image_id);

	/* Get the size and object order for the image */

	ret = rbd_dev_v2_image_size(rbd_dev);
	if (ret < 0)
		goto out_err;

	/* Get the object prefix (a.k.a. block_name) for the image */

	ret = rbd_dev_v2_object_prefix(rbd_dev);
	if (ret < 0)
		goto out_err;

	/* Get and check the features for the image */

	ret = rbd_dev_v2_features(rbd_dev);
	if (ret < 0)
		goto out_err;

	/* crypto and compression type aren't (yet) supported for v2 images */

	rbd_dev->header.crypt_type = 0;
	rbd_dev->header.comp_type = 0;

	/* Get the snapshot context, plus the header version */

	ret = rbd_dev_v2_snap_context(rbd_dev, &ver);
	if (ret)
		goto out_err;
	rbd_dev->header.obj_version = ver;

	rbd_dev->image_format = 2;

	dout("discovered version 2 image, header name is %s\n",
		rbd_dev->header_name);

	return 0;
out_err:
	kfree(rbd_dev->header_name);
	rbd_dev->header_name = NULL;
	kfree(rbd_dev->header.object_prefix);
	rbd_dev->header.object_prefix = NULL;

	return ret;
}
3085
3086/*
3087 * Probe for the existence of the header object for the given rbd
3088 * device. For format 2 images this includes determining the image
3089 * id.
3090 */
/*
 * Probe for the image's header object, determining whether the
 * image uses format 1 or format 2 and running the matching
 * format-specific probe.
 */
static int rbd_dev_probe(struct rbd_device *rbd_dev)
{
	int ret;

	/*
	 * Try to read the format 2 image id object.  If that fails
	 * (typically with ENOENT, meaning no such object), assume
	 * this is a format 1 image.
	 */
	if (rbd_dev_image_id(rbd_dev))
		ret = rbd_dev_v1_probe(rbd_dev);
	else
		ret = rbd_dev_v2_probe(rbd_dev);
	if (ret)
		dout("probe failed, returning %d\n", ret);

	return ret;
}
3110
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07003111static ssize_t rbd_add(struct bus_type *bus,
3112 const char *buf,
3113 size_t count)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003114{
Alex Eldercb8627c2012-07-09 21:04:23 -05003115 char *options;
3116 struct rbd_device *rbd_dev = NULL;
Alex Elder7ef32142012-02-02 08:13:30 -06003117 const char *mon_addrs = NULL;
3118 size_t mon_addrs_size = 0;
Alex Elder78cea762012-10-25 23:34:41 -05003119 char *snap_name;
3120 struct rbd_options rbd_opts;
3121 struct ceph_options *ceph_opts;
Alex Elder27cc2592012-02-02 08:13:30 -06003122 struct ceph_osd_client *osdc;
3123 int rc = -ENOMEM;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003124
3125 if (!try_module_get(THIS_MODULE))
3126 return -ENODEV;
3127
Alex Elder27cc2592012-02-02 08:13:30 -06003128 options = kmalloc(count, GFP_KERNEL);
3129 if (!options)
Alex Elder85ae8922012-07-26 23:37:14 -05003130 goto err_out_mem;
Alex Eldercb8627c2012-07-09 21:04:23 -05003131 rbd_dev = kzalloc(sizeof(*rbd_dev), GFP_KERNEL);
3132 if (!rbd_dev)
Alex Elder85ae8922012-07-26 23:37:14 -05003133 goto err_out_mem;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003134
3135 /* static rbd_device initialization */
3136 spin_lock_init(&rbd_dev->lock);
3137 INIT_LIST_HEAD(&rbd_dev->node);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003138 INIT_LIST_HEAD(&rbd_dev->snaps);
Josh Durginc6666012011-11-21 17:11:12 -08003139 init_rwsem(&rbd_dev->header_rwsem);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003140
Alex Eldera725f65e2012-02-02 08:13:30 -06003141 /* parse add command */
Alex Elder3feeb8942012-08-31 17:29:52 -05003142 snap_name = rbd_add_parse_args(rbd_dev, buf,
3143 &mon_addrs, &mon_addrs_size, options, count);
3144 if (IS_ERR(snap_name)) {
3145 rc = PTR_ERR(snap_name);
Alex Elder85ae8922012-07-26 23:37:14 -05003146 goto err_out_mem;
Alex Elder3feeb8942012-08-31 17:29:52 -05003147 }
Alex Eldera725f65e2012-02-02 08:13:30 -06003148
Alex Elder78cea762012-10-25 23:34:41 -05003149 /* Initialize all rbd options to the defaults */
3150
3151 rbd_opts.read_only = RBD_READ_ONLY_DEFAULT;
3152
3153 ceph_opts = ceph_parse_options(options, mon_addrs,
3154 mon_addrs + mon_addrs_size - 1,
3155 parse_rbd_opts_token, &rbd_opts);
3156 if (IS_ERR(ceph_opts)) {
3157 rc = PTR_ERR(ceph_opts);
Alex Elder85ae8922012-07-26 23:37:14 -05003158 goto err_out_args;
Alex Elder78cea762012-10-25 23:34:41 -05003159 }
3160
3161 /* Record the parsed rbd options */
3162
3163 rbd_dev->mapping.read_only = rbd_opts.read_only;
3164
3165 rc = rbd_get_client(rbd_dev, ceph_opts);
3166 if (rc < 0)
3167 goto err_out_opts;
3168 ceph_opts = NULL; /* ceph_opts now owned by rbd_dev client */
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003169
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003170 /* pick the pool */
Alex Elder1dbb4392012-01-24 10:08:37 -06003171 osdc = &rbd_dev->rbd_client->client->osdc;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003172 rc = ceph_pg_poolid_by_name(osdc->osdmap, rbd_dev->pool_name);
3173 if (rc < 0)
3174 goto err_out_client;
Alex Elder86992092012-10-25 23:34:41 -05003175 rbd_dev->pool_id = (u64) rc;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003176
Alex Eldera30b71b2012-07-10 20:30:11 -05003177 rc = rbd_dev_probe(rbd_dev);
3178 if (rc < 0)
Alex Elder589d30e2012-07-10 20:30:11 -05003179 goto err_out_client;
Alex Elder05fd6f62012-08-29 17:11:07 -05003180
3181 /* no need to lock here, as rbd_dev is not registered yet */
3182 rc = rbd_dev_snaps_update(rbd_dev);
3183 if (rc)
Alex Elder41f38c22012-10-25 23:34:40 -05003184 goto err_out_probe;
Alex Elder05fd6f62012-08-29 17:11:07 -05003185
3186 rc = rbd_dev_set_mapping(rbd_dev, snap_name);
3187 if (rc)
Alex Elder41f38c22012-10-25 23:34:40 -05003188 goto err_out_snaps;
Alex Elder05fd6f62012-08-29 17:11:07 -05003189
Alex Elder85ae8922012-07-26 23:37:14 -05003190 /* generate unique id: find highest unique id, add one */
3191 rbd_dev_id_get(rbd_dev);
3192
3193 /* Fill in the device name, now that we have its id. */
3194 BUILD_BUG_ON(DEV_NAME_LEN
3195 < sizeof (RBD_DRV_NAME) + MAX_INT_FORMAT_WIDTH);
3196 sprintf(rbd_dev->name, "%s%d", RBD_DRV_NAME, rbd_dev->dev_id);
3197
3198 /* Get our block major device number. */
3199
Alex Elder27cc2592012-02-02 08:13:30 -06003200 rc = register_blkdev(0, rbd_dev->name);
3201 if (rc < 0)
Alex Elder85ae8922012-07-26 23:37:14 -05003202 goto err_out_id;
Alex Elder27cc2592012-02-02 08:13:30 -06003203 rbd_dev->major = rc;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003204
Alex Elder0f308a32012-08-29 17:11:07 -05003205 /* Set up the blkdev mapping. */
3206
3207 rc = rbd_init_disk(rbd_dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003208 if (rc)
Yehuda Sadeh766fc432011-01-07 14:58:42 -08003209 goto err_out_blkdev;
3210
Alex Elder0f308a32012-08-29 17:11:07 -05003211 rc = rbd_bus_add_dev(rbd_dev);
3212 if (rc)
3213 goto err_out_disk;
3214
Alex Elder32eec682012-02-08 16:11:14 -06003215 /*
3216 * At this point cleanup in the event of an error is the job
3217 * of the sysfs code (initiated by rbd_bus_del_dev()).
Alex Elder32eec682012-02-08 16:11:14 -06003218 */
Alex Elder2ac4e752012-07-10 20:30:10 -05003219
Alex Elder4bb1f1e2012-08-23 23:48:49 -05003220 down_write(&rbd_dev->header_rwsem);
Alex Elder5ed16172012-08-29 17:11:07 -05003221 rc = rbd_dev_snaps_register(rbd_dev);
Alex Elder4bb1f1e2012-08-23 23:48:49 -05003222 up_write(&rbd_dev->header_rwsem);
Alex Elder2ac4e752012-07-10 20:30:10 -05003223 if (rc)
3224 goto err_out_bus;
3225
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07003226 rc = rbd_init_watch_dev(rbd_dev);
3227 if (rc)
3228 goto err_out_bus;
3229
Alex Elder3ee40012012-08-29 17:11:07 -05003230 /* Everything's ready. Announce the disk to the world. */
3231
3232 add_disk(rbd_dev->disk);
3233
3234 pr_info("%s: added with size 0x%llx\n", rbd_dev->disk->disk_name,
3235 (unsigned long long) rbd_dev->mapping.size);
3236
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003237 return count;
3238
Yehuda Sadeh766fc432011-01-07 14:58:42 -08003239err_out_bus:
Yehuda Sadeh766fc432011-01-07 14:58:42 -08003240 /* this will also clean up rest of rbd_dev stuff */
3241
3242 rbd_bus_del_dev(rbd_dev);
3243 kfree(options);
Yehuda Sadeh766fc432011-01-07 14:58:42 -08003244 return rc;
3245
Alex Elder0f308a32012-08-29 17:11:07 -05003246err_out_disk:
3247 rbd_free_disk(rbd_dev);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003248err_out_blkdev:
3249 unregister_blkdev(rbd_dev->major, rbd_dev->name);
Alex Elder85ae8922012-07-26 23:37:14 -05003250err_out_id:
3251 rbd_dev_id_put(rbd_dev);
Alex Elder41f38c22012-10-25 23:34:40 -05003252err_out_snaps:
3253 rbd_remove_all_snaps(rbd_dev);
3254err_out_probe:
Alex Elder05fd6f62012-08-29 17:11:07 -05003255 rbd_header_free(&rbd_dev->header);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003256err_out_client:
Alex Elder3fcf2582012-07-03 16:01:19 -05003257 kfree(rbd_dev->header_name);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003258 rbd_put_client(rbd_dev);
Alex Elder589d30e2012-07-10 20:30:11 -05003259 kfree(rbd_dev->image_id);
Alex Elder78cea762012-10-25 23:34:41 -05003260err_out_opts:
3261 if (ceph_opts)
3262 ceph_destroy_options(ceph_opts);
Alex Elder85ae8922012-07-26 23:37:14 -05003263err_out_args:
Alex Elder971f8392012-10-25 23:34:41 -05003264 kfree(rbd_dev->snap_name);
Alex Elder85ae8922012-07-26 23:37:14 -05003265 kfree(rbd_dev->image_name);
3266 kfree(rbd_dev->pool_name);
3267err_out_mem:
Alex Elder27cc2592012-02-02 08:13:30 -06003268 kfree(rbd_dev);
Alex Eldercb8627c2012-07-09 21:04:23 -05003269 kfree(options);
Alex Elder27cc2592012-02-02 08:13:30 -06003270
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003271 dout("Error adding device %s\n", buf);
3272 module_put(THIS_MODULE);
Alex Elder27cc2592012-02-02 08:13:30 -06003273
3274 return (ssize_t) rc;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003275}
3276
Alex Elderde71a292012-07-03 16:01:19 -05003277static struct rbd_device *__rbd_get_dev(unsigned long dev_id)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003278{
3279 struct list_head *tmp;
3280 struct rbd_device *rbd_dev;
3281
Alex Eldere124a82f2012-01-29 13:57:44 -06003282 spin_lock(&rbd_dev_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003283 list_for_each(tmp, &rbd_dev_list) {
3284 rbd_dev = list_entry(tmp, struct rbd_device, node);
Alex Elderde71a292012-07-03 16:01:19 -05003285 if (rbd_dev->dev_id == dev_id) {
Alex Eldere124a82f2012-01-29 13:57:44 -06003286 spin_unlock(&rbd_dev_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003287 return rbd_dev;
Alex Eldere124a82f2012-01-29 13:57:44 -06003288 }
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003289 }
Alex Eldere124a82f2012-01-29 13:57:44 -06003290 spin_unlock(&rbd_dev_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003291 return NULL;
3292}
3293
/*
 * Release callback for an rbd device's embedded struct device.
 *
 * Called by the driver core when the last reference to the device is
 * dropped (teardown is initiated by rbd_bus_del_dev()).  Tears down
 * the header watch, drops the ceph client reference, releases the
 * block device and parsed header, frees all name strings and the id,
 * then frees the rbd_device itself and drops the module reference
 * taken in rbd_add().  The order matters: the watch must be stopped
 * while the osd client is still alive.
 */
static void rbd_dev_release(struct device *dev)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);

	/* Cancel the lingering watch request before dropping the client */
	if (rbd_dev->watch_request) {
		struct ceph_client *client = rbd_dev->rbd_client->client;

		ceph_osdc_unregister_linger_request(&client->osdc,
						    rbd_dev->watch_request);
	}
	/* Tell the osds we are no longer watching the header object */
	if (rbd_dev->watch_event)
		rbd_req_sync_unwatch(rbd_dev);

	rbd_put_client(rbd_dev);

	/* clean up and free blkdev */
	rbd_free_disk(rbd_dev);
	unregister_blkdev(rbd_dev->major, rbd_dev->name);

	/* release allocated disk header fields */
	rbd_header_free(&rbd_dev->header);

	/* done with the id, and with the rbd_dev */
	kfree(rbd_dev->snap_name);
	kfree(rbd_dev->image_id);
	kfree(rbd_dev->header_name);
	kfree(rbd_dev->pool_name);
	kfree(rbd_dev->image_name);
	rbd_dev_id_put(rbd_dev);
	kfree(rbd_dev);

	/* release module ref */
	module_put(THIS_MODULE);
}
3328
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003329static ssize_t rbd_remove(struct bus_type *bus,
3330 const char *buf,
3331 size_t count)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003332{
3333 struct rbd_device *rbd_dev = NULL;
3334 int target_id, rc;
3335 unsigned long ul;
3336 int ret = count;
3337
3338 rc = strict_strtoul(buf, 10, &ul);
3339 if (rc)
3340 return rc;
3341
3342 /* convert to int; abort if we lost anything in the conversion */
3343 target_id = (int) ul;
3344 if (target_id != ul)
3345 return -EINVAL;
3346
3347 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
3348
3349 rbd_dev = __rbd_get_dev(target_id);
3350 if (!rbd_dev) {
3351 ret = -ENOENT;
3352 goto done;
3353 }
3354
Alex Elder41f38c22012-10-25 23:34:40 -05003355 rbd_remove_all_snaps(rbd_dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003356 rbd_bus_del_dev(rbd_dev);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003357
3358done:
3359 mutex_unlock(&ctl_mutex);
Alex Elderaafb2302012-09-06 16:00:54 -05003360
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003361 return ret;
3362}
3363
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003364/*
3365 * create control files in sysfs
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003366 * /sys/bus/rbd/...
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003367 */
3368static int rbd_sysfs_init(void)
3369{
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003370 int ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003371
Alex Elderfed4c142012-02-07 12:03:36 -06003372 ret = device_register(&rbd_root_dev);
Alex Elder21079782012-01-24 10:08:36 -06003373 if (ret < 0)
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003374 return ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003375
Alex Elderfed4c142012-02-07 12:03:36 -06003376 ret = bus_register(&rbd_bus_type);
3377 if (ret < 0)
3378 device_unregister(&rbd_root_dev);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003379
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003380 return ret;
3381}
3382
/*
 * Remove the sysfs entries created by rbd_sysfs_init(),
 * in the reverse order of their registration.
 */
static void rbd_sysfs_cleanup(void)
{
	bus_unregister(&rbd_bus_type);
	device_unregister(&rbd_root_dev);
}
3388
3389int __init rbd_init(void)
3390{
3391 int rc;
3392
3393 rc = rbd_sysfs_init();
3394 if (rc)
3395 return rc;
Alex Elderf0f8cef2012-01-29 13:57:44 -06003396 pr_info("loaded " RBD_DRV_NAME_LONG "\n");
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003397 return 0;
3398}
3399
/* Module unload: tear down the rbd bus and root device in sysfs. */
void __exit rbd_exit(void)
{
	rbd_sysfs_cleanup();
}
3404
/* Module entry/exit points and module metadata. */
module_init(rbd_init);
module_exit(rbd_exit);

MODULE_AUTHOR("Sage Weil <sage@newdream.net>");
MODULE_AUTHOR("Yehuda Sadeh <yehuda@hq.newdream.net>");
MODULE_DESCRIPTION("rados block device");

/* following authorship retained from original osdblk.c */
MODULE_AUTHOR("Jeff Garzik <jeff@garzik.org>");

MODULE_LICENSE("GPL");