blob: feaa2e9192a1ad285df80d41f2d2ba4c508991af [file] [log] [blame]
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001/*
2 rbd.c -- Export ceph rados objects as a Linux block device
3
4
5 based on drivers/block/osdblk.c:
6
7 Copyright 2009 Red Hat, Inc.
8
9 This program is free software; you can redistribute it and/or modify
10 it under the terms of the GNU General Public License as published by
11 the Free Software Foundation.
12
13 This program is distributed in the hope that it will be useful,
14 but WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 GNU General Public License for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with this program; see the file COPYING. If not, write to
20 the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
21
22
23
Yehuda Sadehdfc56062010-11-19 14:51:04 -080024 For usage instructions, please refer to:
Yehuda Sadeh602adf42010-08-12 16:11:25 -070025
Yehuda Sadehdfc56062010-11-19 14:51:04 -080026 Documentation/ABI/testing/sysfs-bus-rbd
Yehuda Sadeh602adf42010-08-12 16:11:25 -070027
28 */
29
30#include <linux/ceph/libceph.h>
31#include <linux/ceph/osd_client.h>
32#include <linux/ceph/mon_client.h>
33#include <linux/ceph/decode.h>
Yehuda Sadeh59c2be12011-03-21 15:10:11 -070034#include <linux/parser.h>
Yehuda Sadeh602adf42010-08-12 16:11:25 -070035
36#include <linux/kernel.h>
37#include <linux/device.h>
38#include <linux/module.h>
39#include <linux/fs.h>
40#include <linux/blkdev.h>
41
42#include "rbd_types.h"
43
#define RBD_DEBUG	/* Activate rbd_assert() calls */

/*
 * The basic unit of block I/O is a sector.  It is interpreted in a
 * number of contexts in Linux (blk, bio, genhd), but the default is
 * universally 512 bytes.  These symbols are just slightly more
 * meaningful than the bare numbers they represent.
 */
#define	SECTOR_SHIFT	9
#define	SECTOR_SIZE	(1ULL << SECTOR_SHIFT)

/* Short and long driver names; the short one prefixes rbd_warn() output */
#define RBD_DRV_NAME "rbd"
#define RBD_DRV_NAME_LONG "rbd (rados block device)"

#define RBD_MINORS_PER_MAJOR	256		/* max minors per blkdev */

/*
 * The longest snapshot name for which the prefix followed by the
 * name still fits within NAME_MAX characters.
 */
#define RBD_SNAP_DEV_NAME_PREFIX	"snap_"
#define RBD_MAX_SNAP_NAME_LEN	\
			(NAME_MAX - (sizeof (RBD_SNAP_DEV_NAME_PREFIX) - 1))

#define RBD_MAX_SNAP_COUNT	510	/* allows max snapc to fit in 4KB */

/* NOTE(review): presumably the name used for the image head - confirm */
#define RBD_SNAP_HEAD_NAME	"-"

/* This allows a single page to hold an image name sent by OSD */
#define RBD_IMAGE_NAME_LEN_MAX	(PAGE_SIZE - sizeof (__le32) - 1)
#define RBD_IMAGE_ID_LEN_MAX	64

#define RBD_OBJ_PREFIX_LEN_MAX	64

/* Feature bits */

#define RBD_FEATURE_LAYERING	(1<<0)
#define RBD_FEATURE_STRIPINGV2	(1<<1)
#define RBD_FEATURES_ALL \
	    (RBD_FEATURE_LAYERING | RBD_FEATURE_STRIPINGV2)

/* Features supported by this (client software) implementation. */

#define RBD_FEATURES_SUPPORTED	(RBD_FEATURES_ALL)

/*
 * An RBD device name will be "rbd#", where the "rbd" comes from
 * RBD_DRV_NAME above, and # is a unique integer identifier.
 * MAX_INT_FORMAT_WIDTH is used in ensuring DEV_NAME_LEN is big
 * enough to hold all possible device names.
 *
 * MAX_INT_FORMAT_WIDTH allows 2.5 characters per byte of an int,
 * plus one more (presumably for a sign character).
 */
#define DEV_NAME_LEN		32
#define MAX_INT_FORMAT_WIDTH	((5 * sizeof (int)) / 2 + 1)
Yehuda Sadeh602adf42010-08-12 16:11:25 -070093
94/*
95 * block device image metadata (in-memory version)
96 */
struct rbd_image_header {
	/* These fields never change for a given rbd image */
	char *object_prefix;
	u64 features;
	__u8 obj_order;
	__u8 crypt_type;
	__u8 comp_type;

	/* The remaining fields need to be updated occasionally */
	u64 image_size;
	struct ceph_snap_context *snapc;
	char *snap_names;
	u64 *snap_sizes;

	u64 stripe_unit;
	u64 stripe_count;

	u64 obj_version;
};
116
Alex Elder0d7dbfc2012-10-25 23:34:41 -0500117/*
118 * An rbd image specification.
119 *
120 * The tuple (pool_id, image_id, snap_id) is sufficient to uniquely
Alex Elderc66c6e02012-11-01 08:39:26 -0500121 * identify an image. Each rbd_dev structure includes a pointer to
122 * an rbd_spec structure that encapsulates this identity.
123 *
124 * Each of the id's in an rbd_spec has an associated name. For a
125 * user-mapped image, the names are supplied and the id's associated
126 * with them are looked up. For a layered image, a parent image is
127 * defined by the tuple, and the names are looked up.
128 *
129 * An rbd_dev structure contains a parent_spec pointer which is
130 * non-null if the image it represents is a child in a layered
131 * image. This pointer will refer to the rbd_spec structure used
132 * by the parent rbd_dev for its own identity (i.e., the structure
133 * is shared between the parent and child).
134 *
135 * Since these structures are populated once, during the discovery
136 * phase of image construction, they are effectively immutable so
137 * we make no effort to synchronize access to them.
138 *
139 * Note that code herein does not assume the image name is known (it
140 * could be a null pointer).
Alex Elder0d7dbfc2012-10-25 23:34:41 -0500141 */
struct rbd_spec {
	/* Each id below is paired with its associated name */
	u64		pool_id;
	const char	*pool_name;

	const char	*image_id;
	const char	*image_name;	/* may be a null pointer */

	u64		snap_id;
	const char	*snap_name;

	/* NOTE(review): presumably counts the rbd_devs sharing this spec */
	struct kref	kref;
};
154
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700155/*
Alex Elderf0f8cef2012-01-29 13:57:44 -0600156 * an instance of the client. multiple devices may share an rbd client.
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700157 */
struct rbd_client {
	struct ceph_client	*client;	/* from ceph_create_client() */
	struct kref		kref;		/* dropped via rbd_put_client() */
	struct list_head	node;		/* entry in rbd_client_list */
};
163
/* Callback type stored in an image request's "callback" field */
struct rbd_img_request;
typedef void (*rbd_img_callback_t)(struct rbd_img_request *);

/* Value of rbd_obj_request->which for a request with no list position */
#define	BAD_WHICH	U32_MAX		/* Good which or bad which, which? */

/* Callback type stored in an object request's "callback" field */
struct rbd_obj_request;
typedef void (*rbd_obj_callback_t)(struct rbd_obj_request *);
171
/*
 * Kind of data carried by an object request.
 * NOTE(review): presumably selects which member of the bio_list /
 * pages union in struct rbd_obj_request is meaningful - confirm.
 */
enum obj_request_type {
	OBJ_REQUEST_NODATA, OBJ_REQUEST_BIO, OBJ_REQUEST_PAGES
};
Alex Elderbf0d5f502012-11-22 00:00:08 -0600175
/*
 * Object request flags.
 * NOTE(review): presumably bit numbers within rbd_obj_request->flags.
 */
enum obj_req_flags {
	OBJ_REQ_DONE,		/* completion flag: not done = 0, done = 1 */
	OBJ_REQ_IMG_DATA,	/* object usage: standalone = 0, image = 1 */
	OBJ_REQ_KNOWN,		/* EXISTS flag valid: no = 0, yes = 1 */
	OBJ_REQ_EXISTS,		/* target exists: no = 0, yes = 1 */
};
182
/*
 * A request directed at a single named object, covering "length"
 * bytes starting at byte "offset" within that object.
 */
struct rbd_obj_request {
	const char		*object_name;
	u64			offset;		/* object start byte */
	u64			length;		/* bytes from offset */
	unsigned long		flags;

	/*
	 * An object request associated with an image will have its
	 * img_data flag set; a standalone object request will not.
	 *
	 * A standalone object request will have which == BAD_WHICH
	 * and a null obj_request pointer.
	 *
	 * An object request initiated in support of a layered image
	 * object (to check for its existence before a write) will
	 * have which == BAD_WHICH and a non-null obj_request pointer.
	 *
	 * Finally, an object request for rbd image data will have
	 * which != BAD_WHICH, and will have a non-null img_request
	 * pointer.  The value of which will be in the range
	 * 0..(img_request->obj_request_count-1).
	 */
	union {
		struct rbd_obj_request	*obj_request;	/* STAT op */
		struct {
			struct rbd_img_request	*img_request;
			u64			img_offset;
			/* links for img_request->obj_requests list */
			struct list_head	links;
		};
	};
	u32			which;		/* posn image request list */

	enum obj_request_type	type;
	union {
		struct bio	*bio_list;
		struct {
			struct page	**pages;
			u32		page_count;
		};
	};
	struct page		**copyup_pages;

	struct ceph_osd_request	*osd_req;

	u64			xferred;	/* bytes transferred */
	u64			version;
	int			result;

	rbd_obj_callback_t	callback;
	struct completion	completion;

	struct kref		kref;
};
237
/*
 * Image request flags.
 * NOTE(review): presumably bit numbers within rbd_img_request->flags.
 */
enum img_req_flags {
	IMG_REQ_WRITE,		/* I/O direction: read = 0, write = 1 */
	IMG_REQ_CHILD,		/* initiator: block = 0, child image = 1 */
	IMG_REQ_LAYERED,	/* ENOENT handling: normal = 0, layered = 1 */
};
243
/*
 * A request against an rbd image, covering "length" bytes starting
 * at image byte "offset".  It owns a list of object requests
 * (obj_requests), of which there are obj_request_count.
 */
struct rbd_img_request {
	struct rbd_device	*rbd_dev;
	u64			offset;	/* starting image byte offset */
	u64			length;	/* byte count from offset */
	unsigned long		flags;
	union {
		u64			snap_id;	/* for reads */
		struct ceph_snap_context *snapc;	/* for writes */
	};
	union {
		struct request		*rq;		/* block request */
		struct rbd_obj_request	*obj_request;	/* obj req initiator */
	};
	struct page		**copyup_pages;
	spinlock_t		completion_lock;/* protects next_completion */
	u32			next_completion;
	rbd_img_callback_t	callback;
	u64			xferred;/* aggregate bytes transferred */
	int			result;	/* first nonzero obj_request result */

	u32			obj_request_count;
	struct list_head	obj_requests;	/* rbd_obj_request structs */

	struct kref		kref;
};
269
/*
 * Iterators over the object requests linked on an image request's
 * obj_requests list (via each object request's "links" field).
 *
 * for_each_obj_request() walks the whole list from the front, and
 * for_each_obj_request_from() continues from the current value of
 * oreq.  Note that for_each_obj_request_safe() walks the list in
 * REVERSE order; n holds the next entry to visit.
 */
#define for_each_obj_request(ireq, oreq) \
	list_for_each_entry(oreq, &(ireq)->obj_requests, links)
#define for_each_obj_request_from(ireq, oreq) \
	list_for_each_entry_from(oreq, &(ireq)->obj_requests, links)
#define for_each_obj_request_safe(ireq, oreq, n) \
	list_for_each_entry_safe_reverse(oreq, n, &(ireq)->obj_requests, links)
Alex Elderbf0d5f502012-11-22 00:00:08 -0600276
/*
 * In-memory record of one snapshot.
 * NOTE(review): "node" presumably links into rbd_device->snaps.
 */
struct rbd_snap {
	const char		*name;
	u64			size;
	struct list_head	node;
	u64			id;
	u64			features;
};
284
/* Properties of a mapping; read_only is consulted by rbd_open() */
struct rbd_mapping {
	u64                     size;
	u64                     features;
	bool			read_only;
};
290
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700291/*
292 * a single device
293 */
struct rbd_device {
	int			dev_id;		/* blkdev unique id */

	int			major;		/* blkdev assigned major */
	struct gendisk		*disk;		/* blkdev's gendisk and rq */

	u32			image_format;	/* Either 1 or 2 */
	struct rbd_client	*rbd_client;

	char			name[DEV_NAME_LEN]; /* blkdev name, e.g. rbd3 */

	spinlock_t		lock;		/* queue, flags, open_count */

	struct rbd_image_header	header;
	unsigned long		flags;		/* possibly lock protected */
	struct rbd_spec		*spec;

	char			*header_name;

	struct ceph_file_layout	layout;

	struct ceph_osd_event   *watch_event;
	struct rbd_obj_request	*watch_request;

	/* Layering: parent_spec is non-null if this image is a child */
	struct rbd_spec		*parent_spec;
	u64			parent_overlap;
	struct rbd_device	*parent;

	/* protects updating the header */
	struct rw_semaphore     header_rwsem;

	struct rbd_mapping	mapping;

	/* NOTE(review): presumably this device's entry in rbd_dev_list */
	struct list_head	node;

	/* list of snapshots */
	struct list_head	snaps;

	/* sysfs related */
	struct device		dev;
	unsigned long		open_count;	/* protected by lock */
};
336
Alex Elderb82d1672013-01-14 12:43:31 -0600337/*
338 * Flag bits for rbd_dev->flags. If atomicity is required,
339 * rbd_dev->lock is used to protect access.
340 *
341 * Currently, only the "removing" flag (which is coupled with the
342 * "open_count" field) requires atomic access.
343 */
/* Bit numbers; RBD_DEV_FLAG_REMOVING is tested with test_bit() in rbd_open() */
enum rbd_dev_flags {
	RBD_DEV_FLAG_EXISTS,	/* mapped snapshot has not been deleted */
	RBD_DEV_FLAG_REMOVING,	/* this mapping is being removed */
};
348
static DEFINE_MUTEX(ctl_mutex);	  /* Serialize open/close/setup/teardown */

static LIST_HEAD(rbd_dev_list);    /* devices */
static DEFINE_SPINLOCK(rbd_dev_list_lock);

/* rbd_client structures are linked here through their "node" field */
static LIST_HEAD(rbd_client_list);		/* clients */
static DEFINE_SPINLOCK(rbd_client_list_lock);	/* held around list updates */
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700356
/* Forward declarations of functions defined later in this file */
static int rbd_img_request_submit(struct rbd_img_request *img_request);

static int rbd_dev_snaps_update(struct rbd_device *rbd_dev);

static void rbd_dev_device_release(struct device *dev);
static void rbd_snap_destroy(struct rbd_snap *snap);

/* Store handlers for the "add" and "remove" bus attributes */
static ssize_t rbd_add(struct bus_type *bus, const char *buf,
		       size_t count);
static ssize_t rbd_remove(struct bus_type *bus, const char *buf,
			  size_t count);
static int rbd_dev_image_probe(struct rbd_device *rbd_dev);
Alex Elderf0f8cef2012-01-29 13:57:44 -0600369
/*
 * Bus attributes "add" and "remove": both are writable by the owner
 * only (S_IWUSR) and have no show method, only a store method.
 */
static struct bus_attribute rbd_bus_attrs[] = {
	__ATTR(add, S_IWUSR, NULL, rbd_add),
	__ATTR(remove, S_IWUSR, NULL, rbd_remove),
	__ATTR_NULL
};

/* The "rbd" bus, carrying the attributes above */
static struct bus_type rbd_bus_type = {
	.name		= "rbd",
	.bus_attrs	= rbd_bus_attrs,
};
380
/*
 * Release method for rbd_root_dev.  Intentionally empty: the device
 * is a static object, so there is nothing to free.
 */
static void rbd_root_dev_release(struct device *dev)
{
}

/* Statically allocated device named "rbd" */
static struct device rbd_root_dev = {
	.init_name =    "rbd",
	.release =      rbd_root_dev_release,
};
389
Alex Elder06ecc6c2012-11-01 10:17:15 -0500390static __printf(2, 3)
391void rbd_warn(struct rbd_device *rbd_dev, const char *fmt, ...)
392{
393 struct va_format vaf;
394 va_list args;
395
396 va_start(args, fmt);
397 vaf.fmt = fmt;
398 vaf.va = &args;
399
400 if (!rbd_dev)
401 printk(KERN_WARNING "%s: %pV\n", RBD_DRV_NAME, &vaf);
402 else if (rbd_dev->disk)
403 printk(KERN_WARNING "%s: %s: %pV\n",
404 RBD_DRV_NAME, rbd_dev->disk->disk_name, &vaf);
405 else if (rbd_dev->spec && rbd_dev->spec->image_name)
406 printk(KERN_WARNING "%s: image %s: %pV\n",
407 RBD_DRV_NAME, rbd_dev->spec->image_name, &vaf);
408 else if (rbd_dev->spec && rbd_dev->spec->image_id)
409 printk(KERN_WARNING "%s: id %s: %pV\n",
410 RBD_DRV_NAME, rbd_dev->spec->image_id, &vaf);
411 else /* punt */
412 printk(KERN_WARNING "%s: rbd_dev %p: %pV\n",
413 RBD_DRV_NAME, rbd_dev, &vaf);
414 va_end(args);
415}
416
/*
 * rbd_assert(expr): when RBD_DEBUG is defined, report the failed
 * expression along with the function and line where it occurred,
 * then BUG().  Without RBD_DEBUG it expands to a no-op.
 *
 * The body is wrapped in do { } while (0) so the macro behaves as a
 * single statement; a bare "if" would capture a following "else"
 * when used as the body of an unbraced if statement.
 */
#ifdef RBD_DEBUG
#define rbd_assert(expr)						\
		do {							\
			if (unlikely(!(expr))) {			\
				printk(KERN_ERR "\nAssertion failure in %s() " \
						"at line %d:\n\n"	\
						"\trbd_assert(%s);\n\n", \
						__func__, __LINE__, #expr); \
				BUG();					\
			}						\
		} while (0)
#else /* !RBD_DEBUG */
#  define rbd_assert(expr)	((void) 0)
#endif /* !RBD_DEBUG */
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800429
/* Further forward declarations of functions defined later in this file */
static int rbd_img_obj_request_submit(struct rbd_obj_request *obj_request);
static void rbd_img_parent_read(struct rbd_obj_request *obj_request);
static void rbd_dev_remove_parent(struct rbd_device *rbd_dev);

static int rbd_dev_refresh(struct rbd_device *rbd_dev, u64 *hver);
static int rbd_dev_v2_refresh(struct rbd_device *rbd_dev, u64 *hver);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700436
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700437static int rbd_open(struct block_device *bdev, fmode_t mode)
438{
Alex Elderf0f8cef2012-01-29 13:57:44 -0600439 struct rbd_device *rbd_dev = bdev->bd_disk->private_data;
Alex Elderb82d1672013-01-14 12:43:31 -0600440 bool removing = false;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700441
Alex Elderf84344f2012-08-31 17:29:51 -0500442 if ((mode & FMODE_WRITE) && rbd_dev->mapping.read_only)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700443 return -EROFS;
444
Alex Eldera14ea262013-02-05 13:23:12 -0600445 spin_lock_irq(&rbd_dev->lock);
Alex Elderb82d1672013-01-14 12:43:31 -0600446 if (test_bit(RBD_DEV_FLAG_REMOVING, &rbd_dev->flags))
447 removing = true;
448 else
449 rbd_dev->open_count++;
Alex Eldera14ea262013-02-05 13:23:12 -0600450 spin_unlock_irq(&rbd_dev->lock);
Alex Elderb82d1672013-01-14 12:43:31 -0600451 if (removing)
452 return -ENOENT;
453
Alex Elder42382b72012-11-16 09:29:16 -0600454 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
Alex Elderc3e946c2012-11-16 09:29:16 -0600455 (void) get_device(&rbd_dev->dev);
Alex Elderf84344f2012-08-31 17:29:51 -0500456 set_device_ro(bdev, rbd_dev->mapping.read_only);
Alex Elder42382b72012-11-16 09:29:16 -0600457 mutex_unlock(&ctl_mutex);
Alex Elder340c7a22012-08-10 13:12:07 -0700458
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700459 return 0;
460}
461
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800462static int rbd_release(struct gendisk *disk, fmode_t mode)
463{
464 struct rbd_device *rbd_dev = disk->private_data;
Alex Elderb82d1672013-01-14 12:43:31 -0600465 unsigned long open_count_before;
466
Alex Eldera14ea262013-02-05 13:23:12 -0600467 spin_lock_irq(&rbd_dev->lock);
Alex Elderb82d1672013-01-14 12:43:31 -0600468 open_count_before = rbd_dev->open_count--;
Alex Eldera14ea262013-02-05 13:23:12 -0600469 spin_unlock_irq(&rbd_dev->lock);
Alex Elderb82d1672013-01-14 12:43:31 -0600470 rbd_assert(open_count_before > 0);
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800471
Alex Elder42382b72012-11-16 09:29:16 -0600472 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
Alex Elderc3e946c2012-11-16 09:29:16 -0600473 put_device(&rbd_dev->dev);
Alex Elder42382b72012-11-16 09:29:16 -0600474 mutex_unlock(&ctl_mutex);
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800475
476 return 0;
477}
478
/* Block device operations: only open and release are provided */
static const struct block_device_operations rbd_bd_ops = {
	.owner			= THIS_MODULE,
	.open			= rbd_open,
	.release		= rbd_release,
};
484
485/*
486 * Initialize an rbd client instance.
Alex Elder43ae4702012-07-03 16:01:18 -0500487 * We own *ceph_opts.
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700488 */
Alex Elderf8c38922012-08-10 13:12:07 -0700489static struct rbd_client *rbd_client_create(struct ceph_options *ceph_opts)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700490{
491 struct rbd_client *rbdc;
492 int ret = -ENOMEM;
493
Alex Elder37206ee2013-02-20 17:32:08 -0600494 dout("%s:\n", __func__);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700495 rbdc = kmalloc(sizeof(struct rbd_client), GFP_KERNEL);
496 if (!rbdc)
497 goto out_opt;
498
499 kref_init(&rbdc->kref);
500 INIT_LIST_HEAD(&rbdc->node);
501
Alex Elderbc534d82012-01-29 13:57:44 -0600502 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
503
Alex Elder43ae4702012-07-03 16:01:18 -0500504 rbdc->client = ceph_create_client(ceph_opts, rbdc, 0, 0);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700505 if (IS_ERR(rbdc->client))
Alex Elderbc534d82012-01-29 13:57:44 -0600506 goto out_mutex;
Alex Elder43ae4702012-07-03 16:01:18 -0500507 ceph_opts = NULL; /* Now rbdc->client is responsible for ceph_opts */
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700508
509 ret = ceph_open_session(rbdc->client);
510 if (ret < 0)
511 goto out_err;
512
Alex Elder432b8582012-01-29 13:57:44 -0600513 spin_lock(&rbd_client_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700514 list_add_tail(&rbdc->node, &rbd_client_list);
Alex Elder432b8582012-01-29 13:57:44 -0600515 spin_unlock(&rbd_client_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700516
Alex Elderbc534d82012-01-29 13:57:44 -0600517 mutex_unlock(&ctl_mutex);
Alex Elder37206ee2013-02-20 17:32:08 -0600518 dout("%s: rbdc %p\n", __func__, rbdc);
Alex Elderbc534d82012-01-29 13:57:44 -0600519
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700520 return rbdc;
521
522out_err:
523 ceph_destroy_client(rbdc->client);
Alex Elderbc534d82012-01-29 13:57:44 -0600524out_mutex:
525 mutex_unlock(&ctl_mutex);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700526 kfree(rbdc);
527out_opt:
Alex Elder43ae4702012-07-03 16:01:18 -0500528 if (ceph_opts)
529 ceph_destroy_options(ceph_opts);
Alex Elder37206ee2013-02-20 17:32:08 -0600530 dout("%s: error %d\n", __func__, ret);
531
Vasiliy Kulikov28f259b2010-09-26 12:59:37 +0400532 return ERR_PTR(ret);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700533}
534
Alex Elder2f82ee52012-10-30 19:40:33 -0500535static struct rbd_client *__rbd_get_client(struct rbd_client *rbdc)
536{
537 kref_get(&rbdc->kref);
538
539 return rbdc;
540}
541
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700542/*
Alex Elder1f7ba332012-08-10 13:12:07 -0700543 * Find a ceph client with specific addr and configuration. If
544 * found, bump its reference count.
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700545 */
Alex Elder1f7ba332012-08-10 13:12:07 -0700546static struct rbd_client *rbd_client_find(struct ceph_options *ceph_opts)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700547{
548 struct rbd_client *client_node;
Alex Elder1f7ba332012-08-10 13:12:07 -0700549 bool found = false;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700550
Alex Elder43ae4702012-07-03 16:01:18 -0500551 if (ceph_opts->flags & CEPH_OPT_NOSHARE)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700552 return NULL;
553
Alex Elder1f7ba332012-08-10 13:12:07 -0700554 spin_lock(&rbd_client_list_lock);
555 list_for_each_entry(client_node, &rbd_client_list, node) {
556 if (!ceph_compare_options(ceph_opts, client_node->client)) {
Alex Elder2f82ee52012-10-30 19:40:33 -0500557 __rbd_get_client(client_node);
558
Alex Elder1f7ba332012-08-10 13:12:07 -0700559 found = true;
560 break;
561 }
562 }
563 spin_unlock(&rbd_client_list_lock);
564
565 return found ? client_node : NULL;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700566}
567
568/*
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700569 * mount options
570 */
/*
 * Option tokens, grouped by argument kind.  The Opt_last_* values
 * are boundary markers: parse_rbd_opts_token() classifies a token by
 * comparing it against them.  No int or string options are defined,
 * so only the Boolean group is populated.
 */
enum {
	Opt_last_int,
	/* int args above */
	Opt_last_string,
	/* string args above */
	Opt_read_only,
	Opt_read_write,
	/* Boolean args above */
	Opt_last_bool,
};
581
/* Maps option text to the tokens above, for use with match_token() */
static match_table_t rbd_opts_tokens = {
	/* int args above */
	/* string args above */
	{Opt_read_only, "read_only"},
	{Opt_read_only, "ro"},		/* Alternate spelling */
	{Opt_read_write, "read_write"},
	{Opt_read_write, "rw"},		/* Alternate spelling */
	/* Boolean args above */
	{-1, NULL}
};
592
/* rbd-specific options, filled in by parse_rbd_opts_token() */
struct rbd_options {
	bool	read_only;
};

#define RBD_READ_ONLY_DEFAULT	false
598
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700599static int parse_rbd_opts_token(char *c, void *private)
600{
Alex Elder43ae4702012-07-03 16:01:18 -0500601 struct rbd_options *rbd_opts = private;
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700602 substring_t argstr[MAX_OPT_ARGS];
603 int token, intval, ret;
604
Alex Elder43ae4702012-07-03 16:01:18 -0500605 token = match_token(c, rbd_opts_tokens, argstr);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700606 if (token < 0)
607 return -EINVAL;
608
609 if (token < Opt_last_int) {
610 ret = match_int(&argstr[0], &intval);
611 if (ret < 0) {
612 pr_err("bad mount option arg (not int) "
613 "at '%s'\n", c);
614 return ret;
615 }
616 dout("got int token %d val %d\n", token, intval);
617 } else if (token > Opt_last_int && token < Opt_last_string) {
618 dout("got string token %d val %s\n", token,
619 argstr[0].from);
Alex Eldercc0538b2012-08-10 13:12:07 -0700620 } else if (token > Opt_last_string && token < Opt_last_bool) {
621 dout("got Boolean token %d\n", token);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700622 } else {
623 dout("got token %d\n", token);
624 }
625
626 switch (token) {
Alex Eldercc0538b2012-08-10 13:12:07 -0700627 case Opt_read_only:
628 rbd_opts->read_only = true;
629 break;
630 case Opt_read_write:
631 rbd_opts->read_only = false;
632 break;
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700633 default:
Alex Elderaafb2302012-09-06 16:00:54 -0500634 rbd_assert(false);
635 break;
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700636 }
637 return 0;
638}
639
/*
 * Get a reference to an rbd client matching the given options,
 * creating a new client if no existing one can be shared.  Either
 * way the caller gives up ownership of ceph_opts: they are destroyed
 * here when an existing client is reused, and otherwise handed to
 * rbd_client_create().  Returns whatever rbd_client_create() returns
 * when a new client has to be created.
 */
static struct rbd_client *rbd_get_client(struct ceph_options *ceph_opts)
{
	struct rbd_client *rbdc = rbd_client_find(ceph_opts);

	if (!rbdc)
		return rbd_client_create(ceph_opts);

	/* using an existing client */
	ceph_destroy_options(ceph_opts);

	return rbdc;
}
656
657/*
658 * Destroy ceph client
Alex Elderd23a4b32012-01-29 13:57:43 -0600659 *
Alex Elder432b8582012-01-29 13:57:44 -0600660 * Caller must hold rbd_client_list_lock.
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700661 */
662static void rbd_client_release(struct kref *kref)
663{
664 struct rbd_client *rbdc = container_of(kref, struct rbd_client, kref);
665
Alex Elder37206ee2013-02-20 17:32:08 -0600666 dout("%s: rbdc %p\n", __func__, rbdc);
Alex Eldercd9d9f52012-04-04 13:35:44 -0500667 spin_lock(&rbd_client_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700668 list_del(&rbdc->node);
Alex Eldercd9d9f52012-04-04 13:35:44 -0500669 spin_unlock(&rbd_client_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700670
671 ceph_destroy_client(rbdc->client);
672 kfree(rbdc);
673}
674
Alex Elder468521c2013-04-26 09:43:47 -0500675/* Caller has to fill in snapc->seq and snapc->snaps[0..snap_count-1] */
676
677static struct ceph_snap_context *rbd_snap_context_create(u32 snap_count)
678{
679 struct ceph_snap_context *snapc;
680 size_t size;
681
682 size = sizeof (struct ceph_snap_context);
683 size += snap_count * sizeof (snapc->snaps[0]);
684 snapc = kzalloc(size, GFP_KERNEL);
685 if (!snapc)
686 return NULL;
687
688 atomic_set(&snapc->nref, 1);
689 snapc->num_snaps = snap_count;
690
691 return snapc;
692}
693
/* Take a reference to a snapshot context; the returned pointer is unused */
static inline void rbd_snap_context_get(struct ceph_snap_context *snapc)
{
	(void) ceph_get_snap_context(snapc);
}
698
/* Drop a reference to a snapshot context */
static inline void rbd_snap_context_put(struct ceph_snap_context *snapc)
{
	ceph_put_snap_context(snapc);
}
703
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700704/*
705 * Drop reference to ceph client node. If it's not referenced anymore, release
706 * it.
707 */
Alex Elder9d3997f2012-10-25 23:34:42 -0500708static void rbd_put_client(struct rbd_client *rbdc)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700709{
Alex Elderc53d5892012-10-25 23:34:42 -0500710 if (rbdc)
711 kref_put(&rbdc->kref, rbd_client_release);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700712}
713
Alex Eldera30b71b2012-07-10 20:30:11 -0500714static bool rbd_image_format_valid(u32 image_format)
715{
716 return image_format == 1 || image_format == 2;
717}
718
Alex Elder8e94af82012-07-25 09:32:40 -0500719static bool rbd_dev_ondisk_valid(struct rbd_image_header_ondisk *ondisk)
720{
Alex Elder103a1502012-08-02 11:29:45 -0500721 size_t size;
722 u32 snap_count;
723
724 /* The header has to start with the magic rbd header text */
725 if (memcmp(&ondisk->text, RBD_HEADER_TEXT, sizeof (RBD_HEADER_TEXT)))
726 return false;
727
Alex Elderdb2388b2012-10-20 22:17:27 -0500728 /* The bio layer requires at least sector-sized I/O */
729
730 if (ondisk->options.order < SECTOR_SHIFT)
731 return false;
732
733 /* If we use u64 in a few spots we may be able to loosen this */
734
735 if (ondisk->options.order > 8 * sizeof (int) - 1)
736 return false;
737
Alex Elder103a1502012-08-02 11:29:45 -0500738 /*
739 * The size of a snapshot header has to fit in a size_t, and
740 * that limits the number of snapshots.
741 */
742 snap_count = le32_to_cpu(ondisk->snap_count);
743 size = SIZE_MAX - sizeof (struct ceph_snap_context);
744 if (snap_count > size / sizeof (__le64))
745 return false;
746
747 /*
748 * Not only that, but the size of the entire the snapshot
749 * header must also be representable in a size_t.
750 */
751 size -= snap_count * sizeof (__le64);
752 if ((u64) size < le64_to_cpu(ondisk->snap_names_len))
753 return false;
754
755 return true;
Alex Elder8e94af82012-07-25 09:32:40 -0500756}
757
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700758/*
759 * Create a new header structure, translate header format from the on-disk
760 * header.
761 */
762static int rbd_header_from_disk(struct rbd_image_header *header,
Alex Elder4156d992012-08-02 11:29:46 -0500763 struct rbd_image_header_ondisk *ondisk)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700764{
Alex Elderccece232012-07-10 20:30:10 -0500765 u32 snap_count;
Alex Elder58c17b02012-08-23 23:22:06 -0500766 size_t len;
Alex Elderd2bb24e2012-07-26 23:37:14 -0500767 size_t size;
Alex Elder621901d2012-08-23 23:22:06 -0500768 u32 i;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700769
Alex Elder6a523252012-07-19 17:12:59 -0500770 memset(header, 0, sizeof (*header));
771
Alex Elder103a1502012-08-02 11:29:45 -0500772 snap_count = le32_to_cpu(ondisk->snap_count);
773
Alex Elder58c17b02012-08-23 23:22:06 -0500774 len = strnlen(ondisk->object_prefix, sizeof (ondisk->object_prefix));
775 header->object_prefix = kmalloc(len + 1, GFP_KERNEL);
Alex Elder6a523252012-07-19 17:12:59 -0500776 if (!header->object_prefix)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700777 return -ENOMEM;
Alex Elder58c17b02012-08-23 23:22:06 -0500778 memcpy(header->object_prefix, ondisk->object_prefix, len);
779 header->object_prefix[len] = '\0';
Alex Elder00f1f362012-02-07 12:03:36 -0600780
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700781 if (snap_count) {
Alex Elderf785cc12012-08-23 23:22:06 -0500782 u64 snap_names_len = le64_to_cpu(ondisk->snap_names_len);
783
Alex Elder621901d2012-08-23 23:22:06 -0500784 /* Save a copy of the snapshot names */
785
Alex Elderf785cc12012-08-23 23:22:06 -0500786 if (snap_names_len > (u64) SIZE_MAX)
787 return -EIO;
788 header->snap_names = kmalloc(snap_names_len, GFP_KERNEL);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700789 if (!header->snap_names)
Alex Elder6a523252012-07-19 17:12:59 -0500790 goto out_err;
Alex Elderf785cc12012-08-23 23:22:06 -0500791 /*
792 * Note that rbd_dev_v1_header_read() guarantees
793 * the ondisk buffer we're working with has
794 * snap_names_len bytes beyond the end of the
795 * snapshot id array, this memcpy() is safe.
796 */
797 memcpy(header->snap_names, &ondisk->snaps[snap_count],
798 snap_names_len);
Alex Elder6a523252012-07-19 17:12:59 -0500799
Alex Elder621901d2012-08-23 23:22:06 -0500800 /* Record each snapshot's size */
801
Alex Elderd2bb24e2012-07-26 23:37:14 -0500802 size = snap_count * sizeof (*header->snap_sizes);
803 header->snap_sizes = kmalloc(size, GFP_KERNEL);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700804 if (!header->snap_sizes)
Alex Elder6a523252012-07-19 17:12:59 -0500805 goto out_err;
Alex Elder621901d2012-08-23 23:22:06 -0500806 for (i = 0; i < snap_count; i++)
807 header->snap_sizes[i] =
808 le64_to_cpu(ondisk->snaps[i].image_size);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700809 } else {
810 header->snap_names = NULL;
811 header->snap_sizes = NULL;
812 }
Alex Elder849b4262012-07-09 21:04:24 -0500813
Alex Elder34b13182012-07-13 20:35:12 -0500814 header->features = 0; /* No features support in v1 images */
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700815 header->obj_order = ondisk->options.order;
816 header->crypt_type = ondisk->options.crypt_type;
817 header->comp_type = ondisk->options.comp_type;
Alex Elder6a523252012-07-19 17:12:59 -0500818
Alex Elder621901d2012-08-23 23:22:06 -0500819 /* Allocate and fill in the snapshot context */
820
Alex Elderf84344f2012-08-31 17:29:51 -0500821 header->image_size = le64_to_cpu(ondisk->image_size);
Alex Elder468521c2013-04-26 09:43:47 -0500822
823 header->snapc = rbd_snap_context_create(snap_count);
Alex Elder6a523252012-07-19 17:12:59 -0500824 if (!header->snapc)
825 goto out_err;
Alex Elder505cbb92012-07-19 08:49:18 -0500826 header->snapc->seq = le64_to_cpu(ondisk->snap_seq);
Alex Elder621901d2012-08-23 23:22:06 -0500827 for (i = 0; i < snap_count; i++)
Alex Elder468521c2013-04-26 09:43:47 -0500828 header->snapc->snaps[i] = le64_to_cpu(ondisk->snaps[i].id);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700829
830 return 0;
831
Alex Elder6a523252012-07-19 17:12:59 -0500832out_err:
Alex Elder849b4262012-07-09 21:04:24 -0500833 kfree(header->snap_sizes);
Alex Elderccece232012-07-10 20:30:10 -0500834 header->snap_sizes = NULL;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700835 kfree(header->snap_names);
Alex Elderccece232012-07-10 20:30:10 -0500836 header->snap_names = NULL;
Alex Elder6a523252012-07-19 17:12:59 -0500837 kfree(header->object_prefix);
838 header->object_prefix = NULL;
Alex Elderccece232012-07-10 20:30:10 -0500839
Alex Elder00f1f362012-02-07 12:03:36 -0600840 return -ENOMEM;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700841}
842
Alex Elder9e15b772012-10-30 19:40:33 -0500843static const char *rbd_snap_name(struct rbd_device *rbd_dev, u64 snap_id)
844{
845 struct rbd_snap *snap;
846
847 if (snap_id == CEPH_NOSNAP)
848 return RBD_SNAP_HEAD_NAME;
849
850 list_for_each_entry(snap, &rbd_dev->snaps, node)
851 if (snap_id == snap->id)
852 return snap->name;
853
854 return NULL;
855}
856
Alex Elder8b0241f2013-04-25 23:15:08 -0500857static struct rbd_snap *snap_by_name(struct rbd_device *rbd_dev,
858 const char *snap_name)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700859{
Alex Eldere86924a2012-07-10 20:30:11 -0500860 struct rbd_snap *snap;
Alex Elder00f1f362012-02-07 12:03:36 -0600861
Alex Elder8b0241f2013-04-25 23:15:08 -0500862 list_for_each_entry(snap, &rbd_dev->snaps, node)
863 if (!strcmp(snap_name, snap->name))
864 return snap;
Alex Elder00f1f362012-02-07 12:03:36 -0600865
Alex Elder8b0241f2013-04-25 23:15:08 -0500866 return NULL;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700867}
868
Alex Elderd1cf5782013-04-27 09:59:30 -0500869static int rbd_dev_mapping_set(struct rbd_device *rbd_dev)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700870{
Alex Elder0d7dbfc2012-10-25 23:34:41 -0500871 if (!memcmp(rbd_dev->spec->snap_name, RBD_SNAP_HEAD_NAME,
Josh Durgincc9d7342011-11-21 18:19:13 -0800872 sizeof (RBD_SNAP_HEAD_NAME))) {
Alex Elder99c1f082012-08-30 14:42:15 -0500873 rbd_dev->mapping.size = rbd_dev->header.image_size;
Alex Elder34b13182012-07-13 20:35:12 -0500874 rbd_dev->mapping.features = rbd_dev->header.features;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700875 } else {
Alex Elder8b0241f2013-04-25 23:15:08 -0500876 struct rbd_snap *snap;
877
878 snap = snap_by_name(rbd_dev, rbd_dev->spec->snap_name);
879 if (!snap)
880 return -ENOENT;
Alex Elder8b0241f2013-04-25 23:15:08 -0500881 rbd_dev->mapping.size = snap->size;
882 rbd_dev->mapping.features = snap->features;
Alex Elderf84344f2012-08-31 17:29:51 -0500883 rbd_dev->mapping.read_only = true;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700884 }
Alex Elder6d292902013-01-14 12:43:31 -0600885
Alex Elder8b0241f2013-04-25 23:15:08 -0500886 return 0;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700887}
888
Alex Elderd1cf5782013-04-27 09:59:30 -0500889static void rbd_dev_mapping_clear(struct rbd_device *rbd_dev)
890{
891 rbd_dev->mapping.size = 0;
892 rbd_dev->mapping.features = 0;
893 rbd_dev->mapping.read_only = true;
894}
895
/*
 * Reset the device's mapping (zero size, no features, read-only).
 *
 * This does exactly what rbd_dev_mapping_clear() does, so simply
 * delegate to it rather than keeping a second copy of the body.
 */
static void rbd_dev_clear_mapping(struct rbd_device *rbd_dev)
{
	rbd_dev_mapping_clear(rbd_dev);
}
902
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700903static void rbd_header_free(struct rbd_image_header *header)
904{
Alex Elder849b4262012-07-09 21:04:24 -0500905 kfree(header->object_prefix);
Alex Elderd78fd7a2012-07-26 23:37:14 -0500906 header->object_prefix = NULL;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700907 kfree(header->snap_sizes);
Alex Elderd78fd7a2012-07-26 23:37:14 -0500908 header->snap_sizes = NULL;
Alex Elder849b4262012-07-09 21:04:24 -0500909 kfree(header->snap_names);
Alex Elderd78fd7a2012-07-26 23:37:14 -0500910 header->snap_names = NULL;
Alex Elder468521c2013-04-26 09:43:47 -0500911 rbd_snap_context_put(header->snapc);
Alex Elderd78fd7a2012-07-26 23:37:14 -0500912 header->snapc = NULL;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700913}
914
Alex Elder98571b52013-01-20 14:44:42 -0600915static const char *rbd_segment_name(struct rbd_device *rbd_dev, u64 offset)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700916{
Alex Elder65ccfe22012-08-09 10:33:26 -0700917 char *name;
918 u64 segment;
919 int ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700920
Alex Elder2fd82b92012-11-09 15:05:54 -0600921 name = kmalloc(MAX_OBJ_NAME_SIZE + 1, GFP_NOIO);
Alex Elder65ccfe22012-08-09 10:33:26 -0700922 if (!name)
923 return NULL;
924 segment = offset >> rbd_dev->header.obj_order;
Alex Elder2fd82b92012-11-09 15:05:54 -0600925 ret = snprintf(name, MAX_OBJ_NAME_SIZE + 1, "%s.%012llx",
Alex Elder65ccfe22012-08-09 10:33:26 -0700926 rbd_dev->header.object_prefix, segment);
Alex Elder2fd82b92012-11-09 15:05:54 -0600927 if (ret < 0 || ret > MAX_OBJ_NAME_SIZE) {
Alex Elder65ccfe22012-08-09 10:33:26 -0700928 pr_err("error formatting segment name for #%llu (%d)\n",
929 segment, ret);
930 kfree(name);
931 name = NULL;
932 }
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700933
Alex Elder65ccfe22012-08-09 10:33:26 -0700934 return name;
935}
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700936
Alex Elder65ccfe22012-08-09 10:33:26 -0700937static u64 rbd_segment_offset(struct rbd_device *rbd_dev, u64 offset)
938{
939 u64 segment_size = (u64) 1 << rbd_dev->header.obj_order;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700940
Alex Elder65ccfe22012-08-09 10:33:26 -0700941 return offset & (segment_size - 1);
942}
943
944static u64 rbd_segment_length(struct rbd_device *rbd_dev,
945 u64 offset, u64 length)
946{
947 u64 segment_size = (u64) 1 << rbd_dev->header.obj_order;
948
949 offset &= segment_size - 1;
950
Alex Elderaafb2302012-09-06 16:00:54 -0500951 rbd_assert(length <= U64_MAX - offset);
Alex Elder65ccfe22012-08-09 10:33:26 -0700952 if (offset + length > segment_size)
953 length = segment_size - offset;
954
955 return length;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700956}
957
958/*
Josh Durgin029bcbd2011-07-22 11:35:23 -0700959 * returns the size of an object in the image
960 */
961static u64 rbd_obj_bytes(struct rbd_image_header *header)
962{
963 return 1 << header->obj_order;
964}
965
966/*
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700967 * bio helpers
968 */
969
970static void bio_chain_put(struct bio *chain)
971{
972 struct bio *tmp;
973
974 while (chain) {
975 tmp = chain;
976 chain = chain->bi_next;
977 bio_put(tmp);
978 }
979}
980
981/*
982 * zeros a bio chain, starting at specific offset
983 */
984static void zero_bio_chain(struct bio *chain, int start_ofs)
985{
986 struct bio_vec *bv;
987 unsigned long flags;
988 void *buf;
989 int i;
990 int pos = 0;
991
992 while (chain) {
993 bio_for_each_segment(bv, chain, i) {
994 if (pos + bv->bv_len > start_ofs) {
995 int remainder = max(start_ofs - pos, 0);
996 buf = bvec_kmap_irq(bv, &flags);
997 memset(buf + remainder, 0,
998 bv->bv_len - remainder);
Dan Carpenter85b5aaa2010-10-11 21:15:11 +0200999 bvec_kunmap_irq(buf, &flags);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001000 }
1001 pos += bv->bv_len;
1002 }
1003
1004 chain = chain->bi_next;
1005 }
1006}
1007
1008/*
Alex Elderb9434c52013-04-19 15:34:50 -05001009 * similar to zero_bio_chain(), zeros data defined by a page array,
1010 * starting at the given byte offset from the start of the array and
1011 * continuing up to the given end offset. The pages array is
1012 * assumed to be big enough to hold all bytes up to the end.
1013 */
1014static void zero_pages(struct page **pages, u64 offset, u64 end)
1015{
1016 struct page **page = &pages[offset >> PAGE_SHIFT];
1017
1018 rbd_assert(end > offset);
1019 rbd_assert(end - offset <= (u64)SIZE_MAX);
1020 while (offset < end) {
1021 size_t page_offset;
1022 size_t length;
1023 unsigned long flags;
1024 void *kaddr;
1025
1026 page_offset = (size_t)(offset & ~PAGE_MASK);
1027 length = min(PAGE_SIZE - page_offset, (size_t)(end - offset));
1028 local_irq_save(flags);
1029 kaddr = kmap_atomic(*page);
1030 memset(kaddr + page_offset, 0, length);
1031 kunmap_atomic(kaddr);
1032 local_irq_restore(flags);
1033
1034 offset += length;
1035 page++;
1036 }
1037}
1038
1039/*
Alex Elderf7760da2012-10-20 22:17:27 -05001040 * Clone a portion of a bio, starting at the given byte offset
1041 * and continuing for the number of bytes indicated.
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001042 */
Alex Elderf7760da2012-10-20 22:17:27 -05001043static struct bio *bio_clone_range(struct bio *bio_src,
1044 unsigned int offset,
1045 unsigned int len,
1046 gfp_t gfpmask)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001047{
Alex Elderf7760da2012-10-20 22:17:27 -05001048 struct bio_vec *bv;
1049 unsigned int resid;
1050 unsigned short idx;
1051 unsigned int voff;
1052 unsigned short end_idx;
1053 unsigned short vcnt;
1054 struct bio *bio;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001055
Alex Elderf7760da2012-10-20 22:17:27 -05001056 /* Handle the easy case for the caller */
1057
1058 if (!offset && len == bio_src->bi_size)
1059 return bio_clone(bio_src, gfpmask);
1060
1061 if (WARN_ON_ONCE(!len))
1062 return NULL;
1063 if (WARN_ON_ONCE(len > bio_src->bi_size))
1064 return NULL;
1065 if (WARN_ON_ONCE(offset > bio_src->bi_size - len))
1066 return NULL;
1067
1068 /* Find first affected segment... */
1069
1070 resid = offset;
1071 __bio_for_each_segment(bv, bio_src, idx, 0) {
1072 if (resid < bv->bv_len)
1073 break;
1074 resid -= bv->bv_len;
1075 }
1076 voff = resid;
1077
1078 /* ...and the last affected segment */
1079
1080 resid += len;
1081 __bio_for_each_segment(bv, bio_src, end_idx, idx) {
1082 if (resid <= bv->bv_len)
1083 break;
1084 resid -= bv->bv_len;
1085 }
1086 vcnt = end_idx - idx + 1;
1087
1088 /* Build the clone */
1089
1090 bio = bio_alloc(gfpmask, (unsigned int) vcnt);
1091 if (!bio)
1092 return NULL; /* ENOMEM */
1093
1094 bio->bi_bdev = bio_src->bi_bdev;
1095 bio->bi_sector = bio_src->bi_sector + (offset >> SECTOR_SHIFT);
1096 bio->bi_rw = bio_src->bi_rw;
1097 bio->bi_flags |= 1 << BIO_CLONED;
1098
1099 /*
1100 * Copy over our part of the bio_vec, then update the first
1101 * and last (or only) entries.
1102 */
1103 memcpy(&bio->bi_io_vec[0], &bio_src->bi_io_vec[idx],
1104 vcnt * sizeof (struct bio_vec));
1105 bio->bi_io_vec[0].bv_offset += voff;
1106 if (vcnt > 1) {
1107 bio->bi_io_vec[0].bv_len -= voff;
1108 bio->bi_io_vec[vcnt - 1].bv_len = resid;
1109 } else {
1110 bio->bi_io_vec[0].bv_len = len;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001111 }
1112
Alex Elderf7760da2012-10-20 22:17:27 -05001113 bio->bi_vcnt = vcnt;
1114 bio->bi_size = len;
1115 bio->bi_idx = 0;
Alex Elder542582f2012-08-09 10:33:25 -07001116
Alex Elderf7760da2012-10-20 22:17:27 -05001117 return bio;
1118}
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001119
Alex Elderf7760da2012-10-20 22:17:27 -05001120/*
1121 * Clone a portion of a bio chain, starting at the given byte offset
1122 * into the first bio in the source chain and continuing for the
1123 * number of bytes indicated. The result is another bio chain of
1124 * exactly the given length, or a null pointer on error.
1125 *
1126 * The bio_src and offset parameters are both in-out. On entry they
1127 * refer to the first source bio and the offset into that bio where
1128 * the start of data to be cloned is located.
1129 *
1130 * On return, bio_src is updated to refer to the bio in the source
1131 * chain that contains first un-cloned byte, and *offset will
1132 * contain the offset of that byte within that bio.
1133 */
1134static struct bio *bio_chain_clone_range(struct bio **bio_src,
1135 unsigned int *offset,
1136 unsigned int len,
1137 gfp_t gfpmask)
1138{
1139 struct bio *bi = *bio_src;
1140 unsigned int off = *offset;
1141 struct bio *chain = NULL;
1142 struct bio **end;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001143
Alex Elderf7760da2012-10-20 22:17:27 -05001144 /* Build up a chain of clone bios up to the limit */
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001145
Alex Elderf7760da2012-10-20 22:17:27 -05001146 if (!bi || off >= bi->bi_size || !len)
1147 return NULL; /* Nothing to clone */
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001148
Alex Elderf7760da2012-10-20 22:17:27 -05001149 end = &chain;
1150 while (len) {
1151 unsigned int bi_size;
1152 struct bio *bio;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001153
Alex Elderf5400b72012-11-01 10:17:15 -05001154 if (!bi) {
1155 rbd_warn(NULL, "bio_chain exhausted with %u left", len);
Alex Elderf7760da2012-10-20 22:17:27 -05001156 goto out_err; /* EINVAL; ran out of bio's */
Alex Elderf5400b72012-11-01 10:17:15 -05001157 }
Alex Elderf7760da2012-10-20 22:17:27 -05001158 bi_size = min_t(unsigned int, bi->bi_size - off, len);
1159 bio = bio_clone_range(bi, off, bi_size, gfpmask);
1160 if (!bio)
1161 goto out_err; /* ENOMEM */
1162
1163 *end = bio;
1164 end = &bio->bi_next;
1165
1166 off += bi_size;
1167 if (off == bi->bi_size) {
1168 bi = bi->bi_next;
1169 off = 0;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001170 }
Alex Elderf7760da2012-10-20 22:17:27 -05001171 len -= bi_size;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001172 }
Alex Elderf7760da2012-10-20 22:17:27 -05001173 *bio_src = bi;
1174 *offset = off;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001175
Alex Elderf7760da2012-10-20 22:17:27 -05001176 return chain;
1177out_err:
1178 bio_chain_put(chain);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001179
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001180 return NULL;
1181}
1182
Alex Elder926f9b32013-02-11 12:33:24 -06001183/*
1184 * The default/initial value for all object request flags is 0. For
1185 * each flag, once its value is set to 1 it is never reset to 0
1186 * again.
1187 */
Alex Elder6365d332013-02-11 12:33:24 -06001188static void obj_request_img_data_set(struct rbd_obj_request *obj_request)
1189{
1190 if (test_and_set_bit(OBJ_REQ_IMG_DATA, &obj_request->flags)) {
Alex Elder6365d332013-02-11 12:33:24 -06001191 struct rbd_device *rbd_dev;
1192
Alex Elder57acbaa2013-02-11 12:33:24 -06001193 rbd_dev = obj_request->img_request->rbd_dev;
Alex Elder6365d332013-02-11 12:33:24 -06001194 rbd_warn(rbd_dev, "obj_request %p already marked img_data\n",
1195 obj_request);
1196 }
1197}
1198
1199static bool obj_request_img_data_test(struct rbd_obj_request *obj_request)
1200{
1201 smp_mb();
1202 return test_bit(OBJ_REQ_IMG_DATA, &obj_request->flags) != 0;
1203}
1204
Alex Elder57acbaa2013-02-11 12:33:24 -06001205static void obj_request_done_set(struct rbd_obj_request *obj_request)
1206{
1207 if (test_and_set_bit(OBJ_REQ_DONE, &obj_request->flags)) {
1208 struct rbd_device *rbd_dev = NULL;
1209
1210 if (obj_request_img_data_test(obj_request))
1211 rbd_dev = obj_request->img_request->rbd_dev;
1212 rbd_warn(rbd_dev, "obj_request %p already marked done\n",
1213 obj_request);
1214 }
1215}
1216
1217static bool obj_request_done_test(struct rbd_obj_request *obj_request)
1218{
1219 smp_mb();
1220 return test_bit(OBJ_REQ_DONE, &obj_request->flags) != 0;
1221}
1222
Alex Elder5679c592013-02-11 12:33:24 -06001223/*
1224 * This sets the KNOWN flag after (possibly) setting the EXISTS
1225 * flag. The latter is set based on the "exists" value provided.
1226 *
1227 * Note that for our purposes once an object exists it never goes
1228 * away again. It's possible that the response from two existence
1229 * checks are separated by the creation of the target object, and
1230 * the first ("doesn't exist") response arrives *after* the second
1231 * ("does exist"). In that case we ignore the second one.
1232 */
1233static void obj_request_existence_set(struct rbd_obj_request *obj_request,
1234 bool exists)
1235{
1236 if (exists)
1237 set_bit(OBJ_REQ_EXISTS, &obj_request->flags);
1238 set_bit(OBJ_REQ_KNOWN, &obj_request->flags);
1239 smp_mb();
1240}
1241
1242static bool obj_request_known_test(struct rbd_obj_request *obj_request)
1243{
1244 smp_mb();
1245 return test_bit(OBJ_REQ_KNOWN, &obj_request->flags) != 0;
1246}
1247
1248static bool obj_request_exists_test(struct rbd_obj_request *obj_request)
1249{
1250 smp_mb();
1251 return test_bit(OBJ_REQ_EXISTS, &obj_request->flags) != 0;
1252}
1253
Alex Elderbf0d5f502012-11-22 00:00:08 -06001254static void rbd_obj_request_get(struct rbd_obj_request *obj_request)
1255{
Alex Elder37206ee2013-02-20 17:32:08 -06001256 dout("%s: obj %p (was %d)\n", __func__, obj_request,
1257 atomic_read(&obj_request->kref.refcount));
Alex Elderbf0d5f502012-11-22 00:00:08 -06001258 kref_get(&obj_request->kref);
1259}
1260
1261static void rbd_obj_request_destroy(struct kref *kref);
1262static void rbd_obj_request_put(struct rbd_obj_request *obj_request)
1263{
1264 rbd_assert(obj_request != NULL);
Alex Elder37206ee2013-02-20 17:32:08 -06001265 dout("%s: obj %p (was %d)\n", __func__, obj_request,
1266 atomic_read(&obj_request->kref.refcount));
Alex Elderbf0d5f502012-11-22 00:00:08 -06001267 kref_put(&obj_request->kref, rbd_obj_request_destroy);
1268}
1269
1270static void rbd_img_request_get(struct rbd_img_request *img_request)
1271{
Alex Elder37206ee2013-02-20 17:32:08 -06001272 dout("%s: img %p (was %d)\n", __func__, img_request,
1273 atomic_read(&img_request->kref.refcount));
Alex Elderbf0d5f502012-11-22 00:00:08 -06001274 kref_get(&img_request->kref);
1275}
1276
1277static void rbd_img_request_destroy(struct kref *kref);
1278static void rbd_img_request_put(struct rbd_img_request *img_request)
1279{
1280 rbd_assert(img_request != NULL);
Alex Elder37206ee2013-02-20 17:32:08 -06001281 dout("%s: img %p (was %d)\n", __func__, img_request,
1282 atomic_read(&img_request->kref.refcount));
Alex Elderbf0d5f502012-11-22 00:00:08 -06001283 kref_put(&img_request->kref, rbd_img_request_destroy);
1284}
1285
1286static inline void rbd_img_obj_request_add(struct rbd_img_request *img_request,
1287 struct rbd_obj_request *obj_request)
1288{
Alex Elder25dcf952013-01-25 17:08:55 -06001289 rbd_assert(obj_request->img_request == NULL);
1290
Alex Elderb155e862013-04-15 14:50:37 -05001291 /* Image request now owns object's original reference */
Alex Elderbf0d5f502012-11-22 00:00:08 -06001292 obj_request->img_request = img_request;
Alex Elder25dcf952013-01-25 17:08:55 -06001293 obj_request->which = img_request->obj_request_count;
Alex Elder6365d332013-02-11 12:33:24 -06001294 rbd_assert(!obj_request_img_data_test(obj_request));
1295 obj_request_img_data_set(obj_request);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001296 rbd_assert(obj_request->which != BAD_WHICH);
Alex Elder25dcf952013-01-25 17:08:55 -06001297 img_request->obj_request_count++;
1298 list_add_tail(&obj_request->links, &img_request->obj_requests);
Alex Elder37206ee2013-02-20 17:32:08 -06001299 dout("%s: img %p obj %p w=%u\n", __func__, img_request, obj_request,
1300 obj_request->which);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001301}
1302
1303static inline void rbd_img_obj_request_del(struct rbd_img_request *img_request,
1304 struct rbd_obj_request *obj_request)
1305{
1306 rbd_assert(obj_request->which != BAD_WHICH);
Alex Elder25dcf952013-01-25 17:08:55 -06001307
Alex Elder37206ee2013-02-20 17:32:08 -06001308 dout("%s: img %p obj %p w=%u\n", __func__, img_request, obj_request,
1309 obj_request->which);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001310 list_del(&obj_request->links);
Alex Elder25dcf952013-01-25 17:08:55 -06001311 rbd_assert(img_request->obj_request_count > 0);
1312 img_request->obj_request_count--;
1313 rbd_assert(obj_request->which == img_request->obj_request_count);
1314 obj_request->which = BAD_WHICH;
Alex Elder6365d332013-02-11 12:33:24 -06001315 rbd_assert(obj_request_img_data_test(obj_request));
Alex Elderbf0d5f502012-11-22 00:00:08 -06001316 rbd_assert(obj_request->img_request == img_request);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001317 obj_request->img_request = NULL;
Alex Elder25dcf952013-01-25 17:08:55 -06001318 obj_request->callback = NULL;
Alex Elderbf0d5f502012-11-22 00:00:08 -06001319 rbd_obj_request_put(obj_request);
1320}
1321
1322static bool obj_request_type_valid(enum obj_request_type type)
1323{
1324 switch (type) {
Alex Elder9969ebc2013-01-18 12:31:10 -06001325 case OBJ_REQUEST_NODATA:
Alex Elderbf0d5f502012-11-22 00:00:08 -06001326 case OBJ_REQUEST_BIO:
Alex Elder788e2df2013-01-17 12:25:27 -06001327 case OBJ_REQUEST_PAGES:
Alex Elderbf0d5f502012-11-22 00:00:08 -06001328 return true;
1329 default:
1330 return false;
1331 }
1332}
1333
Alex Elderbf0d5f502012-11-22 00:00:08 -06001334static int rbd_obj_request_submit(struct ceph_osd_client *osdc,
1335 struct rbd_obj_request *obj_request)
1336{
Alex Elder37206ee2013-02-20 17:32:08 -06001337 dout("%s: osdc %p obj %p\n", __func__, osdc, obj_request);
1338
Alex Elderbf0d5f502012-11-22 00:00:08 -06001339 return ceph_osdc_start_request(osdc, obj_request->osd_req, false);
1340}
1341
1342static void rbd_img_request_complete(struct rbd_img_request *img_request)
1343{
Alex Elder55f27e02013-04-10 12:34:25 -05001344
Alex Elder37206ee2013-02-20 17:32:08 -06001345 dout("%s: img %p\n", __func__, img_request);
Alex Elder55f27e02013-04-10 12:34:25 -05001346
1347 /*
1348 * If no error occurred, compute the aggregate transfer
1349 * count for the image request. We could instead use
1350 * atomic64_cmpxchg() to update it as each object request
1351 * completes; not clear which way is better off hand.
1352 */
1353 if (!img_request->result) {
1354 struct rbd_obj_request *obj_request;
1355 u64 xferred = 0;
1356
1357 for_each_obj_request(img_request, obj_request)
1358 xferred += obj_request->xferred;
1359 img_request->xferred = xferred;
1360 }
1361
Alex Elderbf0d5f502012-11-22 00:00:08 -06001362 if (img_request->callback)
1363 img_request->callback(img_request);
1364 else
1365 rbd_img_request_put(img_request);
1366}
1367
Alex Elder788e2df2013-01-17 12:25:27 -06001368/* Caller is responsible for rbd_obj_request_destroy(obj_request) */
1369
1370static int rbd_obj_request_wait(struct rbd_obj_request *obj_request)
1371{
Alex Elder37206ee2013-02-20 17:32:08 -06001372 dout("%s: obj %p\n", __func__, obj_request);
1373
Alex Elder788e2df2013-01-17 12:25:27 -06001374 return wait_for_completion_interruptible(&obj_request->completion);
1375}
1376
Alex Elder0c425242013-02-08 09:55:49 -06001377/*
1378 * The default/initial value for all image request flags is 0. Each
1379 * is conditionally set to 1 at image request initialization time
1380 * and currently never change thereafter.
1381 */
1382static void img_request_write_set(struct rbd_img_request *img_request)
1383{
1384 set_bit(IMG_REQ_WRITE, &img_request->flags);
1385 smp_mb();
1386}
1387
1388static bool img_request_write_test(struct rbd_img_request *img_request)
1389{
1390 smp_mb();
1391 return test_bit(IMG_REQ_WRITE, &img_request->flags) != 0;
1392}
1393
Alex Elder9849e982013-01-24 16:13:36 -06001394static void img_request_child_set(struct rbd_img_request *img_request)
1395{
1396 set_bit(IMG_REQ_CHILD, &img_request->flags);
1397 smp_mb();
1398}
1399
1400static bool img_request_child_test(struct rbd_img_request *img_request)
1401{
1402 smp_mb();
1403 return test_bit(IMG_REQ_CHILD, &img_request->flags) != 0;
1404}
1405
Alex Elderd0b2e942013-01-24 16:13:36 -06001406static void img_request_layered_set(struct rbd_img_request *img_request)
1407{
1408 set_bit(IMG_REQ_LAYERED, &img_request->flags);
1409 smp_mb();
1410}
1411
1412static bool img_request_layered_test(struct rbd_img_request *img_request)
1413{
1414 smp_mb();
1415 return test_bit(IMG_REQ_LAYERED, &img_request->flags) != 0;
1416}
1417
Alex Elder6e2a4502013-03-27 09:16:30 -05001418static void
1419rbd_img_obj_request_read_callback(struct rbd_obj_request *obj_request)
1420{
Alex Elderb9434c52013-04-19 15:34:50 -05001421 u64 xferred = obj_request->xferred;
1422 u64 length = obj_request->length;
1423
Alex Elder6e2a4502013-03-27 09:16:30 -05001424 dout("%s: obj %p img %p result %d %llu/%llu\n", __func__,
1425 obj_request, obj_request->img_request, obj_request->result,
Alex Elderb9434c52013-04-19 15:34:50 -05001426 xferred, length);
Alex Elder6e2a4502013-03-27 09:16:30 -05001427 /*
1428 * ENOENT means a hole in the image. We zero-fill the
1429 * entire length of the request. A short read also implies
1430 * zero-fill to the end of the request. Either way we
1431 * update the xferred count to indicate the whole request
1432 * was satisfied.
1433 */
Alex Elderb9434c52013-04-19 15:34:50 -05001434 rbd_assert(obj_request->type != OBJ_REQUEST_NODATA);
Alex Elder6e2a4502013-03-27 09:16:30 -05001435 if (obj_request->result == -ENOENT) {
Alex Elderb9434c52013-04-19 15:34:50 -05001436 if (obj_request->type == OBJ_REQUEST_BIO)
1437 zero_bio_chain(obj_request->bio_list, 0);
1438 else
1439 zero_pages(obj_request->pages, 0, length);
Alex Elder6e2a4502013-03-27 09:16:30 -05001440 obj_request->result = 0;
Alex Elderb9434c52013-04-19 15:34:50 -05001441 obj_request->xferred = length;
1442 } else if (xferred < length && !obj_request->result) {
1443 if (obj_request->type == OBJ_REQUEST_BIO)
1444 zero_bio_chain(obj_request->bio_list, xferred);
1445 else
1446 zero_pages(obj_request->pages, xferred, length);
1447 obj_request->xferred = length;
Alex Elder6e2a4502013-03-27 09:16:30 -05001448 }
1449 obj_request_done_set(obj_request);
1450}
1451
Alex Elderbf0d5f502012-11-22 00:00:08 -06001452static void rbd_obj_request_complete(struct rbd_obj_request *obj_request)
1453{
Alex Elder37206ee2013-02-20 17:32:08 -06001454 dout("%s: obj %p cb %p\n", __func__, obj_request,
1455 obj_request->callback);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001456 if (obj_request->callback)
1457 obj_request->callback(obj_request);
Alex Elder788e2df2013-01-17 12:25:27 -06001458 else
1459 complete_all(&obj_request->completion);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001460}
1461
/* Nothing to post-process: just mark the object request done. */
static void rbd_osd_trivial_callback(struct rbd_obj_request *obj_request)
{
	dout("%s: obj %p\n", __func__, obj_request);

	obj_request_done_set(obj_request);
}
1467
Alex Elderc47f9372013-02-26 14:23:07 -06001468static void rbd_osd_read_callback(struct rbd_obj_request *obj_request)
Alex Elderbf0d5f502012-11-22 00:00:08 -06001469{
Alex Elder57acbaa2013-02-11 12:33:24 -06001470 struct rbd_img_request *img_request = NULL;
Alex Eldera9e8ba2c2013-04-21 00:32:07 -05001471 struct rbd_device *rbd_dev = NULL;
Alex Elder57acbaa2013-02-11 12:33:24 -06001472 bool layered = false;
1473
1474 if (obj_request_img_data_test(obj_request)) {
1475 img_request = obj_request->img_request;
1476 layered = img_request && img_request_layered_test(img_request);
Alex Eldera9e8ba2c2013-04-21 00:32:07 -05001477 rbd_dev = img_request->rbd_dev;
Alex Elder57acbaa2013-02-11 12:33:24 -06001478 }
Alex Elder8b3e1a52013-01-24 16:13:36 -06001479
1480 dout("%s: obj %p img %p result %d %llu/%llu\n", __func__,
1481 obj_request, img_request, obj_request->result,
1482 obj_request->xferred, obj_request->length);
Alex Eldera9e8ba2c2013-04-21 00:32:07 -05001483 if (layered && obj_request->result == -ENOENT &&
1484 obj_request->img_offset < rbd_dev->parent_overlap)
Alex Elder8b3e1a52013-01-24 16:13:36 -06001485 rbd_img_parent_read(obj_request);
1486 else if (img_request)
Alex Elder6e2a4502013-03-27 09:16:30 -05001487 rbd_img_obj_request_read_callback(obj_request);
1488 else
1489 obj_request_done_set(obj_request);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001490}
1491
Alex Elderc47f9372013-02-26 14:23:07 -06001492static void rbd_osd_write_callback(struct rbd_obj_request *obj_request)
Alex Elderbf0d5f502012-11-22 00:00:08 -06001493{
Sage Weil1b83bef2013-02-25 16:11:12 -08001494 dout("%s: obj %p result %d %llu\n", __func__, obj_request,
1495 obj_request->result, obj_request->length);
1496 /*
Alex Elder8b3e1a52013-01-24 16:13:36 -06001497 * There is no such thing as a successful short write. Set
1498 * it to our originally-requested length.
Sage Weil1b83bef2013-02-25 16:11:12 -08001499 */
1500 obj_request->xferred = obj_request->length;
Alex Elder07741302013-02-05 23:41:50 -06001501 obj_request_done_set(obj_request);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001502}
1503
/*
 * Completion handler for an osd STAT op.  Nothing needs to be done
 * for a plain stat call beyond marking the request done.  More
 * happens elsewhere when the stat is one step of a write to a
 * layered image.
 */
static void rbd_osd_stat_callback(struct rbd_obj_request *obj_request)
{
	dout("%s: obj %p\n", __func__, obj_request);

	obj_request_done_set(obj_request);
}
1513
Alex Elderbf0d5f502012-11-22 00:00:08 -06001514static void rbd_osd_req_callback(struct ceph_osd_request *osd_req,
1515 struct ceph_msg *msg)
1516{
1517 struct rbd_obj_request *obj_request = osd_req->r_priv;
Alex Elderbf0d5f502012-11-22 00:00:08 -06001518 u16 opcode;
1519
Alex Elder37206ee2013-02-20 17:32:08 -06001520 dout("%s: osd_req %p msg %p\n", __func__, osd_req, msg);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001521 rbd_assert(osd_req == obj_request->osd_req);
Alex Elder57acbaa2013-02-11 12:33:24 -06001522 if (obj_request_img_data_test(obj_request)) {
1523 rbd_assert(obj_request->img_request);
1524 rbd_assert(obj_request->which != BAD_WHICH);
1525 } else {
1526 rbd_assert(obj_request->which == BAD_WHICH);
1527 }
Alex Elderbf0d5f502012-11-22 00:00:08 -06001528
Sage Weil1b83bef2013-02-25 16:11:12 -08001529 if (osd_req->r_result < 0)
1530 obj_request->result = osd_req->r_result;
Alex Elderbf0d5f502012-11-22 00:00:08 -06001531 obj_request->version = le64_to_cpu(osd_req->r_reassert_version.version);
1532
Alex Elder0eefd472013-04-19 15:34:50 -05001533 BUG_ON(osd_req->r_num_ops > 2);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001534
Alex Elderc47f9372013-02-26 14:23:07 -06001535 /*
1536 * We support a 64-bit length, but ultimately it has to be
1537 * passed to blk_end_request(), which takes an unsigned int.
1538 */
Sage Weil1b83bef2013-02-25 16:11:12 -08001539 obj_request->xferred = osd_req->r_reply_op_len[0];
Alex Elder8b3e1a52013-01-24 16:13:36 -06001540 rbd_assert(obj_request->xferred < (u64)UINT_MAX);
Alex Elder79528732013-04-03 21:32:51 -05001541 opcode = osd_req->r_ops[0].op;
Alex Elderbf0d5f502012-11-22 00:00:08 -06001542 switch (opcode) {
1543 case CEPH_OSD_OP_READ:
Alex Elderc47f9372013-02-26 14:23:07 -06001544 rbd_osd_read_callback(obj_request);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001545 break;
1546 case CEPH_OSD_OP_WRITE:
Alex Elderc47f9372013-02-26 14:23:07 -06001547 rbd_osd_write_callback(obj_request);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001548 break;
Alex Elderfbfab532013-02-08 09:55:48 -06001549 case CEPH_OSD_OP_STAT:
Alex Elderc47f9372013-02-26 14:23:07 -06001550 rbd_osd_stat_callback(obj_request);
Alex Elderfbfab532013-02-08 09:55:48 -06001551 break;
Alex Elder36be9a72013-01-19 00:30:28 -06001552 case CEPH_OSD_OP_CALL:
Alex Elderb8d70032012-11-30 17:53:04 -06001553 case CEPH_OSD_OP_NOTIFY_ACK:
Alex Elder9969ebc2013-01-18 12:31:10 -06001554 case CEPH_OSD_OP_WATCH:
Alex Elderc47f9372013-02-26 14:23:07 -06001555 rbd_osd_trivial_callback(obj_request);
Alex Elder9969ebc2013-01-18 12:31:10 -06001556 break;
Alex Elderbf0d5f502012-11-22 00:00:08 -06001557 default:
1558 rbd_warn(NULL, "%s: unsupported op %hu\n",
1559 obj_request->object_name, (unsigned short) opcode);
1560 break;
1561 }
1562
Alex Elder07741302013-02-05 23:41:50 -06001563 if (obj_request_done_test(obj_request))
Alex Elderbf0d5f502012-11-22 00:00:08 -06001564 rbd_obj_request_complete(obj_request);
1565}
1566
Alex Elder9d4df012013-04-19 15:34:50 -05001567static void rbd_osd_req_format_read(struct rbd_obj_request *obj_request)
Alex Elder430c28c2013-04-03 21:32:51 -05001568{
1569 struct rbd_img_request *img_request = obj_request->img_request;
Alex Elder8c042b02013-04-03 01:28:58 -05001570 struct ceph_osd_request *osd_req = obj_request->osd_req;
Alex Elder9d4df012013-04-19 15:34:50 -05001571 u64 snap_id;
Alex Elder430c28c2013-04-03 21:32:51 -05001572
Alex Elder8c042b02013-04-03 01:28:58 -05001573 rbd_assert(osd_req != NULL);
Alex Elder430c28c2013-04-03 21:32:51 -05001574
Alex Elder9d4df012013-04-19 15:34:50 -05001575 snap_id = img_request ? img_request->snap_id : CEPH_NOSNAP;
Alex Elder8c042b02013-04-03 01:28:58 -05001576 ceph_osdc_build_request(osd_req, obj_request->offset,
Alex Elder9d4df012013-04-19 15:34:50 -05001577 NULL, snap_id, NULL);
1578}
1579
1580static void rbd_osd_req_format_write(struct rbd_obj_request *obj_request)
1581{
1582 struct rbd_img_request *img_request = obj_request->img_request;
1583 struct ceph_osd_request *osd_req = obj_request->osd_req;
1584 struct ceph_snap_context *snapc;
1585 struct timespec mtime = CURRENT_TIME;
1586
1587 rbd_assert(osd_req != NULL);
1588
1589 snapc = img_request ? img_request->snapc : NULL;
1590 ceph_osdc_build_request(osd_req, obj_request->offset,
1591 snapc, CEPH_NOSNAP, &mtime);
Alex Elder430c28c2013-04-03 21:32:51 -05001592}
1593
Alex Elderbf0d5f502012-11-22 00:00:08 -06001594static struct ceph_osd_request *rbd_osd_req_create(
1595 struct rbd_device *rbd_dev,
1596 bool write_request,
Alex Elder430c28c2013-04-03 21:32:51 -05001597 struct rbd_obj_request *obj_request)
Alex Elderbf0d5f502012-11-22 00:00:08 -06001598{
Alex Elderbf0d5f502012-11-22 00:00:08 -06001599 struct ceph_snap_context *snapc = NULL;
1600 struct ceph_osd_client *osdc;
1601 struct ceph_osd_request *osd_req;
Alex Elderbf0d5f502012-11-22 00:00:08 -06001602
Alex Elder6365d332013-02-11 12:33:24 -06001603 if (obj_request_img_data_test(obj_request)) {
1604 struct rbd_img_request *img_request = obj_request->img_request;
1605
Alex Elder0c425242013-02-08 09:55:49 -06001606 rbd_assert(write_request ==
1607 img_request_write_test(img_request));
1608 if (write_request)
Alex Elderbf0d5f502012-11-22 00:00:08 -06001609 snapc = img_request->snapc;
Alex Elderbf0d5f502012-11-22 00:00:08 -06001610 }
1611
1612 /* Allocate and initialize the request, for the single op */
1613
1614 osdc = &rbd_dev->rbd_client->client->osdc;
1615 osd_req = ceph_osdc_alloc_request(osdc, snapc, 1, false, GFP_ATOMIC);
1616 if (!osd_req)
1617 return NULL; /* ENOMEM */
Alex Elderbf0d5f502012-11-22 00:00:08 -06001618
Alex Elder430c28c2013-04-03 21:32:51 -05001619 if (write_request)
Alex Elderbf0d5f502012-11-22 00:00:08 -06001620 osd_req->r_flags = CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK;
Alex Elder430c28c2013-04-03 21:32:51 -05001621 else
Alex Elderbf0d5f502012-11-22 00:00:08 -06001622 osd_req->r_flags = CEPH_OSD_FLAG_READ;
Alex Elderbf0d5f502012-11-22 00:00:08 -06001623
1624 osd_req->r_callback = rbd_osd_req_callback;
1625 osd_req->r_priv = obj_request;
1626
1627 osd_req->r_oid_len = strlen(obj_request->object_name);
1628 rbd_assert(osd_req->r_oid_len < sizeof (osd_req->r_oid));
1629 memcpy(osd_req->r_oid, obj_request->object_name, osd_req->r_oid_len);
1630
1631 osd_req->r_file_layout = rbd_dev->layout; /* struct */
1632
Alex Elderbf0d5f502012-11-22 00:00:08 -06001633 return osd_req;
1634}
1635
Alex Elder0eefd472013-04-19 15:34:50 -05001636/*
1637 * Create a copyup osd request based on the information in the
1638 * object request supplied. A copyup request has two osd ops,
1639 * a copyup method call, and a "normal" write request.
1640 */
1641static struct ceph_osd_request *
1642rbd_osd_req_create_copyup(struct rbd_obj_request *obj_request)
1643{
1644 struct rbd_img_request *img_request;
1645 struct ceph_snap_context *snapc;
1646 struct rbd_device *rbd_dev;
1647 struct ceph_osd_client *osdc;
1648 struct ceph_osd_request *osd_req;
1649
1650 rbd_assert(obj_request_img_data_test(obj_request));
1651 img_request = obj_request->img_request;
1652 rbd_assert(img_request);
1653 rbd_assert(img_request_write_test(img_request));
1654
1655 /* Allocate and initialize the request, for the two ops */
1656
1657 snapc = img_request->snapc;
1658 rbd_dev = img_request->rbd_dev;
1659 osdc = &rbd_dev->rbd_client->client->osdc;
1660 osd_req = ceph_osdc_alloc_request(osdc, snapc, 2, false, GFP_ATOMIC);
1661 if (!osd_req)
1662 return NULL; /* ENOMEM */
1663
1664 osd_req->r_flags = CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK;
1665 osd_req->r_callback = rbd_osd_req_callback;
1666 osd_req->r_priv = obj_request;
1667
1668 osd_req->r_oid_len = strlen(obj_request->object_name);
1669 rbd_assert(osd_req->r_oid_len < sizeof (osd_req->r_oid));
1670 memcpy(osd_req->r_oid, obj_request->object_name, osd_req->r_oid_len);
1671
1672 osd_req->r_file_layout = rbd_dev->layout; /* struct */
1673
1674 return osd_req;
1675}
1676
1677
/* Drop a reference to an osd request created by this driver. */
static void rbd_osd_req_destroy(struct ceph_osd_request *osd_req)
{
	ceph_osdc_put_request(osd_req);
}
1682
1683/* object_name is assumed to be a non-null pointer and NUL-terminated */
1684
1685static struct rbd_obj_request *rbd_obj_request_create(const char *object_name,
1686 u64 offset, u64 length,
1687 enum obj_request_type type)
1688{
1689 struct rbd_obj_request *obj_request;
1690 size_t size;
1691 char *name;
1692
1693 rbd_assert(obj_request_type_valid(type));
1694
1695 size = strlen(object_name) + 1;
1696 obj_request = kzalloc(sizeof (*obj_request) + size, GFP_KERNEL);
1697 if (!obj_request)
1698 return NULL;
1699
1700 name = (char *)(obj_request + 1);
1701 obj_request->object_name = memcpy(name, object_name, size);
1702 obj_request->offset = offset;
1703 obj_request->length = length;
Alex Elder926f9b32013-02-11 12:33:24 -06001704 obj_request->flags = 0;
Alex Elderbf0d5f502012-11-22 00:00:08 -06001705 obj_request->which = BAD_WHICH;
1706 obj_request->type = type;
1707 INIT_LIST_HEAD(&obj_request->links);
Alex Elder788e2df2013-01-17 12:25:27 -06001708 init_completion(&obj_request->completion);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001709 kref_init(&obj_request->kref);
1710
Alex Elder37206ee2013-02-20 17:32:08 -06001711 dout("%s: \"%s\" %llu/%llu %d -> obj %p\n", __func__, object_name,
1712 offset, length, (int)type, obj_request);
1713
Alex Elderbf0d5f502012-11-22 00:00:08 -06001714 return obj_request;
1715}
1716
1717static void rbd_obj_request_destroy(struct kref *kref)
1718{
1719 struct rbd_obj_request *obj_request;
1720
1721 obj_request = container_of(kref, struct rbd_obj_request, kref);
1722
Alex Elder37206ee2013-02-20 17:32:08 -06001723 dout("%s: obj %p\n", __func__, obj_request);
1724
Alex Elderbf0d5f502012-11-22 00:00:08 -06001725 rbd_assert(obj_request->img_request == NULL);
1726 rbd_assert(obj_request->which == BAD_WHICH);
1727
1728 if (obj_request->osd_req)
1729 rbd_osd_req_destroy(obj_request->osd_req);
1730
1731 rbd_assert(obj_request_type_valid(obj_request->type));
1732 switch (obj_request->type) {
Alex Elder9969ebc2013-01-18 12:31:10 -06001733 case OBJ_REQUEST_NODATA:
1734 break; /* Nothing to do */
Alex Elderbf0d5f502012-11-22 00:00:08 -06001735 case OBJ_REQUEST_BIO:
1736 if (obj_request->bio_list)
1737 bio_chain_put(obj_request->bio_list);
1738 break;
Alex Elder788e2df2013-01-17 12:25:27 -06001739 case OBJ_REQUEST_PAGES:
1740 if (obj_request->pages)
1741 ceph_release_page_vector(obj_request->pages,
1742 obj_request->page_count);
1743 break;
Alex Elderbf0d5f502012-11-22 00:00:08 -06001744 }
1745
1746 kfree(obj_request);
1747}
1748
1749/*
1750 * Caller is responsible for filling in the list of object requests
1751 * that comprises the image request, and the Linux request pointer
1752 * (if there is one).
1753 */
Alex Eldercc344fa2013-02-19 12:25:56 -06001754static struct rbd_img_request *rbd_img_request_create(
1755 struct rbd_device *rbd_dev,
Alex Elderbf0d5f502012-11-22 00:00:08 -06001756 u64 offset, u64 length,
Alex Elder9849e982013-01-24 16:13:36 -06001757 bool write_request,
1758 bool child_request)
Alex Elderbf0d5f502012-11-22 00:00:08 -06001759{
1760 struct rbd_img_request *img_request;
Alex Elderbf0d5f502012-11-22 00:00:08 -06001761
1762 img_request = kmalloc(sizeof (*img_request), GFP_ATOMIC);
1763 if (!img_request)
1764 return NULL;
1765
1766 if (write_request) {
1767 down_read(&rbd_dev->header_rwsem);
Alex Elder468521c2013-04-26 09:43:47 -05001768 rbd_snap_context_get(rbd_dev->header.snapc);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001769 up_read(&rbd_dev->header_rwsem);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001770 }
1771
1772 img_request->rq = NULL;
1773 img_request->rbd_dev = rbd_dev;
1774 img_request->offset = offset;
1775 img_request->length = length;
Alex Elder0c425242013-02-08 09:55:49 -06001776 img_request->flags = 0;
1777 if (write_request) {
1778 img_request_write_set(img_request);
Alex Elder468521c2013-04-26 09:43:47 -05001779 img_request->snapc = rbd_dev->header.snapc;
Alex Elder0c425242013-02-08 09:55:49 -06001780 } else {
Alex Elderbf0d5f502012-11-22 00:00:08 -06001781 img_request->snap_id = rbd_dev->spec->snap_id;
Alex Elder0c425242013-02-08 09:55:49 -06001782 }
Alex Elder9849e982013-01-24 16:13:36 -06001783 if (child_request)
1784 img_request_child_set(img_request);
Alex Elderd0b2e942013-01-24 16:13:36 -06001785 if (rbd_dev->parent_spec)
1786 img_request_layered_set(img_request);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001787 spin_lock_init(&img_request->completion_lock);
1788 img_request->next_completion = 0;
1789 img_request->callback = NULL;
Alex Eldera5a337d2013-01-24 16:13:36 -06001790 img_request->result = 0;
Alex Elderbf0d5f502012-11-22 00:00:08 -06001791 img_request->obj_request_count = 0;
1792 INIT_LIST_HEAD(&img_request->obj_requests);
1793 kref_init(&img_request->kref);
1794
1795 rbd_img_request_get(img_request); /* Avoid a warning */
1796 rbd_img_request_put(img_request); /* TEMPORARY */
1797
Alex Elder37206ee2013-02-20 17:32:08 -06001798 dout("%s: rbd_dev %p %s %llu/%llu -> img %p\n", __func__, rbd_dev,
1799 write_request ? "write" : "read", offset, length,
1800 img_request);
1801
Alex Elderbf0d5f502012-11-22 00:00:08 -06001802 return img_request;
1803}
1804
1805static void rbd_img_request_destroy(struct kref *kref)
1806{
1807 struct rbd_img_request *img_request;
1808 struct rbd_obj_request *obj_request;
1809 struct rbd_obj_request *next_obj_request;
1810
1811 img_request = container_of(kref, struct rbd_img_request, kref);
1812
Alex Elder37206ee2013-02-20 17:32:08 -06001813 dout("%s: img %p\n", __func__, img_request);
1814
Alex Elderbf0d5f502012-11-22 00:00:08 -06001815 for_each_obj_request_safe(img_request, obj_request, next_obj_request)
1816 rbd_img_obj_request_del(img_request, obj_request);
Alex Elder25dcf952013-01-25 17:08:55 -06001817 rbd_assert(img_request->obj_request_count == 0);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001818
Alex Elder0c425242013-02-08 09:55:49 -06001819 if (img_request_write_test(img_request))
Alex Elder468521c2013-04-26 09:43:47 -05001820 rbd_snap_context_put(img_request->snapc);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001821
Alex Elder8b3e1a52013-01-24 16:13:36 -06001822 if (img_request_child_test(img_request))
1823 rbd_obj_request_put(img_request->obj_request);
1824
Alex Elderbf0d5f502012-11-22 00:00:08 -06001825 kfree(img_request);
1826}
1827
Alex Elder12178572013-02-08 09:55:49 -06001828static bool rbd_img_obj_end_request(struct rbd_obj_request *obj_request)
1829{
Alex Elder6365d332013-02-11 12:33:24 -06001830 struct rbd_img_request *img_request;
Alex Elder12178572013-02-08 09:55:49 -06001831 unsigned int xferred;
1832 int result;
Alex Elder8b3e1a52013-01-24 16:13:36 -06001833 bool more;
Alex Elder12178572013-02-08 09:55:49 -06001834
Alex Elder6365d332013-02-11 12:33:24 -06001835 rbd_assert(obj_request_img_data_test(obj_request));
1836 img_request = obj_request->img_request;
1837
Alex Elder12178572013-02-08 09:55:49 -06001838 rbd_assert(obj_request->xferred <= (u64)UINT_MAX);
1839 xferred = (unsigned int)obj_request->xferred;
1840 result = obj_request->result;
1841 if (result) {
1842 struct rbd_device *rbd_dev = img_request->rbd_dev;
1843
1844 rbd_warn(rbd_dev, "%s %llx at %llx (%llx)\n",
1845 img_request_write_test(img_request) ? "write" : "read",
1846 obj_request->length, obj_request->img_offset,
1847 obj_request->offset);
1848 rbd_warn(rbd_dev, " result %d xferred %x\n",
1849 result, xferred);
1850 if (!img_request->result)
1851 img_request->result = result;
1852 }
1853
Alex Elderf1a47392013-04-19 15:34:50 -05001854 /* Image object requests don't own their page array */
1855
1856 if (obj_request->type == OBJ_REQUEST_PAGES) {
1857 obj_request->pages = NULL;
1858 obj_request->page_count = 0;
1859 }
1860
Alex Elder8b3e1a52013-01-24 16:13:36 -06001861 if (img_request_child_test(img_request)) {
1862 rbd_assert(img_request->obj_request != NULL);
1863 more = obj_request->which < img_request->obj_request_count - 1;
1864 } else {
1865 rbd_assert(img_request->rq != NULL);
1866 more = blk_end_request(img_request->rq, result, xferred);
1867 }
1868
1869 return more;
Alex Elder12178572013-02-08 09:55:49 -06001870}
1871
Alex Elder21692382013-04-05 01:27:12 -05001872static void rbd_img_obj_callback(struct rbd_obj_request *obj_request)
1873{
1874 struct rbd_img_request *img_request;
1875 u32 which = obj_request->which;
1876 bool more = true;
1877
Alex Elder6365d332013-02-11 12:33:24 -06001878 rbd_assert(obj_request_img_data_test(obj_request));
Alex Elder21692382013-04-05 01:27:12 -05001879 img_request = obj_request->img_request;
1880
1881 dout("%s: img %p obj %p\n", __func__, img_request, obj_request);
1882 rbd_assert(img_request != NULL);
Alex Elder21692382013-04-05 01:27:12 -05001883 rbd_assert(img_request->obj_request_count > 0);
1884 rbd_assert(which != BAD_WHICH);
1885 rbd_assert(which < img_request->obj_request_count);
1886 rbd_assert(which >= img_request->next_completion);
1887
1888 spin_lock_irq(&img_request->completion_lock);
1889 if (which != img_request->next_completion)
1890 goto out;
1891
1892 for_each_obj_request_from(img_request, obj_request) {
Alex Elder21692382013-04-05 01:27:12 -05001893 rbd_assert(more);
1894 rbd_assert(which < img_request->obj_request_count);
1895
1896 if (!obj_request_done_test(obj_request))
1897 break;
Alex Elder12178572013-02-08 09:55:49 -06001898 more = rbd_img_obj_end_request(obj_request);
Alex Elder21692382013-04-05 01:27:12 -05001899 which++;
1900 }
1901
1902 rbd_assert(more ^ (which == img_request->obj_request_count));
1903 img_request->next_completion = which;
1904out:
1905 spin_unlock_irq(&img_request->completion_lock);
1906
1907 if (!more)
1908 rbd_img_request_complete(img_request);
1909}
1910
Alex Elderf1a47392013-04-19 15:34:50 -05001911/*
1912 * Split up an image request into one or more object requests, each
1913 * to a different object. The "type" parameter indicates whether
1914 * "data_desc" is the pointer to the head of a list of bio
1915 * structures, or the base of a page array. In either case this
1916 * function assumes data_desc describes memory sufficient to hold
1917 * all data described by the image request.
1918 */
1919static int rbd_img_request_fill(struct rbd_img_request *img_request,
1920 enum obj_request_type type,
1921 void *data_desc)
Alex Elderbf0d5f502012-11-22 00:00:08 -06001922{
1923 struct rbd_device *rbd_dev = img_request->rbd_dev;
1924 struct rbd_obj_request *obj_request = NULL;
1925 struct rbd_obj_request *next_obj_request;
Alex Elder0c425242013-02-08 09:55:49 -06001926 bool write_request = img_request_write_test(img_request);
Alex Elderf1a47392013-04-19 15:34:50 -05001927 struct bio *bio_list;
1928 unsigned int bio_offset = 0;
1929 struct page **pages;
Alex Elder7da22d22013-01-24 16:13:36 -06001930 u64 img_offset;
Alex Elderbf0d5f502012-11-22 00:00:08 -06001931 u64 resid;
1932 u16 opcode;
1933
Alex Elderf1a47392013-04-19 15:34:50 -05001934 dout("%s: img %p type %d data_desc %p\n", __func__, img_request,
1935 (int)type, data_desc);
Alex Elder37206ee2013-02-20 17:32:08 -06001936
Alex Elder430c28c2013-04-03 21:32:51 -05001937 opcode = write_request ? CEPH_OSD_OP_WRITE : CEPH_OSD_OP_READ;
Alex Elder7da22d22013-01-24 16:13:36 -06001938 img_offset = img_request->offset;
Alex Elderbf0d5f502012-11-22 00:00:08 -06001939 resid = img_request->length;
Alex Elder4dda41d2013-02-20 21:59:33 -06001940 rbd_assert(resid > 0);
Alex Elderf1a47392013-04-19 15:34:50 -05001941
1942 if (type == OBJ_REQUEST_BIO) {
1943 bio_list = data_desc;
1944 rbd_assert(img_offset == bio_list->bi_sector << SECTOR_SHIFT);
1945 } else {
1946 rbd_assert(type == OBJ_REQUEST_PAGES);
1947 pages = data_desc;
1948 }
1949
Alex Elderbf0d5f502012-11-22 00:00:08 -06001950 while (resid) {
Alex Elder2fa12322013-04-05 01:27:12 -05001951 struct ceph_osd_request *osd_req;
Alex Elderbf0d5f502012-11-22 00:00:08 -06001952 const char *object_name;
Alex Elderbf0d5f502012-11-22 00:00:08 -06001953 u64 offset;
1954 u64 length;
1955
Alex Elder7da22d22013-01-24 16:13:36 -06001956 object_name = rbd_segment_name(rbd_dev, img_offset);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001957 if (!object_name)
1958 goto out_unwind;
Alex Elder7da22d22013-01-24 16:13:36 -06001959 offset = rbd_segment_offset(rbd_dev, img_offset);
1960 length = rbd_segment_length(rbd_dev, img_offset, resid);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001961 obj_request = rbd_obj_request_create(object_name,
Alex Elderf1a47392013-04-19 15:34:50 -05001962 offset, length, type);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001963 kfree(object_name); /* object request has its own copy */
1964 if (!obj_request)
1965 goto out_unwind;
1966
Alex Elderf1a47392013-04-19 15:34:50 -05001967 if (type == OBJ_REQUEST_BIO) {
1968 unsigned int clone_size;
1969
1970 rbd_assert(length <= (u64)UINT_MAX);
1971 clone_size = (unsigned int)length;
1972 obj_request->bio_list =
1973 bio_chain_clone_range(&bio_list,
1974 &bio_offset,
1975 clone_size,
1976 GFP_ATOMIC);
1977 if (!obj_request->bio_list)
1978 goto out_partial;
1979 } else {
1980 unsigned int page_count;
1981
1982 obj_request->pages = pages;
1983 page_count = (u32)calc_pages_for(offset, length);
1984 obj_request->page_count = page_count;
1985 if ((offset + length) & ~PAGE_MASK)
1986 page_count--; /* more on last page */
1987 pages += page_count;
1988 }
Alex Elderbf0d5f502012-11-22 00:00:08 -06001989
Alex Elder2fa12322013-04-05 01:27:12 -05001990 osd_req = rbd_osd_req_create(rbd_dev, write_request,
1991 obj_request);
1992 if (!osd_req)
Alex Elderbf0d5f502012-11-22 00:00:08 -06001993 goto out_partial;
Alex Elder2fa12322013-04-05 01:27:12 -05001994 obj_request->osd_req = osd_req;
Alex Elder21692382013-04-05 01:27:12 -05001995 obj_request->callback = rbd_img_obj_callback;
Alex Elder430c28c2013-04-03 21:32:51 -05001996
Alex Elder2fa12322013-04-05 01:27:12 -05001997 osd_req_op_extent_init(osd_req, 0, opcode, offset, length,
1998 0, 0);
Alex Elderf1a47392013-04-19 15:34:50 -05001999 if (type == OBJ_REQUEST_BIO)
2000 osd_req_op_extent_osd_data_bio(osd_req, 0,
2001 obj_request->bio_list, length);
2002 else
2003 osd_req_op_extent_osd_data_pages(osd_req, 0,
2004 obj_request->pages, length,
2005 offset & ~PAGE_MASK, false, false);
Alex Elder9d4df012013-04-19 15:34:50 -05002006
2007 if (write_request)
2008 rbd_osd_req_format_write(obj_request);
2009 else
2010 rbd_osd_req_format_read(obj_request);
Alex Elder430c28c2013-04-03 21:32:51 -05002011
Alex Elder7da22d22013-01-24 16:13:36 -06002012 obj_request->img_offset = img_offset;
Alex Elderbf0d5f502012-11-22 00:00:08 -06002013 rbd_img_obj_request_add(img_request, obj_request);
2014
Alex Elder7da22d22013-01-24 16:13:36 -06002015 img_offset += length;
Alex Elderbf0d5f502012-11-22 00:00:08 -06002016 resid -= length;
2017 }
2018
2019 return 0;
2020
2021out_partial:
2022 rbd_obj_request_put(obj_request);
2023out_unwind:
2024 for_each_obj_request_safe(img_request, obj_request, next_obj_request)
2025 rbd_obj_request_put(obj_request);
2026
2027 return -ENOMEM;
2028}
2029
Alex Elder3d7efd12013-04-19 15:34:50 -05002030static void
Alex Elder0eefd472013-04-19 15:34:50 -05002031rbd_img_obj_copyup_callback(struct rbd_obj_request *obj_request)
2032{
2033 struct rbd_img_request *img_request;
2034 struct rbd_device *rbd_dev;
2035 u64 length;
2036 u32 page_count;
2037
2038 rbd_assert(obj_request->type == OBJ_REQUEST_BIO);
2039 rbd_assert(obj_request_img_data_test(obj_request));
2040 img_request = obj_request->img_request;
2041 rbd_assert(img_request);
2042
2043 rbd_dev = img_request->rbd_dev;
2044 rbd_assert(rbd_dev);
2045 length = (u64)1 << rbd_dev->header.obj_order;
2046 page_count = (u32)calc_pages_for(0, length);
2047
2048 rbd_assert(obj_request->copyup_pages);
2049 ceph_release_page_vector(obj_request->copyup_pages, page_count);
2050 obj_request->copyup_pages = NULL;
2051
2052 /*
2053 * We want the transfer count to reflect the size of the
2054 * original write request. There is no such thing as a
2055 * successful short write, so if the request was successful
2056 * we can just set it to the originally-requested length.
2057 */
2058 if (!obj_request->result)
2059 obj_request->xferred = obj_request->length;
2060
2061 /* Finish up with the normal image object callback */
2062
2063 rbd_img_obj_callback(obj_request);
2064}
2065
2066static void
Alex Elder3d7efd12013-04-19 15:34:50 -05002067rbd_img_obj_parent_read_full_callback(struct rbd_img_request *img_request)
2068{
2069 struct rbd_obj_request *orig_request;
Alex Elder0eefd472013-04-19 15:34:50 -05002070 struct ceph_osd_request *osd_req;
2071 struct ceph_osd_client *osdc;
2072 struct rbd_device *rbd_dev;
Alex Elder3d7efd12013-04-19 15:34:50 -05002073 struct page **pages;
Alex Elder3d7efd12013-04-19 15:34:50 -05002074 int result;
2075 u64 obj_size;
2076 u64 xferred;
2077
2078 rbd_assert(img_request_child_test(img_request));
2079
2080 /* First get what we need from the image request */
2081
2082 pages = img_request->copyup_pages;
2083 rbd_assert(pages != NULL);
2084 img_request->copyup_pages = NULL;
2085
2086 orig_request = img_request->obj_request;
2087 rbd_assert(orig_request != NULL);
Alex Elder0eefd472013-04-19 15:34:50 -05002088 rbd_assert(orig_request->type == OBJ_REQUEST_BIO);
Alex Elder3d7efd12013-04-19 15:34:50 -05002089 result = img_request->result;
2090 obj_size = img_request->length;
2091 xferred = img_request->xferred;
2092
Alex Elder0eefd472013-04-19 15:34:50 -05002093 rbd_dev = img_request->rbd_dev;
2094 rbd_assert(rbd_dev);
2095 rbd_assert(obj_size == (u64)1 << rbd_dev->header.obj_order);
2096
Alex Elder3d7efd12013-04-19 15:34:50 -05002097 rbd_img_request_put(img_request);
2098
Alex Elder0eefd472013-04-19 15:34:50 -05002099 if (result)
2100 goto out_err;
Alex Elder3d7efd12013-04-19 15:34:50 -05002101
Alex Elder0eefd472013-04-19 15:34:50 -05002102 /* Allocate the new copyup osd request for the original request */
Alex Elder3d7efd12013-04-19 15:34:50 -05002103
Alex Elder0eefd472013-04-19 15:34:50 -05002104 result = -ENOMEM;
2105 rbd_assert(!orig_request->osd_req);
2106 osd_req = rbd_osd_req_create_copyup(orig_request);
2107 if (!osd_req)
2108 goto out_err;
2109 orig_request->osd_req = osd_req;
2110 orig_request->copyup_pages = pages;
Alex Elder3d7efd12013-04-19 15:34:50 -05002111
Alex Elder0eefd472013-04-19 15:34:50 -05002112 /* Initialize the copyup op */
2113
2114 osd_req_op_cls_init(osd_req, 0, CEPH_OSD_OP_CALL, "rbd", "copyup");
2115 osd_req_op_cls_request_data_pages(osd_req, 0, pages, obj_size, 0,
2116 false, false);
2117
2118 /* Then the original write request op */
2119
2120 osd_req_op_extent_init(osd_req, 1, CEPH_OSD_OP_WRITE,
2121 orig_request->offset,
2122 orig_request->length, 0, 0);
2123 osd_req_op_extent_osd_data_bio(osd_req, 1, orig_request->bio_list,
2124 orig_request->length);
2125
2126 rbd_osd_req_format_write(orig_request);
2127
2128 /* All set, send it off. */
2129
2130 orig_request->callback = rbd_img_obj_copyup_callback;
2131 osdc = &rbd_dev->rbd_client->client->osdc;
2132 result = rbd_obj_request_submit(osdc, orig_request);
2133 if (!result)
2134 return;
2135out_err:
2136 /* Record the error code and complete the request */
2137
2138 orig_request->result = result;
2139 orig_request->xferred = 0;
2140 obj_request_done_set(orig_request);
2141 rbd_obj_request_complete(orig_request);
Alex Elder3d7efd12013-04-19 15:34:50 -05002142}
2143
2144/*
2145 * Read from the parent image the range of data that covers the
2146 * entire target of the given object request. This is used for
2147 * satisfying a layered image write request when the target of an
2148 * object request from the image request does not exist.
2149 *
2150 * A page array big enough to hold the returned data is allocated
2151 * and supplied to rbd_img_request_fill() as the "data descriptor."
2152 * When the read completes, this page array will be transferred to
2153 * the original object request for the copyup operation.
2154 *
2155 * If an error occurs, record it as the result of the original
2156 * object request and mark it done so it gets completed.
2157 */
2158static int rbd_img_obj_parent_read_full(struct rbd_obj_request *obj_request)
2159{
2160 struct rbd_img_request *img_request = NULL;
2161 struct rbd_img_request *parent_request = NULL;
2162 struct rbd_device *rbd_dev;
2163 u64 img_offset;
2164 u64 length;
2165 struct page **pages = NULL;
2166 u32 page_count;
2167 int result;
2168
2169 rbd_assert(obj_request_img_data_test(obj_request));
2170 rbd_assert(obj_request->type == OBJ_REQUEST_BIO);
2171
2172 img_request = obj_request->img_request;
2173 rbd_assert(img_request != NULL);
2174 rbd_dev = img_request->rbd_dev;
2175 rbd_assert(rbd_dev->parent != NULL);
2176
2177 /*
Alex Elder0eefd472013-04-19 15:34:50 -05002178 * First things first. The original osd request is of no
2179 * use to use any more, we'll need a new one that can hold
2180 * the two ops in a copyup request. We'll get that later,
2181 * but for now we can release the old one.
2182 */
2183 rbd_osd_req_destroy(obj_request->osd_req);
2184 obj_request->osd_req = NULL;
2185
2186 /*
Alex Elder3d7efd12013-04-19 15:34:50 -05002187 * Determine the byte range covered by the object in the
2188 * child image to which the original request was to be sent.
2189 */
2190 img_offset = obj_request->img_offset - obj_request->offset;
2191 length = (u64)1 << rbd_dev->header.obj_order;
2192
2193 /*
Alex Eldera9e8ba2c2013-04-21 00:32:07 -05002194 * There is no defined parent data beyond the parent
2195 * overlap, so limit what we read at that boundary if
2196 * necessary.
2197 */
2198 if (img_offset + length > rbd_dev->parent_overlap) {
2199 rbd_assert(img_offset < rbd_dev->parent_overlap);
2200 length = rbd_dev->parent_overlap - img_offset;
2201 }
2202
2203 /*
Alex Elder3d7efd12013-04-19 15:34:50 -05002204 * Allocate a page array big enough to receive the data read
2205 * from the parent.
2206 */
2207 page_count = (u32)calc_pages_for(0, length);
2208 pages = ceph_alloc_page_vector(page_count, GFP_KERNEL);
2209 if (IS_ERR(pages)) {
2210 result = PTR_ERR(pages);
2211 pages = NULL;
2212 goto out_err;
2213 }
2214
2215 result = -ENOMEM;
2216 parent_request = rbd_img_request_create(rbd_dev->parent,
2217 img_offset, length,
2218 false, true);
2219 if (!parent_request)
2220 goto out_err;
2221 rbd_obj_request_get(obj_request);
2222 parent_request->obj_request = obj_request;
2223
2224 result = rbd_img_request_fill(parent_request, OBJ_REQUEST_PAGES, pages);
2225 if (result)
2226 goto out_err;
2227 parent_request->copyup_pages = pages;
2228
2229 parent_request->callback = rbd_img_obj_parent_read_full_callback;
2230 result = rbd_img_request_submit(parent_request);
2231 if (!result)
2232 return 0;
2233
2234 parent_request->copyup_pages = NULL;
2235 parent_request->obj_request = NULL;
2236 rbd_obj_request_put(obj_request);
2237out_err:
2238 if (pages)
2239 ceph_release_page_vector(pages, page_count);
2240 if (parent_request)
2241 rbd_img_request_put(parent_request);
2242 obj_request->result = result;
2243 obj_request->xferred = 0;
2244 obj_request_done_set(obj_request);
2245
2246 return result;
2247}
2248
Alex Elderc5b5ef62013-02-11 12:33:24 -06002249static void rbd_img_obj_exists_callback(struct rbd_obj_request *obj_request)
2250{
Alex Elderc5b5ef62013-02-11 12:33:24 -06002251 struct rbd_obj_request *orig_request;
2252 int result;
2253
2254 rbd_assert(!obj_request_img_data_test(obj_request));
2255
2256 /*
2257 * All we need from the object request is the original
2258 * request and the result of the STAT op. Grab those, then
2259 * we're done with the request.
2260 */
2261 orig_request = obj_request->obj_request;
2262 obj_request->obj_request = NULL;
2263 rbd_assert(orig_request);
2264 rbd_assert(orig_request->img_request);
2265
2266 result = obj_request->result;
2267 obj_request->result = 0;
2268
2269 dout("%s: obj %p for obj %p result %d %llu/%llu\n", __func__,
2270 obj_request, orig_request, result,
2271 obj_request->xferred, obj_request->length);
2272 rbd_obj_request_put(obj_request);
2273
2274 rbd_assert(orig_request);
2275 rbd_assert(orig_request->img_request);
Alex Elderc5b5ef62013-02-11 12:33:24 -06002276
2277 /*
2278 * Our only purpose here is to determine whether the object
2279 * exists, and we don't want to treat the non-existence as
2280 * an error. If something else comes back, transfer the
2281 * error to the original request and complete it now.
2282 */
2283 if (!result) {
2284 obj_request_existence_set(orig_request, true);
2285 } else if (result == -ENOENT) {
2286 obj_request_existence_set(orig_request, false);
2287 } else if (result) {
2288 orig_request->result = result;
Alex Elder3d7efd12013-04-19 15:34:50 -05002289 goto out;
Alex Elderc5b5ef62013-02-11 12:33:24 -06002290 }
2291
2292 /*
2293 * Resubmit the original request now that we have recorded
2294 * whether the target object exists.
2295 */
Alex Elderb454e362013-04-19 15:34:50 -05002296 orig_request->result = rbd_img_obj_request_submit(orig_request);
Alex Elder3d7efd12013-04-19 15:34:50 -05002297out:
Alex Elderc5b5ef62013-02-11 12:33:24 -06002298 if (orig_request->result)
2299 rbd_obj_request_complete(orig_request);
2300 rbd_obj_request_put(orig_request);
2301}
2302
2303static int rbd_img_obj_exists_submit(struct rbd_obj_request *obj_request)
2304{
2305 struct rbd_obj_request *stat_request;
2306 struct rbd_device *rbd_dev;
2307 struct ceph_osd_client *osdc;
2308 struct page **pages = NULL;
2309 u32 page_count;
2310 size_t size;
2311 int ret;
2312
2313 /*
2314 * The response data for a STAT call consists of:
2315 * le64 length;
2316 * struct {
2317 * le32 tv_sec;
2318 * le32 tv_nsec;
2319 * } mtime;
2320 */
2321 size = sizeof (__le64) + sizeof (__le32) + sizeof (__le32);
2322 page_count = (u32)calc_pages_for(0, size);
2323 pages = ceph_alloc_page_vector(page_count, GFP_KERNEL);
2324 if (IS_ERR(pages))
2325 return PTR_ERR(pages);
2326
2327 ret = -ENOMEM;
2328 stat_request = rbd_obj_request_create(obj_request->object_name, 0, 0,
2329 OBJ_REQUEST_PAGES);
2330 if (!stat_request)
2331 goto out;
2332
2333 rbd_obj_request_get(obj_request);
2334 stat_request->obj_request = obj_request;
2335 stat_request->pages = pages;
2336 stat_request->page_count = page_count;
2337
2338 rbd_assert(obj_request->img_request);
2339 rbd_dev = obj_request->img_request->rbd_dev;
2340 stat_request->osd_req = rbd_osd_req_create(rbd_dev, false,
2341 stat_request);
2342 if (!stat_request->osd_req)
2343 goto out;
2344 stat_request->callback = rbd_img_obj_exists_callback;
2345
2346 osd_req_op_init(stat_request->osd_req, 0, CEPH_OSD_OP_STAT);
2347 osd_req_op_raw_data_in_pages(stat_request->osd_req, 0, pages, size, 0,
2348 false, false);
Alex Elder9d4df012013-04-19 15:34:50 -05002349 rbd_osd_req_format_read(stat_request);
Alex Elderc5b5ef62013-02-11 12:33:24 -06002350
2351 osdc = &rbd_dev->rbd_client->client->osdc;
2352 ret = rbd_obj_request_submit(osdc, stat_request);
2353out:
2354 if (ret)
2355 rbd_obj_request_put(obj_request);
2356
2357 return ret;
2358}
2359
Alex Elderb454e362013-04-19 15:34:50 -05002360static int rbd_img_obj_request_submit(struct rbd_obj_request *obj_request)
2361{
2362 struct rbd_img_request *img_request;
Alex Eldera9e8ba2c2013-04-21 00:32:07 -05002363 struct rbd_device *rbd_dev;
Alex Elder3d7efd12013-04-19 15:34:50 -05002364 bool known;
Alex Elderb454e362013-04-19 15:34:50 -05002365
2366 rbd_assert(obj_request_img_data_test(obj_request));
2367
2368 img_request = obj_request->img_request;
2369 rbd_assert(img_request);
Alex Eldera9e8ba2c2013-04-21 00:32:07 -05002370 rbd_dev = img_request->rbd_dev;
Alex Elderb454e362013-04-19 15:34:50 -05002371
Alex Elderb454e362013-04-19 15:34:50 -05002372 /*
Alex Eldera9e8ba2c2013-04-21 00:32:07 -05002373 * Only writes to layered images need special handling.
2374 * Reads and non-layered writes are simple object requests.
2375 * Layered writes that start beyond the end of the overlap
2376 * with the parent have no parent data, so they too are
2377 * simple object requests. Finally, if the target object is
2378 * known to already exist, its parent data has already been
2379 * copied, so a write to the object can also be handled as a
2380 * simple object request.
Alex Elderb454e362013-04-19 15:34:50 -05002381 */
2382 if (!img_request_write_test(img_request) ||
2383 !img_request_layered_test(img_request) ||
Alex Eldera9e8ba2c2013-04-21 00:32:07 -05002384 rbd_dev->parent_overlap <= obj_request->img_offset ||
Alex Elder3d7efd12013-04-19 15:34:50 -05002385 ((known = obj_request_known_test(obj_request)) &&
2386 obj_request_exists_test(obj_request))) {
Alex Elderb454e362013-04-19 15:34:50 -05002387
2388 struct rbd_device *rbd_dev;
2389 struct ceph_osd_client *osdc;
2390
2391 rbd_dev = obj_request->img_request->rbd_dev;
2392 osdc = &rbd_dev->rbd_client->client->osdc;
2393
2394 return rbd_obj_request_submit(osdc, obj_request);
2395 }
2396
2397 /*
Alex Elder3d7efd12013-04-19 15:34:50 -05002398 * It's a layered write. The target object might exist but
2399 * we may not know that yet. If we know it doesn't exist,
2400 * start by reading the data for the full target object from
2401 * the parent so we can use it for a copyup to the target.
Alex Elderb454e362013-04-19 15:34:50 -05002402 */
Alex Elder3d7efd12013-04-19 15:34:50 -05002403 if (known)
2404 return rbd_img_obj_parent_read_full(obj_request);
2405
2406 /* We don't know whether the target exists. Go find out. */
Alex Elderb454e362013-04-19 15:34:50 -05002407
2408 return rbd_img_obj_exists_submit(obj_request);
2409}
2410
Alex Elderbf0d5f502012-11-22 00:00:08 -06002411static int rbd_img_request_submit(struct rbd_img_request *img_request)
2412{
Alex Elderbf0d5f502012-11-22 00:00:08 -06002413 struct rbd_obj_request *obj_request;
Alex Elder46faeed2013-04-10 17:47:46 -05002414 struct rbd_obj_request *next_obj_request;
Alex Elderbf0d5f502012-11-22 00:00:08 -06002415
Alex Elder37206ee2013-02-20 17:32:08 -06002416 dout("%s: img %p\n", __func__, img_request);
Alex Elder46faeed2013-04-10 17:47:46 -05002417 for_each_obj_request_safe(img_request, obj_request, next_obj_request) {
Alex Elderbf0d5f502012-11-22 00:00:08 -06002418 int ret;
2419
Alex Elderb454e362013-04-19 15:34:50 -05002420 ret = rbd_img_obj_request_submit(obj_request);
Alex Elderbf0d5f502012-11-22 00:00:08 -06002421 if (ret)
2422 return ret;
Alex Elderbf0d5f502012-11-22 00:00:08 -06002423 }
2424
2425 return 0;
2426}
2427
Alex Elder8b3e1a52013-01-24 16:13:36 -06002428static void rbd_img_parent_read_callback(struct rbd_img_request *img_request)
2429{
2430 struct rbd_obj_request *obj_request;
Alex Eldera9e8ba2c2013-04-21 00:32:07 -05002431 struct rbd_device *rbd_dev;
2432 u64 obj_end;
Alex Elder8b3e1a52013-01-24 16:13:36 -06002433
2434 rbd_assert(img_request_child_test(img_request));
2435
2436 obj_request = img_request->obj_request;
Alex Eldera9e8ba2c2013-04-21 00:32:07 -05002437 rbd_assert(obj_request);
2438 rbd_assert(obj_request->img_request);
Alex Elder8b3e1a52013-01-24 16:13:36 -06002439
Alex Eldera9e8ba2c2013-04-21 00:32:07 -05002440 obj_request->result = img_request->result;
2441 if (obj_request->result)
2442 goto out;
2443
2444 /*
2445 * We need to zero anything beyond the parent overlap
2446 * boundary. Since rbd_img_obj_request_read_callback()
2447 * will zero anything beyond the end of a short read, an
2448 * easy way to do this is to pretend the data from the
2449 * parent came up short--ending at the overlap boundary.
2450 */
2451 rbd_assert(obj_request->img_offset < U64_MAX - obj_request->length);
2452 obj_end = obj_request->img_offset + obj_request->length;
2453 rbd_dev = obj_request->img_request->rbd_dev;
2454 if (obj_end > rbd_dev->parent_overlap) {
2455 u64 xferred = 0;
2456
2457 if (obj_request->img_offset < rbd_dev->parent_overlap)
2458 xferred = rbd_dev->parent_overlap -
2459 obj_request->img_offset;
2460
2461 obj_request->xferred = min(img_request->xferred, xferred);
2462 } else {
2463 obj_request->xferred = img_request->xferred;
2464 }
2465out:
Alex Elder8b3e1a52013-01-24 16:13:36 -06002466 rbd_img_obj_request_read_callback(obj_request);
2467 rbd_obj_request_complete(obj_request);
2468}
2469
2470static void rbd_img_parent_read(struct rbd_obj_request *obj_request)
2471{
2472 struct rbd_device *rbd_dev;
2473 struct rbd_img_request *img_request;
2474 int result;
2475
2476 rbd_assert(obj_request_img_data_test(obj_request));
2477 rbd_assert(obj_request->img_request != NULL);
2478 rbd_assert(obj_request->result == (s32) -ENOENT);
2479 rbd_assert(obj_request->type == OBJ_REQUEST_BIO);
2480
2481 rbd_dev = obj_request->img_request->rbd_dev;
2482 rbd_assert(rbd_dev->parent != NULL);
2483 /* rbd_read_finish(obj_request, obj_request->length); */
2484 img_request = rbd_img_request_create(rbd_dev->parent,
2485 obj_request->img_offset,
2486 obj_request->length,
2487 false, true);
2488 result = -ENOMEM;
2489 if (!img_request)
2490 goto out_err;
2491
2492 rbd_obj_request_get(obj_request);
2493 img_request->obj_request = obj_request;
2494
Alex Elderf1a47392013-04-19 15:34:50 -05002495 result = rbd_img_request_fill(img_request, OBJ_REQUEST_BIO,
2496 obj_request->bio_list);
Alex Elder8b3e1a52013-01-24 16:13:36 -06002497 if (result)
2498 goto out_err;
2499
2500 img_request->callback = rbd_img_parent_read_callback;
2501 result = rbd_img_request_submit(img_request);
2502 if (result)
2503 goto out_err;
2504
2505 return;
2506out_err:
2507 if (img_request)
2508 rbd_img_request_put(img_request);
2509 obj_request->result = result;
2510 obj_request->xferred = 0;
2511 obj_request_done_set(obj_request);
2512}
2513
Alex Eldercf81b602013-01-17 12:18:46 -06002514static int rbd_obj_notify_ack(struct rbd_device *rbd_dev,
Alex Elderb8d70032012-11-30 17:53:04 -06002515 u64 ver, u64 notify_id)
2516{
2517 struct rbd_obj_request *obj_request;
Alex Elder21692382013-04-05 01:27:12 -05002518 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
Alex Elderb8d70032012-11-30 17:53:04 -06002519 int ret;
2520
2521 obj_request = rbd_obj_request_create(rbd_dev->header_name, 0, 0,
2522 OBJ_REQUEST_NODATA);
2523 if (!obj_request)
2524 return -ENOMEM;
2525
2526 ret = -ENOMEM;
Alex Elder430c28c2013-04-03 21:32:51 -05002527 obj_request->osd_req = rbd_osd_req_create(rbd_dev, false, obj_request);
Alex Elderb8d70032012-11-30 17:53:04 -06002528 if (!obj_request->osd_req)
2529 goto out;
Alex Elder21692382013-04-05 01:27:12 -05002530 obj_request->callback = rbd_obj_request_put;
Alex Elderb8d70032012-11-30 17:53:04 -06002531
Alex Elderc99d2d42013-04-05 01:27:11 -05002532 osd_req_op_watch_init(obj_request->osd_req, 0, CEPH_OSD_OP_NOTIFY_ACK,
2533 notify_id, ver, 0);
Alex Elder9d4df012013-04-19 15:34:50 -05002534 rbd_osd_req_format_read(obj_request);
Alex Elder430c28c2013-04-03 21:32:51 -05002535
Alex Elderb8d70032012-11-30 17:53:04 -06002536 ret = rbd_obj_request_submit(osdc, obj_request);
Alex Elderb8d70032012-11-30 17:53:04 -06002537out:
Alex Eldercf81b602013-01-17 12:18:46 -06002538 if (ret)
2539 rbd_obj_request_put(obj_request);
Alex Elderb8d70032012-11-30 17:53:04 -06002540
2541 return ret;
2542}
2543
2544static void rbd_watch_cb(u64 ver, u64 notify_id, u8 opcode, void *data)
2545{
2546 struct rbd_device *rbd_dev = (struct rbd_device *)data;
2547 u64 hver;
Alex Elderb8d70032012-11-30 17:53:04 -06002548
2549 if (!rbd_dev)
2550 return;
2551
Alex Elder37206ee2013-02-20 17:32:08 -06002552 dout("%s: \"%s\" notify_id %llu opcode %u\n", __func__,
Alex Elderb8d70032012-11-30 17:53:04 -06002553 rbd_dev->header_name, (unsigned long long) notify_id,
2554 (unsigned int) opcode);
Alex Elder522a0cc2013-04-25 15:09:41 -05002555 (void)rbd_dev_refresh(rbd_dev, &hver);
Alex Elderb8d70032012-11-30 17:53:04 -06002556
Alex Eldercf81b602013-01-17 12:18:46 -06002557 rbd_obj_notify_ack(rbd_dev, hver, notify_id);
Alex Elderb8d70032012-11-30 17:53:04 -06002558}
2559
Alex Elder9969ebc2013-01-18 12:31:10 -06002560/*
2561 * Request sync osd watch/unwatch. The value of "start" determines
2562 * whether a watch request is being initiated or torn down.
2563 */
2564static int rbd_dev_header_watch_sync(struct rbd_device *rbd_dev, int start)
2565{
2566 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
2567 struct rbd_obj_request *obj_request;
Alex Elder9969ebc2013-01-18 12:31:10 -06002568 int ret;
2569
2570 rbd_assert(start ^ !!rbd_dev->watch_event);
2571 rbd_assert(start ^ !!rbd_dev->watch_request);
2572
2573 if (start) {
Alex Elder3c663bb2013-02-15 11:42:30 -06002574 ret = ceph_osdc_create_event(osdc, rbd_watch_cb, rbd_dev,
Alex Elder9969ebc2013-01-18 12:31:10 -06002575 &rbd_dev->watch_event);
2576 if (ret < 0)
2577 return ret;
Alex Elder8eb87562013-01-25 17:08:55 -06002578 rbd_assert(rbd_dev->watch_event != NULL);
Alex Elder9969ebc2013-01-18 12:31:10 -06002579 }
2580
2581 ret = -ENOMEM;
2582 obj_request = rbd_obj_request_create(rbd_dev->header_name, 0, 0,
2583 OBJ_REQUEST_NODATA);
2584 if (!obj_request)
2585 goto out_cancel;
2586
Alex Elder430c28c2013-04-03 21:32:51 -05002587 obj_request->osd_req = rbd_osd_req_create(rbd_dev, true, obj_request);
2588 if (!obj_request->osd_req)
2589 goto out_cancel;
2590
Alex Elder8eb87562013-01-25 17:08:55 -06002591 if (start)
Alex Elder975241a2013-01-25 17:08:55 -06002592 ceph_osdc_set_request_linger(osdc, obj_request->osd_req);
Alex Elder8eb87562013-01-25 17:08:55 -06002593 else
Alex Elder6977c3f2013-01-25 17:08:55 -06002594 ceph_osdc_unregister_linger_request(osdc,
Alex Elder975241a2013-01-25 17:08:55 -06002595 rbd_dev->watch_request->osd_req);
Alex Elder21692382013-04-05 01:27:12 -05002596
2597 osd_req_op_watch_init(obj_request->osd_req, 0, CEPH_OSD_OP_WATCH,
2598 rbd_dev->watch_event->cookie,
2599 rbd_dev->header.obj_version, start);
Alex Elder9d4df012013-04-19 15:34:50 -05002600 rbd_osd_req_format_write(obj_request);
Alex Elder21692382013-04-05 01:27:12 -05002601
Alex Elder9969ebc2013-01-18 12:31:10 -06002602 ret = rbd_obj_request_submit(osdc, obj_request);
2603 if (ret)
2604 goto out_cancel;
2605 ret = rbd_obj_request_wait(obj_request);
2606 if (ret)
2607 goto out_cancel;
Alex Elder9969ebc2013-01-18 12:31:10 -06002608 ret = obj_request->result;
2609 if (ret)
2610 goto out_cancel;
2611
Alex Elder8eb87562013-01-25 17:08:55 -06002612 /*
2613 * A watch request is set to linger, so the underlying osd
2614 * request won't go away until we unregister it. We retain
2615 * a pointer to the object request during that time (in
2616 * rbd_dev->watch_request), so we'll keep a reference to
2617 * it. We'll drop that reference (below) after we've
2618 * unregistered it.
2619 */
2620 if (start) {
2621 rbd_dev->watch_request = obj_request;
2622
2623 return 0;
2624 }
2625
2626 /* We have successfully torn down the watch request */
2627
2628 rbd_obj_request_put(rbd_dev->watch_request);
2629 rbd_dev->watch_request = NULL;
Alex Elder9969ebc2013-01-18 12:31:10 -06002630out_cancel:
2631 /* Cancel the event if we're tearing down, or on error */
2632 ceph_osdc_cancel_event(rbd_dev->watch_event);
2633 rbd_dev->watch_event = NULL;
Alex Elder9969ebc2013-01-18 12:31:10 -06002634 if (obj_request)
2635 rbd_obj_request_put(obj_request);
2636
2637 return ret;
2638}
2639
Alex Elder36be9a72013-01-19 00:30:28 -06002640/*
Alex Elderf40eb342013-04-25 15:09:42 -05002641 * Synchronous osd object method call. Returns the number of bytes
2642 * returned in the outbound buffer, or a negative error code.
Alex Elder36be9a72013-01-19 00:30:28 -06002643 */
2644static int rbd_obj_method_sync(struct rbd_device *rbd_dev,
2645 const char *object_name,
2646 const char *class_name,
2647 const char *method_name,
Alex Elder41579762013-04-21 12:14:45 -05002648 const void *outbound,
Alex Elder36be9a72013-01-19 00:30:28 -06002649 size_t outbound_size,
Alex Elder41579762013-04-21 12:14:45 -05002650 void *inbound,
Alex Elder36be9a72013-01-19 00:30:28 -06002651 size_t inbound_size,
2652 u64 *version)
2653{
Alex Elder21692382013-04-05 01:27:12 -05002654 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
Alex Elder36be9a72013-01-19 00:30:28 -06002655 struct rbd_obj_request *obj_request;
Alex Elder36be9a72013-01-19 00:30:28 -06002656 struct page **pages;
2657 u32 page_count;
2658 int ret;
2659
2660 /*
Alex Elder6010a452013-04-05 01:27:11 -05002661 * Method calls are ultimately read operations. The result
2662 * should placed into the inbound buffer provided. They
2663 * also supply outbound data--parameters for the object
2664 * method. Currently if this is present it will be a
2665 * snapshot id.
Alex Elder36be9a72013-01-19 00:30:28 -06002666 */
Alex Elder57385b52013-04-21 12:14:45 -05002667 page_count = (u32)calc_pages_for(0, inbound_size);
Alex Elder36be9a72013-01-19 00:30:28 -06002668 pages = ceph_alloc_page_vector(page_count, GFP_KERNEL);
2669 if (IS_ERR(pages))
2670 return PTR_ERR(pages);
2671
2672 ret = -ENOMEM;
Alex Elder6010a452013-04-05 01:27:11 -05002673 obj_request = rbd_obj_request_create(object_name, 0, inbound_size,
Alex Elder36be9a72013-01-19 00:30:28 -06002674 OBJ_REQUEST_PAGES);
2675 if (!obj_request)
2676 goto out;
2677
2678 obj_request->pages = pages;
2679 obj_request->page_count = page_count;
2680
Alex Elder430c28c2013-04-03 21:32:51 -05002681 obj_request->osd_req = rbd_osd_req_create(rbd_dev, false, obj_request);
Alex Elder36be9a72013-01-19 00:30:28 -06002682 if (!obj_request->osd_req)
2683 goto out;
2684
Alex Elderc99d2d42013-04-05 01:27:11 -05002685 osd_req_op_cls_init(obj_request->osd_req, 0, CEPH_OSD_OP_CALL,
Alex Elder04017e22013-04-05 14:46:02 -05002686 class_name, method_name);
2687 if (outbound_size) {
2688 struct ceph_pagelist *pagelist;
2689
2690 pagelist = kmalloc(sizeof (*pagelist), GFP_NOFS);
2691 if (!pagelist)
2692 goto out;
2693
2694 ceph_pagelist_init(pagelist);
2695 ceph_pagelist_append(pagelist, outbound, outbound_size);
2696 osd_req_op_cls_request_data_pagelist(obj_request->osd_req, 0,
2697 pagelist);
2698 }
Alex Eldera4ce40a2013-04-05 01:27:12 -05002699 osd_req_op_cls_response_data_pages(obj_request->osd_req, 0,
2700 obj_request->pages, inbound_size,
Alex Elder44cd1882013-04-05 01:27:12 -05002701 0, false, false);
Alex Elder9d4df012013-04-19 15:34:50 -05002702 rbd_osd_req_format_read(obj_request);
Alex Elder430c28c2013-04-03 21:32:51 -05002703
Alex Elder36be9a72013-01-19 00:30:28 -06002704 ret = rbd_obj_request_submit(osdc, obj_request);
2705 if (ret)
2706 goto out;
2707 ret = rbd_obj_request_wait(obj_request);
2708 if (ret)
2709 goto out;
2710
2711 ret = obj_request->result;
2712 if (ret < 0)
2713 goto out;
Alex Elder57385b52013-04-21 12:14:45 -05002714
2715 rbd_assert(obj_request->xferred < (u64)INT_MAX);
2716 ret = (int)obj_request->xferred;
Alex Elder903bb322013-02-06 13:11:38 -06002717 ceph_copy_from_page_vector(pages, inbound, 0, obj_request->xferred);
Alex Elder36be9a72013-01-19 00:30:28 -06002718 if (version)
2719 *version = obj_request->version;
2720out:
2721 if (obj_request)
2722 rbd_obj_request_put(obj_request);
2723 else
2724 ceph_release_page_vector(pages, page_count);
2725
2726 return ret;
2727}
2728
Alex Elderbf0d5f502012-11-22 00:00:08 -06002729static void rbd_request_fn(struct request_queue *q)
Alex Eldercc344fa2013-02-19 12:25:56 -06002730 __releases(q->queue_lock) __acquires(q->queue_lock)
Alex Elderbf0d5f502012-11-22 00:00:08 -06002731{
2732 struct rbd_device *rbd_dev = q->queuedata;
2733 bool read_only = rbd_dev->mapping.read_only;
2734 struct request *rq;
2735 int result;
2736
2737 while ((rq = blk_fetch_request(q))) {
2738 bool write_request = rq_data_dir(rq) == WRITE;
2739 struct rbd_img_request *img_request;
2740 u64 offset;
2741 u64 length;
2742
2743 /* Ignore any non-FS requests that filter through. */
2744
2745 if (rq->cmd_type != REQ_TYPE_FS) {
Alex Elder4dda41d2013-02-20 21:59:33 -06002746 dout("%s: non-fs request type %d\n", __func__,
2747 (int) rq->cmd_type);
2748 __blk_end_request_all(rq, 0);
2749 continue;
2750 }
2751
2752 /* Ignore/skip any zero-length requests */
2753
2754 offset = (u64) blk_rq_pos(rq) << SECTOR_SHIFT;
2755 length = (u64) blk_rq_bytes(rq);
2756
2757 if (!length) {
2758 dout("%s: zero-length request\n", __func__);
Alex Elderbf0d5f502012-11-22 00:00:08 -06002759 __blk_end_request_all(rq, 0);
2760 continue;
2761 }
2762
2763 spin_unlock_irq(q->queue_lock);
2764
2765 /* Disallow writes to a read-only device */
2766
2767 if (write_request) {
2768 result = -EROFS;
2769 if (read_only)
2770 goto end_request;
2771 rbd_assert(rbd_dev->spec->snap_id == CEPH_NOSNAP);
2772 }
2773
Alex Elder6d292902013-01-14 12:43:31 -06002774 /*
2775 * Quit early if the mapped snapshot no longer
2776 * exists. It's still possible the snapshot will
2777 * have disappeared by the time our request arrives
2778 * at the osd, but there's no sense in sending it if
2779 * we already know.
2780 */
2781 if (!test_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags)) {
Alex Elderbf0d5f502012-11-22 00:00:08 -06002782 dout("request for non-existent snapshot");
2783 rbd_assert(rbd_dev->spec->snap_id != CEPH_NOSNAP);
2784 result = -ENXIO;
2785 goto end_request;
2786 }
2787
Alex Elderbf0d5f502012-11-22 00:00:08 -06002788 result = -EINVAL;
Alex Elderc0cd10db2013-04-26 09:43:47 -05002789 if (offset && length > U64_MAX - offset + 1) {
2790 rbd_warn(rbd_dev, "bad request range (%llu~%llu)\n",
2791 offset, length);
Alex Elderbf0d5f502012-11-22 00:00:08 -06002792 goto end_request; /* Shouldn't happen */
Alex Elderc0cd10db2013-04-26 09:43:47 -05002793 }
Alex Elderbf0d5f502012-11-22 00:00:08 -06002794
2795 result = -ENOMEM;
2796 img_request = rbd_img_request_create(rbd_dev, offset, length,
Alex Elder9849e982013-01-24 16:13:36 -06002797 write_request, false);
Alex Elderbf0d5f502012-11-22 00:00:08 -06002798 if (!img_request)
2799 goto end_request;
2800
2801 img_request->rq = rq;
2802
Alex Elderf1a47392013-04-19 15:34:50 -05002803 result = rbd_img_request_fill(img_request, OBJ_REQUEST_BIO,
2804 rq->bio);
Alex Elderbf0d5f502012-11-22 00:00:08 -06002805 if (!result)
2806 result = rbd_img_request_submit(img_request);
2807 if (result)
2808 rbd_img_request_put(img_request);
2809end_request:
2810 spin_lock_irq(q->queue_lock);
2811 if (result < 0) {
Alex Elder7da22d22013-01-24 16:13:36 -06002812 rbd_warn(rbd_dev, "%s %llx at %llx result %d\n",
2813 write_request ? "write" : "read",
2814 length, offset, result);
2815
Alex Elderbf0d5f502012-11-22 00:00:08 -06002816 __blk_end_request_all(rq, result);
2817 }
2818 }
2819}
2820
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002821/*
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002822 * a queue callback. Makes sure that we don't create a bio that spans across
2823 * multiple osd objects. One exception would be with a single page bios,
Alex Elderf7760da2012-10-20 22:17:27 -05002824 * which we handle later at bio_chain_clone_range()
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002825 */
2826static int rbd_merge_bvec(struct request_queue *q, struct bvec_merge_data *bmd,
2827 struct bio_vec *bvec)
2828{
2829 struct rbd_device *rbd_dev = q->queuedata;
Alex Eldere5cfeed22012-10-20 22:17:27 -05002830 sector_t sector_offset;
2831 sector_t sectors_per_obj;
2832 sector_t obj_sector_offset;
2833 int ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002834
Alex Eldere5cfeed22012-10-20 22:17:27 -05002835 /*
2836 * Find how far into its rbd object the partition-relative
2837 * bio start sector is to offset relative to the enclosing
2838 * device.
2839 */
2840 sector_offset = get_start_sect(bmd->bi_bdev) + bmd->bi_sector;
2841 sectors_per_obj = 1 << (rbd_dev->header.obj_order - SECTOR_SHIFT);
2842 obj_sector_offset = sector_offset & (sectors_per_obj - 1);
Alex Elder593a9e72012-02-07 12:03:37 -06002843
Alex Eldere5cfeed22012-10-20 22:17:27 -05002844 /*
2845 * Compute the number of bytes from that offset to the end
2846 * of the object. Account for what's already used by the bio.
2847 */
2848 ret = (int) (sectors_per_obj - obj_sector_offset) << SECTOR_SHIFT;
2849 if (ret > bmd->bi_size)
2850 ret -= bmd->bi_size;
2851 else
2852 ret = 0;
2853
2854 /*
2855 * Don't send back more than was asked for. And if the bio
2856 * was empty, let the whole thing through because: "Note
2857 * that a block device *must* allow a single page to be
2858 * added to an empty bio."
2859 */
2860 rbd_assert(bvec->bv_len <= PAGE_SIZE);
2861 if (ret > (int) bvec->bv_len || !bmd->bi_size)
2862 ret = (int) bvec->bv_len;
2863
2864 return ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002865}
2866
2867static void rbd_free_disk(struct rbd_device *rbd_dev)
2868{
2869 struct gendisk *disk = rbd_dev->disk;
2870
2871 if (!disk)
2872 return;
2873
Alex Eldera0cab922013-04-25 23:15:08 -05002874 rbd_dev->disk = NULL;
2875 if (disk->flags & GENHD_FL_UP) {
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002876 del_gendisk(disk);
Alex Eldera0cab922013-04-25 23:15:08 -05002877 if (disk->queue)
2878 blk_cleanup_queue(disk->queue);
2879 }
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002880 put_disk(disk);
2881}
2882
Alex Elder788e2df2013-01-17 12:25:27 -06002883static int rbd_obj_read_sync(struct rbd_device *rbd_dev,
2884 const char *object_name,
2885 u64 offset, u64 length,
Alex Elder80ef15b2013-04-21 12:14:45 -05002886 void *buf, u64 *version)
Alex Elder788e2df2013-01-17 12:25:27 -06002887
2888{
Alex Elder21692382013-04-05 01:27:12 -05002889 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
Alex Elder788e2df2013-01-17 12:25:27 -06002890 struct rbd_obj_request *obj_request;
Alex Elder788e2df2013-01-17 12:25:27 -06002891 struct page **pages = NULL;
2892 u32 page_count;
Alex Elder1ceae7e2013-02-06 13:11:38 -06002893 size_t size;
Alex Elder788e2df2013-01-17 12:25:27 -06002894 int ret;
2895
2896 page_count = (u32) calc_pages_for(offset, length);
2897 pages = ceph_alloc_page_vector(page_count, GFP_KERNEL);
2898 if (IS_ERR(pages))
2899 ret = PTR_ERR(pages);
2900
2901 ret = -ENOMEM;
2902 obj_request = rbd_obj_request_create(object_name, offset, length,
Alex Elder36be9a72013-01-19 00:30:28 -06002903 OBJ_REQUEST_PAGES);
Alex Elder788e2df2013-01-17 12:25:27 -06002904 if (!obj_request)
2905 goto out;
2906
2907 obj_request->pages = pages;
2908 obj_request->page_count = page_count;
2909
Alex Elder430c28c2013-04-03 21:32:51 -05002910 obj_request->osd_req = rbd_osd_req_create(rbd_dev, false, obj_request);
Alex Elder788e2df2013-01-17 12:25:27 -06002911 if (!obj_request->osd_req)
2912 goto out;
2913
Alex Elderc99d2d42013-04-05 01:27:11 -05002914 osd_req_op_extent_init(obj_request->osd_req, 0, CEPH_OSD_OP_READ,
2915 offset, length, 0, 0);
Alex Elder406e2c92013-04-15 14:50:36 -05002916 osd_req_op_extent_osd_data_pages(obj_request->osd_req, 0,
Alex Eldera4ce40a2013-04-05 01:27:12 -05002917 obj_request->pages,
Alex Elder44cd1882013-04-05 01:27:12 -05002918 obj_request->length,
2919 obj_request->offset & ~PAGE_MASK,
2920 false, false);
Alex Elder9d4df012013-04-19 15:34:50 -05002921 rbd_osd_req_format_read(obj_request);
Alex Elder430c28c2013-04-03 21:32:51 -05002922
Alex Elder788e2df2013-01-17 12:25:27 -06002923 ret = rbd_obj_request_submit(osdc, obj_request);
2924 if (ret)
2925 goto out;
2926 ret = rbd_obj_request_wait(obj_request);
2927 if (ret)
2928 goto out;
2929
2930 ret = obj_request->result;
2931 if (ret < 0)
2932 goto out;
Alex Elder1ceae7e2013-02-06 13:11:38 -06002933
2934 rbd_assert(obj_request->xferred <= (u64) SIZE_MAX);
2935 size = (size_t) obj_request->xferred;
Alex Elder903bb322013-02-06 13:11:38 -06002936 ceph_copy_from_page_vector(pages, buf, 0, size);
Alex Elder23ed6e12013-02-06 13:11:38 -06002937 rbd_assert(size <= (size_t) INT_MAX);
2938 ret = (int) size;
Alex Elder788e2df2013-01-17 12:25:27 -06002939 if (version)
2940 *version = obj_request->version;
2941out:
2942 if (obj_request)
2943 rbd_obj_request_put(obj_request);
2944 else
2945 ceph_release_page_vector(pages, page_count);
2946
2947 return ret;
2948}
2949
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002950/*
Alex Elder4156d992012-08-02 11:29:46 -05002951 * Read the complete header for the given rbd device.
2952 *
2953 * Returns a pointer to a dynamically-allocated buffer containing
2954 * the complete and validated header. Caller can pass the address
2955 * of a variable that will be filled in with the version of the
2956 * header object at the time it was read.
2957 *
2958 * Returns a pointer-coded errno if a failure occurs.
2959 */
2960static struct rbd_image_header_ondisk *
2961rbd_dev_v1_header_read(struct rbd_device *rbd_dev, u64 *version)
2962{
2963 struct rbd_image_header_ondisk *ondisk = NULL;
2964 u32 snap_count = 0;
2965 u64 names_size = 0;
2966 u32 want_count;
2967 int ret;
2968
2969 /*
2970 * The complete header will include an array of its 64-bit
2971 * snapshot ids, followed by the names of those snapshots as
2972 * a contiguous block of NUL-terminated strings. Note that
2973 * the number of snapshots could change by the time we read
2974 * it in, in which case we re-read it.
2975 */
2976 do {
2977 size_t size;
2978
2979 kfree(ondisk);
2980
2981 size = sizeof (*ondisk);
2982 size += snap_count * sizeof (struct rbd_image_snap_ondisk);
2983 size += names_size;
2984 ondisk = kmalloc(size, GFP_KERNEL);
2985 if (!ondisk)
2986 return ERR_PTR(-ENOMEM);
2987
Alex Elder788e2df2013-01-17 12:25:27 -06002988 ret = rbd_obj_read_sync(rbd_dev, rbd_dev->header_name,
Alex Elder80ef15b2013-04-21 12:14:45 -05002989 0, size, ondisk, version);
Alex Elder4156d992012-08-02 11:29:46 -05002990 if (ret < 0)
2991 goto out_err;
Alex Elderc0cd10db2013-04-26 09:43:47 -05002992 if ((size_t)ret < size) {
Alex Elder4156d992012-08-02 11:29:46 -05002993 ret = -ENXIO;
Alex Elder06ecc6c2012-11-01 10:17:15 -05002994 rbd_warn(rbd_dev, "short header read (want %zd got %d)",
2995 size, ret);
Alex Elder4156d992012-08-02 11:29:46 -05002996 goto out_err;
2997 }
2998 if (!rbd_dev_ondisk_valid(ondisk)) {
2999 ret = -ENXIO;
Alex Elder06ecc6c2012-11-01 10:17:15 -05003000 rbd_warn(rbd_dev, "invalid header");
Alex Elder4156d992012-08-02 11:29:46 -05003001 goto out_err;
3002 }
3003
3004 names_size = le64_to_cpu(ondisk->snap_names_len);
3005 want_count = snap_count;
3006 snap_count = le32_to_cpu(ondisk->snap_count);
3007 } while (snap_count != want_count);
3008
3009 return ondisk;
3010
3011out_err:
3012 kfree(ondisk);
3013
3014 return ERR_PTR(ret);
3015}
3016
3017/*
3018 * reload the ondisk the header
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003019 */
3020static int rbd_read_header(struct rbd_device *rbd_dev,
3021 struct rbd_image_header *header)
3022{
Alex Elder4156d992012-08-02 11:29:46 -05003023 struct rbd_image_header_ondisk *ondisk;
3024 u64 ver = 0;
3025 int ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003026
Alex Elder4156d992012-08-02 11:29:46 -05003027 ondisk = rbd_dev_v1_header_read(rbd_dev, &ver);
3028 if (IS_ERR(ondisk))
3029 return PTR_ERR(ondisk);
3030 ret = rbd_header_from_disk(header, ondisk);
3031 if (ret >= 0)
3032 header->obj_version = ver;
3033 kfree(ondisk);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003034
Alex Elder4156d992012-08-02 11:29:46 -05003035 return ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003036}
3037
Alex Elder41f38c22012-10-25 23:34:40 -05003038static void rbd_remove_all_snaps(struct rbd_device *rbd_dev)
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003039{
3040 struct rbd_snap *snap;
Alex Eldera0593292012-07-19 09:09:27 -05003041 struct rbd_snap *next;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003042
Alex Elder6087b512013-04-25 15:09:41 -05003043 list_for_each_entry_safe(snap, next, &rbd_dev->snaps, node) {
3044 list_del(&snap->node);
3045 rbd_snap_destroy(snap);
3046 }
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003047}
3048
Alex Elder94785542012-10-09 13:50:17 -07003049static void rbd_update_mapping_size(struct rbd_device *rbd_dev)
3050{
Alex Elder0d7dbfc2012-10-25 23:34:41 -05003051 if (rbd_dev->spec->snap_id != CEPH_NOSNAP)
Alex Elder94785542012-10-09 13:50:17 -07003052 return;
3053
Alex Eldere28626a2013-04-26 15:44:35 -05003054 if (rbd_dev->mapping.size != rbd_dev->header.image_size) {
3055 sector_t size;
3056
3057 rbd_dev->mapping.size = rbd_dev->header.image_size;
3058 size = (sector_t)rbd_dev->mapping.size / SECTOR_SIZE;
3059 dout("setting size to %llu sectors", (unsigned long long)size);
3060 set_capacity(rbd_dev->disk, size);
3061 }
Alex Elder94785542012-10-09 13:50:17 -07003062}
3063
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003064/*
3065 * only read the first part of the ondisk header, without the snaps info
3066 */
Alex Elder117973f2012-08-31 17:29:55 -05003067static int rbd_dev_v1_refresh(struct rbd_device *rbd_dev, u64 *hver)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003068{
3069 int ret;
3070 struct rbd_image_header h;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003071
3072 ret = rbd_read_header(rbd_dev, &h);
3073 if (ret < 0)
3074 return ret;
3075
Josh Durgina51aa0c2011-12-05 10:35:04 -08003076 down_write(&rbd_dev->header_rwsem);
3077
Alex Elder94785542012-10-09 13:50:17 -07003078 /* Update image size, and check for resize of mapped image */
3079 rbd_dev->header.image_size = h.image_size;
3080 rbd_update_mapping_size(rbd_dev);
Sage Weil9db4b3e2011-04-19 22:49:06 -07003081
Alex Elder849b4262012-07-09 21:04:24 -05003082 /* rbd_dev->header.object_prefix shouldn't change */
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003083 kfree(rbd_dev->header.snap_sizes);
Alex Elder849b4262012-07-09 21:04:24 -05003084 kfree(rbd_dev->header.snap_names);
Josh Durgind1d25642011-12-05 14:03:05 -08003085 /* osd requests may still refer to snapc */
Alex Elder468521c2013-04-26 09:43:47 -05003086 rbd_snap_context_put(rbd_dev->header.snapc);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003087
Alex Elderb8136232012-07-25 09:32:41 -05003088 if (hver)
3089 *hver = h.obj_version;
Josh Durgina71b8912011-12-05 18:10:44 -08003090 rbd_dev->header.obj_version = h.obj_version;
Josh Durgin93a24e02011-12-05 10:41:28 -08003091 rbd_dev->header.image_size = h.image_size;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003092 rbd_dev->header.snapc = h.snapc;
3093 rbd_dev->header.snap_names = h.snap_names;
3094 rbd_dev->header.snap_sizes = h.snap_sizes;
Alex Elder849b4262012-07-09 21:04:24 -05003095 /* Free the extra copy of the object prefix */
Alex Elderc0cd10db2013-04-26 09:43:47 -05003096 if (strcmp(rbd_dev->header.object_prefix, h.object_prefix))
3097 rbd_warn(rbd_dev, "object prefix changed (ignoring)");
Alex Elder849b4262012-07-09 21:04:24 -05003098 kfree(h.object_prefix);
3099
Alex Elder304f6802012-08-31 17:29:52 -05003100 ret = rbd_dev_snaps_update(rbd_dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003101
Josh Durginc6666012011-11-21 17:11:12 -08003102 up_write(&rbd_dev->header_rwsem);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003103
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003104 return ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003105}
3106
Alex Elder117973f2012-08-31 17:29:55 -05003107static int rbd_dev_refresh(struct rbd_device *rbd_dev, u64 *hver)
Alex Elder1fe5e992012-07-25 09:32:41 -05003108{
3109 int ret;
3110
Alex Elder117973f2012-08-31 17:29:55 -05003111 rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
Alex Elder1fe5e992012-07-25 09:32:41 -05003112 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
Alex Elder117973f2012-08-31 17:29:55 -05003113 if (rbd_dev->image_format == 1)
3114 ret = rbd_dev_v1_refresh(rbd_dev, hver);
3115 else
3116 ret = rbd_dev_v2_refresh(rbd_dev, hver);
Alex Elder1fe5e992012-07-25 09:32:41 -05003117 mutex_unlock(&ctl_mutex);
Laurent Barbed98df632013-04-10 17:47:46 -05003118 revalidate_disk(rbd_dev->disk);
Alex Elder522a0cc2013-04-25 15:09:41 -05003119 if (ret)
3120 rbd_warn(rbd_dev, "got notification but failed to "
3121 " update snaps: %d\n", ret);
Alex Elder1fe5e992012-07-25 09:32:41 -05003122
3123 return ret;
3124}
3125
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003126static int rbd_init_disk(struct rbd_device *rbd_dev)
3127{
3128 struct gendisk *disk;
3129 struct request_queue *q;
Alex Elder593a9e72012-02-07 12:03:37 -06003130 u64 segment_size;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003131
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003132 /* create gendisk info */
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003133 disk = alloc_disk(RBD_MINORS_PER_MAJOR);
3134 if (!disk)
Alex Elder1fcdb8a2012-08-29 17:11:06 -05003135 return -ENOMEM;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003136
Alex Elderf0f8cef2012-01-29 13:57:44 -06003137 snprintf(disk->disk_name, sizeof(disk->disk_name), RBD_DRV_NAME "%d",
Alex Elderde71a292012-07-03 16:01:19 -05003138 rbd_dev->dev_id);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003139 disk->major = rbd_dev->major;
3140 disk->first_minor = 0;
3141 disk->fops = &rbd_bd_ops;
3142 disk->private_data = rbd_dev;
3143
Alex Elderbf0d5f502012-11-22 00:00:08 -06003144 q = blk_init_queue(rbd_request_fn, &rbd_dev->lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003145 if (!q)
3146 goto out_disk;
Josh Durgin029bcbd2011-07-22 11:35:23 -07003147
Alex Elder593a9e72012-02-07 12:03:37 -06003148 /* We use the default size, but let's be explicit about it. */
3149 blk_queue_physical_block_size(q, SECTOR_SIZE);
3150
Josh Durgin029bcbd2011-07-22 11:35:23 -07003151 /* set io sizes to object size */
Alex Elder593a9e72012-02-07 12:03:37 -06003152 segment_size = rbd_obj_bytes(&rbd_dev->header);
3153 blk_queue_max_hw_sectors(q, segment_size / SECTOR_SIZE);
3154 blk_queue_max_segment_size(q, segment_size);
3155 blk_queue_io_min(q, segment_size);
3156 blk_queue_io_opt(q, segment_size);
Josh Durgin029bcbd2011-07-22 11:35:23 -07003157
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003158 blk_queue_merge_bvec(q, rbd_merge_bvec);
3159 disk->queue = q;
3160
3161 q->queuedata = rbd_dev;
3162
3163 rbd_dev->disk = disk;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003164
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003165 return 0;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003166out_disk:
3167 put_disk(disk);
Alex Elder1fcdb8a2012-08-29 17:11:06 -05003168
3169 return -ENOMEM;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003170}
3171
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003172/*
3173 sysfs
3174*/
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003175
Alex Elder593a9e72012-02-07 12:03:37 -06003176static struct rbd_device *dev_to_rbd_dev(struct device *dev)
3177{
3178 return container_of(dev, struct rbd_device, dev);
3179}
3180
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003181static ssize_t rbd_size_show(struct device *dev,
3182 struct device_attribute *attr, char *buf)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003183{
Alex Elder593a9e72012-02-07 12:03:37 -06003184 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003185
Alex Elderfc71d832013-04-26 15:44:36 -05003186 return sprintf(buf, "%llu\n",
3187 (unsigned long long)rbd_dev->mapping.size);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003188}
3189
Alex Elder34b13182012-07-13 20:35:12 -05003190/*
3191 * Note this shows the features for whatever's mapped, which is not
3192 * necessarily the base image.
3193 */
3194static ssize_t rbd_features_show(struct device *dev,
3195 struct device_attribute *attr, char *buf)
3196{
3197 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
3198
3199 return sprintf(buf, "0x%016llx\n",
Alex Elderfc71d832013-04-26 15:44:36 -05003200 (unsigned long long)rbd_dev->mapping.features);
Alex Elder34b13182012-07-13 20:35:12 -05003201}
3202
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003203static ssize_t rbd_major_show(struct device *dev,
3204 struct device_attribute *attr, char *buf)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003205{
Alex Elder593a9e72012-02-07 12:03:37 -06003206 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003207
Alex Elderfc71d832013-04-26 15:44:36 -05003208 if (rbd_dev->major)
3209 return sprintf(buf, "%d\n", rbd_dev->major);
3210
3211 return sprintf(buf, "(none)\n");
3212
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003213}
3214
3215static ssize_t rbd_client_id_show(struct device *dev,
3216 struct device_attribute *attr, char *buf)
3217{
Alex Elder593a9e72012-02-07 12:03:37 -06003218 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003219
Alex Elder1dbb4392012-01-24 10:08:37 -06003220 return sprintf(buf, "client%lld\n",
3221 ceph_client_id(rbd_dev->rbd_client->client));
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003222}
3223
3224static ssize_t rbd_pool_show(struct device *dev,
3225 struct device_attribute *attr, char *buf)
3226{
Alex Elder593a9e72012-02-07 12:03:37 -06003227 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003228
Alex Elder0d7dbfc2012-10-25 23:34:41 -05003229 return sprintf(buf, "%s\n", rbd_dev->spec->pool_name);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003230}
3231
Alex Elder9bb2f332012-07-12 10:46:35 -05003232static ssize_t rbd_pool_id_show(struct device *dev,
3233 struct device_attribute *attr, char *buf)
3234{
3235 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
3236
Alex Elder0d7dbfc2012-10-25 23:34:41 -05003237 return sprintf(buf, "%llu\n",
Alex Elderfc71d832013-04-26 15:44:36 -05003238 (unsigned long long) rbd_dev->spec->pool_id);
Alex Elder9bb2f332012-07-12 10:46:35 -05003239}
3240
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003241static ssize_t rbd_name_show(struct device *dev,
3242 struct device_attribute *attr, char *buf)
3243{
Alex Elder593a9e72012-02-07 12:03:37 -06003244 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003245
Alex Eldera92ffdf2012-10-30 19:40:33 -05003246 if (rbd_dev->spec->image_name)
3247 return sprintf(buf, "%s\n", rbd_dev->spec->image_name);
3248
3249 return sprintf(buf, "(unknown)\n");
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003250}
3251
Alex Elder589d30e2012-07-10 20:30:11 -05003252static ssize_t rbd_image_id_show(struct device *dev,
3253 struct device_attribute *attr, char *buf)
3254{
3255 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
3256
Alex Elder0d7dbfc2012-10-25 23:34:41 -05003257 return sprintf(buf, "%s\n", rbd_dev->spec->image_id);
Alex Elder589d30e2012-07-10 20:30:11 -05003258}
3259
Alex Elder34b13182012-07-13 20:35:12 -05003260/*
3261 * Shows the name of the currently-mapped snapshot (or
3262 * RBD_SNAP_HEAD_NAME for the base image).
3263 */
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003264static ssize_t rbd_snap_show(struct device *dev,
3265 struct device_attribute *attr,
3266 char *buf)
3267{
Alex Elder593a9e72012-02-07 12:03:37 -06003268 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003269
Alex Elder0d7dbfc2012-10-25 23:34:41 -05003270 return sprintf(buf, "%s\n", rbd_dev->spec->snap_name);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003271}
3272
Alex Elder86b00e02012-10-25 23:34:42 -05003273/*
3274 * For an rbd v2 image, shows the pool id, image id, and snapshot id
3275 * for the parent image. If there is no parent, simply shows
3276 * "(no parent image)".
3277 */
3278static ssize_t rbd_parent_show(struct device *dev,
3279 struct device_attribute *attr,
3280 char *buf)
3281{
3282 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
3283 struct rbd_spec *spec = rbd_dev->parent_spec;
3284 int count;
3285 char *bufp = buf;
3286
3287 if (!spec)
3288 return sprintf(buf, "(no parent image)\n");
3289
3290 count = sprintf(bufp, "pool_id %llu\npool_name %s\n",
3291 (unsigned long long) spec->pool_id, spec->pool_name);
3292 if (count < 0)
3293 return count;
3294 bufp += count;
3295
3296 count = sprintf(bufp, "image_id %s\nimage_name %s\n", spec->image_id,
3297 spec->image_name ? spec->image_name : "(unknown)");
3298 if (count < 0)
3299 return count;
3300 bufp += count;
3301
3302 count = sprintf(bufp, "snap_id %llu\nsnap_name %s\n",
3303 (unsigned long long) spec->snap_id, spec->snap_name);
3304 if (count < 0)
3305 return count;
3306 bufp += count;
3307
3308 count = sprintf(bufp, "overlap %llu\n", rbd_dev->parent_overlap);
3309 if (count < 0)
3310 return count;
3311 bufp += count;
3312
3313 return (ssize_t) (bufp - buf);
3314}
3315
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003316static ssize_t rbd_image_refresh(struct device *dev,
3317 struct device_attribute *attr,
3318 const char *buf,
3319 size_t size)
3320{
Alex Elder593a9e72012-02-07 12:03:37 -06003321 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Alex Elderb8136232012-07-25 09:32:41 -05003322 int ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003323
Alex Elder117973f2012-08-31 17:29:55 -05003324 ret = rbd_dev_refresh(rbd_dev, NULL);
Alex Elderb8136232012-07-25 09:32:41 -05003325
3326 return ret < 0 ? ret : size;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003327}
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003328
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003329static DEVICE_ATTR(size, S_IRUGO, rbd_size_show, NULL);
Alex Elder34b13182012-07-13 20:35:12 -05003330static DEVICE_ATTR(features, S_IRUGO, rbd_features_show, NULL);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003331static DEVICE_ATTR(major, S_IRUGO, rbd_major_show, NULL);
3332static DEVICE_ATTR(client_id, S_IRUGO, rbd_client_id_show, NULL);
3333static DEVICE_ATTR(pool, S_IRUGO, rbd_pool_show, NULL);
Alex Elder9bb2f332012-07-12 10:46:35 -05003334static DEVICE_ATTR(pool_id, S_IRUGO, rbd_pool_id_show, NULL);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003335static DEVICE_ATTR(name, S_IRUGO, rbd_name_show, NULL);
Alex Elder589d30e2012-07-10 20:30:11 -05003336static DEVICE_ATTR(image_id, S_IRUGO, rbd_image_id_show, NULL);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003337static DEVICE_ATTR(refresh, S_IWUSR, NULL, rbd_image_refresh);
3338static DEVICE_ATTR(current_snap, S_IRUGO, rbd_snap_show, NULL);
Alex Elder86b00e02012-10-25 23:34:42 -05003339static DEVICE_ATTR(parent, S_IRUGO, rbd_parent_show, NULL);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003340
3341static struct attribute *rbd_attrs[] = {
3342 &dev_attr_size.attr,
Alex Elder34b13182012-07-13 20:35:12 -05003343 &dev_attr_features.attr,
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003344 &dev_attr_major.attr,
3345 &dev_attr_client_id.attr,
3346 &dev_attr_pool.attr,
Alex Elder9bb2f332012-07-12 10:46:35 -05003347 &dev_attr_pool_id.attr,
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003348 &dev_attr_name.attr,
Alex Elder589d30e2012-07-10 20:30:11 -05003349 &dev_attr_image_id.attr,
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003350 &dev_attr_current_snap.attr,
Alex Elder86b00e02012-10-25 23:34:42 -05003351 &dev_attr_parent.attr,
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003352 &dev_attr_refresh.attr,
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003353 NULL
3354};
3355
3356static struct attribute_group rbd_attr_group = {
3357 .attrs = rbd_attrs,
3358};
3359
3360static const struct attribute_group *rbd_attr_groups[] = {
3361 &rbd_attr_group,
3362 NULL
3363};
3364
/*
 * Release callback installed in rbd_device_type.  It deliberately does
 * nothing; the rbd_device itself is freed by rbd_dev_destroy().
 */
static void rbd_sysfs_dev_release(struct device *dev)
{
	/* nothing to release here */
}
3368
3369static struct device_type rbd_device_type = {
3370 .name = "rbd",
3371 .groups = rbd_attr_groups,
3372 .release = rbd_sysfs_dev_release,
3373};
3374
Alex Elder8b8fb992012-10-26 17:25:24 -05003375static struct rbd_spec *rbd_spec_get(struct rbd_spec *spec)
3376{
3377 kref_get(&spec->kref);
3378
3379 return spec;
3380}
3381
3382static void rbd_spec_free(struct kref *kref);
3383static void rbd_spec_put(struct rbd_spec *spec)
3384{
3385 if (spec)
3386 kref_put(&spec->kref, rbd_spec_free);
3387}
3388
3389static struct rbd_spec *rbd_spec_alloc(void)
3390{
3391 struct rbd_spec *spec;
3392
3393 spec = kzalloc(sizeof (*spec), GFP_KERNEL);
3394 if (!spec)
3395 return NULL;
3396 kref_init(&spec->kref);
3397
Alex Elder8b8fb992012-10-26 17:25:24 -05003398 return spec;
3399}
3400
3401static void rbd_spec_free(struct kref *kref)
3402{
3403 struct rbd_spec *spec = container_of(kref, struct rbd_spec, kref);
3404
3405 kfree(spec->pool_name);
3406 kfree(spec->image_id);
3407 kfree(spec->image_name);
3408 kfree(spec->snap_name);
3409 kfree(spec);
3410}
3411
Alex Eldercc344fa2013-02-19 12:25:56 -06003412static struct rbd_device *rbd_dev_create(struct rbd_client *rbdc,
Alex Elderc53d5892012-10-25 23:34:42 -05003413 struct rbd_spec *spec)
3414{
3415 struct rbd_device *rbd_dev;
3416
3417 rbd_dev = kzalloc(sizeof (*rbd_dev), GFP_KERNEL);
3418 if (!rbd_dev)
3419 return NULL;
3420
3421 spin_lock_init(&rbd_dev->lock);
Alex Elder6d292902013-01-14 12:43:31 -06003422 rbd_dev->flags = 0;
Alex Elderc53d5892012-10-25 23:34:42 -05003423 INIT_LIST_HEAD(&rbd_dev->node);
3424 INIT_LIST_HEAD(&rbd_dev->snaps);
3425 init_rwsem(&rbd_dev->header_rwsem);
3426
3427 rbd_dev->spec = spec;
3428 rbd_dev->rbd_client = rbdc;
3429
Alex Elder0903e872012-11-14 12:25:19 -06003430 /* Initialize the layout used for all rbd requests */
3431
3432 rbd_dev->layout.fl_stripe_unit = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
3433 rbd_dev->layout.fl_stripe_count = cpu_to_le32(1);
3434 rbd_dev->layout.fl_object_size = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
3435 rbd_dev->layout.fl_pg_pool = cpu_to_le32((u32) spec->pool_id);
3436
Alex Elderc53d5892012-10-25 23:34:42 -05003437 return rbd_dev;
3438}
3439
3440static void rbd_dev_destroy(struct rbd_device *rbd_dev)
3441{
Alex Elderc53d5892012-10-25 23:34:42 -05003442 rbd_put_client(rbd_dev->rbd_client);
3443 rbd_spec_put(rbd_dev->spec);
3444 kfree(rbd_dev);
3445}
3446
Alex Elder6087b512013-04-25 15:09:41 -05003447static void rbd_snap_destroy(struct rbd_snap *snap)
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003448{
Alex Elder3e83b652013-04-23 13:52:53 -05003449 kfree(snap->name);
3450 kfree(snap);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003451}
3452
Alex Elder6087b512013-04-25 15:09:41 -05003453static struct rbd_snap *rbd_snap_create(struct rbd_device *rbd_dev,
Alex Elderc8d18422012-07-10 20:30:11 -05003454 const char *snap_name,
Alex Elder34b13182012-07-13 20:35:12 -05003455 u64 snap_id, u64 snap_size,
3456 u64 snap_features)
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003457{
Alex Elder4e891e02012-07-10 20:30:10 -05003458 struct rbd_snap *snap;
Alex Elder4e891e02012-07-10 20:30:10 -05003459
3460 snap = kzalloc(sizeof (*snap), GFP_KERNEL);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003461 if (!snap)
Alex Elder4e891e02012-07-10 20:30:10 -05003462 return ERR_PTR(-ENOMEM);
3463
Alex Elder6e584f52013-04-25 15:09:42 -05003464 snap->name = snap_name;
Alex Elderc8d18422012-07-10 20:30:11 -05003465 snap->id = snap_id;
3466 snap->size = snap_size;
Alex Elder34b13182012-07-13 20:35:12 -05003467 snap->features = snap_features;
Alex Elder4e891e02012-07-10 20:30:10 -05003468
3469 return snap;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003470}
3471
Alex Elder6e584f52013-04-25 15:09:42 -05003472/*
3473 * Returns a dynamically-allocated snapshot name if successful, or a
3474 * pointer-coded error otherwise.
3475 */
Alex Eldercd892122012-07-03 16:01:19 -05003476static char *rbd_dev_v1_snap_info(struct rbd_device *rbd_dev, u32 which,
3477 u64 *snap_size, u64 *snap_features)
3478{
3479 char *snap_name;
Alex Elder6e584f52013-04-25 15:09:42 -05003480 int i;
Alex Eldercd892122012-07-03 16:01:19 -05003481
3482 rbd_assert(which < rbd_dev->header.snapc->num_snaps);
3483
Alex Eldercd892122012-07-03 16:01:19 -05003484 /* Skip over names until we find the one we are looking for */
3485
3486 snap_name = rbd_dev->header.snap_names;
Alex Elder6e584f52013-04-25 15:09:42 -05003487 for (i = 0; i < which; i++)
Alex Eldercd892122012-07-03 16:01:19 -05003488 snap_name += strlen(snap_name) + 1;
3489
Alex Elder6e584f52013-04-25 15:09:42 -05003490 snap_name = kstrdup(snap_name, GFP_KERNEL);
3491 if (!snap_name)
3492 return ERR_PTR(-ENOMEM);
3493
3494 *snap_size = rbd_dev->header.snap_sizes[which];
3495 *snap_features = 0; /* No features for v1 */
3496
Alex Eldercd892122012-07-03 16:01:19 -05003497 return snap_name;
3498}
3499
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003500/*
Alex Elder9d475de2012-07-03 16:01:19 -05003501 * Get the size and object order for an image snapshot, or if
3502 * snap_id is CEPH_NOSNAP, gets this information for the base
3503 * image.
3504 */
3505static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id,
3506 u8 *order, u64 *snap_size)
3507{
3508 __le64 snapid = cpu_to_le64(snap_id);
3509 int ret;
3510 struct {
3511 u8 order;
3512 __le64 size;
3513 } __attribute__ ((packed)) size_buf = { 0 };
3514
Alex Elder36be9a72013-01-19 00:30:28 -06003515 ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
Alex Elder9d475de2012-07-03 16:01:19 -05003516 "rbd", "get_size",
Alex Elder41579762013-04-21 12:14:45 -05003517 &snapid, sizeof (snapid),
3518 &size_buf, sizeof (size_buf), NULL);
Alex Elder36be9a72013-01-19 00:30:28 -06003519 dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
Alex Elder9d475de2012-07-03 16:01:19 -05003520 if (ret < 0)
3521 return ret;
Alex Elder57385b52013-04-21 12:14:45 -05003522 if (ret < sizeof (size_buf))
3523 return -ERANGE;
Alex Elder9d475de2012-07-03 16:01:19 -05003524
Alex Elderc86f86e2013-04-25 15:09:41 -05003525 if (order)
3526 *order = size_buf.order;
Alex Elder9d475de2012-07-03 16:01:19 -05003527 *snap_size = le64_to_cpu(size_buf.size);
3528
3529 dout(" snap_id 0x%016llx order = %u, snap_size = %llu\n",
Alex Elder57385b52013-04-21 12:14:45 -05003530 (unsigned long long)snap_id, (unsigned int)*order,
3531 (unsigned long long)*snap_size);
Alex Elder9d475de2012-07-03 16:01:19 -05003532
3533 return 0;
3534}
3535
3536static int rbd_dev_v2_image_size(struct rbd_device *rbd_dev)
3537{
3538 return _rbd_dev_v2_snap_size(rbd_dev, CEPH_NOSNAP,
3539 &rbd_dev->header.obj_order,
3540 &rbd_dev->header.image_size);
3541}
3542
Alex Elder1e130192012-07-03 16:01:19 -05003543static int rbd_dev_v2_object_prefix(struct rbd_device *rbd_dev)
3544{
3545 void *reply_buf;
3546 int ret;
3547 void *p;
3548
3549 reply_buf = kzalloc(RBD_OBJ_PREFIX_LEN_MAX, GFP_KERNEL);
3550 if (!reply_buf)
3551 return -ENOMEM;
3552
Alex Elder36be9a72013-01-19 00:30:28 -06003553 ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
Alex Elder41579762013-04-21 12:14:45 -05003554 "rbd", "get_object_prefix", NULL, 0,
Alex Elder07b23912012-11-09 08:43:16 -06003555 reply_buf, RBD_OBJ_PREFIX_LEN_MAX, NULL);
Alex Elder36be9a72013-01-19 00:30:28 -06003556 dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
Alex Elder1e130192012-07-03 16:01:19 -05003557 if (ret < 0)
3558 goto out;
3559
3560 p = reply_buf;
3561 rbd_dev->header.object_prefix = ceph_extract_encoded_string(&p,
Alex Elder57385b52013-04-21 12:14:45 -05003562 p + ret, NULL, GFP_NOIO);
3563 ret = 0;
Alex Elder1e130192012-07-03 16:01:19 -05003564
3565 if (IS_ERR(rbd_dev->header.object_prefix)) {
3566 ret = PTR_ERR(rbd_dev->header.object_prefix);
3567 rbd_dev->header.object_prefix = NULL;
3568 } else {
3569 dout(" object_prefix = %s\n", rbd_dev->header.object_prefix);
3570 }
Alex Elder1e130192012-07-03 16:01:19 -05003571out:
3572 kfree(reply_buf);
3573
3574 return ret;
3575}
3576
Alex Elderb1b54022012-07-03 16:01:19 -05003577static int _rbd_dev_v2_snap_features(struct rbd_device *rbd_dev, u64 snap_id,
3578 u64 *snap_features)
3579{
3580 __le64 snapid = cpu_to_le64(snap_id);
3581 struct {
3582 __le64 features;
3583 __le64 incompat;
Alex Elder41579762013-04-21 12:14:45 -05003584 } __attribute__ ((packed)) features_buf = { 0 };
Alex Elderd8891402012-10-09 13:50:17 -07003585 u64 incompat;
Alex Elderb1b54022012-07-03 16:01:19 -05003586 int ret;
3587
Alex Elder36be9a72013-01-19 00:30:28 -06003588 ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
Alex Elderb1b54022012-07-03 16:01:19 -05003589 "rbd", "get_features",
Alex Elder41579762013-04-21 12:14:45 -05003590 &snapid, sizeof (snapid),
3591 &features_buf, sizeof (features_buf), NULL);
Alex Elder36be9a72013-01-19 00:30:28 -06003592 dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
Alex Elderb1b54022012-07-03 16:01:19 -05003593 if (ret < 0)
3594 return ret;
Alex Elder57385b52013-04-21 12:14:45 -05003595 if (ret < sizeof (features_buf))
3596 return -ERANGE;
Alex Elderd8891402012-10-09 13:50:17 -07003597
3598 incompat = le64_to_cpu(features_buf.incompat);
Alex Elder5cbf6f122013-04-11 09:29:48 -05003599 if (incompat & ~RBD_FEATURES_SUPPORTED)
Alex Elderb8f5c6e2012-11-01 08:39:26 -05003600 return -ENXIO;
Alex Elderd8891402012-10-09 13:50:17 -07003601
Alex Elderb1b54022012-07-03 16:01:19 -05003602 *snap_features = le64_to_cpu(features_buf.features);
3603
3604 dout(" snap_id 0x%016llx features = 0x%016llx incompat = 0x%016llx\n",
Alex Elder57385b52013-04-21 12:14:45 -05003605 (unsigned long long)snap_id,
3606 (unsigned long long)*snap_features,
3607 (unsigned long long)le64_to_cpu(features_buf.incompat));
Alex Elderb1b54022012-07-03 16:01:19 -05003608
3609 return 0;
3610}
3611
3612static int rbd_dev_v2_features(struct rbd_device *rbd_dev)
3613{
3614 return _rbd_dev_v2_snap_features(rbd_dev, CEPH_NOSNAP,
3615 &rbd_dev->header.features);
3616}
3617
Alex Elder86b00e02012-10-25 23:34:42 -05003618static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev)
3619{
3620 struct rbd_spec *parent_spec;
3621 size_t size;
3622 void *reply_buf = NULL;
3623 __le64 snapid;
3624 void *p;
3625 void *end;
3626 char *image_id;
3627 u64 overlap;
Alex Elder86b00e02012-10-25 23:34:42 -05003628 int ret;
3629
3630 parent_spec = rbd_spec_alloc();
3631 if (!parent_spec)
3632 return -ENOMEM;
3633
3634 size = sizeof (__le64) + /* pool_id */
3635 sizeof (__le32) + RBD_IMAGE_ID_LEN_MAX + /* image_id */
3636 sizeof (__le64) + /* snap_id */
3637 sizeof (__le64); /* overlap */
3638 reply_buf = kmalloc(size, GFP_KERNEL);
3639 if (!reply_buf) {
3640 ret = -ENOMEM;
3641 goto out_err;
3642 }
3643
3644 snapid = cpu_to_le64(CEPH_NOSNAP);
Alex Elder36be9a72013-01-19 00:30:28 -06003645 ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
Alex Elder86b00e02012-10-25 23:34:42 -05003646 "rbd", "get_parent",
Alex Elder41579762013-04-21 12:14:45 -05003647 &snapid, sizeof (snapid),
3648 reply_buf, size, NULL);
Alex Elder36be9a72013-01-19 00:30:28 -06003649 dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
Alex Elder86b00e02012-10-25 23:34:42 -05003650 if (ret < 0)
3651 goto out_err;
3652
Alex Elder86b00e02012-10-25 23:34:42 -05003653 p = reply_buf;
Alex Elder57385b52013-04-21 12:14:45 -05003654 end = reply_buf + ret;
3655 ret = -ERANGE;
Alex Elder86b00e02012-10-25 23:34:42 -05003656 ceph_decode_64_safe(&p, end, parent_spec->pool_id, out_err);
3657 if (parent_spec->pool_id == CEPH_NOPOOL)
3658 goto out; /* No parent? No problem. */
3659
Alex Elder0903e872012-11-14 12:25:19 -06003660 /* The ceph file layout needs to fit pool id in 32 bits */
3661
3662 ret = -EIO;
Alex Elderc0cd10db2013-04-26 09:43:47 -05003663 if (parent_spec->pool_id > (u64)U32_MAX) {
3664 rbd_warn(NULL, "parent pool id too large (%llu > %u)\n",
3665 (unsigned long long)parent_spec->pool_id, U32_MAX);
Alex Elder57385b52013-04-21 12:14:45 -05003666 goto out_err;
Alex Elderc0cd10db2013-04-26 09:43:47 -05003667 }
Alex Elder0903e872012-11-14 12:25:19 -06003668
Alex Elder979ed482012-11-01 08:39:26 -05003669 image_id = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL);
Alex Elder86b00e02012-10-25 23:34:42 -05003670 if (IS_ERR(image_id)) {
3671 ret = PTR_ERR(image_id);
3672 goto out_err;
3673 }
3674 parent_spec->image_id = image_id;
3675 ceph_decode_64_safe(&p, end, parent_spec->snap_id, out_err);
3676 ceph_decode_64_safe(&p, end, overlap, out_err);
3677
3678 rbd_dev->parent_overlap = overlap;
3679 rbd_dev->parent_spec = parent_spec;
3680 parent_spec = NULL; /* rbd_dev now owns this */
3681out:
3682 ret = 0;
3683out_err:
3684 kfree(reply_buf);
3685 rbd_spec_put(parent_spec);
3686
3687 return ret;
3688}
3689
Alex Eldercc070d52013-04-21 12:14:45 -05003690static int rbd_dev_v2_striping_info(struct rbd_device *rbd_dev)
3691{
3692 struct {
3693 __le64 stripe_unit;
3694 __le64 stripe_count;
3695 } __attribute__ ((packed)) striping_info_buf = { 0 };
3696 size_t size = sizeof (striping_info_buf);
3697 void *p;
3698 u64 obj_size;
3699 u64 stripe_unit;
3700 u64 stripe_count;
3701 int ret;
3702
3703 ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
3704 "rbd", "get_stripe_unit_count", NULL, 0,
3705 (char *)&striping_info_buf, size, NULL);
3706 dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
3707 if (ret < 0)
3708 return ret;
3709 if (ret < size)
3710 return -ERANGE;
3711
3712 /*
3713 * We don't actually support the "fancy striping" feature
3714 * (STRIPINGV2) yet, but if the striping sizes are the
3715 * defaults the behavior is the same as before. So find
3716 * out, and only fail if the image has non-default values.
3717 */
3718 ret = -EINVAL;
3719 obj_size = (u64)1 << rbd_dev->header.obj_order;
3720 p = &striping_info_buf;
3721 stripe_unit = ceph_decode_64(&p);
3722 if (stripe_unit != obj_size) {
3723 rbd_warn(rbd_dev, "unsupported stripe unit "
3724 "(got %llu want %llu)",
3725 stripe_unit, obj_size);
3726 return -EINVAL;
3727 }
3728 stripe_count = ceph_decode_64(&p);
3729 if (stripe_count != 1) {
3730 rbd_warn(rbd_dev, "unsupported stripe count "
3731 "(got %llu want 1)", stripe_count);
3732 return -EINVAL;
3733 }
Alex Elder500d0c02013-04-26 09:43:47 -05003734 rbd_dev->header.stripe_unit = stripe_unit;
3735 rbd_dev->header.stripe_count = stripe_count;
Alex Eldercc070d52013-04-21 12:14:45 -05003736
3737 return 0;
3738}
3739
Alex Elder9e15b772012-10-30 19:40:33 -05003740static char *rbd_dev_image_name(struct rbd_device *rbd_dev)
3741{
3742 size_t image_id_size;
3743 char *image_id;
3744 void *p;
3745 void *end;
3746 size_t size;
3747 void *reply_buf = NULL;
3748 size_t len = 0;
3749 char *image_name = NULL;
3750 int ret;
3751
3752 rbd_assert(!rbd_dev->spec->image_name);
3753
Alex Elder69e7a022012-11-01 08:39:26 -05003754 len = strlen(rbd_dev->spec->image_id);
3755 image_id_size = sizeof (__le32) + len;
Alex Elder9e15b772012-10-30 19:40:33 -05003756 image_id = kmalloc(image_id_size, GFP_KERNEL);
3757 if (!image_id)
3758 return NULL;
3759
3760 p = image_id;
Alex Elder41579762013-04-21 12:14:45 -05003761 end = image_id + image_id_size;
Alex Elder57385b52013-04-21 12:14:45 -05003762 ceph_encode_string(&p, end, rbd_dev->spec->image_id, (u32)len);
Alex Elder9e15b772012-10-30 19:40:33 -05003763
3764 size = sizeof (__le32) + RBD_IMAGE_NAME_LEN_MAX;
3765 reply_buf = kmalloc(size, GFP_KERNEL);
3766 if (!reply_buf)
3767 goto out;
3768
Alex Elder36be9a72013-01-19 00:30:28 -06003769 ret = rbd_obj_method_sync(rbd_dev, RBD_DIRECTORY,
Alex Elder9e15b772012-10-30 19:40:33 -05003770 "rbd", "dir_get_name",
3771 image_id, image_id_size,
Alex Elder41579762013-04-21 12:14:45 -05003772 reply_buf, size, NULL);
Alex Elder9e15b772012-10-30 19:40:33 -05003773 if (ret < 0)
3774 goto out;
3775 p = reply_buf;
Alex Elderf40eb342013-04-25 15:09:42 -05003776 end = reply_buf + ret;
3777
Alex Elder9e15b772012-10-30 19:40:33 -05003778 image_name = ceph_extract_encoded_string(&p, end, &len, GFP_KERNEL);
3779 if (IS_ERR(image_name))
3780 image_name = NULL;
3781 else
3782 dout("%s: name is %s len is %zd\n", __func__, image_name, len);
3783out:
3784 kfree(reply_buf);
3785 kfree(image_id);
3786
3787 return image_name;
3788}
3789
3790/*
Alex Elder2e9f7f12013-04-26 09:43:48 -05003791 * When an rbd image has a parent image, it is identified by the
3792 * pool, image, and snapshot ids (not names). This function fills
3793 * in the names for those ids. (It's OK if we can't figure out the
3794 * name for an image id, but the pool and snapshot ids should always
3795 * exist and have names.) All names in an rbd spec are dynamically
3796 * allocated.
Alex Eldere1d42132013-04-25 23:15:08 -05003797 *
3798 * When an image being mapped (not a parent) is probed, we have the
3799 * pool name and pool id, image name and image id, and the snapshot
3800 * name. The only thing we're missing is the snapshot id.
Alex Elder2e9f7f12013-04-26 09:43:48 -05003801 *
3802 * The set of snapshots for an image is not known until they have
3803 * been read by rbd_dev_snaps_update(), so we can't completely fill
3804 * in this information until after that has been called.
Alex Elder9e15b772012-10-30 19:40:33 -05003805 */
Alex Elder2e9f7f12013-04-26 09:43:48 -05003806static int rbd_dev_spec_update(struct rbd_device *rbd_dev)
Alex Elder9e15b772012-10-30 19:40:33 -05003807{
Alex Elder2e9f7f12013-04-26 09:43:48 -05003808 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
3809 struct rbd_spec *spec = rbd_dev->spec;
3810 const char *pool_name;
3811 const char *image_name;
3812 const char *snap_name;
Alex Elder9e15b772012-10-30 19:40:33 -05003813 int ret;
3814
Alex Eldere1d42132013-04-25 23:15:08 -05003815 /*
3816 * An image being mapped will have the pool name (etc.), but
3817 * we need to look up the snapshot id.
3818 */
Alex Elder2e9f7f12013-04-26 09:43:48 -05003819 if (spec->pool_name) {
3820 if (strcmp(spec->snap_name, RBD_SNAP_HEAD_NAME)) {
Alex Eldere1d42132013-04-25 23:15:08 -05003821 struct rbd_snap *snap;
3822
Alex Elder2e9f7f12013-04-26 09:43:48 -05003823 snap = snap_by_name(rbd_dev, spec->snap_name);
Alex Eldere1d42132013-04-25 23:15:08 -05003824 if (!snap)
3825 return -ENOENT;
Alex Elder2e9f7f12013-04-26 09:43:48 -05003826 spec->snap_id = snap->id;
Alex Eldere1d42132013-04-25 23:15:08 -05003827 } else {
Alex Elder2e9f7f12013-04-26 09:43:48 -05003828 spec->snap_id = CEPH_NOSNAP;
Alex Eldere1d42132013-04-25 23:15:08 -05003829 }
3830
3831 return 0;
3832 }
Alex Elder9e15b772012-10-30 19:40:33 -05003833
Alex Elder2e9f7f12013-04-26 09:43:48 -05003834 /* Get the pool name; we have to make our own copy of this */
Alex Elder9e15b772012-10-30 19:40:33 -05003835
Alex Elder2e9f7f12013-04-26 09:43:48 -05003836 pool_name = ceph_pg_pool_name_by_id(osdc->osdmap, spec->pool_id);
3837 if (!pool_name) {
3838 rbd_warn(rbd_dev, "no pool with id %llu", spec->pool_id);
Alex Elder935dc892012-11-01 10:17:15 -05003839 return -EIO;
3840 }
Alex Elder2e9f7f12013-04-26 09:43:48 -05003841 pool_name = kstrdup(pool_name, GFP_KERNEL);
3842 if (!pool_name)
Alex Elder9e15b772012-10-30 19:40:33 -05003843 return -ENOMEM;
3844
3845 /* Fetch the image name; tolerate failure here */
3846
Alex Elder2e9f7f12013-04-26 09:43:48 -05003847 image_name = rbd_dev_image_name(rbd_dev);
3848 if (!image_name)
Alex Elder06ecc6c2012-11-01 10:17:15 -05003849 rbd_warn(rbd_dev, "unable to get image name");
Alex Elder9e15b772012-10-30 19:40:33 -05003850
Alex Elder2e9f7f12013-04-26 09:43:48 -05003851 /* Look up the snapshot name, and make a copy */
Alex Elder9e15b772012-10-30 19:40:33 -05003852
Alex Elder2e9f7f12013-04-26 09:43:48 -05003853 snap_name = rbd_snap_name(rbd_dev, spec->snap_id);
3854 if (!snap_name) {
3855 rbd_warn(rbd_dev, "no snapshot with id %llu", spec->snap_id);
Alex Elder9e15b772012-10-30 19:40:33 -05003856 ret = -EIO;
3857 goto out_err;
3858 }
Alex Elder2e9f7f12013-04-26 09:43:48 -05003859 snap_name = kstrdup(snap_name, GFP_KERNEL);
3860 if (!snap_name) {
3861 ret = -ENOMEM;
Alex Elder9e15b772012-10-30 19:40:33 -05003862 goto out_err;
Alex Elder2e9f7f12013-04-26 09:43:48 -05003863 }
3864
3865 spec->pool_name = pool_name;
3866 spec->image_name = image_name;
3867 spec->snap_name = snap_name;
Alex Elder9e15b772012-10-30 19:40:33 -05003868
3869 return 0;
3870out_err:
Alex Elder2e9f7f12013-04-26 09:43:48 -05003871 kfree(image_name);
3872 kfree(pool_name);
Alex Elder9e15b772012-10-30 19:40:33 -05003873
3874 return ret;
3875}
3876
Alex Elder6e14b1a2012-07-03 16:01:19 -05003877static int rbd_dev_v2_snap_context(struct rbd_device *rbd_dev, u64 *ver)
Alex Elder35d489f2012-07-03 16:01:19 -05003878{
3879 size_t size;
3880 int ret;
3881 void *reply_buf;
3882 void *p;
3883 void *end;
3884 u64 seq;
3885 u32 snap_count;
3886 struct ceph_snap_context *snapc;
3887 u32 i;
3888
3889 /*
3890 * We'll need room for the seq value (maximum snapshot id),
3891 * snapshot count, and array of that many snapshot ids.
3892 * For now we have a fixed upper limit on the number we're
3893 * prepared to receive.
3894 */
3895 size = sizeof (__le64) + sizeof (__le32) +
3896 RBD_MAX_SNAP_COUNT * sizeof (__le64);
3897 reply_buf = kzalloc(size, GFP_KERNEL);
3898 if (!reply_buf)
3899 return -ENOMEM;
3900
Alex Elder36be9a72013-01-19 00:30:28 -06003901 ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
Alex Elder41579762013-04-21 12:14:45 -05003902 "rbd", "get_snapcontext", NULL, 0,
Alex Elder07b23912012-11-09 08:43:16 -06003903 reply_buf, size, ver);
Alex Elder36be9a72013-01-19 00:30:28 -06003904 dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
Alex Elder35d489f2012-07-03 16:01:19 -05003905 if (ret < 0)
3906 goto out;
3907
Alex Elder35d489f2012-07-03 16:01:19 -05003908 p = reply_buf;
Alex Elder57385b52013-04-21 12:14:45 -05003909 end = reply_buf + ret;
3910 ret = -ERANGE;
Alex Elder35d489f2012-07-03 16:01:19 -05003911 ceph_decode_64_safe(&p, end, seq, out);
3912 ceph_decode_32_safe(&p, end, snap_count, out);
3913
3914 /*
3915 * Make sure the reported number of snapshot ids wouldn't go
3916 * beyond the end of our buffer. But before checking that,
3917 * make sure the computed size of the snapshot context we
3918 * allocate is representable in a size_t.
3919 */
3920 if (snap_count > (SIZE_MAX - sizeof (struct ceph_snap_context))
3921 / sizeof (u64)) {
3922 ret = -EINVAL;
3923 goto out;
3924 }
3925 if (!ceph_has_room(&p, end, snap_count * sizeof (__le64)))
3926 goto out;
Alex Elder468521c2013-04-26 09:43:47 -05003927 ret = 0;
Alex Elder35d489f2012-07-03 16:01:19 -05003928
Alex Elder468521c2013-04-26 09:43:47 -05003929 snapc = rbd_snap_context_create(snap_count);
Alex Elder35d489f2012-07-03 16:01:19 -05003930 if (!snapc) {
3931 ret = -ENOMEM;
3932 goto out;
3933 }
Alex Elder35d489f2012-07-03 16:01:19 -05003934 snapc->seq = seq;
Alex Elder35d489f2012-07-03 16:01:19 -05003935 for (i = 0; i < snap_count; i++)
3936 snapc->snaps[i] = ceph_decode_64(&p);
3937
3938 rbd_dev->header.snapc = snapc;
3939
3940 dout(" snap context seq = %llu, snap_count = %u\n",
Alex Elder57385b52013-04-21 12:14:45 -05003941 (unsigned long long)seq, (unsigned int)snap_count);
Alex Elder35d489f2012-07-03 16:01:19 -05003942out:
3943 kfree(reply_buf);
3944
Alex Elder57385b52013-04-21 12:14:45 -05003945 return ret;
Alex Elder35d489f2012-07-03 16:01:19 -05003946}
3947
Alex Elderb8b1e2d2012-07-03 16:01:19 -05003948static char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev, u32 which)
3949{
3950 size_t size;
3951 void *reply_buf;
3952 __le64 snap_id;
3953 int ret;
3954 void *p;
3955 void *end;
Alex Elderb8b1e2d2012-07-03 16:01:19 -05003956 char *snap_name;
3957
3958 size = sizeof (__le32) + RBD_MAX_SNAP_NAME_LEN;
3959 reply_buf = kmalloc(size, GFP_KERNEL);
3960 if (!reply_buf)
3961 return ERR_PTR(-ENOMEM);
3962
Alex Elderacb1b6c2013-04-25 15:09:41 -05003963 rbd_assert(which < rbd_dev->header.snapc->num_snaps);
Alex Elderb8b1e2d2012-07-03 16:01:19 -05003964 snap_id = cpu_to_le64(rbd_dev->header.snapc->snaps[which]);
Alex Elder36be9a72013-01-19 00:30:28 -06003965 ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
Alex Elderb8b1e2d2012-07-03 16:01:19 -05003966 "rbd", "get_snapshot_name",
Alex Elder41579762013-04-21 12:14:45 -05003967 &snap_id, sizeof (snap_id),
Alex Elder07b23912012-11-09 08:43:16 -06003968 reply_buf, size, NULL);
Alex Elder36be9a72013-01-19 00:30:28 -06003969 dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
Alex Elderf40eb342013-04-25 15:09:42 -05003970 if (ret < 0) {
3971 snap_name = ERR_PTR(ret);
Alex Elderb8b1e2d2012-07-03 16:01:19 -05003972 goto out;
Alex Elderf40eb342013-04-25 15:09:42 -05003973 }
Alex Elderb8b1e2d2012-07-03 16:01:19 -05003974
3975 p = reply_buf;
Alex Elderf40eb342013-04-25 15:09:42 -05003976 end = reply_buf + ret;
Alex Eldere5c35532012-10-25 23:34:41 -05003977 snap_name = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL);
Alex Elderf40eb342013-04-25 15:09:42 -05003978 if (IS_ERR(snap_name))
Alex Elderb8b1e2d2012-07-03 16:01:19 -05003979 goto out;
Alex Elderb8b1e2d2012-07-03 16:01:19 -05003980
Alex Elderf40eb342013-04-25 15:09:42 -05003981 dout(" snap_id 0x%016llx snap_name = %s\n",
3982 (unsigned long long)le64_to_cpu(snap_id), snap_name);
Alex Elderb8b1e2d2012-07-03 16:01:19 -05003983out:
3984 kfree(reply_buf);
3985
Alex Elderf40eb342013-04-25 15:09:42 -05003986 return snap_name;
Alex Elderb8b1e2d2012-07-03 16:01:19 -05003987}
3988
3989static char *rbd_dev_v2_snap_info(struct rbd_device *rbd_dev, u32 which,
3990 u64 *snap_size, u64 *snap_features)
3991{
Alex Eldere0b49862013-01-09 14:44:18 -06003992 u64 snap_id;
Alex Elderacb1b6c2013-04-25 15:09:41 -05003993 u64 size;
3994 u64 features;
3995 char *snap_name;
Alex Elderb8b1e2d2012-07-03 16:01:19 -05003996 int ret;
3997
Alex Elderacb1b6c2013-04-25 15:09:41 -05003998 rbd_assert(which < rbd_dev->header.snapc->num_snaps);
Alex Elderb8b1e2d2012-07-03 16:01:19 -05003999 snap_id = rbd_dev->header.snapc->snaps[which];
Alex Elderacb1b6c2013-04-25 15:09:41 -05004000 ret = _rbd_dev_v2_snap_size(rbd_dev, snap_id, NULL, &size);
Alex Elderb8b1e2d2012-07-03 16:01:19 -05004001 if (ret)
Alex Elderacb1b6c2013-04-25 15:09:41 -05004002 goto out_err;
Alex Elderb8b1e2d2012-07-03 16:01:19 -05004003
Alex Elderacb1b6c2013-04-25 15:09:41 -05004004 ret = _rbd_dev_v2_snap_features(rbd_dev, snap_id, &features);
4005 if (ret)
4006 goto out_err;
4007
4008 snap_name = rbd_dev_v2_snap_name(rbd_dev, which);
4009 if (!IS_ERR(snap_name)) {
4010 *snap_size = size;
4011 *snap_features = features;
4012 }
4013
4014 return snap_name;
4015out_err:
4016 return ERR_PTR(ret);
Alex Elderb8b1e2d2012-07-03 16:01:19 -05004017}
4018
4019static char *rbd_dev_snap_info(struct rbd_device *rbd_dev, u32 which,
4020 u64 *snap_size, u64 *snap_features)
4021{
4022 if (rbd_dev->image_format == 1)
4023 return rbd_dev_v1_snap_info(rbd_dev, which,
4024 snap_size, snap_features);
4025 if (rbd_dev->image_format == 2)
4026 return rbd_dev_v2_snap_info(rbd_dev, which,
4027 snap_size, snap_features);
4028 return ERR_PTR(-EINVAL);
4029}
4030
Alex Elder117973f2012-08-31 17:29:55 -05004031static int rbd_dev_v2_refresh(struct rbd_device *rbd_dev, u64 *hver)
4032{
4033 int ret;
Alex Elder117973f2012-08-31 17:29:55 -05004034
4035 down_write(&rbd_dev->header_rwsem);
4036
Alex Elder117973f2012-08-31 17:29:55 -05004037 ret = rbd_dev_v2_image_size(rbd_dev);
4038 if (ret)
4039 goto out;
Alex Elder117973f2012-08-31 17:29:55 -05004040 rbd_update_mapping_size(rbd_dev);
4041
4042 ret = rbd_dev_v2_snap_context(rbd_dev, hver);
4043 dout("rbd_dev_v2_snap_context returned %d\n", ret);
4044 if (ret)
4045 goto out;
4046 ret = rbd_dev_snaps_update(rbd_dev);
4047 dout("rbd_dev_snaps_update returned %d\n", ret);
4048 if (ret)
4049 goto out;
Alex Elder117973f2012-08-31 17:29:55 -05004050out:
4051 up_write(&rbd_dev->header_rwsem);
4052
4053 return ret;
4054}
4055
Alex Elder9d475de2012-07-03 16:01:19 -05004056/*
Alex Elder35938152012-08-02 11:29:46 -05004057 * Scan the rbd device's current snapshot list and compare it to the
4058 * newly-received snapshot context. Remove any existing snapshots
4059 * not present in the new snapshot context. Add a new snapshot for
4060 * any snaphots in the snapshot context not in the current list.
4061 * And verify there are no changes to snapshots we already know
4062 * about.
4063 *
4064 * Assumes the snapshots in the snapshot context are sorted by
4065 * snapshot id, highest id first. (Snapshots in the rbd_dev's list
4066 * are also maintained in that order.)
Alex Elder522a0cc2013-04-25 15:09:41 -05004067 *
4068 * Note that any error occurs while updating the snapshot list
4069 * aborts the update, and the entire list is cleared. The snapshot
4070 * list becomes inconsistent at that point anyway, so it might as
4071 * well be empty.
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004072 */
Alex Elder304f6802012-08-31 17:29:52 -05004073static int rbd_dev_snaps_update(struct rbd_device *rbd_dev)
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004074{
Alex Elder35938152012-08-02 11:29:46 -05004075 struct ceph_snap_context *snapc = rbd_dev->header.snapc;
4076 const u32 snap_count = snapc->num_snaps;
Alex Elder35938152012-08-02 11:29:46 -05004077 struct list_head *head = &rbd_dev->snaps;
4078 struct list_head *links = head->next;
4079 u32 index = 0;
Alex Elder522a0cc2013-04-25 15:09:41 -05004080 int ret = 0;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004081
Alex Elder522a0cc2013-04-25 15:09:41 -05004082 dout("%s: snap count is %u\n", __func__, (unsigned int)snap_count);
Alex Elder35938152012-08-02 11:29:46 -05004083 while (index < snap_count || links != head) {
4084 u64 snap_id;
4085 struct rbd_snap *snap;
Alex Eldercd892122012-07-03 16:01:19 -05004086 char *snap_name;
4087 u64 snap_size = 0;
4088 u64 snap_features = 0;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004089
Alex Elder35938152012-08-02 11:29:46 -05004090 snap_id = index < snap_count ? snapc->snaps[index]
4091 : CEPH_NOSNAP;
4092 snap = links != head ? list_entry(links, struct rbd_snap, node)
4093 : NULL;
Alex Elderaafb2302012-09-06 16:00:54 -05004094 rbd_assert(!snap || snap->id != CEPH_NOSNAP);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004095
Alex Elder35938152012-08-02 11:29:46 -05004096 if (snap_id == CEPH_NOSNAP || (snap && snap->id > snap_id)) {
4097 struct list_head *next = links->next;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004098
Alex Elder6d292902013-01-14 12:43:31 -06004099 /*
4100 * A previously-existing snapshot is not in
4101 * the new snap context.
4102 *
Alex Elder522a0cc2013-04-25 15:09:41 -05004103 * If the now-missing snapshot is the one
4104 * the image represents, clear its existence
4105 * flag so we can avoid sending any more
4106 * requests to it.
Alex Elder6d292902013-01-14 12:43:31 -06004107 */
Alex Elder0d7dbfc2012-10-25 23:34:41 -05004108 if (rbd_dev->spec->snap_id == snap->id)
Alex Elder6d292902013-01-14 12:43:31 -06004109 clear_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags);
Alex Elder3e83b652013-04-23 13:52:53 -05004110 dout("removing %ssnap id %llu\n",
Alex Elder0d7dbfc2012-10-25 23:34:41 -05004111 rbd_dev->spec->snap_id == snap->id ?
4112 "mapped " : "",
Alex Elder522a0cc2013-04-25 15:09:41 -05004113 (unsigned long long)snap->id);
Alex Elder6087b512013-04-25 15:09:41 -05004114
4115 list_del(&snap->node);
4116 rbd_snap_destroy(snap);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004117
Alex Elder35938152012-08-02 11:29:46 -05004118 /* Done with this list entry; advance */
4119
4120 links = next;
4121 continue;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004122 }
Alex Elder35938152012-08-02 11:29:46 -05004123
Alex Elderb8b1e2d2012-07-03 16:01:19 -05004124 snap_name = rbd_dev_snap_info(rbd_dev, index,
4125 &snap_size, &snap_features);
Alex Elder522a0cc2013-04-25 15:09:41 -05004126 if (IS_ERR(snap_name)) {
4127 ret = PTR_ERR(snap_name);
4128 dout("failed to get snap info, error %d\n", ret);
4129 goto out_err;
4130 }
Alex Eldercd892122012-07-03 16:01:19 -05004131
Alex Elder522a0cc2013-04-25 15:09:41 -05004132 dout("entry %u: snap_id = %llu\n", (unsigned int)snap_count,
4133 (unsigned long long)snap_id);
Alex Elder35938152012-08-02 11:29:46 -05004134 if (!snap || (snap_id != CEPH_NOSNAP && snap->id < snap_id)) {
4135 struct rbd_snap *new_snap;
4136
4137 /* We haven't seen this snapshot before */
4138
Alex Elder6087b512013-04-25 15:09:41 -05004139 new_snap = rbd_snap_create(rbd_dev, snap_name,
Alex Eldercd892122012-07-03 16:01:19 -05004140 snap_id, snap_size, snap_features);
Alex Elder9fcbb802012-08-23 23:48:49 -05004141 if (IS_ERR(new_snap)) {
Alex Elder522a0cc2013-04-25 15:09:41 -05004142 ret = PTR_ERR(new_snap);
4143 dout(" failed to add dev, error %d\n", ret);
4144 goto out_err;
Alex Elder9fcbb802012-08-23 23:48:49 -05004145 }
Alex Elder35938152012-08-02 11:29:46 -05004146
4147 /* New goes before existing, or at end of list */
4148
Alex Elder9fcbb802012-08-23 23:48:49 -05004149 dout(" added dev%s\n", snap ? "" : " at end\n");
Alex Elder35938152012-08-02 11:29:46 -05004150 if (snap)
4151 list_add_tail(&new_snap->node, &snap->node);
4152 else
Alex Elder523f3252012-08-30 00:16:37 -05004153 list_add_tail(&new_snap->node, head);
Alex Elder35938152012-08-02 11:29:46 -05004154 } else {
4155 /* Already have this one */
4156
Alex Elder9fcbb802012-08-23 23:48:49 -05004157 dout(" already present\n");
4158
Alex Eldercd892122012-07-03 16:01:19 -05004159 rbd_assert(snap->size == snap_size);
Alex Elderaafb2302012-09-06 16:00:54 -05004160 rbd_assert(!strcmp(snap->name, snap_name));
Alex Eldercd892122012-07-03 16:01:19 -05004161 rbd_assert(snap->features == snap_features);
Alex Elder35938152012-08-02 11:29:46 -05004162
4163 /* Done with this list entry; advance */
4164
4165 links = links->next;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004166 }
Alex Elder35938152012-08-02 11:29:46 -05004167
4168 /* Advance to the next entry in the snapshot context */
4169
4170 index++;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004171 }
Alex Elder9fcbb802012-08-23 23:48:49 -05004172 dout("%s: done\n", __func__);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004173
4174 return 0;
Alex Elder522a0cc2013-04-25 15:09:41 -05004175out_err:
4176 rbd_remove_all_snaps(rbd_dev);
4177
4178 return ret;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004179}
4180
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004181static int rbd_bus_add_dev(struct rbd_device *rbd_dev)
4182{
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004183 struct device *dev;
Alex Eldercd789ab2012-08-30 00:16:38 -05004184 int ret;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004185
4186 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004187
Alex Eldercd789ab2012-08-30 00:16:38 -05004188 dev = &rbd_dev->dev;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004189 dev->bus = &rbd_bus_type;
4190 dev->type = &rbd_device_type;
4191 dev->parent = &rbd_root_dev;
Alex Elder200a6a82013-04-28 23:32:34 -05004192 dev->release = rbd_dev_device_release;
Alex Elderde71a292012-07-03 16:01:19 -05004193 dev_set_name(dev, "%d", rbd_dev->dev_id);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004194 ret = device_register(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004195
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004196 mutex_unlock(&ctl_mutex);
Alex Eldercd789ab2012-08-30 00:16:38 -05004197
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004198 return ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004199}
4200
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004201static void rbd_bus_del_dev(struct rbd_device *rbd_dev)
4202{
4203 device_unregister(&rbd_dev->dev);
4204}
4205
Alex Eldere2839302012-08-29 17:11:06 -05004206static atomic64_t rbd_dev_id_max = ATOMIC64_INIT(0);
Alex Elder1ddbe942012-01-29 13:57:44 -06004207
4208/*
Alex Elder499afd52012-02-02 08:13:29 -06004209 * Get a unique rbd identifier for the given new rbd_dev, and add
4210 * the rbd_dev to the global list. The minimum rbd id is 1.
Alex Elder1ddbe942012-01-29 13:57:44 -06004211 */
Alex Eldere2839302012-08-29 17:11:06 -05004212static void rbd_dev_id_get(struct rbd_device *rbd_dev)
Alex Elderb7f23c32012-01-29 13:57:43 -06004213{
Alex Eldere2839302012-08-29 17:11:06 -05004214 rbd_dev->dev_id = atomic64_inc_return(&rbd_dev_id_max);
Alex Elder499afd52012-02-02 08:13:29 -06004215
4216 spin_lock(&rbd_dev_list_lock);
4217 list_add_tail(&rbd_dev->node, &rbd_dev_list);
4218 spin_unlock(&rbd_dev_list_lock);
Alex Eldere2839302012-08-29 17:11:06 -05004219 dout("rbd_dev %p given dev id %llu\n", rbd_dev,
4220 (unsigned long long) rbd_dev->dev_id);
Alex Elder1ddbe942012-01-29 13:57:44 -06004221}
Alex Elderb7f23c32012-01-29 13:57:43 -06004222
Alex Elder1ddbe942012-01-29 13:57:44 -06004223/*
Alex Elder499afd52012-02-02 08:13:29 -06004224 * Remove an rbd_dev from the global list, and record that its
4225 * identifier is no longer in use.
Alex Elder1ddbe942012-01-29 13:57:44 -06004226 */
Alex Eldere2839302012-08-29 17:11:06 -05004227static void rbd_dev_id_put(struct rbd_device *rbd_dev)
Alex Elder1ddbe942012-01-29 13:57:44 -06004228{
Alex Elderd184f6b2012-01-29 13:57:44 -06004229 struct list_head *tmp;
Alex Elderde71a292012-07-03 16:01:19 -05004230 int rbd_id = rbd_dev->dev_id;
Alex Elderd184f6b2012-01-29 13:57:44 -06004231 int max_id;
4232
Alex Elderaafb2302012-09-06 16:00:54 -05004233 rbd_assert(rbd_id > 0);
Alex Elder499afd52012-02-02 08:13:29 -06004234
Alex Eldere2839302012-08-29 17:11:06 -05004235 dout("rbd_dev %p released dev id %llu\n", rbd_dev,
4236 (unsigned long long) rbd_dev->dev_id);
Alex Elder499afd52012-02-02 08:13:29 -06004237 spin_lock(&rbd_dev_list_lock);
4238 list_del_init(&rbd_dev->node);
Alex Elderd184f6b2012-01-29 13:57:44 -06004239
4240 /*
4241 * If the id being "put" is not the current maximum, there
4242 * is nothing special we need to do.
4243 */
Alex Eldere2839302012-08-29 17:11:06 -05004244 if (rbd_id != atomic64_read(&rbd_dev_id_max)) {
Alex Elderd184f6b2012-01-29 13:57:44 -06004245 spin_unlock(&rbd_dev_list_lock);
4246 return;
4247 }
4248
4249 /*
4250 * We need to update the current maximum id. Search the
4251 * list to find out what it is. We're more likely to find
4252 * the maximum at the end, so search the list backward.
4253 */
4254 max_id = 0;
4255 list_for_each_prev(tmp, &rbd_dev_list) {
4256 struct rbd_device *rbd_dev;
4257
4258 rbd_dev = list_entry(tmp, struct rbd_device, node);
Alex Elderb213e0b2012-10-10 21:19:13 -07004259 if (rbd_dev->dev_id > max_id)
4260 max_id = rbd_dev->dev_id;
Alex Elderd184f6b2012-01-29 13:57:44 -06004261 }
Alex Elder499afd52012-02-02 08:13:29 -06004262 spin_unlock(&rbd_dev_list_lock);
Alex Elderb7f23c32012-01-29 13:57:43 -06004263
Alex Elder1ddbe942012-01-29 13:57:44 -06004264 /*
Alex Eldere2839302012-08-29 17:11:06 -05004265 * The max id could have been updated by rbd_dev_id_get(), in
Alex Elderd184f6b2012-01-29 13:57:44 -06004266 * which case it now accurately reflects the new maximum.
4267 * Be careful not to overwrite the maximum value in that
4268 * case.
Alex Elder1ddbe942012-01-29 13:57:44 -06004269 */
Alex Eldere2839302012-08-29 17:11:06 -05004270 atomic64_cmpxchg(&rbd_dev_id_max, rbd_id, max_id);
4271 dout(" max dev id has been reset\n");
Alex Elderb7f23c32012-01-29 13:57:43 -06004272}
4273
/*
 * Locate the next token in the '\0'-terminated string at *buf.  A
 * token is a run of characters that are not white space.
 *
 * Any white space at *buf is skipped, and *buf is updated to point
 * at the first character that follows it (which is the terminating
 * '\0' if no token remains).  Returns the length of the token
 * found there, or 0 if there is none.
 */
static inline size_t next_token(const char **buf)
{
	/*
	 * The characters for which isspace() is nonzero in the "C"
	 * and "POSIX" locales.
	 */
	const char *const white_space = " \f\n\r\t\v";
	const char *start;

	start = *buf + strspn(*buf, white_space);
	*buf = start;

	return strcspn(start, white_space);
}
4292
/*
 * Find the next token in the '\0'-terminated string at *buf and, if
 * it fits, copy it into the caller's token buffer.  The token fits
 * only if its length is less than token_size, which leaves room for
 * the '\0' that always terminates a copied token.  The token buffer
 * is left alone if the token does not fit.
 *
 * Returns the length of the token found (not including the '\0').
 * That is 0 if there is no token, and is >= token_size if the token
 * was too big to copy.
 *
 * On return *buf points just past the end of the token found, and
 * that is true whether or not the token was copied.
 */
static inline size_t copy_token(const char **buf,
				char *token,
				size_t token_size)
{
	size_t token_len = next_token(buf);
	const char *start = *buf;

	if (token_len < token_size) {
		memcpy(token, start, token_len);
		token[token_len] = '\0';
	}
	*buf = start + token_len;

	return token_len;
}
4322
4323/*
Alex Elderea3352f2012-07-09 21:04:23 -05004324 * Finds the next token in *buf, dynamically allocates a buffer big
4325 * enough to hold a copy of it, and copies the token into the new
4326 * buffer. The copy is guaranteed to be terminated with '\0'. Note
4327 * that a duplicate buffer is created even for a zero-length token.
4328 *
4329 * Returns a pointer to the newly-allocated duplicate, or a null
4330 * pointer if memory for the duplicate was not available. If
4331 * the lenp argument is a non-null pointer, the length of the token
4332 * (not including the '\0') is returned in *lenp.
4333 *
4334 * If successful, the *buf pointer will be updated to point beyond
4335 * the end of the found token.
4336 *
4337 * Note: uses GFP_KERNEL for allocation.
4338 */
4339static inline char *dup_token(const char **buf, size_t *lenp)
4340{
4341 char *dup;
4342 size_t len;
4343
4344 len = next_token(buf);
Alex Elder4caf35f2012-11-01 08:39:27 -05004345 dup = kmemdup(*buf, len + 1, GFP_KERNEL);
Alex Elderea3352f2012-07-09 21:04:23 -05004346 if (!dup)
4347 return NULL;
Alex Elderea3352f2012-07-09 21:04:23 -05004348 *(dup + len) = '\0';
4349 *buf += len;
4350
4351 if (lenp)
4352 *lenp = len;
4353
4354 return dup;
4355}
4356
4357/*
Alex Elder859c31d2012-10-25 23:34:42 -05004358 * Parse the options provided for an "rbd add" (i.e., rbd image
4359 * mapping) request. These arrive via a write to /sys/bus/rbd/add,
4360 * and the data written is passed here via a NUL-terminated buffer.
4361 * Returns 0 if successful or an error code otherwise.
Alex Elderd22f76e2012-07-12 10:46:35 -05004362 *
Alex Elder859c31d2012-10-25 23:34:42 -05004363 * The information extracted from these options is recorded in
4364 * the other parameters which return dynamically-allocated
4365 * structures:
4366 * ceph_opts
4367 * The address of a pointer that will refer to a ceph options
4368 * structure. Caller must release the returned pointer using
4369 * ceph_destroy_options() when it is no longer needed.
4370 * rbd_opts
4371 * Address of an rbd options pointer. Fully initialized by
4372 * this function; caller must release with kfree().
4373 * spec
4374 * Address of an rbd image specification pointer. Fully
4375 * initialized by this function based on parsed options.
4376 * Caller must release with rbd_spec_put().
4377 *
4378 * The options passed take this form:
4379 * <mon_addrs> <options> <pool_name> <image_name> [<snap_id>]
4380 * where:
4381 * <mon_addrs>
4382 * A comma-separated list of one or more monitor addresses.
4383 * A monitor address is an ip address, optionally followed
4384 * by a port number (separated by a colon).
4385 * I.e.: ip1[:port1][,ip2[:port2]...]
4386 * <options>
4387 * A comma-separated list of ceph and/or rbd options.
4388 * <pool_name>
4389 * The name of the rados pool containing the rbd image.
4390 * <image_name>
4391 * The name of the image in that pool to map.
4392 * <snap_id>
4393 * An optional snapshot id. If provided, the mapping will
4394 * present data from the image at the time that snapshot was
4395 * created. The image head is used if no snapshot id is
4396 * provided. Snapshot mappings are always read-only.
Alex Eldera725f65e2012-02-02 08:13:30 -06004397 */
Alex Elder859c31d2012-10-25 23:34:42 -05004398static int rbd_add_parse_args(const char *buf,
Alex Elderdc79b112012-10-25 23:34:41 -05004399 struct ceph_options **ceph_opts,
Alex Elder859c31d2012-10-25 23:34:42 -05004400 struct rbd_options **opts,
4401 struct rbd_spec **rbd_spec)
Alex Eldera725f65e2012-02-02 08:13:30 -06004402{
Alex Elderd22f76e2012-07-12 10:46:35 -05004403 size_t len;
Alex Elder859c31d2012-10-25 23:34:42 -05004404 char *options;
Alex Elder0ddebc02012-10-25 23:34:41 -05004405 const char *mon_addrs;
Alex Elderecb4dc222013-04-26 09:43:47 -05004406 char *snap_name;
Alex Elder0ddebc02012-10-25 23:34:41 -05004407 size_t mon_addrs_size;
Alex Elder859c31d2012-10-25 23:34:42 -05004408 struct rbd_spec *spec = NULL;
Alex Elder4e9afeb2012-10-25 23:34:41 -05004409 struct rbd_options *rbd_opts = NULL;
Alex Elder859c31d2012-10-25 23:34:42 -05004410 struct ceph_options *copts;
Alex Elderdc79b112012-10-25 23:34:41 -05004411 int ret;
Alex Eldere28fff262012-02-02 08:13:30 -06004412
4413 /* The first four tokens are required */
4414
Alex Elder7ef32142012-02-02 08:13:30 -06004415 len = next_token(&buf);
Alex Elder4fb5d6712012-11-01 10:17:15 -05004416 if (!len) {
4417 rbd_warn(NULL, "no monitor address(es) provided");
4418 return -EINVAL;
4419 }
Alex Elder0ddebc02012-10-25 23:34:41 -05004420 mon_addrs = buf;
Alex Elderf28e5652012-10-25 23:34:41 -05004421 mon_addrs_size = len + 1;
Alex Elder7ef32142012-02-02 08:13:30 -06004422 buf += len;
Alex Eldera725f65e2012-02-02 08:13:30 -06004423
Alex Elderdc79b112012-10-25 23:34:41 -05004424 ret = -EINVAL;
Alex Elderf28e5652012-10-25 23:34:41 -05004425 options = dup_token(&buf, NULL);
4426 if (!options)
Alex Elderdc79b112012-10-25 23:34:41 -05004427 return -ENOMEM;
Alex Elder4fb5d6712012-11-01 10:17:15 -05004428 if (!*options) {
4429 rbd_warn(NULL, "no options provided");
4430 goto out_err;
4431 }
Alex Eldera725f65e2012-02-02 08:13:30 -06004432
Alex Elder859c31d2012-10-25 23:34:42 -05004433 spec = rbd_spec_alloc();
4434 if (!spec)
Alex Elderf28e5652012-10-25 23:34:41 -05004435 goto out_mem;
Alex Elder859c31d2012-10-25 23:34:42 -05004436
4437 spec->pool_name = dup_token(&buf, NULL);
4438 if (!spec->pool_name)
4439 goto out_mem;
Alex Elder4fb5d6712012-11-01 10:17:15 -05004440 if (!*spec->pool_name) {
4441 rbd_warn(NULL, "no pool name provided");
4442 goto out_err;
4443 }
Alex Eldere28fff262012-02-02 08:13:30 -06004444
Alex Elder69e7a022012-11-01 08:39:26 -05004445 spec->image_name = dup_token(&buf, NULL);
Alex Elder859c31d2012-10-25 23:34:42 -05004446 if (!spec->image_name)
Alex Elderf28e5652012-10-25 23:34:41 -05004447 goto out_mem;
Alex Elder4fb5d6712012-11-01 10:17:15 -05004448 if (!*spec->image_name) {
4449 rbd_warn(NULL, "no image name provided");
4450 goto out_err;
4451 }
Alex Eldere28fff262012-02-02 08:13:30 -06004452
Alex Elderf28e5652012-10-25 23:34:41 -05004453 /*
4454 * Snapshot name is optional; default is to use "-"
4455 * (indicating the head/no snapshot).
4456 */
Alex Elder3feeb8942012-08-31 17:29:52 -05004457 len = next_token(&buf);
Alex Elder820a5f32012-07-09 21:04:24 -05004458 if (!len) {
Alex Elder3feeb8942012-08-31 17:29:52 -05004459 buf = RBD_SNAP_HEAD_NAME; /* No snapshot supplied */
4460 len = sizeof (RBD_SNAP_HEAD_NAME) - 1;
Alex Elderf28e5652012-10-25 23:34:41 -05004461 } else if (len > RBD_MAX_SNAP_NAME_LEN) {
Alex Elderdc79b112012-10-25 23:34:41 -05004462 ret = -ENAMETOOLONG;
Alex Elderf28e5652012-10-25 23:34:41 -05004463 goto out_err;
Alex Elder849b4262012-07-09 21:04:24 -05004464 }
Alex Elderecb4dc222013-04-26 09:43:47 -05004465 snap_name = kmemdup(buf, len + 1, GFP_KERNEL);
4466 if (!snap_name)
Alex Elderf28e5652012-10-25 23:34:41 -05004467 goto out_mem;
Alex Elderecb4dc222013-04-26 09:43:47 -05004468 *(snap_name + len) = '\0';
4469 spec->snap_name = snap_name;
Alex Eldere5c35532012-10-25 23:34:41 -05004470
Alex Elder0ddebc02012-10-25 23:34:41 -05004471 /* Initialize all rbd options to the defaults */
Alex Eldere28fff262012-02-02 08:13:30 -06004472
Alex Elder4e9afeb2012-10-25 23:34:41 -05004473 rbd_opts = kzalloc(sizeof (*rbd_opts), GFP_KERNEL);
4474 if (!rbd_opts)
4475 goto out_mem;
4476
4477 rbd_opts->read_only = RBD_READ_ONLY_DEFAULT;
Alex Elderd22f76e2012-07-12 10:46:35 -05004478
Alex Elder859c31d2012-10-25 23:34:42 -05004479 copts = ceph_parse_options(options, mon_addrs,
Alex Elder0ddebc02012-10-25 23:34:41 -05004480 mon_addrs + mon_addrs_size - 1,
Alex Elder4e9afeb2012-10-25 23:34:41 -05004481 parse_rbd_opts_token, rbd_opts);
Alex Elder859c31d2012-10-25 23:34:42 -05004482 if (IS_ERR(copts)) {
4483 ret = PTR_ERR(copts);
Alex Elderdc79b112012-10-25 23:34:41 -05004484 goto out_err;
4485 }
Alex Elder859c31d2012-10-25 23:34:42 -05004486 kfree(options);
4487
4488 *ceph_opts = copts;
Alex Elder4e9afeb2012-10-25 23:34:41 -05004489 *opts = rbd_opts;
Alex Elder859c31d2012-10-25 23:34:42 -05004490 *rbd_spec = spec;
Alex Elder0ddebc02012-10-25 23:34:41 -05004491
Alex Elderdc79b112012-10-25 23:34:41 -05004492 return 0;
Alex Elderf28e5652012-10-25 23:34:41 -05004493out_mem:
Alex Elderdc79b112012-10-25 23:34:41 -05004494 ret = -ENOMEM;
Alex Elderd22f76e2012-07-12 10:46:35 -05004495out_err:
Alex Elder859c31d2012-10-25 23:34:42 -05004496 kfree(rbd_opts);
4497 rbd_spec_put(spec);
Alex Elderf28e5652012-10-25 23:34:41 -05004498 kfree(options);
Alex Elderd22f76e2012-07-12 10:46:35 -05004499
Alex Elderdc79b112012-10-25 23:34:41 -05004500 return ret;
Alex Eldera725f65e2012-02-02 08:13:30 -06004501}
4502
Alex Elder589d30e2012-07-10 20:30:11 -05004503/*
4504 * An rbd format 2 image has a unique identifier, distinct from the
4505 * name given to it by the user. Internally, that identifier is
4506 * what's used to specify the names of objects related to the image.
4507 *
4508 * A special "rbd id" object is used to map an rbd image name to its
4509 * id. If that object doesn't exist, then there is no v2 rbd image
4510 * with the supplied name.
4511 *
4512 * This function will record the given rbd_dev's image_id field if
4513 * it can be determined, and in that case will return 0. If any
4514 * errors occur a negative errno will be returned and the rbd_dev's
4515 * image_id field will be unchanged (and should be NULL).
4516 */
4517static int rbd_dev_image_id(struct rbd_device *rbd_dev)
4518{
4519 int ret;
4520 size_t size;
4521 char *object_name;
4522 void *response;
Alex Elderc0fba362013-04-25 23:15:08 -05004523 char *image_id;
Alex Elder2f82ee52012-10-30 19:40:33 -05004524
Alex Elder589d30e2012-07-10 20:30:11 -05004525 /*
Alex Elder2c0d0a12012-10-30 19:40:33 -05004526 * When probing a parent image, the image id is already
4527 * known (and the image name likely is not). There's no
Alex Elderc0fba362013-04-25 23:15:08 -05004528 * need to fetch the image id again in this case. We
4529 * do still need to set the image format though.
Alex Elder2c0d0a12012-10-30 19:40:33 -05004530 */
Alex Elderc0fba362013-04-25 23:15:08 -05004531 if (rbd_dev->spec->image_id) {
4532 rbd_dev->image_format = *rbd_dev->spec->image_id ? 2 : 1;
4533
Alex Elder2c0d0a12012-10-30 19:40:33 -05004534 return 0;
Alex Elderc0fba362013-04-25 23:15:08 -05004535 }
Alex Elder2c0d0a12012-10-30 19:40:33 -05004536
4537 /*
Alex Elder589d30e2012-07-10 20:30:11 -05004538 * First, see if the format 2 image id file exists, and if
4539 * so, get the image's persistent id from it.
4540 */
Alex Elder69e7a022012-11-01 08:39:26 -05004541 size = sizeof (RBD_ID_PREFIX) + strlen(rbd_dev->spec->image_name);
Alex Elder589d30e2012-07-10 20:30:11 -05004542 object_name = kmalloc(size, GFP_NOIO);
4543 if (!object_name)
4544 return -ENOMEM;
Alex Elder0d7dbfc2012-10-25 23:34:41 -05004545 sprintf(object_name, "%s%s", RBD_ID_PREFIX, rbd_dev->spec->image_name);
Alex Elder589d30e2012-07-10 20:30:11 -05004546 dout("rbd id object name is %s\n", object_name);
4547
4548 /* Response will be an encoded string, which includes a length */
4549
4550 size = sizeof (__le32) + RBD_IMAGE_ID_LEN_MAX;
4551 response = kzalloc(size, GFP_NOIO);
4552 if (!response) {
4553 ret = -ENOMEM;
4554 goto out;
4555 }
4556
Alex Elderc0fba362013-04-25 23:15:08 -05004557 /* If it doesn't exist we'll assume it's a format 1 image */
4558
Alex Elder36be9a72013-01-19 00:30:28 -06004559 ret = rbd_obj_method_sync(rbd_dev, object_name,
Alex Elder41579762013-04-21 12:14:45 -05004560 "rbd", "get_id", NULL, 0,
Alex Elder07b23912012-11-09 08:43:16 -06004561 response, RBD_IMAGE_ID_LEN_MAX, NULL);
Alex Elder36be9a72013-01-19 00:30:28 -06004562 dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
Alex Elderc0fba362013-04-25 23:15:08 -05004563 if (ret == -ENOENT) {
4564 image_id = kstrdup("", GFP_KERNEL);
4565 ret = image_id ? 0 : -ENOMEM;
4566 if (!ret)
4567 rbd_dev->image_format = 1;
4568 } else if (ret > sizeof (__le32)) {
4569 void *p = response;
Alex Elder589d30e2012-07-10 20:30:11 -05004570
Alex Elderc0fba362013-04-25 23:15:08 -05004571 image_id = ceph_extract_encoded_string(&p, p + ret,
Alex Elder979ed482012-11-01 08:39:26 -05004572 NULL, GFP_NOIO);
Alex Elderc0fba362013-04-25 23:15:08 -05004573 ret = IS_ERR(image_id) ? PTR_ERR(image_id) : 0;
4574 if (!ret)
4575 rbd_dev->image_format = 2;
Alex Elder589d30e2012-07-10 20:30:11 -05004576 } else {
Alex Elderc0fba362013-04-25 23:15:08 -05004577 ret = -EINVAL;
4578 }
4579
4580 if (!ret) {
4581 rbd_dev->spec->image_id = image_id;
4582 dout("image_id is %s\n", image_id);
Alex Elder589d30e2012-07-10 20:30:11 -05004583 }
4584out:
4585 kfree(response);
4586 kfree(object_name);
4587
4588 return ret;
4589}
4590
Alex Eldera30b71b2012-07-10 20:30:11 -05004591static int rbd_dev_v1_probe(struct rbd_device *rbd_dev)
4592{
4593 int ret;
Alex Eldera30b71b2012-07-10 20:30:11 -05004594
4595 /* Populate rbd image metadata */
4596
4597 ret = rbd_read_header(rbd_dev, &rbd_dev->header);
4598 if (ret < 0)
4599 goto out_err;
Alex Elder86b00e02012-10-25 23:34:42 -05004600
4601 /* Version 1 images have no parent (no layering) */
4602
4603 rbd_dev->parent_spec = NULL;
4604 rbd_dev->parent_overlap = 0;
4605
Alex Eldera30b71b2012-07-10 20:30:11 -05004606 dout("discovered version 1 image, header name is %s\n",
4607 rbd_dev->header_name);
4608
4609 return 0;
4610
4611out_err:
4612 kfree(rbd_dev->header_name);
4613 rbd_dev->header_name = NULL;
Alex Elder0d7dbfc2012-10-25 23:34:41 -05004614 kfree(rbd_dev->spec->image_id);
4615 rbd_dev->spec->image_id = NULL;
Alex Eldera30b71b2012-07-10 20:30:11 -05004616
4617 return ret;
4618}
4619
4620static int rbd_dev_v2_probe(struct rbd_device *rbd_dev)
4621{
Alex Elder9d475de2012-07-03 16:01:19 -05004622 int ret;
Alex Elder6e14b1a2012-07-03 16:01:19 -05004623 u64 ver = 0;
Alex Eldera30b71b2012-07-10 20:30:11 -05004624
Alex Elder9d475de2012-07-03 16:01:19 -05004625 ret = rbd_dev_v2_image_size(rbd_dev);
Alex Elder57385b52013-04-21 12:14:45 -05004626 if (ret)
Alex Elder9d475de2012-07-03 16:01:19 -05004627 goto out_err;
Alex Elder1e130192012-07-03 16:01:19 -05004628
4629 /* Get the object prefix (a.k.a. block_name) for the image */
4630
4631 ret = rbd_dev_v2_object_prefix(rbd_dev);
Alex Elder57385b52013-04-21 12:14:45 -05004632 if (ret)
Alex Elder1e130192012-07-03 16:01:19 -05004633 goto out_err;
Alex Elderb1b54022012-07-03 16:01:19 -05004634
Alex Elderd8891402012-10-09 13:50:17 -07004635 /* Get the and check features for the image */
Alex Elderb1b54022012-07-03 16:01:19 -05004636
4637 ret = rbd_dev_v2_features(rbd_dev);
Alex Elder57385b52013-04-21 12:14:45 -05004638 if (ret)
Alex Elderb1b54022012-07-03 16:01:19 -05004639 goto out_err;
Alex Elder35d489f2012-07-03 16:01:19 -05004640
Alex Elder86b00e02012-10-25 23:34:42 -05004641 /* If the image supports layering, get the parent info */
4642
4643 if (rbd_dev->header.features & RBD_FEATURE_LAYERING) {
4644 ret = rbd_dev_v2_parent_info(rbd_dev);
Alex Elder57385b52013-04-21 12:14:45 -05004645 if (ret)
Alex Elder86b00e02012-10-25 23:34:42 -05004646 goto out_err;
Alex Elder770eba62012-10-25 23:34:40 -05004647 rbd_warn(rbd_dev, "WARNING: kernel support for "
4648 "layered rbd images is EXPERIMENTAL!");
Alex Elder86b00e02012-10-25 23:34:42 -05004649 }
4650
Alex Eldercc070d52013-04-21 12:14:45 -05004651 /* If the image supports fancy striping, get its parameters */
4652
4653 if (rbd_dev->header.features & RBD_FEATURE_STRIPINGV2) {
4654 ret = rbd_dev_v2_striping_info(rbd_dev);
4655 if (ret < 0)
4656 goto out_err;
4657 }
4658
Alex Elder6e14b1a2012-07-03 16:01:19 -05004659 /* crypto and compression type aren't (yet) supported for v2 images */
Alex Elder35d489f2012-07-03 16:01:19 -05004660
Alex Elder6e14b1a2012-07-03 16:01:19 -05004661 rbd_dev->header.crypt_type = 0;
4662 rbd_dev->header.comp_type = 0;
4663
4664 /* Get the snapshot context, plus the header version */
4665
4666 ret = rbd_dev_v2_snap_context(rbd_dev, &ver);
Alex Elder35d489f2012-07-03 16:01:19 -05004667 if (ret)
4668 goto out_err;
Alex Elder6e14b1a2012-07-03 16:01:19 -05004669 rbd_dev->header.obj_version = ver;
4670
Alex Eldera30b71b2012-07-10 20:30:11 -05004671 dout("discovered version 2 image, header name is %s\n",
4672 rbd_dev->header_name);
4673
Alex Elder35152972012-08-31 17:29:55 -05004674 return 0;
Alex Elder9d475de2012-07-03 16:01:19 -05004675out_err:
Alex Elder86b00e02012-10-25 23:34:42 -05004676 rbd_dev->parent_overlap = 0;
4677 rbd_spec_put(rbd_dev->parent_spec);
4678 rbd_dev->parent_spec = NULL;
Alex Elder9d475de2012-07-03 16:01:19 -05004679 kfree(rbd_dev->header_name);
4680 rbd_dev->header_name = NULL;
Alex Elder1e130192012-07-03 16:01:19 -05004681 kfree(rbd_dev->header.object_prefix);
4682 rbd_dev->header.object_prefix = NULL;
Alex Elder9d475de2012-07-03 16:01:19 -05004683
4684 return ret;
Alex Eldera30b71b2012-07-10 20:30:11 -05004685}
4686
Alex Elder124afba2013-04-26 15:44:36 -05004687static int rbd_dev_probe_parent(struct rbd_device *rbd_dev)
Alex Elder83a06262012-10-30 15:47:17 -05004688{
Alex Elder2f82ee52012-10-30 19:40:33 -05004689 struct rbd_device *parent = NULL;
Alex Elder124afba2013-04-26 15:44:36 -05004690 struct rbd_spec *parent_spec;
4691 struct rbd_client *rbdc;
4692 int ret;
4693
4694 if (!rbd_dev->parent_spec)
4695 return 0;
4696 /*
4697 * We need to pass a reference to the client and the parent
4698 * spec when creating the parent rbd_dev. Images related by
4699 * parent/child relationships always share both.
4700 */
4701 parent_spec = rbd_spec_get(rbd_dev->parent_spec);
4702 rbdc = __rbd_get_client(rbd_dev->rbd_client);
4703
4704 ret = -ENOMEM;
4705 parent = rbd_dev_create(rbdc, parent_spec);
4706 if (!parent)
4707 goto out_err;
4708
4709 ret = rbd_dev_image_probe(parent);
4710 if (ret < 0)
4711 goto out_err;
4712 rbd_dev->parent = parent;
4713
4714 return 0;
4715out_err:
4716 if (parent) {
4717 rbd_spec_put(rbd_dev->parent_spec);
4718 kfree(rbd_dev->header_name);
4719 rbd_dev_destroy(parent);
4720 } else {
4721 rbd_put_client(rbdc);
4722 rbd_spec_put(parent_spec);
4723 }
4724
4725 return ret;
4726}
4727
Alex Elder200a6a82013-04-28 23:32:34 -05004728static int rbd_dev_device_setup(struct rbd_device *rbd_dev)
Alex Elder124afba2013-04-26 15:44:36 -05004729{
Alex Elder83a06262012-10-30 15:47:17 -05004730 int ret;
Alex Elder83a06262012-10-30 15:47:17 -05004731
Alex Elderd1cf5782013-04-27 09:59:30 -05004732 ret = rbd_dev_mapping_set(rbd_dev);
Alex Elder83a06262012-10-30 15:47:17 -05004733 if (ret)
Alex Elder9bb81c92013-04-27 09:59:30 -05004734 return ret;
Alex Elder5de10f32013-04-26 15:44:37 -05004735
Alex Elder83a06262012-10-30 15:47:17 -05004736 /* generate unique id: find highest unique id, add one */
4737 rbd_dev_id_get(rbd_dev);
4738
4739 /* Fill in the device name, now that we have its id. */
4740 BUILD_BUG_ON(DEV_NAME_LEN
4741 < sizeof (RBD_DRV_NAME) + MAX_INT_FORMAT_WIDTH);
4742 sprintf(rbd_dev->name, "%s%d", RBD_DRV_NAME, rbd_dev->dev_id);
4743
4744 /* Get our block major device number. */
4745
4746 ret = register_blkdev(0, rbd_dev->name);
4747 if (ret < 0)
4748 goto err_out_id;
4749 rbd_dev->major = ret;
4750
4751 /* Set up the blkdev mapping. */
4752
4753 ret = rbd_init_disk(rbd_dev);
4754 if (ret)
4755 goto err_out_blkdev;
4756
4757 ret = rbd_bus_add_dev(rbd_dev);
4758 if (ret)
4759 goto err_out_disk;
4760
Alex Elder83a06262012-10-30 15:47:17 -05004761 /* Everything's ready. Announce the disk to the world. */
4762
Alex Elderb5156e72013-04-26 15:44:36 -05004763 set_capacity(rbd_dev->disk, rbd_dev->mapping.size / SECTOR_SIZE);
Alex Elder129b79d2013-04-26 15:44:36 -05004764 set_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags);
Alex Elder83a06262012-10-30 15:47:17 -05004765 add_disk(rbd_dev->disk);
4766
4767 pr_info("%s: added with size 0x%llx\n", rbd_dev->disk->disk_name,
4768 (unsigned long long) rbd_dev->mapping.size);
4769
4770 return ret;
Alex Elder2f82ee52012-10-30 19:40:33 -05004771
Alex Elder83a06262012-10-30 15:47:17 -05004772err_out_disk:
4773 rbd_free_disk(rbd_dev);
4774err_out_blkdev:
4775 unregister_blkdev(rbd_dev->major, rbd_dev->name);
4776err_out_id:
4777 rbd_dev_id_put(rbd_dev);
Alex Elderd1cf5782013-04-27 09:59:30 -05004778 rbd_dev_mapping_clear(rbd_dev);
Alex Elder83a06262012-10-30 15:47:17 -05004779
4780 return ret;
4781}
4782
Alex Elder332bb122013-04-27 09:59:30 -05004783static int rbd_dev_header_name(struct rbd_device *rbd_dev)
4784{
4785 struct rbd_spec *spec = rbd_dev->spec;
4786 size_t size;
4787
4788 /* Record the header object name for this rbd image. */
4789
4790 rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
4791
4792 if (rbd_dev->image_format == 1)
4793 size = strlen(spec->image_name) + sizeof (RBD_SUFFIX);
4794 else
4795 size = sizeof (RBD_HEADER_PREFIX) + strlen(spec->image_id);
4796
4797 rbd_dev->header_name = kmalloc(size, GFP_KERNEL);
4798 if (!rbd_dev->header_name)
4799 return -ENOMEM;
4800
4801 if (rbd_dev->image_format == 1)
4802 sprintf(rbd_dev->header_name, "%s%s",
4803 spec->image_name, RBD_SUFFIX);
4804 else
4805 sprintf(rbd_dev->header_name, "%s%s",
4806 RBD_HEADER_PREFIX, spec->image_id);
4807 return 0;
4808}
4809
Alex Elder200a6a82013-04-28 23:32:34 -05004810static void rbd_dev_image_release(struct rbd_device *rbd_dev)
4811{
4812 rbd_header_free(&rbd_dev->header);
4813 rbd_assert(rbd_dev->rbd_client != NULL);
4814 rbd_spec_put(rbd_dev->parent_spec);
4815 kfree(rbd_dev->header_name);
4816 rbd_dev_destroy(rbd_dev);
4817}
4818
Alex Eldera30b71b2012-07-10 20:30:11 -05004819/*
4820 * Probe for the existence of the header object for the given rbd
4821 * device. For format 2 images this includes determining the image
4822 * id.
4823 */
Alex Elder71f293e2013-04-26 09:43:48 -05004824static int rbd_dev_image_probe(struct rbd_device *rbd_dev)
Alex Eldera30b71b2012-07-10 20:30:11 -05004825{
4826 int ret;
Alex Elderb644de22013-04-27 09:59:31 -05004827 int tmp;
Alex Eldera30b71b2012-07-10 20:30:11 -05004828
4829 /*
4830 * Get the id from the image id object. If it's not a
4831 * format 2 image, we'll get ENOENT back, and we'll assume
4832 * it's a format 1 image.
4833 */
4834 ret = rbd_dev_image_id(rbd_dev);
4835 if (ret)
Alex Elderc0fba362013-04-25 23:15:08 -05004836 return ret;
4837 rbd_assert(rbd_dev->spec->image_id);
4838 rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
4839
Alex Elder332bb122013-04-27 09:59:30 -05004840 ret = rbd_dev_header_name(rbd_dev);
4841 if (ret)
4842 goto err_out_format;
4843
Alex Elderb644de22013-04-27 09:59:31 -05004844 ret = rbd_dev_header_watch_sync(rbd_dev, 1);
4845 if (ret)
4846 goto out_header_name;
4847
Alex Elderc0fba362013-04-25 23:15:08 -05004848 if (rbd_dev->image_format == 1)
Alex Eldera30b71b2012-07-10 20:30:11 -05004849 ret = rbd_dev_v1_probe(rbd_dev);
4850 else
4851 ret = rbd_dev_v2_probe(rbd_dev);
Alex Elder5655c4d2013-04-25 23:15:08 -05004852 if (ret)
Alex Elderb644de22013-04-27 09:59:31 -05004853 goto err_out_watch;
Alex Elder83a06262012-10-30 15:47:17 -05004854
Alex Elder9bb81c92013-04-27 09:59:30 -05004855 ret = rbd_dev_snaps_update(rbd_dev);
4856 if (ret)
Alex Elderb644de22013-04-27 09:59:31 -05004857 goto err_out_watch;
Alex Elder9bb81c92013-04-27 09:59:30 -05004858
4859 ret = rbd_dev_spec_update(rbd_dev);
4860 if (ret)
4861 goto err_out_snaps;
4862
4863 ret = rbd_dev_probe_parent(rbd_dev);
4864 if (ret)
4865 goto err_out_snaps;
4866
Alex Elder200a6a82013-04-28 23:32:34 -05004867 ret = rbd_dev_device_setup(rbd_dev);
Alex Elder83a06262012-10-30 15:47:17 -05004868 if (ret)
Alex Elder9bb81c92013-04-27 09:59:30 -05004869 goto err_out_parent;
Alex Elder83a06262012-10-30 15:47:17 -05004870
Alex Eldera30b71b2012-07-10 20:30:11 -05004871 return ret;
Alex Elder9bb81c92013-04-27 09:59:30 -05004872err_out_parent:
4873 rbd_dev_remove_parent(rbd_dev);
4874 rbd_header_free(&rbd_dev->header);
4875err_out_snaps:
4876 rbd_remove_all_snaps(rbd_dev);
Alex Elderb644de22013-04-27 09:59:31 -05004877err_out_watch:
4878 tmp = rbd_dev_header_watch_sync(rbd_dev, 0);
4879 if (tmp)
4880 rbd_warn(rbd_dev, "unable to tear down watch request\n");
Alex Elder332bb122013-04-27 09:59:30 -05004881out_header_name:
4882 kfree(rbd_dev->header_name);
4883 rbd_dev->header_name = NULL;
4884err_out_format:
4885 rbd_dev->image_format = 0;
Alex Elder5655c4d2013-04-25 23:15:08 -05004886 kfree(rbd_dev->spec->image_id);
4887 rbd_dev->spec->image_id = NULL;
4888
4889 dout("probe failed, returning %d\n", ret);
4890
4891 return ret;
Alex Eldera30b71b2012-07-10 20:30:11 -05004892}
4893
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07004894static ssize_t rbd_add(struct bus_type *bus,
4895 const char *buf,
4896 size_t count)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004897{
Alex Eldercb8627c2012-07-09 21:04:23 -05004898 struct rbd_device *rbd_dev = NULL;
Alex Elderdc79b112012-10-25 23:34:41 -05004899 struct ceph_options *ceph_opts = NULL;
Alex Elder4e9afeb2012-10-25 23:34:41 -05004900 struct rbd_options *rbd_opts = NULL;
Alex Elder859c31d2012-10-25 23:34:42 -05004901 struct rbd_spec *spec = NULL;
Alex Elder9d3997f2012-10-25 23:34:42 -05004902 struct rbd_client *rbdc;
Alex Elder27cc2592012-02-02 08:13:30 -06004903 struct ceph_osd_client *osdc;
4904 int rc = -ENOMEM;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004905
4906 if (!try_module_get(THIS_MODULE))
4907 return -ENODEV;
4908
Alex Eldera725f65e2012-02-02 08:13:30 -06004909 /* parse add command */
Alex Elder859c31d2012-10-25 23:34:42 -05004910 rc = rbd_add_parse_args(buf, &ceph_opts, &rbd_opts, &spec);
Alex Elderdc79b112012-10-25 23:34:41 -05004911 if (rc < 0)
Alex Elderbd4ba652012-10-25 23:34:42 -05004912 goto err_out_module;
Alex Eldera725f65e2012-02-02 08:13:30 -06004913
Alex Elder9d3997f2012-10-25 23:34:42 -05004914 rbdc = rbd_get_client(ceph_opts);
4915 if (IS_ERR(rbdc)) {
4916 rc = PTR_ERR(rbdc);
Alex Elder0ddebc02012-10-25 23:34:41 -05004917 goto err_out_args;
Alex Elder9d3997f2012-10-25 23:34:42 -05004918 }
Alex Elderc53d5892012-10-25 23:34:42 -05004919 ceph_opts = NULL; /* rbd_dev client now owns this */
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004920
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004921 /* pick the pool */
Alex Elder9d3997f2012-10-25 23:34:42 -05004922 osdc = &rbdc->client->osdc;
Alex Elder859c31d2012-10-25 23:34:42 -05004923 rc = ceph_pg_poolid_by_name(osdc->osdmap, spec->pool_name);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004924 if (rc < 0)
4925 goto err_out_client;
Alex Elderc0cd10db2013-04-26 09:43:47 -05004926 spec->pool_id = (u64)rc;
Alex Elder859c31d2012-10-25 23:34:42 -05004927
Alex Elder0903e872012-11-14 12:25:19 -06004928 /* The ceph file layout needs to fit pool id in 32 bits */
4929
Alex Elderc0cd10db2013-04-26 09:43:47 -05004930 if (spec->pool_id > (u64)U32_MAX) {
4931 rbd_warn(NULL, "pool id too large (%llu > %u)\n",
4932 (unsigned long long)spec->pool_id, U32_MAX);
Alex Elder0903e872012-11-14 12:25:19 -06004933 rc = -EIO;
4934 goto err_out_client;
4935 }
4936
Alex Elderc53d5892012-10-25 23:34:42 -05004937 rbd_dev = rbd_dev_create(rbdc, spec);
Alex Elderbd4ba652012-10-25 23:34:42 -05004938 if (!rbd_dev)
4939 goto err_out_client;
Alex Elderc53d5892012-10-25 23:34:42 -05004940 rbdc = NULL; /* rbd_dev now owns this */
4941 spec = NULL; /* rbd_dev now owns this */
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004942
Alex Elderbd4ba652012-10-25 23:34:42 -05004943 rbd_dev->mapping.read_only = rbd_opts->read_only;
Alex Elderc53d5892012-10-25 23:34:42 -05004944 kfree(rbd_opts);
4945 rbd_opts = NULL; /* done with this */
Alex Elderbd4ba652012-10-25 23:34:42 -05004946
Alex Elder71f293e2013-04-26 09:43:48 -05004947 rc = rbd_dev_image_probe(rbd_dev);
Alex Eldera30b71b2012-07-10 20:30:11 -05004948 if (rc < 0)
Alex Elderc53d5892012-10-25 23:34:42 -05004949 goto err_out_rbd_dev;
Alex Elder05fd6f62012-08-29 17:11:07 -05004950
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004951 return count;
Alex Elderc53d5892012-10-25 23:34:42 -05004952err_out_rbd_dev:
Alex Elder9f5dffd2013-04-26 09:43:47 -05004953 kfree(rbd_dev->header_name);
Alex Elderc53d5892012-10-25 23:34:42 -05004954 rbd_dev_destroy(rbd_dev);
Alex Elderbd4ba652012-10-25 23:34:42 -05004955err_out_client:
Alex Elder9d3997f2012-10-25 23:34:42 -05004956 rbd_put_client(rbdc);
Alex Elder0ddebc02012-10-25 23:34:41 -05004957err_out_args:
Alex Elder78cea762012-10-25 23:34:41 -05004958 if (ceph_opts)
4959 ceph_destroy_options(ceph_opts);
Alex Elder4e9afeb2012-10-25 23:34:41 -05004960 kfree(rbd_opts);
Alex Elder859c31d2012-10-25 23:34:42 -05004961 rbd_spec_put(spec);
Alex Elderbd4ba652012-10-25 23:34:42 -05004962err_out_module:
4963 module_put(THIS_MODULE);
Alex Elder27cc2592012-02-02 08:13:30 -06004964
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004965 dout("Error adding device %s\n", buf);
Alex Elder27cc2592012-02-02 08:13:30 -06004966
Alex Elderc0cd10db2013-04-26 09:43:47 -05004967 return (ssize_t)rc;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004968}
4969
Alex Elderde71a292012-07-03 16:01:19 -05004970static struct rbd_device *__rbd_get_dev(unsigned long dev_id)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004971{
4972 struct list_head *tmp;
4973 struct rbd_device *rbd_dev;
4974
Alex Eldere124a82f2012-01-29 13:57:44 -06004975 spin_lock(&rbd_dev_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004976 list_for_each(tmp, &rbd_dev_list) {
4977 rbd_dev = list_entry(tmp, struct rbd_device, node);
Alex Elderde71a292012-07-03 16:01:19 -05004978 if (rbd_dev->dev_id == dev_id) {
Alex Eldere124a82f2012-01-29 13:57:44 -06004979 spin_unlock(&rbd_dev_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004980 return rbd_dev;
Alex Eldere124a82f2012-01-29 13:57:44 -06004981 }
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004982 }
Alex Eldere124a82f2012-01-29 13:57:44 -06004983 spin_unlock(&rbd_dev_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004984 return NULL;
4985}
4986
Alex Elder200a6a82013-04-28 23:32:34 -05004987static void rbd_dev_device_release(struct device *dev)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004988{
Alex Elder593a9e72012-02-07 12:03:37 -06004989 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004990
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004991 rbd_free_disk(rbd_dev);
Alex Elder200a6a82013-04-28 23:32:34 -05004992 clear_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags);
4993 rbd_dev_clear_mapping(rbd_dev);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004994 unregister_blkdev(rbd_dev->major, rbd_dev->name);
Alex Elder200a6a82013-04-28 23:32:34 -05004995 rbd_dev->major = 0;
Alex Eldere2839302012-08-29 17:11:06 -05004996 rbd_dev_id_put(rbd_dev);
Alex Elderd1cf5782013-04-27 09:59:30 -05004997 rbd_dev_mapping_clear(rbd_dev);
Alex Elder200a6a82013-04-28 23:32:34 -05004998
4999 rbd_dev_image_release(rbd_dev);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07005000}
5001
Alex Elder05a46af2013-04-26 15:44:36 -05005002static void rbd_dev_remove_parent(struct rbd_device *rbd_dev)
5003{
Alex Elderad945fc2013-04-26 15:44:36 -05005004 while (rbd_dev->parent) {
Alex Elder05a46af2013-04-26 15:44:36 -05005005 struct rbd_device *first = rbd_dev;
5006 struct rbd_device *second = first->parent;
5007 struct rbd_device *third;
Alex Elder0d8189e2013-04-27 09:59:30 -05005008 int ret;
Alex Elder05a46af2013-04-26 15:44:36 -05005009
5010 /*
5011 * Follow to the parent with no grandparent and
5012 * remove it.
5013 */
5014 while (second && (third = second->parent)) {
5015 first = second;
5016 second = third;
5017 }
Alex Elderad945fc2013-04-26 15:44:36 -05005018 rbd_assert(second);
Alex Elder0d8189e2013-04-27 09:59:30 -05005019 ret = rbd_dev_header_watch_sync(rbd_dev, 0);
5020 if (ret)
5021 rbd_warn(rbd_dev,
5022 "failed to cancel watch event (%d)\n", ret);
Alex Elderb4808152013-04-26 15:44:36 -05005023 rbd_remove_all_snaps(second);
5024 rbd_bus_del_dev(second);
Alex Elderad945fc2013-04-26 15:44:36 -05005025 first->parent = NULL;
5026 first->parent_overlap = 0;
5027
5028 rbd_assert(first->parent_spec);
Alex Elder05a46af2013-04-26 15:44:36 -05005029 rbd_spec_put(first->parent_spec);
5030 first->parent_spec = NULL;
Alex Elder05a46af2013-04-26 15:44:36 -05005031 }
5032}
5033
Yehuda Sadehdfc56062010-11-19 14:51:04 -08005034static ssize_t rbd_remove(struct bus_type *bus,
5035 const char *buf,
5036 size_t count)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07005037{
5038 struct rbd_device *rbd_dev = NULL;
Alex Elder0d8189e2013-04-27 09:59:30 -05005039 int target_id;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07005040 unsigned long ul;
Alex Elder0d8189e2013-04-27 09:59:30 -05005041 int ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07005042
Alex Elder0d8189e2013-04-27 09:59:30 -05005043 ret = strict_strtoul(buf, 10, &ul);
5044 if (ret)
5045 return ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07005046
5047 /* convert to int; abort if we lost anything in the conversion */
5048 target_id = (int) ul;
5049 if (target_id != ul)
5050 return -EINVAL;
5051
5052 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
5053
5054 rbd_dev = __rbd_get_dev(target_id);
5055 if (!rbd_dev) {
5056 ret = -ENOENT;
5057 goto done;
5058 }
5059
Alex Eldera14ea262013-02-05 13:23:12 -06005060 spin_lock_irq(&rbd_dev->lock);
Alex Elderb82d1672013-01-14 12:43:31 -06005061 if (rbd_dev->open_count)
Alex Elder42382b72012-11-16 09:29:16 -06005062 ret = -EBUSY;
Alex Elderb82d1672013-01-14 12:43:31 -06005063 else
5064 set_bit(RBD_DEV_FLAG_REMOVING, &rbd_dev->flags);
Alex Eldera14ea262013-02-05 13:23:12 -06005065 spin_unlock_irq(&rbd_dev->lock);
Alex Elderb82d1672013-01-14 12:43:31 -06005066 if (ret < 0)
Alex Elder42382b72012-11-16 09:29:16 -06005067 goto done;
Alex Elder42382b72012-11-16 09:29:16 -06005068
Alex Elder0d8189e2013-04-27 09:59:30 -05005069 ret = rbd_dev_header_watch_sync(rbd_dev, 0);
5070 if (ret) {
5071 rbd_warn(rbd_dev, "failed to cancel watch event (%d)\n", ret);
5072 clear_bit(RBD_DEV_FLAG_REMOVING, &rbd_dev->flags);
5073 smp_mb();
5074 return ret;
5075 }
5076 ret = count;
5077
Alex Elder05a46af2013-04-26 15:44:36 -05005078 rbd_dev_remove_parent(rbd_dev);
Alex Elder2f82ee52012-10-30 19:40:33 -05005079
Alex Elderb4808152013-04-26 15:44:36 -05005080 rbd_remove_all_snaps(rbd_dev);
5081 rbd_bus_del_dev(rbd_dev);
Alex Elder79ab7552013-04-28 23:32:34 -05005082 module_put(THIS_MODULE);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07005083done:
5084 mutex_unlock(&ctl_mutex);
Alex Elderaafb2302012-09-06 16:00:54 -05005085
Yehuda Sadeh602adf42010-08-12 16:11:25 -07005086 return ret;
5087}
5088
Yehuda Sadeh602adf42010-08-12 16:11:25 -07005089/*
5090 * create control files in sysfs
Yehuda Sadehdfc56062010-11-19 14:51:04 -08005091 * /sys/bus/rbd/...
Yehuda Sadeh602adf42010-08-12 16:11:25 -07005092 */
5093static int rbd_sysfs_init(void)
5094{
Yehuda Sadehdfc56062010-11-19 14:51:04 -08005095 int ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07005096
Alex Elderfed4c142012-02-07 12:03:36 -06005097 ret = device_register(&rbd_root_dev);
Alex Elder21079782012-01-24 10:08:36 -06005098 if (ret < 0)
Yehuda Sadehdfc56062010-11-19 14:51:04 -08005099 return ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07005100
Alex Elderfed4c142012-02-07 12:03:36 -06005101 ret = bus_register(&rbd_bus_type);
5102 if (ret < 0)
5103 device_unregister(&rbd_root_dev);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07005104
Yehuda Sadeh602adf42010-08-12 16:11:25 -07005105 return ret;
5106}
5107
5108static void rbd_sysfs_cleanup(void)
5109{
Yehuda Sadehdfc56062010-11-19 14:51:04 -08005110 bus_unregister(&rbd_bus_type);
Alex Elderfed4c142012-02-07 12:03:36 -06005111 device_unregister(&rbd_root_dev);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07005112}
5113
Alex Eldercc344fa2013-02-19 12:25:56 -06005114static int __init rbd_init(void)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07005115{
5116 int rc;
5117
Alex Elder1e32d342013-01-30 11:13:33 -06005118 if (!libceph_compatible(NULL)) {
5119 rbd_warn(NULL, "libceph incompatibility (quitting)");
5120
5121 return -EINVAL;
5122 }
Yehuda Sadeh602adf42010-08-12 16:11:25 -07005123 rc = rbd_sysfs_init();
5124 if (rc)
5125 return rc;
Alex Elderf0f8cef2012-01-29 13:57:44 -06005126 pr_info("loaded " RBD_DRV_NAME_LONG "\n");
Yehuda Sadeh602adf42010-08-12 16:11:25 -07005127 return 0;
5128}
5129
Alex Eldercc344fa2013-02-19 12:25:56 -06005130static void __exit rbd_exit(void)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07005131{
5132 rbd_sysfs_cleanup();
5133}
5134
5135module_init(rbd_init);
5136module_exit(rbd_exit);
5137
5138MODULE_AUTHOR("Sage Weil <sage@newdream.net>");
5139MODULE_AUTHOR("Yehuda Sadeh <yehuda@hq.newdream.net>");
5140MODULE_DESCRIPTION("rados block device");
5141
5142/* following authorship retained from original osdblk.c */
5143MODULE_AUTHOR("Jeff Garzik <jeff@garzik.org>");
5144
5145MODULE_LICENSE("GPL");