blob: d74be04ceeffcf319119157995a3fdc0e7c9b71e [file] [log] [blame]
Alex Eldere2a58ee2013-04-30 00:44:33 -05001
/*
   rbd.c -- Export ceph rados objects as a Linux block device


   based on drivers/block/osdblk.c:

   Copyright 2009 Red Hat, Inc.

   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; see the file COPYING.  If not, write to
   the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.



   For usage instructions, please refer to:

                 Documentation/ABI/testing/sysfs-bus-rbd

 */
30
31#include <linux/ceph/libceph.h>
32#include <linux/ceph/osd_client.h>
33#include <linux/ceph/mon_client.h>
34#include <linux/ceph/decode.h>
Yehuda Sadeh59c2be12011-03-21 15:10:11 -070035#include <linux/parser.h>
Alex Elder30d1cff2013-05-01 12:43:03 -050036#include <linux/bsearch.h>
Yehuda Sadeh602adf42010-08-12 16:11:25 -070037
38#include <linux/kernel.h>
39#include <linux/device.h>
40#include <linux/module.h>
41#include <linux/fs.h>
42#include <linux/blkdev.h>
Alex Elder1c2a9df2013-05-01 12:43:03 -050043#include <linux/slab.h>
Yehuda Sadeh602adf42010-08-12 16:11:25 -070044
45#include "rbd_types.h"
46
/*
 * Defining RBD_DEBUG selects the checking form of rbd_assert(); when
 * it is not defined, rbd_assert() expands to a no-op expression.
 */
#define RBD_DEBUG	/* Activate rbd_assert() calls */

/*
 * The basic unit of block I/O is a sector.  It is interpreted in a
 * number of contexts in Linux (blk, bio, genhd), but the default is
 * universally 512 bytes.  These symbols are just slightly more
 * meaningful than the bare numbers they represent.
 *
 * SECTOR_SHIFT is the base-2 logarithm of the sector size, and
 * SECTOR_SIZE is that size in bytes as an unsigned long long value.
 */
#define	SECTOR_SHIFT	9
#define	SECTOR_SIZE	(1ULL << SECTOR_SHIFT)
57
/* Short and long driver names; the short one prefixes rbd_warn() output */
#define RBD_DRV_NAME "rbd"
#define RBD_DRV_NAME_LONG "rbd (rados block device)"

#define RBD_MINORS_PER_MAJOR	256		/* max minors per blkdev */

/*
 * The maximum snapshot name length is what remains of NAME_MAX once
 * the RBD_SNAP_DEV_NAME_PREFIX characters (excluding its terminating
 * NUL, hence the "- 1") have been accounted for.
 */
#define RBD_SNAP_DEV_NAME_PREFIX	"snap_"
#define RBD_MAX_SNAP_NAME_LEN	\
			(NAME_MAX - (sizeof (RBD_SNAP_DEV_NAME_PREFIX) - 1))

#define RBD_MAX_SNAP_COUNT	510	/* allows max snapc to fit in 4KB */

/* NOTE(review): presumably the name used for the base (non-snapshot) image */
#define RBD_SNAP_HEAD_NAME	"-"

#define	BAD_SNAP_INDEX	U32_MAX		/* invalid index into snap array */

/* This allows a single page to hold an image name sent by OSD */
#define RBD_IMAGE_NAME_LEN_MAX	(PAGE_SIZE - sizeof (__le32) - 1)
#define RBD_IMAGE_ID_LEN_MAX	64

#define RBD_OBJ_PREFIX_LEN_MAX	64
Alex Elder589d30e2012-07-10 20:30:11 -050078
/* Feature bits */

#define RBD_FEATURE_LAYERING	(1<<0)
#define RBD_FEATURE_STRIPINGV2	(1<<1)
/* Mask containing every feature bit defined above */
#define RBD_FEATURES_ALL \
	    (RBD_FEATURE_LAYERING | RBD_FEATURE_STRIPINGV2)

/* Features supported by this (client software) implementation. */

#define RBD_FEATURES_SUPPORTED	(RBD_FEATURES_ALL)

/*
 * An RBD device name will be "rbd#", where the "rbd" comes from
 * RBD_DRV_NAME above, and # is a unique integer identifier.
 * MAX_INT_FORMAT_WIDTH is used in ensuring DEV_NAME_LEN is big
 * enough to hold all possible device names.
 *
 * MAX_INT_FORMAT_WIDTH evaluates to 2.5 characters per byte of an
 * int (rounded down), plus one.
 */
#define DEV_NAME_LEN		32
#define MAX_INT_FORMAT_WIDTH	((5 * sizeof (int)) / 2 + 1)
Yehuda Sadeh602adf42010-08-12 16:11:25 -070098
/*
 * block device image metadata (in-memory version)
 *
 * NOTE(review): the pointer members (object_prefix, snapc, snap_names,
 * snap_sizes) appear to be separately allocated; confirm ownership
 * against the code that fills in and frees the header.
 */
struct rbd_image_header {
	/* The fields in this first group never change for a given rbd image */
	char *object_prefix;
	u64 features;
	__u8 obj_order;
	__u8 crypt_type;
	__u8 comp_type;

	/* The remaining fields need to be updated occasionally */
	u64 image_size;
	struct ceph_snap_context *snapc;
	char *snap_names;
	u64 *snap_sizes;

	u64 stripe_unit;
	u64 stripe_count;
};
119
/*
 * An rbd image specification.
 *
 * The tuple (pool_id, image_id, snap_id) is sufficient to uniquely
 * identify an image.  Each rbd_dev structure includes a pointer to
 * an rbd_spec structure that encapsulates this identity.
 *
 * Each of the id's in an rbd_spec has an associated name.  For a
 * user-mapped image, the names are supplied and the id's associated
 * with them are looked up.  For a layered image, a parent image is
 * defined by the tuple, and the names are looked up.
 *
 * An rbd_dev structure contains a parent_spec pointer which is
 * non-null if the image it represents is a child in a layered
 * image.  This pointer will refer to the rbd_spec structure used
 * by the parent rbd_dev for its own identity (i.e., the structure
 * is shared between the parent and child).
 *
 * Since these structures are populated once, during the discovery
 * phase of image construction, they are effectively immutable so
 * we make no effort to synchronize access to them.
 *
 * Note that code herein does not assume the image name is known (it
 * could be a null pointer).
 */
struct rbd_spec {
	/* Pool identity: numeric id and its name */
	u64		pool_id;
	const char	*pool_name;

	/* Image identity: id string and its name (name may be null) */
	const char	*image_id;
	const char	*image_name;

	/* Snapshot identity: numeric id and its name */
	u64		snap_id;
	const char	*snap_name;

	/* Reference count; the structure can be shared (see above) */
	struct kref	kref;
};
157
/*
 * an instance of the client.  multiple devices may share an rbd client.
 */
struct rbd_client {
	struct ceph_client	*client;	/* underlying ceph client */
	struct kref		kref;		/* reference count */
	struct list_head	node;		/* entry in rbd_client_list */
};
166
/* Image request type and the callback type that takes one */
struct rbd_img_request;
typedef void (*rbd_img_callback_t)(struct rbd_img_request *);

#define	BAD_WHICH	U32_MAX		/* Good which or bad which, which? */

/* Object request type and the callback type that takes one */
struct rbd_obj_request;
typedef void (*rbd_obj_callback_t)(struct rbd_obj_request *);

/*
 * Kind of data attached to an object request.
 * NOTE(review): presumably selects which member of the data union in
 * struct rbd_obj_request (bio_list vs. pages/page_count) is valid.
 */
enum obj_request_type {
	OBJ_REQUEST_NODATA, OBJ_REQUEST_BIO, OBJ_REQUEST_PAGES
};
Alex Elderbf0d5f502012-11-22 00:00:08 -0600178
/*
 * Flag names for an object request.
 * NOTE(review): presumably bit numbers within rbd_obj_request->flags.
 */
enum obj_req_flags {
	OBJ_REQ_DONE,		/* completion flag: not done = 0, done = 1 */
	OBJ_REQ_IMG_DATA,	/* object usage: standalone = 0, image = 1 */
	OBJ_REQ_KNOWN,		/* EXISTS flag valid: no = 0, yes = 1 */
	OBJ_REQ_EXISTS,		/* target exists: no = 0, yes = 1 */
};
185
/*
 * A request on a single named object, covering "length" bytes
 * starting at byte "offset" within that object.
 */
struct rbd_obj_request {
	const char		*object_name;
	u64			offset;		/* object start byte */
	u64			length;		/* bytes from offset */
	unsigned long		flags;

	/*
	 * An object request associated with an image will have its
	 * img_data flag set; a standalone object request will not.
	 *
	 * A standalone object request will have which == BAD_WHICH
	 * and a null obj_request pointer.
	 *
	 * An object request initiated in support of a layered image
	 * object (to check for its existence before a write) will
	 * have which == BAD_WHICH and a non-null obj_request pointer.
	 *
	 * Finally, an object request for rbd image data will have
	 * which != BAD_WHICH, and will have a non-null img_request
	 * pointer.  The value of which will be in the range
	 * 0..(img_request->obj_request_count-1).
	 */
	union {
		struct rbd_obj_request	*obj_request;	/* STAT op */
		struct {
			struct rbd_img_request	*img_request;
			u64			img_offset;
			/* links for img_request->obj_requests list */
			struct list_head	links;
		};
	};
	u32			which;		/* posn image request list */

	/* Data carried by the request; "type" names the kind of data */
	enum obj_request_type	type;
	union {
		struct bio	*bio_list;
		struct {
			struct page	**pages;
			u32		page_count;
		};
	};
	struct page		**copyup_pages;

	struct ceph_osd_request	*osd_req;

	u64			xferred;	/* bytes transferred */
	int			result;

	rbd_obj_callback_t	callback;
	struct completion	completion;

	struct kref		kref;
};
239
/*
 * Flag names for an image request.
 * NOTE(review): presumably bit numbers within rbd_img_request->flags.
 */
enum img_req_flags {
	IMG_REQ_WRITE,		/* I/O direction: read = 0, write = 1 */
	IMG_REQ_CHILD,		/* initiator: block = 0, child image = 1 */
	IMG_REQ_LAYERED,	/* ENOENT handling: normal = 0, layered = 1 */
};
245
/*
 * A request on an rbd image, covering "length" bytes starting at image
 * byte "offset".  It carries a list of object requests (obj_requests)
 * and a count of them (obj_request_count).
 */
struct rbd_img_request {
	struct rbd_device	*rbd_dev;
	u64			offset;	/* starting image byte offset */
	u64			length;	/* byte count from offset */
	unsigned long		flags;
	/* Which member is meaningful depends on the I/O direction */
	union {
		u64			snap_id;	/* for reads */
		struct ceph_snap_context *snapc;	/* for writes */
	};
	/* Who initiated the request: the block layer or an object request */
	union {
		struct request		*rq;		/* block request */
		struct rbd_obj_request	*obj_request;	/* obj req initiator */
	};
	struct page		**copyup_pages;
	spinlock_t		completion_lock;/* protects next_completion */
	u32			next_completion;
	rbd_img_callback_t	callback;
	u64			xferred;/* aggregate bytes transferred */
	int			result;	/* first nonzero obj_request result */

	u32			obj_request_count;
	struct list_head	obj_requests;	/* rbd_obj_request structs */

	struct kref		kref;
};
271
/*
 * Iterators over the object requests linked on an image request's
 * obj_requests list (through each object request's "links" member):
 *
 *   for_each_obj_request()       walks the whole list, front to back
 *   for_each_obj_request_from()  continues forward from the current oreq
 *   for_each_obj_request_safe()  walks the list in REVERSE order, using
 *                                n as the lookahead cursor so the
 *                                current entry may be removed
 */
#define for_each_obj_request(ireq, oreq) \
	list_for_each_entry(oreq, &(ireq)->obj_requests, links)
#define for_each_obj_request_from(ireq, oreq) \
	list_for_each_entry_from(oreq, &(ireq)->obj_requests, links)
#define for_each_obj_request_safe(ireq, oreq, n) \
	list_for_each_entry_safe_reverse(oreq, n, &(ireq)->obj_requests, links)
Alex Elderbf0d5f502012-11-22 00:00:08 -0600278
/*
 * Properties of a mapped image as seen through its block device.
 * rbd_open() consults read_only to refuse writable opens and to set
 * the block device's read-only state.
 */
struct rbd_mapping {
	u64                     size;
	u64                     features;
	bool			read_only;
};
284
/*
 * a single device
 */
struct rbd_device {
	int			dev_id;		/* blkdev unique id */

	int			major;		/* blkdev assigned major */
	struct gendisk		*disk;		/* blkdev's gendisk and rq */

	u32			image_format;	/* Either 1 or 2 */
	struct rbd_client	*rbd_client;

	char			name[DEV_NAME_LEN]; /* blkdev name, e.g. rbd3 */

	spinlock_t		lock;		/* queue, flags, open_count */

	struct rbd_image_header	header;
	unsigned long		flags;		/* possibly lock protected */
	struct rbd_spec		*spec;		/* identity of this image */

	char			*header_name;

	struct ceph_file_layout	layout;

	struct ceph_osd_event   *watch_event;
	struct rbd_obj_request	*watch_request;

	/* Layering: parent_spec is non-null if this image is a child */
	struct rbd_spec		*parent_spec;
	u64			parent_overlap;
	struct rbd_device	*parent;

	/* protects updating the header */
	struct rw_semaphore     header_rwsem;

	struct rbd_mapping	mapping;

	struct list_head	node;

	/* sysfs related */
	struct device		dev;
	unsigned long		open_count;	/* protected by lock */
};
327
/*
 * Flag bits for rbd_dev->flags.  If atomicity is required,
 * rbd_dev->lock is used to protect access.
 *
 * Currently, only the "removing" flag (which is coupled with the
 * "open_count" field) requires atomic access.
 *
 * The enumerators are bit numbers, as used with test_bit() in
 * rbd_open().
 */
enum rbd_dev_flags {
	RBD_DEV_FLAG_EXISTS,	/* mapped snapshot has not been deleted */
	RBD_DEV_FLAG_REMOVING,	/* this mapping is being removed */
};
339
static DEFINE_MUTEX(ctl_mutex);	  /* Serialize open/close/setup/teardown */

/* NOTE(review): rbd_dev_list_lock presumably protects rbd_dev_list */
static LIST_HEAD(rbd_dev_list);    /* devices */
static DEFINE_SPINLOCK(rbd_dev_list_lock);

/* rbd_client_list is searched and modified under rbd_client_list_lock */
static LIST_HEAD(rbd_client_list);		/* clients */
static DEFINE_SPINLOCK(rbd_client_list_lock);

/* NOTE(review): presumably the slab cache for struct rbd_img_request */
static struct kmem_cache	*rbd_img_request_cache;

/* Prototypes for static functions referenced before their definitions */
static int rbd_img_request_submit(struct rbd_img_request *img_request);

static void rbd_dev_device_release(struct device *dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800353
/* Store handlers for the bus attributes declared below */
static ssize_t rbd_add(struct bus_type *bus, const char *buf,
		       size_t count);
static ssize_t rbd_remove(struct bus_type *bus, const char *buf,
			  size_t count);
static int rbd_dev_image_probe(struct rbd_device *rbd_dev);

/*
 * Bus attributes "add" and "remove".  Both are writable by the owner
 * only (S_IWUSR) and have no show method; writes are handled by
 * rbd_add() and rbd_remove() respectively.
 */
static struct bus_attribute rbd_bus_attrs[] = {
	__ATTR(add, S_IWUSR, NULL, rbd_add),
	__ATTR(remove, S_IWUSR, NULL, rbd_remove),
	__ATTR_NULL
};

/* The "rbd" bus, carrying the attributes above */
static struct bus_type rbd_bus_type = {
	.name		= "rbd",
	.bus_attrs	= rbd_bus_attrs,
};
370
/*
 * Release callback for rbd_root_dev.  The device is a static object,
 * so the callback has nothing to do and is intentionally empty.
 */
static void rbd_root_dev_release(struct device *dev)
{
}

/* Statically allocated device named "rbd" */
static struct device rbd_root_dev = {
	.init_name =    "rbd",
	.release =      rbd_root_dev_release,
};
379
Alex Elder06ecc6c2012-11-01 10:17:15 -0500380static __printf(2, 3)
381void rbd_warn(struct rbd_device *rbd_dev, const char *fmt, ...)
382{
383 struct va_format vaf;
384 va_list args;
385
386 va_start(args, fmt);
387 vaf.fmt = fmt;
388 vaf.va = &args;
389
390 if (!rbd_dev)
391 printk(KERN_WARNING "%s: %pV\n", RBD_DRV_NAME, &vaf);
392 else if (rbd_dev->disk)
393 printk(KERN_WARNING "%s: %s: %pV\n",
394 RBD_DRV_NAME, rbd_dev->disk->disk_name, &vaf);
395 else if (rbd_dev->spec && rbd_dev->spec->image_name)
396 printk(KERN_WARNING "%s: image %s: %pV\n",
397 RBD_DRV_NAME, rbd_dev->spec->image_name, &vaf);
398 else if (rbd_dev->spec && rbd_dev->spec->image_id)
399 printk(KERN_WARNING "%s: id %s: %pV\n",
400 RBD_DRV_NAME, rbd_dev->spec->image_id, &vaf);
401 else /* punt */
402 printk(KERN_WARNING "%s: rbd_dev %p: %pV\n",
403 RBD_DRV_NAME, rbd_dev, &vaf);
404 va_end(args);
405}
406
/*
 * rbd_assert(expr) - sanity check that is active only under RBD_DEBUG.
 *
 * With RBD_DEBUG defined, a false expr logs the function name, line
 * number and the text of the expression at KERN_ERR level and then
 * calls BUG().  Without RBD_DEBUG the macro expands to a no-op
 * expression and expr is not evaluated.
 *
 * The checking form is wrapped in do { } while (0) so that
 * "rbd_assert(x);" is a single statement: a bare "if (...) { ... }"
 * expansion would break (or silently capture the else of) an
 * enclosing unbraced if/else.
 */
#ifdef RBD_DEBUG
#define rbd_assert(expr)						\
	do {								\
		if (unlikely(!(expr))) {				\
			printk(KERN_ERR "\nAssertion failure in %s() "	\
						"at line %d:\n\n"	\
					"\trbd_assert(%s);\n\n",	\
					__func__, __LINE__, #expr);	\
			BUG();						\
		}							\
	} while (0)
#else /* !RBD_DEBUG */
#  define rbd_assert(expr)	((void) 0)
#endif /* !RBD_DEBUG */
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800419
/* Prototypes for static functions referenced before their definitions */
static int rbd_img_obj_request_submit(struct rbd_obj_request *obj_request);
static void rbd_img_parent_read(struct rbd_obj_request *obj_request);
static void rbd_dev_remove_parent(struct rbd_device *rbd_dev);

static int rbd_dev_refresh(struct rbd_device *rbd_dev);
static int rbd_dev_v2_refresh(struct rbd_device *rbd_dev);
static const char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev,
					u64 snap_id);
static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id,
				u8 *order, u64 *snap_size);
static int _rbd_dev_v2_snap_features(struct rbd_device *rbd_dev, u64 snap_id,
		u64 *snap_features);
static u64 rbd_snap_id_by_name(struct rbd_device *rbd_dev, const char *name);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700433
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700434static int rbd_open(struct block_device *bdev, fmode_t mode)
435{
Alex Elderf0f8cef2012-01-29 13:57:44 -0600436 struct rbd_device *rbd_dev = bdev->bd_disk->private_data;
Alex Elderb82d1672013-01-14 12:43:31 -0600437 bool removing = false;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700438
Alex Elderf84344f2012-08-31 17:29:51 -0500439 if ((mode & FMODE_WRITE) && rbd_dev->mapping.read_only)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700440 return -EROFS;
441
Alex Eldera14ea262013-02-05 13:23:12 -0600442 spin_lock_irq(&rbd_dev->lock);
Alex Elderb82d1672013-01-14 12:43:31 -0600443 if (test_bit(RBD_DEV_FLAG_REMOVING, &rbd_dev->flags))
444 removing = true;
445 else
446 rbd_dev->open_count++;
Alex Eldera14ea262013-02-05 13:23:12 -0600447 spin_unlock_irq(&rbd_dev->lock);
Alex Elderb82d1672013-01-14 12:43:31 -0600448 if (removing)
449 return -ENOENT;
450
Alex Elder42382b72012-11-16 09:29:16 -0600451 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
Alex Elderc3e946c2012-11-16 09:29:16 -0600452 (void) get_device(&rbd_dev->dev);
Alex Elderf84344f2012-08-31 17:29:51 -0500453 set_device_ro(bdev, rbd_dev->mapping.read_only);
Alex Elder42382b72012-11-16 09:29:16 -0600454 mutex_unlock(&ctl_mutex);
Alex Elder340c7a22012-08-10 13:12:07 -0700455
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700456 return 0;
457}
458
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800459static int rbd_release(struct gendisk *disk, fmode_t mode)
460{
461 struct rbd_device *rbd_dev = disk->private_data;
Alex Elderb82d1672013-01-14 12:43:31 -0600462 unsigned long open_count_before;
463
Alex Eldera14ea262013-02-05 13:23:12 -0600464 spin_lock_irq(&rbd_dev->lock);
Alex Elderb82d1672013-01-14 12:43:31 -0600465 open_count_before = rbd_dev->open_count--;
Alex Eldera14ea262013-02-05 13:23:12 -0600466 spin_unlock_irq(&rbd_dev->lock);
Alex Elderb82d1672013-01-14 12:43:31 -0600467 rbd_assert(open_count_before > 0);
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800468
Alex Elder42382b72012-11-16 09:29:16 -0600469 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
Alex Elderc3e946c2012-11-16 09:29:16 -0600470 put_device(&rbd_dev->dev);
Alex Elder42382b72012-11-16 09:29:16 -0600471 mutex_unlock(&ctl_mutex);
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800472
473 return 0;
474}
475
/* Block device operations: only open and release are provided */
static const struct block_device_operations rbd_bd_ops = {
	.owner			= THIS_MODULE,
	.open			= rbd_open,
	.release		= rbd_release,
};
481
482/*
483 * Initialize an rbd client instance.
Alex Elder43ae4702012-07-03 16:01:18 -0500484 * We own *ceph_opts.
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700485 */
Alex Elderf8c38922012-08-10 13:12:07 -0700486static struct rbd_client *rbd_client_create(struct ceph_options *ceph_opts)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700487{
488 struct rbd_client *rbdc;
489 int ret = -ENOMEM;
490
Alex Elder37206ee2013-02-20 17:32:08 -0600491 dout("%s:\n", __func__);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700492 rbdc = kmalloc(sizeof(struct rbd_client), GFP_KERNEL);
493 if (!rbdc)
494 goto out_opt;
495
496 kref_init(&rbdc->kref);
497 INIT_LIST_HEAD(&rbdc->node);
498
Alex Elderbc534d82012-01-29 13:57:44 -0600499 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
500
Alex Elder43ae4702012-07-03 16:01:18 -0500501 rbdc->client = ceph_create_client(ceph_opts, rbdc, 0, 0);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700502 if (IS_ERR(rbdc->client))
Alex Elderbc534d82012-01-29 13:57:44 -0600503 goto out_mutex;
Alex Elder43ae4702012-07-03 16:01:18 -0500504 ceph_opts = NULL; /* Now rbdc->client is responsible for ceph_opts */
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700505
506 ret = ceph_open_session(rbdc->client);
507 if (ret < 0)
508 goto out_err;
509
Alex Elder432b8582012-01-29 13:57:44 -0600510 spin_lock(&rbd_client_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700511 list_add_tail(&rbdc->node, &rbd_client_list);
Alex Elder432b8582012-01-29 13:57:44 -0600512 spin_unlock(&rbd_client_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700513
Alex Elderbc534d82012-01-29 13:57:44 -0600514 mutex_unlock(&ctl_mutex);
Alex Elder37206ee2013-02-20 17:32:08 -0600515 dout("%s: rbdc %p\n", __func__, rbdc);
Alex Elderbc534d82012-01-29 13:57:44 -0600516
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700517 return rbdc;
518
519out_err:
520 ceph_destroy_client(rbdc->client);
Alex Elderbc534d82012-01-29 13:57:44 -0600521out_mutex:
522 mutex_unlock(&ctl_mutex);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700523 kfree(rbdc);
524out_opt:
Alex Elder43ae4702012-07-03 16:01:18 -0500525 if (ceph_opts)
526 ceph_destroy_options(ceph_opts);
Alex Elder37206ee2013-02-20 17:32:08 -0600527 dout("%s: error %d\n", __func__, ret);
528
Vasiliy Kulikov28f259b2010-09-26 12:59:37 +0400529 return ERR_PTR(ret);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700530}
531
Alex Elder2f82ee52012-10-30 19:40:33 -0500532static struct rbd_client *__rbd_get_client(struct rbd_client *rbdc)
533{
534 kref_get(&rbdc->kref);
535
536 return rbdc;
537}
538
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700539/*
Alex Elder1f7ba332012-08-10 13:12:07 -0700540 * Find a ceph client with specific addr and configuration. If
541 * found, bump its reference count.
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700542 */
Alex Elder1f7ba332012-08-10 13:12:07 -0700543static struct rbd_client *rbd_client_find(struct ceph_options *ceph_opts)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700544{
545 struct rbd_client *client_node;
Alex Elder1f7ba332012-08-10 13:12:07 -0700546 bool found = false;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700547
Alex Elder43ae4702012-07-03 16:01:18 -0500548 if (ceph_opts->flags & CEPH_OPT_NOSHARE)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700549 return NULL;
550
Alex Elder1f7ba332012-08-10 13:12:07 -0700551 spin_lock(&rbd_client_list_lock);
552 list_for_each_entry(client_node, &rbd_client_list, node) {
553 if (!ceph_compare_options(ceph_opts, client_node->client)) {
Alex Elder2f82ee52012-10-30 19:40:33 -0500554 __rbd_get_client(client_node);
555
Alex Elder1f7ba332012-08-10 13:12:07 -0700556 found = true;
557 break;
558 }
559 }
560 spin_unlock(&rbd_client_list_lock);
561
562 return found ? client_node : NULL;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700563}
564
/*
 * mount options
 *
 * The Opt_last_* enumerators are markers that split the token values
 * into classes (int, string, Boolean); parse_rbd_opts_token() compares
 * a token against them to decide how to treat its argument.  No int or
 * string options are currently defined.
 */
enum {
	Opt_last_int,
	/* int args above */
	Opt_last_string,
	/* string args above */
	Opt_read_only,
	Opt_read_write,
	/* Boolean args above */
	Opt_last_bool,
};

/*
 * Maps option text to its token.  The table is terminated by the
 * {-1, NULL} entry.
 */
static match_table_t rbd_opts_tokens = {
	/* int args above */
	/* string args above */
	{Opt_read_only, "read_only"},
	{Opt_read_only, "ro"},		/* Alternate spelling */
	{Opt_read_write, "read_write"},
	{Opt_read_write, "rw"},		/* Alternate spelling */
	/* Boolean args above */
	{-1, NULL}
};
589
/* rbd-specific options, filled in by parse_rbd_opts_token() */
struct rbd_options {
	bool	read_only;
};

/* NOTE(review): presumably the initial value for rbd_options.read_only */
#define RBD_READ_ONLY_DEFAULT	false
595
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700596static int parse_rbd_opts_token(char *c, void *private)
597{
Alex Elder43ae4702012-07-03 16:01:18 -0500598 struct rbd_options *rbd_opts = private;
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700599 substring_t argstr[MAX_OPT_ARGS];
600 int token, intval, ret;
601
Alex Elder43ae4702012-07-03 16:01:18 -0500602 token = match_token(c, rbd_opts_tokens, argstr);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700603 if (token < 0)
604 return -EINVAL;
605
606 if (token < Opt_last_int) {
607 ret = match_int(&argstr[0], &intval);
608 if (ret < 0) {
609 pr_err("bad mount option arg (not int) "
610 "at '%s'\n", c);
611 return ret;
612 }
613 dout("got int token %d val %d\n", token, intval);
614 } else if (token > Opt_last_int && token < Opt_last_string) {
615 dout("got string token %d val %s\n", token,
616 argstr[0].from);
Alex Eldercc0538b2012-08-10 13:12:07 -0700617 } else if (token > Opt_last_string && token < Opt_last_bool) {
618 dout("got Boolean token %d\n", token);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700619 } else {
620 dout("got token %d\n", token);
621 }
622
623 switch (token) {
Alex Eldercc0538b2012-08-10 13:12:07 -0700624 case Opt_read_only:
625 rbd_opts->read_only = true;
626 break;
627 case Opt_read_write:
628 rbd_opts->read_only = false;
629 break;
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700630 default:
Alex Elderaafb2302012-09-06 16:00:54 -0500631 rbd_assert(false);
632 break;
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700633 }
634 return 0;
635}
636
/*
 * Return an rbd client matching ceph_opts: an existing one (with an
 * extra reference taken) when rbd_client_find() locates a match,
 * otherwise a newly created one.  ceph_opts is consumed either way:
 * it is destroyed here when an existing client is reused, and handed
 * to rbd_client_create() otherwise.  The result of a failed creation
 * is returned unchanged.
 */
static struct rbd_client *rbd_get_client(struct ceph_options *ceph_opts)
{
	struct rbd_client *existing = rbd_client_find(ceph_opts);

	if (!existing)
		return rbd_client_create(ceph_opts);

	/* using an existing client */
	ceph_destroy_options(ceph_opts);

	return existing;
}
653
654/*
655 * Destroy ceph client
Alex Elderd23a4b32012-01-29 13:57:43 -0600656 *
Alex Elder432b8582012-01-29 13:57:44 -0600657 * Caller must hold rbd_client_list_lock.
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700658 */
659static void rbd_client_release(struct kref *kref)
660{
661 struct rbd_client *rbdc = container_of(kref, struct rbd_client, kref);
662
Alex Elder37206ee2013-02-20 17:32:08 -0600663 dout("%s: rbdc %p\n", __func__, rbdc);
Alex Eldercd9d9f52012-04-04 13:35:44 -0500664 spin_lock(&rbd_client_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700665 list_del(&rbdc->node);
Alex Eldercd9d9f52012-04-04 13:35:44 -0500666 spin_unlock(&rbd_client_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700667
668 ceph_destroy_client(rbdc->client);
669 kfree(rbdc);
670}
671
672/*
673 * Drop reference to ceph client node. If it's not referenced anymore, release
674 * it.
675 */
Alex Elder9d3997f2012-10-25 23:34:42 -0500676static void rbd_put_client(struct rbd_client *rbdc)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700677{
Alex Elderc53d5892012-10-25 23:34:42 -0500678 if (rbdc)
679 kref_put(&rbdc->kref, rbd_client_release);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700680}
681
Alex Eldera30b71b2012-07-10 20:30:11 -0500682static bool rbd_image_format_valid(u32 image_format)
683{
684 return image_format == 1 || image_format == 2;
685}
686
Alex Elder8e94af82012-07-25 09:32:40 -0500687static bool rbd_dev_ondisk_valid(struct rbd_image_header_ondisk *ondisk)
688{
Alex Elder103a1502012-08-02 11:29:45 -0500689 size_t size;
690 u32 snap_count;
691
692 /* The header has to start with the magic rbd header text */
693 if (memcmp(&ondisk->text, RBD_HEADER_TEXT, sizeof (RBD_HEADER_TEXT)))
694 return false;
695
Alex Elderdb2388b2012-10-20 22:17:27 -0500696 /* The bio layer requires at least sector-sized I/O */
697
698 if (ondisk->options.order < SECTOR_SHIFT)
699 return false;
700
701 /* If we use u64 in a few spots we may be able to loosen this */
702
703 if (ondisk->options.order > 8 * sizeof (int) - 1)
704 return false;
705
Alex Elder103a1502012-08-02 11:29:45 -0500706 /*
707 * The size of a snapshot header has to fit in a size_t, and
708 * that limits the number of snapshots.
709 */
710 snap_count = le32_to_cpu(ondisk->snap_count);
711 size = SIZE_MAX - sizeof (struct ceph_snap_context);
712 if (snap_count > size / sizeof (__le64))
713 return false;
714
715 /*
716 * Not only that, but the size of the entire the snapshot
717 * header must also be representable in a size_t.
718 */
719 size -= snap_count * sizeof (__le64);
720 if ((u64) size < le64_to_cpu(ondisk->snap_names_len))
721 return false;
722
723 return true;
Alex Elder8e94af82012-07-25 09:32:40 -0500724}
725
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700726/*
727 * Create a new header structure, translate header format from the on-disk
728 * header.
729 */
730static int rbd_header_from_disk(struct rbd_image_header *header,
Alex Elder4156d992012-08-02 11:29:46 -0500731 struct rbd_image_header_ondisk *ondisk)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700732{
Alex Elderccece232012-07-10 20:30:10 -0500733 u32 snap_count;
Alex Elder58c17b02012-08-23 23:22:06 -0500734 size_t len;
Alex Elderd2bb24e2012-07-26 23:37:14 -0500735 size_t size;
Alex Elder621901d2012-08-23 23:22:06 -0500736 u32 i;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700737
Alex Elder6a523252012-07-19 17:12:59 -0500738 memset(header, 0, sizeof (*header));
739
Alex Elder103a1502012-08-02 11:29:45 -0500740 snap_count = le32_to_cpu(ondisk->snap_count);
741
Alex Elder58c17b02012-08-23 23:22:06 -0500742 len = strnlen(ondisk->object_prefix, sizeof (ondisk->object_prefix));
743 header->object_prefix = kmalloc(len + 1, GFP_KERNEL);
Alex Elder6a523252012-07-19 17:12:59 -0500744 if (!header->object_prefix)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700745 return -ENOMEM;
Alex Elder58c17b02012-08-23 23:22:06 -0500746 memcpy(header->object_prefix, ondisk->object_prefix, len);
747 header->object_prefix[len] = '\0';
Alex Elder00f1f362012-02-07 12:03:36 -0600748
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700749 if (snap_count) {
Alex Elderf785cc12012-08-23 23:22:06 -0500750 u64 snap_names_len = le64_to_cpu(ondisk->snap_names_len);
751
Alex Elder621901d2012-08-23 23:22:06 -0500752 /* Save a copy of the snapshot names */
753
Alex Elderf785cc12012-08-23 23:22:06 -0500754 if (snap_names_len > (u64) SIZE_MAX)
755 return -EIO;
756 header->snap_names = kmalloc(snap_names_len, GFP_KERNEL);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700757 if (!header->snap_names)
Alex Elder6a523252012-07-19 17:12:59 -0500758 goto out_err;
Alex Elderf785cc12012-08-23 23:22:06 -0500759 /*
760 * Note that rbd_dev_v1_header_read() guarantees
761 * the ondisk buffer we're working with has
762 * snap_names_len bytes beyond the end of the
763 * snapshot id array, this memcpy() is safe.
764 */
765 memcpy(header->snap_names, &ondisk->snaps[snap_count],
766 snap_names_len);
Alex Elder6a523252012-07-19 17:12:59 -0500767
Alex Elder621901d2012-08-23 23:22:06 -0500768 /* Record each snapshot's size */
769
Alex Elderd2bb24e2012-07-26 23:37:14 -0500770 size = snap_count * sizeof (*header->snap_sizes);
771 header->snap_sizes = kmalloc(size, GFP_KERNEL);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700772 if (!header->snap_sizes)
Alex Elder6a523252012-07-19 17:12:59 -0500773 goto out_err;
Alex Elder621901d2012-08-23 23:22:06 -0500774 for (i = 0; i < snap_count; i++)
775 header->snap_sizes[i] =
776 le64_to_cpu(ondisk->snaps[i].image_size);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700777 } else {
778 header->snap_names = NULL;
779 header->snap_sizes = NULL;
780 }
Alex Elder849b4262012-07-09 21:04:24 -0500781
Alex Elder34b13182012-07-13 20:35:12 -0500782 header->features = 0; /* No features support in v1 images */
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700783 header->obj_order = ondisk->options.order;
784 header->crypt_type = ondisk->options.crypt_type;
785 header->comp_type = ondisk->options.comp_type;
Alex Elder6a523252012-07-19 17:12:59 -0500786
Alex Elder621901d2012-08-23 23:22:06 -0500787 /* Allocate and fill in the snapshot context */
788
Alex Elderf84344f2012-08-31 17:29:51 -0500789 header->image_size = le64_to_cpu(ondisk->image_size);
Alex Elder468521c2013-04-26 09:43:47 -0500790
Alex Elder812164f82013-04-30 00:44:32 -0500791 header->snapc = ceph_create_snap_context(snap_count, GFP_KERNEL);
Alex Elder6a523252012-07-19 17:12:59 -0500792 if (!header->snapc)
793 goto out_err;
Alex Elder505cbb92012-07-19 08:49:18 -0500794 header->snapc->seq = le64_to_cpu(ondisk->snap_seq);
Alex Elder621901d2012-08-23 23:22:06 -0500795 for (i = 0; i < snap_count; i++)
Alex Elder468521c2013-04-26 09:43:47 -0500796 header->snapc->snaps[i] = le64_to_cpu(ondisk->snaps[i].id);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700797
798 return 0;
799
Alex Elder6a523252012-07-19 17:12:59 -0500800out_err:
Alex Elder849b4262012-07-09 21:04:24 -0500801 kfree(header->snap_sizes);
Alex Elderccece232012-07-10 20:30:10 -0500802 header->snap_sizes = NULL;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700803 kfree(header->snap_names);
Alex Elderccece232012-07-10 20:30:10 -0500804 header->snap_names = NULL;
Alex Elder6a523252012-07-19 17:12:59 -0500805 kfree(header->object_prefix);
806 header->object_prefix = NULL;
Alex Elderccece232012-07-10 20:30:10 -0500807
Alex Elder00f1f362012-02-07 12:03:36 -0600808 return -ENOMEM;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700809}
810
Alex Elder9682fc62013-04-30 00:44:33 -0500811static const char *_rbd_dev_v1_snap_name(struct rbd_device *rbd_dev, u32 which)
812{
813 const char *snap_name;
814
815 rbd_assert(which < rbd_dev->header.snapc->num_snaps);
816
817 /* Skip over names until we find the one we are looking for */
818
819 snap_name = rbd_dev->header.snap_names;
820 while (which--)
821 snap_name += strlen(snap_name) + 1;
822
823 return kstrdup(snap_name, GFP_KERNEL);
824}
825
Alex Elder30d1cff2013-05-01 12:43:03 -0500826/*
827 * Snapshot id comparison function for use with qsort()/bsearch().
828 * Note that result is for snapshots in *descending* order.
829 */
830static int snapid_compare_reverse(const void *s1, const void *s2)
831{
832 u64 snap_id1 = *(u64 *)s1;
833 u64 snap_id2 = *(u64 *)s2;
834
835 if (snap_id1 < snap_id2)
836 return 1;
837 return snap_id1 == snap_id2 ? 0 : -1;
838}
839
840/*
841 * Search a snapshot context to see if the given snapshot id is
842 * present.
843 *
844 * Returns the position of the snapshot id in the array if it's found,
845 * or BAD_SNAP_INDEX otherwise.
846 *
847 * Note: The snapshot array is in kept sorted (by the osd) in
848 * reverse order, highest snapshot id first.
849 */
Alex Elder9682fc62013-04-30 00:44:33 -0500850static u32 rbd_dev_snap_index(struct rbd_device *rbd_dev, u64 snap_id)
851{
852 struct ceph_snap_context *snapc = rbd_dev->header.snapc;
Alex Elder30d1cff2013-05-01 12:43:03 -0500853 u64 *found;
Alex Elder9682fc62013-04-30 00:44:33 -0500854
Alex Elder30d1cff2013-05-01 12:43:03 -0500855 found = bsearch(&snap_id, &snapc->snaps, snapc->num_snaps,
856 sizeof (snap_id), snapid_compare_reverse);
Alex Elder9682fc62013-04-30 00:44:33 -0500857
Alex Elder30d1cff2013-05-01 12:43:03 -0500858 return found ? (u32)(found - &snapc->snaps[0]) : BAD_SNAP_INDEX;
Alex Elder9682fc62013-04-30 00:44:33 -0500859}
860
Alex Elder2ad3d712013-04-30 00:44:33 -0500861static const char *rbd_dev_v1_snap_name(struct rbd_device *rbd_dev,
862 u64 snap_id)
Alex Elder54cac612013-04-30 00:44:33 -0500863{
864 u32 which;
865
866 which = rbd_dev_snap_index(rbd_dev, snap_id);
867 if (which == BAD_SNAP_INDEX)
868 return NULL;
869
870 return _rbd_dev_v1_snap_name(rbd_dev, which);
871}
872
Alex Elder9e15b772012-10-30 19:40:33 -0500873static const char *rbd_snap_name(struct rbd_device *rbd_dev, u64 snap_id)
874{
Alex Elder9e15b772012-10-30 19:40:33 -0500875 if (snap_id == CEPH_NOSNAP)
876 return RBD_SNAP_HEAD_NAME;
877
Alex Elder54cac612013-04-30 00:44:33 -0500878 rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
879 if (rbd_dev->image_format == 1)
880 return rbd_dev_v1_snap_name(rbd_dev, snap_id);
Alex Elder9e15b772012-10-30 19:40:33 -0500881
Alex Elder54cac612013-04-30 00:44:33 -0500882 return rbd_dev_v2_snap_name(rbd_dev, snap_id);
Alex Elder9e15b772012-10-30 19:40:33 -0500883}
884
Alex Elder2ad3d712013-04-30 00:44:33 -0500885static int rbd_snap_size(struct rbd_device *rbd_dev, u64 snap_id,
886 u64 *snap_size)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700887{
Alex Elder2ad3d712013-04-30 00:44:33 -0500888 rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
889 if (snap_id == CEPH_NOSNAP) {
890 *snap_size = rbd_dev->header.image_size;
891 } else if (rbd_dev->image_format == 1) {
892 u32 which;
Alex Elder00f1f362012-02-07 12:03:36 -0600893
Alex Elder2ad3d712013-04-30 00:44:33 -0500894 which = rbd_dev_snap_index(rbd_dev, snap_id);
895 if (which == BAD_SNAP_INDEX)
896 return -ENOENT;
Alex Elder00f1f362012-02-07 12:03:36 -0600897
Alex Elder2ad3d712013-04-30 00:44:33 -0500898 *snap_size = rbd_dev->header.snap_sizes[which];
899 } else {
900 u64 size = 0;
901 int ret;
902
903 ret = _rbd_dev_v2_snap_size(rbd_dev, snap_id, NULL, &size);
904 if (ret)
905 return ret;
906
907 *snap_size = size;
908 }
909 return 0;
910}
911
912static int rbd_snap_features(struct rbd_device *rbd_dev, u64 snap_id,
913 u64 *snap_features)
914{
915 rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
916 if (snap_id == CEPH_NOSNAP) {
917 *snap_features = rbd_dev->header.features;
918 } else if (rbd_dev->image_format == 1) {
919 *snap_features = 0; /* No features for format 1 */
920 } else {
921 u64 features = 0;
922 int ret;
923
924 ret = _rbd_dev_v2_snap_features(rbd_dev, snap_id, &features);
925 if (ret)
926 return ret;
927
928 *snap_features = features;
929 }
930 return 0;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700931}
932
Alex Elderd1cf5782013-04-27 09:59:30 -0500933static int rbd_dev_mapping_set(struct rbd_device *rbd_dev)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700934{
Alex Elder2ad3d712013-04-30 00:44:33 -0500935 const char *snap_name = rbd_dev->spec->snap_name;
936 u64 snap_id;
937 u64 size = 0;
938 u64 features = 0;
939 int ret;
Alex Elder8b0241f2013-04-25 23:15:08 -0500940
Alex Elder2ad3d712013-04-30 00:44:33 -0500941 if (strcmp(snap_name, RBD_SNAP_HEAD_NAME)) {
942 snap_id = rbd_snap_id_by_name(rbd_dev, snap_name);
943 if (snap_id == CEPH_NOSNAP)
Alex Elder8b0241f2013-04-25 23:15:08 -0500944 return -ENOENT;
Alex Elder2ad3d712013-04-30 00:44:33 -0500945 } else {
946 snap_id = CEPH_NOSNAP;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700947 }
Alex Elder6d292902013-01-14 12:43:31 -0600948
Alex Elder2ad3d712013-04-30 00:44:33 -0500949 ret = rbd_snap_size(rbd_dev, snap_id, &size);
950 if (ret)
951 return ret;
952 ret = rbd_snap_features(rbd_dev, snap_id, &features);
953 if (ret)
954 return ret;
955
956 rbd_dev->mapping.size = size;
957 rbd_dev->mapping.features = features;
958
959 /* If we are mapping a snapshot it must be marked read-only */
960
961 if (snap_id != CEPH_NOSNAP)
962 rbd_dev->mapping.read_only = true;
963
Alex Elder8b0241f2013-04-25 23:15:08 -0500964 return 0;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700965}
966
Alex Elderd1cf5782013-04-27 09:59:30 -0500967static void rbd_dev_mapping_clear(struct rbd_device *rbd_dev)
968{
969 rbd_dev->mapping.size = 0;
970 rbd_dev->mapping.features = 0;
971 rbd_dev->mapping.read_only = true;
972}
973
Alex Elder200a6a82013-04-28 23:32:34 -0500974static void rbd_dev_clear_mapping(struct rbd_device *rbd_dev)
975{
976 rbd_dev->mapping.size = 0;
977 rbd_dev->mapping.features = 0;
978 rbd_dev->mapping.read_only = true;
979}
980
Alex Elder98571b52013-01-20 14:44:42 -0600981static const char *rbd_segment_name(struct rbd_device *rbd_dev, u64 offset)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700982{
Alex Elder65ccfe22012-08-09 10:33:26 -0700983 char *name;
984 u64 segment;
985 int ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700986
Alex Elder2fd82b92012-11-09 15:05:54 -0600987 name = kmalloc(MAX_OBJ_NAME_SIZE + 1, GFP_NOIO);
Alex Elder65ccfe22012-08-09 10:33:26 -0700988 if (!name)
989 return NULL;
990 segment = offset >> rbd_dev->header.obj_order;
Alex Elder2fd82b92012-11-09 15:05:54 -0600991 ret = snprintf(name, MAX_OBJ_NAME_SIZE + 1, "%s.%012llx",
Alex Elder65ccfe22012-08-09 10:33:26 -0700992 rbd_dev->header.object_prefix, segment);
Alex Elder2fd82b92012-11-09 15:05:54 -0600993 if (ret < 0 || ret > MAX_OBJ_NAME_SIZE) {
Alex Elder65ccfe22012-08-09 10:33:26 -0700994 pr_err("error formatting segment name for #%llu (%d)\n",
995 segment, ret);
996 kfree(name);
997 name = NULL;
998 }
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700999
Alex Elder65ccfe22012-08-09 10:33:26 -07001000 return name;
1001}
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001002
Alex Elder65ccfe22012-08-09 10:33:26 -07001003static u64 rbd_segment_offset(struct rbd_device *rbd_dev, u64 offset)
1004{
1005 u64 segment_size = (u64) 1 << rbd_dev->header.obj_order;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001006
Alex Elder65ccfe22012-08-09 10:33:26 -07001007 return offset & (segment_size - 1);
1008}
1009
1010static u64 rbd_segment_length(struct rbd_device *rbd_dev,
1011 u64 offset, u64 length)
1012{
1013 u64 segment_size = (u64) 1 << rbd_dev->header.obj_order;
1014
1015 offset &= segment_size - 1;
1016
Alex Elderaafb2302012-09-06 16:00:54 -05001017 rbd_assert(length <= U64_MAX - offset);
Alex Elder65ccfe22012-08-09 10:33:26 -07001018 if (offset + length > segment_size)
1019 length = segment_size - offset;
1020
1021 return length;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001022}
1023
1024/*
Josh Durgin029bcbd2011-07-22 11:35:23 -07001025 * returns the size of an object in the image
1026 */
1027static u64 rbd_obj_bytes(struct rbd_image_header *header)
1028{
1029 return 1 << header->obj_order;
1030}
1031
1032/*
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001033 * bio helpers
1034 */
1035
1036static void bio_chain_put(struct bio *chain)
1037{
1038 struct bio *tmp;
1039
1040 while (chain) {
1041 tmp = chain;
1042 chain = chain->bi_next;
1043 bio_put(tmp);
1044 }
1045}
1046
1047/*
1048 * zeros a bio chain, starting at specific offset
1049 */
1050static void zero_bio_chain(struct bio *chain, int start_ofs)
1051{
1052 struct bio_vec *bv;
1053 unsigned long flags;
1054 void *buf;
1055 int i;
1056 int pos = 0;
1057
1058 while (chain) {
1059 bio_for_each_segment(bv, chain, i) {
1060 if (pos + bv->bv_len > start_ofs) {
1061 int remainder = max(start_ofs - pos, 0);
1062 buf = bvec_kmap_irq(bv, &flags);
1063 memset(buf + remainder, 0,
1064 bv->bv_len - remainder);
Dan Carpenter85b5aaa2010-10-11 21:15:11 +02001065 bvec_kunmap_irq(buf, &flags);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001066 }
1067 pos += bv->bv_len;
1068 }
1069
1070 chain = chain->bi_next;
1071 }
1072}
1073
1074/*
Alex Elderb9434c52013-04-19 15:34:50 -05001075 * similar to zero_bio_chain(), zeros data defined by a page array,
1076 * starting at the given byte offset from the start of the array and
1077 * continuing up to the given end offset. The pages array is
1078 * assumed to be big enough to hold all bytes up to the end.
1079 */
1080static void zero_pages(struct page **pages, u64 offset, u64 end)
1081{
1082 struct page **page = &pages[offset >> PAGE_SHIFT];
1083
1084 rbd_assert(end > offset);
1085 rbd_assert(end - offset <= (u64)SIZE_MAX);
1086 while (offset < end) {
1087 size_t page_offset;
1088 size_t length;
1089 unsigned long flags;
1090 void *kaddr;
1091
1092 page_offset = (size_t)(offset & ~PAGE_MASK);
1093 length = min(PAGE_SIZE - page_offset, (size_t)(end - offset));
1094 local_irq_save(flags);
1095 kaddr = kmap_atomic(*page);
1096 memset(kaddr + page_offset, 0, length);
1097 kunmap_atomic(kaddr);
1098 local_irq_restore(flags);
1099
1100 offset += length;
1101 page++;
1102 }
1103}
1104
1105/*
Alex Elderf7760da2012-10-20 22:17:27 -05001106 * Clone a portion of a bio, starting at the given byte offset
1107 * and continuing for the number of bytes indicated.
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001108 */
Alex Elderf7760da2012-10-20 22:17:27 -05001109static struct bio *bio_clone_range(struct bio *bio_src,
1110 unsigned int offset,
1111 unsigned int len,
1112 gfp_t gfpmask)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001113{
Alex Elderf7760da2012-10-20 22:17:27 -05001114 struct bio_vec *bv;
1115 unsigned int resid;
1116 unsigned short idx;
1117 unsigned int voff;
1118 unsigned short end_idx;
1119 unsigned short vcnt;
1120 struct bio *bio;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001121
Alex Elderf7760da2012-10-20 22:17:27 -05001122 /* Handle the easy case for the caller */
1123
1124 if (!offset && len == bio_src->bi_size)
1125 return bio_clone(bio_src, gfpmask);
1126
1127 if (WARN_ON_ONCE(!len))
1128 return NULL;
1129 if (WARN_ON_ONCE(len > bio_src->bi_size))
1130 return NULL;
1131 if (WARN_ON_ONCE(offset > bio_src->bi_size - len))
1132 return NULL;
1133
1134 /* Find first affected segment... */
1135
1136 resid = offset;
1137 __bio_for_each_segment(bv, bio_src, idx, 0) {
1138 if (resid < bv->bv_len)
1139 break;
1140 resid -= bv->bv_len;
1141 }
1142 voff = resid;
1143
1144 /* ...and the last affected segment */
1145
1146 resid += len;
1147 __bio_for_each_segment(bv, bio_src, end_idx, idx) {
1148 if (resid <= bv->bv_len)
1149 break;
1150 resid -= bv->bv_len;
1151 }
1152 vcnt = end_idx - idx + 1;
1153
1154 /* Build the clone */
1155
1156 bio = bio_alloc(gfpmask, (unsigned int) vcnt);
1157 if (!bio)
1158 return NULL; /* ENOMEM */
1159
1160 bio->bi_bdev = bio_src->bi_bdev;
1161 bio->bi_sector = bio_src->bi_sector + (offset >> SECTOR_SHIFT);
1162 bio->bi_rw = bio_src->bi_rw;
1163 bio->bi_flags |= 1 << BIO_CLONED;
1164
1165 /*
1166 * Copy over our part of the bio_vec, then update the first
1167 * and last (or only) entries.
1168 */
1169 memcpy(&bio->bi_io_vec[0], &bio_src->bi_io_vec[idx],
1170 vcnt * sizeof (struct bio_vec));
1171 bio->bi_io_vec[0].bv_offset += voff;
1172 if (vcnt > 1) {
1173 bio->bi_io_vec[0].bv_len -= voff;
1174 bio->bi_io_vec[vcnt - 1].bv_len = resid;
1175 } else {
1176 bio->bi_io_vec[0].bv_len = len;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001177 }
1178
Alex Elderf7760da2012-10-20 22:17:27 -05001179 bio->bi_vcnt = vcnt;
1180 bio->bi_size = len;
1181 bio->bi_idx = 0;
Alex Elder542582f2012-08-09 10:33:25 -07001182
Alex Elderf7760da2012-10-20 22:17:27 -05001183 return bio;
1184}
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001185
Alex Elderf7760da2012-10-20 22:17:27 -05001186/*
1187 * Clone a portion of a bio chain, starting at the given byte offset
1188 * into the first bio in the source chain and continuing for the
1189 * number of bytes indicated. The result is another bio chain of
1190 * exactly the given length, or a null pointer on error.
1191 *
1192 * The bio_src and offset parameters are both in-out. On entry they
1193 * refer to the first source bio and the offset into that bio where
1194 * the start of data to be cloned is located.
1195 *
1196 * On return, bio_src is updated to refer to the bio in the source
1197 * chain that contains first un-cloned byte, and *offset will
1198 * contain the offset of that byte within that bio.
1199 */
1200static struct bio *bio_chain_clone_range(struct bio **bio_src,
1201 unsigned int *offset,
1202 unsigned int len,
1203 gfp_t gfpmask)
1204{
1205 struct bio *bi = *bio_src;
1206 unsigned int off = *offset;
1207 struct bio *chain = NULL;
1208 struct bio **end;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001209
Alex Elderf7760da2012-10-20 22:17:27 -05001210 /* Build up a chain of clone bios up to the limit */
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001211
Alex Elderf7760da2012-10-20 22:17:27 -05001212 if (!bi || off >= bi->bi_size || !len)
1213 return NULL; /* Nothing to clone */
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001214
Alex Elderf7760da2012-10-20 22:17:27 -05001215 end = &chain;
1216 while (len) {
1217 unsigned int bi_size;
1218 struct bio *bio;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001219
Alex Elderf5400b72012-11-01 10:17:15 -05001220 if (!bi) {
1221 rbd_warn(NULL, "bio_chain exhausted with %u left", len);
Alex Elderf7760da2012-10-20 22:17:27 -05001222 goto out_err; /* EINVAL; ran out of bio's */
Alex Elderf5400b72012-11-01 10:17:15 -05001223 }
Alex Elderf7760da2012-10-20 22:17:27 -05001224 bi_size = min_t(unsigned int, bi->bi_size - off, len);
1225 bio = bio_clone_range(bi, off, bi_size, gfpmask);
1226 if (!bio)
1227 goto out_err; /* ENOMEM */
1228
1229 *end = bio;
1230 end = &bio->bi_next;
1231
1232 off += bi_size;
1233 if (off == bi->bi_size) {
1234 bi = bi->bi_next;
1235 off = 0;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001236 }
Alex Elderf7760da2012-10-20 22:17:27 -05001237 len -= bi_size;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001238 }
Alex Elderf7760da2012-10-20 22:17:27 -05001239 *bio_src = bi;
1240 *offset = off;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001241
Alex Elderf7760da2012-10-20 22:17:27 -05001242 return chain;
1243out_err:
1244 bio_chain_put(chain);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001245
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001246 return NULL;
1247}
1248
Alex Elder926f9b32013-02-11 12:33:24 -06001249/*
1250 * The default/initial value for all object request flags is 0. For
1251 * each flag, once its value is set to 1 it is never reset to 0
1252 * again.
1253 */
Alex Elder6365d332013-02-11 12:33:24 -06001254static void obj_request_img_data_set(struct rbd_obj_request *obj_request)
1255{
1256 if (test_and_set_bit(OBJ_REQ_IMG_DATA, &obj_request->flags)) {
Alex Elder6365d332013-02-11 12:33:24 -06001257 struct rbd_device *rbd_dev;
1258
Alex Elder57acbaa2013-02-11 12:33:24 -06001259 rbd_dev = obj_request->img_request->rbd_dev;
Alex Elder6365d332013-02-11 12:33:24 -06001260 rbd_warn(rbd_dev, "obj_request %p already marked img_data\n",
1261 obj_request);
1262 }
1263}
1264
1265static bool obj_request_img_data_test(struct rbd_obj_request *obj_request)
1266{
1267 smp_mb();
1268 return test_bit(OBJ_REQ_IMG_DATA, &obj_request->flags) != 0;
1269}
1270
Alex Elder57acbaa2013-02-11 12:33:24 -06001271static void obj_request_done_set(struct rbd_obj_request *obj_request)
1272{
1273 if (test_and_set_bit(OBJ_REQ_DONE, &obj_request->flags)) {
1274 struct rbd_device *rbd_dev = NULL;
1275
1276 if (obj_request_img_data_test(obj_request))
1277 rbd_dev = obj_request->img_request->rbd_dev;
1278 rbd_warn(rbd_dev, "obj_request %p already marked done\n",
1279 obj_request);
1280 }
1281}
1282
1283static bool obj_request_done_test(struct rbd_obj_request *obj_request)
1284{
1285 smp_mb();
1286 return test_bit(OBJ_REQ_DONE, &obj_request->flags) != 0;
1287}
1288
Alex Elder5679c592013-02-11 12:33:24 -06001289/*
1290 * This sets the KNOWN flag after (possibly) setting the EXISTS
1291 * flag. The latter is set based on the "exists" value provided.
1292 *
1293 * Note that for our purposes once an object exists it never goes
1294 * away again. It's possible that the response from two existence
1295 * checks are separated by the creation of the target object, and
1296 * the first ("doesn't exist") response arrives *after* the second
1297 * ("does exist"). In that case we ignore the second one.
1298 */
1299static void obj_request_existence_set(struct rbd_obj_request *obj_request,
1300 bool exists)
1301{
1302 if (exists)
1303 set_bit(OBJ_REQ_EXISTS, &obj_request->flags);
1304 set_bit(OBJ_REQ_KNOWN, &obj_request->flags);
1305 smp_mb();
1306}
1307
1308static bool obj_request_known_test(struct rbd_obj_request *obj_request)
1309{
1310 smp_mb();
1311 return test_bit(OBJ_REQ_KNOWN, &obj_request->flags) != 0;
1312}
1313
1314static bool obj_request_exists_test(struct rbd_obj_request *obj_request)
1315{
1316 smp_mb();
1317 return test_bit(OBJ_REQ_EXISTS, &obj_request->flags) != 0;
1318}
1319
Alex Elderbf0d5f502012-11-22 00:00:08 -06001320static void rbd_obj_request_get(struct rbd_obj_request *obj_request)
1321{
Alex Elder37206ee2013-02-20 17:32:08 -06001322 dout("%s: obj %p (was %d)\n", __func__, obj_request,
1323 atomic_read(&obj_request->kref.refcount));
Alex Elderbf0d5f502012-11-22 00:00:08 -06001324 kref_get(&obj_request->kref);
1325}
1326
1327static void rbd_obj_request_destroy(struct kref *kref);
1328static void rbd_obj_request_put(struct rbd_obj_request *obj_request)
1329{
1330 rbd_assert(obj_request != NULL);
Alex Elder37206ee2013-02-20 17:32:08 -06001331 dout("%s: obj %p (was %d)\n", __func__, obj_request,
1332 atomic_read(&obj_request->kref.refcount));
Alex Elderbf0d5f502012-11-22 00:00:08 -06001333 kref_put(&obj_request->kref, rbd_obj_request_destroy);
1334}
1335
1336static void rbd_img_request_get(struct rbd_img_request *img_request)
1337{
Alex Elder37206ee2013-02-20 17:32:08 -06001338 dout("%s: img %p (was %d)\n", __func__, img_request,
1339 atomic_read(&img_request->kref.refcount));
Alex Elderbf0d5f502012-11-22 00:00:08 -06001340 kref_get(&img_request->kref);
1341}
1342
1343static void rbd_img_request_destroy(struct kref *kref);
1344static void rbd_img_request_put(struct rbd_img_request *img_request)
1345{
1346 rbd_assert(img_request != NULL);
Alex Elder37206ee2013-02-20 17:32:08 -06001347 dout("%s: img %p (was %d)\n", __func__, img_request,
1348 atomic_read(&img_request->kref.refcount));
Alex Elderbf0d5f502012-11-22 00:00:08 -06001349 kref_put(&img_request->kref, rbd_img_request_destroy);
1350}
1351
1352static inline void rbd_img_obj_request_add(struct rbd_img_request *img_request,
1353 struct rbd_obj_request *obj_request)
1354{
Alex Elder25dcf952013-01-25 17:08:55 -06001355 rbd_assert(obj_request->img_request == NULL);
1356
Alex Elderb155e862013-04-15 14:50:37 -05001357 /* Image request now owns object's original reference */
Alex Elderbf0d5f502012-11-22 00:00:08 -06001358 obj_request->img_request = img_request;
Alex Elder25dcf952013-01-25 17:08:55 -06001359 obj_request->which = img_request->obj_request_count;
Alex Elder6365d332013-02-11 12:33:24 -06001360 rbd_assert(!obj_request_img_data_test(obj_request));
1361 obj_request_img_data_set(obj_request);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001362 rbd_assert(obj_request->which != BAD_WHICH);
Alex Elder25dcf952013-01-25 17:08:55 -06001363 img_request->obj_request_count++;
1364 list_add_tail(&obj_request->links, &img_request->obj_requests);
Alex Elder37206ee2013-02-20 17:32:08 -06001365 dout("%s: img %p obj %p w=%u\n", __func__, img_request, obj_request,
1366 obj_request->which);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001367}
1368
1369static inline void rbd_img_obj_request_del(struct rbd_img_request *img_request,
1370 struct rbd_obj_request *obj_request)
1371{
1372 rbd_assert(obj_request->which != BAD_WHICH);
Alex Elder25dcf952013-01-25 17:08:55 -06001373
Alex Elder37206ee2013-02-20 17:32:08 -06001374 dout("%s: img %p obj %p w=%u\n", __func__, img_request, obj_request,
1375 obj_request->which);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001376 list_del(&obj_request->links);
Alex Elder25dcf952013-01-25 17:08:55 -06001377 rbd_assert(img_request->obj_request_count > 0);
1378 img_request->obj_request_count--;
1379 rbd_assert(obj_request->which == img_request->obj_request_count);
1380 obj_request->which = BAD_WHICH;
Alex Elder6365d332013-02-11 12:33:24 -06001381 rbd_assert(obj_request_img_data_test(obj_request));
Alex Elderbf0d5f502012-11-22 00:00:08 -06001382 rbd_assert(obj_request->img_request == img_request);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001383 obj_request->img_request = NULL;
Alex Elder25dcf952013-01-25 17:08:55 -06001384 obj_request->callback = NULL;
Alex Elderbf0d5f502012-11-22 00:00:08 -06001385 rbd_obj_request_put(obj_request);
1386}
1387
1388static bool obj_request_type_valid(enum obj_request_type type)
1389{
1390 switch (type) {
Alex Elder9969ebc2013-01-18 12:31:10 -06001391 case OBJ_REQUEST_NODATA:
Alex Elderbf0d5f502012-11-22 00:00:08 -06001392 case OBJ_REQUEST_BIO:
Alex Elder788e2df2013-01-17 12:25:27 -06001393 case OBJ_REQUEST_PAGES:
Alex Elderbf0d5f502012-11-22 00:00:08 -06001394 return true;
1395 default:
1396 return false;
1397 }
1398}
1399
Alex Elderbf0d5f502012-11-22 00:00:08 -06001400static int rbd_obj_request_submit(struct ceph_osd_client *osdc,
1401 struct rbd_obj_request *obj_request)
1402{
Alex Elder37206ee2013-02-20 17:32:08 -06001403 dout("%s: osdc %p obj %p\n", __func__, osdc, obj_request);
1404
Alex Elderbf0d5f502012-11-22 00:00:08 -06001405 return ceph_osdc_start_request(osdc, obj_request->osd_req, false);
1406}
1407
1408static void rbd_img_request_complete(struct rbd_img_request *img_request)
1409{
Alex Elder55f27e02013-04-10 12:34:25 -05001410
Alex Elder37206ee2013-02-20 17:32:08 -06001411 dout("%s: img %p\n", __func__, img_request);
Alex Elder55f27e02013-04-10 12:34:25 -05001412
1413 /*
1414 * If no error occurred, compute the aggregate transfer
1415 * count for the image request. We could instead use
1416 * atomic64_cmpxchg() to update it as each object request
1417 * completes; not clear which way is better off hand.
1418 */
1419 if (!img_request->result) {
1420 struct rbd_obj_request *obj_request;
1421 u64 xferred = 0;
1422
1423 for_each_obj_request(img_request, obj_request)
1424 xferred += obj_request->xferred;
1425 img_request->xferred = xferred;
1426 }
1427
Alex Elderbf0d5f502012-11-22 00:00:08 -06001428 if (img_request->callback)
1429 img_request->callback(img_request);
1430 else
1431 rbd_img_request_put(img_request);
1432}
1433
Alex Elder788e2df2013-01-17 12:25:27 -06001434/* Caller is responsible for rbd_obj_request_destroy(obj_request) */
1435
1436static int rbd_obj_request_wait(struct rbd_obj_request *obj_request)
1437{
Alex Elder37206ee2013-02-20 17:32:08 -06001438 dout("%s: obj %p\n", __func__, obj_request);
1439
Alex Elder788e2df2013-01-17 12:25:27 -06001440 return wait_for_completion_interruptible(&obj_request->completion);
1441}
1442
Alex Elder0c425242013-02-08 09:55:49 -06001443/*
1444 * The default/initial value for all image request flags is 0. Each
1445 * is conditionally set to 1 at image request initialization time
1446 * and currently never change thereafter.
1447 */
1448static void img_request_write_set(struct rbd_img_request *img_request)
1449{
1450 set_bit(IMG_REQ_WRITE, &img_request->flags);
1451 smp_mb();
1452}
1453
1454static bool img_request_write_test(struct rbd_img_request *img_request)
1455{
1456 smp_mb();
1457 return test_bit(IMG_REQ_WRITE, &img_request->flags) != 0;
1458}
1459
Alex Elder9849e982013-01-24 16:13:36 -06001460static void img_request_child_set(struct rbd_img_request *img_request)
1461{
1462 set_bit(IMG_REQ_CHILD, &img_request->flags);
1463 smp_mb();
1464}
1465
1466static bool img_request_child_test(struct rbd_img_request *img_request)
1467{
1468 smp_mb();
1469 return test_bit(IMG_REQ_CHILD, &img_request->flags) != 0;
1470}
1471
Alex Elderd0b2e942013-01-24 16:13:36 -06001472static void img_request_layered_set(struct rbd_img_request *img_request)
1473{
1474 set_bit(IMG_REQ_LAYERED, &img_request->flags);
1475 smp_mb();
1476}
1477
1478static bool img_request_layered_test(struct rbd_img_request *img_request)
1479{
1480 smp_mb();
1481 return test_bit(IMG_REQ_LAYERED, &img_request->flags) != 0;
1482}
1483
Alex Elder6e2a4502013-03-27 09:16:30 -05001484static void
1485rbd_img_obj_request_read_callback(struct rbd_obj_request *obj_request)
1486{
Alex Elderb9434c52013-04-19 15:34:50 -05001487 u64 xferred = obj_request->xferred;
1488 u64 length = obj_request->length;
1489
Alex Elder6e2a4502013-03-27 09:16:30 -05001490 dout("%s: obj %p img %p result %d %llu/%llu\n", __func__,
1491 obj_request, obj_request->img_request, obj_request->result,
Alex Elderb9434c52013-04-19 15:34:50 -05001492 xferred, length);
Alex Elder6e2a4502013-03-27 09:16:30 -05001493 /*
1494 * ENOENT means a hole in the image. We zero-fill the
1495 * entire length of the request. A short read also implies
1496 * zero-fill to the end of the request. Either way we
1497 * update the xferred count to indicate the whole request
1498 * was satisfied.
1499 */
Alex Elderb9434c52013-04-19 15:34:50 -05001500 rbd_assert(obj_request->type != OBJ_REQUEST_NODATA);
Alex Elder6e2a4502013-03-27 09:16:30 -05001501 if (obj_request->result == -ENOENT) {
Alex Elderb9434c52013-04-19 15:34:50 -05001502 if (obj_request->type == OBJ_REQUEST_BIO)
1503 zero_bio_chain(obj_request->bio_list, 0);
1504 else
1505 zero_pages(obj_request->pages, 0, length);
Alex Elder6e2a4502013-03-27 09:16:30 -05001506 obj_request->result = 0;
Alex Elderb9434c52013-04-19 15:34:50 -05001507 obj_request->xferred = length;
1508 } else if (xferred < length && !obj_request->result) {
1509 if (obj_request->type == OBJ_REQUEST_BIO)
1510 zero_bio_chain(obj_request->bio_list, xferred);
1511 else
1512 zero_pages(obj_request->pages, xferred, length);
1513 obj_request->xferred = length;
Alex Elder6e2a4502013-03-27 09:16:30 -05001514 }
1515 obj_request_done_set(obj_request);
1516}
1517
Alex Elderbf0d5f502012-11-22 00:00:08 -06001518static void rbd_obj_request_complete(struct rbd_obj_request *obj_request)
1519{
Alex Elder37206ee2013-02-20 17:32:08 -06001520 dout("%s: obj %p cb %p\n", __func__, obj_request,
1521 obj_request->callback);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001522 if (obj_request->callback)
1523 obj_request->callback(obj_request);
Alex Elder788e2df2013-01-17 12:25:27 -06001524 else
1525 complete_all(&obj_request->completion);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001526}
1527
/*
 * Callback for osd ops that need no post-processing: the object
 * request is simply marked done.
 */
static void rbd_osd_trivial_callback(struct rbd_obj_request *obj_request)
{
	dout("%s: obj %p\n", __func__, obj_request);

	obj_request_done_set(obj_request);
}
1533
Alex Elderc47f9372013-02-26 14:23:07 -06001534static void rbd_osd_read_callback(struct rbd_obj_request *obj_request)
Alex Elderbf0d5f502012-11-22 00:00:08 -06001535{
Alex Elder57acbaa2013-02-11 12:33:24 -06001536 struct rbd_img_request *img_request = NULL;
Alex Eldera9e8ba2c2013-04-21 00:32:07 -05001537 struct rbd_device *rbd_dev = NULL;
Alex Elder57acbaa2013-02-11 12:33:24 -06001538 bool layered = false;
1539
1540 if (obj_request_img_data_test(obj_request)) {
1541 img_request = obj_request->img_request;
1542 layered = img_request && img_request_layered_test(img_request);
Alex Eldera9e8ba2c2013-04-21 00:32:07 -05001543 rbd_dev = img_request->rbd_dev;
Alex Elder57acbaa2013-02-11 12:33:24 -06001544 }
Alex Elder8b3e1a52013-01-24 16:13:36 -06001545
1546 dout("%s: obj %p img %p result %d %llu/%llu\n", __func__,
1547 obj_request, img_request, obj_request->result,
1548 obj_request->xferred, obj_request->length);
Alex Eldera9e8ba2c2013-04-21 00:32:07 -05001549 if (layered && obj_request->result == -ENOENT &&
1550 obj_request->img_offset < rbd_dev->parent_overlap)
Alex Elder8b3e1a52013-01-24 16:13:36 -06001551 rbd_img_parent_read(obj_request);
1552 else if (img_request)
Alex Elder6e2a4502013-03-27 09:16:30 -05001553 rbd_img_obj_request_read_callback(obj_request);
1554 else
1555 obj_request_done_set(obj_request);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001556}
1557
Alex Elderc47f9372013-02-26 14:23:07 -06001558static void rbd_osd_write_callback(struct rbd_obj_request *obj_request)
Alex Elderbf0d5f502012-11-22 00:00:08 -06001559{
Sage Weil1b83bef2013-02-25 16:11:12 -08001560 dout("%s: obj %p result %d %llu\n", __func__, obj_request,
1561 obj_request->result, obj_request->length);
1562 /*
Alex Elder8b3e1a52013-01-24 16:13:36 -06001563 * There is no such thing as a successful short write. Set
1564 * it to our originally-requested length.
Sage Weil1b83bef2013-02-25 16:11:12 -08001565 */
1566 obj_request->xferred = obj_request->length;
Alex Elder07741302013-02-05 23:41:50 -06001567 obj_request_done_set(obj_request);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001568}
1569
/*
 * Handle completion of an osd stat op.  A simple stat call needs no
 * further work, so the request is just marked done.  More will be
 * done here if the stat is part of a write sequence for a layered
 * image.
 */
static void rbd_osd_stat_callback(struct rbd_obj_request *obj_request)
{
	dout("%s: obj %p\n", __func__, obj_request);

	obj_request_done_set(obj_request);
}
1579
Alex Elderbf0d5f502012-11-22 00:00:08 -06001580static void rbd_osd_req_callback(struct ceph_osd_request *osd_req,
1581 struct ceph_msg *msg)
1582{
1583 struct rbd_obj_request *obj_request = osd_req->r_priv;
Alex Elderbf0d5f502012-11-22 00:00:08 -06001584 u16 opcode;
1585
Alex Elder37206ee2013-02-20 17:32:08 -06001586 dout("%s: osd_req %p msg %p\n", __func__, osd_req, msg);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001587 rbd_assert(osd_req == obj_request->osd_req);
Alex Elder57acbaa2013-02-11 12:33:24 -06001588 if (obj_request_img_data_test(obj_request)) {
1589 rbd_assert(obj_request->img_request);
1590 rbd_assert(obj_request->which != BAD_WHICH);
1591 } else {
1592 rbd_assert(obj_request->which == BAD_WHICH);
1593 }
Alex Elderbf0d5f502012-11-22 00:00:08 -06001594
Sage Weil1b83bef2013-02-25 16:11:12 -08001595 if (osd_req->r_result < 0)
1596 obj_request->result = osd_req->r_result;
Alex Elderbf0d5f502012-11-22 00:00:08 -06001597
Alex Elder0eefd472013-04-19 15:34:50 -05001598 BUG_ON(osd_req->r_num_ops > 2);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001599
Alex Elderc47f9372013-02-26 14:23:07 -06001600 /*
1601 * We support a 64-bit length, but ultimately it has to be
1602 * passed to blk_end_request(), which takes an unsigned int.
1603 */
Sage Weil1b83bef2013-02-25 16:11:12 -08001604 obj_request->xferred = osd_req->r_reply_op_len[0];
Alex Elder8b3e1a52013-01-24 16:13:36 -06001605 rbd_assert(obj_request->xferred < (u64)UINT_MAX);
Alex Elder79528732013-04-03 21:32:51 -05001606 opcode = osd_req->r_ops[0].op;
Alex Elderbf0d5f502012-11-22 00:00:08 -06001607 switch (opcode) {
1608 case CEPH_OSD_OP_READ:
Alex Elderc47f9372013-02-26 14:23:07 -06001609 rbd_osd_read_callback(obj_request);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001610 break;
1611 case CEPH_OSD_OP_WRITE:
Alex Elderc47f9372013-02-26 14:23:07 -06001612 rbd_osd_write_callback(obj_request);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001613 break;
Alex Elderfbfab532013-02-08 09:55:48 -06001614 case CEPH_OSD_OP_STAT:
Alex Elderc47f9372013-02-26 14:23:07 -06001615 rbd_osd_stat_callback(obj_request);
Alex Elderfbfab532013-02-08 09:55:48 -06001616 break;
Alex Elder36be9a72013-01-19 00:30:28 -06001617 case CEPH_OSD_OP_CALL:
Alex Elderb8d70032012-11-30 17:53:04 -06001618 case CEPH_OSD_OP_NOTIFY_ACK:
Alex Elder9969ebc2013-01-18 12:31:10 -06001619 case CEPH_OSD_OP_WATCH:
Alex Elderc47f9372013-02-26 14:23:07 -06001620 rbd_osd_trivial_callback(obj_request);
Alex Elder9969ebc2013-01-18 12:31:10 -06001621 break;
Alex Elderbf0d5f502012-11-22 00:00:08 -06001622 default:
1623 rbd_warn(NULL, "%s: unsupported op %hu\n",
1624 obj_request->object_name, (unsigned short) opcode);
1625 break;
1626 }
1627
Alex Elder07741302013-02-05 23:41:50 -06001628 if (obj_request_done_test(obj_request))
Alex Elderbf0d5f502012-11-22 00:00:08 -06001629 rbd_obj_request_complete(obj_request);
1630}
1631
Alex Elder9d4df012013-04-19 15:34:50 -05001632static void rbd_osd_req_format_read(struct rbd_obj_request *obj_request)
Alex Elder430c28c2013-04-03 21:32:51 -05001633{
1634 struct rbd_img_request *img_request = obj_request->img_request;
Alex Elder8c042b02013-04-03 01:28:58 -05001635 struct ceph_osd_request *osd_req = obj_request->osd_req;
Alex Elder9d4df012013-04-19 15:34:50 -05001636 u64 snap_id;
Alex Elder430c28c2013-04-03 21:32:51 -05001637
Alex Elder8c042b02013-04-03 01:28:58 -05001638 rbd_assert(osd_req != NULL);
Alex Elder430c28c2013-04-03 21:32:51 -05001639
Alex Elder9d4df012013-04-19 15:34:50 -05001640 snap_id = img_request ? img_request->snap_id : CEPH_NOSNAP;
Alex Elder8c042b02013-04-03 01:28:58 -05001641 ceph_osdc_build_request(osd_req, obj_request->offset,
Alex Elder9d4df012013-04-19 15:34:50 -05001642 NULL, snap_id, NULL);
1643}
1644
1645static void rbd_osd_req_format_write(struct rbd_obj_request *obj_request)
1646{
1647 struct rbd_img_request *img_request = obj_request->img_request;
1648 struct ceph_osd_request *osd_req = obj_request->osd_req;
1649 struct ceph_snap_context *snapc;
1650 struct timespec mtime = CURRENT_TIME;
1651
1652 rbd_assert(osd_req != NULL);
1653
1654 snapc = img_request ? img_request->snapc : NULL;
1655 ceph_osdc_build_request(osd_req, obj_request->offset,
1656 snapc, CEPH_NOSNAP, &mtime);
Alex Elder430c28c2013-04-03 21:32:51 -05001657}
1658
Alex Elderbf0d5f502012-11-22 00:00:08 -06001659static struct ceph_osd_request *rbd_osd_req_create(
1660 struct rbd_device *rbd_dev,
1661 bool write_request,
Alex Elder430c28c2013-04-03 21:32:51 -05001662 struct rbd_obj_request *obj_request)
Alex Elderbf0d5f502012-11-22 00:00:08 -06001663{
Alex Elderbf0d5f502012-11-22 00:00:08 -06001664 struct ceph_snap_context *snapc = NULL;
1665 struct ceph_osd_client *osdc;
1666 struct ceph_osd_request *osd_req;
Alex Elderbf0d5f502012-11-22 00:00:08 -06001667
Alex Elder6365d332013-02-11 12:33:24 -06001668 if (obj_request_img_data_test(obj_request)) {
1669 struct rbd_img_request *img_request = obj_request->img_request;
1670
Alex Elder0c425242013-02-08 09:55:49 -06001671 rbd_assert(write_request ==
1672 img_request_write_test(img_request));
1673 if (write_request)
Alex Elderbf0d5f502012-11-22 00:00:08 -06001674 snapc = img_request->snapc;
Alex Elderbf0d5f502012-11-22 00:00:08 -06001675 }
1676
1677 /* Allocate and initialize the request, for the single op */
1678
1679 osdc = &rbd_dev->rbd_client->client->osdc;
1680 osd_req = ceph_osdc_alloc_request(osdc, snapc, 1, false, GFP_ATOMIC);
1681 if (!osd_req)
1682 return NULL; /* ENOMEM */
Alex Elderbf0d5f502012-11-22 00:00:08 -06001683
Alex Elder430c28c2013-04-03 21:32:51 -05001684 if (write_request)
Alex Elderbf0d5f502012-11-22 00:00:08 -06001685 osd_req->r_flags = CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK;
Alex Elder430c28c2013-04-03 21:32:51 -05001686 else
Alex Elderbf0d5f502012-11-22 00:00:08 -06001687 osd_req->r_flags = CEPH_OSD_FLAG_READ;
Alex Elderbf0d5f502012-11-22 00:00:08 -06001688
1689 osd_req->r_callback = rbd_osd_req_callback;
1690 osd_req->r_priv = obj_request;
1691
1692 osd_req->r_oid_len = strlen(obj_request->object_name);
1693 rbd_assert(osd_req->r_oid_len < sizeof (osd_req->r_oid));
1694 memcpy(osd_req->r_oid, obj_request->object_name, osd_req->r_oid_len);
1695
1696 osd_req->r_file_layout = rbd_dev->layout; /* struct */
1697
Alex Elderbf0d5f502012-11-22 00:00:08 -06001698 return osd_req;
1699}
1700
Alex Elder0eefd472013-04-19 15:34:50 -05001701/*
1702 * Create a copyup osd request based on the information in the
1703 * object request supplied. A copyup request has two osd ops,
1704 * a copyup method call, and a "normal" write request.
1705 */
1706static struct ceph_osd_request *
1707rbd_osd_req_create_copyup(struct rbd_obj_request *obj_request)
1708{
1709 struct rbd_img_request *img_request;
1710 struct ceph_snap_context *snapc;
1711 struct rbd_device *rbd_dev;
1712 struct ceph_osd_client *osdc;
1713 struct ceph_osd_request *osd_req;
1714
1715 rbd_assert(obj_request_img_data_test(obj_request));
1716 img_request = obj_request->img_request;
1717 rbd_assert(img_request);
1718 rbd_assert(img_request_write_test(img_request));
1719
1720 /* Allocate and initialize the request, for the two ops */
1721
1722 snapc = img_request->snapc;
1723 rbd_dev = img_request->rbd_dev;
1724 osdc = &rbd_dev->rbd_client->client->osdc;
1725 osd_req = ceph_osdc_alloc_request(osdc, snapc, 2, false, GFP_ATOMIC);
1726 if (!osd_req)
1727 return NULL; /* ENOMEM */
1728
1729 osd_req->r_flags = CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK;
1730 osd_req->r_callback = rbd_osd_req_callback;
1731 osd_req->r_priv = obj_request;
1732
1733 osd_req->r_oid_len = strlen(obj_request->object_name);
1734 rbd_assert(osd_req->r_oid_len < sizeof (osd_req->r_oid));
1735 memcpy(osd_req->r_oid, obj_request->object_name, osd_req->r_oid_len);
1736
1737 osd_req->r_file_layout = rbd_dev->layout; /* struct */
1738
1739 return osd_req;
1740}
1741
1742
/* Drop a reference to an osd request created by this driver */
static void rbd_osd_req_destroy(struct ceph_osd_request *osd_req)
{
	ceph_osdc_put_request(osd_req);
}
1747
1748/* object_name is assumed to be a non-null pointer and NUL-terminated */
1749
1750static struct rbd_obj_request *rbd_obj_request_create(const char *object_name,
1751 u64 offset, u64 length,
1752 enum obj_request_type type)
1753{
1754 struct rbd_obj_request *obj_request;
1755 size_t size;
1756 char *name;
1757
1758 rbd_assert(obj_request_type_valid(type));
1759
1760 size = strlen(object_name) + 1;
Alex Elderf907ad52013-05-01 12:43:03 -05001761 name = kmalloc(size, GFP_KERNEL);
1762 if (!name)
Alex Elderbf0d5f502012-11-22 00:00:08 -06001763 return NULL;
1764
Alex Elderf907ad52013-05-01 12:43:03 -05001765 obj_request = kzalloc(sizeof (*obj_request), GFP_KERNEL);
1766 if (!obj_request) {
1767 kfree(name);
1768 return NULL;
1769 }
1770
Alex Elderbf0d5f502012-11-22 00:00:08 -06001771 obj_request->object_name = memcpy(name, object_name, size);
1772 obj_request->offset = offset;
1773 obj_request->length = length;
Alex Elder926f9b32013-02-11 12:33:24 -06001774 obj_request->flags = 0;
Alex Elderbf0d5f502012-11-22 00:00:08 -06001775 obj_request->which = BAD_WHICH;
1776 obj_request->type = type;
1777 INIT_LIST_HEAD(&obj_request->links);
Alex Elder788e2df2013-01-17 12:25:27 -06001778 init_completion(&obj_request->completion);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001779 kref_init(&obj_request->kref);
1780
Alex Elder37206ee2013-02-20 17:32:08 -06001781 dout("%s: \"%s\" %llu/%llu %d -> obj %p\n", __func__, object_name,
1782 offset, length, (int)type, obj_request);
1783
Alex Elderbf0d5f502012-11-22 00:00:08 -06001784 return obj_request;
1785}
1786
1787static void rbd_obj_request_destroy(struct kref *kref)
1788{
1789 struct rbd_obj_request *obj_request;
1790
1791 obj_request = container_of(kref, struct rbd_obj_request, kref);
1792
Alex Elder37206ee2013-02-20 17:32:08 -06001793 dout("%s: obj %p\n", __func__, obj_request);
1794
Alex Elderbf0d5f502012-11-22 00:00:08 -06001795 rbd_assert(obj_request->img_request == NULL);
1796 rbd_assert(obj_request->which == BAD_WHICH);
1797
1798 if (obj_request->osd_req)
1799 rbd_osd_req_destroy(obj_request->osd_req);
1800
1801 rbd_assert(obj_request_type_valid(obj_request->type));
1802 switch (obj_request->type) {
Alex Elder9969ebc2013-01-18 12:31:10 -06001803 case OBJ_REQUEST_NODATA:
1804 break; /* Nothing to do */
Alex Elderbf0d5f502012-11-22 00:00:08 -06001805 case OBJ_REQUEST_BIO:
1806 if (obj_request->bio_list)
1807 bio_chain_put(obj_request->bio_list);
1808 break;
Alex Elder788e2df2013-01-17 12:25:27 -06001809 case OBJ_REQUEST_PAGES:
1810 if (obj_request->pages)
1811 ceph_release_page_vector(obj_request->pages,
1812 obj_request->page_count);
1813 break;
Alex Elderbf0d5f502012-11-22 00:00:08 -06001814 }
1815
Alex Elderf907ad52013-05-01 12:43:03 -05001816 kfree(obj_request->object_name);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001817 kfree(obj_request);
1818}
1819
1820/*
1821 * Caller is responsible for filling in the list of object requests
1822 * that comprises the image request, and the Linux request pointer
1823 * (if there is one).
1824 */
Alex Eldercc344fa2013-02-19 12:25:56 -06001825static struct rbd_img_request *rbd_img_request_create(
1826 struct rbd_device *rbd_dev,
Alex Elderbf0d5f502012-11-22 00:00:08 -06001827 u64 offset, u64 length,
Alex Elder9849e982013-01-24 16:13:36 -06001828 bool write_request,
1829 bool child_request)
Alex Elderbf0d5f502012-11-22 00:00:08 -06001830{
1831 struct rbd_img_request *img_request;
Alex Elderbf0d5f502012-11-22 00:00:08 -06001832
Alex Elder1c2a9df2013-05-01 12:43:03 -05001833 img_request = kmem_cache_alloc(rbd_img_request_cache, GFP_ATOMIC);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001834 if (!img_request)
1835 return NULL;
1836
1837 if (write_request) {
1838 down_read(&rbd_dev->header_rwsem);
Alex Elder812164f82013-04-30 00:44:32 -05001839 ceph_get_snap_context(rbd_dev->header.snapc);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001840 up_read(&rbd_dev->header_rwsem);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001841 }
1842
1843 img_request->rq = NULL;
1844 img_request->rbd_dev = rbd_dev;
1845 img_request->offset = offset;
1846 img_request->length = length;
Alex Elder0c425242013-02-08 09:55:49 -06001847 img_request->flags = 0;
1848 if (write_request) {
1849 img_request_write_set(img_request);
Alex Elder468521c2013-04-26 09:43:47 -05001850 img_request->snapc = rbd_dev->header.snapc;
Alex Elder0c425242013-02-08 09:55:49 -06001851 } else {
Alex Elderbf0d5f502012-11-22 00:00:08 -06001852 img_request->snap_id = rbd_dev->spec->snap_id;
Alex Elder0c425242013-02-08 09:55:49 -06001853 }
Alex Elder9849e982013-01-24 16:13:36 -06001854 if (child_request)
1855 img_request_child_set(img_request);
Alex Elderd0b2e942013-01-24 16:13:36 -06001856 if (rbd_dev->parent_spec)
1857 img_request_layered_set(img_request);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001858 spin_lock_init(&img_request->completion_lock);
1859 img_request->next_completion = 0;
1860 img_request->callback = NULL;
Alex Eldera5a337d2013-01-24 16:13:36 -06001861 img_request->result = 0;
Alex Elderbf0d5f502012-11-22 00:00:08 -06001862 img_request->obj_request_count = 0;
1863 INIT_LIST_HEAD(&img_request->obj_requests);
1864 kref_init(&img_request->kref);
1865
1866 rbd_img_request_get(img_request); /* Avoid a warning */
1867 rbd_img_request_put(img_request); /* TEMPORARY */
1868
Alex Elder37206ee2013-02-20 17:32:08 -06001869 dout("%s: rbd_dev %p %s %llu/%llu -> img %p\n", __func__, rbd_dev,
1870 write_request ? "write" : "read", offset, length,
1871 img_request);
1872
Alex Elderbf0d5f502012-11-22 00:00:08 -06001873 return img_request;
1874}
1875
1876static void rbd_img_request_destroy(struct kref *kref)
1877{
1878 struct rbd_img_request *img_request;
1879 struct rbd_obj_request *obj_request;
1880 struct rbd_obj_request *next_obj_request;
1881
1882 img_request = container_of(kref, struct rbd_img_request, kref);
1883
Alex Elder37206ee2013-02-20 17:32:08 -06001884 dout("%s: img %p\n", __func__, img_request);
1885
Alex Elderbf0d5f502012-11-22 00:00:08 -06001886 for_each_obj_request_safe(img_request, obj_request, next_obj_request)
1887 rbd_img_obj_request_del(img_request, obj_request);
Alex Elder25dcf952013-01-25 17:08:55 -06001888 rbd_assert(img_request->obj_request_count == 0);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001889
Alex Elder0c425242013-02-08 09:55:49 -06001890 if (img_request_write_test(img_request))
Alex Elder812164f82013-04-30 00:44:32 -05001891 ceph_put_snap_context(img_request->snapc);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001892
Alex Elder8b3e1a52013-01-24 16:13:36 -06001893 if (img_request_child_test(img_request))
1894 rbd_obj_request_put(img_request->obj_request);
1895
Alex Elder1c2a9df2013-05-01 12:43:03 -05001896 kmem_cache_free(rbd_img_request_cache, img_request);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001897}
1898
Alex Elder12178572013-02-08 09:55:49 -06001899static bool rbd_img_obj_end_request(struct rbd_obj_request *obj_request)
1900{
Alex Elder6365d332013-02-11 12:33:24 -06001901 struct rbd_img_request *img_request;
Alex Elder12178572013-02-08 09:55:49 -06001902 unsigned int xferred;
1903 int result;
Alex Elder8b3e1a52013-01-24 16:13:36 -06001904 bool more;
Alex Elder12178572013-02-08 09:55:49 -06001905
Alex Elder6365d332013-02-11 12:33:24 -06001906 rbd_assert(obj_request_img_data_test(obj_request));
1907 img_request = obj_request->img_request;
1908
Alex Elder12178572013-02-08 09:55:49 -06001909 rbd_assert(obj_request->xferred <= (u64)UINT_MAX);
1910 xferred = (unsigned int)obj_request->xferred;
1911 result = obj_request->result;
1912 if (result) {
1913 struct rbd_device *rbd_dev = img_request->rbd_dev;
1914
1915 rbd_warn(rbd_dev, "%s %llx at %llx (%llx)\n",
1916 img_request_write_test(img_request) ? "write" : "read",
1917 obj_request->length, obj_request->img_offset,
1918 obj_request->offset);
1919 rbd_warn(rbd_dev, " result %d xferred %x\n",
1920 result, xferred);
1921 if (!img_request->result)
1922 img_request->result = result;
1923 }
1924
Alex Elderf1a47392013-04-19 15:34:50 -05001925 /* Image object requests don't own their page array */
1926
1927 if (obj_request->type == OBJ_REQUEST_PAGES) {
1928 obj_request->pages = NULL;
1929 obj_request->page_count = 0;
1930 }
1931
Alex Elder8b3e1a52013-01-24 16:13:36 -06001932 if (img_request_child_test(img_request)) {
1933 rbd_assert(img_request->obj_request != NULL);
1934 more = obj_request->which < img_request->obj_request_count - 1;
1935 } else {
1936 rbd_assert(img_request->rq != NULL);
1937 more = blk_end_request(img_request->rq, result, xferred);
1938 }
1939
1940 return more;
Alex Elder12178572013-02-08 09:55:49 -06001941}
1942
Alex Elder21692382013-04-05 01:27:12 -05001943static void rbd_img_obj_callback(struct rbd_obj_request *obj_request)
1944{
1945 struct rbd_img_request *img_request;
1946 u32 which = obj_request->which;
1947 bool more = true;
1948
Alex Elder6365d332013-02-11 12:33:24 -06001949 rbd_assert(obj_request_img_data_test(obj_request));
Alex Elder21692382013-04-05 01:27:12 -05001950 img_request = obj_request->img_request;
1951
1952 dout("%s: img %p obj %p\n", __func__, img_request, obj_request);
1953 rbd_assert(img_request != NULL);
Alex Elder21692382013-04-05 01:27:12 -05001954 rbd_assert(img_request->obj_request_count > 0);
1955 rbd_assert(which != BAD_WHICH);
1956 rbd_assert(which < img_request->obj_request_count);
1957 rbd_assert(which >= img_request->next_completion);
1958
1959 spin_lock_irq(&img_request->completion_lock);
1960 if (which != img_request->next_completion)
1961 goto out;
1962
1963 for_each_obj_request_from(img_request, obj_request) {
Alex Elder21692382013-04-05 01:27:12 -05001964 rbd_assert(more);
1965 rbd_assert(which < img_request->obj_request_count);
1966
1967 if (!obj_request_done_test(obj_request))
1968 break;
Alex Elder12178572013-02-08 09:55:49 -06001969 more = rbd_img_obj_end_request(obj_request);
Alex Elder21692382013-04-05 01:27:12 -05001970 which++;
1971 }
1972
1973 rbd_assert(more ^ (which == img_request->obj_request_count));
1974 img_request->next_completion = which;
1975out:
1976 spin_unlock_irq(&img_request->completion_lock);
1977
1978 if (!more)
1979 rbd_img_request_complete(img_request);
1980}
1981
Alex Elderf1a47392013-04-19 15:34:50 -05001982/*
1983 * Split up an image request into one or more object requests, each
1984 * to a different object. The "type" parameter indicates whether
1985 * "data_desc" is the pointer to the head of a list of bio
1986 * structures, or the base of a page array. In either case this
1987 * function assumes data_desc describes memory sufficient to hold
1988 * all data described by the image request.
1989 */
1990static int rbd_img_request_fill(struct rbd_img_request *img_request,
1991 enum obj_request_type type,
1992 void *data_desc)
Alex Elderbf0d5f502012-11-22 00:00:08 -06001993{
1994 struct rbd_device *rbd_dev = img_request->rbd_dev;
1995 struct rbd_obj_request *obj_request = NULL;
1996 struct rbd_obj_request *next_obj_request;
Alex Elder0c425242013-02-08 09:55:49 -06001997 bool write_request = img_request_write_test(img_request);
Alex Elderf1a47392013-04-19 15:34:50 -05001998 struct bio *bio_list;
1999 unsigned int bio_offset = 0;
2000 struct page **pages;
Alex Elder7da22d22013-01-24 16:13:36 -06002001 u64 img_offset;
Alex Elderbf0d5f502012-11-22 00:00:08 -06002002 u64 resid;
2003 u16 opcode;
2004
Alex Elderf1a47392013-04-19 15:34:50 -05002005 dout("%s: img %p type %d data_desc %p\n", __func__, img_request,
2006 (int)type, data_desc);
Alex Elder37206ee2013-02-20 17:32:08 -06002007
Alex Elder430c28c2013-04-03 21:32:51 -05002008 opcode = write_request ? CEPH_OSD_OP_WRITE : CEPH_OSD_OP_READ;
Alex Elder7da22d22013-01-24 16:13:36 -06002009 img_offset = img_request->offset;
Alex Elderbf0d5f502012-11-22 00:00:08 -06002010 resid = img_request->length;
Alex Elder4dda41d2013-02-20 21:59:33 -06002011 rbd_assert(resid > 0);
Alex Elderf1a47392013-04-19 15:34:50 -05002012
2013 if (type == OBJ_REQUEST_BIO) {
2014 bio_list = data_desc;
2015 rbd_assert(img_offset == bio_list->bi_sector << SECTOR_SHIFT);
2016 } else {
2017 rbd_assert(type == OBJ_REQUEST_PAGES);
2018 pages = data_desc;
2019 }
2020
Alex Elderbf0d5f502012-11-22 00:00:08 -06002021 while (resid) {
Alex Elder2fa12322013-04-05 01:27:12 -05002022 struct ceph_osd_request *osd_req;
Alex Elderbf0d5f502012-11-22 00:00:08 -06002023 const char *object_name;
Alex Elderbf0d5f502012-11-22 00:00:08 -06002024 u64 offset;
2025 u64 length;
2026
Alex Elder7da22d22013-01-24 16:13:36 -06002027 object_name = rbd_segment_name(rbd_dev, img_offset);
Alex Elderbf0d5f502012-11-22 00:00:08 -06002028 if (!object_name)
2029 goto out_unwind;
Alex Elder7da22d22013-01-24 16:13:36 -06002030 offset = rbd_segment_offset(rbd_dev, img_offset);
2031 length = rbd_segment_length(rbd_dev, img_offset, resid);
Alex Elderbf0d5f502012-11-22 00:00:08 -06002032 obj_request = rbd_obj_request_create(object_name,
Alex Elderf1a47392013-04-19 15:34:50 -05002033 offset, length, type);
Alex Elderbf0d5f502012-11-22 00:00:08 -06002034 kfree(object_name); /* object request has its own copy */
2035 if (!obj_request)
2036 goto out_unwind;
2037
Alex Elderf1a47392013-04-19 15:34:50 -05002038 if (type == OBJ_REQUEST_BIO) {
2039 unsigned int clone_size;
2040
2041 rbd_assert(length <= (u64)UINT_MAX);
2042 clone_size = (unsigned int)length;
2043 obj_request->bio_list =
2044 bio_chain_clone_range(&bio_list,
2045 &bio_offset,
2046 clone_size,
2047 GFP_ATOMIC);
2048 if (!obj_request->bio_list)
2049 goto out_partial;
2050 } else {
2051 unsigned int page_count;
2052
2053 obj_request->pages = pages;
2054 page_count = (u32)calc_pages_for(offset, length);
2055 obj_request->page_count = page_count;
2056 if ((offset + length) & ~PAGE_MASK)
2057 page_count--; /* more on last page */
2058 pages += page_count;
2059 }
Alex Elderbf0d5f502012-11-22 00:00:08 -06002060
Alex Elder2fa12322013-04-05 01:27:12 -05002061 osd_req = rbd_osd_req_create(rbd_dev, write_request,
2062 obj_request);
2063 if (!osd_req)
Alex Elderbf0d5f502012-11-22 00:00:08 -06002064 goto out_partial;
Alex Elder2fa12322013-04-05 01:27:12 -05002065 obj_request->osd_req = osd_req;
Alex Elder21692382013-04-05 01:27:12 -05002066 obj_request->callback = rbd_img_obj_callback;
Alex Elder430c28c2013-04-03 21:32:51 -05002067
Alex Elder2fa12322013-04-05 01:27:12 -05002068 osd_req_op_extent_init(osd_req, 0, opcode, offset, length,
2069 0, 0);
Alex Elderf1a47392013-04-19 15:34:50 -05002070 if (type == OBJ_REQUEST_BIO)
2071 osd_req_op_extent_osd_data_bio(osd_req, 0,
2072 obj_request->bio_list, length);
2073 else
2074 osd_req_op_extent_osd_data_pages(osd_req, 0,
2075 obj_request->pages, length,
2076 offset & ~PAGE_MASK, false, false);
Alex Elder9d4df012013-04-19 15:34:50 -05002077
2078 if (write_request)
2079 rbd_osd_req_format_write(obj_request);
2080 else
2081 rbd_osd_req_format_read(obj_request);
Alex Elder430c28c2013-04-03 21:32:51 -05002082
Alex Elder7da22d22013-01-24 16:13:36 -06002083 obj_request->img_offset = img_offset;
Alex Elderbf0d5f502012-11-22 00:00:08 -06002084 rbd_img_obj_request_add(img_request, obj_request);
2085
Alex Elder7da22d22013-01-24 16:13:36 -06002086 img_offset += length;
Alex Elderbf0d5f502012-11-22 00:00:08 -06002087 resid -= length;
2088 }
2089
2090 return 0;
2091
2092out_partial:
2093 rbd_obj_request_put(obj_request);
2094out_unwind:
2095 for_each_obj_request_safe(img_request, obj_request, next_obj_request)
2096 rbd_obj_request_put(obj_request);
2097
2098 return -ENOMEM;
2099}
2100
Alex Elder3d7efd12013-04-19 15:34:50 -05002101static void
Alex Elder0eefd472013-04-19 15:34:50 -05002102rbd_img_obj_copyup_callback(struct rbd_obj_request *obj_request)
2103{
2104 struct rbd_img_request *img_request;
2105 struct rbd_device *rbd_dev;
2106 u64 length;
2107 u32 page_count;
2108
2109 rbd_assert(obj_request->type == OBJ_REQUEST_BIO);
2110 rbd_assert(obj_request_img_data_test(obj_request));
2111 img_request = obj_request->img_request;
2112 rbd_assert(img_request);
2113
2114 rbd_dev = img_request->rbd_dev;
2115 rbd_assert(rbd_dev);
2116 length = (u64)1 << rbd_dev->header.obj_order;
2117 page_count = (u32)calc_pages_for(0, length);
2118
2119 rbd_assert(obj_request->copyup_pages);
2120 ceph_release_page_vector(obj_request->copyup_pages, page_count);
2121 obj_request->copyup_pages = NULL;
2122
2123 /*
2124 * We want the transfer count to reflect the size of the
2125 * original write request. There is no such thing as a
2126 * successful short write, so if the request was successful
2127 * we can just set it to the originally-requested length.
2128 */
2129 if (!obj_request->result)
2130 obj_request->xferred = obj_request->length;
2131
2132 /* Finish up with the normal image object callback */
2133
2134 rbd_img_obj_callback(obj_request);
2135}
2136
2137static void
Alex Elder3d7efd12013-04-19 15:34:50 -05002138rbd_img_obj_parent_read_full_callback(struct rbd_img_request *img_request)
2139{
2140 struct rbd_obj_request *orig_request;
Alex Elder0eefd472013-04-19 15:34:50 -05002141 struct ceph_osd_request *osd_req;
2142 struct ceph_osd_client *osdc;
2143 struct rbd_device *rbd_dev;
Alex Elder3d7efd12013-04-19 15:34:50 -05002144 struct page **pages;
Alex Elder3d7efd12013-04-19 15:34:50 -05002145 int result;
2146 u64 obj_size;
2147 u64 xferred;
2148
2149 rbd_assert(img_request_child_test(img_request));
2150
2151 /* First get what we need from the image request */
2152
2153 pages = img_request->copyup_pages;
2154 rbd_assert(pages != NULL);
2155 img_request->copyup_pages = NULL;
2156
2157 orig_request = img_request->obj_request;
2158 rbd_assert(orig_request != NULL);
Alex Elder0eefd472013-04-19 15:34:50 -05002159 rbd_assert(orig_request->type == OBJ_REQUEST_BIO);
Alex Elder3d7efd12013-04-19 15:34:50 -05002160 result = img_request->result;
2161 obj_size = img_request->length;
2162 xferred = img_request->xferred;
2163
Alex Elder0eefd472013-04-19 15:34:50 -05002164 rbd_dev = img_request->rbd_dev;
2165 rbd_assert(rbd_dev);
2166 rbd_assert(obj_size == (u64)1 << rbd_dev->header.obj_order);
2167
Alex Elder3d7efd12013-04-19 15:34:50 -05002168 rbd_img_request_put(img_request);
2169
Alex Elder0eefd472013-04-19 15:34:50 -05002170 if (result)
2171 goto out_err;
Alex Elder3d7efd12013-04-19 15:34:50 -05002172
Alex Elder0eefd472013-04-19 15:34:50 -05002173 /* Allocate the new copyup osd request for the original request */
Alex Elder3d7efd12013-04-19 15:34:50 -05002174
Alex Elder0eefd472013-04-19 15:34:50 -05002175 result = -ENOMEM;
2176 rbd_assert(!orig_request->osd_req);
2177 osd_req = rbd_osd_req_create_copyup(orig_request);
2178 if (!osd_req)
2179 goto out_err;
2180 orig_request->osd_req = osd_req;
2181 orig_request->copyup_pages = pages;
Alex Elder3d7efd12013-04-19 15:34:50 -05002182
Alex Elder0eefd472013-04-19 15:34:50 -05002183 /* Initialize the copyup op */
2184
2185 osd_req_op_cls_init(osd_req, 0, CEPH_OSD_OP_CALL, "rbd", "copyup");
2186 osd_req_op_cls_request_data_pages(osd_req, 0, pages, obj_size, 0,
2187 false, false);
2188
2189 /* Then the original write request op */
2190
2191 osd_req_op_extent_init(osd_req, 1, CEPH_OSD_OP_WRITE,
2192 orig_request->offset,
2193 orig_request->length, 0, 0);
2194 osd_req_op_extent_osd_data_bio(osd_req, 1, orig_request->bio_list,
2195 orig_request->length);
2196
2197 rbd_osd_req_format_write(orig_request);
2198
2199 /* All set, send it off. */
2200
2201 orig_request->callback = rbd_img_obj_copyup_callback;
2202 osdc = &rbd_dev->rbd_client->client->osdc;
2203 result = rbd_obj_request_submit(osdc, orig_request);
2204 if (!result)
2205 return;
2206out_err:
2207 /* Record the error code and complete the request */
2208
2209 orig_request->result = result;
2210 orig_request->xferred = 0;
2211 obj_request_done_set(orig_request);
2212 rbd_obj_request_complete(orig_request);
Alex Elder3d7efd12013-04-19 15:34:50 -05002213}
2214
2215/*
2216 * Read from the parent image the range of data that covers the
2217 * entire target of the given object request. This is used for
2218 * satisfying a layered image write request when the target of an
2219 * object request from the image request does not exist.
2220 *
2221 * A page array big enough to hold the returned data is allocated
2222 * and supplied to rbd_img_request_fill() as the "data descriptor."
2223 * When the read completes, this page array will be transferred to
2224 * the original object request for the copyup operation.
2225 *
2226 * If an error occurs, record it as the result of the original
2227 * object request and mark it done so it gets completed.
2228 */
2229static int rbd_img_obj_parent_read_full(struct rbd_obj_request *obj_request)
2230{
2231 struct rbd_img_request *img_request = NULL;
2232 struct rbd_img_request *parent_request = NULL;
2233 struct rbd_device *rbd_dev;
2234 u64 img_offset;
2235 u64 length;
2236 struct page **pages = NULL;
2237 u32 page_count;
2238 int result;
2239
2240 rbd_assert(obj_request_img_data_test(obj_request));
2241 rbd_assert(obj_request->type == OBJ_REQUEST_BIO);
2242
2243 img_request = obj_request->img_request;
2244 rbd_assert(img_request != NULL);
2245 rbd_dev = img_request->rbd_dev;
2246 rbd_assert(rbd_dev->parent != NULL);
2247
2248 /*
Alex Elder0eefd472013-04-19 15:34:50 -05002249 * First things first. The original osd request is of no
2250 * use to use any more, we'll need a new one that can hold
2251 * the two ops in a copyup request. We'll get that later,
2252 * but for now we can release the old one.
2253 */
2254 rbd_osd_req_destroy(obj_request->osd_req);
2255 obj_request->osd_req = NULL;
2256
2257 /*
Alex Elder3d7efd12013-04-19 15:34:50 -05002258 * Determine the byte range covered by the object in the
2259 * child image to which the original request was to be sent.
2260 */
2261 img_offset = obj_request->img_offset - obj_request->offset;
2262 length = (u64)1 << rbd_dev->header.obj_order;
2263
2264 /*
Alex Eldera9e8ba2c2013-04-21 00:32:07 -05002265 * There is no defined parent data beyond the parent
2266 * overlap, so limit what we read at that boundary if
2267 * necessary.
2268 */
2269 if (img_offset + length > rbd_dev->parent_overlap) {
2270 rbd_assert(img_offset < rbd_dev->parent_overlap);
2271 length = rbd_dev->parent_overlap - img_offset;
2272 }
2273
2274 /*
Alex Elder3d7efd12013-04-19 15:34:50 -05002275 * Allocate a page array big enough to receive the data read
2276 * from the parent.
2277 */
2278 page_count = (u32)calc_pages_for(0, length);
2279 pages = ceph_alloc_page_vector(page_count, GFP_KERNEL);
2280 if (IS_ERR(pages)) {
2281 result = PTR_ERR(pages);
2282 pages = NULL;
2283 goto out_err;
2284 }
2285
2286 result = -ENOMEM;
2287 parent_request = rbd_img_request_create(rbd_dev->parent,
2288 img_offset, length,
2289 false, true);
2290 if (!parent_request)
2291 goto out_err;
2292 rbd_obj_request_get(obj_request);
2293 parent_request->obj_request = obj_request;
2294
2295 result = rbd_img_request_fill(parent_request, OBJ_REQUEST_PAGES, pages);
2296 if (result)
2297 goto out_err;
2298 parent_request->copyup_pages = pages;
2299
2300 parent_request->callback = rbd_img_obj_parent_read_full_callback;
2301 result = rbd_img_request_submit(parent_request);
2302 if (!result)
2303 return 0;
2304
2305 parent_request->copyup_pages = NULL;
2306 parent_request->obj_request = NULL;
2307 rbd_obj_request_put(obj_request);
2308out_err:
2309 if (pages)
2310 ceph_release_page_vector(pages, page_count);
2311 if (parent_request)
2312 rbd_img_request_put(parent_request);
2313 obj_request->result = result;
2314 obj_request->xferred = 0;
2315 obj_request_done_set(obj_request);
2316
2317 return result;
2318}
2319
Alex Elderc5b5ef62013-02-11 12:33:24 -06002320static void rbd_img_obj_exists_callback(struct rbd_obj_request *obj_request)
2321{
Alex Elderc5b5ef62013-02-11 12:33:24 -06002322 struct rbd_obj_request *orig_request;
2323 int result;
2324
2325 rbd_assert(!obj_request_img_data_test(obj_request));
2326
2327 /*
2328 * All we need from the object request is the original
2329 * request and the result of the STAT op. Grab those, then
2330 * we're done with the request.
2331 */
2332 orig_request = obj_request->obj_request;
2333 obj_request->obj_request = NULL;
2334 rbd_assert(orig_request);
2335 rbd_assert(orig_request->img_request);
2336
2337 result = obj_request->result;
2338 obj_request->result = 0;
2339
2340 dout("%s: obj %p for obj %p result %d %llu/%llu\n", __func__,
2341 obj_request, orig_request, result,
2342 obj_request->xferred, obj_request->length);
2343 rbd_obj_request_put(obj_request);
2344
2345 rbd_assert(orig_request);
2346 rbd_assert(orig_request->img_request);
Alex Elderc5b5ef62013-02-11 12:33:24 -06002347
2348 /*
2349 * Our only purpose here is to determine whether the object
2350 * exists, and we don't want to treat the non-existence as
2351 * an error. If something else comes back, transfer the
2352 * error to the original request and complete it now.
2353 */
2354 if (!result) {
2355 obj_request_existence_set(orig_request, true);
2356 } else if (result == -ENOENT) {
2357 obj_request_existence_set(orig_request, false);
2358 } else if (result) {
2359 orig_request->result = result;
Alex Elder3d7efd12013-04-19 15:34:50 -05002360 goto out;
Alex Elderc5b5ef62013-02-11 12:33:24 -06002361 }
2362
2363 /*
2364 * Resubmit the original request now that we have recorded
2365 * whether the target object exists.
2366 */
Alex Elderb454e362013-04-19 15:34:50 -05002367 orig_request->result = rbd_img_obj_request_submit(orig_request);
Alex Elder3d7efd12013-04-19 15:34:50 -05002368out:
Alex Elderc5b5ef62013-02-11 12:33:24 -06002369 if (orig_request->result)
2370 rbd_obj_request_complete(orig_request);
2371 rbd_obj_request_put(orig_request);
2372}
2373
2374static int rbd_img_obj_exists_submit(struct rbd_obj_request *obj_request)
2375{
2376 struct rbd_obj_request *stat_request;
2377 struct rbd_device *rbd_dev;
2378 struct ceph_osd_client *osdc;
2379 struct page **pages = NULL;
2380 u32 page_count;
2381 size_t size;
2382 int ret;
2383
2384 /*
2385 * The response data for a STAT call consists of:
2386 * le64 length;
2387 * struct {
2388 * le32 tv_sec;
2389 * le32 tv_nsec;
2390 * } mtime;
2391 */
2392 size = sizeof (__le64) + sizeof (__le32) + sizeof (__le32);
2393 page_count = (u32)calc_pages_for(0, size);
2394 pages = ceph_alloc_page_vector(page_count, GFP_KERNEL);
2395 if (IS_ERR(pages))
2396 return PTR_ERR(pages);
2397
2398 ret = -ENOMEM;
2399 stat_request = rbd_obj_request_create(obj_request->object_name, 0, 0,
2400 OBJ_REQUEST_PAGES);
2401 if (!stat_request)
2402 goto out;
2403
2404 rbd_obj_request_get(obj_request);
2405 stat_request->obj_request = obj_request;
2406 stat_request->pages = pages;
2407 stat_request->page_count = page_count;
2408
2409 rbd_assert(obj_request->img_request);
2410 rbd_dev = obj_request->img_request->rbd_dev;
2411 stat_request->osd_req = rbd_osd_req_create(rbd_dev, false,
2412 stat_request);
2413 if (!stat_request->osd_req)
2414 goto out;
2415 stat_request->callback = rbd_img_obj_exists_callback;
2416
2417 osd_req_op_init(stat_request->osd_req, 0, CEPH_OSD_OP_STAT);
2418 osd_req_op_raw_data_in_pages(stat_request->osd_req, 0, pages, size, 0,
2419 false, false);
Alex Elder9d4df012013-04-19 15:34:50 -05002420 rbd_osd_req_format_read(stat_request);
Alex Elderc5b5ef62013-02-11 12:33:24 -06002421
2422 osdc = &rbd_dev->rbd_client->client->osdc;
2423 ret = rbd_obj_request_submit(osdc, stat_request);
2424out:
2425 if (ret)
2426 rbd_obj_request_put(obj_request);
2427
2428 return ret;
2429}
2430
Alex Elderb454e362013-04-19 15:34:50 -05002431static int rbd_img_obj_request_submit(struct rbd_obj_request *obj_request)
2432{
2433 struct rbd_img_request *img_request;
Alex Eldera9e8ba2c2013-04-21 00:32:07 -05002434 struct rbd_device *rbd_dev;
Alex Elder3d7efd12013-04-19 15:34:50 -05002435 bool known;
Alex Elderb454e362013-04-19 15:34:50 -05002436
2437 rbd_assert(obj_request_img_data_test(obj_request));
2438
2439 img_request = obj_request->img_request;
2440 rbd_assert(img_request);
Alex Eldera9e8ba2c2013-04-21 00:32:07 -05002441 rbd_dev = img_request->rbd_dev;
Alex Elderb454e362013-04-19 15:34:50 -05002442
Alex Elderb454e362013-04-19 15:34:50 -05002443 /*
Alex Eldera9e8ba2c2013-04-21 00:32:07 -05002444 * Only writes to layered images need special handling.
2445 * Reads and non-layered writes are simple object requests.
2446 * Layered writes that start beyond the end of the overlap
2447 * with the parent have no parent data, so they too are
2448 * simple object requests. Finally, if the target object is
2449 * known to already exist, its parent data has already been
2450 * copied, so a write to the object can also be handled as a
2451 * simple object request.
Alex Elderb454e362013-04-19 15:34:50 -05002452 */
2453 if (!img_request_write_test(img_request) ||
2454 !img_request_layered_test(img_request) ||
Alex Eldera9e8ba2c2013-04-21 00:32:07 -05002455 rbd_dev->parent_overlap <= obj_request->img_offset ||
Alex Elder3d7efd12013-04-19 15:34:50 -05002456 ((known = obj_request_known_test(obj_request)) &&
2457 obj_request_exists_test(obj_request))) {
Alex Elderb454e362013-04-19 15:34:50 -05002458
2459 struct rbd_device *rbd_dev;
2460 struct ceph_osd_client *osdc;
2461
2462 rbd_dev = obj_request->img_request->rbd_dev;
2463 osdc = &rbd_dev->rbd_client->client->osdc;
2464
2465 return rbd_obj_request_submit(osdc, obj_request);
2466 }
2467
2468 /*
Alex Elder3d7efd12013-04-19 15:34:50 -05002469 * It's a layered write. The target object might exist but
2470 * we may not know that yet. If we know it doesn't exist,
2471 * start by reading the data for the full target object from
2472 * the parent so we can use it for a copyup to the target.
Alex Elderb454e362013-04-19 15:34:50 -05002473 */
Alex Elder3d7efd12013-04-19 15:34:50 -05002474 if (known)
2475 return rbd_img_obj_parent_read_full(obj_request);
2476
2477 /* We don't know whether the target exists. Go find out. */
Alex Elderb454e362013-04-19 15:34:50 -05002478
2479 return rbd_img_obj_exists_submit(obj_request);
2480}
2481
Alex Elderbf0d5f502012-11-22 00:00:08 -06002482static int rbd_img_request_submit(struct rbd_img_request *img_request)
2483{
Alex Elderbf0d5f502012-11-22 00:00:08 -06002484 struct rbd_obj_request *obj_request;
Alex Elder46faeed2013-04-10 17:47:46 -05002485 struct rbd_obj_request *next_obj_request;
Alex Elderbf0d5f502012-11-22 00:00:08 -06002486
Alex Elder37206ee2013-02-20 17:32:08 -06002487 dout("%s: img %p\n", __func__, img_request);
Alex Elder46faeed2013-04-10 17:47:46 -05002488 for_each_obj_request_safe(img_request, obj_request, next_obj_request) {
Alex Elderbf0d5f502012-11-22 00:00:08 -06002489 int ret;
2490
Alex Elderb454e362013-04-19 15:34:50 -05002491 ret = rbd_img_obj_request_submit(obj_request);
Alex Elderbf0d5f502012-11-22 00:00:08 -06002492 if (ret)
2493 return ret;
Alex Elderbf0d5f502012-11-22 00:00:08 -06002494 }
2495
2496 return 0;
2497}
2498
Alex Elder8b3e1a52013-01-24 16:13:36 -06002499static void rbd_img_parent_read_callback(struct rbd_img_request *img_request)
2500{
2501 struct rbd_obj_request *obj_request;
Alex Eldera9e8ba2c2013-04-21 00:32:07 -05002502 struct rbd_device *rbd_dev;
2503 u64 obj_end;
Alex Elder8b3e1a52013-01-24 16:13:36 -06002504
2505 rbd_assert(img_request_child_test(img_request));
2506
2507 obj_request = img_request->obj_request;
Alex Eldera9e8ba2c2013-04-21 00:32:07 -05002508 rbd_assert(obj_request);
2509 rbd_assert(obj_request->img_request);
Alex Elder8b3e1a52013-01-24 16:13:36 -06002510
Alex Eldera9e8ba2c2013-04-21 00:32:07 -05002511 obj_request->result = img_request->result;
2512 if (obj_request->result)
2513 goto out;
2514
2515 /*
2516 * We need to zero anything beyond the parent overlap
2517 * boundary. Since rbd_img_obj_request_read_callback()
2518 * will zero anything beyond the end of a short read, an
2519 * easy way to do this is to pretend the data from the
2520 * parent came up short--ending at the overlap boundary.
2521 */
2522 rbd_assert(obj_request->img_offset < U64_MAX - obj_request->length);
2523 obj_end = obj_request->img_offset + obj_request->length;
2524 rbd_dev = obj_request->img_request->rbd_dev;
2525 if (obj_end > rbd_dev->parent_overlap) {
2526 u64 xferred = 0;
2527
2528 if (obj_request->img_offset < rbd_dev->parent_overlap)
2529 xferred = rbd_dev->parent_overlap -
2530 obj_request->img_offset;
2531
2532 obj_request->xferred = min(img_request->xferred, xferred);
2533 } else {
2534 obj_request->xferred = img_request->xferred;
2535 }
2536out:
Alex Elder8b3e1a52013-01-24 16:13:36 -06002537 rbd_img_obj_request_read_callback(obj_request);
2538 rbd_obj_request_complete(obj_request);
2539}
2540
2541static void rbd_img_parent_read(struct rbd_obj_request *obj_request)
2542{
2543 struct rbd_device *rbd_dev;
2544 struct rbd_img_request *img_request;
2545 int result;
2546
2547 rbd_assert(obj_request_img_data_test(obj_request));
2548 rbd_assert(obj_request->img_request != NULL);
2549 rbd_assert(obj_request->result == (s32) -ENOENT);
2550 rbd_assert(obj_request->type == OBJ_REQUEST_BIO);
2551
2552 rbd_dev = obj_request->img_request->rbd_dev;
2553 rbd_assert(rbd_dev->parent != NULL);
2554 /* rbd_read_finish(obj_request, obj_request->length); */
2555 img_request = rbd_img_request_create(rbd_dev->parent,
2556 obj_request->img_offset,
2557 obj_request->length,
2558 false, true);
2559 result = -ENOMEM;
2560 if (!img_request)
2561 goto out_err;
2562
2563 rbd_obj_request_get(obj_request);
2564 img_request->obj_request = obj_request;
2565
Alex Elderf1a47392013-04-19 15:34:50 -05002566 result = rbd_img_request_fill(img_request, OBJ_REQUEST_BIO,
2567 obj_request->bio_list);
Alex Elder8b3e1a52013-01-24 16:13:36 -06002568 if (result)
2569 goto out_err;
2570
2571 img_request->callback = rbd_img_parent_read_callback;
2572 result = rbd_img_request_submit(img_request);
2573 if (result)
2574 goto out_err;
2575
2576 return;
2577out_err:
2578 if (img_request)
2579 rbd_img_request_put(img_request);
2580 obj_request->result = result;
2581 obj_request->xferred = 0;
2582 obj_request_done_set(obj_request);
2583}
2584
Alex Eldercc4a38bd2013-04-30 00:44:33 -05002585static int rbd_obj_notify_ack(struct rbd_device *rbd_dev, u64 notify_id)
Alex Elderb8d70032012-11-30 17:53:04 -06002586{
2587 struct rbd_obj_request *obj_request;
Alex Elder21692382013-04-05 01:27:12 -05002588 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
Alex Elderb8d70032012-11-30 17:53:04 -06002589 int ret;
2590
2591 obj_request = rbd_obj_request_create(rbd_dev->header_name, 0, 0,
2592 OBJ_REQUEST_NODATA);
2593 if (!obj_request)
2594 return -ENOMEM;
2595
2596 ret = -ENOMEM;
Alex Elder430c28c2013-04-03 21:32:51 -05002597 obj_request->osd_req = rbd_osd_req_create(rbd_dev, false, obj_request);
Alex Elderb8d70032012-11-30 17:53:04 -06002598 if (!obj_request->osd_req)
2599 goto out;
Alex Elder21692382013-04-05 01:27:12 -05002600 obj_request->callback = rbd_obj_request_put;
Alex Elderb8d70032012-11-30 17:53:04 -06002601
Alex Elderc99d2d42013-04-05 01:27:11 -05002602 osd_req_op_watch_init(obj_request->osd_req, 0, CEPH_OSD_OP_NOTIFY_ACK,
Alex Eldercc4a38bd2013-04-30 00:44:33 -05002603 notify_id, 0, 0);
Alex Elder9d4df012013-04-19 15:34:50 -05002604 rbd_osd_req_format_read(obj_request);
Alex Elder430c28c2013-04-03 21:32:51 -05002605
Alex Elderb8d70032012-11-30 17:53:04 -06002606 ret = rbd_obj_request_submit(osdc, obj_request);
Alex Elderb8d70032012-11-30 17:53:04 -06002607out:
Alex Eldercf81b602013-01-17 12:18:46 -06002608 if (ret)
2609 rbd_obj_request_put(obj_request);
Alex Elderb8d70032012-11-30 17:53:04 -06002610
2611 return ret;
2612}
2613
2614static void rbd_watch_cb(u64 ver, u64 notify_id, u8 opcode, void *data)
2615{
2616 struct rbd_device *rbd_dev = (struct rbd_device *)data;
Alex Elderb8d70032012-11-30 17:53:04 -06002617
2618 if (!rbd_dev)
2619 return;
2620
Alex Elder37206ee2013-02-20 17:32:08 -06002621 dout("%s: \"%s\" notify_id %llu opcode %u\n", __func__,
Alex Eldercc4a38bd2013-04-30 00:44:33 -05002622 rbd_dev->header_name, (unsigned long long)notify_id,
2623 (unsigned int)opcode);
2624 (void)rbd_dev_refresh(rbd_dev);
Alex Elderb8d70032012-11-30 17:53:04 -06002625
Alex Eldercc4a38bd2013-04-30 00:44:33 -05002626 rbd_obj_notify_ack(rbd_dev, notify_id);
Alex Elderb8d70032012-11-30 17:53:04 -06002627}
2628
Alex Elder9969ebc2013-01-18 12:31:10 -06002629/*
2630 * Request sync osd watch/unwatch. The value of "start" determines
2631 * whether a watch request is being initiated or torn down.
2632 */
2633static int rbd_dev_header_watch_sync(struct rbd_device *rbd_dev, int start)
2634{
2635 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
2636 struct rbd_obj_request *obj_request;
Alex Elder9969ebc2013-01-18 12:31:10 -06002637 int ret;
2638
2639 rbd_assert(start ^ !!rbd_dev->watch_event);
2640 rbd_assert(start ^ !!rbd_dev->watch_request);
2641
2642 if (start) {
Alex Elder3c663bb2013-02-15 11:42:30 -06002643 ret = ceph_osdc_create_event(osdc, rbd_watch_cb, rbd_dev,
Alex Elder9969ebc2013-01-18 12:31:10 -06002644 &rbd_dev->watch_event);
2645 if (ret < 0)
2646 return ret;
Alex Elder8eb87562013-01-25 17:08:55 -06002647 rbd_assert(rbd_dev->watch_event != NULL);
Alex Elder9969ebc2013-01-18 12:31:10 -06002648 }
2649
2650 ret = -ENOMEM;
2651 obj_request = rbd_obj_request_create(rbd_dev->header_name, 0, 0,
2652 OBJ_REQUEST_NODATA);
2653 if (!obj_request)
2654 goto out_cancel;
2655
Alex Elder430c28c2013-04-03 21:32:51 -05002656 obj_request->osd_req = rbd_osd_req_create(rbd_dev, true, obj_request);
2657 if (!obj_request->osd_req)
2658 goto out_cancel;
2659
Alex Elder8eb87562013-01-25 17:08:55 -06002660 if (start)
Alex Elder975241a2013-01-25 17:08:55 -06002661 ceph_osdc_set_request_linger(osdc, obj_request->osd_req);
Alex Elder8eb87562013-01-25 17:08:55 -06002662 else
Alex Elder6977c3f2013-01-25 17:08:55 -06002663 ceph_osdc_unregister_linger_request(osdc,
Alex Elder975241a2013-01-25 17:08:55 -06002664 rbd_dev->watch_request->osd_req);
Alex Elder21692382013-04-05 01:27:12 -05002665
2666 osd_req_op_watch_init(obj_request->osd_req, 0, CEPH_OSD_OP_WATCH,
Alex Elderb21ebdd2013-04-30 00:44:32 -05002667 rbd_dev->watch_event->cookie, 0, start);
Alex Elder9d4df012013-04-19 15:34:50 -05002668 rbd_osd_req_format_write(obj_request);
Alex Elder21692382013-04-05 01:27:12 -05002669
Alex Elder9969ebc2013-01-18 12:31:10 -06002670 ret = rbd_obj_request_submit(osdc, obj_request);
2671 if (ret)
2672 goto out_cancel;
2673 ret = rbd_obj_request_wait(obj_request);
2674 if (ret)
2675 goto out_cancel;
Alex Elder9969ebc2013-01-18 12:31:10 -06002676 ret = obj_request->result;
2677 if (ret)
2678 goto out_cancel;
2679
Alex Elder8eb87562013-01-25 17:08:55 -06002680 /*
2681 * A watch request is set to linger, so the underlying osd
2682 * request won't go away until we unregister it. We retain
2683 * a pointer to the object request during that time (in
2684 * rbd_dev->watch_request), so we'll keep a reference to
2685 * it. We'll drop that reference (below) after we've
2686 * unregistered it.
2687 */
2688 if (start) {
2689 rbd_dev->watch_request = obj_request;
2690
2691 return 0;
2692 }
2693
2694 /* We have successfully torn down the watch request */
2695
2696 rbd_obj_request_put(rbd_dev->watch_request);
2697 rbd_dev->watch_request = NULL;
Alex Elder9969ebc2013-01-18 12:31:10 -06002698out_cancel:
2699 /* Cancel the event if we're tearing down, or on error */
2700 ceph_osdc_cancel_event(rbd_dev->watch_event);
2701 rbd_dev->watch_event = NULL;
Alex Elder9969ebc2013-01-18 12:31:10 -06002702 if (obj_request)
2703 rbd_obj_request_put(obj_request);
2704
2705 return ret;
2706}
2707
Alex Elder36be9a72013-01-19 00:30:28 -06002708/*
Alex Elderf40eb342013-04-25 15:09:42 -05002709 * Synchronous osd object method call. Returns the number of bytes
2710 * returned in the outbound buffer, or a negative error code.
Alex Elder36be9a72013-01-19 00:30:28 -06002711 */
2712static int rbd_obj_method_sync(struct rbd_device *rbd_dev,
2713 const char *object_name,
2714 const char *class_name,
2715 const char *method_name,
Alex Elder41579762013-04-21 12:14:45 -05002716 const void *outbound,
Alex Elder36be9a72013-01-19 00:30:28 -06002717 size_t outbound_size,
Alex Elder41579762013-04-21 12:14:45 -05002718 void *inbound,
Alex Eldere2a58ee2013-04-30 00:44:33 -05002719 size_t inbound_size)
Alex Elder36be9a72013-01-19 00:30:28 -06002720{
Alex Elder21692382013-04-05 01:27:12 -05002721 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
Alex Elder36be9a72013-01-19 00:30:28 -06002722 struct rbd_obj_request *obj_request;
Alex Elder36be9a72013-01-19 00:30:28 -06002723 struct page **pages;
2724 u32 page_count;
2725 int ret;
2726
2727 /*
Alex Elder6010a452013-04-05 01:27:11 -05002728 * Method calls are ultimately read operations. The result
2729 * should placed into the inbound buffer provided. They
2730 * also supply outbound data--parameters for the object
2731 * method. Currently if this is present it will be a
2732 * snapshot id.
Alex Elder36be9a72013-01-19 00:30:28 -06002733 */
Alex Elder57385b52013-04-21 12:14:45 -05002734 page_count = (u32)calc_pages_for(0, inbound_size);
Alex Elder36be9a72013-01-19 00:30:28 -06002735 pages = ceph_alloc_page_vector(page_count, GFP_KERNEL);
2736 if (IS_ERR(pages))
2737 return PTR_ERR(pages);
2738
2739 ret = -ENOMEM;
Alex Elder6010a452013-04-05 01:27:11 -05002740 obj_request = rbd_obj_request_create(object_name, 0, inbound_size,
Alex Elder36be9a72013-01-19 00:30:28 -06002741 OBJ_REQUEST_PAGES);
2742 if (!obj_request)
2743 goto out;
2744
2745 obj_request->pages = pages;
2746 obj_request->page_count = page_count;
2747
Alex Elder430c28c2013-04-03 21:32:51 -05002748 obj_request->osd_req = rbd_osd_req_create(rbd_dev, false, obj_request);
Alex Elder36be9a72013-01-19 00:30:28 -06002749 if (!obj_request->osd_req)
2750 goto out;
2751
Alex Elderc99d2d42013-04-05 01:27:11 -05002752 osd_req_op_cls_init(obj_request->osd_req, 0, CEPH_OSD_OP_CALL,
Alex Elder04017e22013-04-05 14:46:02 -05002753 class_name, method_name);
2754 if (outbound_size) {
2755 struct ceph_pagelist *pagelist;
2756
2757 pagelist = kmalloc(sizeof (*pagelist), GFP_NOFS);
2758 if (!pagelist)
2759 goto out;
2760
2761 ceph_pagelist_init(pagelist);
2762 ceph_pagelist_append(pagelist, outbound, outbound_size);
2763 osd_req_op_cls_request_data_pagelist(obj_request->osd_req, 0,
2764 pagelist);
2765 }
Alex Eldera4ce40a2013-04-05 01:27:12 -05002766 osd_req_op_cls_response_data_pages(obj_request->osd_req, 0,
2767 obj_request->pages, inbound_size,
Alex Elder44cd1882013-04-05 01:27:12 -05002768 0, false, false);
Alex Elder9d4df012013-04-19 15:34:50 -05002769 rbd_osd_req_format_read(obj_request);
Alex Elder430c28c2013-04-03 21:32:51 -05002770
Alex Elder36be9a72013-01-19 00:30:28 -06002771 ret = rbd_obj_request_submit(osdc, obj_request);
2772 if (ret)
2773 goto out;
2774 ret = rbd_obj_request_wait(obj_request);
2775 if (ret)
2776 goto out;
2777
2778 ret = obj_request->result;
2779 if (ret < 0)
2780 goto out;
Alex Elder57385b52013-04-21 12:14:45 -05002781
2782 rbd_assert(obj_request->xferred < (u64)INT_MAX);
2783 ret = (int)obj_request->xferred;
Alex Elder903bb322013-02-06 13:11:38 -06002784 ceph_copy_from_page_vector(pages, inbound, 0, obj_request->xferred);
Alex Elder36be9a72013-01-19 00:30:28 -06002785out:
2786 if (obj_request)
2787 rbd_obj_request_put(obj_request);
2788 else
2789 ceph_release_page_vector(pages, page_count);
2790
2791 return ret;
2792}
2793
Alex Elderbf0d5f502012-11-22 00:00:08 -06002794static void rbd_request_fn(struct request_queue *q)
Alex Eldercc344fa2013-02-19 12:25:56 -06002795 __releases(q->queue_lock) __acquires(q->queue_lock)
Alex Elderbf0d5f502012-11-22 00:00:08 -06002796{
2797 struct rbd_device *rbd_dev = q->queuedata;
2798 bool read_only = rbd_dev->mapping.read_only;
2799 struct request *rq;
2800 int result;
2801
2802 while ((rq = blk_fetch_request(q))) {
2803 bool write_request = rq_data_dir(rq) == WRITE;
2804 struct rbd_img_request *img_request;
2805 u64 offset;
2806 u64 length;
2807
2808 /* Ignore any non-FS requests that filter through. */
2809
2810 if (rq->cmd_type != REQ_TYPE_FS) {
Alex Elder4dda41d2013-02-20 21:59:33 -06002811 dout("%s: non-fs request type %d\n", __func__,
2812 (int) rq->cmd_type);
2813 __blk_end_request_all(rq, 0);
2814 continue;
2815 }
2816
2817 /* Ignore/skip any zero-length requests */
2818
2819 offset = (u64) blk_rq_pos(rq) << SECTOR_SHIFT;
2820 length = (u64) blk_rq_bytes(rq);
2821
2822 if (!length) {
2823 dout("%s: zero-length request\n", __func__);
Alex Elderbf0d5f502012-11-22 00:00:08 -06002824 __blk_end_request_all(rq, 0);
2825 continue;
2826 }
2827
2828 spin_unlock_irq(q->queue_lock);
2829
2830 /* Disallow writes to a read-only device */
2831
2832 if (write_request) {
2833 result = -EROFS;
2834 if (read_only)
2835 goto end_request;
2836 rbd_assert(rbd_dev->spec->snap_id == CEPH_NOSNAP);
2837 }
2838
Alex Elder6d292902013-01-14 12:43:31 -06002839 /*
2840 * Quit early if the mapped snapshot no longer
2841 * exists. It's still possible the snapshot will
2842 * have disappeared by the time our request arrives
2843 * at the osd, but there's no sense in sending it if
2844 * we already know.
2845 */
2846 if (!test_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags)) {
Alex Elderbf0d5f502012-11-22 00:00:08 -06002847 dout("request for non-existent snapshot");
2848 rbd_assert(rbd_dev->spec->snap_id != CEPH_NOSNAP);
2849 result = -ENXIO;
2850 goto end_request;
2851 }
2852
Alex Elderbf0d5f502012-11-22 00:00:08 -06002853 result = -EINVAL;
Alex Elderc0cd10db2013-04-26 09:43:47 -05002854 if (offset && length > U64_MAX - offset + 1) {
2855 rbd_warn(rbd_dev, "bad request range (%llu~%llu)\n",
2856 offset, length);
Alex Elderbf0d5f502012-11-22 00:00:08 -06002857 goto end_request; /* Shouldn't happen */
Alex Elderc0cd10db2013-04-26 09:43:47 -05002858 }
Alex Elderbf0d5f502012-11-22 00:00:08 -06002859
2860 result = -ENOMEM;
2861 img_request = rbd_img_request_create(rbd_dev, offset, length,
Alex Elder9849e982013-01-24 16:13:36 -06002862 write_request, false);
Alex Elderbf0d5f502012-11-22 00:00:08 -06002863 if (!img_request)
2864 goto end_request;
2865
2866 img_request->rq = rq;
2867
Alex Elderf1a47392013-04-19 15:34:50 -05002868 result = rbd_img_request_fill(img_request, OBJ_REQUEST_BIO,
2869 rq->bio);
Alex Elderbf0d5f502012-11-22 00:00:08 -06002870 if (!result)
2871 result = rbd_img_request_submit(img_request);
2872 if (result)
2873 rbd_img_request_put(img_request);
2874end_request:
2875 spin_lock_irq(q->queue_lock);
2876 if (result < 0) {
Alex Elder7da22d22013-01-24 16:13:36 -06002877 rbd_warn(rbd_dev, "%s %llx at %llx result %d\n",
2878 write_request ? "write" : "read",
2879 length, offset, result);
2880
Alex Elderbf0d5f502012-11-22 00:00:08 -06002881 __blk_end_request_all(rq, result);
2882 }
2883 }
2884}
2885
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002886/*
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002887 * a queue callback. Makes sure that we don't create a bio that spans across
2888 * multiple osd objects. One exception would be with a single page bios,
Alex Elderf7760da2012-10-20 22:17:27 -05002889 * which we handle later at bio_chain_clone_range()
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002890 */
2891static int rbd_merge_bvec(struct request_queue *q, struct bvec_merge_data *bmd,
2892 struct bio_vec *bvec)
2893{
2894 struct rbd_device *rbd_dev = q->queuedata;
Alex Eldere5cfeed22012-10-20 22:17:27 -05002895 sector_t sector_offset;
2896 sector_t sectors_per_obj;
2897 sector_t obj_sector_offset;
2898 int ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002899
Alex Eldere5cfeed22012-10-20 22:17:27 -05002900 /*
2901 * Find how far into its rbd object the partition-relative
2902 * bio start sector is to offset relative to the enclosing
2903 * device.
2904 */
2905 sector_offset = get_start_sect(bmd->bi_bdev) + bmd->bi_sector;
2906 sectors_per_obj = 1 << (rbd_dev->header.obj_order - SECTOR_SHIFT);
2907 obj_sector_offset = sector_offset & (sectors_per_obj - 1);
Alex Elder593a9e72012-02-07 12:03:37 -06002908
Alex Eldere5cfeed22012-10-20 22:17:27 -05002909 /*
2910 * Compute the number of bytes from that offset to the end
2911 * of the object. Account for what's already used by the bio.
2912 */
2913 ret = (int) (sectors_per_obj - obj_sector_offset) << SECTOR_SHIFT;
2914 if (ret > bmd->bi_size)
2915 ret -= bmd->bi_size;
2916 else
2917 ret = 0;
2918
2919 /*
2920 * Don't send back more than was asked for. And if the bio
2921 * was empty, let the whole thing through because: "Note
2922 * that a block device *must* allow a single page to be
2923 * added to an empty bio."
2924 */
2925 rbd_assert(bvec->bv_len <= PAGE_SIZE);
2926 if (ret > (int) bvec->bv_len || !bmd->bi_size)
2927 ret = (int) bvec->bv_len;
2928
2929 return ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002930}
2931
2932static void rbd_free_disk(struct rbd_device *rbd_dev)
2933{
2934 struct gendisk *disk = rbd_dev->disk;
2935
2936 if (!disk)
2937 return;
2938
Alex Eldera0cab922013-04-25 23:15:08 -05002939 rbd_dev->disk = NULL;
2940 if (disk->flags & GENHD_FL_UP) {
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002941 del_gendisk(disk);
Alex Eldera0cab922013-04-25 23:15:08 -05002942 if (disk->queue)
2943 blk_cleanup_queue(disk->queue);
2944 }
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002945 put_disk(disk);
2946}
2947
Alex Elder788e2df2013-01-17 12:25:27 -06002948static int rbd_obj_read_sync(struct rbd_device *rbd_dev,
2949 const char *object_name,
Alex Elder7097f8d2013-04-30 00:44:33 -05002950 u64 offset, u64 length, void *buf)
Alex Elder788e2df2013-01-17 12:25:27 -06002951
2952{
Alex Elder21692382013-04-05 01:27:12 -05002953 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
Alex Elder788e2df2013-01-17 12:25:27 -06002954 struct rbd_obj_request *obj_request;
Alex Elder788e2df2013-01-17 12:25:27 -06002955 struct page **pages = NULL;
2956 u32 page_count;
Alex Elder1ceae7e2013-02-06 13:11:38 -06002957 size_t size;
Alex Elder788e2df2013-01-17 12:25:27 -06002958 int ret;
2959
2960 page_count = (u32) calc_pages_for(offset, length);
2961 pages = ceph_alloc_page_vector(page_count, GFP_KERNEL);
2962 if (IS_ERR(pages))
2963 ret = PTR_ERR(pages);
2964
2965 ret = -ENOMEM;
2966 obj_request = rbd_obj_request_create(object_name, offset, length,
Alex Elder36be9a72013-01-19 00:30:28 -06002967 OBJ_REQUEST_PAGES);
Alex Elder788e2df2013-01-17 12:25:27 -06002968 if (!obj_request)
2969 goto out;
2970
2971 obj_request->pages = pages;
2972 obj_request->page_count = page_count;
2973
Alex Elder430c28c2013-04-03 21:32:51 -05002974 obj_request->osd_req = rbd_osd_req_create(rbd_dev, false, obj_request);
Alex Elder788e2df2013-01-17 12:25:27 -06002975 if (!obj_request->osd_req)
2976 goto out;
2977
Alex Elderc99d2d42013-04-05 01:27:11 -05002978 osd_req_op_extent_init(obj_request->osd_req, 0, CEPH_OSD_OP_READ,
2979 offset, length, 0, 0);
Alex Elder406e2c92013-04-15 14:50:36 -05002980 osd_req_op_extent_osd_data_pages(obj_request->osd_req, 0,
Alex Eldera4ce40a2013-04-05 01:27:12 -05002981 obj_request->pages,
Alex Elder44cd1882013-04-05 01:27:12 -05002982 obj_request->length,
2983 obj_request->offset & ~PAGE_MASK,
2984 false, false);
Alex Elder9d4df012013-04-19 15:34:50 -05002985 rbd_osd_req_format_read(obj_request);
Alex Elder430c28c2013-04-03 21:32:51 -05002986
Alex Elder788e2df2013-01-17 12:25:27 -06002987 ret = rbd_obj_request_submit(osdc, obj_request);
2988 if (ret)
2989 goto out;
2990 ret = rbd_obj_request_wait(obj_request);
2991 if (ret)
2992 goto out;
2993
2994 ret = obj_request->result;
2995 if (ret < 0)
2996 goto out;
Alex Elder1ceae7e2013-02-06 13:11:38 -06002997
2998 rbd_assert(obj_request->xferred <= (u64) SIZE_MAX);
2999 size = (size_t) obj_request->xferred;
Alex Elder903bb322013-02-06 13:11:38 -06003000 ceph_copy_from_page_vector(pages, buf, 0, size);
Alex Elder7097f8d2013-04-30 00:44:33 -05003001 rbd_assert(size <= (size_t)INT_MAX);
3002 ret = (int)size;
Alex Elder788e2df2013-01-17 12:25:27 -06003003out:
3004 if (obj_request)
3005 rbd_obj_request_put(obj_request);
3006 else
3007 ceph_release_page_vector(pages, page_count);
3008
3009 return ret;
3010}
3011
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003012/*
Alex Elder4156d992012-08-02 11:29:46 -05003013 * Read the complete header for the given rbd device.
3014 *
3015 * Returns a pointer to a dynamically-allocated buffer containing
3016 * the complete and validated header. Caller can pass the address
3017 * of a variable that will be filled in with the version of the
3018 * header object at the time it was read.
3019 *
3020 * Returns a pointer-coded errno if a failure occurs.
3021 */
3022static struct rbd_image_header_ondisk *
Alex Elder7097f8d2013-04-30 00:44:33 -05003023rbd_dev_v1_header_read(struct rbd_device *rbd_dev)
Alex Elder4156d992012-08-02 11:29:46 -05003024{
3025 struct rbd_image_header_ondisk *ondisk = NULL;
3026 u32 snap_count = 0;
3027 u64 names_size = 0;
3028 u32 want_count;
3029 int ret;
3030
3031 /*
3032 * The complete header will include an array of its 64-bit
3033 * snapshot ids, followed by the names of those snapshots as
3034 * a contiguous block of NUL-terminated strings. Note that
3035 * the number of snapshots could change by the time we read
3036 * it in, in which case we re-read it.
3037 */
3038 do {
3039 size_t size;
3040
3041 kfree(ondisk);
3042
3043 size = sizeof (*ondisk);
3044 size += snap_count * sizeof (struct rbd_image_snap_ondisk);
3045 size += names_size;
3046 ondisk = kmalloc(size, GFP_KERNEL);
3047 if (!ondisk)
3048 return ERR_PTR(-ENOMEM);
3049
Alex Elder788e2df2013-01-17 12:25:27 -06003050 ret = rbd_obj_read_sync(rbd_dev, rbd_dev->header_name,
Alex Elder7097f8d2013-04-30 00:44:33 -05003051 0, size, ondisk);
Alex Elder4156d992012-08-02 11:29:46 -05003052 if (ret < 0)
3053 goto out_err;
Alex Elderc0cd10db2013-04-26 09:43:47 -05003054 if ((size_t)ret < size) {
Alex Elder4156d992012-08-02 11:29:46 -05003055 ret = -ENXIO;
Alex Elder06ecc6c2012-11-01 10:17:15 -05003056 rbd_warn(rbd_dev, "short header read (want %zd got %d)",
3057 size, ret);
Alex Elder4156d992012-08-02 11:29:46 -05003058 goto out_err;
3059 }
3060 if (!rbd_dev_ondisk_valid(ondisk)) {
3061 ret = -ENXIO;
Alex Elder06ecc6c2012-11-01 10:17:15 -05003062 rbd_warn(rbd_dev, "invalid header");
Alex Elder4156d992012-08-02 11:29:46 -05003063 goto out_err;
3064 }
3065
3066 names_size = le64_to_cpu(ondisk->snap_names_len);
3067 want_count = snap_count;
3068 snap_count = le32_to_cpu(ondisk->snap_count);
3069 } while (snap_count != want_count);
3070
3071 return ondisk;
3072
3073out_err:
3074 kfree(ondisk);
3075
3076 return ERR_PTR(ret);
3077}
3078
/*
 * Re-read the on-disk header and hand it to rbd_header_from_disk()
 * to fill in the caller's in-memory header.
 *
 * Returns the result of rbd_header_from_disk(), or a negative errno
 * if the on-disk header could not be read.
 */
static int rbd_read_header(struct rbd_device *rbd_dev,
				 struct rbd_image_header *header)
{
	struct rbd_image_header_ondisk *raw;
	int result;

	raw = rbd_dev_v1_header_read(rbd_dev);
	if (IS_ERR(raw))
		return PTR_ERR(raw);

	result = rbd_header_from_disk(header, raw);

	/* The on-disk copy is released whether or not conversion worked */
	kfree(raw);

	return result;
}
3096
Alex Elder94785542012-10-09 13:50:17 -07003097static void rbd_update_mapping_size(struct rbd_device *rbd_dev)
3098{
Alex Elder0d7dbfc2012-10-25 23:34:41 -05003099 if (rbd_dev->spec->snap_id != CEPH_NOSNAP)
Alex Elder94785542012-10-09 13:50:17 -07003100 return;
3101
Alex Eldere28626a2013-04-26 15:44:35 -05003102 if (rbd_dev->mapping.size != rbd_dev->header.image_size) {
3103 sector_t size;
3104
3105 rbd_dev->mapping.size = rbd_dev->header.image_size;
3106 size = (sector_t)rbd_dev->mapping.size / SECTOR_SIZE;
3107 dout("setting size to %llu sectors", (unsigned long long)size);
3108 set_capacity(rbd_dev->disk, size);
3109 }
Alex Elder94785542012-10-09 13:50:17 -07003110}
3111
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003112/*
3113 * only read the first part of the ondisk header, without the snaps info
3114 */
Alex Eldercc4a38bd2013-04-30 00:44:33 -05003115static int rbd_dev_v1_refresh(struct rbd_device *rbd_dev)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003116{
3117 int ret;
3118 struct rbd_image_header h;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003119
3120 ret = rbd_read_header(rbd_dev, &h);
3121 if (ret < 0)
3122 return ret;
3123
Josh Durgina51aa0c2011-12-05 10:35:04 -08003124 down_write(&rbd_dev->header_rwsem);
3125
Alex Elder94785542012-10-09 13:50:17 -07003126 /* Update image size, and check for resize of mapped image */
3127 rbd_dev->header.image_size = h.image_size;
3128 rbd_update_mapping_size(rbd_dev);
Sage Weil9db4b3e2011-04-19 22:49:06 -07003129
Alex Elder849b4262012-07-09 21:04:24 -05003130 /* rbd_dev->header.object_prefix shouldn't change */
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003131 kfree(rbd_dev->header.snap_sizes);
Alex Elder849b4262012-07-09 21:04:24 -05003132 kfree(rbd_dev->header.snap_names);
Josh Durgind1d25642011-12-05 14:03:05 -08003133 /* osd requests may still refer to snapc */
Alex Elder812164f82013-04-30 00:44:32 -05003134 ceph_put_snap_context(rbd_dev->header.snapc);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003135
Josh Durgin93a24e02011-12-05 10:41:28 -08003136 rbd_dev->header.image_size = h.image_size;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003137 rbd_dev->header.snapc = h.snapc;
3138 rbd_dev->header.snap_names = h.snap_names;
3139 rbd_dev->header.snap_sizes = h.snap_sizes;
Alex Elder849b4262012-07-09 21:04:24 -05003140 /* Free the extra copy of the object prefix */
Alex Elderc0cd10db2013-04-26 09:43:47 -05003141 if (strcmp(rbd_dev->header.object_prefix, h.object_prefix))
3142 rbd_warn(rbd_dev, "object prefix changed (ignoring)");
Alex Elder849b4262012-07-09 21:04:24 -05003143 kfree(h.object_prefix);
3144
Josh Durginc6666012011-11-21 17:11:12 -08003145 up_write(&rbd_dev->header_rwsem);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003146
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003147 return ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003148}
3149
Alex Elder15228ed2013-05-01 12:43:03 -05003150/*
3151 * Clear the rbd device's EXISTS flag if the snapshot it's mapped to
3152 * has disappeared from the (just updated) snapshot context.
3153 */
3154static void rbd_exists_validate(struct rbd_device *rbd_dev)
3155{
3156 u64 snap_id;
3157
3158 if (!test_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags))
3159 return;
3160
3161 snap_id = rbd_dev->spec->snap_id;
3162 if (snap_id == CEPH_NOSNAP)
3163 return;
3164
3165 if (rbd_dev_snap_index(rbd_dev, snap_id) == BAD_SNAP_INDEX)
3166 clear_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags);
3167}
3168
Alex Eldercc4a38bd2013-04-30 00:44:33 -05003169static int rbd_dev_refresh(struct rbd_device *rbd_dev)
Alex Elder1fe5e992012-07-25 09:32:41 -05003170{
Alex Eldera3fbe5d2013-04-30 00:44:32 -05003171 u64 image_size;
Alex Elder1fe5e992012-07-25 09:32:41 -05003172 int ret;
3173
Alex Elder117973f2012-08-31 17:29:55 -05003174 rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
Alex Eldera3fbe5d2013-04-30 00:44:32 -05003175 image_size = rbd_dev->header.image_size;
Alex Elder1fe5e992012-07-25 09:32:41 -05003176 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
Alex Elder117973f2012-08-31 17:29:55 -05003177 if (rbd_dev->image_format == 1)
Alex Eldercc4a38bd2013-04-30 00:44:33 -05003178 ret = rbd_dev_v1_refresh(rbd_dev);
Alex Elder117973f2012-08-31 17:29:55 -05003179 else
Alex Eldercc4a38bd2013-04-30 00:44:33 -05003180 ret = rbd_dev_v2_refresh(rbd_dev);
Alex Elder15228ed2013-05-01 12:43:03 -05003181
3182 /* If it's a mapped snapshot, validate its EXISTS flag */
3183
3184 rbd_exists_validate(rbd_dev);
Alex Elder1fe5e992012-07-25 09:32:41 -05003185 mutex_unlock(&ctl_mutex);
Alex Elder522a0cc2013-04-25 15:09:41 -05003186 if (ret)
3187 rbd_warn(rbd_dev, "got notification but failed to "
3188 " update snaps: %d\n", ret);
Alex Eldera3fbe5d2013-04-30 00:44:32 -05003189 if (image_size != rbd_dev->header.image_size)
3190 revalidate_disk(rbd_dev->disk);
Alex Elder1fe5e992012-07-25 09:32:41 -05003191
3192 return ret;
3193}
3194
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003195static int rbd_init_disk(struct rbd_device *rbd_dev)
3196{
3197 struct gendisk *disk;
3198 struct request_queue *q;
Alex Elder593a9e72012-02-07 12:03:37 -06003199 u64 segment_size;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003200
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003201 /* create gendisk info */
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003202 disk = alloc_disk(RBD_MINORS_PER_MAJOR);
3203 if (!disk)
Alex Elder1fcdb8a2012-08-29 17:11:06 -05003204 return -ENOMEM;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003205
Alex Elderf0f8cef2012-01-29 13:57:44 -06003206 snprintf(disk->disk_name, sizeof(disk->disk_name), RBD_DRV_NAME "%d",
Alex Elderde71a292012-07-03 16:01:19 -05003207 rbd_dev->dev_id);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003208 disk->major = rbd_dev->major;
3209 disk->first_minor = 0;
3210 disk->fops = &rbd_bd_ops;
3211 disk->private_data = rbd_dev;
3212
Alex Elderbf0d5f502012-11-22 00:00:08 -06003213 q = blk_init_queue(rbd_request_fn, &rbd_dev->lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003214 if (!q)
3215 goto out_disk;
Josh Durgin029bcbd2011-07-22 11:35:23 -07003216
Alex Elder593a9e72012-02-07 12:03:37 -06003217 /* We use the default size, but let's be explicit about it. */
3218 blk_queue_physical_block_size(q, SECTOR_SIZE);
3219
Josh Durgin029bcbd2011-07-22 11:35:23 -07003220 /* set io sizes to object size */
Alex Elder593a9e72012-02-07 12:03:37 -06003221 segment_size = rbd_obj_bytes(&rbd_dev->header);
3222 blk_queue_max_hw_sectors(q, segment_size / SECTOR_SIZE);
3223 blk_queue_max_segment_size(q, segment_size);
3224 blk_queue_io_min(q, segment_size);
3225 blk_queue_io_opt(q, segment_size);
Josh Durgin029bcbd2011-07-22 11:35:23 -07003226
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003227 blk_queue_merge_bvec(q, rbd_merge_bvec);
3228 disk->queue = q;
3229
3230 q->queuedata = rbd_dev;
3231
3232 rbd_dev->disk = disk;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003233
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003234 return 0;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003235out_disk:
3236 put_disk(disk);
Alex Elder1fcdb8a2012-08-29 17:11:06 -05003237
3238 return -ENOMEM;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003239}
3240
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003241/*
3242 sysfs
3243*/
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003244
Alex Elder593a9e72012-02-07 12:03:37 -06003245static struct rbd_device *dev_to_rbd_dev(struct device *dev)
3246{
3247 return container_of(dev, struct rbd_device, dev);
3248}
3249
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003250static ssize_t rbd_size_show(struct device *dev,
3251 struct device_attribute *attr, char *buf)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003252{
Alex Elder593a9e72012-02-07 12:03:37 -06003253 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003254
Alex Elderfc71d832013-04-26 15:44:36 -05003255 return sprintf(buf, "%llu\n",
3256 (unsigned long long)rbd_dev->mapping.size);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003257}
3258
Alex Elder34b13182012-07-13 20:35:12 -05003259/*
3260 * Note this shows the features for whatever's mapped, which is not
3261 * necessarily the base image.
3262 */
3263static ssize_t rbd_features_show(struct device *dev,
3264 struct device_attribute *attr, char *buf)
3265{
3266 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
3267
3268 return sprintf(buf, "0x%016llx\n",
Alex Elderfc71d832013-04-26 15:44:36 -05003269 (unsigned long long)rbd_dev->mapping.features);
Alex Elder34b13182012-07-13 20:35:12 -05003270}
3271
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003272static ssize_t rbd_major_show(struct device *dev,
3273 struct device_attribute *attr, char *buf)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003274{
Alex Elder593a9e72012-02-07 12:03:37 -06003275 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003276
Alex Elderfc71d832013-04-26 15:44:36 -05003277 if (rbd_dev->major)
3278 return sprintf(buf, "%d\n", rbd_dev->major);
3279
3280 return sprintf(buf, "(none)\n");
3281
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003282}
3283
3284static ssize_t rbd_client_id_show(struct device *dev,
3285 struct device_attribute *attr, char *buf)
3286{
Alex Elder593a9e72012-02-07 12:03:37 -06003287 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003288
Alex Elder1dbb4392012-01-24 10:08:37 -06003289 return sprintf(buf, "client%lld\n",
3290 ceph_client_id(rbd_dev->rbd_client->client));
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003291}
3292
3293static ssize_t rbd_pool_show(struct device *dev,
3294 struct device_attribute *attr, char *buf)
3295{
Alex Elder593a9e72012-02-07 12:03:37 -06003296 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003297
Alex Elder0d7dbfc2012-10-25 23:34:41 -05003298 return sprintf(buf, "%s\n", rbd_dev->spec->pool_name);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003299}
3300
Alex Elder9bb2f332012-07-12 10:46:35 -05003301static ssize_t rbd_pool_id_show(struct device *dev,
3302 struct device_attribute *attr, char *buf)
3303{
3304 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
3305
Alex Elder0d7dbfc2012-10-25 23:34:41 -05003306 return sprintf(buf, "%llu\n",
Alex Elderfc71d832013-04-26 15:44:36 -05003307 (unsigned long long) rbd_dev->spec->pool_id);
Alex Elder9bb2f332012-07-12 10:46:35 -05003308}
3309
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003310static ssize_t rbd_name_show(struct device *dev,
3311 struct device_attribute *attr, char *buf)
3312{
Alex Elder593a9e72012-02-07 12:03:37 -06003313 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003314
Alex Eldera92ffdf2012-10-30 19:40:33 -05003315 if (rbd_dev->spec->image_name)
3316 return sprintf(buf, "%s\n", rbd_dev->spec->image_name);
3317
3318 return sprintf(buf, "(unknown)\n");
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003319}
3320
Alex Elder589d30e2012-07-10 20:30:11 -05003321static ssize_t rbd_image_id_show(struct device *dev,
3322 struct device_attribute *attr, char *buf)
3323{
3324 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
3325
Alex Elder0d7dbfc2012-10-25 23:34:41 -05003326 return sprintf(buf, "%s\n", rbd_dev->spec->image_id);
Alex Elder589d30e2012-07-10 20:30:11 -05003327}
3328
Alex Elder34b13182012-07-13 20:35:12 -05003329/*
3330 * Shows the name of the currently-mapped snapshot (or
3331 * RBD_SNAP_HEAD_NAME for the base image).
3332 */
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003333static ssize_t rbd_snap_show(struct device *dev,
3334 struct device_attribute *attr,
3335 char *buf)
3336{
Alex Elder593a9e72012-02-07 12:03:37 -06003337 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003338
Alex Elder0d7dbfc2012-10-25 23:34:41 -05003339 return sprintf(buf, "%s\n", rbd_dev->spec->snap_name);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003340}
3341
Alex Elder86b00e02012-10-25 23:34:42 -05003342/*
3343 * For an rbd v2 image, shows the pool id, image id, and snapshot id
3344 * for the parent image. If there is no parent, simply shows
3345 * "(no parent image)".
3346 */
3347static ssize_t rbd_parent_show(struct device *dev,
3348 struct device_attribute *attr,
3349 char *buf)
3350{
3351 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
3352 struct rbd_spec *spec = rbd_dev->parent_spec;
3353 int count;
3354 char *bufp = buf;
3355
3356 if (!spec)
3357 return sprintf(buf, "(no parent image)\n");
3358
3359 count = sprintf(bufp, "pool_id %llu\npool_name %s\n",
3360 (unsigned long long) spec->pool_id, spec->pool_name);
3361 if (count < 0)
3362 return count;
3363 bufp += count;
3364
3365 count = sprintf(bufp, "image_id %s\nimage_name %s\n", spec->image_id,
3366 spec->image_name ? spec->image_name : "(unknown)");
3367 if (count < 0)
3368 return count;
3369 bufp += count;
3370
3371 count = sprintf(bufp, "snap_id %llu\nsnap_name %s\n",
3372 (unsigned long long) spec->snap_id, spec->snap_name);
3373 if (count < 0)
3374 return count;
3375 bufp += count;
3376
3377 count = sprintf(bufp, "overlap %llu\n", rbd_dev->parent_overlap);
3378 if (count < 0)
3379 return count;
3380 bufp += count;
3381
3382 return (ssize_t) (bufp - buf);
3383}
3384
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003385static ssize_t rbd_image_refresh(struct device *dev,
3386 struct device_attribute *attr,
3387 const char *buf,
3388 size_t size)
3389{
Alex Elder593a9e72012-02-07 12:03:37 -06003390 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Alex Elderb8136232012-07-25 09:32:41 -05003391 int ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003392
Alex Eldercc4a38bd2013-04-30 00:44:33 -05003393 ret = rbd_dev_refresh(rbd_dev);
Alex Elderb8136232012-07-25 09:32:41 -05003394
3395 return ret < 0 ? ret : size;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003396}
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003397
/*
 * Per-device sysfs attributes.  All are read-only except "refresh",
 * which is write-only for the owner.
 */
static DEVICE_ATTR(size, S_IRUGO, rbd_size_show, NULL);
static DEVICE_ATTR(features, S_IRUGO, rbd_features_show, NULL);
static DEVICE_ATTR(major, S_IRUGO, rbd_major_show, NULL);
static DEVICE_ATTR(client_id, S_IRUGO, rbd_client_id_show, NULL);
static DEVICE_ATTR(pool, S_IRUGO, rbd_pool_show, NULL);
static DEVICE_ATTR(pool_id, S_IRUGO, rbd_pool_id_show, NULL);
static DEVICE_ATTR(name, S_IRUGO, rbd_name_show, NULL);
static DEVICE_ATTR(image_id, S_IRUGO, rbd_image_id_show, NULL);
static DEVICE_ATTR(current_snap, S_IRUGO, rbd_snap_show, NULL);
static DEVICE_ATTR(parent, S_IRUGO, rbd_parent_show, NULL);
static DEVICE_ATTR(refresh, S_IWUSR, NULL, rbd_image_refresh);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003409
3410static struct attribute *rbd_attrs[] = {
3411 &dev_attr_size.attr,
Alex Elder34b13182012-07-13 20:35:12 -05003412 &dev_attr_features.attr,
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003413 &dev_attr_major.attr,
3414 &dev_attr_client_id.attr,
3415 &dev_attr_pool.attr,
Alex Elder9bb2f332012-07-12 10:46:35 -05003416 &dev_attr_pool_id.attr,
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003417 &dev_attr_name.attr,
Alex Elder589d30e2012-07-10 20:30:11 -05003418 &dev_attr_image_id.attr,
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003419 &dev_attr_current_snap.attr,
Alex Elder86b00e02012-10-25 23:34:42 -05003420 &dev_attr_parent.attr,
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003421 &dev_attr_refresh.attr,
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003422 NULL
3423};
3424
3425static struct attribute_group rbd_attr_group = {
3426 .attrs = rbd_attrs,
3427};
3428
3429static const struct attribute_group *rbd_attr_groups[] = {
3430 &rbd_attr_group,
3431 NULL
3432};
3433
/* Release callback for rbd_device_type; it does nothing. */
static void rbd_sysfs_dev_release(struct device *dev)
{
	/* empty */
}
3437
3438static struct device_type rbd_device_type = {
3439 .name = "rbd",
3440 .groups = rbd_attr_groups,
3441 .release = rbd_sysfs_dev_release,
3442};
3443
Alex Elder8b8fb992012-10-26 17:25:24 -05003444static struct rbd_spec *rbd_spec_get(struct rbd_spec *spec)
3445{
3446 kref_get(&spec->kref);
3447
3448 return spec;
3449}
3450
3451static void rbd_spec_free(struct kref *kref);
3452static void rbd_spec_put(struct rbd_spec *spec)
3453{
3454 if (spec)
3455 kref_put(&spec->kref, rbd_spec_free);
3456}
3457
3458static struct rbd_spec *rbd_spec_alloc(void)
3459{
3460 struct rbd_spec *spec;
3461
3462 spec = kzalloc(sizeof (*spec), GFP_KERNEL);
3463 if (!spec)
3464 return NULL;
3465 kref_init(&spec->kref);
3466
Alex Elder8b8fb992012-10-26 17:25:24 -05003467 return spec;
3468}
3469
3470static void rbd_spec_free(struct kref *kref)
3471{
3472 struct rbd_spec *spec = container_of(kref, struct rbd_spec, kref);
3473
3474 kfree(spec->pool_name);
3475 kfree(spec->image_id);
3476 kfree(spec->image_name);
3477 kfree(spec->snap_name);
3478 kfree(spec);
3479}
3480
Alex Eldercc344fa2013-02-19 12:25:56 -06003481static struct rbd_device *rbd_dev_create(struct rbd_client *rbdc,
Alex Elderc53d5892012-10-25 23:34:42 -05003482 struct rbd_spec *spec)
3483{
3484 struct rbd_device *rbd_dev;
3485
3486 rbd_dev = kzalloc(sizeof (*rbd_dev), GFP_KERNEL);
3487 if (!rbd_dev)
3488 return NULL;
3489
3490 spin_lock_init(&rbd_dev->lock);
Alex Elder6d292902013-01-14 12:43:31 -06003491 rbd_dev->flags = 0;
Alex Elderc53d5892012-10-25 23:34:42 -05003492 INIT_LIST_HEAD(&rbd_dev->node);
Alex Elderc53d5892012-10-25 23:34:42 -05003493 init_rwsem(&rbd_dev->header_rwsem);
3494
3495 rbd_dev->spec = spec;
3496 rbd_dev->rbd_client = rbdc;
3497
Alex Elder0903e872012-11-14 12:25:19 -06003498 /* Initialize the layout used for all rbd requests */
3499
3500 rbd_dev->layout.fl_stripe_unit = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
3501 rbd_dev->layout.fl_stripe_count = cpu_to_le32(1);
3502 rbd_dev->layout.fl_object_size = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
3503 rbd_dev->layout.fl_pg_pool = cpu_to_le32((u32) spec->pool_id);
3504
Alex Elderc53d5892012-10-25 23:34:42 -05003505 return rbd_dev;
3506}
3507
3508static void rbd_dev_destroy(struct rbd_device *rbd_dev)
3509{
Alex Elderc53d5892012-10-25 23:34:42 -05003510 rbd_put_client(rbd_dev->rbd_client);
3511 rbd_spec_put(rbd_dev->spec);
3512 kfree(rbd_dev);
3513}
3514
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003515/*
Alex Elder9d475de2012-07-03 16:01:19 -05003516 * Get the size and object order for an image snapshot, or if
3517 * snap_id is CEPH_NOSNAP, gets this information for the base
3518 * image.
3519 */
3520static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id,
3521 u8 *order, u64 *snap_size)
3522{
3523 __le64 snapid = cpu_to_le64(snap_id);
3524 int ret;
3525 struct {
3526 u8 order;
3527 __le64 size;
3528 } __attribute__ ((packed)) size_buf = { 0 };
3529
Alex Elder36be9a72013-01-19 00:30:28 -06003530 ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
Alex Elder9d475de2012-07-03 16:01:19 -05003531 "rbd", "get_size",
Alex Elder41579762013-04-21 12:14:45 -05003532 &snapid, sizeof (snapid),
Alex Eldere2a58ee2013-04-30 00:44:33 -05003533 &size_buf, sizeof (size_buf));
Alex Elder36be9a72013-01-19 00:30:28 -06003534 dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
Alex Elder9d475de2012-07-03 16:01:19 -05003535 if (ret < 0)
3536 return ret;
Alex Elder57385b52013-04-21 12:14:45 -05003537 if (ret < sizeof (size_buf))
3538 return -ERANGE;
Alex Elder9d475de2012-07-03 16:01:19 -05003539
Alex Elderc86f86e2013-04-25 15:09:41 -05003540 if (order)
3541 *order = size_buf.order;
Alex Elder9d475de2012-07-03 16:01:19 -05003542 *snap_size = le64_to_cpu(size_buf.size);
3543
3544 dout(" snap_id 0x%016llx order = %u, snap_size = %llu\n",
Alex Elder57385b52013-04-21 12:14:45 -05003545 (unsigned long long)snap_id, (unsigned int)*order,
3546 (unsigned long long)*snap_size);
Alex Elder9d475de2012-07-03 16:01:19 -05003547
3548 return 0;
3549}
3550
3551static int rbd_dev_v2_image_size(struct rbd_device *rbd_dev)
3552{
3553 return _rbd_dev_v2_snap_size(rbd_dev, CEPH_NOSNAP,
3554 &rbd_dev->header.obj_order,
3555 &rbd_dev->header.image_size);
3556}
3557
Alex Elder1e130192012-07-03 16:01:19 -05003558static int rbd_dev_v2_object_prefix(struct rbd_device *rbd_dev)
3559{
3560 void *reply_buf;
3561 int ret;
3562 void *p;
3563
3564 reply_buf = kzalloc(RBD_OBJ_PREFIX_LEN_MAX, GFP_KERNEL);
3565 if (!reply_buf)
3566 return -ENOMEM;
3567
Alex Elder36be9a72013-01-19 00:30:28 -06003568 ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
Alex Elder41579762013-04-21 12:14:45 -05003569 "rbd", "get_object_prefix", NULL, 0,
Alex Eldere2a58ee2013-04-30 00:44:33 -05003570 reply_buf, RBD_OBJ_PREFIX_LEN_MAX);
Alex Elder36be9a72013-01-19 00:30:28 -06003571 dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
Alex Elder1e130192012-07-03 16:01:19 -05003572 if (ret < 0)
3573 goto out;
3574
3575 p = reply_buf;
3576 rbd_dev->header.object_prefix = ceph_extract_encoded_string(&p,
Alex Elder57385b52013-04-21 12:14:45 -05003577 p + ret, NULL, GFP_NOIO);
3578 ret = 0;
Alex Elder1e130192012-07-03 16:01:19 -05003579
3580 if (IS_ERR(rbd_dev->header.object_prefix)) {
3581 ret = PTR_ERR(rbd_dev->header.object_prefix);
3582 rbd_dev->header.object_prefix = NULL;
3583 } else {
3584 dout(" object_prefix = %s\n", rbd_dev->header.object_prefix);
3585 }
Alex Elder1e130192012-07-03 16:01:19 -05003586out:
3587 kfree(reply_buf);
3588
3589 return ret;
3590}
3591
Alex Elderb1b54022012-07-03 16:01:19 -05003592static int _rbd_dev_v2_snap_features(struct rbd_device *rbd_dev, u64 snap_id,
3593 u64 *snap_features)
3594{
3595 __le64 snapid = cpu_to_le64(snap_id);
3596 struct {
3597 __le64 features;
3598 __le64 incompat;
Alex Elder41579762013-04-21 12:14:45 -05003599 } __attribute__ ((packed)) features_buf = { 0 };
Alex Elderd8891402012-10-09 13:50:17 -07003600 u64 incompat;
Alex Elderb1b54022012-07-03 16:01:19 -05003601 int ret;
3602
Alex Elder36be9a72013-01-19 00:30:28 -06003603 ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
Alex Elderb1b54022012-07-03 16:01:19 -05003604 "rbd", "get_features",
Alex Elder41579762013-04-21 12:14:45 -05003605 &snapid, sizeof (snapid),
Alex Eldere2a58ee2013-04-30 00:44:33 -05003606 &features_buf, sizeof (features_buf));
Alex Elder36be9a72013-01-19 00:30:28 -06003607 dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
Alex Elderb1b54022012-07-03 16:01:19 -05003608 if (ret < 0)
3609 return ret;
Alex Elder57385b52013-04-21 12:14:45 -05003610 if (ret < sizeof (features_buf))
3611 return -ERANGE;
Alex Elderd8891402012-10-09 13:50:17 -07003612
3613 incompat = le64_to_cpu(features_buf.incompat);
Alex Elder5cbf6f122013-04-11 09:29:48 -05003614 if (incompat & ~RBD_FEATURES_SUPPORTED)
Alex Elderb8f5c6e2012-11-01 08:39:26 -05003615 return -ENXIO;
Alex Elderd8891402012-10-09 13:50:17 -07003616
Alex Elderb1b54022012-07-03 16:01:19 -05003617 *snap_features = le64_to_cpu(features_buf.features);
3618
3619 dout(" snap_id 0x%016llx features = 0x%016llx incompat = 0x%016llx\n",
Alex Elder57385b52013-04-21 12:14:45 -05003620 (unsigned long long)snap_id,
3621 (unsigned long long)*snap_features,
3622 (unsigned long long)le64_to_cpu(features_buf.incompat));
Alex Elderb1b54022012-07-03 16:01:19 -05003623
3624 return 0;
3625}
3626
3627static int rbd_dev_v2_features(struct rbd_device *rbd_dev)
3628{
3629 return _rbd_dev_v2_snap_features(rbd_dev, CEPH_NOSNAP,
3630 &rbd_dev->header.features);
3631}
3632
Alex Elder86b00e02012-10-25 23:34:42 -05003633static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev)
3634{
3635 struct rbd_spec *parent_spec;
3636 size_t size;
3637 void *reply_buf = NULL;
3638 __le64 snapid;
3639 void *p;
3640 void *end;
3641 char *image_id;
3642 u64 overlap;
Alex Elder86b00e02012-10-25 23:34:42 -05003643 int ret;
3644
3645 parent_spec = rbd_spec_alloc();
3646 if (!parent_spec)
3647 return -ENOMEM;
3648
3649 size = sizeof (__le64) + /* pool_id */
3650 sizeof (__le32) + RBD_IMAGE_ID_LEN_MAX + /* image_id */
3651 sizeof (__le64) + /* snap_id */
3652 sizeof (__le64); /* overlap */
3653 reply_buf = kmalloc(size, GFP_KERNEL);
3654 if (!reply_buf) {
3655 ret = -ENOMEM;
3656 goto out_err;
3657 }
3658
3659 snapid = cpu_to_le64(CEPH_NOSNAP);
Alex Elder36be9a72013-01-19 00:30:28 -06003660 ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
Alex Elder86b00e02012-10-25 23:34:42 -05003661 "rbd", "get_parent",
Alex Elder41579762013-04-21 12:14:45 -05003662 &snapid, sizeof (snapid),
Alex Eldere2a58ee2013-04-30 00:44:33 -05003663 reply_buf, size);
Alex Elder36be9a72013-01-19 00:30:28 -06003664 dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
Alex Elder86b00e02012-10-25 23:34:42 -05003665 if (ret < 0)
3666 goto out_err;
3667
Alex Elder86b00e02012-10-25 23:34:42 -05003668 p = reply_buf;
Alex Elder57385b52013-04-21 12:14:45 -05003669 end = reply_buf + ret;
3670 ret = -ERANGE;
Alex Elder86b00e02012-10-25 23:34:42 -05003671 ceph_decode_64_safe(&p, end, parent_spec->pool_id, out_err);
3672 if (parent_spec->pool_id == CEPH_NOPOOL)
3673 goto out; /* No parent? No problem. */
3674
Alex Elder0903e872012-11-14 12:25:19 -06003675 /* The ceph file layout needs to fit pool id in 32 bits */
3676
3677 ret = -EIO;
Alex Elderc0cd10db2013-04-26 09:43:47 -05003678 if (parent_spec->pool_id > (u64)U32_MAX) {
3679 rbd_warn(NULL, "parent pool id too large (%llu > %u)\n",
3680 (unsigned long long)parent_spec->pool_id, U32_MAX);
Alex Elder57385b52013-04-21 12:14:45 -05003681 goto out_err;
Alex Elderc0cd10db2013-04-26 09:43:47 -05003682 }
Alex Elder0903e872012-11-14 12:25:19 -06003683
Alex Elder979ed482012-11-01 08:39:26 -05003684 image_id = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL);
Alex Elder86b00e02012-10-25 23:34:42 -05003685 if (IS_ERR(image_id)) {
3686 ret = PTR_ERR(image_id);
3687 goto out_err;
3688 }
3689 parent_spec->image_id = image_id;
3690 ceph_decode_64_safe(&p, end, parent_spec->snap_id, out_err);
3691 ceph_decode_64_safe(&p, end, overlap, out_err);
3692
3693 rbd_dev->parent_overlap = overlap;
3694 rbd_dev->parent_spec = parent_spec;
3695 parent_spec = NULL; /* rbd_dev now owns this */
3696out:
3697 ret = 0;
3698out_err:
3699 kfree(reply_buf);
3700 rbd_spec_put(parent_spec);
3701
3702 return ret;
3703}
3704
Alex Eldercc070d52013-04-21 12:14:45 -05003705static int rbd_dev_v2_striping_info(struct rbd_device *rbd_dev)
3706{
3707 struct {
3708 __le64 stripe_unit;
3709 __le64 stripe_count;
3710 } __attribute__ ((packed)) striping_info_buf = { 0 };
3711 size_t size = sizeof (striping_info_buf);
3712 void *p;
3713 u64 obj_size;
3714 u64 stripe_unit;
3715 u64 stripe_count;
3716 int ret;
3717
3718 ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
3719 "rbd", "get_stripe_unit_count", NULL, 0,
Alex Eldere2a58ee2013-04-30 00:44:33 -05003720 (char *)&striping_info_buf, size);
Alex Eldercc070d52013-04-21 12:14:45 -05003721 dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
3722 if (ret < 0)
3723 return ret;
3724 if (ret < size)
3725 return -ERANGE;
3726
3727 /*
3728 * We don't actually support the "fancy striping" feature
3729 * (STRIPINGV2) yet, but if the striping sizes are the
3730 * defaults the behavior is the same as before. So find
3731 * out, and only fail if the image has non-default values.
3732 */
3733 ret = -EINVAL;
3734 obj_size = (u64)1 << rbd_dev->header.obj_order;
3735 p = &striping_info_buf;
3736 stripe_unit = ceph_decode_64(&p);
3737 if (stripe_unit != obj_size) {
3738 rbd_warn(rbd_dev, "unsupported stripe unit "
3739 "(got %llu want %llu)",
3740 stripe_unit, obj_size);
3741 return -EINVAL;
3742 }
3743 stripe_count = ceph_decode_64(&p);
3744 if (stripe_count != 1) {
3745 rbd_warn(rbd_dev, "unsupported stripe count "
3746 "(got %llu want 1)", stripe_count);
3747 return -EINVAL;
3748 }
Alex Elder500d0c02013-04-26 09:43:47 -05003749 rbd_dev->header.stripe_unit = stripe_unit;
3750 rbd_dev->header.stripe_count = stripe_count;
Alex Eldercc070d52013-04-21 12:14:45 -05003751
3752 return 0;
3753}
3754
Alex Elder9e15b772012-10-30 19:40:33 -05003755static char *rbd_dev_image_name(struct rbd_device *rbd_dev)
3756{
3757 size_t image_id_size;
3758 char *image_id;
3759 void *p;
3760 void *end;
3761 size_t size;
3762 void *reply_buf = NULL;
3763 size_t len = 0;
3764 char *image_name = NULL;
3765 int ret;
3766
3767 rbd_assert(!rbd_dev->spec->image_name);
3768
Alex Elder69e7a022012-11-01 08:39:26 -05003769 len = strlen(rbd_dev->spec->image_id);
3770 image_id_size = sizeof (__le32) + len;
Alex Elder9e15b772012-10-30 19:40:33 -05003771 image_id = kmalloc(image_id_size, GFP_KERNEL);
3772 if (!image_id)
3773 return NULL;
3774
3775 p = image_id;
Alex Elder41579762013-04-21 12:14:45 -05003776 end = image_id + image_id_size;
Alex Elder57385b52013-04-21 12:14:45 -05003777 ceph_encode_string(&p, end, rbd_dev->spec->image_id, (u32)len);
Alex Elder9e15b772012-10-30 19:40:33 -05003778
3779 size = sizeof (__le32) + RBD_IMAGE_NAME_LEN_MAX;
3780 reply_buf = kmalloc(size, GFP_KERNEL);
3781 if (!reply_buf)
3782 goto out;
3783
Alex Elder36be9a72013-01-19 00:30:28 -06003784 ret = rbd_obj_method_sync(rbd_dev, RBD_DIRECTORY,
Alex Elder9e15b772012-10-30 19:40:33 -05003785 "rbd", "dir_get_name",
3786 image_id, image_id_size,
Alex Eldere2a58ee2013-04-30 00:44:33 -05003787 reply_buf, size);
Alex Elder9e15b772012-10-30 19:40:33 -05003788 if (ret < 0)
3789 goto out;
3790 p = reply_buf;
Alex Elderf40eb342013-04-25 15:09:42 -05003791 end = reply_buf + ret;
3792
Alex Elder9e15b772012-10-30 19:40:33 -05003793 image_name = ceph_extract_encoded_string(&p, end, &len, GFP_KERNEL);
3794 if (IS_ERR(image_name))
3795 image_name = NULL;
3796 else
3797 dout("%s: name is %s len is %zd\n", __func__, image_name, len);
3798out:
3799 kfree(reply_buf);
3800 kfree(image_id);
3801
3802 return image_name;
3803}
3804
Alex Elder2ad3d712013-04-30 00:44:33 -05003805static u64 rbd_v1_snap_id_by_name(struct rbd_device *rbd_dev, const char *name)
3806{
3807 struct ceph_snap_context *snapc = rbd_dev->header.snapc;
3808 const char *snap_name;
3809 u32 which = 0;
3810
3811 /* Skip over names until we find the one we are looking for */
3812
3813 snap_name = rbd_dev->header.snap_names;
3814 while (which < snapc->num_snaps) {
3815 if (!strcmp(name, snap_name))
3816 return snapc->snaps[which];
3817 snap_name += strlen(snap_name) + 1;
3818 which++;
3819 }
3820 return CEPH_NOSNAP;
3821}
3822
3823static u64 rbd_v2_snap_id_by_name(struct rbd_device *rbd_dev, const char *name)
3824{
3825 struct ceph_snap_context *snapc = rbd_dev->header.snapc;
3826 u32 which;
3827 bool found = false;
3828 u64 snap_id;
3829
3830 for (which = 0; !found && which < snapc->num_snaps; which++) {
3831 const char *snap_name;
3832
3833 snap_id = snapc->snaps[which];
3834 snap_name = rbd_dev_v2_snap_name(rbd_dev, snap_id);
3835 if (IS_ERR(snap_name))
3836 break;
3837 found = !strcmp(name, snap_name);
3838 kfree(snap_name);
3839 }
3840 return found ? snap_id : CEPH_NOSNAP;
3841}
3842
3843/*
3844 * Assumes name is never RBD_SNAP_HEAD_NAME; returns CEPH_NOSNAP if
3845 * no snapshot by that name is found, or if an error occurs.
3846 */
3847static u64 rbd_snap_id_by_name(struct rbd_device *rbd_dev, const char *name)
3848{
3849 if (rbd_dev->image_format == 1)
3850 return rbd_v1_snap_id_by_name(rbd_dev, name);
3851
3852 return rbd_v2_snap_id_by_name(rbd_dev, name);
3853}
3854
Alex Elder9e15b772012-10-30 19:40:33 -05003855/*
Alex Elder2e9f7f12013-04-26 09:43:48 -05003856 * When an rbd image has a parent image, it is identified by the
3857 * pool, image, and snapshot ids (not names). This function fills
3858 * in the names for those ids. (It's OK if we can't figure out the
3859 * name for an image id, but the pool and snapshot ids should always
3860 * exist and have names.) All names in an rbd spec are dynamically
3861 * allocated.
Alex Eldere1d42132013-04-25 23:15:08 -05003862 *
3863 * When an image being mapped (not a parent) is probed, we have the
3864 * pool name and pool id, image name and image id, and the snapshot
3865 * name. The only thing we're missing is the snapshot id.
Alex Elder9e15b772012-10-30 19:40:33 -05003866 */
Alex Elder2e9f7f12013-04-26 09:43:48 -05003867static int rbd_dev_spec_update(struct rbd_device *rbd_dev)
Alex Elder9e15b772012-10-30 19:40:33 -05003868{
Alex Elder2e9f7f12013-04-26 09:43:48 -05003869 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
3870 struct rbd_spec *spec = rbd_dev->spec;
3871 const char *pool_name;
3872 const char *image_name;
3873 const char *snap_name;
Alex Elder9e15b772012-10-30 19:40:33 -05003874 int ret;
3875
Alex Eldere1d42132013-04-25 23:15:08 -05003876 /*
3877 * An image being mapped will have the pool name (etc.), but
3878 * we need to look up the snapshot id.
3879 */
Alex Elder2e9f7f12013-04-26 09:43:48 -05003880 if (spec->pool_name) {
3881 if (strcmp(spec->snap_name, RBD_SNAP_HEAD_NAME)) {
Alex Elder2ad3d712013-04-30 00:44:33 -05003882 u64 snap_id;
Alex Eldere1d42132013-04-25 23:15:08 -05003883
Alex Elder2ad3d712013-04-30 00:44:33 -05003884 snap_id = rbd_snap_id_by_name(rbd_dev, spec->snap_name);
3885 if (snap_id == CEPH_NOSNAP)
Alex Eldere1d42132013-04-25 23:15:08 -05003886 return -ENOENT;
Alex Elder2ad3d712013-04-30 00:44:33 -05003887 spec->snap_id = snap_id;
Alex Eldere1d42132013-04-25 23:15:08 -05003888 } else {
Alex Elder2e9f7f12013-04-26 09:43:48 -05003889 spec->snap_id = CEPH_NOSNAP;
Alex Eldere1d42132013-04-25 23:15:08 -05003890 }
3891
3892 return 0;
3893 }
Alex Elder9e15b772012-10-30 19:40:33 -05003894
Alex Elder2e9f7f12013-04-26 09:43:48 -05003895 /* Get the pool name; we have to make our own copy of this */
Alex Elder9e15b772012-10-30 19:40:33 -05003896
Alex Elder2e9f7f12013-04-26 09:43:48 -05003897 pool_name = ceph_pg_pool_name_by_id(osdc->osdmap, spec->pool_id);
3898 if (!pool_name) {
3899 rbd_warn(rbd_dev, "no pool with id %llu", spec->pool_id);
Alex Elder935dc892012-11-01 10:17:15 -05003900 return -EIO;
3901 }
Alex Elder2e9f7f12013-04-26 09:43:48 -05003902 pool_name = kstrdup(pool_name, GFP_KERNEL);
3903 if (!pool_name)
Alex Elder9e15b772012-10-30 19:40:33 -05003904 return -ENOMEM;
3905
3906 /* Fetch the image name; tolerate failure here */
3907
Alex Elder2e9f7f12013-04-26 09:43:48 -05003908 image_name = rbd_dev_image_name(rbd_dev);
3909 if (!image_name)
Alex Elder06ecc6c2012-11-01 10:17:15 -05003910 rbd_warn(rbd_dev, "unable to get image name");
Alex Elder9e15b772012-10-30 19:40:33 -05003911
Alex Elder2e9f7f12013-04-26 09:43:48 -05003912 /* Look up the snapshot name, and make a copy */
Alex Elder9e15b772012-10-30 19:40:33 -05003913
Alex Elder2e9f7f12013-04-26 09:43:48 -05003914 snap_name = rbd_snap_name(rbd_dev, spec->snap_id);
3915 if (!snap_name) {
Alex Elder2e9f7f12013-04-26 09:43:48 -05003916 ret = -ENOMEM;
Alex Elder9e15b772012-10-30 19:40:33 -05003917 goto out_err;
Alex Elder2e9f7f12013-04-26 09:43:48 -05003918 }
3919
3920 spec->pool_name = pool_name;
3921 spec->image_name = image_name;
3922 spec->snap_name = snap_name;
Alex Elder9e15b772012-10-30 19:40:33 -05003923
3924 return 0;
3925out_err:
Alex Elder2e9f7f12013-04-26 09:43:48 -05003926 kfree(image_name);
3927 kfree(pool_name);
Alex Elder9e15b772012-10-30 19:40:33 -05003928
3929 return ret;
3930}
3931
Alex Eldercc4a38bd2013-04-30 00:44:33 -05003932static int rbd_dev_v2_snap_context(struct rbd_device *rbd_dev)
Alex Elder35d489f2012-07-03 16:01:19 -05003933{
3934 size_t size;
3935 int ret;
3936 void *reply_buf;
3937 void *p;
3938 void *end;
3939 u64 seq;
3940 u32 snap_count;
3941 struct ceph_snap_context *snapc;
3942 u32 i;
3943
3944 /*
3945 * We'll need room for the seq value (maximum snapshot id),
3946 * snapshot count, and array of that many snapshot ids.
3947 * For now we have a fixed upper limit on the number we're
3948 * prepared to receive.
3949 */
3950 size = sizeof (__le64) + sizeof (__le32) +
3951 RBD_MAX_SNAP_COUNT * sizeof (__le64);
3952 reply_buf = kzalloc(size, GFP_KERNEL);
3953 if (!reply_buf)
3954 return -ENOMEM;
3955
Alex Elder36be9a72013-01-19 00:30:28 -06003956 ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
Alex Elder41579762013-04-21 12:14:45 -05003957 "rbd", "get_snapcontext", NULL, 0,
Alex Eldere2a58ee2013-04-30 00:44:33 -05003958 reply_buf, size);
Alex Elder36be9a72013-01-19 00:30:28 -06003959 dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
Alex Elder35d489f2012-07-03 16:01:19 -05003960 if (ret < 0)
3961 goto out;
3962
Alex Elder35d489f2012-07-03 16:01:19 -05003963 p = reply_buf;
Alex Elder57385b52013-04-21 12:14:45 -05003964 end = reply_buf + ret;
3965 ret = -ERANGE;
Alex Elder35d489f2012-07-03 16:01:19 -05003966 ceph_decode_64_safe(&p, end, seq, out);
3967 ceph_decode_32_safe(&p, end, snap_count, out);
3968
3969 /*
3970 * Make sure the reported number of snapshot ids wouldn't go
3971 * beyond the end of our buffer. But before checking that,
3972 * make sure the computed size of the snapshot context we
3973 * allocate is representable in a size_t.
3974 */
3975 if (snap_count > (SIZE_MAX - sizeof (struct ceph_snap_context))
3976 / sizeof (u64)) {
3977 ret = -EINVAL;
3978 goto out;
3979 }
3980 if (!ceph_has_room(&p, end, snap_count * sizeof (__le64)))
3981 goto out;
Alex Elder468521c2013-04-26 09:43:47 -05003982 ret = 0;
Alex Elder35d489f2012-07-03 16:01:19 -05003983
Alex Elder812164f82013-04-30 00:44:32 -05003984 snapc = ceph_create_snap_context(snap_count, GFP_KERNEL);
Alex Elder35d489f2012-07-03 16:01:19 -05003985 if (!snapc) {
3986 ret = -ENOMEM;
3987 goto out;
3988 }
Alex Elder35d489f2012-07-03 16:01:19 -05003989 snapc->seq = seq;
Alex Elder35d489f2012-07-03 16:01:19 -05003990 for (i = 0; i < snap_count; i++)
3991 snapc->snaps[i] = ceph_decode_64(&p);
3992
3993 rbd_dev->header.snapc = snapc;
3994
3995 dout(" snap context seq = %llu, snap_count = %u\n",
Alex Elder57385b52013-04-21 12:14:45 -05003996 (unsigned long long)seq, (unsigned int)snap_count);
Alex Elder35d489f2012-07-03 16:01:19 -05003997out:
3998 kfree(reply_buf);
3999
Alex Elder57385b52013-04-21 12:14:45 -05004000 return ret;
Alex Elder35d489f2012-07-03 16:01:19 -05004001}
4002
Alex Elder54cac612013-04-30 00:44:33 -05004003static const char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev,
4004 u64 snap_id)
Alex Elderb8b1e2d2012-07-03 16:01:19 -05004005{
4006 size_t size;
4007 void *reply_buf;
Alex Elder54cac612013-04-30 00:44:33 -05004008 __le64 snapid;
Alex Elderb8b1e2d2012-07-03 16:01:19 -05004009 int ret;
4010 void *p;
4011 void *end;
Alex Elderb8b1e2d2012-07-03 16:01:19 -05004012 char *snap_name;
4013
4014 size = sizeof (__le32) + RBD_MAX_SNAP_NAME_LEN;
4015 reply_buf = kmalloc(size, GFP_KERNEL);
4016 if (!reply_buf)
4017 return ERR_PTR(-ENOMEM);
4018
Alex Elder54cac612013-04-30 00:44:33 -05004019 snapid = cpu_to_le64(snap_id);
Alex Elder36be9a72013-01-19 00:30:28 -06004020 ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
Alex Elderb8b1e2d2012-07-03 16:01:19 -05004021 "rbd", "get_snapshot_name",
Alex Elder54cac612013-04-30 00:44:33 -05004022 &snapid, sizeof (snapid),
Alex Eldere2a58ee2013-04-30 00:44:33 -05004023 reply_buf, size);
Alex Elder36be9a72013-01-19 00:30:28 -06004024 dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
Alex Elderf40eb342013-04-25 15:09:42 -05004025 if (ret < 0) {
4026 snap_name = ERR_PTR(ret);
Alex Elderb8b1e2d2012-07-03 16:01:19 -05004027 goto out;
Alex Elderf40eb342013-04-25 15:09:42 -05004028 }
Alex Elderb8b1e2d2012-07-03 16:01:19 -05004029
4030 p = reply_buf;
Alex Elderf40eb342013-04-25 15:09:42 -05004031 end = reply_buf + ret;
Alex Eldere5c35532012-10-25 23:34:41 -05004032 snap_name = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL);
Alex Elderf40eb342013-04-25 15:09:42 -05004033 if (IS_ERR(snap_name))
Alex Elderb8b1e2d2012-07-03 16:01:19 -05004034 goto out;
Alex Elderb8b1e2d2012-07-03 16:01:19 -05004035
Alex Elderf40eb342013-04-25 15:09:42 -05004036 dout(" snap_id 0x%016llx snap_name = %s\n",
Alex Elder54cac612013-04-30 00:44:33 -05004037 (unsigned long long)snap_id, snap_name);
Alex Elderb8b1e2d2012-07-03 16:01:19 -05004038out:
4039 kfree(reply_buf);
4040
Alex Elderf40eb342013-04-25 15:09:42 -05004041 return snap_name;
Alex Elderb8b1e2d2012-07-03 16:01:19 -05004042}
4043
Alex Eldercc4a38bd2013-04-30 00:44:33 -05004044static int rbd_dev_v2_refresh(struct rbd_device *rbd_dev)
Alex Elder117973f2012-08-31 17:29:55 -05004045{
4046 int ret;
Alex Elder117973f2012-08-31 17:29:55 -05004047
4048 down_write(&rbd_dev->header_rwsem);
4049
Alex Elder117973f2012-08-31 17:29:55 -05004050 ret = rbd_dev_v2_image_size(rbd_dev);
4051 if (ret)
4052 goto out;
Alex Elder117973f2012-08-31 17:29:55 -05004053 rbd_update_mapping_size(rbd_dev);
4054
Alex Eldercc4a38bd2013-04-30 00:44:33 -05004055 ret = rbd_dev_v2_snap_context(rbd_dev);
Alex Elder117973f2012-08-31 17:29:55 -05004056 dout("rbd_dev_v2_snap_context returned %d\n", ret);
4057 if (ret)
4058 goto out;
Alex Elder117973f2012-08-31 17:29:55 -05004059out:
4060 up_write(&rbd_dev->header_rwsem);
4061
4062 return ret;
4063}
4064
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004065static int rbd_bus_add_dev(struct rbd_device *rbd_dev)
4066{
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004067 struct device *dev;
Alex Eldercd789ab2012-08-30 00:16:38 -05004068 int ret;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004069
4070 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004071
Alex Eldercd789ab2012-08-30 00:16:38 -05004072 dev = &rbd_dev->dev;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004073 dev->bus = &rbd_bus_type;
4074 dev->type = &rbd_device_type;
4075 dev->parent = &rbd_root_dev;
Alex Elder200a6a82013-04-28 23:32:34 -05004076 dev->release = rbd_dev_device_release;
Alex Elderde71a292012-07-03 16:01:19 -05004077 dev_set_name(dev, "%d", rbd_dev->dev_id);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004078 ret = device_register(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004079
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004080 mutex_unlock(&ctl_mutex);
Alex Eldercd789ab2012-08-30 00:16:38 -05004081
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004082 return ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004083}
4084
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004085static void rbd_bus_del_dev(struct rbd_device *rbd_dev)
4086{
4087 device_unregister(&rbd_dev->dev);
4088}
4089
Alex Eldere2839302012-08-29 17:11:06 -05004090static atomic64_t rbd_dev_id_max = ATOMIC64_INIT(0);
Alex Elder1ddbe942012-01-29 13:57:44 -06004091
4092/*
Alex Elder499afd52012-02-02 08:13:29 -06004093 * Get a unique rbd identifier for the given new rbd_dev, and add
4094 * the rbd_dev to the global list. The minimum rbd id is 1.
Alex Elder1ddbe942012-01-29 13:57:44 -06004095 */
Alex Eldere2839302012-08-29 17:11:06 -05004096static void rbd_dev_id_get(struct rbd_device *rbd_dev)
Alex Elderb7f23c32012-01-29 13:57:43 -06004097{
Alex Eldere2839302012-08-29 17:11:06 -05004098 rbd_dev->dev_id = atomic64_inc_return(&rbd_dev_id_max);
Alex Elder499afd52012-02-02 08:13:29 -06004099
4100 spin_lock(&rbd_dev_list_lock);
4101 list_add_tail(&rbd_dev->node, &rbd_dev_list);
4102 spin_unlock(&rbd_dev_list_lock);
Alex Eldere2839302012-08-29 17:11:06 -05004103 dout("rbd_dev %p given dev id %llu\n", rbd_dev,
4104 (unsigned long long) rbd_dev->dev_id);
Alex Elder1ddbe942012-01-29 13:57:44 -06004105}
Alex Elderb7f23c32012-01-29 13:57:43 -06004106
Alex Elder1ddbe942012-01-29 13:57:44 -06004107/*
Alex Elder499afd52012-02-02 08:13:29 -06004108 * Remove an rbd_dev from the global list, and record that its
4109 * identifier is no longer in use.
Alex Elder1ddbe942012-01-29 13:57:44 -06004110 */
Alex Eldere2839302012-08-29 17:11:06 -05004111static void rbd_dev_id_put(struct rbd_device *rbd_dev)
Alex Elder1ddbe942012-01-29 13:57:44 -06004112{
Alex Elderd184f6b2012-01-29 13:57:44 -06004113 struct list_head *tmp;
Alex Elderde71a292012-07-03 16:01:19 -05004114 int rbd_id = rbd_dev->dev_id;
Alex Elderd184f6b2012-01-29 13:57:44 -06004115 int max_id;
4116
Alex Elderaafb2302012-09-06 16:00:54 -05004117 rbd_assert(rbd_id > 0);
Alex Elder499afd52012-02-02 08:13:29 -06004118
Alex Eldere2839302012-08-29 17:11:06 -05004119 dout("rbd_dev %p released dev id %llu\n", rbd_dev,
4120 (unsigned long long) rbd_dev->dev_id);
Alex Elder499afd52012-02-02 08:13:29 -06004121 spin_lock(&rbd_dev_list_lock);
4122 list_del_init(&rbd_dev->node);
Alex Elderd184f6b2012-01-29 13:57:44 -06004123
4124 /*
4125 * If the id being "put" is not the current maximum, there
4126 * is nothing special we need to do.
4127 */
Alex Eldere2839302012-08-29 17:11:06 -05004128 if (rbd_id != atomic64_read(&rbd_dev_id_max)) {
Alex Elderd184f6b2012-01-29 13:57:44 -06004129 spin_unlock(&rbd_dev_list_lock);
4130 return;
4131 }
4132
4133 /*
4134 * We need to update the current maximum id. Search the
4135 * list to find out what it is. We're more likely to find
4136 * the maximum at the end, so search the list backward.
4137 */
4138 max_id = 0;
4139 list_for_each_prev(tmp, &rbd_dev_list) {
4140 struct rbd_device *rbd_dev;
4141
4142 rbd_dev = list_entry(tmp, struct rbd_device, node);
Alex Elderb213e0b2012-10-10 21:19:13 -07004143 if (rbd_dev->dev_id > max_id)
4144 max_id = rbd_dev->dev_id;
Alex Elderd184f6b2012-01-29 13:57:44 -06004145 }
Alex Elder499afd52012-02-02 08:13:29 -06004146 spin_unlock(&rbd_dev_list_lock);
Alex Elderb7f23c32012-01-29 13:57:43 -06004147
Alex Elder1ddbe942012-01-29 13:57:44 -06004148 /*
Alex Eldere2839302012-08-29 17:11:06 -05004149 * The max id could have been updated by rbd_dev_id_get(), in
Alex Elderd184f6b2012-01-29 13:57:44 -06004150 * which case it now accurately reflects the new maximum.
4151 * Be careful not to overwrite the maximum value in that
4152 * case.
Alex Elder1ddbe942012-01-29 13:57:44 -06004153 */
Alex Eldere2839302012-08-29 17:11:06 -05004154 atomic64_cmpxchg(&rbd_dev_id_max, rbd_id, max_id);
4155 dout(" max dev id has been reset\n");
Alex Elderb7f23c32012-01-29 13:57:43 -06004156}
4157
/*
 * Advances *buf past any leading white space so that it points at
 * the first non-space character (or at the terminating '\0' if there
 * is none).  Returns the length of the token (run of non-white space
 * characters) that starts there, which is 0 if no token remains.
 * The string at *buf must be terminated with '\0'.
 */
static inline size_t next_token(const char **buf)
{
	/*
	 * The characters for which isspace() is nonzero in the "C"
	 * and "POSIX" locales.
	 */
	static const char whitespace[] = " \f\n\r\t\v";
	const char *start = *buf + strspn(*buf, whitespace);

	*buf = start;				/* Start of the token */

	return strcspn(start, whitespace);	/* Length of the token */
}
4176
/*
 * Locates the next token in *buf and, provided it fits (together
 * with a terminating '\0') in the token buffer supplied, copies it
 * there.  A copied token is always terminated with '\0'.  The string
 * at *buf must be terminated with '\0' on entry.
 *
 * Returns the length of the token found, not counting the '\0'.
 * The result is 0 if there is no token, and is >= token_size if the
 * token did not fit, in which case the token buffer is left alone.
 *
 * On return *buf points just beyond the end of the token found,
 * whether or not the token was copied.
 */
static inline size_t copy_token(const char **buf,
				char *token,
				size_t token_size)
{
	size_t token_len = next_token(buf);

	if (token_len < token_size) {
		memcpy(token, *buf, token_len);
		token[token_len] = '\0';
	}
	*buf += token_len;

	return token_len;
}
4206
4207/*
Alex Elderea3352f2012-07-09 21:04:23 -05004208 * Finds the next token in *buf, dynamically allocates a buffer big
4209 * enough to hold a copy of it, and copies the token into the new
4210 * buffer. The copy is guaranteed to be terminated with '\0'. Note
4211 * that a duplicate buffer is created even for a zero-length token.
4212 *
4213 * Returns a pointer to the newly-allocated duplicate, or a null
4214 * pointer if memory for the duplicate was not available. If
4215 * the lenp argument is a non-null pointer, the length of the token
4216 * (not including the '\0') is returned in *lenp.
4217 *
4218 * If successful, the *buf pointer will be updated to point beyond
4219 * the end of the found token.
4220 *
4221 * Note: uses GFP_KERNEL for allocation.
4222 */
4223static inline char *dup_token(const char **buf, size_t *lenp)
4224{
4225 char *dup;
4226 size_t len;
4227
4228 len = next_token(buf);
Alex Elder4caf35f2012-11-01 08:39:27 -05004229 dup = kmemdup(*buf, len + 1, GFP_KERNEL);
Alex Elderea3352f2012-07-09 21:04:23 -05004230 if (!dup)
4231 return NULL;
Alex Elderea3352f2012-07-09 21:04:23 -05004232 *(dup + len) = '\0';
4233 *buf += len;
4234
4235 if (lenp)
4236 *lenp = len;
4237
4238 return dup;
4239}
4240
4241/*
Alex Elder859c31d2012-10-25 23:34:42 -05004242 * Parse the options provided for an "rbd add" (i.e., rbd image
4243 * mapping) request. These arrive via a write to /sys/bus/rbd/add,
4244 * and the data written is passed here via a NUL-terminated buffer.
4245 * Returns 0 if successful or an error code otherwise.
Alex Elderd22f76e2012-07-12 10:46:35 -05004246 *
Alex Elder859c31d2012-10-25 23:34:42 -05004247 * The information extracted from these options is recorded in
4248 * the other parameters which return dynamically-allocated
4249 * structures:
4250 * ceph_opts
4251 * The address of a pointer that will refer to a ceph options
4252 * structure. Caller must release the returned pointer using
4253 * ceph_destroy_options() when it is no longer needed.
4254 * rbd_opts
4255 * Address of an rbd options pointer. Fully initialized by
4256 * this function; caller must release with kfree().
4257 * spec
4258 * Address of an rbd image specification pointer. Fully
4259 * initialized by this function based on parsed options.
4260 * Caller must release with rbd_spec_put().
4261 *
4262 * The options passed take this form:
4263 * <mon_addrs> <options> <pool_name> <image_name> [<snap_id>]
4264 * where:
4265 * <mon_addrs>
4266 * A comma-separated list of one or more monitor addresses.
4267 * A monitor address is an ip address, optionally followed
4268 * by a port number (separated by a colon).
4269 * I.e.: ip1[:port1][,ip2[:port2]...]
4270 * <options>
4271 * A comma-separated list of ceph and/or rbd options.
4272 * <pool_name>
4273 * The name of the rados pool containing the rbd image.
4274 * <image_name>
4275 * The name of the image in that pool to map.
4276 * <snap_id>
4277 * An optional snapshot id. If provided, the mapping will
4278 * present data from the image at the time that snapshot was
4279 * created. The image head is used if no snapshot id is
4280 * provided. Snapshot mappings are always read-only.
Alex Eldera725f65e2012-02-02 08:13:30 -06004281 */
Alex Elder859c31d2012-10-25 23:34:42 -05004282static int rbd_add_parse_args(const char *buf,
Alex Elderdc79b112012-10-25 23:34:41 -05004283 struct ceph_options **ceph_opts,
Alex Elder859c31d2012-10-25 23:34:42 -05004284 struct rbd_options **opts,
4285 struct rbd_spec **rbd_spec)
Alex Eldera725f65e2012-02-02 08:13:30 -06004286{
Alex Elderd22f76e2012-07-12 10:46:35 -05004287 size_t len;
Alex Elder859c31d2012-10-25 23:34:42 -05004288 char *options;
Alex Elder0ddebc02012-10-25 23:34:41 -05004289 const char *mon_addrs;
Alex Elderecb4dc22013-04-26 09:43:47 -05004290 char *snap_name;
Alex Elder0ddebc02012-10-25 23:34:41 -05004291 size_t mon_addrs_size;
Alex Elder859c31d2012-10-25 23:34:42 -05004292 struct rbd_spec *spec = NULL;
Alex Elder4e9afeb2012-10-25 23:34:41 -05004293 struct rbd_options *rbd_opts = NULL;
Alex Elder859c31d2012-10-25 23:34:42 -05004294 struct ceph_options *copts;
Alex Elderdc79b112012-10-25 23:34:41 -05004295 int ret;
Alex Eldere28fff262012-02-02 08:13:30 -06004296
4297 /* The first four tokens are required */
4298
Alex Elder7ef32142012-02-02 08:13:30 -06004299 len = next_token(&buf);
Alex Elder4fb5d6712012-11-01 10:17:15 -05004300 if (!len) {
4301 rbd_warn(NULL, "no monitor address(es) provided");
4302 return -EINVAL;
4303 }
Alex Elder0ddebc02012-10-25 23:34:41 -05004304 mon_addrs = buf;
Alex Elderf28e5652012-10-25 23:34:41 -05004305 mon_addrs_size = len + 1;
Alex Elder7ef32142012-02-02 08:13:30 -06004306 buf += len;
Alex Eldera725f65e2012-02-02 08:13:30 -06004307
Alex Elderdc79b112012-10-25 23:34:41 -05004308 ret = -EINVAL;
Alex Elderf28e5652012-10-25 23:34:41 -05004309 options = dup_token(&buf, NULL);
4310 if (!options)
Alex Elderdc79b112012-10-25 23:34:41 -05004311 return -ENOMEM;
Alex Elder4fb5d6712012-11-01 10:17:15 -05004312 if (!*options) {
4313 rbd_warn(NULL, "no options provided");
4314 goto out_err;
4315 }
Alex Eldera725f65e2012-02-02 08:13:30 -06004316
Alex Elder859c31d2012-10-25 23:34:42 -05004317 spec = rbd_spec_alloc();
4318 if (!spec)
Alex Elderf28e5652012-10-25 23:34:41 -05004319 goto out_mem;
Alex Elder859c31d2012-10-25 23:34:42 -05004320
4321 spec->pool_name = dup_token(&buf, NULL);
4322 if (!spec->pool_name)
4323 goto out_mem;
Alex Elder4fb5d6712012-11-01 10:17:15 -05004324 if (!*spec->pool_name) {
4325 rbd_warn(NULL, "no pool name provided");
4326 goto out_err;
4327 }
Alex Eldere28fff262012-02-02 08:13:30 -06004328
Alex Elder69e7a022012-11-01 08:39:26 -05004329 spec->image_name = dup_token(&buf, NULL);
Alex Elder859c31d2012-10-25 23:34:42 -05004330 if (!spec->image_name)
Alex Elderf28e5652012-10-25 23:34:41 -05004331 goto out_mem;
Alex Elder4fb5d6712012-11-01 10:17:15 -05004332 if (!*spec->image_name) {
4333 rbd_warn(NULL, "no image name provided");
4334 goto out_err;
4335 }
Alex Eldere28fff262012-02-02 08:13:30 -06004336
Alex Elderf28e5652012-10-25 23:34:41 -05004337 /*
4338 * Snapshot name is optional; default is to use "-"
4339 * (indicating the head/no snapshot).
4340 */
Alex Elder3feeb8942012-08-31 17:29:52 -05004341 len = next_token(&buf);
Alex Elder820a5f32012-07-09 21:04:24 -05004342 if (!len) {
Alex Elder3feeb8942012-08-31 17:29:52 -05004343 buf = RBD_SNAP_HEAD_NAME; /* No snapshot supplied */
4344 len = sizeof (RBD_SNAP_HEAD_NAME) - 1;
Alex Elderf28e5652012-10-25 23:34:41 -05004345 } else if (len > RBD_MAX_SNAP_NAME_LEN) {
Alex Elderdc79b112012-10-25 23:34:41 -05004346 ret = -ENAMETOOLONG;
Alex Elderf28e5652012-10-25 23:34:41 -05004347 goto out_err;
Alex Elder849b4262012-07-09 21:04:24 -05004348 }
Alex Elderecb4dc22013-04-26 09:43:47 -05004349 snap_name = kmemdup(buf, len + 1, GFP_KERNEL);
4350 if (!snap_name)
Alex Elderf28e5652012-10-25 23:34:41 -05004351 goto out_mem;
Alex Elderecb4dc22013-04-26 09:43:47 -05004352 *(snap_name + len) = '\0';
4353 spec->snap_name = snap_name;
Alex Eldere5c35532012-10-25 23:34:41 -05004354
Alex Elder0ddebc02012-10-25 23:34:41 -05004355 /* Initialize all rbd options to the defaults */
Alex Eldere28fff262012-02-02 08:13:30 -06004356
Alex Elder4e9afeb2012-10-25 23:34:41 -05004357 rbd_opts = kzalloc(sizeof (*rbd_opts), GFP_KERNEL);
4358 if (!rbd_opts)
4359 goto out_mem;
4360
4361 rbd_opts->read_only = RBD_READ_ONLY_DEFAULT;
Alex Elderd22f76e2012-07-12 10:46:35 -05004362
Alex Elder859c31d2012-10-25 23:34:42 -05004363 copts = ceph_parse_options(options, mon_addrs,
Alex Elder0ddebc02012-10-25 23:34:41 -05004364 mon_addrs + mon_addrs_size - 1,
Alex Elder4e9afeb2012-10-25 23:34:41 -05004365 parse_rbd_opts_token, rbd_opts);
Alex Elder859c31d2012-10-25 23:34:42 -05004366 if (IS_ERR(copts)) {
4367 ret = PTR_ERR(copts);
Alex Elderdc79b112012-10-25 23:34:41 -05004368 goto out_err;
4369 }
Alex Elder859c31d2012-10-25 23:34:42 -05004370 kfree(options);
4371
4372 *ceph_opts = copts;
Alex Elder4e9afeb2012-10-25 23:34:41 -05004373 *opts = rbd_opts;
Alex Elder859c31d2012-10-25 23:34:42 -05004374 *rbd_spec = spec;
Alex Elder0ddebc02012-10-25 23:34:41 -05004375
Alex Elderdc79b112012-10-25 23:34:41 -05004376 return 0;
Alex Elderf28e5652012-10-25 23:34:41 -05004377out_mem:
Alex Elderdc79b112012-10-25 23:34:41 -05004378 ret = -ENOMEM;
Alex Elderd22f76e2012-07-12 10:46:35 -05004379out_err:
Alex Elder859c31d2012-10-25 23:34:42 -05004380 kfree(rbd_opts);
4381 rbd_spec_put(spec);
Alex Elderf28e5652012-10-25 23:34:41 -05004382 kfree(options);
Alex Elderd22f76e2012-07-12 10:46:35 -05004383
Alex Elderdc79b112012-10-25 23:34:41 -05004384 return ret;
Alex Eldera725f65e2012-02-02 08:13:30 -06004385}
4386
Alex Elder589d30e2012-07-10 20:30:11 -05004387/*
4388 * An rbd format 2 image has a unique identifier, distinct from the
4389 * name given to it by the user. Internally, that identifier is
4390 * what's used to specify the names of objects related to the image.
4391 *
4392 * A special "rbd id" object is used to map an rbd image name to its
4393 * id. If that object doesn't exist, then there is no v2 rbd image
4394 * with the supplied name.
4395 *
4396 * This function will record the given rbd_dev's image_id field if
4397 * it can be determined, and in that case will return 0. If any
4398 * errors occur a negative errno will be returned and the rbd_dev's
4399 * image_id field will be unchanged (and should be NULL).
4400 */
4401static int rbd_dev_image_id(struct rbd_device *rbd_dev)
4402{
4403 int ret;
4404 size_t size;
4405 char *object_name;
4406 void *response;
Alex Elderc0fba362013-04-25 23:15:08 -05004407 char *image_id;
Alex Elder2f82ee52012-10-30 19:40:33 -05004408
Alex Elder589d30e2012-07-10 20:30:11 -05004409 /*
Alex Elder2c0d0a12012-10-30 19:40:33 -05004410 * When probing a parent image, the image id is already
4411 * known (and the image name likely is not). There's no
Alex Elderc0fba362013-04-25 23:15:08 -05004412 * need to fetch the image id again in this case. We
4413 * do still need to set the image format though.
Alex Elder2c0d0a12012-10-30 19:40:33 -05004414 */
Alex Elderc0fba362013-04-25 23:15:08 -05004415 if (rbd_dev->spec->image_id) {
4416 rbd_dev->image_format = *rbd_dev->spec->image_id ? 2 : 1;
4417
Alex Elder2c0d0a12012-10-30 19:40:33 -05004418 return 0;
Alex Elderc0fba362013-04-25 23:15:08 -05004419 }
Alex Elder2c0d0a12012-10-30 19:40:33 -05004420
4421 /*
Alex Elder589d30e2012-07-10 20:30:11 -05004422 * First, see if the format 2 image id file exists, and if
4423 * so, get the image's persistent id from it.
4424 */
Alex Elder69e7a022012-11-01 08:39:26 -05004425 size = sizeof (RBD_ID_PREFIX) + strlen(rbd_dev->spec->image_name);
Alex Elder589d30e2012-07-10 20:30:11 -05004426 object_name = kmalloc(size, GFP_NOIO);
4427 if (!object_name)
4428 return -ENOMEM;
Alex Elder0d7dbfc2012-10-25 23:34:41 -05004429 sprintf(object_name, "%s%s", RBD_ID_PREFIX, rbd_dev->spec->image_name);
Alex Elder589d30e2012-07-10 20:30:11 -05004430 dout("rbd id object name is %s\n", object_name);
4431
4432 /* Response will be an encoded string, which includes a length */
4433
4434 size = sizeof (__le32) + RBD_IMAGE_ID_LEN_MAX;
4435 response = kzalloc(size, GFP_NOIO);
4436 if (!response) {
4437 ret = -ENOMEM;
4438 goto out;
4439 }
4440
Alex Elderc0fba362013-04-25 23:15:08 -05004441 /* If it doesn't exist we'll assume it's a format 1 image */
4442
Alex Elder36be9a72013-01-19 00:30:28 -06004443 ret = rbd_obj_method_sync(rbd_dev, object_name,
Alex Elder41579762013-04-21 12:14:45 -05004444 "rbd", "get_id", NULL, 0,
Alex Eldere2a58ee2013-04-30 00:44:33 -05004445 response, RBD_IMAGE_ID_LEN_MAX);
Alex Elder36be9a72013-01-19 00:30:28 -06004446 dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
Alex Elderc0fba362013-04-25 23:15:08 -05004447 if (ret == -ENOENT) {
4448 image_id = kstrdup("", GFP_KERNEL);
4449 ret = image_id ? 0 : -ENOMEM;
4450 if (!ret)
4451 rbd_dev->image_format = 1;
4452 } else if (ret > sizeof (__le32)) {
4453 void *p = response;
Alex Elder589d30e2012-07-10 20:30:11 -05004454
Alex Elderc0fba362013-04-25 23:15:08 -05004455 image_id = ceph_extract_encoded_string(&p, p + ret,
Alex Elder979ed482012-11-01 08:39:26 -05004456 NULL, GFP_NOIO);
Alex Elderc0fba362013-04-25 23:15:08 -05004457 ret = IS_ERR(image_id) ? PTR_ERR(image_id) : 0;
4458 if (!ret)
4459 rbd_dev->image_format = 2;
Alex Elder589d30e2012-07-10 20:30:11 -05004460 } else {
Alex Elderc0fba362013-04-25 23:15:08 -05004461 ret = -EINVAL;
4462 }
4463
4464 if (!ret) {
4465 rbd_dev->spec->image_id = image_id;
4466 dout("image_id is %s\n", image_id);
Alex Elder589d30e2012-07-10 20:30:11 -05004467 }
4468out:
4469 kfree(response);
4470 kfree(object_name);
4471
4472 return ret;
4473}
4474
Alex Elder6fd48b32013-04-28 23:32:34 -05004475/* Undo whatever state changes are made by v1 or v2 image probe */
4476
4477static void rbd_dev_unprobe(struct rbd_device *rbd_dev)
4478{
4479 struct rbd_image_header *header;
4480
4481 rbd_dev_remove_parent(rbd_dev);
4482 rbd_spec_put(rbd_dev->parent_spec);
4483 rbd_dev->parent_spec = NULL;
4484 rbd_dev->parent_overlap = 0;
4485
4486 /* Free dynamic fields from the header, then zero it out */
4487
4488 header = &rbd_dev->header;
Alex Elder812164f82013-04-30 00:44:32 -05004489 ceph_put_snap_context(header->snapc);
Alex Elder6fd48b32013-04-28 23:32:34 -05004490 kfree(header->snap_sizes);
4491 kfree(header->snap_names);
4492 kfree(header->object_prefix);
4493 memset(header, 0, sizeof (*header));
4494}
4495
Alex Eldera30b71b2012-07-10 20:30:11 -05004496static int rbd_dev_v1_probe(struct rbd_device *rbd_dev)
4497{
4498 int ret;
Alex Eldera30b71b2012-07-10 20:30:11 -05004499
4500 /* Populate rbd image metadata */
4501
4502 ret = rbd_read_header(rbd_dev, &rbd_dev->header);
4503 if (ret < 0)
4504 goto out_err;
Alex Elder86b00e02012-10-25 23:34:42 -05004505
4506 /* Version 1 images have no parent (no layering) */
4507
4508 rbd_dev->parent_spec = NULL;
4509 rbd_dev->parent_overlap = 0;
4510
Alex Eldera30b71b2012-07-10 20:30:11 -05004511 dout("discovered version 1 image, header name is %s\n",
4512 rbd_dev->header_name);
4513
4514 return 0;
4515
4516out_err:
4517 kfree(rbd_dev->header_name);
4518 rbd_dev->header_name = NULL;
Alex Elder0d7dbfc2012-10-25 23:34:41 -05004519 kfree(rbd_dev->spec->image_id);
4520 rbd_dev->spec->image_id = NULL;
Alex Eldera30b71b2012-07-10 20:30:11 -05004521
4522 return ret;
4523}
4524
4525static int rbd_dev_v2_probe(struct rbd_device *rbd_dev)
4526{
Alex Elder9d475de2012-07-03 16:01:19 -05004527 int ret;
Alex Eldera30b71b2012-07-10 20:30:11 -05004528
Alex Elder9d475de2012-07-03 16:01:19 -05004529 ret = rbd_dev_v2_image_size(rbd_dev);
Alex Elder57385b52013-04-21 12:14:45 -05004530 if (ret)
Alex Elder9d475de2012-07-03 16:01:19 -05004531 goto out_err;
Alex Elder1e130192012-07-03 16:01:19 -05004532
4533 /* Get the object prefix (a.k.a. block_name) for the image */
4534
4535 ret = rbd_dev_v2_object_prefix(rbd_dev);
Alex Elder57385b52013-04-21 12:14:45 -05004536 if (ret)
Alex Elder1e130192012-07-03 16:01:19 -05004537 goto out_err;
Alex Elderb1b54022012-07-03 16:01:19 -05004538
Alex Elderd8891402012-10-09 13:50:17 -07004539 /* Get the and check features for the image */
Alex Elderb1b54022012-07-03 16:01:19 -05004540
4541 ret = rbd_dev_v2_features(rbd_dev);
Alex Elder57385b52013-04-21 12:14:45 -05004542 if (ret)
Alex Elderb1b54022012-07-03 16:01:19 -05004543 goto out_err;
Alex Elder35d489f2012-07-03 16:01:19 -05004544
Alex Elder86b00e02012-10-25 23:34:42 -05004545 /* If the image supports layering, get the parent info */
4546
4547 if (rbd_dev->header.features & RBD_FEATURE_LAYERING) {
4548 ret = rbd_dev_v2_parent_info(rbd_dev);
Alex Elder57385b52013-04-21 12:14:45 -05004549 if (ret)
Alex Elder86b00e02012-10-25 23:34:42 -05004550 goto out_err;
Alex Elder96882f52013-04-30 00:44:32 -05004551
4552 /*
4553 * Don't print a warning for parent images. We can
4554 * tell this point because we won't know its pool
4555 * name yet (just its pool id).
4556 */
4557 if (rbd_dev->spec->pool_name)
4558 rbd_warn(rbd_dev, "WARNING: kernel layering "
4559 "is EXPERIMENTAL!");
Alex Elder86b00e02012-10-25 23:34:42 -05004560 }
4561
Alex Eldercc070d52013-04-21 12:14:45 -05004562 /* If the image supports fancy striping, get its parameters */
4563
4564 if (rbd_dev->header.features & RBD_FEATURE_STRIPINGV2) {
4565 ret = rbd_dev_v2_striping_info(rbd_dev);
4566 if (ret < 0)
4567 goto out_err;
4568 }
4569
Alex Elder6e14b1a2012-07-03 16:01:19 -05004570 /* crypto and compression type aren't (yet) supported for v2 images */
Alex Elder35d489f2012-07-03 16:01:19 -05004571
Alex Elder6e14b1a2012-07-03 16:01:19 -05004572 rbd_dev->header.crypt_type = 0;
4573 rbd_dev->header.comp_type = 0;
4574
4575 /* Get the snapshot context, plus the header version */
4576
Alex Eldercc4a38bd2013-04-30 00:44:33 -05004577 ret = rbd_dev_v2_snap_context(rbd_dev);
Alex Elder35d489f2012-07-03 16:01:19 -05004578 if (ret)
4579 goto out_err;
Alex Elder6e14b1a2012-07-03 16:01:19 -05004580
Alex Eldera30b71b2012-07-10 20:30:11 -05004581 dout("discovered version 2 image, header name is %s\n",
4582 rbd_dev->header_name);
4583
Alex Elder35152972012-08-31 17:29:55 -05004584 return 0;
Alex Elder9d475de2012-07-03 16:01:19 -05004585out_err:
Alex Elder86b00e02012-10-25 23:34:42 -05004586 rbd_dev->parent_overlap = 0;
4587 rbd_spec_put(rbd_dev->parent_spec);
4588 rbd_dev->parent_spec = NULL;
Alex Elder9d475de2012-07-03 16:01:19 -05004589 kfree(rbd_dev->header_name);
4590 rbd_dev->header_name = NULL;
Alex Elder1e130192012-07-03 16:01:19 -05004591 kfree(rbd_dev->header.object_prefix);
4592 rbd_dev->header.object_prefix = NULL;
Alex Elder9d475de2012-07-03 16:01:19 -05004593
4594 return ret;
Alex Eldera30b71b2012-07-10 20:30:11 -05004595}
4596
Alex Elder124afba2013-04-26 15:44:36 -05004597static int rbd_dev_probe_parent(struct rbd_device *rbd_dev)
Alex Elder83a06262012-10-30 15:47:17 -05004598{
Alex Elder2f82ee52012-10-30 19:40:33 -05004599 struct rbd_device *parent = NULL;
Alex Elder124afba2013-04-26 15:44:36 -05004600 struct rbd_spec *parent_spec;
4601 struct rbd_client *rbdc;
4602 int ret;
4603
4604 if (!rbd_dev->parent_spec)
4605 return 0;
4606 /*
4607 * We need to pass a reference to the client and the parent
4608 * spec when creating the parent rbd_dev. Images related by
4609 * parent/child relationships always share both.
4610 */
4611 parent_spec = rbd_spec_get(rbd_dev->parent_spec);
4612 rbdc = __rbd_get_client(rbd_dev->rbd_client);
4613
4614 ret = -ENOMEM;
4615 parent = rbd_dev_create(rbdc, parent_spec);
4616 if (!parent)
4617 goto out_err;
4618
4619 ret = rbd_dev_image_probe(parent);
4620 if (ret < 0)
4621 goto out_err;
4622 rbd_dev->parent = parent;
4623
4624 return 0;
4625out_err:
4626 if (parent) {
4627 rbd_spec_put(rbd_dev->parent_spec);
4628 kfree(rbd_dev->header_name);
4629 rbd_dev_destroy(parent);
4630 } else {
4631 rbd_put_client(rbdc);
4632 rbd_spec_put(parent_spec);
4633 }
4634
4635 return ret;
4636}
4637
Alex Elder200a6a82013-04-28 23:32:34 -05004638static int rbd_dev_device_setup(struct rbd_device *rbd_dev)
Alex Elder124afba2013-04-26 15:44:36 -05004639{
Alex Elder83a06262012-10-30 15:47:17 -05004640 int ret;
Alex Elder83a06262012-10-30 15:47:17 -05004641
Alex Elderd1cf5782013-04-27 09:59:30 -05004642 ret = rbd_dev_mapping_set(rbd_dev);
Alex Elder83a06262012-10-30 15:47:17 -05004643 if (ret)
Alex Elder9bb81c92013-04-27 09:59:30 -05004644 return ret;
Alex Elder5de10f32013-04-26 15:44:37 -05004645
Alex Elder83a06262012-10-30 15:47:17 -05004646 /* generate unique id: find highest unique id, add one */
4647 rbd_dev_id_get(rbd_dev);
4648
4649 /* Fill in the device name, now that we have its id. */
4650 BUILD_BUG_ON(DEV_NAME_LEN
4651 < sizeof (RBD_DRV_NAME) + MAX_INT_FORMAT_WIDTH);
4652 sprintf(rbd_dev->name, "%s%d", RBD_DRV_NAME, rbd_dev->dev_id);
4653
4654 /* Get our block major device number. */
4655
4656 ret = register_blkdev(0, rbd_dev->name);
4657 if (ret < 0)
4658 goto err_out_id;
4659 rbd_dev->major = ret;
4660
4661 /* Set up the blkdev mapping. */
4662
4663 ret = rbd_init_disk(rbd_dev);
4664 if (ret)
4665 goto err_out_blkdev;
4666
4667 ret = rbd_bus_add_dev(rbd_dev);
4668 if (ret)
4669 goto err_out_disk;
4670
Alex Elder83a06262012-10-30 15:47:17 -05004671 /* Everything's ready. Announce the disk to the world. */
4672
Alex Elderb5156e72013-04-26 15:44:36 -05004673 set_capacity(rbd_dev->disk, rbd_dev->mapping.size / SECTOR_SIZE);
Alex Elder129b79d2013-04-26 15:44:36 -05004674 set_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags);
Alex Elder83a06262012-10-30 15:47:17 -05004675 add_disk(rbd_dev->disk);
4676
4677 pr_info("%s: added with size 0x%llx\n", rbd_dev->disk->disk_name,
4678 (unsigned long long) rbd_dev->mapping.size);
4679
4680 return ret;
Alex Elder2f82ee52012-10-30 19:40:33 -05004681
Alex Elder83a06262012-10-30 15:47:17 -05004682err_out_disk:
4683 rbd_free_disk(rbd_dev);
4684err_out_blkdev:
4685 unregister_blkdev(rbd_dev->major, rbd_dev->name);
4686err_out_id:
4687 rbd_dev_id_put(rbd_dev);
Alex Elderd1cf5782013-04-27 09:59:30 -05004688 rbd_dev_mapping_clear(rbd_dev);
Alex Elder83a06262012-10-30 15:47:17 -05004689
4690 return ret;
4691}
4692
Alex Elder332bb122013-04-27 09:59:30 -05004693static int rbd_dev_header_name(struct rbd_device *rbd_dev)
4694{
4695 struct rbd_spec *spec = rbd_dev->spec;
4696 size_t size;
4697
4698 /* Record the header object name for this rbd image. */
4699
4700 rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
4701
4702 if (rbd_dev->image_format == 1)
4703 size = strlen(spec->image_name) + sizeof (RBD_SUFFIX);
4704 else
4705 size = sizeof (RBD_HEADER_PREFIX) + strlen(spec->image_id);
4706
4707 rbd_dev->header_name = kmalloc(size, GFP_KERNEL);
4708 if (!rbd_dev->header_name)
4709 return -ENOMEM;
4710
4711 if (rbd_dev->image_format == 1)
4712 sprintf(rbd_dev->header_name, "%s%s",
4713 spec->image_name, RBD_SUFFIX);
4714 else
4715 sprintf(rbd_dev->header_name, "%s%s",
4716 RBD_HEADER_PREFIX, spec->image_id);
4717 return 0;
4718}
4719
Alex Elder200a6a82013-04-28 23:32:34 -05004720static void rbd_dev_image_release(struct rbd_device *rbd_dev)
4721{
Alex Elder6fd48b32013-04-28 23:32:34 -05004722 int ret;
4723
Alex Elder6fd48b32013-04-28 23:32:34 -05004724 rbd_dev_unprobe(rbd_dev);
4725 ret = rbd_dev_header_watch_sync(rbd_dev, 0);
4726 if (ret)
4727 rbd_warn(rbd_dev, "failed to cancel watch event (%d)\n", ret);
Alex Elder200a6a82013-04-28 23:32:34 -05004728 kfree(rbd_dev->header_name);
Alex Elder6fd48b32013-04-28 23:32:34 -05004729 rbd_dev->header_name = NULL;
4730 rbd_dev->image_format = 0;
4731 kfree(rbd_dev->spec->image_id);
4732 rbd_dev->spec->image_id = NULL;
4733
Alex Elder200a6a82013-04-28 23:32:34 -05004734 rbd_dev_destroy(rbd_dev);
4735}
4736
Alex Eldera30b71b2012-07-10 20:30:11 -05004737/*
4738 * Probe for the existence of the header object for the given rbd
4739 * device. For format 2 images this includes determining the image
4740 * id.
4741 */
Alex Elder71f293e2013-04-26 09:43:48 -05004742static int rbd_dev_image_probe(struct rbd_device *rbd_dev)
Alex Eldera30b71b2012-07-10 20:30:11 -05004743{
4744 int ret;
Alex Elderb644de22013-04-27 09:59:31 -05004745 int tmp;
Alex Eldera30b71b2012-07-10 20:30:11 -05004746
4747 /*
4748 * Get the id from the image id object. If it's not a
4749 * format 2 image, we'll get ENOENT back, and we'll assume
4750 * it's a format 1 image.
4751 */
4752 ret = rbd_dev_image_id(rbd_dev);
4753 if (ret)
Alex Elderc0fba362013-04-25 23:15:08 -05004754 return ret;
4755 rbd_assert(rbd_dev->spec->image_id);
4756 rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
4757
Alex Elder332bb122013-04-27 09:59:30 -05004758 ret = rbd_dev_header_name(rbd_dev);
4759 if (ret)
4760 goto err_out_format;
4761
Alex Elderb644de22013-04-27 09:59:31 -05004762 ret = rbd_dev_header_watch_sync(rbd_dev, 1);
4763 if (ret)
4764 goto out_header_name;
4765
Alex Elderc0fba362013-04-25 23:15:08 -05004766 if (rbd_dev->image_format == 1)
Alex Eldera30b71b2012-07-10 20:30:11 -05004767 ret = rbd_dev_v1_probe(rbd_dev);
4768 else
4769 ret = rbd_dev_v2_probe(rbd_dev);
Alex Elder5655c4d2013-04-25 23:15:08 -05004770 if (ret)
Alex Elderb644de22013-04-27 09:59:31 -05004771 goto err_out_watch;
Alex Elder83a06262012-10-30 15:47:17 -05004772
Alex Elder9bb81c92013-04-27 09:59:30 -05004773 ret = rbd_dev_spec_update(rbd_dev);
4774 if (ret)
Alex Elder33dca392013-04-30 00:44:33 -05004775 goto err_out_probe;
Alex Elder9bb81c92013-04-27 09:59:30 -05004776
4777 ret = rbd_dev_probe_parent(rbd_dev);
Alex Elder6fd48b32013-04-28 23:32:34 -05004778 if (!ret)
4779 return 0;
Alex Elder83a06262012-10-30 15:47:17 -05004780
Alex Elder6fd48b32013-04-28 23:32:34 -05004781err_out_probe:
4782 rbd_dev_unprobe(rbd_dev);
Alex Elderb644de22013-04-27 09:59:31 -05004783err_out_watch:
4784 tmp = rbd_dev_header_watch_sync(rbd_dev, 0);
4785 if (tmp)
4786 rbd_warn(rbd_dev, "unable to tear down watch request\n");
Alex Elder332bb122013-04-27 09:59:30 -05004787out_header_name:
4788 kfree(rbd_dev->header_name);
4789 rbd_dev->header_name = NULL;
4790err_out_format:
4791 rbd_dev->image_format = 0;
Alex Elder5655c4d2013-04-25 23:15:08 -05004792 kfree(rbd_dev->spec->image_id);
4793 rbd_dev->spec->image_id = NULL;
4794
4795 dout("probe failed, returning %d\n", ret);
4796
4797 return ret;
Alex Eldera30b71b2012-07-10 20:30:11 -05004798}
4799
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07004800static ssize_t rbd_add(struct bus_type *bus,
4801 const char *buf,
4802 size_t count)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004803{
Alex Eldercb8627c2012-07-09 21:04:23 -05004804 struct rbd_device *rbd_dev = NULL;
Alex Elderdc79b112012-10-25 23:34:41 -05004805 struct ceph_options *ceph_opts = NULL;
Alex Elder4e9afeb2012-10-25 23:34:41 -05004806 struct rbd_options *rbd_opts = NULL;
Alex Elder859c31d2012-10-25 23:34:42 -05004807 struct rbd_spec *spec = NULL;
Alex Elder9d3997f2012-10-25 23:34:42 -05004808 struct rbd_client *rbdc;
Alex Elder27cc2592012-02-02 08:13:30 -06004809 struct ceph_osd_client *osdc;
4810 int rc = -ENOMEM;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004811
4812 if (!try_module_get(THIS_MODULE))
4813 return -ENODEV;
4814
Alex Eldera725f65e2012-02-02 08:13:30 -06004815 /* parse add command */
Alex Elder859c31d2012-10-25 23:34:42 -05004816 rc = rbd_add_parse_args(buf, &ceph_opts, &rbd_opts, &spec);
Alex Elderdc79b112012-10-25 23:34:41 -05004817 if (rc < 0)
Alex Elderbd4ba652012-10-25 23:34:42 -05004818 goto err_out_module;
Alex Eldera725f65e2012-02-02 08:13:30 -06004819
Alex Elder9d3997f2012-10-25 23:34:42 -05004820 rbdc = rbd_get_client(ceph_opts);
4821 if (IS_ERR(rbdc)) {
4822 rc = PTR_ERR(rbdc);
Alex Elder0ddebc02012-10-25 23:34:41 -05004823 goto err_out_args;
Alex Elder9d3997f2012-10-25 23:34:42 -05004824 }
Alex Elderc53d5892012-10-25 23:34:42 -05004825 ceph_opts = NULL; /* rbd_dev client now owns this */
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004826
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004827 /* pick the pool */
Alex Elder9d3997f2012-10-25 23:34:42 -05004828 osdc = &rbdc->client->osdc;
Alex Elder859c31d2012-10-25 23:34:42 -05004829 rc = ceph_pg_poolid_by_name(osdc->osdmap, spec->pool_name);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004830 if (rc < 0)
4831 goto err_out_client;
Alex Elderc0cd10db2013-04-26 09:43:47 -05004832 spec->pool_id = (u64)rc;
Alex Elder859c31d2012-10-25 23:34:42 -05004833
Alex Elder0903e872012-11-14 12:25:19 -06004834 /* The ceph file layout needs to fit pool id in 32 bits */
4835
Alex Elderc0cd10db2013-04-26 09:43:47 -05004836 if (spec->pool_id > (u64)U32_MAX) {
4837 rbd_warn(NULL, "pool id too large (%llu > %u)\n",
4838 (unsigned long long)spec->pool_id, U32_MAX);
Alex Elder0903e872012-11-14 12:25:19 -06004839 rc = -EIO;
4840 goto err_out_client;
4841 }
4842
Alex Elderc53d5892012-10-25 23:34:42 -05004843 rbd_dev = rbd_dev_create(rbdc, spec);
Alex Elderbd4ba652012-10-25 23:34:42 -05004844 if (!rbd_dev)
4845 goto err_out_client;
Alex Elderc53d5892012-10-25 23:34:42 -05004846 rbdc = NULL; /* rbd_dev now owns this */
4847 spec = NULL; /* rbd_dev now owns this */
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004848
Alex Elderbd4ba652012-10-25 23:34:42 -05004849 rbd_dev->mapping.read_only = rbd_opts->read_only;
Alex Elderc53d5892012-10-25 23:34:42 -05004850 kfree(rbd_opts);
4851 rbd_opts = NULL; /* done with this */
Alex Elderbd4ba652012-10-25 23:34:42 -05004852
Alex Elder71f293e2013-04-26 09:43:48 -05004853 rc = rbd_dev_image_probe(rbd_dev);
Alex Eldera30b71b2012-07-10 20:30:11 -05004854 if (rc < 0)
Alex Elderc53d5892012-10-25 23:34:42 -05004855 goto err_out_rbd_dev;
Alex Elder05fd6f62012-08-29 17:11:07 -05004856
Alex Elderb536f692013-04-28 23:32:34 -05004857 rc = rbd_dev_device_setup(rbd_dev);
4858 if (!rc)
4859 return count;
4860
4861 rbd_dev_image_release(rbd_dev);
Alex Elderc53d5892012-10-25 23:34:42 -05004862err_out_rbd_dev:
4863 rbd_dev_destroy(rbd_dev);
Alex Elderbd4ba652012-10-25 23:34:42 -05004864err_out_client:
Alex Elder9d3997f2012-10-25 23:34:42 -05004865 rbd_put_client(rbdc);
Alex Elder0ddebc02012-10-25 23:34:41 -05004866err_out_args:
Alex Elder78cea762012-10-25 23:34:41 -05004867 if (ceph_opts)
4868 ceph_destroy_options(ceph_opts);
Alex Elder4e9afeb2012-10-25 23:34:41 -05004869 kfree(rbd_opts);
Alex Elder859c31d2012-10-25 23:34:42 -05004870 rbd_spec_put(spec);
Alex Elderbd4ba652012-10-25 23:34:42 -05004871err_out_module:
4872 module_put(THIS_MODULE);
Alex Elder27cc2592012-02-02 08:13:30 -06004873
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004874 dout("Error adding device %s\n", buf);
Alex Elder27cc2592012-02-02 08:13:30 -06004875
Alex Elderc0cd10db2013-04-26 09:43:47 -05004876 return (ssize_t)rc;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004877}
4878
Alex Elderde71a292012-07-03 16:01:19 -05004879static struct rbd_device *__rbd_get_dev(unsigned long dev_id)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004880{
4881 struct list_head *tmp;
4882 struct rbd_device *rbd_dev;
4883
Alex Eldere124a82f2012-01-29 13:57:44 -06004884 spin_lock(&rbd_dev_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004885 list_for_each(tmp, &rbd_dev_list) {
4886 rbd_dev = list_entry(tmp, struct rbd_device, node);
Alex Elderde71a292012-07-03 16:01:19 -05004887 if (rbd_dev->dev_id == dev_id) {
Alex Eldere124a82f2012-01-29 13:57:44 -06004888 spin_unlock(&rbd_dev_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004889 return rbd_dev;
Alex Eldere124a82f2012-01-29 13:57:44 -06004890 }
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004891 }
Alex Eldere124a82f2012-01-29 13:57:44 -06004892 spin_unlock(&rbd_dev_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004893 return NULL;
4894}
4895
Alex Elder200a6a82013-04-28 23:32:34 -05004896static void rbd_dev_device_release(struct device *dev)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004897{
Alex Elder593a9e72012-02-07 12:03:37 -06004898 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004899
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004900 rbd_free_disk(rbd_dev);
Alex Elder200a6a82013-04-28 23:32:34 -05004901 clear_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags);
4902 rbd_dev_clear_mapping(rbd_dev);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004903 unregister_blkdev(rbd_dev->major, rbd_dev->name);
Alex Elder200a6a82013-04-28 23:32:34 -05004904 rbd_dev->major = 0;
Alex Eldere2839302012-08-29 17:11:06 -05004905 rbd_dev_id_put(rbd_dev);
Alex Elderd1cf5782013-04-27 09:59:30 -05004906 rbd_dev_mapping_clear(rbd_dev);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004907}
4908
Alex Elder05a46af2013-04-26 15:44:36 -05004909static void rbd_dev_remove_parent(struct rbd_device *rbd_dev)
4910{
Alex Elderad945fc2013-04-26 15:44:36 -05004911 while (rbd_dev->parent) {
Alex Elder05a46af2013-04-26 15:44:36 -05004912 struct rbd_device *first = rbd_dev;
4913 struct rbd_device *second = first->parent;
4914 struct rbd_device *third;
4915
4916 /*
4917 * Follow to the parent with no grandparent and
4918 * remove it.
4919 */
4920 while (second && (third = second->parent)) {
4921 first = second;
4922 second = third;
4923 }
Alex Elderad945fc2013-04-26 15:44:36 -05004924 rbd_assert(second);
Alex Elder8ad42cd2013-04-28 23:32:34 -05004925 rbd_dev_image_release(second);
Alex Elderad945fc2013-04-26 15:44:36 -05004926 first->parent = NULL;
4927 first->parent_overlap = 0;
4928
4929 rbd_assert(first->parent_spec);
Alex Elder05a46af2013-04-26 15:44:36 -05004930 rbd_spec_put(first->parent_spec);
4931 first->parent_spec = NULL;
Alex Elder05a46af2013-04-26 15:44:36 -05004932 }
4933}
4934
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004935static ssize_t rbd_remove(struct bus_type *bus,
4936 const char *buf,
4937 size_t count)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004938{
4939 struct rbd_device *rbd_dev = NULL;
Alex Elder0d8189e2013-04-27 09:59:30 -05004940 int target_id;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004941 unsigned long ul;
Alex Elder0d8189e2013-04-27 09:59:30 -05004942 int ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004943
Alex Elder0d8189e2013-04-27 09:59:30 -05004944 ret = strict_strtoul(buf, 10, &ul);
4945 if (ret)
4946 return ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004947
4948 /* convert to int; abort if we lost anything in the conversion */
4949 target_id = (int) ul;
4950 if (target_id != ul)
4951 return -EINVAL;
4952
4953 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
4954
4955 rbd_dev = __rbd_get_dev(target_id);
4956 if (!rbd_dev) {
4957 ret = -ENOENT;
4958 goto done;
4959 }
4960
Alex Eldera14ea262013-02-05 13:23:12 -06004961 spin_lock_irq(&rbd_dev->lock);
Alex Elderb82d1672013-01-14 12:43:31 -06004962 if (rbd_dev->open_count)
Alex Elder42382b72012-11-16 09:29:16 -06004963 ret = -EBUSY;
Alex Elderb82d1672013-01-14 12:43:31 -06004964 else
4965 set_bit(RBD_DEV_FLAG_REMOVING, &rbd_dev->flags);
Alex Eldera14ea262013-02-05 13:23:12 -06004966 spin_unlock_irq(&rbd_dev->lock);
Alex Elderb82d1672013-01-14 12:43:31 -06004967 if (ret < 0)
Alex Elder42382b72012-11-16 09:29:16 -06004968 goto done;
Alex Elder0d8189e2013-04-27 09:59:30 -05004969 ret = count;
Alex Elderb4808152013-04-26 15:44:36 -05004970 rbd_bus_del_dev(rbd_dev);
Alex Elder8ad42cd2013-04-28 23:32:34 -05004971 rbd_dev_image_release(rbd_dev);
Alex Elder79ab7552013-04-28 23:32:34 -05004972 module_put(THIS_MODULE);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004973done:
4974 mutex_unlock(&ctl_mutex);
Alex Elderaafb2302012-09-06 16:00:54 -05004975
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004976 return ret;
4977}
4978
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004979/*
4980 * create control files in sysfs
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004981 * /sys/bus/rbd/...
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004982 */
4983static int rbd_sysfs_init(void)
4984{
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004985 int ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004986
Alex Elderfed4c142012-02-07 12:03:36 -06004987 ret = device_register(&rbd_root_dev);
Alex Elder21079782012-01-24 10:08:36 -06004988 if (ret < 0)
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004989 return ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004990
Alex Elderfed4c142012-02-07 12:03:36 -06004991 ret = bus_register(&rbd_bus_type);
4992 if (ret < 0)
4993 device_unregister(&rbd_root_dev);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004994
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004995 return ret;
4996}
4997
4998static void rbd_sysfs_cleanup(void)
4999{
Yehuda Sadehdfc56062010-11-19 14:51:04 -08005000 bus_unregister(&rbd_bus_type);
Alex Elderfed4c142012-02-07 12:03:36 -06005001 device_unregister(&rbd_root_dev);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07005002}
5003
Alex Elder1c2a9df2013-05-01 12:43:03 -05005004static int rbd_slab_init(void)
5005{
5006 rbd_assert(!rbd_img_request_cache);
5007 rbd_img_request_cache = kmem_cache_create("rbd_img_request",
5008 sizeof (struct rbd_img_request),
5009 __alignof__(struct rbd_img_request),
5010 0, NULL);
5011 if (rbd_img_request_cache)
5012 return 0;
5013
5014 return -ENOMEM;
5015}
5016
5017static void rbd_slab_exit(void)
5018{
5019 rbd_assert(rbd_img_request_cache);
5020 kmem_cache_destroy(rbd_img_request_cache);
5021 rbd_img_request_cache = NULL;
5022}
5023
Alex Eldercc344fa2013-02-19 12:25:56 -06005024static int __init rbd_init(void)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07005025{
5026 int rc;
5027
Alex Elder1e32d342013-01-30 11:13:33 -06005028 if (!libceph_compatible(NULL)) {
5029 rbd_warn(NULL, "libceph incompatibility (quitting)");
5030
5031 return -EINVAL;
5032 }
Alex Elder1c2a9df2013-05-01 12:43:03 -05005033 rc = rbd_slab_init();
Yehuda Sadeh602adf42010-08-12 16:11:25 -07005034 if (rc)
5035 return rc;
Alex Elder1c2a9df2013-05-01 12:43:03 -05005036 rc = rbd_sysfs_init();
5037 if (rc)
5038 rbd_slab_exit();
5039 else
5040 pr_info("loaded " RBD_DRV_NAME_LONG "\n");
5041
5042 return rc;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07005043}
5044
Alex Eldercc344fa2013-02-19 12:25:56 -06005045static void __exit rbd_exit(void)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07005046{
5047 rbd_sysfs_cleanup();
Alex Elder1c2a9df2013-05-01 12:43:03 -05005048 rbd_slab_exit();
Yehuda Sadeh602adf42010-08-12 16:11:25 -07005049}
5050
5051module_init(rbd_init);
5052module_exit(rbd_exit);
5053
5054MODULE_AUTHOR("Sage Weil <sage@newdream.net>");
5055MODULE_AUTHOR("Yehuda Sadeh <yehuda@hq.newdream.net>");
5056MODULE_DESCRIPTION("rados block device");
5057
5058/* following authorship retained from original osdblk.c */
5059MODULE_AUTHOR("Jeff Garzik <jeff@garzik.org>");
5060
5061MODULE_LICENSE("GPL");