
/*
   rbd.c -- Export ceph rados objects as a Linux block device


   based on drivers/block/osdblk.c:

   Copyright 2009 Red Hat, Inc.

   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; see the file COPYING.  If not, write to
   the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.



   For usage instructions, please refer to:

		 Documentation/ABI/testing/sysfs-bus-rbd

 */

#include <linux/ceph/libceph.h>
#include <linux/ceph/osd_client.h>
#include <linux/ceph/mon_client.h>
#include <linux/ceph/decode.h>
#include <linux/parser.h>
#include <linux/bsearch.h>

#include <linux/kernel.h>
#include <linux/device.h>
#include <linux/module.h>
#include <linux/fs.h>
#include <linux/blkdev.h>
#include <linux/slab.h>

#include "rbd_types.h"

#define RBD_DEBUG	/* Activate rbd_assert() calls */

/*
 * The basic unit of block I/O is a sector.  It is interpreted in a
 * number of contexts in Linux (blk, bio, genhd), but the default is
 * universally 512 bytes.  These symbols are just slightly more
 * meaningful than the bare numbers they represent.
 */
#define	SECTOR_SHIFT	9
#define	SECTOR_SIZE	(1ULL << SECTOR_SHIFT)

#define RBD_DRV_NAME "rbd"
#define RBD_DRV_NAME_LONG "rbd (rados block device)"

#define RBD_MINORS_PER_MAJOR	256		/* max minors per blkdev */

#define RBD_SNAP_DEV_NAME_PREFIX	"snap_"
#define RBD_MAX_SNAP_NAME_LEN	\
			(NAME_MAX - (sizeof (RBD_SNAP_DEV_NAME_PREFIX) - 1))

#define RBD_MAX_SNAP_COUNT	510	/* allows max snapc to fit in 4KB */

#define RBD_SNAP_HEAD_NAME	"-"

#define	BAD_SNAP_INDEX	U32_MAX		/* invalid index into snap array */

/* This allows a single page to hold an image name sent by OSD */
#define RBD_IMAGE_NAME_LEN_MAX	(PAGE_SIZE - sizeof (__le32) - 1)
#define RBD_IMAGE_ID_LEN_MAX	64

#define RBD_OBJ_PREFIX_LEN_MAX	64

/* Feature bits */

#define RBD_FEATURE_LAYERING	(1<<0)
#define RBD_FEATURE_STRIPINGV2	(1<<1)
#define RBD_FEATURES_ALL \
	    (RBD_FEATURE_LAYERING | RBD_FEATURE_STRIPINGV2)

/* Features supported by this (client software) implementation. */

#define RBD_FEATURES_SUPPORTED	(RBD_FEATURES_ALL)

/*
 * An RBD device name will be "rbd#", where the "rbd" comes from
 * RBD_DRV_NAME above, and # is a unique integer identifier.
 * MAX_INT_FORMAT_WIDTH is used in ensuring DEV_NAME_LEN is big
 * enough to hold all possible device names.
 */
#define DEV_NAME_LEN		32
#define MAX_INT_FORMAT_WIDTH	((5 * sizeof (int)) / 2 + 1)
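
/*
 * Illustrative note (not from the original source): the formula above
 * over-estimates the number of decimal digits in an int.  Each byte
 * contributes at most log10(256) ~= 2.41 digits, and 5/2 = 2.5 rounds
 * that up; the "+ 1" leaves room for a minus sign.  For a 4-byte int,
 *
 *	(5 * 4) / 2 + 1 = 11
 *
 * which covers "-2147483648" (11 characters), so "rbd" plus any
 * device id fits comfortably within DEV_NAME_LEN (32).
 */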

/*
 * block device image metadata (in-memory version)
 */
struct rbd_image_header {
	/* These fields never change for a given rbd image */
	char *object_prefix;
	u64 features;
	__u8 obj_order;
	__u8 crypt_type;
	__u8 comp_type;

	/* The remaining fields need to be updated occasionally */
	u64 image_size;
	struct ceph_snap_context *snapc;
	char *snap_names;
	u64 *snap_sizes;

	u64 stripe_unit;
	u64 stripe_count;
};

/*
 * An rbd image specification.
 *
 * The tuple (pool_id, image_id, snap_id) is sufficient to uniquely
 * identify an image.  Each rbd_dev structure includes a pointer to
 * an rbd_spec structure that encapsulates this identity.
 *
 * Each of the id's in an rbd_spec has an associated name.  For a
 * user-mapped image, the names are supplied and the id's associated
 * with them are looked up.  For a layered image, a parent image is
 * defined by the tuple, and the names are looked up.
 *
 * An rbd_dev structure contains a parent_spec pointer which is
 * non-null if the image it represents is a child in a layered
 * image.  This pointer will refer to the rbd_spec structure used
 * by the parent rbd_dev for its own identity (i.e., the structure
 * is shared between the parent and child).
 *
 * Since these structures are populated once, during the discovery
 * phase of image construction, they are effectively immutable so
 * we make no effort to synchronize access to them.
 *
 * Note that code herein does not assume the image name is known (it
 * could be a null pointer).
 */
struct rbd_spec {
	u64		pool_id;
	const char	*pool_name;

	const char	*image_id;
	const char	*image_name;

	u64		snap_id;
	const char	*snap_name;

	struct kref	kref;
};
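
/*
 * Example (illustrative only, not from the original source): mapping
 * a hypothetical image "foo" at snapshot "snap1" in pool "rbd" would
 * yield a fully-populated spec along the lines of
 *
 *	pool_id = 2,		pool_name = "rbd",
 *	image_id = "10024b3426c1",	image_name = "foo",
 *	snap_id = 4,		snap_name = "snap1"
 *
 * with the id values discovered by querying the cluster.  Mapping the
 * image head rather than a snapshot instead uses snap_id = CEPH_NOSNAP
 * and snap_name = RBD_SNAP_HEAD_NAME ("-").
 */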

/*
 * an instance of the client; multiple devices may share an rbd client.
 */
struct rbd_client {
	struct ceph_client	*client;
	struct kref		kref;
	struct list_head	node;
};

struct rbd_img_request;
typedef void (*rbd_img_callback_t)(struct rbd_img_request *);

#define	BAD_WHICH	U32_MAX		/* Good which or bad which, which? */

struct rbd_obj_request;
typedef void (*rbd_obj_callback_t)(struct rbd_obj_request *);

enum obj_request_type {
	OBJ_REQUEST_NODATA, OBJ_REQUEST_BIO, OBJ_REQUEST_PAGES
};

enum obj_req_flags {
	OBJ_REQ_DONE,		/* completion flag: not done = 0, done = 1 */
	OBJ_REQ_IMG_DATA,	/* object usage: standalone = 0, image = 1 */
	OBJ_REQ_KNOWN,		/* EXISTS flag valid: no = 0, yes = 1 */
	OBJ_REQ_EXISTS,		/* target exists: no = 0, yes = 1 */
};

struct rbd_obj_request {
	const char		*object_name;
	u64			offset;		/* object start byte */
	u64			length;		/* bytes from offset */
	unsigned long		flags;

	/*
	 * An object request associated with an image will have its
	 * img_data flag set; a standalone object request will not.
	 *
	 * A standalone object request will have which == BAD_WHICH
	 * and a null obj_request pointer.
	 *
	 * An object request initiated in support of a layered image
	 * object (to check for its existence before a write) will
	 * have which == BAD_WHICH and a non-null obj_request pointer.
	 *
	 * Finally, an object request for rbd image data will have
	 * which != BAD_WHICH, and will have a non-null img_request
	 * pointer.  The value of which will be in the range
	 * 0..(img_request->obj_request_count-1).
	 */
	union {
		struct rbd_obj_request	*obj_request;	/* STAT op */
		struct {
			struct rbd_img_request	*img_request;
			u64			img_offset;
			/* links for img_request->obj_requests list */
			struct list_head	links;
		};
	};
	u32			which;		/* position in image request list */

	enum obj_request_type	type;
	union {
		struct bio	*bio_list;
		struct {
			struct page	**pages;
			u32		page_count;
		};
	};
	struct page		**copyup_pages;

	struct ceph_osd_request	*osd_req;

	u64			xferred;	/* bytes transferred */
	int			result;

	rbd_obj_callback_t	callback;
	struct completion	completion;

	struct kref		kref;
};

enum img_req_flags {
	IMG_REQ_WRITE,		/* I/O direction: read = 0, write = 1 */
	IMG_REQ_CHILD,		/* initiator: block = 0, child image = 1 */
	IMG_REQ_LAYERED,	/* ENOENT handling: normal = 0, layered = 1 */
};

struct rbd_img_request {
	struct rbd_device	*rbd_dev;
	u64			offset;	/* starting image byte offset */
	u64			length;	/* byte count from offset */
	unsigned long		flags;
	union {
		u64			snap_id;	/* for reads */
		struct ceph_snap_context *snapc;	/* for writes */
	};
	union {
		struct request		*rq;		/* block request */
		struct rbd_obj_request	*obj_request;	/* obj req initiator */
	};
	struct page		**copyup_pages;
	spinlock_t		completion_lock; /* protects next_completion */
	u32			next_completion;
	rbd_img_callback_t	callback;
	u64			xferred; /* aggregate bytes transferred */
	int			result;	/* first nonzero obj_request result */

	u32			obj_request_count;
	struct list_head	obj_requests;	/* rbd_obj_request structs */

	struct kref		kref;
};

#define for_each_obj_request(ireq, oreq) \
	list_for_each_entry(oreq, &(ireq)->obj_requests, links)
#define for_each_obj_request_from(ireq, oreq) \
	list_for_each_entry_from(oreq, &(ireq)->obj_requests, links)
#define for_each_obj_request_safe(ireq, oreq, n) \
	list_for_each_entry_safe_reverse(oreq, n, &(ireq)->obj_requests, links)
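
/*
 * Illustrative usage (not from the original source): walking an image
 * request's object requests with the iterators above looks like
 *
 *	struct rbd_obj_request *obj_request;
 *
 *	for_each_obj_request(img_request, obj_request)
 *		rbd_obj_request_submit(osdc, obj_request);
 *
 * Note that the "safe" variant deliberately walks the list in
 * reverse, matching the LIFO removal order that
 * rbd_img_obj_request_del() asserts below.
 */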

struct rbd_mapping {
	u64                     size;
	u64                     features;
	bool			read_only;
};

/*
 * a single device
 */
struct rbd_device {
	int			dev_id;		/* blkdev unique id */

	int			major;		/* blkdev assigned major */
	struct gendisk		*disk;		/* blkdev's gendisk and rq */

	u32			image_format;	/* Either 1 or 2 */
	struct rbd_client	*rbd_client;

	char			name[DEV_NAME_LEN]; /* blkdev name, e.g. rbd3 */

	spinlock_t		lock;		/* queue, flags, open_count */

	struct rbd_image_header	header;
	unsigned long		flags;		/* possibly lock protected */
	struct rbd_spec		*spec;

	char			*header_name;

	struct ceph_file_layout	layout;

	struct ceph_osd_event   *watch_event;
	struct rbd_obj_request	*watch_request;

	struct rbd_spec		*parent_spec;
	u64			parent_overlap;
	struct rbd_device	*parent;

	/* protects updating the header */
	struct rw_semaphore     header_rwsem;

	struct rbd_mapping	mapping;

	struct list_head	node;

	/* sysfs related */
	struct device		dev;
	unsigned long		open_count;	/* protected by lock */
};

/*
 * Flag bits for rbd_dev->flags.  If atomicity is required,
 * rbd_dev->lock is used to protect access.
 *
 * Currently, only the "removing" flag (which is coupled with the
 * "open_count" field) requires atomic access.
 */
enum rbd_dev_flags {
	RBD_DEV_FLAG_EXISTS,	/* mapped snapshot has not been deleted */
	RBD_DEV_FLAG_REMOVING,	/* this mapping is being removed */
};

static DEFINE_MUTEX(ctl_mutex);	  /* Serialize open/close/setup/teardown */

static LIST_HEAD(rbd_dev_list);    /* devices */
static DEFINE_SPINLOCK(rbd_dev_list_lock);

static LIST_HEAD(rbd_client_list);		/* clients */
static DEFINE_SPINLOCK(rbd_client_list_lock);

static struct kmem_cache	*rbd_img_request_cache;
static struct kmem_cache	*rbd_obj_request_cache;

static int rbd_img_request_submit(struct rbd_img_request *img_request);

static void rbd_dev_device_release(struct device *dev);

static ssize_t rbd_add(struct bus_type *bus, const char *buf,
		       size_t count);
static ssize_t rbd_remove(struct bus_type *bus, const char *buf,
			  size_t count);
static int rbd_dev_image_probe(struct rbd_device *rbd_dev);

static struct bus_attribute rbd_bus_attrs[] = {
	__ATTR(add, S_IWUSR, NULL, rbd_add),
	__ATTR(remove, S_IWUSR, NULL, rbd_remove),
	__ATTR_NULL
};

static struct bus_type rbd_bus_type = {
	.name		= "rbd",
	.bus_attrs	= rbd_bus_attrs,
};

static void rbd_root_dev_release(struct device *dev)
{
}

static struct device rbd_root_dev = {
	.init_name =    "rbd",
	.release =      rbd_root_dev_release,
};

static __printf(2, 3)
void rbd_warn(struct rbd_device *rbd_dev, const char *fmt, ...)
{
	struct va_format vaf;
	va_list args;

	va_start(args, fmt);
	vaf.fmt = fmt;
	vaf.va = &args;

	if (!rbd_dev)
		printk(KERN_WARNING "%s: %pV\n", RBD_DRV_NAME, &vaf);
	else if (rbd_dev->disk)
		printk(KERN_WARNING "%s: %s: %pV\n",
			RBD_DRV_NAME, rbd_dev->disk->disk_name, &vaf);
	else if (rbd_dev->spec && rbd_dev->spec->image_name)
		printk(KERN_WARNING "%s: image %s: %pV\n",
			RBD_DRV_NAME, rbd_dev->spec->image_name, &vaf);
	else if (rbd_dev->spec && rbd_dev->spec->image_id)
		printk(KERN_WARNING "%s: id %s: %pV\n",
			RBD_DRV_NAME, rbd_dev->spec->image_id, &vaf);
	else	/* punt */
		printk(KERN_WARNING "%s: rbd_dev %p: %pV\n",
			RBD_DRV_NAME, rbd_dev, &vaf);
	va_end(args);
}
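
/*
 * Illustrative usage (not from the original source): callers hand
 * rbd_warn() a device (or NULL) plus a printf-style format, and the
 * most specific identifier available is prepended, e.g.
 *
 *	rbd_warn(rbd_dev, "obj_request %p already marked done", obj_request);
 *
 * might print "rbd: rbd3: obj_request ... already marked done" once
 * the gendisk exists, falling back to the image name or id earlier in
 * the device's life cycle.
 */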

#ifdef RBD_DEBUG
#define rbd_assert(expr)						\
		if (unlikely(!(expr))) {				\
			printk(KERN_ERR "\nAssertion failure in %s() "	\
						"at line %d:\n\n"	\
					"\trbd_assert(%s);\n\n",	\
					__func__, __LINE__, #expr);	\
			BUG();						\
		}
#else /* !RBD_DEBUG */
#  define rbd_assert(expr)	((void) 0)
#endif /* !RBD_DEBUG */

static int rbd_img_obj_request_submit(struct rbd_obj_request *obj_request);
static void rbd_img_parent_read(struct rbd_obj_request *obj_request);
static void rbd_dev_remove_parent(struct rbd_device *rbd_dev);

static int rbd_dev_refresh(struct rbd_device *rbd_dev);
static int rbd_dev_v2_refresh(struct rbd_device *rbd_dev);
static const char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev,
					u64 snap_id);
static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id,
		u8 *order, u64 *snap_size);
static int _rbd_dev_v2_snap_features(struct rbd_device *rbd_dev, u64 snap_id,
		u64 *snap_features);
static u64 rbd_snap_id_by_name(struct rbd_device *rbd_dev, const char *name);

static int rbd_open(struct block_device *bdev, fmode_t mode)
{
	struct rbd_device *rbd_dev = bdev->bd_disk->private_data;
	bool removing = false;

	if ((mode & FMODE_WRITE) && rbd_dev->mapping.read_only)
		return -EROFS;

	spin_lock_irq(&rbd_dev->lock);
	if (test_bit(RBD_DEV_FLAG_REMOVING, &rbd_dev->flags))
		removing = true;
	else
		rbd_dev->open_count++;
	spin_unlock_irq(&rbd_dev->lock);
	if (removing)
		return -ENOENT;

	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
	(void) get_device(&rbd_dev->dev);
	set_device_ro(bdev, rbd_dev->mapping.read_only);
	mutex_unlock(&ctl_mutex);

	return 0;
}

static int rbd_release(struct gendisk *disk, fmode_t mode)
{
	struct rbd_device *rbd_dev = disk->private_data;
	unsigned long open_count_before;

	spin_lock_irq(&rbd_dev->lock);
	open_count_before = rbd_dev->open_count--;
	spin_unlock_irq(&rbd_dev->lock);
	rbd_assert(open_count_before > 0);

	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
	put_device(&rbd_dev->dev);
	mutex_unlock(&ctl_mutex);

	return 0;
}

static const struct block_device_operations rbd_bd_ops = {
	.owner			= THIS_MODULE,
	.open			= rbd_open,
	.release		= rbd_release,
};

/*
 * Initialize an rbd client instance.
 * We own *ceph_opts.
 */
static struct rbd_client *rbd_client_create(struct ceph_options *ceph_opts)
{
	struct rbd_client *rbdc;
	int ret = -ENOMEM;

	dout("%s:\n", __func__);
	rbdc = kmalloc(sizeof(struct rbd_client), GFP_KERNEL);
	if (!rbdc)
		goto out_opt;

	kref_init(&rbdc->kref);
	INIT_LIST_HEAD(&rbdc->node);

	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);

	rbdc->client = ceph_create_client(ceph_opts, rbdc, 0, 0);
	if (IS_ERR(rbdc->client))
		goto out_mutex;
	ceph_opts = NULL; /* Now rbdc->client is responsible for ceph_opts */

	ret = ceph_open_session(rbdc->client);
	if (ret < 0)
		goto out_err;

	spin_lock(&rbd_client_list_lock);
	list_add_tail(&rbdc->node, &rbd_client_list);
	spin_unlock(&rbd_client_list_lock);

	mutex_unlock(&ctl_mutex);
	dout("%s: rbdc %p\n", __func__, rbdc);

	return rbdc;

out_err:
	ceph_destroy_client(rbdc->client);
out_mutex:
	mutex_unlock(&ctl_mutex);
	kfree(rbdc);
out_opt:
	if (ceph_opts)
		ceph_destroy_options(ceph_opts);
	dout("%s: error %d\n", __func__, ret);

	return ERR_PTR(ret);
}

static struct rbd_client *__rbd_get_client(struct rbd_client *rbdc)
{
	kref_get(&rbdc->kref);

	return rbdc;
}

/*
 * Find a ceph client with specific addr and configuration.  If
 * found, bump its reference count.
 */
static struct rbd_client *rbd_client_find(struct ceph_options *ceph_opts)
{
	struct rbd_client *client_node;
	bool found = false;

	if (ceph_opts->flags & CEPH_OPT_NOSHARE)
		return NULL;

	spin_lock(&rbd_client_list_lock);
	list_for_each_entry(client_node, &rbd_client_list, node) {
		if (!ceph_compare_options(ceph_opts, client_node->client)) {
			__rbd_get_client(client_node);

			found = true;
			break;
		}
	}
	spin_unlock(&rbd_client_list_lock);

	return found ? client_node : NULL;
}

/*
 * mount options
 */
enum {
	Opt_last_int,
	/* int args above */
	Opt_last_string,
	/* string args above */
	Opt_read_only,
	Opt_read_write,
	/* Boolean args above */
	Opt_last_bool,
};

static match_table_t rbd_opts_tokens = {
	/* int args above */
	/* string args above */
	{Opt_read_only, "read_only"},
	{Opt_read_only, "ro"},		/* Alternate spelling */
	{Opt_read_write, "read_write"},
	{Opt_read_write, "rw"},		/* Alternate spelling */
	/* Boolean args above */
	{-1, NULL}
};
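
/*
 * Illustrative note (not from the original source): the table above
 * feeds match_token(), so an options string handed to the driver may
 * spell read-only access as either "read_only" or "ro" (and
 * read-write as "read_write" or "rw"); parse_rbd_opts_token() below
 * turns any of those into the read_only boolean of struct rbd_options.
 */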

struct rbd_options {
	bool	read_only;
};

#define RBD_READ_ONLY_DEFAULT	false

static int parse_rbd_opts_token(char *c, void *private)
{
	struct rbd_options *rbd_opts = private;
	substring_t argstr[MAX_OPT_ARGS];
	int token, intval, ret;

	token = match_token(c, rbd_opts_tokens, argstr);
	if (token < 0)
		return -EINVAL;

	if (token < Opt_last_int) {
		ret = match_int(&argstr[0], &intval);
		if (ret < 0) {
			pr_err("bad mount option arg (not int) "
			       "at '%s'\n", c);
			return ret;
		}
		dout("got int token %d val %d\n", token, intval);
	} else if (token > Opt_last_int && token < Opt_last_string) {
		dout("got string token %d val %s\n", token,
		     argstr[0].from);
	} else if (token > Opt_last_string && token < Opt_last_bool) {
		dout("got Boolean token %d\n", token);
	} else {
		dout("got token %d\n", token);
	}

	switch (token) {
	case Opt_read_only:
		rbd_opts->read_only = true;
		break;
	case Opt_read_write:
		rbd_opts->read_only = false;
		break;
	default:
		rbd_assert(false);
		break;
	}
	return 0;
}

/*
 * Get a ceph client with specific addr and configuration, if one does
 * not exist create it.
 */
static struct rbd_client *rbd_get_client(struct ceph_options *ceph_opts)
{
	struct rbd_client *rbdc;

	rbdc = rbd_client_find(ceph_opts);
	if (rbdc)	/* using an existing client */
		ceph_destroy_options(ceph_opts);
	else
		rbdc = rbd_client_create(ceph_opts);

	return rbdc;
}

/*
 * Destroy ceph client
 *
 * Caller must hold rbd_client_list_lock.
 */
static void rbd_client_release(struct kref *kref)
{
	struct rbd_client *rbdc = container_of(kref, struct rbd_client, kref);

	dout("%s: rbdc %p\n", __func__, rbdc);
	spin_lock(&rbd_client_list_lock);
	list_del(&rbdc->node);
	spin_unlock(&rbd_client_list_lock);

	ceph_destroy_client(rbdc->client);
	kfree(rbdc);
}

/*
 * Drop reference to ceph client node. If it's not referenced anymore, release
 * it.
 */
static void rbd_put_client(struct rbd_client *rbdc)
{
	if (rbdc)
		kref_put(&rbdc->kref, rbd_client_release);
}

static bool rbd_image_format_valid(u32 image_format)
{
	return image_format == 1 || image_format == 2;
}

static bool rbd_dev_ondisk_valid(struct rbd_image_header_ondisk *ondisk)
{
	size_t size;
	u32 snap_count;

	/* The header has to start with the magic rbd header text */
	if (memcmp(&ondisk->text, RBD_HEADER_TEXT, sizeof (RBD_HEADER_TEXT)))
		return false;

	/* The bio layer requires at least sector-sized I/O */

	if (ondisk->options.order < SECTOR_SHIFT)
		return false;

	/* If we use u64 in a few spots we may be able to loosen this */

	if (ondisk->options.order > 8 * sizeof (int) - 1)
		return false;

	/*
	 * The size of a snapshot header has to fit in a size_t, and
	 * that limits the number of snapshots.
	 */
	snap_count = le32_to_cpu(ondisk->snap_count);
	size = SIZE_MAX - sizeof (struct ceph_snap_context);
	if (snap_count > size / sizeof (__le64))
		return false;

	/*
	 * Not only that, but the size of the entire snapshot
	 * header must also be representable in a size_t.
	 */
	size -= snap_count * sizeof (__le64);
	if ((u64) size < le64_to_cpu(ondisk->snap_names_len))
		return false;

	return true;
}

/*
 * Create a new header structure, translate header format from the on-disk
 * header.
 */
static int rbd_header_from_disk(struct rbd_image_header *header,
				 struct rbd_image_header_ondisk *ondisk)
{
	u32 snap_count;
	size_t len;
	size_t size;
	u32 i;

	memset(header, 0, sizeof (*header));

	snap_count = le32_to_cpu(ondisk->snap_count);

	len = strnlen(ondisk->object_prefix, sizeof (ondisk->object_prefix));
	header->object_prefix = kmalloc(len + 1, GFP_KERNEL);
	if (!header->object_prefix)
		return -ENOMEM;
	memcpy(header->object_prefix, ondisk->object_prefix, len);
	header->object_prefix[len] = '\0';

	if (snap_count) {
		u64 snap_names_len = le64_to_cpu(ondisk->snap_names_len);

		/* Save a copy of the snapshot names */

		if (snap_names_len > (u64) SIZE_MAX)
			return -EIO;
		header->snap_names = kmalloc(snap_names_len, GFP_KERNEL);
		if (!header->snap_names)
			goto out_err;
		/*
		 * Note that rbd_dev_v1_header_read() guarantees
		 * the ondisk buffer we're working with has
		 * snap_names_len bytes beyond the end of the
		 * snapshot id array, so this memcpy() is safe.
		 */
		memcpy(header->snap_names, &ondisk->snaps[snap_count],
			snap_names_len);

		/* Record each snapshot's size */

		size = snap_count * sizeof (*header->snap_sizes);
		header->snap_sizes = kmalloc(size, GFP_KERNEL);
		if (!header->snap_sizes)
			goto out_err;
		for (i = 0; i < snap_count; i++)
			header->snap_sizes[i] =
				le64_to_cpu(ondisk->snaps[i].image_size);
	} else {
		header->snap_names = NULL;
		header->snap_sizes = NULL;
	}

	header->features = 0;	/* No features support in v1 images */
	header->obj_order = ondisk->options.order;
	header->crypt_type = ondisk->options.crypt_type;
	header->comp_type = ondisk->options.comp_type;

	/* Allocate and fill in the snapshot context */

	header->image_size = le64_to_cpu(ondisk->image_size);

	header->snapc = ceph_create_snap_context(snap_count, GFP_KERNEL);
	if (!header->snapc)
		goto out_err;
	header->snapc->seq = le64_to_cpu(ondisk->snap_seq);
	for (i = 0; i < snap_count; i++)
		header->snapc->snaps[i] = le64_to_cpu(ondisk->snaps[i].id);

	return 0;

out_err:
	kfree(header->snap_sizes);
	header->snap_sizes = NULL;
	kfree(header->snap_names);
	header->snap_names = NULL;
	kfree(header->object_prefix);
	header->object_prefix = NULL;

	return -ENOMEM;
}
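
/*
 * Illustrative note (not from the original source): as the code above
 * implies, a v1 on-disk header is laid out as the fixed header fields,
 * then an array of snap_count (id, image_size) entries, then
 * snap_names_len bytes of NUL-terminated snapshot names:
 *
 *	+------------------------+
 *	| fixed header fields    |
 *	+------------------------+
 *	| snaps[0..snap_count)   |   ids and sizes
 *	+------------------------+
 *	| "snap1\0snap2\0..."    |   names blob
 *	+------------------------+
 *
 * which is why the names are copied starting at
 * &ondisk->snaps[snap_count].
 */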

static const char *_rbd_dev_v1_snap_name(struct rbd_device *rbd_dev, u32 which)
{
	const char *snap_name;

	rbd_assert(which < rbd_dev->header.snapc->num_snaps);

	/* Skip over names until we find the one we are looking for */

	snap_name = rbd_dev->header.snap_names;
	while (which--)
		snap_name += strlen(snap_name) + 1;

	return kstrdup(snap_name, GFP_KERNEL);
}

/*
 * Snapshot id comparison function for use with qsort()/bsearch().
 * Note that result is for snapshots in *descending* order.
 */
static int snapid_compare_reverse(const void *s1, const void *s2)
{
	u64 snap_id1 = *(u64 *)s1;
	u64 snap_id2 = *(u64 *)s2;

	if (snap_id1 < snap_id2)
		return 1;
	return snap_id1 == snap_id2 ? 0 : -1;
}

/*
 * Search a snapshot context to see if the given snapshot id is
 * present.
 *
 * Returns the position of the snapshot id in the array if it's found,
 * or BAD_SNAP_INDEX otherwise.
 *
 * Note: The snapshot array is kept sorted (by the osd) in
 * reverse order, highest snapshot id first.
 */
static u32 rbd_dev_snap_index(struct rbd_device *rbd_dev, u64 snap_id)
{
	struct ceph_snap_context *snapc = rbd_dev->header.snapc;
	u64 *found;

	found = bsearch(&snap_id, &snapc->snaps, snapc->num_snaps,
				sizeof (snap_id), snapid_compare_reverse);

	return found ? (u32)(found - &snapc->snaps[0]) : BAD_SNAP_INDEX;
}
Alex Elder2ad3d712013-04-30 00:44:33 -0500862static const char *rbd_dev_v1_snap_name(struct rbd_device *rbd_dev,
863 u64 snap_id)
Alex Elder54cac612013-04-30 00:44:33 -0500864{
865 u32 which;
866
867 which = rbd_dev_snap_index(rbd_dev, snap_id);
868 if (which == BAD_SNAP_INDEX)
869 return NULL;
870
871 return _rbd_dev_v1_snap_name(rbd_dev, which);
872}
873
Alex Elder9e15b772012-10-30 19:40:33 -0500874static const char *rbd_snap_name(struct rbd_device *rbd_dev, u64 snap_id)
875{
Alex Elder9e15b772012-10-30 19:40:33 -0500876 if (snap_id == CEPH_NOSNAP)
877 return RBD_SNAP_HEAD_NAME;
878
Alex Elder54cac612013-04-30 00:44:33 -0500879 rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
880 if (rbd_dev->image_format == 1)
881 return rbd_dev_v1_snap_name(rbd_dev, snap_id);
Alex Elder9e15b772012-10-30 19:40:33 -0500882
Alex Elder54cac612013-04-30 00:44:33 -0500883 return rbd_dev_v2_snap_name(rbd_dev, snap_id);
Alex Elder9e15b772012-10-30 19:40:33 -0500884}
885
Alex Elder2ad3d712013-04-30 00:44:33 -0500886static int rbd_snap_size(struct rbd_device *rbd_dev, u64 snap_id,
887 u64 *snap_size)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700888{
Alex Elder2ad3d712013-04-30 00:44:33 -0500889 rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
890 if (snap_id == CEPH_NOSNAP) {
891 *snap_size = rbd_dev->header.image_size;
892 } else if (rbd_dev->image_format == 1) {
893 u32 which;
Alex Elder00f1f362012-02-07 12:03:36 -0600894
Alex Elder2ad3d712013-04-30 00:44:33 -0500895 which = rbd_dev_snap_index(rbd_dev, snap_id);
896 if (which == BAD_SNAP_INDEX)
897 return -ENOENT;
Alex Elder00f1f362012-02-07 12:03:36 -0600898
Alex Elder2ad3d712013-04-30 00:44:33 -0500899 *snap_size = rbd_dev->header.snap_sizes[which];
900 } else {
901 u64 size = 0;
902 int ret;
903
904 ret = _rbd_dev_v2_snap_size(rbd_dev, snap_id, NULL, &size);
905 if (ret)
906 return ret;
907
908 *snap_size = size;
909 }
910 return 0;
911}
912
913static int rbd_snap_features(struct rbd_device *rbd_dev, u64 snap_id,
914 u64 *snap_features)
915{
916 rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
917 if (snap_id == CEPH_NOSNAP) {
918 *snap_features = rbd_dev->header.features;
919 } else if (rbd_dev->image_format == 1) {
920 *snap_features = 0; /* No features for format 1 */
921 } else {
922 u64 features = 0;
923 int ret;
924
925 ret = _rbd_dev_v2_snap_features(rbd_dev, snap_id, &features);
926 if (ret)
927 return ret;
928
929 *snap_features = features;
930 }
931 return 0;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700932}
933
Alex Elderd1cf5782013-04-27 09:59:30 -0500934static int rbd_dev_mapping_set(struct rbd_device *rbd_dev)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700935{
Alex Elder2ad3d712013-04-30 00:44:33 -0500936 const char *snap_name = rbd_dev->spec->snap_name;
937 u64 snap_id;
938 u64 size = 0;
939 u64 features = 0;
940 int ret;
Alex Elder8b0241f2013-04-25 23:15:08 -0500941
Alex Elder2ad3d712013-04-30 00:44:33 -0500942 if (strcmp(snap_name, RBD_SNAP_HEAD_NAME)) {
943 snap_id = rbd_snap_id_by_name(rbd_dev, snap_name);
944 if (snap_id == CEPH_NOSNAP)
Alex Elder8b0241f2013-04-25 23:15:08 -0500945 return -ENOENT;
Alex Elder2ad3d712013-04-30 00:44:33 -0500946 } else {
947 snap_id = CEPH_NOSNAP;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700948 }
Alex Elder6d292902013-01-14 12:43:31 -0600949
Alex Elder2ad3d712013-04-30 00:44:33 -0500950 ret = rbd_snap_size(rbd_dev, snap_id, &size);
951 if (ret)
952 return ret;
953 ret = rbd_snap_features(rbd_dev, snap_id, &features);
954 if (ret)
955 return ret;
956
957 rbd_dev->mapping.size = size;
958 rbd_dev->mapping.features = features;
959
960 /* If we are mapping a snapshot it must be marked read-only */
961
962 if (snap_id != CEPH_NOSNAP)
963 rbd_dev->mapping.read_only = true;
964
Alex Elder8b0241f2013-04-25 23:15:08 -0500965 return 0;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700966}
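
/*
 * Illustrative note (not from the original source): for a spec whose
 * snap_name is "-" (RBD_SNAP_HEAD_NAME) the mapping above resolves to
 * the image head, so size tracks header.image_size and read_only is
 * left as the user requested.  For a named snapshot, say "snap1", the
 * id is looked up first and the mapping is forced read-only, which is
 * what makes a write open of a mapped snapshot fail in rbd_open()
 * with -EROFS.
 */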

static void rbd_dev_mapping_clear(struct rbd_device *rbd_dev)
{
	rbd_dev->mapping.size = 0;
	rbd_dev->mapping.features = 0;
	rbd_dev->mapping.read_only = true;
}

static void rbd_dev_clear_mapping(struct rbd_device *rbd_dev)
{
	rbd_dev->mapping.size = 0;
	rbd_dev->mapping.features = 0;
	rbd_dev->mapping.read_only = true;
}

static const char *rbd_segment_name(struct rbd_device *rbd_dev, u64 offset)
{
	char *name;
	u64 segment;
	int ret;

	name = kmalloc(MAX_OBJ_NAME_SIZE + 1, GFP_NOIO);
	if (!name)
		return NULL;
	segment = offset >> rbd_dev->header.obj_order;
	ret = snprintf(name, MAX_OBJ_NAME_SIZE + 1, "%s.%012llx",
			rbd_dev->header.object_prefix, segment);
	if (ret < 0 || ret > MAX_OBJ_NAME_SIZE) {
		pr_err("error formatting segment name for #%llu (%d)\n",
			segment, ret);
		kfree(name);
		name = NULL;
	}

	return name;
}

static u64 rbd_segment_offset(struct rbd_device *rbd_dev, u64 offset)
{
	u64 segment_size = (u64) 1 << rbd_dev->header.obj_order;

	return offset & (segment_size - 1);
}

static u64 rbd_segment_length(struct rbd_device *rbd_dev,
				u64 offset, u64 length)
{
	u64 segment_size = (u64) 1 << rbd_dev->header.obj_order;

	offset &= segment_size - 1;

	rbd_assert(length <= U64_MAX - offset);
	if (offset + length > segment_size)
		length = segment_size - offset;

	return length;
}
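
/*
 * Worked example (illustrative only, not from the original source):
 * with an obj_order of 22 (4 MiB objects) and a hypothetical object
 * prefix of "rb.0.1234", image offset 0x500000 (5 MiB) falls in
 * segment 0x500000 >> 22 = 1, so:
 *
 *	rbd_segment_name()   -> "rb.0.1234.000000000001"
 *	rbd_segment_offset() -> 0x100000 (1 MiB into the object)
 *	rbd_segment_length() -> at most 3 MiB before being clipped
 *	                        at the segment boundary
 */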

/*
 * returns the size of an object in the image
 */
static u64 rbd_obj_bytes(struct rbd_image_header *header)
{
	return 1 << header->obj_order;
}

/*
 * bio helpers
 */

static void bio_chain_put(struct bio *chain)
{
	struct bio *tmp;

	while (chain) {
		tmp = chain;
		chain = chain->bi_next;
		bio_put(tmp);
	}
}

/*
 * zeros a bio chain, starting at a specific offset
 */
static void zero_bio_chain(struct bio *chain, int start_ofs)
{
	struct bio_vec *bv;
	unsigned long flags;
	void *buf;
	int i;
	int pos = 0;

	while (chain) {
		bio_for_each_segment(bv, chain, i) {
			if (pos + bv->bv_len > start_ofs) {
				int remainder = max(start_ofs - pos, 0);
				buf = bvec_kmap_irq(bv, &flags);
				memset(buf + remainder, 0,
				       bv->bv_len - remainder);
				bvec_kunmap_irq(buf, &flags);
			}
			pos += bv->bv_len;
		}

		chain = chain->bi_next;
	}
}
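
/*
 * Illustrative note (not from the original source): this is read-path
 * cleanup for short transfers.  If a read bio chain covers 8192 bytes
 * but the object only returned 4096, zero_bio_chain(chain, 4096)
 * zeroes everything from byte 4096 on, so the caller sees zeros
 * rather than stale page contents for the unwritten tail.
 */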

/*
 * similar to zero_bio_chain(), zeros data defined by a page array,
 * starting at the given byte offset from the start of the array and
 * continuing up to the given end offset.  The pages array is
 * assumed to be big enough to hold all bytes up to the end.
 */
static void zero_pages(struct page **pages, u64 offset, u64 end)
{
	struct page **page = &pages[offset >> PAGE_SHIFT];

	rbd_assert(end > offset);
	rbd_assert(end - offset <= (u64)SIZE_MAX);
	while (offset < end) {
		size_t page_offset;
		size_t length;
		unsigned long flags;
		void *kaddr;

		page_offset = (size_t)(offset & ~PAGE_MASK);
		length = min(PAGE_SIZE - page_offset, (size_t)(end - offset));
		local_irq_save(flags);
		kaddr = kmap_atomic(*page);
		memset(kaddr + page_offset, 0, length);
		kunmap_atomic(kaddr);
		local_irq_restore(flags);

		offset += length;
		page++;
	}
}

/*
 * Clone a portion of a bio, starting at the given byte offset
 * and continuing for the number of bytes indicated.
 */
static struct bio *bio_clone_range(struct bio *bio_src,
					unsigned int offset,
					unsigned int len,
					gfp_t gfpmask)
{
	struct bio_vec *bv;
	unsigned int resid;
	unsigned short idx;
	unsigned int voff;
	unsigned short end_idx;
	unsigned short vcnt;
	struct bio *bio;

	/* Handle the easy case for the caller */

	if (!offset && len == bio_src->bi_size)
		return bio_clone(bio_src, gfpmask);

	if (WARN_ON_ONCE(!len))
		return NULL;
	if (WARN_ON_ONCE(len > bio_src->bi_size))
		return NULL;
	if (WARN_ON_ONCE(offset > bio_src->bi_size - len))
		return NULL;

	/* Find first affected segment... */

	resid = offset;
	__bio_for_each_segment(bv, bio_src, idx, 0) {
		if (resid < bv->bv_len)
			break;
		resid -= bv->bv_len;
	}
	voff = resid;

	/* ...and the last affected segment */

	resid += len;
	__bio_for_each_segment(bv, bio_src, end_idx, idx) {
		if (resid <= bv->bv_len)
			break;
		resid -= bv->bv_len;
	}
	vcnt = end_idx - idx + 1;

	/* Build the clone */

	bio = bio_alloc(gfpmask, (unsigned int) vcnt);
	if (!bio)
		return NULL;	/* ENOMEM */

	bio->bi_bdev = bio_src->bi_bdev;
	bio->bi_sector = bio_src->bi_sector + (offset >> SECTOR_SHIFT);
	bio->bi_rw = bio_src->bi_rw;
	bio->bi_flags |= 1 << BIO_CLONED;

	/*
	 * Copy over our part of the bio_vec, then update the first
	 * and last (or only) entries.
	 */
	memcpy(&bio->bi_io_vec[0], &bio_src->bi_io_vec[idx],
			vcnt * sizeof (struct bio_vec));
	bio->bi_io_vec[0].bv_offset += voff;
	if (vcnt > 1) {
		bio->bi_io_vec[0].bv_len -= voff;
		bio->bi_io_vec[vcnt - 1].bv_len = resid;
	} else {
		bio->bi_io_vec[0].bv_len = len;
	}

	bio->bi_vcnt = vcnt;
	bio->bi_size = len;
	bio->bi_idx = 0;

	return bio;
}

/*
 * Clone a portion of a bio chain, starting at the given byte offset
 * into the first bio in the source chain and continuing for the
 * number of bytes indicated.  The result is another bio chain of
 * exactly the given length, or a null pointer on error.
 *
 * The bio_src and offset parameters are both in-out.  On entry they
 * refer to the first source bio and the offset into that bio where
 * the start of data to be cloned is located.
 *
 * On return, bio_src is updated to refer to the bio in the source
 * chain that contains the first un-cloned byte, and *offset will
 * contain the offset of that byte within that bio.
 */
static struct bio *bio_chain_clone_range(struct bio **bio_src,
					unsigned int *offset,
					unsigned int len,
					gfp_t gfpmask)
{
	struct bio *bi = *bio_src;
	unsigned int off = *offset;
	struct bio *chain = NULL;
	struct bio **end;

	/* Build up a chain of clone bios up to the limit */

	if (!bi || off >= bi->bi_size || !len)
		return NULL;		/* Nothing to clone */

	end = &chain;
	while (len) {
		unsigned int bi_size;
		struct bio *bio;

		if (!bi) {
			rbd_warn(NULL, "bio_chain exhausted with %u left", len);
			goto out_err;	/* EINVAL; ran out of bio's */
		}
		bi_size = min_t(unsigned int, bi->bi_size - off, len);
		bio = bio_clone_range(bi, off, bi_size, gfpmask);
		if (!bio)
			goto out_err;	/* ENOMEM */

		*end = bio;
		end = &bio->bi_next;

		off += bi_size;
		if (off == bi->bi_size) {
			bi = bi->bi_next;
			off = 0;
		}
		len -= bi_size;
	}
	*bio_src = bi;
	*offset = off;

	return chain;
out_err:
	bio_chain_put(chain);

	return NULL;
}
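
/*
 * Illustrative usage (not from the original source): to carve an
 * image request's bio chain into per-object pieces, a caller loops,
 * e.g.
 *
 *	struct bio *bio_list = ...;	   (chain covering the request)
 *	unsigned int offset = 0;
 *	struct bio *clone;
 *
 *	clone = bio_chain_clone_range(&bio_list, &offset, obj_len,
 *					GFP_ATOMIC);
 *
 * and after each call bio_list/offset already point at the first
 * un-cloned byte, ready for the next object's clone.
 */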

/*
 * The default/initial value for all object request flags is 0.  For
 * each flag, once its value is set to 1 it is never reset to 0
 * again.
 */
static void obj_request_img_data_set(struct rbd_obj_request *obj_request)
{
	if (test_and_set_bit(OBJ_REQ_IMG_DATA, &obj_request->flags)) {
		struct rbd_device *rbd_dev;

		rbd_dev = obj_request->img_request->rbd_dev;
		rbd_warn(rbd_dev, "obj_request %p already marked img_data\n",
			obj_request);
	}
}

static bool obj_request_img_data_test(struct rbd_obj_request *obj_request)
{
	smp_mb();
	return test_bit(OBJ_REQ_IMG_DATA, &obj_request->flags) != 0;
}

static void obj_request_done_set(struct rbd_obj_request *obj_request)
{
	if (test_and_set_bit(OBJ_REQ_DONE, &obj_request->flags)) {
		struct rbd_device *rbd_dev = NULL;

		if (obj_request_img_data_test(obj_request))
			rbd_dev = obj_request->img_request->rbd_dev;
		rbd_warn(rbd_dev, "obj_request %p already marked done\n",
			obj_request);
	}
}

static bool obj_request_done_test(struct rbd_obj_request *obj_request)
{
	smp_mb();
	return test_bit(OBJ_REQ_DONE, &obj_request->flags) != 0;
}

/*
 * This sets the KNOWN flag after (possibly) setting the EXISTS
 * flag.  The latter is set based on the "exists" value provided.
 *
 * Note that for our purposes once an object exists it never goes
 * away again.  It's possible that the response from two existence
 * checks are separated by the creation of the target object, and
 * the first ("doesn't exist") response arrives *after* the second
 * ("does exist").  In that case we ignore the second one.
 */
static void obj_request_existence_set(struct rbd_obj_request *obj_request,
				bool exists)
{
	if (exists)
		set_bit(OBJ_REQ_EXISTS, &obj_request->flags);
	set_bit(OBJ_REQ_KNOWN, &obj_request->flags);
	smp_mb();
}

static bool obj_request_known_test(struct rbd_obj_request *obj_request)
{
	smp_mb();
	return test_bit(OBJ_REQ_KNOWN, &obj_request->flags) != 0;
}

static bool obj_request_exists_test(struct rbd_obj_request *obj_request)
{
	smp_mb();
	return test_bit(OBJ_REQ_EXISTS, &obj_request->flags) != 0;
}

static void rbd_obj_request_get(struct rbd_obj_request *obj_request)
{
	dout("%s: obj %p (was %d)\n", __func__, obj_request,
		atomic_read(&obj_request->kref.refcount));
	kref_get(&obj_request->kref);
}

static void rbd_obj_request_destroy(struct kref *kref);
static void rbd_obj_request_put(struct rbd_obj_request *obj_request)
{
	rbd_assert(obj_request != NULL);
	dout("%s: obj %p (was %d)\n", __func__, obj_request,
		atomic_read(&obj_request->kref.refcount));
	kref_put(&obj_request->kref, rbd_obj_request_destroy);
}

static void rbd_img_request_get(struct rbd_img_request *img_request)
{
	dout("%s: img %p (was %d)\n", __func__, img_request,
		atomic_read(&img_request->kref.refcount));
	kref_get(&img_request->kref);
}

static void rbd_img_request_destroy(struct kref *kref);
static void rbd_img_request_put(struct rbd_img_request *img_request)
{
	rbd_assert(img_request != NULL);
	dout("%s: img %p (was %d)\n", __func__, img_request,
		atomic_read(&img_request->kref.refcount));
	kref_put(&img_request->kref, rbd_img_request_destroy);
}

static inline void rbd_img_obj_request_add(struct rbd_img_request *img_request,
					struct rbd_obj_request *obj_request)
{
	rbd_assert(obj_request->img_request == NULL);

	/* Image request now owns object's original reference */
	obj_request->img_request = img_request;
	obj_request->which = img_request->obj_request_count;
	rbd_assert(!obj_request_img_data_test(obj_request));
	obj_request_img_data_set(obj_request);
	rbd_assert(obj_request->which != BAD_WHICH);
	img_request->obj_request_count++;
	list_add_tail(&obj_request->links, &img_request->obj_requests);
	dout("%s: img %p obj %p w=%u\n", __func__, img_request, obj_request,
		obj_request->which);
}

static inline void rbd_img_obj_request_del(struct rbd_img_request *img_request,
					struct rbd_obj_request *obj_request)
{
	rbd_assert(obj_request->which != BAD_WHICH);

	dout("%s: img %p obj %p w=%u\n", __func__, img_request, obj_request,
		obj_request->which);
	list_del(&obj_request->links);
	rbd_assert(img_request->obj_request_count > 0);
	img_request->obj_request_count--;
	rbd_assert(obj_request->which == img_request->obj_request_count);
	obj_request->which = BAD_WHICH;
	rbd_assert(obj_request_img_data_test(obj_request));
	rbd_assert(obj_request->img_request == img_request);
	obj_request->img_request = NULL;
	obj_request->callback = NULL;
	rbd_obj_request_put(obj_request);
}
1388
1389static bool obj_request_type_valid(enum obj_request_type type)
1390{
1391 switch (type) {
Alex Elder9969ebc2013-01-18 12:31:10 -06001392 case OBJ_REQUEST_NODATA:
Alex Elderbf0d5f502012-11-22 00:00:08 -06001393 case OBJ_REQUEST_BIO:
Alex Elder788e2df2013-01-17 12:25:27 -06001394 case OBJ_REQUEST_PAGES:
Alex Elderbf0d5f502012-11-22 00:00:08 -06001395 return true;
1396 default:
1397 return false;
1398 }
1399}
1400
Alex Elderbf0d5f502012-11-22 00:00:08 -06001401static int rbd_obj_request_submit(struct ceph_osd_client *osdc,
1402 struct rbd_obj_request *obj_request)
1403{
Alex Elder37206ee2013-02-20 17:32:08 -06001404 dout("%s: osdc %p obj %p\n", __func__, osdc, obj_request);
1405
Alex Elderbf0d5f502012-11-22 00:00:08 -06001406 return ceph_osdc_start_request(osdc, obj_request->osd_req, false);
1407}
1408
1409static void rbd_img_request_complete(struct rbd_img_request *img_request)
1410{
Alex Elder55f27e02013-04-10 12:34:25 -05001411
Alex Elder37206ee2013-02-20 17:32:08 -06001412 dout("%s: img %p\n", __func__, img_request);
Alex Elder55f27e02013-04-10 12:34:25 -05001413
1414 /*
1415 * If no error occurred, compute the aggregate transfer
1416 * count for the image request. We could instead use
1417 * atomic64_cmpxchg() to update it as each object request
1418	 * completes; it is not clear offhand which way is better.
1419 */
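	/*
	 * Illustrative sketch only of the atomic64_cmpxchg() variant
	 * mentioned above; it would also require making the xferred
	 * field an atomic64_t, which it is not today:
	 *
	 *	u64 old;
	 *
	 *	do {
	 *		old = atomic64_read(&img_request->xferred);
	 *	} while (atomic64_cmpxchg(&img_request->xferred, old,
	 *			old + obj_request->xferred) != old);
	 */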
1420 if (!img_request->result) {
1421 struct rbd_obj_request *obj_request;
1422 u64 xferred = 0;
1423
1424 for_each_obj_request(img_request, obj_request)
1425 xferred += obj_request->xferred;
1426 img_request->xferred = xferred;
1427 }
1428
Alex Elderbf0d5f502012-11-22 00:00:08 -06001429 if (img_request->callback)
1430 img_request->callback(img_request);
1431 else
1432 rbd_img_request_put(img_request);
1433}
1434
Alex Elder788e2df2013-01-17 12:25:27 -06001435/* Caller is responsible for rbd_obj_request_destroy(obj_request) */
1436
1437static int rbd_obj_request_wait(struct rbd_obj_request *obj_request)
1438{
Alex Elder37206ee2013-02-20 17:32:08 -06001439 dout("%s: obj %p\n", __func__, obj_request);
1440
Alex Elder788e2df2013-01-17 12:25:27 -06001441 return wait_for_completion_interruptible(&obj_request->completion);
1442}
1443
Alex Elder0c425242013-02-08 09:55:49 -06001444/*
1445 * The default/initial value for all image request flags is 0. Each
1446 * is conditionally set to 1 at image request initialization time
1447 * and currently never changes thereafter.
1448 */
1449static void img_request_write_set(struct rbd_img_request *img_request)
1450{
1451 set_bit(IMG_REQ_WRITE, &img_request->flags);
1452 smp_mb();
1453}
1454
1455static bool img_request_write_test(struct rbd_img_request *img_request)
1456{
1457 smp_mb();
1458 return test_bit(IMG_REQ_WRITE, &img_request->flags) != 0;
1459}
1460
Alex Elder9849e982013-01-24 16:13:36 -06001461static void img_request_child_set(struct rbd_img_request *img_request)
1462{
1463 set_bit(IMG_REQ_CHILD, &img_request->flags);
1464 smp_mb();
1465}
1466
1467static bool img_request_child_test(struct rbd_img_request *img_request)
1468{
1469 smp_mb();
1470 return test_bit(IMG_REQ_CHILD, &img_request->flags) != 0;
1471}
1472
Alex Elderd0b2e942013-01-24 16:13:36 -06001473static void img_request_layered_set(struct rbd_img_request *img_request)
1474{
1475 set_bit(IMG_REQ_LAYERED, &img_request->flags);
1476 smp_mb();
1477}
1478
1479static bool img_request_layered_test(struct rbd_img_request *img_request)
1480{
1481 smp_mb();
1482 return test_bit(IMG_REQ_LAYERED, &img_request->flags) != 0;
1483}
1484
Alex Elder6e2a4502013-03-27 09:16:30 -05001485static void
1486rbd_img_obj_request_read_callback(struct rbd_obj_request *obj_request)
1487{
Alex Elderb9434c52013-04-19 15:34:50 -05001488 u64 xferred = obj_request->xferred;
1489 u64 length = obj_request->length;
1490
Alex Elder6e2a4502013-03-27 09:16:30 -05001491 dout("%s: obj %p img %p result %d %llu/%llu\n", __func__,
1492 obj_request, obj_request->img_request, obj_request->result,
Alex Elderb9434c52013-04-19 15:34:50 -05001493 xferred, length);
Alex Elder6e2a4502013-03-27 09:16:30 -05001494 /*
1495 * ENOENT means a hole in the image. We zero-fill the
1496 * entire length of the request. A short read also implies
1497 * zero-fill to the end of the request. Either way we
1498 * update the xferred count to indicate the whole request
1499 * was satisfied.
1500 */
Alex Elderb9434c52013-04-19 15:34:50 -05001501 rbd_assert(obj_request->type != OBJ_REQUEST_NODATA);
Alex Elder6e2a4502013-03-27 09:16:30 -05001502 if (obj_request->result == -ENOENT) {
Alex Elderb9434c52013-04-19 15:34:50 -05001503 if (obj_request->type == OBJ_REQUEST_BIO)
1504 zero_bio_chain(obj_request->bio_list, 0);
1505 else
1506 zero_pages(obj_request->pages, 0, length);
Alex Elder6e2a4502013-03-27 09:16:30 -05001507 obj_request->result = 0;
Alex Elderb9434c52013-04-19 15:34:50 -05001508 obj_request->xferred = length;
1509 } else if (xferred < length && !obj_request->result) {
1510 if (obj_request->type == OBJ_REQUEST_BIO)
1511 zero_bio_chain(obj_request->bio_list, xferred);
1512 else
1513 zero_pages(obj_request->pages, xferred, length);
1514 obj_request->xferred = length;
Alex Elder6e2a4502013-03-27 09:16:30 -05001515 }
1516 obj_request_done_set(obj_request);
1517}
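
/*
 * Worked example of the zero-fill rules above (numbers are
 * illustrative): a 4096-byte read that fails with -ENOENT is
 * zero-filled in full and reported as 4096 bytes transferred; a
 * successful read that returns only 1024 bytes has bytes
 * 1024..4095 zero-filled and is likewise reported as 4096 bytes.
 */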
1518
Alex Elderbf0d5f502012-11-22 00:00:08 -06001519static void rbd_obj_request_complete(struct rbd_obj_request *obj_request)
1520{
Alex Elder37206ee2013-02-20 17:32:08 -06001521 dout("%s: obj %p cb %p\n", __func__, obj_request,
1522 obj_request->callback);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001523 if (obj_request->callback)
1524 obj_request->callback(obj_request);
Alex Elder788e2df2013-01-17 12:25:27 -06001525 else
1526 complete_all(&obj_request->completion);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001527}
1528
Alex Elderc47f9372013-02-26 14:23:07 -06001529static void rbd_osd_trivial_callback(struct rbd_obj_request *obj_request)
Alex Elder39bf2c52013-02-26 14:23:07 -06001530{
1531 dout("%s: obj %p\n", __func__, obj_request);
1532 obj_request_done_set(obj_request);
1533}
1534
Alex Elderc47f9372013-02-26 14:23:07 -06001535static void rbd_osd_read_callback(struct rbd_obj_request *obj_request)
Alex Elderbf0d5f502012-11-22 00:00:08 -06001536{
Alex Elder57acbaa2013-02-11 12:33:24 -06001537 struct rbd_img_request *img_request = NULL;
Alex Eldera9e8ba2c2013-04-21 00:32:07 -05001538 struct rbd_device *rbd_dev = NULL;
Alex Elder57acbaa2013-02-11 12:33:24 -06001539 bool layered = false;
1540
1541 if (obj_request_img_data_test(obj_request)) {
1542 img_request = obj_request->img_request;
1543 layered = img_request && img_request_layered_test(img_request);
Alex Eldera9e8ba2c2013-04-21 00:32:07 -05001544 rbd_dev = img_request->rbd_dev;
Alex Elder57acbaa2013-02-11 12:33:24 -06001545 }
Alex Elder8b3e1a52013-01-24 16:13:36 -06001546
1547 dout("%s: obj %p img %p result %d %llu/%llu\n", __func__,
1548 obj_request, img_request, obj_request->result,
1549 obj_request->xferred, obj_request->length);
Alex Eldera9e8ba2c2013-04-21 00:32:07 -05001550 if (layered && obj_request->result == -ENOENT &&
1551 obj_request->img_offset < rbd_dev->parent_overlap)
Alex Elder8b3e1a52013-01-24 16:13:36 -06001552 rbd_img_parent_read(obj_request);
1553 else if (img_request)
Alex Elder6e2a4502013-03-27 09:16:30 -05001554 rbd_img_obj_request_read_callback(obj_request);
1555 else
1556 obj_request_done_set(obj_request);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001557}
1558
Alex Elderc47f9372013-02-26 14:23:07 -06001559static void rbd_osd_write_callback(struct rbd_obj_request *obj_request)
Alex Elderbf0d5f502012-11-22 00:00:08 -06001560{
Sage Weil1b83bef2013-02-25 16:11:12 -08001561 dout("%s: obj %p result %d %llu\n", __func__, obj_request,
1562 obj_request->result, obj_request->length);
1563 /*
Alex Elder8b3e1a52013-01-24 16:13:36 -06001564 * There is no such thing as a successful short write. Set
1565	 * the transfer count to our originally-requested length.
Sage Weil1b83bef2013-02-25 16:11:12 -08001566 */
1567 obj_request->xferred = obj_request->length;
Alex Elder07741302013-02-05 23:41:50 -06001568 obj_request_done_set(obj_request);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001569}
1570
Alex Elderfbfab532013-02-08 09:55:48 -06001571/*
1572 * For a simple stat call there's nothing to do. We'll do more if
1573 * this is part of a write sequence for a layered image.
1574 */
Alex Elderc47f9372013-02-26 14:23:07 -06001575static void rbd_osd_stat_callback(struct rbd_obj_request *obj_request)
Alex Elderfbfab532013-02-08 09:55:48 -06001576{
Alex Elder37206ee2013-02-20 17:32:08 -06001577 dout("%s: obj %p\n", __func__, obj_request);
Alex Elderfbfab532013-02-08 09:55:48 -06001578 obj_request_done_set(obj_request);
1579}
1580
Alex Elderbf0d5f502012-11-22 00:00:08 -06001581static void rbd_osd_req_callback(struct ceph_osd_request *osd_req,
1582 struct ceph_msg *msg)
1583{
1584 struct rbd_obj_request *obj_request = osd_req->r_priv;
Alex Elderbf0d5f502012-11-22 00:00:08 -06001585 u16 opcode;
1586
Alex Elder37206ee2013-02-20 17:32:08 -06001587 dout("%s: osd_req %p msg %p\n", __func__, osd_req, msg);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001588 rbd_assert(osd_req == obj_request->osd_req);
Alex Elder57acbaa2013-02-11 12:33:24 -06001589 if (obj_request_img_data_test(obj_request)) {
1590 rbd_assert(obj_request->img_request);
1591 rbd_assert(obj_request->which != BAD_WHICH);
1592 } else {
1593 rbd_assert(obj_request->which == BAD_WHICH);
1594 }
Alex Elderbf0d5f502012-11-22 00:00:08 -06001595
Sage Weil1b83bef2013-02-25 16:11:12 -08001596 if (osd_req->r_result < 0)
1597 obj_request->result = osd_req->r_result;
Alex Elderbf0d5f502012-11-22 00:00:08 -06001598
Alex Elder0eefd472013-04-19 15:34:50 -05001599 BUG_ON(osd_req->r_num_ops > 2);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001600
Alex Elderc47f9372013-02-26 14:23:07 -06001601 /*
1602 * We support a 64-bit length, but ultimately it has to be
1603 * passed to blk_end_request(), which takes an unsigned int.
1604 */
Sage Weil1b83bef2013-02-25 16:11:12 -08001605 obj_request->xferred = osd_req->r_reply_op_len[0];
Alex Elder8b3e1a52013-01-24 16:13:36 -06001606 rbd_assert(obj_request->xferred < (u64)UINT_MAX);
Alex Elder79528732013-04-03 21:32:51 -05001607 opcode = osd_req->r_ops[0].op;
Alex Elderbf0d5f502012-11-22 00:00:08 -06001608 switch (opcode) {
1609 case CEPH_OSD_OP_READ:
Alex Elderc47f9372013-02-26 14:23:07 -06001610 rbd_osd_read_callback(obj_request);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001611 break;
1612 case CEPH_OSD_OP_WRITE:
Alex Elderc47f9372013-02-26 14:23:07 -06001613 rbd_osd_write_callback(obj_request);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001614 break;
Alex Elderfbfab532013-02-08 09:55:48 -06001615 case CEPH_OSD_OP_STAT:
Alex Elderc47f9372013-02-26 14:23:07 -06001616 rbd_osd_stat_callback(obj_request);
Alex Elderfbfab532013-02-08 09:55:48 -06001617 break;
Alex Elder36be9a72013-01-19 00:30:28 -06001618 case CEPH_OSD_OP_CALL:
Alex Elderb8d70032012-11-30 17:53:04 -06001619 case CEPH_OSD_OP_NOTIFY_ACK:
Alex Elder9969ebc2013-01-18 12:31:10 -06001620 case CEPH_OSD_OP_WATCH:
Alex Elderc47f9372013-02-26 14:23:07 -06001621 rbd_osd_trivial_callback(obj_request);
Alex Elder9969ebc2013-01-18 12:31:10 -06001622 break;
Alex Elderbf0d5f502012-11-22 00:00:08 -06001623 default:
1624 rbd_warn(NULL, "%s: unsupported op %hu\n",
1625 obj_request->object_name, (unsigned short) opcode);
1626 break;
1627 }
1628
Alex Elder07741302013-02-05 23:41:50 -06001629 if (obj_request_done_test(obj_request))
Alex Elderbf0d5f502012-11-22 00:00:08 -06001630 rbd_obj_request_complete(obj_request);
1631}
1632
Alex Elder9d4df012013-04-19 15:34:50 -05001633static void rbd_osd_req_format_read(struct rbd_obj_request *obj_request)
Alex Elder430c28c2013-04-03 21:32:51 -05001634{
1635 struct rbd_img_request *img_request = obj_request->img_request;
Alex Elder8c042b02013-04-03 01:28:58 -05001636 struct ceph_osd_request *osd_req = obj_request->osd_req;
Alex Elder9d4df012013-04-19 15:34:50 -05001637 u64 snap_id;
Alex Elder430c28c2013-04-03 21:32:51 -05001638
Alex Elder8c042b02013-04-03 01:28:58 -05001639 rbd_assert(osd_req != NULL);
Alex Elder430c28c2013-04-03 21:32:51 -05001640
Alex Elder9d4df012013-04-19 15:34:50 -05001641 snap_id = img_request ? img_request->snap_id : CEPH_NOSNAP;
Alex Elder8c042b02013-04-03 01:28:58 -05001642 ceph_osdc_build_request(osd_req, obj_request->offset,
Alex Elder9d4df012013-04-19 15:34:50 -05001643 NULL, snap_id, NULL);
1644}
1645
1646static void rbd_osd_req_format_write(struct rbd_obj_request *obj_request)
1647{
1648 struct rbd_img_request *img_request = obj_request->img_request;
1649 struct ceph_osd_request *osd_req = obj_request->osd_req;
1650 struct ceph_snap_context *snapc;
1651 struct timespec mtime = CURRENT_TIME;
1652
1653 rbd_assert(osd_req != NULL);
1654
1655 snapc = img_request ? img_request->snapc : NULL;
1656 ceph_osdc_build_request(osd_req, obj_request->offset,
1657 snapc, CEPH_NOSNAP, &mtime);
Alex Elder430c28c2013-04-03 21:32:51 -05001658}
1659
Alex Elderbf0d5f502012-11-22 00:00:08 -06001660static struct ceph_osd_request *rbd_osd_req_create(
1661 struct rbd_device *rbd_dev,
1662 bool write_request,
Alex Elder430c28c2013-04-03 21:32:51 -05001663 struct rbd_obj_request *obj_request)
Alex Elderbf0d5f502012-11-22 00:00:08 -06001664{
Alex Elderbf0d5f502012-11-22 00:00:08 -06001665 struct ceph_snap_context *snapc = NULL;
1666 struct ceph_osd_client *osdc;
1667 struct ceph_osd_request *osd_req;
Alex Elderbf0d5f502012-11-22 00:00:08 -06001668
Alex Elder6365d332013-02-11 12:33:24 -06001669 if (obj_request_img_data_test(obj_request)) {
1670 struct rbd_img_request *img_request = obj_request->img_request;
1671
Alex Elder0c425242013-02-08 09:55:49 -06001672 rbd_assert(write_request ==
1673 img_request_write_test(img_request));
1674 if (write_request)
Alex Elderbf0d5f502012-11-22 00:00:08 -06001675 snapc = img_request->snapc;
Alex Elderbf0d5f502012-11-22 00:00:08 -06001676 }
1677
1678 /* Allocate and initialize the request, for the single op */
1679
1680 osdc = &rbd_dev->rbd_client->client->osdc;
1681 osd_req = ceph_osdc_alloc_request(osdc, snapc, 1, false, GFP_ATOMIC);
1682 if (!osd_req)
1683 return NULL; /* ENOMEM */
Alex Elderbf0d5f502012-11-22 00:00:08 -06001684
Alex Elder430c28c2013-04-03 21:32:51 -05001685 if (write_request)
Alex Elderbf0d5f502012-11-22 00:00:08 -06001686 osd_req->r_flags = CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK;
Alex Elder430c28c2013-04-03 21:32:51 -05001687 else
Alex Elderbf0d5f502012-11-22 00:00:08 -06001688 osd_req->r_flags = CEPH_OSD_FLAG_READ;
Alex Elderbf0d5f502012-11-22 00:00:08 -06001689
1690 osd_req->r_callback = rbd_osd_req_callback;
1691 osd_req->r_priv = obj_request;
1692
1693 osd_req->r_oid_len = strlen(obj_request->object_name);
1694 rbd_assert(osd_req->r_oid_len < sizeof (osd_req->r_oid));
1695 memcpy(osd_req->r_oid, obj_request->object_name, osd_req->r_oid_len);
1696
1697 osd_req->r_file_layout = rbd_dev->layout; /* struct */
1698
Alex Elderbf0d5f502012-11-22 00:00:08 -06001699 return osd_req;
1700}
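
/*
 * Sketch of the expected calling sequence (see rbd_img_request_fill()
 * for the real thing): create the osd request, initialize its single
 * op, format it, then submit it.
 *
 *	obj_request->osd_req = rbd_osd_req_create(rbd_dev, false,
 *						obj_request);
 *	osd_req_op_extent_init(obj_request->osd_req, 0, CEPH_OSD_OP_READ,
 *				offset, length, 0, 0);
 *	rbd_osd_req_format_read(obj_request);
 *	ret = rbd_obj_request_submit(osdc, obj_request);
 */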
1701
Alex Elder0eefd472013-04-19 15:34:50 -05001702/*
1703 * Create a copyup osd request based on the information in the
1704 * object request supplied. A copyup request has two osd ops:
1705 * a copyup method call followed by a "normal" write request.
1706 */
1707static struct ceph_osd_request *
1708rbd_osd_req_create_copyup(struct rbd_obj_request *obj_request)
1709{
1710 struct rbd_img_request *img_request;
1711 struct ceph_snap_context *snapc;
1712 struct rbd_device *rbd_dev;
1713 struct ceph_osd_client *osdc;
1714 struct ceph_osd_request *osd_req;
1715
1716 rbd_assert(obj_request_img_data_test(obj_request));
1717 img_request = obj_request->img_request;
1718 rbd_assert(img_request);
1719 rbd_assert(img_request_write_test(img_request));
1720
1721 /* Allocate and initialize the request, for the two ops */
1722
1723 snapc = img_request->snapc;
1724 rbd_dev = img_request->rbd_dev;
1725 osdc = &rbd_dev->rbd_client->client->osdc;
1726 osd_req = ceph_osdc_alloc_request(osdc, snapc, 2, false, GFP_ATOMIC);
1727 if (!osd_req)
1728 return NULL; /* ENOMEM */
1729
1730 osd_req->r_flags = CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK;
1731 osd_req->r_callback = rbd_osd_req_callback;
1732 osd_req->r_priv = obj_request;
1733
1734 osd_req->r_oid_len = strlen(obj_request->object_name);
1735 rbd_assert(osd_req->r_oid_len < sizeof (osd_req->r_oid));
1736 memcpy(osd_req->r_oid, obj_request->object_name, osd_req->r_oid_len);
1737
1738 osd_req->r_file_layout = rbd_dev->layout; /* struct */
1739
1740 return osd_req;
1741}
1742
1743
Alex Elderbf0d5f502012-11-22 00:00:08 -06001744static void rbd_osd_req_destroy(struct ceph_osd_request *osd_req)
1745{
1746 ceph_osdc_put_request(osd_req);
1747}
1748
1749/* object_name is assumed to be a non-null pointer and NUL-terminated */
1750
1751static struct rbd_obj_request *rbd_obj_request_create(const char *object_name,
1752 u64 offset, u64 length,
1753 enum obj_request_type type)
1754{
1755 struct rbd_obj_request *obj_request;
1756 size_t size;
1757 char *name;
1758
1759 rbd_assert(obj_request_type_valid(type));
1760
1761 size = strlen(object_name) + 1;
Alex Elderf907ad52013-05-01 12:43:03 -05001762 name = kmalloc(size, GFP_KERNEL);
1763 if (!name)
Alex Elderbf0d5f502012-11-22 00:00:08 -06001764 return NULL;
1765
Alex Elder868311b2013-05-01 12:43:03 -05001766 obj_request = kmem_cache_zalloc(rbd_obj_request_cache, GFP_KERNEL);
Alex Elderf907ad52013-05-01 12:43:03 -05001767 if (!obj_request) {
1768 kfree(name);
1769 return NULL;
1770 }
1771
Alex Elderbf0d5f502012-11-22 00:00:08 -06001772 obj_request->object_name = memcpy(name, object_name, size);
1773 obj_request->offset = offset;
1774 obj_request->length = length;
Alex Elder926f9b32013-02-11 12:33:24 -06001775 obj_request->flags = 0;
Alex Elderbf0d5f502012-11-22 00:00:08 -06001776 obj_request->which = BAD_WHICH;
1777 obj_request->type = type;
1778 INIT_LIST_HEAD(&obj_request->links);
Alex Elder788e2df2013-01-17 12:25:27 -06001779 init_completion(&obj_request->completion);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001780 kref_init(&obj_request->kref);
1781
Alex Elder37206ee2013-02-20 17:32:08 -06001782 dout("%s: \"%s\" %llu/%llu %d -> obj %p\n", __func__, object_name,
1783 offset, length, (int)type, obj_request);
1784
Alex Elderbf0d5f502012-11-22 00:00:08 -06001785 return obj_request;
1786}
1787
1788static void rbd_obj_request_destroy(struct kref *kref)
1789{
1790 struct rbd_obj_request *obj_request;
1791
1792 obj_request = container_of(kref, struct rbd_obj_request, kref);
1793
Alex Elder37206ee2013-02-20 17:32:08 -06001794 dout("%s: obj %p\n", __func__, obj_request);
1795
Alex Elderbf0d5f502012-11-22 00:00:08 -06001796 rbd_assert(obj_request->img_request == NULL);
1797 rbd_assert(obj_request->which == BAD_WHICH);
1798
1799 if (obj_request->osd_req)
1800 rbd_osd_req_destroy(obj_request->osd_req);
1801
1802 rbd_assert(obj_request_type_valid(obj_request->type));
1803 switch (obj_request->type) {
Alex Elder9969ebc2013-01-18 12:31:10 -06001804 case OBJ_REQUEST_NODATA:
1805 break; /* Nothing to do */
Alex Elderbf0d5f502012-11-22 00:00:08 -06001806 case OBJ_REQUEST_BIO:
1807 if (obj_request->bio_list)
1808 bio_chain_put(obj_request->bio_list);
1809 break;
Alex Elder788e2df2013-01-17 12:25:27 -06001810 case OBJ_REQUEST_PAGES:
1811 if (obj_request->pages)
1812 ceph_release_page_vector(obj_request->pages,
1813 obj_request->page_count);
1814 break;
Alex Elderbf0d5f502012-11-22 00:00:08 -06001815 }
1816
Alex Elderf907ad52013-05-01 12:43:03 -05001817 kfree(obj_request->object_name);
Alex Elder868311b2013-05-01 12:43:03 -05001818 obj_request->object_name = NULL;
1819 kmem_cache_free(rbd_obj_request_cache, obj_request);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001820}
1821
1822/*
1823 * Caller is responsible for filling in the list of object requests
1824 * that comprises the image request, and the Linux request pointer
1825 * (if there is one).
1826 */
Alex Eldercc344fa2013-02-19 12:25:56 -06001827static struct rbd_img_request *rbd_img_request_create(
1828 struct rbd_device *rbd_dev,
Alex Elderbf0d5f502012-11-22 00:00:08 -06001829 u64 offset, u64 length,
Alex Elder9849e982013-01-24 16:13:36 -06001830 bool write_request,
1831 bool child_request)
Alex Elderbf0d5f502012-11-22 00:00:08 -06001832{
1833 struct rbd_img_request *img_request;
Alex Elderbf0d5f502012-11-22 00:00:08 -06001834
Alex Elder1c2a9df2013-05-01 12:43:03 -05001835 img_request = kmem_cache_alloc(rbd_img_request_cache, GFP_ATOMIC);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001836 if (!img_request)
1837 return NULL;
1838
1839 if (write_request) {
1840 down_read(&rbd_dev->header_rwsem);
Alex Elder812164f82013-04-30 00:44:32 -05001841 ceph_get_snap_context(rbd_dev->header.snapc);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001842 up_read(&rbd_dev->header_rwsem);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001843 }
1844
1845 img_request->rq = NULL;
1846 img_request->rbd_dev = rbd_dev;
1847 img_request->offset = offset;
1848 img_request->length = length;
Alex Elder0c425242013-02-08 09:55:49 -06001849 img_request->flags = 0;
1850 if (write_request) {
1851 img_request_write_set(img_request);
Alex Elder468521c2013-04-26 09:43:47 -05001852 img_request->snapc = rbd_dev->header.snapc;
Alex Elder0c425242013-02-08 09:55:49 -06001853 } else {
Alex Elderbf0d5f502012-11-22 00:00:08 -06001854 img_request->snap_id = rbd_dev->spec->snap_id;
Alex Elder0c425242013-02-08 09:55:49 -06001855 }
Alex Elder9849e982013-01-24 16:13:36 -06001856 if (child_request)
1857 img_request_child_set(img_request);
Alex Elderd0b2e942013-01-24 16:13:36 -06001858 if (rbd_dev->parent_spec)
1859 img_request_layered_set(img_request);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001860 spin_lock_init(&img_request->completion_lock);
1861 img_request->next_completion = 0;
1862 img_request->callback = NULL;
Alex Eldera5a337d2013-01-24 16:13:36 -06001863 img_request->result = 0;
Alex Elderbf0d5f502012-11-22 00:00:08 -06001864 img_request->obj_request_count = 0;
1865 INIT_LIST_HEAD(&img_request->obj_requests);
1866 kref_init(&img_request->kref);
1867
1868 rbd_img_request_get(img_request); /* Avoid a warning */
1869 rbd_img_request_put(img_request); /* TEMPORARY */
1870
Alex Elder37206ee2013-02-20 17:32:08 -06001871 dout("%s: rbd_dev %p %s %llu/%llu -> img %p\n", __func__, rbd_dev,
1872 write_request ? "write" : "read", offset, length,
1873 img_request);
1874
Alex Elderbf0d5f502012-11-22 00:00:08 -06001875 return img_request;
1876}
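
/*
 * Sketch of the expected create/fill/submit sequence for an image
 * request, error handling omitted (rbd_img_parent_read() below is
 * a real example):
 *
 *	img_request = rbd_img_request_create(rbd_dev, offset, length,
 *						false, false);
 *	result = rbd_img_request_fill(img_request, OBJ_REQUEST_BIO,
 *					bio_list);
 *	if (!result)
 *		result = rbd_img_request_submit(img_request);
 */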
1877
1878static void rbd_img_request_destroy(struct kref *kref)
1879{
1880 struct rbd_img_request *img_request;
1881 struct rbd_obj_request *obj_request;
1882 struct rbd_obj_request *next_obj_request;
1883
1884 img_request = container_of(kref, struct rbd_img_request, kref);
1885
Alex Elder37206ee2013-02-20 17:32:08 -06001886 dout("%s: img %p\n", __func__, img_request);
1887
Alex Elderbf0d5f502012-11-22 00:00:08 -06001888 for_each_obj_request_safe(img_request, obj_request, next_obj_request)
1889 rbd_img_obj_request_del(img_request, obj_request);
Alex Elder25dcf952013-01-25 17:08:55 -06001890 rbd_assert(img_request->obj_request_count == 0);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001891
Alex Elder0c425242013-02-08 09:55:49 -06001892 if (img_request_write_test(img_request))
Alex Elder812164f82013-04-30 00:44:32 -05001893 ceph_put_snap_context(img_request->snapc);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001894
Alex Elder8b3e1a52013-01-24 16:13:36 -06001895 if (img_request_child_test(img_request))
1896 rbd_obj_request_put(img_request->obj_request);
1897
Alex Elder1c2a9df2013-05-01 12:43:03 -05001898 kmem_cache_free(rbd_img_request_cache, img_request);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001899}
1900
Alex Elder12178572013-02-08 09:55:49 -06001901static bool rbd_img_obj_end_request(struct rbd_obj_request *obj_request)
1902{
Alex Elder6365d332013-02-11 12:33:24 -06001903 struct rbd_img_request *img_request;
Alex Elder12178572013-02-08 09:55:49 -06001904 unsigned int xferred;
1905 int result;
Alex Elder8b3e1a52013-01-24 16:13:36 -06001906 bool more;
Alex Elder12178572013-02-08 09:55:49 -06001907
Alex Elder6365d332013-02-11 12:33:24 -06001908 rbd_assert(obj_request_img_data_test(obj_request));
1909 img_request = obj_request->img_request;
1910
Alex Elder12178572013-02-08 09:55:49 -06001911 rbd_assert(obj_request->xferred <= (u64)UINT_MAX);
1912 xferred = (unsigned int)obj_request->xferred;
1913 result = obj_request->result;
1914 if (result) {
1915 struct rbd_device *rbd_dev = img_request->rbd_dev;
1916
1917 rbd_warn(rbd_dev, "%s %llx at %llx (%llx)\n",
1918 img_request_write_test(img_request) ? "write" : "read",
1919 obj_request->length, obj_request->img_offset,
1920 obj_request->offset);
1921 rbd_warn(rbd_dev, " result %d xferred %x\n",
1922 result, xferred);
1923 if (!img_request->result)
1924 img_request->result = result;
1925 }
1926
Alex Elderf1a47392013-04-19 15:34:50 -05001927 /* Image object requests don't own their page array */
1928
1929 if (obj_request->type == OBJ_REQUEST_PAGES) {
1930 obj_request->pages = NULL;
1931 obj_request->page_count = 0;
1932 }
1933
Alex Elder8b3e1a52013-01-24 16:13:36 -06001934 if (img_request_child_test(img_request)) {
1935 rbd_assert(img_request->obj_request != NULL);
1936 more = obj_request->which < img_request->obj_request_count - 1;
1937 } else {
1938 rbd_assert(img_request->rq != NULL);
1939 more = blk_end_request(img_request->rq, result, xferred);
1940 }
1941
1942 return more;
Alex Elder12178572013-02-08 09:55:49 -06001943}
1944
Alex Elder21692382013-04-05 01:27:12 -05001945static void rbd_img_obj_callback(struct rbd_obj_request *obj_request)
1946{
1947 struct rbd_img_request *img_request;
1948 u32 which = obj_request->which;
1949 bool more = true;
1950
Alex Elder6365d332013-02-11 12:33:24 -06001951 rbd_assert(obj_request_img_data_test(obj_request));
Alex Elder21692382013-04-05 01:27:12 -05001952 img_request = obj_request->img_request;
1953
1954 dout("%s: img %p obj %p\n", __func__, img_request, obj_request);
1955 rbd_assert(img_request != NULL);
Alex Elder21692382013-04-05 01:27:12 -05001956 rbd_assert(img_request->obj_request_count > 0);
1957 rbd_assert(which != BAD_WHICH);
1958 rbd_assert(which < img_request->obj_request_count);
1959 rbd_assert(which >= img_request->next_completion);
1960
1961 spin_lock_irq(&img_request->completion_lock);
1962 if (which != img_request->next_completion)
1963 goto out;
1964
1965 for_each_obj_request_from(img_request, obj_request) {
Alex Elder21692382013-04-05 01:27:12 -05001966 rbd_assert(more);
1967 rbd_assert(which < img_request->obj_request_count);
1968
1969 if (!obj_request_done_test(obj_request))
1970 break;
Alex Elder12178572013-02-08 09:55:49 -06001971 more = rbd_img_obj_end_request(obj_request);
Alex Elder21692382013-04-05 01:27:12 -05001972 which++;
1973 }
1974
1975 rbd_assert(more ^ (which == img_request->obj_request_count));
1976 img_request->next_completion = which;
1977out:
1978 spin_unlock_irq(&img_request->completion_lock);
1979
1980 if (!more)
1981 rbd_img_request_complete(img_request);
1982}
1983
Alex Elderf1a47392013-04-19 15:34:50 -05001984/*
1985 * Split up an image request into one or more object requests, each
1986 * to a different object. The "type" parameter indicates whether
1987 * "data_desc" is the pointer to the head of a list of bio
1988 * structures, or the base of a page array. In either case this
1989 * function assumes data_desc describes memory sufficient to hold
1990 * all data described by the image request.
1991 */
1992static int rbd_img_request_fill(struct rbd_img_request *img_request,
1993 enum obj_request_type type,
1994 void *data_desc)
Alex Elderbf0d5f502012-11-22 00:00:08 -06001995{
1996 struct rbd_device *rbd_dev = img_request->rbd_dev;
1997 struct rbd_obj_request *obj_request = NULL;
1998 struct rbd_obj_request *next_obj_request;
Alex Elder0c425242013-02-08 09:55:49 -06001999 bool write_request = img_request_write_test(img_request);
Alex Elderf1a47392013-04-19 15:34:50 -05002000 struct bio *bio_list;
2001 unsigned int bio_offset = 0;
2002 struct page **pages;
Alex Elder7da22d22013-01-24 16:13:36 -06002003 u64 img_offset;
Alex Elderbf0d5f502012-11-22 00:00:08 -06002004 u64 resid;
2005 u16 opcode;
2006
Alex Elderf1a47392013-04-19 15:34:50 -05002007 dout("%s: img %p type %d data_desc %p\n", __func__, img_request,
2008 (int)type, data_desc);
Alex Elder37206ee2013-02-20 17:32:08 -06002009
Alex Elder430c28c2013-04-03 21:32:51 -05002010 opcode = write_request ? CEPH_OSD_OP_WRITE : CEPH_OSD_OP_READ;
Alex Elder7da22d22013-01-24 16:13:36 -06002011 img_offset = img_request->offset;
Alex Elderbf0d5f502012-11-22 00:00:08 -06002012 resid = img_request->length;
Alex Elder4dda41d2013-02-20 21:59:33 -06002013 rbd_assert(resid > 0);
Alex Elderf1a47392013-04-19 15:34:50 -05002014
2015 if (type == OBJ_REQUEST_BIO) {
2016 bio_list = data_desc;
2017 rbd_assert(img_offset == bio_list->bi_sector << SECTOR_SHIFT);
2018 } else {
2019 rbd_assert(type == OBJ_REQUEST_PAGES);
2020 pages = data_desc;
2021 }
2022
Alex Elderbf0d5f502012-11-22 00:00:08 -06002023 while (resid) {
Alex Elder2fa12322013-04-05 01:27:12 -05002024 struct ceph_osd_request *osd_req;
Alex Elderbf0d5f502012-11-22 00:00:08 -06002025 const char *object_name;
Alex Elderbf0d5f502012-11-22 00:00:08 -06002026 u64 offset;
2027 u64 length;
2028
Alex Elder7da22d22013-01-24 16:13:36 -06002029 object_name = rbd_segment_name(rbd_dev, img_offset);
Alex Elderbf0d5f502012-11-22 00:00:08 -06002030 if (!object_name)
2031 goto out_unwind;
Alex Elder7da22d22013-01-24 16:13:36 -06002032 offset = rbd_segment_offset(rbd_dev, img_offset);
2033 length = rbd_segment_length(rbd_dev, img_offset, resid);
Alex Elderbf0d5f502012-11-22 00:00:08 -06002034 obj_request = rbd_obj_request_create(object_name,
Alex Elderf1a47392013-04-19 15:34:50 -05002035 offset, length, type);
Alex Elderbf0d5f502012-11-22 00:00:08 -06002036 kfree(object_name); /* object request has its own copy */
2037 if (!obj_request)
2038 goto out_unwind;
2039
Alex Elderf1a47392013-04-19 15:34:50 -05002040 if (type == OBJ_REQUEST_BIO) {
2041 unsigned int clone_size;
2042
2043 rbd_assert(length <= (u64)UINT_MAX);
2044 clone_size = (unsigned int)length;
2045 obj_request->bio_list =
2046 bio_chain_clone_range(&bio_list,
2047 &bio_offset,
2048 clone_size,
2049 GFP_ATOMIC);
2050 if (!obj_request->bio_list)
2051 goto out_partial;
2052 } else {
2053 unsigned int page_count;
2054
2055 obj_request->pages = pages;
2056 page_count = (u32)calc_pages_for(offset, length);
2057 obj_request->page_count = page_count;
2058 if ((offset + length) & ~PAGE_MASK)
2059 page_count--; /* more on last page */
2060 pages += page_count;
2061 }
Alex Elderbf0d5f502012-11-22 00:00:08 -06002062
Alex Elder2fa12322013-04-05 01:27:12 -05002063 osd_req = rbd_osd_req_create(rbd_dev, write_request,
2064 obj_request);
2065 if (!osd_req)
Alex Elderbf0d5f502012-11-22 00:00:08 -06002066 goto out_partial;
Alex Elder2fa12322013-04-05 01:27:12 -05002067 obj_request->osd_req = osd_req;
Alex Elder21692382013-04-05 01:27:12 -05002068 obj_request->callback = rbd_img_obj_callback;
Alex Elder430c28c2013-04-03 21:32:51 -05002069
Alex Elder2fa12322013-04-05 01:27:12 -05002070 osd_req_op_extent_init(osd_req, 0, opcode, offset, length,
2071 0, 0);
Alex Elderf1a47392013-04-19 15:34:50 -05002072 if (type == OBJ_REQUEST_BIO)
2073 osd_req_op_extent_osd_data_bio(osd_req, 0,
2074 obj_request->bio_list, length);
2075 else
2076 osd_req_op_extent_osd_data_pages(osd_req, 0,
2077 obj_request->pages, length,
2078 offset & ~PAGE_MASK, false, false);
Alex Elder9d4df012013-04-19 15:34:50 -05002079
2080 if (write_request)
2081 rbd_osd_req_format_write(obj_request);
2082 else
2083 rbd_osd_req_format_read(obj_request);
Alex Elder430c28c2013-04-03 21:32:51 -05002084
Alex Elder7da22d22013-01-24 16:13:36 -06002085 obj_request->img_offset = img_offset;
Alex Elderbf0d5f502012-11-22 00:00:08 -06002086 rbd_img_obj_request_add(img_request, obj_request);
2087
Alex Elder7da22d22013-01-24 16:13:36 -06002088 img_offset += length;
Alex Elderbf0d5f502012-11-22 00:00:08 -06002089 resid -= length;
2090 }
2091
2092 return 0;
2093
2094out_partial:
2095 rbd_obj_request_put(obj_request);
2096out_unwind:
2097 for_each_obj_request_safe(img_request, obj_request, next_obj_request)
2098 rbd_obj_request_put(obj_request);
2099
2100 return -ENOMEM;
2101}
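
/*
 * Worked example of the segmentation above, assuming the default
 * 4 MiB objects (obj_order 22): a 6 MiB image request starting at
 * image offset 3 MiB spans three objects, and so becomes three
 * object requests--the last 1 MiB of the first object, all 4 MiB
 * of the second, and the first 1 MiB of the third.
 */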
2102
Alex Elder3d7efd12013-04-19 15:34:50 -05002103static void
Alex Elder0eefd472013-04-19 15:34:50 -05002104rbd_img_obj_copyup_callback(struct rbd_obj_request *obj_request)
2105{
2106 struct rbd_img_request *img_request;
2107 struct rbd_device *rbd_dev;
2108 u64 length;
2109 u32 page_count;
2110
2111 rbd_assert(obj_request->type == OBJ_REQUEST_BIO);
2112 rbd_assert(obj_request_img_data_test(obj_request));
2113 img_request = obj_request->img_request;
2114 rbd_assert(img_request);
2115
2116 rbd_dev = img_request->rbd_dev;
2117 rbd_assert(rbd_dev);
2118 length = (u64)1 << rbd_dev->header.obj_order;
2119 page_count = (u32)calc_pages_for(0, length);
2120
2121 rbd_assert(obj_request->copyup_pages);
2122 ceph_release_page_vector(obj_request->copyup_pages, page_count);
2123 obj_request->copyup_pages = NULL;
2124
2125 /*
2126 * We want the transfer count to reflect the size of the
2127 * original write request. There is no such thing as a
2128 * successful short write, so if the request was successful
2129 * we can just set it to the originally-requested length.
2130 */
2131 if (!obj_request->result)
2132 obj_request->xferred = obj_request->length;
2133
2134 /* Finish up with the normal image object callback */
2135
2136 rbd_img_obj_callback(obj_request);
2137}
2138
2139static void
Alex Elder3d7efd12013-04-19 15:34:50 -05002140rbd_img_obj_parent_read_full_callback(struct rbd_img_request *img_request)
2141{
2142 struct rbd_obj_request *orig_request;
Alex Elder0eefd472013-04-19 15:34:50 -05002143 struct ceph_osd_request *osd_req;
2144 struct ceph_osd_client *osdc;
2145 struct rbd_device *rbd_dev;
Alex Elder3d7efd12013-04-19 15:34:50 -05002146 struct page **pages;
Alex Elder3d7efd12013-04-19 15:34:50 -05002147 int result;
2148 u64 obj_size;
2149 u64 xferred;
2150
2151 rbd_assert(img_request_child_test(img_request));
2152
2153 /* First get what we need from the image request */
2154
2155 pages = img_request->copyup_pages;
2156 rbd_assert(pages != NULL);
2157 img_request->copyup_pages = NULL;
2158
2159 orig_request = img_request->obj_request;
2160 rbd_assert(orig_request != NULL);
Alex Elder0eefd472013-04-19 15:34:50 -05002161 rbd_assert(orig_request->type == OBJ_REQUEST_BIO);
Alex Elder3d7efd12013-04-19 15:34:50 -05002162 result = img_request->result;
2163 obj_size = img_request->length;
2164 xferred = img_request->xferred;
2165
Alex Elder0eefd472013-04-19 15:34:50 -05002166 rbd_dev = img_request->rbd_dev;
2167 rbd_assert(rbd_dev);
2168 rbd_assert(obj_size == (u64)1 << rbd_dev->header.obj_order);
2169
Alex Elder3d7efd12013-04-19 15:34:50 -05002170 rbd_img_request_put(img_request);
2171
Alex Elder0eefd472013-04-19 15:34:50 -05002172 if (result)
2173 goto out_err;
Alex Elder3d7efd12013-04-19 15:34:50 -05002174
Alex Elder0eefd472013-04-19 15:34:50 -05002175 /* Allocate the new copyup osd request for the original request */
Alex Elder3d7efd12013-04-19 15:34:50 -05002176
Alex Elder0eefd472013-04-19 15:34:50 -05002177 result = -ENOMEM;
2178 rbd_assert(!orig_request->osd_req);
2179 osd_req = rbd_osd_req_create_copyup(orig_request);
2180 if (!osd_req)
2181 goto out_err;
2182 orig_request->osd_req = osd_req;
2183 orig_request->copyup_pages = pages;
Alex Elder3d7efd12013-04-19 15:34:50 -05002184
Alex Elder0eefd472013-04-19 15:34:50 -05002185 /* Initialize the copyup op */
2186
2187 osd_req_op_cls_init(osd_req, 0, CEPH_OSD_OP_CALL, "rbd", "copyup");
2188 osd_req_op_cls_request_data_pages(osd_req, 0, pages, obj_size, 0,
2189 false, false);
2190
2191 /* Then the original write request op */
2192
2193 osd_req_op_extent_init(osd_req, 1, CEPH_OSD_OP_WRITE,
2194 orig_request->offset,
2195 orig_request->length, 0, 0);
2196 osd_req_op_extent_osd_data_bio(osd_req, 1, orig_request->bio_list,
2197 orig_request->length);
2198
2199 rbd_osd_req_format_write(orig_request);
2200
2201 /* All set, send it off. */
2202
2203 orig_request->callback = rbd_img_obj_copyup_callback;
2204 osdc = &rbd_dev->rbd_client->client->osdc;
2205 result = rbd_obj_request_submit(osdc, orig_request);
2206 if (!result)
2207 return;
2208out_err:
2209 /* Record the error code and complete the request */
2210
2211 orig_request->result = result;
2212 orig_request->xferred = 0;
2213 obj_request_done_set(orig_request);
2214 rbd_obj_request_complete(orig_request);
Alex Elder3d7efd12013-04-19 15:34:50 -05002215}
2216
2217/*
2218 * Read from the parent image the range of data that covers the
2219 * entire target of the given object request. This is used for
2220 * satisfying a layered image write request when the target of an
2221 * object request from the image request does not exist.
2222 *
2223 * A page array big enough to hold the returned data is allocated
2224 * and supplied to rbd_img_request_fill() as the "data descriptor."
2225 * When the read completes, this page array will be transferred to
2226 * the original object request for the copyup operation.
2227 *
2228 * If an error occurs, record it as the result of the original
2229 * object request and mark it done so it gets completed.
2230 */
2231static int rbd_img_obj_parent_read_full(struct rbd_obj_request *obj_request)
2232{
2233 struct rbd_img_request *img_request = NULL;
2234 struct rbd_img_request *parent_request = NULL;
2235 struct rbd_device *rbd_dev;
2236 u64 img_offset;
2237 u64 length;
2238 struct page **pages = NULL;
2239 u32 page_count;
2240 int result;
2241
2242 rbd_assert(obj_request_img_data_test(obj_request));
2243 rbd_assert(obj_request->type == OBJ_REQUEST_BIO);
2244
2245 img_request = obj_request->img_request;
2246 rbd_assert(img_request != NULL);
2247 rbd_dev = img_request->rbd_dev;
2248 rbd_assert(rbd_dev->parent != NULL);
2249
2250 /*
Alex Elder0eefd472013-04-19 15:34:50 -05002251 * First things first. The original osd request is of no
2252	 * use to us any more; we'll need a new one that can hold
2253 * the two ops in a copyup request. We'll get that later,
2254 * but for now we can release the old one.
2255 */
2256 rbd_osd_req_destroy(obj_request->osd_req);
2257 obj_request->osd_req = NULL;
2258
2259 /*
Alex Elder3d7efd12013-04-19 15:34:50 -05002260 * Determine the byte range covered by the object in the
2261 * child image to which the original request was to be sent.
2262 */
2263 img_offset = obj_request->img_offset - obj_request->offset;
2264 length = (u64)1 << rbd_dev->header.obj_order;
2265
2266 /*
Alex Eldera9e8ba2c2013-04-21 00:32:07 -05002267 * There is no defined parent data beyond the parent
2268 * overlap, so limit what we read at that boundary if
2269 * necessary.
2270 */
2271 if (img_offset + length > rbd_dev->parent_overlap) {
2272 rbd_assert(img_offset < rbd_dev->parent_overlap);
2273 length = rbd_dev->parent_overlap - img_offset;
2274 }
2275
2276 /*
Alex Elder3d7efd12013-04-19 15:34:50 -05002277 * Allocate a page array big enough to receive the data read
2278 * from the parent.
2279 */
2280 page_count = (u32)calc_pages_for(0, length);
2281 pages = ceph_alloc_page_vector(page_count, GFP_KERNEL);
2282 if (IS_ERR(pages)) {
2283 result = PTR_ERR(pages);
2284 pages = NULL;
2285 goto out_err;
2286 }
2287
2288 result = -ENOMEM;
2289 parent_request = rbd_img_request_create(rbd_dev->parent,
2290 img_offset, length,
2291 false, true);
2292 if (!parent_request)
2293 goto out_err;
2294 rbd_obj_request_get(obj_request);
2295 parent_request->obj_request = obj_request;
2296
2297 result = rbd_img_request_fill(parent_request, OBJ_REQUEST_PAGES, pages);
2298 if (result)
2299 goto out_err;
2300 parent_request->copyup_pages = pages;
2301
2302 parent_request->callback = rbd_img_obj_parent_read_full_callback;
2303 result = rbd_img_request_submit(parent_request);
2304 if (!result)
2305 return 0;
2306
2307 parent_request->copyup_pages = NULL;
2308 parent_request->obj_request = NULL;
2309 rbd_obj_request_put(obj_request);
2310out_err:
2311 if (pages)
2312 ceph_release_page_vector(pages, page_count);
2313 if (parent_request)
2314 rbd_img_request_put(parent_request);
2315 obj_request->result = result;
2316 obj_request->xferred = 0;
2317 obj_request_done_set(obj_request);
2318
2319 return result;
2320}
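
/*
 * To summarize the copyup write path: (1) the target object of a
 * layered write is found not to exist; (2) the covering range is
 * read from the parent into a page array here; (3) the read
 * callback above builds a two-op osd request (copyup method call
 * plus the original write) and submits it; and (4)
 * rbd_img_obj_copyup_callback() releases the pages and completes
 * the original object request.
 */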
2321
Alex Elderc5b5ef62013-02-11 12:33:24 -06002322static void rbd_img_obj_exists_callback(struct rbd_obj_request *obj_request)
2323{
Alex Elderc5b5ef62013-02-11 12:33:24 -06002324 struct rbd_obj_request *orig_request;
2325 int result;
2326
2327 rbd_assert(!obj_request_img_data_test(obj_request));
2328
2329 /*
2330 * All we need from the object request is the original
2331 * request and the result of the STAT op. Grab those, then
2332 * we're done with the request.
2333 */
2334 orig_request = obj_request->obj_request;
2335 obj_request->obj_request = NULL;
2336 rbd_assert(orig_request);
2337 rbd_assert(orig_request->img_request);
2338
2339 result = obj_request->result;
2340 obj_request->result = 0;
2341
2342 dout("%s: obj %p for obj %p result %d %llu/%llu\n", __func__,
2343 obj_request, orig_request, result,
2344 obj_request->xferred, obj_request->length);
2345 rbd_obj_request_put(obj_request);
2346
2347 rbd_assert(orig_request);
2348 rbd_assert(orig_request->img_request);
Alex Elderc5b5ef62013-02-11 12:33:24 -06002349
2350 /*
2351 * Our only purpose here is to determine whether the object
2352 * exists, and we don't want to treat the non-existence as
2353 * an error. If something else comes back, transfer the
2354 * error to the original request and complete it now.
2355 */
2356 if (!result) {
2357 obj_request_existence_set(orig_request, true);
2358 } else if (result == -ENOENT) {
2359 obj_request_existence_set(orig_request, false);
2360 } else if (result) {
2361 orig_request->result = result;
Alex Elder3d7efd12013-04-19 15:34:50 -05002362 goto out;
Alex Elderc5b5ef62013-02-11 12:33:24 -06002363 }
2364
2365 /*
2366 * Resubmit the original request now that we have recorded
2367 * whether the target object exists.
2368 */
Alex Elderb454e362013-04-19 15:34:50 -05002369 orig_request->result = rbd_img_obj_request_submit(orig_request);
Alex Elder3d7efd12013-04-19 15:34:50 -05002370out:
Alex Elderc5b5ef62013-02-11 12:33:24 -06002371 if (orig_request->result)
2372 rbd_obj_request_complete(orig_request);
2373 rbd_obj_request_put(orig_request);
2374}
2375
2376static int rbd_img_obj_exists_submit(struct rbd_obj_request *obj_request)
2377{
2378 struct rbd_obj_request *stat_request;
2379 struct rbd_device *rbd_dev;
2380 struct ceph_osd_client *osdc;
2381 struct page **pages = NULL;
2382 u32 page_count;
2383 size_t size;
2384 int ret;
2385
2386 /*
2387 * The response data for a STAT call consists of:
2388 * le64 length;
2389 * struct {
2390 * le32 tv_sec;
2391 * le32 tv_nsec;
2392 * } mtime;
2393 */
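	/* That is 16 bytes in all, so one page is always enough. */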
2394 size = sizeof (__le64) + sizeof (__le32) + sizeof (__le32);
2395 page_count = (u32)calc_pages_for(0, size);
2396 pages = ceph_alloc_page_vector(page_count, GFP_KERNEL);
2397 if (IS_ERR(pages))
2398 return PTR_ERR(pages);
2399
2400 ret = -ENOMEM;
2401 stat_request = rbd_obj_request_create(obj_request->object_name, 0, 0,
2402 OBJ_REQUEST_PAGES);
2403 if (!stat_request)
2404 goto out;
2405
2406 rbd_obj_request_get(obj_request);
2407 stat_request->obj_request = obj_request;
2408 stat_request->pages = pages;
2409 stat_request->page_count = page_count;
2410
2411 rbd_assert(obj_request->img_request);
2412 rbd_dev = obj_request->img_request->rbd_dev;
2413 stat_request->osd_req = rbd_osd_req_create(rbd_dev, false,
2414 stat_request);
2415 if (!stat_request->osd_req)
2416 goto out;
2417 stat_request->callback = rbd_img_obj_exists_callback;
2418
2419 osd_req_op_init(stat_request->osd_req, 0, CEPH_OSD_OP_STAT);
2420 osd_req_op_raw_data_in_pages(stat_request->osd_req, 0, pages, size, 0,
2421 false, false);
Alex Elder9d4df012013-04-19 15:34:50 -05002422 rbd_osd_req_format_read(stat_request);
Alex Elderc5b5ef62013-02-11 12:33:24 -06002423
2424 osdc = &rbd_dev->rbd_client->client->osdc;
2425 ret = rbd_obj_request_submit(osdc, stat_request);
2426out:
2427 if (ret)
2428 rbd_obj_request_put(obj_request);
2429
2430 return ret;
2431}
2432
Alex Elderb454e362013-04-19 15:34:50 -05002433static int rbd_img_obj_request_submit(struct rbd_obj_request *obj_request)
2434{
2435 struct rbd_img_request *img_request;
Alex Eldera9e8ba2c2013-04-21 00:32:07 -05002436 struct rbd_device *rbd_dev;
Alex Elder3d7efd12013-04-19 15:34:50 -05002437 bool known;
Alex Elderb454e362013-04-19 15:34:50 -05002438
2439 rbd_assert(obj_request_img_data_test(obj_request));
2440
2441 img_request = obj_request->img_request;
2442 rbd_assert(img_request);
Alex Eldera9e8ba2c2013-04-21 00:32:07 -05002443 rbd_dev = img_request->rbd_dev;
Alex Elderb454e362013-04-19 15:34:50 -05002444
Alex Elderb454e362013-04-19 15:34:50 -05002445 /*
Alex Eldera9e8ba2c2013-04-21 00:32:07 -05002446 * Only writes to layered images need special handling.
2447 * Reads and non-layered writes are simple object requests.
2448 * Layered writes that start beyond the end of the overlap
2449 * with the parent have no parent data, so they too are
2450 * simple object requests. Finally, if the target object is
2451 * known to already exist, its parent data has already been
2452 * copied, so a write to the object can also be handled as a
2453 * simple object request.
Alex Elderb454e362013-04-19 15:34:50 -05002454 */
2455 if (!img_request_write_test(img_request) ||
2456 !img_request_layered_test(img_request) ||
Alex Eldera9e8ba2c2013-04-21 00:32:07 -05002457 rbd_dev->parent_overlap <= obj_request->img_offset ||
Alex Elder3d7efd12013-04-19 15:34:50 -05002458 ((known = obj_request_known_test(obj_request)) &&
2459 obj_request_exists_test(obj_request))) {
Alex Elderb454e362013-04-19 15:34:50 -05002460
2461 struct rbd_device *rbd_dev;
2462 struct ceph_osd_client *osdc;
2463
2464 rbd_dev = obj_request->img_request->rbd_dev;
2465 osdc = &rbd_dev->rbd_client->client->osdc;
2466
2467 return rbd_obj_request_submit(osdc, obj_request);
2468 }
2469
2470 /*
Alex Elder3d7efd12013-04-19 15:34:50 -05002471 * It's a layered write. The target object might exist but
2472 * we may not know that yet. If we know it doesn't exist,
2473 * start by reading the data for the full target object from
2474 * the parent so we can use it for a copyup to the target.
Alex Elderb454e362013-04-19 15:34:50 -05002475 */
Alex Elder3d7efd12013-04-19 15:34:50 -05002476 if (known)
2477 return rbd_img_obj_parent_read_full(obj_request);
2478
2479 /* We don't know whether the target exists. Go find out. */
Alex Elderb454e362013-04-19 15:34:50 -05002480
2481 return rbd_img_obj_exists_submit(obj_request);
2482}
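
/*
 * Decision summary for a layered write within the parent overlap:
 *
 *	target known to exist	 - submit as a plain object request
 *	known not to exist	 - rbd_img_obj_parent_read_full()
 *	existence not yet known	 - rbd_img_obj_exists_submit() issues
 *				   a STAT; its callback resubmits here
 *
 * Everything else (reads, non-layered writes, and writes at or
 * beyond the overlap) goes straight to the osd as a plain request.
 */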
2483
Alex Elderbf0d5f502012-11-22 00:00:08 -06002484static int rbd_img_request_submit(struct rbd_img_request *img_request)
2485{
Alex Elderbf0d5f502012-11-22 00:00:08 -06002486 struct rbd_obj_request *obj_request;
Alex Elder46faeed2013-04-10 17:47:46 -05002487 struct rbd_obj_request *next_obj_request;
Alex Elderbf0d5f502012-11-22 00:00:08 -06002488
Alex Elder37206ee2013-02-20 17:32:08 -06002489 dout("%s: img %p\n", __func__, img_request);
Alex Elder46faeed2013-04-10 17:47:46 -05002490 for_each_obj_request_safe(img_request, obj_request, next_obj_request) {
Alex Elderbf0d5f502012-11-22 00:00:08 -06002491 int ret;
2492
Alex Elderb454e362013-04-19 15:34:50 -05002493 ret = rbd_img_obj_request_submit(obj_request);
Alex Elderbf0d5f502012-11-22 00:00:08 -06002494 if (ret)
2495 return ret;
Alex Elderbf0d5f502012-11-22 00:00:08 -06002496 }
2497
2498 return 0;
2499}
2500
Alex Elder8b3e1a52013-01-24 16:13:36 -06002501static void rbd_img_parent_read_callback(struct rbd_img_request *img_request)
2502{
2503 struct rbd_obj_request *obj_request;
Alex Eldera9e8ba2c2013-04-21 00:32:07 -05002504 struct rbd_device *rbd_dev;
2505 u64 obj_end;
Alex Elder8b3e1a52013-01-24 16:13:36 -06002506
2507 rbd_assert(img_request_child_test(img_request));
2508
2509 obj_request = img_request->obj_request;
Alex Eldera9e8ba2c2013-04-21 00:32:07 -05002510 rbd_assert(obj_request);
2511 rbd_assert(obj_request->img_request);
Alex Elder8b3e1a52013-01-24 16:13:36 -06002512
Alex Eldera9e8ba2c2013-04-21 00:32:07 -05002513 obj_request->result = img_request->result;
2514 if (obj_request->result)
2515 goto out;
2516
2517 /*
2518 * We need to zero anything beyond the parent overlap
2519 * boundary. Since rbd_img_obj_request_read_callback()
2520 * will zero anything beyond the end of a short read, an
2521 * easy way to do this is to pretend the data from the
2522 * parent came up short--ending at the overlap boundary.
2523 */
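	/*
	 * For example (illustrative numbers): with a 4 MiB parent
	 * overlap, a child read covering image range [3 MiB, 5 MiB)
	 * takes at most 1 MiB from the parent; the remainder is
	 * zero-filled by the read callback below.
	 */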
2524 rbd_assert(obj_request->img_offset < U64_MAX - obj_request->length);
2525 obj_end = obj_request->img_offset + obj_request->length;
2526 rbd_dev = obj_request->img_request->rbd_dev;
2527 if (obj_end > rbd_dev->parent_overlap) {
2528 u64 xferred = 0;
2529
2530 if (obj_request->img_offset < rbd_dev->parent_overlap)
2531 xferred = rbd_dev->parent_overlap -
2532 obj_request->img_offset;
2533
2534 obj_request->xferred = min(img_request->xferred, xferred);
2535 } else {
2536 obj_request->xferred = img_request->xferred;
2537 }
2538out:
Alex Elder8b3e1a52013-01-24 16:13:36 -06002539 rbd_img_obj_request_read_callback(obj_request);
2540 rbd_obj_request_complete(obj_request);
2541}
2542
2543static void rbd_img_parent_read(struct rbd_obj_request *obj_request)
2544{
2545 struct rbd_device *rbd_dev;
2546 struct rbd_img_request *img_request;
2547 int result;
2548
2549 rbd_assert(obj_request_img_data_test(obj_request));
2550 rbd_assert(obj_request->img_request != NULL);
2551 rbd_assert(obj_request->result == (s32) -ENOENT);
2552 rbd_assert(obj_request->type == OBJ_REQUEST_BIO);
2553
2554 rbd_dev = obj_request->img_request->rbd_dev;
2555 rbd_assert(rbd_dev->parent != NULL);
2556 /* rbd_read_finish(obj_request, obj_request->length); */
2557 img_request = rbd_img_request_create(rbd_dev->parent,
2558 obj_request->img_offset,
2559 obj_request->length,
2560 false, true);
2561 result = -ENOMEM;
2562 if (!img_request)
2563 goto out_err;
2564
2565 rbd_obj_request_get(obj_request);
2566 img_request->obj_request = obj_request;
2567
Alex Elderf1a47392013-04-19 15:34:50 -05002568 result = rbd_img_request_fill(img_request, OBJ_REQUEST_BIO,
2569 obj_request->bio_list);
Alex Elder8b3e1a52013-01-24 16:13:36 -06002570 if (result)
2571 goto out_err;
2572
2573 img_request->callback = rbd_img_parent_read_callback;
2574 result = rbd_img_request_submit(img_request);
2575 if (result)
2576 goto out_err;
2577
2578 return;
2579out_err:
2580 if (img_request)
2581 rbd_img_request_put(img_request);
2582 obj_request->result = result;
2583 obj_request->xferred = 0;
2584 obj_request_done_set(obj_request);
2585}
2586
Alex Eldercc4a38bd2013-04-30 00:44:33 -05002587static int rbd_obj_notify_ack(struct rbd_device *rbd_dev, u64 notify_id)
Alex Elderb8d70032012-11-30 17:53:04 -06002588{
2589 struct rbd_obj_request *obj_request;
Alex Elder21692382013-04-05 01:27:12 -05002590 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
Alex Elderb8d70032012-11-30 17:53:04 -06002591 int ret;
2592
2593 obj_request = rbd_obj_request_create(rbd_dev->header_name, 0, 0,
2594 OBJ_REQUEST_NODATA);
2595 if (!obj_request)
2596 return -ENOMEM;
2597
2598 ret = -ENOMEM;
Alex Elder430c28c2013-04-03 21:32:51 -05002599 obj_request->osd_req = rbd_osd_req_create(rbd_dev, false, obj_request);
Alex Elderb8d70032012-11-30 17:53:04 -06002600 if (!obj_request->osd_req)
2601 goto out;
Alex Elder21692382013-04-05 01:27:12 -05002602 obj_request->callback = rbd_obj_request_put;
Alex Elderb8d70032012-11-30 17:53:04 -06002603
Alex Elderc99d2d42013-04-05 01:27:11 -05002604 osd_req_op_watch_init(obj_request->osd_req, 0, CEPH_OSD_OP_NOTIFY_ACK,
Alex Eldercc4a38bd2013-04-30 00:44:33 -05002605 notify_id, 0, 0);
Alex Elder9d4df012013-04-19 15:34:50 -05002606 rbd_osd_req_format_read(obj_request);
Alex Elder430c28c2013-04-03 21:32:51 -05002607
Alex Elderb8d70032012-11-30 17:53:04 -06002608 ret = rbd_obj_request_submit(osdc, obj_request);
Alex Elderb8d70032012-11-30 17:53:04 -06002609out:
Alex Eldercf81b602013-01-17 12:18:46 -06002610 if (ret)
2611 rbd_obj_request_put(obj_request);
Alex Elderb8d70032012-11-30 17:53:04 -06002612
2613 return ret;
2614}
2615
2616static void rbd_watch_cb(u64 ver, u64 notify_id, u8 opcode, void *data)
2617{
2618 struct rbd_device *rbd_dev = (struct rbd_device *)data;
Alex Elderb8d70032012-11-30 17:53:04 -06002619
2620 if (!rbd_dev)
2621 return;
2622
Alex Elder37206ee2013-02-20 17:32:08 -06002623 dout("%s: \"%s\" notify_id %llu opcode %u\n", __func__,
Alex Eldercc4a38bd2013-04-30 00:44:33 -05002624 rbd_dev->header_name, (unsigned long long)notify_id,
2625 (unsigned int)opcode);
2626 (void)rbd_dev_refresh(rbd_dev);
Alex Elderb8d70032012-11-30 17:53:04 -06002627
Alex Eldercc4a38bd2013-04-30 00:44:33 -05002628 rbd_obj_notify_ack(rbd_dev, notify_id);
Alex Elderb8d70032012-11-30 17:53:04 -06002629}
2630
Alex Elder9969ebc2013-01-18 12:31:10 -06002631/*
2632 * Request sync osd watch/unwatch. The value of "start" determines
2633 * whether a watch request is being initiated or torn down.
2634 */
2635static int rbd_dev_header_watch_sync(struct rbd_device *rbd_dev, int start)
2636{
2637 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
2638 struct rbd_obj_request *obj_request;
Alex Elder9969ebc2013-01-18 12:31:10 -06002639 int ret;
2640
2641 rbd_assert(start ^ !!rbd_dev->watch_event);
2642 rbd_assert(start ^ !!rbd_dev->watch_request);
2643
2644 if (start) {
Alex Elder3c663bb2013-02-15 11:42:30 -06002645 ret = ceph_osdc_create_event(osdc, rbd_watch_cb, rbd_dev,
Alex Elder9969ebc2013-01-18 12:31:10 -06002646 &rbd_dev->watch_event);
2647 if (ret < 0)
2648 return ret;
Alex Elder8eb87562013-01-25 17:08:55 -06002649 rbd_assert(rbd_dev->watch_event != NULL);
Alex Elder9969ebc2013-01-18 12:31:10 -06002650 }
2651
2652 ret = -ENOMEM;
2653 obj_request = rbd_obj_request_create(rbd_dev->header_name, 0, 0,
2654 OBJ_REQUEST_NODATA);
2655 if (!obj_request)
2656 goto out_cancel;
2657
Alex Elder430c28c2013-04-03 21:32:51 -05002658 obj_request->osd_req = rbd_osd_req_create(rbd_dev, true, obj_request);
2659 if (!obj_request->osd_req)
2660 goto out_cancel;
2661
Alex Elder8eb87562013-01-25 17:08:55 -06002662 if (start)
Alex Elder975241a2013-01-25 17:08:55 -06002663 ceph_osdc_set_request_linger(osdc, obj_request->osd_req);
Alex Elder8eb87562013-01-25 17:08:55 -06002664 else
Alex Elder6977c3f2013-01-25 17:08:55 -06002665 ceph_osdc_unregister_linger_request(osdc,
Alex Elder975241a2013-01-25 17:08:55 -06002666 rbd_dev->watch_request->osd_req);
Alex Elder21692382013-04-05 01:27:12 -05002667
2668 osd_req_op_watch_init(obj_request->osd_req, 0, CEPH_OSD_OP_WATCH,
Alex Elderb21ebdd2013-04-30 00:44:32 -05002669 rbd_dev->watch_event->cookie, 0, start);
Alex Elder9d4df012013-04-19 15:34:50 -05002670 rbd_osd_req_format_write(obj_request);
Alex Elder21692382013-04-05 01:27:12 -05002671
Alex Elder9969ebc2013-01-18 12:31:10 -06002672 ret = rbd_obj_request_submit(osdc, obj_request);
2673 if (ret)
2674 goto out_cancel;
2675 ret = rbd_obj_request_wait(obj_request);
2676 if (ret)
2677 goto out_cancel;
Alex Elder9969ebc2013-01-18 12:31:10 -06002678 ret = obj_request->result;
2679 if (ret)
2680 goto out_cancel;
2681
Alex Elder8eb87562013-01-25 17:08:55 -06002682 /*
2683 * A watch request is set to linger, so the underlying osd
2684 * request won't go away until we unregister it. We retain
2685 * a pointer to the object request during that time (in
2686 * rbd_dev->watch_request), so we'll keep a reference to
2687 * it. We'll drop that reference (below) after we've
2688 * unregistered it.
2689 */
2690 if (start) {
2691 rbd_dev->watch_request = obj_request;
2692
2693 return 0;
2694 }
2695
2696 /* We have successfully torn down the watch request */
2697
2698 rbd_obj_request_put(rbd_dev->watch_request);
2699 rbd_dev->watch_request = NULL;
Alex Elder9969ebc2013-01-18 12:31:10 -06002700out_cancel:
2701 /* Cancel the event if we're tearing down, or on error */
2702 ceph_osdc_cancel_event(rbd_dev->watch_event);
2703 rbd_dev->watch_event = NULL;
Alex Elder9969ebc2013-01-18 12:31:10 -06002704 if (obj_request)
2705 rbd_obj_request_put(obj_request);
2706
2707 return ret;
2708}
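
/*
 * Usage sketch: a nonzero "start" registers the watch (done when an
 * image gets mapped); zero tears it down again (done at unmap time):
 *
 *	ret = rbd_dev_header_watch_sync(rbd_dev, 1);	<- register
 *	...
 *	ret = rbd_dev_header_watch_sync(rbd_dev, 0);	<- tear down
 */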
2709
Alex Elder36be9a72013-01-19 00:30:28 -06002710/*
Alex Elderf40eb342013-04-25 15:09:42 -05002711 * Synchronous osd object method call. Returns the number of bytes
2712 * returned in the inbound buffer, or a negative error code.
Alex Elder36be9a72013-01-19 00:30:28 -06002713 */
2714static int rbd_obj_method_sync(struct rbd_device *rbd_dev,
2715 const char *object_name,
2716 const char *class_name,
2717 const char *method_name,
Alex Elder41579762013-04-21 12:14:45 -05002718 const void *outbound,
Alex Elder36be9a72013-01-19 00:30:28 -06002719 size_t outbound_size,
Alex Elder41579762013-04-21 12:14:45 -05002720 void *inbound,
Alex Eldere2a58ee2013-04-30 00:44:33 -05002721 size_t inbound_size)
Alex Elder36be9a72013-01-19 00:30:28 -06002722{
Alex Elder21692382013-04-05 01:27:12 -05002723 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
Alex Elder36be9a72013-01-19 00:30:28 -06002724 struct rbd_obj_request *obj_request;
Alex Elder36be9a72013-01-19 00:30:28 -06002725 struct page **pages;
2726 u32 page_count;
2727 int ret;
2728
2729 /*
Alex Elder6010a452013-04-05 01:27:11 -05002730 * Method calls are ultimately read operations. The result
2731 * should be placed into the inbound buffer provided.  Callers
2732 * may also supply outbound data--parameters for the object
2733 * method.  Currently if such data is present it will be a
2734 * snapshot id.
Alex Elder36be9a72013-01-19 00:30:28 -06002735 */
Alex Elder57385b52013-04-21 12:14:45 -05002736 page_count = (u32)calc_pages_for(0, inbound_size);
Alex Elder36be9a72013-01-19 00:30:28 -06002737 pages = ceph_alloc_page_vector(page_count, GFP_KERNEL);
2738 if (IS_ERR(pages))
2739 return PTR_ERR(pages);
2740
2741 ret = -ENOMEM;
Alex Elder6010a452013-04-05 01:27:11 -05002742 obj_request = rbd_obj_request_create(object_name, 0, inbound_size,
Alex Elder36be9a72013-01-19 00:30:28 -06002743 OBJ_REQUEST_PAGES);
2744 if (!obj_request)
2745 goto out;
2746
2747 obj_request->pages = pages;
2748 obj_request->page_count = page_count;
2749
Alex Elder430c28c2013-04-03 21:32:51 -05002750 obj_request->osd_req = rbd_osd_req_create(rbd_dev, false, obj_request);
Alex Elder36be9a72013-01-19 00:30:28 -06002751 if (!obj_request->osd_req)
2752 goto out;
2753
Alex Elderc99d2d42013-04-05 01:27:11 -05002754 osd_req_op_cls_init(obj_request->osd_req, 0, CEPH_OSD_OP_CALL,
Alex Elder04017e22013-04-05 14:46:02 -05002755 class_name, method_name);
2756 if (outbound_size) {
2757 struct ceph_pagelist *pagelist;
2758
2759 pagelist = kmalloc(sizeof (*pagelist), GFP_NOFS);
2760 if (!pagelist)
2761 goto out;
2762
2763 ceph_pagelist_init(pagelist);
2764 ceph_pagelist_append(pagelist, outbound, outbound_size);
2765 osd_req_op_cls_request_data_pagelist(obj_request->osd_req, 0,
2766 pagelist);
2767 }
Alex Eldera4ce40a2013-04-05 01:27:12 -05002768 osd_req_op_cls_response_data_pages(obj_request->osd_req, 0,
2769 obj_request->pages, inbound_size,
Alex Elder44cd1882013-04-05 01:27:12 -05002770 0, false, false);
Alex Elder9d4df012013-04-19 15:34:50 -05002771 rbd_osd_req_format_read(obj_request);
Alex Elder430c28c2013-04-03 21:32:51 -05002772
Alex Elder36be9a72013-01-19 00:30:28 -06002773 ret = rbd_obj_request_submit(osdc, obj_request);
2774 if (ret)
2775 goto out;
2776 ret = rbd_obj_request_wait(obj_request);
2777 if (ret)
2778 goto out;
2779
2780 ret = obj_request->result;
2781 if (ret < 0)
2782 goto out;
Alex Elder57385b52013-04-21 12:14:45 -05002783
2784 rbd_assert(obj_request->xferred < (u64)INT_MAX);
2785 ret = (int)obj_request->xferred;
Alex Elder903bb322013-02-06 13:11:38 -06002786 ceph_copy_from_page_vector(pages, inbound, 0, obj_request->xferred);
Alex Elder36be9a72013-01-19 00:30:28 -06002787out:
2788 if (obj_request)
2789 rbd_obj_request_put(obj_request);
2790 else
2791 ceph_release_page_vector(pages, page_count);
2792
2793 return ret;
2794}
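
/*
 * Example call (this mirrors _rbd_dev_v2_snap_size() below): fetch
 * the size of the base image by invoking the "get_size" method of
 * the "rbd" object class on the header object:
 *
 *	__le64 snapid = cpu_to_le64(CEPH_NOSNAP);
 *
 *	ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
 *				"rbd", "get_size",
 *				&snapid, sizeof (snapid),
 *				&size_buf, sizeof (size_buf));
 *
 * A non-negative return is the number of reply bytes in size_buf.
 */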
2795
Alex Elderbf0d5f502012-11-22 00:00:08 -06002796static void rbd_request_fn(struct request_queue *q)
Alex Eldercc344fa2013-02-19 12:25:56 -06002797 __releases(q->queue_lock) __acquires(q->queue_lock)
Alex Elderbf0d5f502012-11-22 00:00:08 -06002798{
2799 struct rbd_device *rbd_dev = q->queuedata;
2800 bool read_only = rbd_dev->mapping.read_only;
2801 struct request *rq;
2802 int result;
2803
2804 while ((rq = blk_fetch_request(q))) {
2805 bool write_request = rq_data_dir(rq) == WRITE;
2806 struct rbd_img_request *img_request;
2807 u64 offset;
2808 u64 length;
2809
2810 /* Ignore any non-FS requests that filter through. */
2811
2812 if (rq->cmd_type != REQ_TYPE_FS) {
Alex Elder4dda41d2013-02-20 21:59:33 -06002813 dout("%s: non-fs request type %d\n", __func__,
2814 (int) rq->cmd_type);
2815 __blk_end_request_all(rq, 0);
2816 continue;
2817 }
2818
2819 /* Ignore/skip any zero-length requests */
2820
2821 offset = (u64) blk_rq_pos(rq) << SECTOR_SHIFT;
2822 length = (u64) blk_rq_bytes(rq);
2823
2824 if (!length) {
2825 dout("%s: zero-length request\n", __func__);
Alex Elderbf0d5f502012-11-22 00:00:08 -06002826 __blk_end_request_all(rq, 0);
2827 continue;
2828 }
2829
2830 spin_unlock_irq(q->queue_lock);
2831
2832 /* Disallow writes to a read-only device */
2833
2834 if (write_request) {
2835 result = -EROFS;
2836 if (read_only)
2837 goto end_request;
2838 rbd_assert(rbd_dev->spec->snap_id == CEPH_NOSNAP);
2839 }
2840
Alex Elder6d292902013-01-14 12:43:31 -06002841 /*
2842 * Quit early if the mapped snapshot no longer
2843 * exists. It's still possible the snapshot will
2844 * have disappeared by the time our request arrives
2845 * at the osd, but there's no sense in sending it if
2846 * we already know.
2847 */
2848 if (!test_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags)) {
Alex Elderbf0d5f502012-11-22 00:00:08 -06002849			dout("request for non-existent snapshot\n");
2850 rbd_assert(rbd_dev->spec->snap_id != CEPH_NOSNAP);
2851 result = -ENXIO;
2852 goto end_request;
2853 }
2854
Alex Elderbf0d5f502012-11-22 00:00:08 -06002855 result = -EINVAL;
Alex Elderc0cd10db2013-04-26 09:43:47 -05002856 if (offset && length > U64_MAX - offset + 1) {
2857 rbd_warn(rbd_dev, "bad request range (%llu~%llu)\n",
2858 offset, length);
Alex Elderbf0d5f502012-11-22 00:00:08 -06002859 goto end_request; /* Shouldn't happen */
Alex Elderc0cd10db2013-04-26 09:43:47 -05002860 }
Alex Elderbf0d5f502012-11-22 00:00:08 -06002861
2862 result = -ENOMEM;
2863 img_request = rbd_img_request_create(rbd_dev, offset, length,
Alex Elder9849e982013-01-24 16:13:36 -06002864 write_request, false);
Alex Elderbf0d5f502012-11-22 00:00:08 -06002865 if (!img_request)
2866 goto end_request;
2867
2868 img_request->rq = rq;
2869
Alex Elderf1a47392013-04-19 15:34:50 -05002870 result = rbd_img_request_fill(img_request, OBJ_REQUEST_BIO,
2871 rq->bio);
Alex Elderbf0d5f502012-11-22 00:00:08 -06002872 if (!result)
2873 result = rbd_img_request_submit(img_request);
2874 if (result)
2875 rbd_img_request_put(img_request);
2876end_request:
2877 spin_lock_irq(q->queue_lock);
2878 if (result < 0) {
Alex Elder7da22d22013-01-24 16:13:36 -06002879 rbd_warn(rbd_dev, "%s %llx at %llx result %d\n",
2880 write_request ? "write" : "read",
2881 length, offset, result);
2882
Alex Elderbf0d5f502012-11-22 00:00:08 -06002883 __blk_end_request_all(rq, result);
2884 }
2885 }
2886}
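
/*
 * Worked example for the offset/length computation above: a request
 * starting at device sector 2048 with blk_rq_bytes() of 8192 maps
 * to offset 2048 << SECTOR_SHIFT = 1048576 and length 8192 bytes
 * within the image.
 */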
2887
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002888/*
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002889 * A queue callback. Makes sure that we don't create a bio that spans
2890 * multiple osd objects. One exception would be single-page bios,
Alex Elderf7760da2012-10-20 22:17:27 -05002891 * which we handle later at bio_chain_clone_range().
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002892 */
2893static int rbd_merge_bvec(struct request_queue *q, struct bvec_merge_data *bmd,
2894 struct bio_vec *bvec)
2895{
2896 struct rbd_device *rbd_dev = q->queuedata;
Alex Eldere5cfeed22012-10-20 22:17:27 -05002897 sector_t sector_offset;
2898 sector_t sectors_per_obj;
2899 sector_t obj_sector_offset;
2900 int ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002901
Alex Eldere5cfeed22012-10-20 22:17:27 -05002902 /*
2903	 * Find how far into its rbd object the bio's start sector
2904	 * falls.  The bio's sector is partition-relative, so first
2905	 * make it relative to the enclosing device.
2906 */
2907 sector_offset = get_start_sect(bmd->bi_bdev) + bmd->bi_sector;
2908 sectors_per_obj = 1 << (rbd_dev->header.obj_order - SECTOR_SHIFT);
2909 obj_sector_offset = sector_offset & (sectors_per_obj - 1);
Alex Elder593a9e72012-02-07 12:03:37 -06002910
Alex Eldere5cfeed22012-10-20 22:17:27 -05002911 /*
2912 * Compute the number of bytes from that offset to the end
2913 * of the object. Account for what's already used by the bio.
2914 */
2915 ret = (int) (sectors_per_obj - obj_sector_offset) << SECTOR_SHIFT;
2916 if (ret > bmd->bi_size)
2917 ret -= bmd->bi_size;
2918 else
2919 ret = 0;
2920
2921 /*
2922 * Don't send back more than was asked for. And if the bio
2923 * was empty, let the whole thing through because: "Note
2924 * that a block device *must* allow a single page to be
2925 * added to an empty bio."
2926 */
2927 rbd_assert(bvec->bv_len <= PAGE_SIZE);
2928 if (ret > (int) bvec->bv_len || !bmd->bi_size)
2929 ret = (int) bvec->bv_len;
2930
2931 return ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002932}
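
/*
 * Worked example: with the default object order of 22 (4 MiB
 * objects), sectors_per_obj is 1 << (22 - 9) = 8192.  A bio whose
 * device-relative start sector is 8190 lies two sectors short of an
 * object boundary, so at most (8192 - 8190) << SECTOR_SHIFT = 1024
 * bytes, less whatever the bio already holds, may be added to it.
 */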
2933
2934static void rbd_free_disk(struct rbd_device *rbd_dev)
2935{
2936 struct gendisk *disk = rbd_dev->disk;
2937
2938 if (!disk)
2939 return;
2940
Alex Eldera0cab922013-04-25 23:15:08 -05002941 rbd_dev->disk = NULL;
2942 if (disk->flags & GENHD_FL_UP) {
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002943 del_gendisk(disk);
Alex Eldera0cab922013-04-25 23:15:08 -05002944 if (disk->queue)
2945 blk_cleanup_queue(disk->queue);
2946 }
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002947 put_disk(disk);
2948}
2949
Alex Elder788e2df2013-01-17 12:25:27 -06002950static int rbd_obj_read_sync(struct rbd_device *rbd_dev,
2951 const char *object_name,
Alex Elder7097f8d2013-04-30 00:44:33 -05002952 u64 offset, u64 length, void *buf)
Alex Elder788e2df2013-01-17 12:25:27 -06002953
2954{
Alex Elder21692382013-04-05 01:27:12 -05002955 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
Alex Elder788e2df2013-01-17 12:25:27 -06002956 struct rbd_obj_request *obj_request;
Alex Elder788e2df2013-01-17 12:25:27 -06002957 struct page **pages = NULL;
2958 u32 page_count;
Alex Elder1ceae7e2013-02-06 13:11:38 -06002959 size_t size;
Alex Elder788e2df2013-01-17 12:25:27 -06002960 int ret;
2961
2962 page_count = (u32) calc_pages_for(offset, length);
2963 pages = ceph_alloc_page_vector(page_count, GFP_KERNEL);
2964	if (IS_ERR(pages))
2965		return PTR_ERR(pages);	/* nothing to clean up yet */
2966
2967 ret = -ENOMEM;
2968 obj_request = rbd_obj_request_create(object_name, offset, length,
Alex Elder36be9a72013-01-19 00:30:28 -06002969 OBJ_REQUEST_PAGES);
Alex Elder788e2df2013-01-17 12:25:27 -06002970 if (!obj_request)
2971 goto out;
2972
2973 obj_request->pages = pages;
2974 obj_request->page_count = page_count;
2975
Alex Elder430c28c2013-04-03 21:32:51 -05002976 obj_request->osd_req = rbd_osd_req_create(rbd_dev, false, obj_request);
Alex Elder788e2df2013-01-17 12:25:27 -06002977 if (!obj_request->osd_req)
2978 goto out;
2979
Alex Elderc99d2d42013-04-05 01:27:11 -05002980 osd_req_op_extent_init(obj_request->osd_req, 0, CEPH_OSD_OP_READ,
2981 offset, length, 0, 0);
Alex Elder406e2c92013-04-15 14:50:36 -05002982 osd_req_op_extent_osd_data_pages(obj_request->osd_req, 0,
Alex Eldera4ce40a2013-04-05 01:27:12 -05002983 obj_request->pages,
Alex Elder44cd1882013-04-05 01:27:12 -05002984 obj_request->length,
2985 obj_request->offset & ~PAGE_MASK,
2986 false, false);
Alex Elder9d4df012013-04-19 15:34:50 -05002987 rbd_osd_req_format_read(obj_request);
Alex Elder430c28c2013-04-03 21:32:51 -05002988
Alex Elder788e2df2013-01-17 12:25:27 -06002989 ret = rbd_obj_request_submit(osdc, obj_request);
2990 if (ret)
2991 goto out;
2992 ret = rbd_obj_request_wait(obj_request);
2993 if (ret)
2994 goto out;
2995
2996 ret = obj_request->result;
2997 if (ret < 0)
2998 goto out;
Alex Elder1ceae7e2013-02-06 13:11:38 -06002999
3000 rbd_assert(obj_request->xferred <= (u64) SIZE_MAX);
3001 size = (size_t) obj_request->xferred;
Alex Elder903bb322013-02-06 13:11:38 -06003002 ceph_copy_from_page_vector(pages, buf, 0, size);
Alex Elder7097f8d2013-04-30 00:44:33 -05003003 rbd_assert(size <= (size_t)INT_MAX);
3004 ret = (int)size;
Alex Elder788e2df2013-01-17 12:25:27 -06003005out:
3006 if (obj_request)
3007 rbd_obj_request_put(obj_request);
3008 else
3009 ceph_release_page_vector(pages, page_count);
3010
3011 return ret;
3012}
3013
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003014/*
Alex Elder4156d992012-08-02 11:29:46 -05003015 * Read the complete header for the given rbd device.
3016 *
3017 * Returns a pointer to a dynamically-allocated buffer containing
3018 * the complete and validated header.
3021 *
3022 * Returns a pointer-coded errno if a failure occurs.
3023 */
3024static struct rbd_image_header_ondisk *
Alex Elder7097f8d2013-04-30 00:44:33 -05003025rbd_dev_v1_header_read(struct rbd_device *rbd_dev)
Alex Elder4156d992012-08-02 11:29:46 -05003026{
3027 struct rbd_image_header_ondisk *ondisk = NULL;
3028 u32 snap_count = 0;
3029 u64 names_size = 0;
3030 u32 want_count;
3031 int ret;
3032
3033 /*
3034 * The complete header will include an array of its 64-bit
3035 * snapshot ids, followed by the names of those snapshots as
3036 * a contiguous block of NUL-terminated strings. Note that
3037 * the number of snapshots could change by the time we read
3038 * it in, in which case we re-read it.
3039 */
3040 do {
3041 size_t size;
3042
3043 kfree(ondisk);
3044
3045 size = sizeof (*ondisk);
3046 size += snap_count * sizeof (struct rbd_image_snap_ondisk);
3047 size += names_size;
3048 ondisk = kmalloc(size, GFP_KERNEL);
3049 if (!ondisk)
3050 return ERR_PTR(-ENOMEM);
3051
Alex Elder788e2df2013-01-17 12:25:27 -06003052 ret = rbd_obj_read_sync(rbd_dev, rbd_dev->header_name,
Alex Elder7097f8d2013-04-30 00:44:33 -05003053 0, size, ondisk);
Alex Elder4156d992012-08-02 11:29:46 -05003054 if (ret < 0)
3055 goto out_err;
Alex Elderc0cd10db2013-04-26 09:43:47 -05003056 if ((size_t)ret < size) {
Alex Elder4156d992012-08-02 11:29:46 -05003057 ret = -ENXIO;
Alex Elder06ecc6c2012-11-01 10:17:15 -05003058			rbd_warn(rbd_dev, "short header read (want %zu got %d)",
3059 size, ret);
Alex Elder4156d992012-08-02 11:29:46 -05003060 goto out_err;
3061 }
3062 if (!rbd_dev_ondisk_valid(ondisk)) {
3063 ret = -ENXIO;
Alex Elder06ecc6c2012-11-01 10:17:15 -05003064 rbd_warn(rbd_dev, "invalid header");
Alex Elder4156d992012-08-02 11:29:46 -05003065 goto out_err;
3066 }
3067
3068 names_size = le64_to_cpu(ondisk->snap_names_len);
3069 want_count = snap_count;
3070 snap_count = le32_to_cpu(ondisk->snap_count);
3071 } while (snap_count != want_count);
3072
3073 return ondisk;
3074
3075out_err:
3076 kfree(ondisk);
3077
3078 return ERR_PTR(ret);
3079}
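
/*
 * For reference, the on-disk data read above is laid out as the size
 * computation in the loop implies (per the definitions in rbd_types.h):
 *
 *	struct rbd_image_header_ondisk		fixed-size header
 *	struct rbd_image_snap_ondisk[snap_count]	snapshot ids/sizes
 *	snapshot names				NUL-terminated strings,
 *						snap_names_len bytes total
 */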
3080
3081/*
3082 * reload the on-disk header
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003083 */
3084static int rbd_read_header(struct rbd_device *rbd_dev,
3085 struct rbd_image_header *header)
3086{
Alex Elder4156d992012-08-02 11:29:46 -05003087 struct rbd_image_header_ondisk *ondisk;
Alex Elder4156d992012-08-02 11:29:46 -05003088 int ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003089
Alex Elder7097f8d2013-04-30 00:44:33 -05003090 ondisk = rbd_dev_v1_header_read(rbd_dev);
Alex Elder4156d992012-08-02 11:29:46 -05003091 if (IS_ERR(ondisk))
3092 return PTR_ERR(ondisk);
3093 ret = rbd_header_from_disk(header, ondisk);
Alex Elder4156d992012-08-02 11:29:46 -05003094 kfree(ondisk);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003095
Alex Elder4156d992012-08-02 11:29:46 -05003096 return ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003097}
3098
Alex Elder94785542012-10-09 13:50:17 -07003099static void rbd_update_mapping_size(struct rbd_device *rbd_dev)
3100{
Alex Elder0d7dbfc2012-10-25 23:34:41 -05003101 if (rbd_dev->spec->snap_id != CEPH_NOSNAP)
Alex Elder94785542012-10-09 13:50:17 -07003102 return;
3103
Alex Eldere28626a2013-04-26 15:44:35 -05003104 if (rbd_dev->mapping.size != rbd_dev->header.image_size) {
3105 sector_t size;
3106
3107 rbd_dev->mapping.size = rbd_dev->header.image_size;
3108 size = (sector_t)rbd_dev->mapping.size / SECTOR_SIZE;
3109		dout("setting size to %llu sectors\n", (unsigned long long)size);
3110 set_capacity(rbd_dev->disk, size);
3111 }
Alex Elder94785542012-10-09 13:50:17 -07003112}
3113
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003114/*
3115 * re-read the complete (format 1) on-disk header and update the in-core copy
3116 */
Alex Eldercc4a38bd2013-04-30 00:44:33 -05003117static int rbd_dev_v1_refresh(struct rbd_device *rbd_dev)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003118{
3119 int ret;
3120 struct rbd_image_header h;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003121
3122 ret = rbd_read_header(rbd_dev, &h);
3123 if (ret < 0)
3124 return ret;
3125
Josh Durgina51aa0c2011-12-05 10:35:04 -08003126 down_write(&rbd_dev->header_rwsem);
3127
Alex Elder94785542012-10-09 13:50:17 -07003128 /* Update image size, and check for resize of mapped image */
3129 rbd_dev->header.image_size = h.image_size;
3130 rbd_update_mapping_size(rbd_dev);
Sage Weil9db4b3e2011-04-19 22:49:06 -07003131
Alex Elder849b4262012-07-09 21:04:24 -05003132 /* rbd_dev->header.object_prefix shouldn't change */
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003133 kfree(rbd_dev->header.snap_sizes);
Alex Elder849b4262012-07-09 21:04:24 -05003134 kfree(rbd_dev->header.snap_names);
Josh Durgind1d25642011-12-05 14:03:05 -08003135 /* osd requests may still refer to snapc */
Alex Elder812164f82013-04-30 00:44:32 -05003136 ceph_put_snap_context(rbd_dev->header.snapc);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003137
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003139 rbd_dev->header.snapc = h.snapc;
3140 rbd_dev->header.snap_names = h.snap_names;
3141 rbd_dev->header.snap_sizes = h.snap_sizes;
Alex Elder849b4262012-07-09 21:04:24 -05003142 /* Free the extra copy of the object prefix */
Alex Elderc0cd10db2013-04-26 09:43:47 -05003143 if (strcmp(rbd_dev->header.object_prefix, h.object_prefix))
3144 rbd_warn(rbd_dev, "object prefix changed (ignoring)");
Alex Elder849b4262012-07-09 21:04:24 -05003145 kfree(h.object_prefix);
3146
Josh Durginc6666012011-11-21 17:11:12 -08003147 up_write(&rbd_dev->header_rwsem);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003148
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003149 return ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003150}
3151
Alex Elder15228ed2013-05-01 12:43:03 -05003152/*
3153 * Clear the rbd device's EXISTS flag if the snapshot it's mapped to
3154 * has disappeared from the (just updated) snapshot context.
3155 */
3156static void rbd_exists_validate(struct rbd_device *rbd_dev)
3157{
3158 u64 snap_id;
3159
3160 if (!test_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags))
3161 return;
3162
3163 snap_id = rbd_dev->spec->snap_id;
3164 if (snap_id == CEPH_NOSNAP)
3165 return;
3166
3167 if (rbd_dev_snap_index(rbd_dev, snap_id) == BAD_SNAP_INDEX)
3168 clear_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags);
3169}
3170
Alex Eldercc4a38bd2013-04-30 00:44:33 -05003171static int rbd_dev_refresh(struct rbd_device *rbd_dev)
Alex Elder1fe5e992012-07-25 09:32:41 -05003172{
Alex Eldera3fbe5d2013-04-30 00:44:32 -05003173 u64 image_size;
Alex Elder1fe5e992012-07-25 09:32:41 -05003174 int ret;
3175
Alex Elder117973f2012-08-31 17:29:55 -05003176 rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
Alex Eldera3fbe5d2013-04-30 00:44:32 -05003177 image_size = rbd_dev->header.image_size;
Alex Elder1fe5e992012-07-25 09:32:41 -05003178 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
Alex Elder117973f2012-08-31 17:29:55 -05003179 if (rbd_dev->image_format == 1)
Alex Eldercc4a38bd2013-04-30 00:44:33 -05003180 ret = rbd_dev_v1_refresh(rbd_dev);
Alex Elder117973f2012-08-31 17:29:55 -05003181 else
Alex Eldercc4a38bd2013-04-30 00:44:33 -05003182 ret = rbd_dev_v2_refresh(rbd_dev);
Alex Elder15228ed2013-05-01 12:43:03 -05003183
3184 /* If it's a mapped snapshot, validate its EXISTS flag */
3185
3186 rbd_exists_validate(rbd_dev);
Alex Elder1fe5e992012-07-25 09:32:41 -05003187 mutex_unlock(&ctl_mutex);
Alex Elder522a0cc2013-04-25 15:09:41 -05003188 if (ret)
3189		rbd_warn(rbd_dev, "got notification but failed to "
3190			"update snaps: %d\n", ret);
Alex Eldera3fbe5d2013-04-30 00:44:32 -05003191 if (image_size != rbd_dev->header.image_size)
3192 revalidate_disk(rbd_dev->disk);
Alex Elder1fe5e992012-07-25 09:32:41 -05003193
3194 return ret;
3195}
3196
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003197static int rbd_init_disk(struct rbd_device *rbd_dev)
3198{
3199 struct gendisk *disk;
3200 struct request_queue *q;
Alex Elder593a9e72012-02-07 12:03:37 -06003201 u64 segment_size;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003202
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003203 /* create gendisk info */
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003204 disk = alloc_disk(RBD_MINORS_PER_MAJOR);
3205 if (!disk)
Alex Elder1fcdb8a2012-08-29 17:11:06 -05003206 return -ENOMEM;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003207
Alex Elderf0f8cef2012-01-29 13:57:44 -06003208 snprintf(disk->disk_name, sizeof(disk->disk_name), RBD_DRV_NAME "%d",
Alex Elderde71a292012-07-03 16:01:19 -05003209 rbd_dev->dev_id);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003210 disk->major = rbd_dev->major;
3211 disk->first_minor = 0;
3212 disk->fops = &rbd_bd_ops;
3213 disk->private_data = rbd_dev;
3214
Alex Elderbf0d5f502012-11-22 00:00:08 -06003215 q = blk_init_queue(rbd_request_fn, &rbd_dev->lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003216 if (!q)
3217 goto out_disk;
Josh Durgin029bcbd2011-07-22 11:35:23 -07003218
Alex Elder593a9e72012-02-07 12:03:37 -06003219 /* We use the default size, but let's be explicit about it. */
3220 blk_queue_physical_block_size(q, SECTOR_SIZE);
3221
Josh Durgin029bcbd2011-07-22 11:35:23 -07003222 /* set io sizes to object size */
Alex Elder593a9e72012-02-07 12:03:37 -06003223 segment_size = rbd_obj_bytes(&rbd_dev->header);
3224 blk_queue_max_hw_sectors(q, segment_size / SECTOR_SIZE);
3225 blk_queue_max_segment_size(q, segment_size);
3226 blk_queue_io_min(q, segment_size);
3227 blk_queue_io_opt(q, segment_size);
Josh Durgin029bcbd2011-07-22 11:35:23 -07003228
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003229 blk_queue_merge_bvec(q, rbd_merge_bvec);
3230 disk->queue = q;
3231
3232 q->queuedata = rbd_dev;
3233
3234 rbd_dev->disk = disk;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003235
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003236 return 0;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003237out_disk:
3238 put_disk(disk);
Alex Elder1fcdb8a2012-08-29 17:11:06 -05003239
3240 return -ENOMEM;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003241}
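
/*
 * Sizing example: with the default 4 MiB object size rbd_obj_bytes()
 * yields segment_size = 4194304, so the queue advertises
 * max_hw_sectors = 4194304 / 512 = 8192 and 4 MiB minimum/optimal
 * I/O sizes, keeping well-formed requests object-aligned.
 */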
3242
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003243/*
3244 sysfs
3245*/
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003246
Alex Elder593a9e72012-02-07 12:03:37 -06003247static struct rbd_device *dev_to_rbd_dev(struct device *dev)
3248{
3249 return container_of(dev, struct rbd_device, dev);
3250}
3251
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003252static ssize_t rbd_size_show(struct device *dev,
3253 struct device_attribute *attr, char *buf)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003254{
Alex Elder593a9e72012-02-07 12:03:37 -06003255 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003256
Alex Elderfc71d832013-04-26 15:44:36 -05003257 return sprintf(buf, "%llu\n",
3258 (unsigned long long)rbd_dev->mapping.size);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003259}
3260
Alex Elder34b13182012-07-13 20:35:12 -05003261/*
3262 * Note this shows the features for whatever's mapped, which is not
3263 * necessarily the base image.
3264 */
3265static ssize_t rbd_features_show(struct device *dev,
3266 struct device_attribute *attr, char *buf)
3267{
3268 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
3269
3270 return sprintf(buf, "0x%016llx\n",
Alex Elderfc71d832013-04-26 15:44:36 -05003271 (unsigned long long)rbd_dev->mapping.features);
Alex Elder34b13182012-07-13 20:35:12 -05003272}
3273
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003274static ssize_t rbd_major_show(struct device *dev,
3275 struct device_attribute *attr, char *buf)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003276{
Alex Elder593a9e72012-02-07 12:03:37 -06003277 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003278
Alex Elderfc71d832013-04-26 15:44:36 -05003279 if (rbd_dev->major)
3280 return sprintf(buf, "%d\n", rbd_dev->major);
3281
3282 return sprintf(buf, "(none)\n");
3283
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003284}
3285
3286static ssize_t rbd_client_id_show(struct device *dev,
3287 struct device_attribute *attr, char *buf)
3288{
Alex Elder593a9e72012-02-07 12:03:37 -06003289 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003290
Alex Elder1dbb4392012-01-24 10:08:37 -06003291 return sprintf(buf, "client%lld\n",
3292 ceph_client_id(rbd_dev->rbd_client->client));
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003293}
3294
3295static ssize_t rbd_pool_show(struct device *dev,
3296 struct device_attribute *attr, char *buf)
3297{
Alex Elder593a9e72012-02-07 12:03:37 -06003298 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003299
Alex Elder0d7dbfc2012-10-25 23:34:41 -05003300 return sprintf(buf, "%s\n", rbd_dev->spec->pool_name);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003301}
3302
Alex Elder9bb2f332012-07-12 10:46:35 -05003303static ssize_t rbd_pool_id_show(struct device *dev,
3304 struct device_attribute *attr, char *buf)
3305{
3306 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
3307
Alex Elder0d7dbfc2012-10-25 23:34:41 -05003308 return sprintf(buf, "%llu\n",
Alex Elderfc71d832013-04-26 15:44:36 -05003309 (unsigned long long) rbd_dev->spec->pool_id);
Alex Elder9bb2f332012-07-12 10:46:35 -05003310}
3311
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003312static ssize_t rbd_name_show(struct device *dev,
3313 struct device_attribute *attr, char *buf)
3314{
Alex Elder593a9e72012-02-07 12:03:37 -06003315 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003316
Alex Eldera92ffdf2012-10-30 19:40:33 -05003317 if (rbd_dev->spec->image_name)
3318 return sprintf(buf, "%s\n", rbd_dev->spec->image_name);
3319
3320 return sprintf(buf, "(unknown)\n");
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003321}
3322
Alex Elder589d30e2012-07-10 20:30:11 -05003323static ssize_t rbd_image_id_show(struct device *dev,
3324 struct device_attribute *attr, char *buf)
3325{
3326 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
3327
Alex Elder0d7dbfc2012-10-25 23:34:41 -05003328 return sprintf(buf, "%s\n", rbd_dev->spec->image_id);
Alex Elder589d30e2012-07-10 20:30:11 -05003329}
3330
Alex Elder34b13182012-07-13 20:35:12 -05003331/*
3332 * Shows the name of the currently-mapped snapshot (or
3333 * RBD_SNAP_HEAD_NAME for the base image).
3334 */
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003335static ssize_t rbd_snap_show(struct device *dev,
3336 struct device_attribute *attr,
3337 char *buf)
3338{
Alex Elder593a9e72012-02-07 12:03:37 -06003339 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003340
Alex Elder0d7dbfc2012-10-25 23:34:41 -05003341 return sprintf(buf, "%s\n", rbd_dev->spec->snap_name);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003342}
3343
Alex Elder86b00e02012-10-25 23:34:42 -05003344/*
3345 * For an rbd v2 image, shows the pool id, image id, and snapshot id
3346 * for the parent image. If there is no parent, simply shows
3347 * "(no parent image)".
3348 */
3349static ssize_t rbd_parent_show(struct device *dev,
3350 struct device_attribute *attr,
3351 char *buf)
3352{
3353 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
3354 struct rbd_spec *spec = rbd_dev->parent_spec;
3355 int count;
3356 char *bufp = buf;
3357
3358 if (!spec)
3359 return sprintf(buf, "(no parent image)\n");
3360
3361 count = sprintf(bufp, "pool_id %llu\npool_name %s\n",
3362 (unsigned long long) spec->pool_id, spec->pool_name);
3363 if (count < 0)
3364 return count;
3365 bufp += count;
3366
3367 count = sprintf(bufp, "image_id %s\nimage_name %s\n", spec->image_id,
3368 spec->image_name ? spec->image_name : "(unknown)");
3369 if (count < 0)
3370 return count;
3371 bufp += count;
3372
3373 count = sprintf(bufp, "snap_id %llu\nsnap_name %s\n",
3374 (unsigned long long) spec->snap_id, spec->snap_name);
3375 if (count < 0)
3376 return count;
3377 bufp += count;
3378
3379 count = sprintf(bufp, "overlap %llu\n", rbd_dev->parent_overlap);
3380 if (count < 0)
3381 return count;
3382 bufp += count;
3383
3384 return (ssize_t) (bufp - buf);
3385}
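
/*
 * Sample "parent" attribute output for a mapped clone (all values
 * are illustrative only):
 *
 *	pool_id 2
 *	pool_name rbd
 *	image_id 10052ae8944a
 *	image_name parent-image
 *	snap_id 4
 *	snap_name parent-snap
 *	overlap 4194304
 */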
3386
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003387static ssize_t rbd_image_refresh(struct device *dev,
3388 struct device_attribute *attr,
3389 const char *buf,
3390 size_t size)
3391{
Alex Elder593a9e72012-02-07 12:03:37 -06003392 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Alex Elderb8136232012-07-25 09:32:41 -05003393 int ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003394
Alex Eldercc4a38bd2013-04-30 00:44:33 -05003395 ret = rbd_dev_refresh(rbd_dev);
Alex Elderb8136232012-07-25 09:32:41 -05003396
3397 return ret < 0 ? ret : size;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003398}
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003399
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003400static DEVICE_ATTR(size, S_IRUGO, rbd_size_show, NULL);
Alex Elder34b13182012-07-13 20:35:12 -05003401static DEVICE_ATTR(features, S_IRUGO, rbd_features_show, NULL);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003402static DEVICE_ATTR(major, S_IRUGO, rbd_major_show, NULL);
3403static DEVICE_ATTR(client_id, S_IRUGO, rbd_client_id_show, NULL);
3404static DEVICE_ATTR(pool, S_IRUGO, rbd_pool_show, NULL);
Alex Elder9bb2f332012-07-12 10:46:35 -05003405static DEVICE_ATTR(pool_id, S_IRUGO, rbd_pool_id_show, NULL);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003406static DEVICE_ATTR(name, S_IRUGO, rbd_name_show, NULL);
Alex Elder589d30e2012-07-10 20:30:11 -05003407static DEVICE_ATTR(image_id, S_IRUGO, rbd_image_id_show, NULL);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003408static DEVICE_ATTR(refresh, S_IWUSR, NULL, rbd_image_refresh);
3409static DEVICE_ATTR(current_snap, S_IRUGO, rbd_snap_show, NULL);
Alex Elder86b00e02012-10-25 23:34:42 -05003410static DEVICE_ATTR(parent, S_IRUGO, rbd_parent_show, NULL);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003411
3412static struct attribute *rbd_attrs[] = {
3413 &dev_attr_size.attr,
Alex Elder34b13182012-07-13 20:35:12 -05003414 &dev_attr_features.attr,
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003415 &dev_attr_major.attr,
3416 &dev_attr_client_id.attr,
3417 &dev_attr_pool.attr,
Alex Elder9bb2f332012-07-12 10:46:35 -05003418 &dev_attr_pool_id.attr,
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003419 &dev_attr_name.attr,
Alex Elder589d30e2012-07-10 20:30:11 -05003420 &dev_attr_image_id.attr,
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003421 &dev_attr_current_snap.attr,
Alex Elder86b00e02012-10-25 23:34:42 -05003422 &dev_attr_parent.attr,
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003423 &dev_attr_refresh.attr,
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003424 NULL
3425};
3426
3427static struct attribute_group rbd_attr_group = {
3428 .attrs = rbd_attrs,
3429};
3430
3431static const struct attribute_group *rbd_attr_groups[] = {
3432 &rbd_attr_group,
3433 NULL
3434};
3435
3436static void rbd_sysfs_dev_release(struct device *dev)
3437{
3438}
3439
3440static struct device_type rbd_device_type = {
3441 .name = "rbd",
3442 .groups = rbd_attr_groups,
3443 .release = rbd_sysfs_dev_release,
3444};
3445
Alex Elder8b8fb992012-10-26 17:25:24 -05003446static struct rbd_spec *rbd_spec_get(struct rbd_spec *spec)
3447{
3448 kref_get(&spec->kref);
3449
3450 return spec;
3451}
3452
3453static void rbd_spec_free(struct kref *kref);
3454static void rbd_spec_put(struct rbd_spec *spec)
3455{
3456 if (spec)
3457 kref_put(&spec->kref, rbd_spec_free);
3458}
3459
3460static struct rbd_spec *rbd_spec_alloc(void)
3461{
3462 struct rbd_spec *spec;
3463
3464 spec = kzalloc(sizeof (*spec), GFP_KERNEL);
3465 if (!spec)
3466 return NULL;
3467 kref_init(&spec->kref);
3468
Alex Elder8b8fb992012-10-26 17:25:24 -05003469 return spec;
3470}
3471
3472static void rbd_spec_free(struct kref *kref)
3473{
3474 struct rbd_spec *spec = container_of(kref, struct rbd_spec, kref);
3475
3476 kfree(spec->pool_name);
3477 kfree(spec->image_id);
3478 kfree(spec->image_name);
3479 kfree(spec->snap_name);
3480 kfree(spec);
3481}
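
/*
 * Lifetime sketch: rbd_spec_alloc() returns a spec with a single
 * reference; rbd_spec_get()/rbd_spec_put() take and drop references,
 * and the final put invokes rbd_spec_free():
 *
 *	spec = rbd_spec_alloc();	kref == 1
 *	rbd_spec_get(spec);		kref == 2
 *	rbd_spec_put(spec);		kref == 1
 *	rbd_spec_put(spec);		freed
 */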
3482
Alex Eldercc344fa2013-02-19 12:25:56 -06003483static struct rbd_device *rbd_dev_create(struct rbd_client *rbdc,
Alex Elderc53d5892012-10-25 23:34:42 -05003484 struct rbd_spec *spec)
3485{
3486 struct rbd_device *rbd_dev;
3487
3488 rbd_dev = kzalloc(sizeof (*rbd_dev), GFP_KERNEL);
3489 if (!rbd_dev)
3490 return NULL;
3491
3492 spin_lock_init(&rbd_dev->lock);
Alex Elder6d292902013-01-14 12:43:31 -06003493 rbd_dev->flags = 0;
Alex Elderc53d5892012-10-25 23:34:42 -05003494 INIT_LIST_HEAD(&rbd_dev->node);
Alex Elderc53d5892012-10-25 23:34:42 -05003495 init_rwsem(&rbd_dev->header_rwsem);
3496
3497 rbd_dev->spec = spec;
3498 rbd_dev->rbd_client = rbdc;
3499
Alex Elder0903e872012-11-14 12:25:19 -06003500 /* Initialize the layout used for all rbd requests */
3501
3502 rbd_dev->layout.fl_stripe_unit = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
3503 rbd_dev->layout.fl_stripe_count = cpu_to_le32(1);
3504 rbd_dev->layout.fl_object_size = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
3505 rbd_dev->layout.fl_pg_pool = cpu_to_le32((u32) spec->pool_id);
3506
Alex Elderc53d5892012-10-25 23:34:42 -05003507 return rbd_dev;
3508}
3509
3510static void rbd_dev_destroy(struct rbd_device *rbd_dev)
3511{
Alex Elderc53d5892012-10-25 23:34:42 -05003512 rbd_put_client(rbd_dev->rbd_client);
3513 rbd_spec_put(rbd_dev->spec);
3514 kfree(rbd_dev);
3515}
3516
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003517/*
Alex Elder9d475de2012-07-03 16:01:19 -05003518 * Get the size and object order for an image snapshot, or if
3519 * snap_id is CEPH_NOSNAP, gets this information for the base
3520 * image.
3521 */
3522static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id,
3523 u8 *order, u64 *snap_size)
3524{
3525 __le64 snapid = cpu_to_le64(snap_id);
3526 int ret;
3527 struct {
3528 u8 order;
3529 __le64 size;
3530 } __attribute__ ((packed)) size_buf = { 0 };
3531
Alex Elder36be9a72013-01-19 00:30:28 -06003532 ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
Alex Elder9d475de2012-07-03 16:01:19 -05003533 "rbd", "get_size",
Alex Elder41579762013-04-21 12:14:45 -05003534 &snapid, sizeof (snapid),
Alex Eldere2a58ee2013-04-30 00:44:33 -05003535 &size_buf, sizeof (size_buf));
Alex Elder36be9a72013-01-19 00:30:28 -06003536 dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
Alex Elder9d475de2012-07-03 16:01:19 -05003537 if (ret < 0)
3538 return ret;
Alex Elder57385b52013-04-21 12:14:45 -05003539 if (ret < sizeof (size_buf))
3540 return -ERANGE;
Alex Elder9d475de2012-07-03 16:01:19 -05003541
Alex Elderc86f86e2013-04-25 15:09:41 -05003542 if (order)
3543 *order = size_buf.order;
Alex Elder9d475de2012-07-03 16:01:19 -05003544 *snap_size = le64_to_cpu(size_buf.size);
3545
3546 dout(" snap_id 0x%016llx order = %u, snap_size = %llu\n",
Alex Elder57385b52013-04-21 12:14:45 -05003547 (unsigned long long)snap_id, (unsigned int)*order,
3548 (unsigned long long)*snap_size);
Alex Elder9d475de2012-07-03 16:01:19 -05003549
3550 return 0;
3551}
3552
3553static int rbd_dev_v2_image_size(struct rbd_device *rbd_dev)
3554{
3555 return _rbd_dev_v2_snap_size(rbd_dev, CEPH_NOSNAP,
3556 &rbd_dev->header.obj_order,
3557 &rbd_dev->header.image_size);
3558}
3559
Alex Elder1e130192012-07-03 16:01:19 -05003560static int rbd_dev_v2_object_prefix(struct rbd_device *rbd_dev)
3561{
3562 void *reply_buf;
3563 int ret;
3564 void *p;
3565
3566 reply_buf = kzalloc(RBD_OBJ_PREFIX_LEN_MAX, GFP_KERNEL);
3567 if (!reply_buf)
3568 return -ENOMEM;
3569
Alex Elder36be9a72013-01-19 00:30:28 -06003570 ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
Alex Elder41579762013-04-21 12:14:45 -05003571 "rbd", "get_object_prefix", NULL, 0,
Alex Eldere2a58ee2013-04-30 00:44:33 -05003572 reply_buf, RBD_OBJ_PREFIX_LEN_MAX);
Alex Elder36be9a72013-01-19 00:30:28 -06003573 dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
Alex Elder1e130192012-07-03 16:01:19 -05003574 if (ret < 0)
3575 goto out;
3576
3577 p = reply_buf;
3578 rbd_dev->header.object_prefix = ceph_extract_encoded_string(&p,
Alex Elder57385b52013-04-21 12:14:45 -05003579 p + ret, NULL, GFP_NOIO);
3580 ret = 0;
Alex Elder1e130192012-07-03 16:01:19 -05003581
3582 if (IS_ERR(rbd_dev->header.object_prefix)) {
3583 ret = PTR_ERR(rbd_dev->header.object_prefix);
3584 rbd_dev->header.object_prefix = NULL;
3585 } else {
3586 dout(" object_prefix = %s\n", rbd_dev->header.object_prefix);
3587 }
Alex Elder1e130192012-07-03 16:01:19 -05003588out:
3589 kfree(reply_buf);
3590
3591 return ret;
3592}
3593
Alex Elderb1b54022012-07-03 16:01:19 -05003594static int _rbd_dev_v2_snap_features(struct rbd_device *rbd_dev, u64 snap_id,
3595 u64 *snap_features)
3596{
3597 __le64 snapid = cpu_to_le64(snap_id);
3598 struct {
3599 __le64 features;
3600 __le64 incompat;
Alex Elder41579762013-04-21 12:14:45 -05003601 } __attribute__ ((packed)) features_buf = { 0 };
Alex Elderd8891402012-10-09 13:50:17 -07003602 u64 incompat;
Alex Elderb1b54022012-07-03 16:01:19 -05003603 int ret;
3604
Alex Elder36be9a72013-01-19 00:30:28 -06003605 ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
Alex Elderb1b54022012-07-03 16:01:19 -05003606 "rbd", "get_features",
Alex Elder41579762013-04-21 12:14:45 -05003607 &snapid, sizeof (snapid),
Alex Eldere2a58ee2013-04-30 00:44:33 -05003608 &features_buf, sizeof (features_buf));
Alex Elder36be9a72013-01-19 00:30:28 -06003609 dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
Alex Elderb1b54022012-07-03 16:01:19 -05003610 if (ret < 0)
3611 return ret;
Alex Elder57385b52013-04-21 12:14:45 -05003612 if (ret < sizeof (features_buf))
3613 return -ERANGE;
Alex Elderd8891402012-10-09 13:50:17 -07003614
3615 incompat = le64_to_cpu(features_buf.incompat);
Alex Elder5cbf6f122013-04-11 09:29:48 -05003616 if (incompat & ~RBD_FEATURES_SUPPORTED)
Alex Elderb8f5c6e2012-11-01 08:39:26 -05003617 return -ENXIO;
Alex Elderd8891402012-10-09 13:50:17 -07003618
Alex Elderb1b54022012-07-03 16:01:19 -05003619 *snap_features = le64_to_cpu(features_buf.features);
3620
3621 dout(" snap_id 0x%016llx features = 0x%016llx incompat = 0x%016llx\n",
Alex Elder57385b52013-04-21 12:14:45 -05003622 (unsigned long long)snap_id,
3623 (unsigned long long)*snap_features,
3624 (unsigned long long)le64_to_cpu(features_buf.incompat));
Alex Elderb1b54022012-07-03 16:01:19 -05003625
3626 return 0;
3627}
3628
3629static int rbd_dev_v2_features(struct rbd_device *rbd_dev)
3630{
3631 return _rbd_dev_v2_snap_features(rbd_dev, CEPH_NOSNAP,
3632 &rbd_dev->header.features);
3633}
3634
Alex Elder86b00e02012-10-25 23:34:42 -05003635static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev)
3636{
3637 struct rbd_spec *parent_spec;
3638 size_t size;
3639 void *reply_buf = NULL;
3640 __le64 snapid;
3641 void *p;
3642 void *end;
3643 char *image_id;
3644 u64 overlap;
Alex Elder86b00e02012-10-25 23:34:42 -05003645 int ret;
3646
3647 parent_spec = rbd_spec_alloc();
3648 if (!parent_spec)
3649 return -ENOMEM;
3650
3651 size = sizeof (__le64) + /* pool_id */
3652 sizeof (__le32) + RBD_IMAGE_ID_LEN_MAX + /* image_id */
3653 sizeof (__le64) + /* snap_id */
3654 sizeof (__le64); /* overlap */
3655 reply_buf = kmalloc(size, GFP_KERNEL);
3656 if (!reply_buf) {
3657 ret = -ENOMEM;
3658 goto out_err;
3659 }
3660
3661 snapid = cpu_to_le64(CEPH_NOSNAP);
Alex Elder36be9a72013-01-19 00:30:28 -06003662 ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
Alex Elder86b00e02012-10-25 23:34:42 -05003663 "rbd", "get_parent",
Alex Elder41579762013-04-21 12:14:45 -05003664 &snapid, sizeof (snapid),
Alex Eldere2a58ee2013-04-30 00:44:33 -05003665 reply_buf, size);
Alex Elder36be9a72013-01-19 00:30:28 -06003666 dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
Alex Elder86b00e02012-10-25 23:34:42 -05003667 if (ret < 0)
3668 goto out_err;
3669
Alex Elder86b00e02012-10-25 23:34:42 -05003670 p = reply_buf;
Alex Elder57385b52013-04-21 12:14:45 -05003671 end = reply_buf + ret;
3672 ret = -ERANGE;
Alex Elder86b00e02012-10-25 23:34:42 -05003673 ceph_decode_64_safe(&p, end, parent_spec->pool_id, out_err);
3674 if (parent_spec->pool_id == CEPH_NOPOOL)
3675 goto out; /* No parent? No problem. */
3676
Alex Elder0903e872012-11-14 12:25:19 -06003677 /* The ceph file layout needs to fit pool id in 32 bits */
3678
3679 ret = -EIO;
Alex Elderc0cd10db2013-04-26 09:43:47 -05003680 if (parent_spec->pool_id > (u64)U32_MAX) {
3681 rbd_warn(NULL, "parent pool id too large (%llu > %u)\n",
3682 (unsigned long long)parent_spec->pool_id, U32_MAX);
Alex Elder57385b52013-04-21 12:14:45 -05003683 goto out_err;
Alex Elderc0cd10db2013-04-26 09:43:47 -05003684 }
Alex Elder0903e872012-11-14 12:25:19 -06003685
Alex Elder979ed482012-11-01 08:39:26 -05003686 image_id = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL);
Alex Elder86b00e02012-10-25 23:34:42 -05003687 if (IS_ERR(image_id)) {
3688 ret = PTR_ERR(image_id);
3689 goto out_err;
3690 }
3691 parent_spec->image_id = image_id;
3692 ceph_decode_64_safe(&p, end, parent_spec->snap_id, out_err);
3693 ceph_decode_64_safe(&p, end, overlap, out_err);
3694
3695 rbd_dev->parent_overlap = overlap;
3696 rbd_dev->parent_spec = parent_spec;
3697 parent_spec = NULL; /* rbd_dev now owns this */
3698out:
3699 ret = 0;
3700out_err:
3701 kfree(reply_buf);
3702 rbd_spec_put(parent_spec);
3703
3704 return ret;
3705}
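
/*
 * For reference, the "get_parent" reply decoded above consists of,
 * in encoding order:
 *
 *	__le64	pool_id		CEPH_NOPOOL if there is no parent
 *	string	image_id	__le32 length followed by the bytes
 *	__le64	snap_id
 *	__le64	overlap		bytes shared with the parent
 */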
3706
Alex Eldercc070d52013-04-21 12:14:45 -05003707static int rbd_dev_v2_striping_info(struct rbd_device *rbd_dev)
3708{
3709 struct {
3710 __le64 stripe_unit;
3711 __le64 stripe_count;
3712 } __attribute__ ((packed)) striping_info_buf = { 0 };
3713 size_t size = sizeof (striping_info_buf);
3714 void *p;
3715 u64 obj_size;
3716 u64 stripe_unit;
3717 u64 stripe_count;
3718 int ret;
3719
3720 ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
3721 "rbd", "get_stripe_unit_count", NULL, 0,
Alex Eldere2a58ee2013-04-30 00:44:33 -05003722 (char *)&striping_info_buf, size);
Alex Eldercc070d52013-04-21 12:14:45 -05003723 dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
3724 if (ret < 0)
3725 return ret;
3726 if (ret < size)
3727 return -ERANGE;
3728
3729 /*
3730 * We don't actually support the "fancy striping" feature
3731 * (STRIPINGV2) yet, but if the striping sizes are the
3732 * defaults the behavior is the same as before. So find
3733 * out, and only fail if the image has non-default values.
3734 */
3735 ret = -EINVAL;
3736 obj_size = (u64)1 << rbd_dev->header.obj_order;
3737 p = &striping_info_buf;
3738 stripe_unit = ceph_decode_64(&p);
3739 if (stripe_unit != obj_size) {
3740 rbd_warn(rbd_dev, "unsupported stripe unit "
3741 "(got %llu want %llu)",
3742 stripe_unit, obj_size);
3743 return -EINVAL;
3744 }
3745 stripe_count = ceph_decode_64(&p);
3746 if (stripe_count != 1) {
3747 rbd_warn(rbd_dev, "unsupported stripe count "
3748 "(got %llu want 1)", stripe_count);
3749 return -EINVAL;
3750 }
Alex Elder500d0c02013-04-26 09:43:47 -05003751 rbd_dev->header.stripe_unit = stripe_unit;
3752 rbd_dev->header.stripe_count = stripe_count;
Alex Eldercc070d52013-04-21 12:14:45 -05003753
3754 return 0;
3755}
3756
Alex Elder9e15b772012-10-30 19:40:33 -05003757static char *rbd_dev_image_name(struct rbd_device *rbd_dev)
3758{
3759 size_t image_id_size;
3760 char *image_id;
3761 void *p;
3762 void *end;
3763 size_t size;
3764 void *reply_buf = NULL;
3765 size_t len = 0;
3766 char *image_name = NULL;
3767 int ret;
3768
3769 rbd_assert(!rbd_dev->spec->image_name);
3770
Alex Elder69e7a022012-11-01 08:39:26 -05003771 len = strlen(rbd_dev->spec->image_id);
3772 image_id_size = sizeof (__le32) + len;
Alex Elder9e15b772012-10-30 19:40:33 -05003773 image_id = kmalloc(image_id_size, GFP_KERNEL);
3774 if (!image_id)
3775 return NULL;
3776
3777 p = image_id;
Alex Elder41579762013-04-21 12:14:45 -05003778 end = image_id + image_id_size;
Alex Elder57385b52013-04-21 12:14:45 -05003779 ceph_encode_string(&p, end, rbd_dev->spec->image_id, (u32)len);
Alex Elder9e15b772012-10-30 19:40:33 -05003780
3781 size = sizeof (__le32) + RBD_IMAGE_NAME_LEN_MAX;
3782 reply_buf = kmalloc(size, GFP_KERNEL);
3783 if (!reply_buf)
3784 goto out;
3785
Alex Elder36be9a72013-01-19 00:30:28 -06003786 ret = rbd_obj_method_sync(rbd_dev, RBD_DIRECTORY,
Alex Elder9e15b772012-10-30 19:40:33 -05003787 "rbd", "dir_get_name",
3788 image_id, image_id_size,
Alex Eldere2a58ee2013-04-30 00:44:33 -05003789 reply_buf, size);
Alex Elder9e15b772012-10-30 19:40:33 -05003790 if (ret < 0)
3791 goto out;
3792 p = reply_buf;
Alex Elderf40eb342013-04-25 15:09:42 -05003793 end = reply_buf + ret;
3794
Alex Elder9e15b772012-10-30 19:40:33 -05003795 image_name = ceph_extract_encoded_string(&p, end, &len, GFP_KERNEL);
3796 if (IS_ERR(image_name))
3797 image_name = NULL;
3798 else
3799 dout("%s: name is %s len is %zd\n", __func__, image_name, len);
3800out:
3801 kfree(reply_buf);
3802 kfree(image_id);
3803
3804 return image_name;
3805}
3806
Alex Elder2ad3d712013-04-30 00:44:33 -05003807static u64 rbd_v1_snap_id_by_name(struct rbd_device *rbd_dev, const char *name)
3808{
3809 struct ceph_snap_context *snapc = rbd_dev->header.snapc;
3810 const char *snap_name;
3811 u32 which = 0;
3812
3813 /* Skip over names until we find the one we are looking for */
3814
3815 snap_name = rbd_dev->header.snap_names;
3816 while (which < snapc->num_snaps) {
3817 if (!strcmp(name, snap_name))
3818 return snapc->snaps[which];
3819 snap_name += strlen(snap_name) + 1;
3820 which++;
3821 }
3822 return CEPH_NOSNAP;
3823}
3824
3825static u64 rbd_v2_snap_id_by_name(struct rbd_device *rbd_dev, const char *name)
3826{
3827 struct ceph_snap_context *snapc = rbd_dev->header.snapc;
3828 u32 which;
3829 bool found = false;
3830 u64 snap_id;
3831
3832 for (which = 0; !found && which < snapc->num_snaps; which++) {
3833 const char *snap_name;
3834
3835 snap_id = snapc->snaps[which];
3836 snap_name = rbd_dev_v2_snap_name(rbd_dev, snap_id);
3837 if (IS_ERR(snap_name))
3838 break;
3839 found = !strcmp(name, snap_name);
3840 kfree(snap_name);
3841 }
3842 return found ? snap_id : CEPH_NOSNAP;
3843}
3844
3845/*
3846 * Assumes name is never RBD_SNAP_HEAD_NAME; returns CEPH_NOSNAP if
3847 * no snapshot by that name is found, or if an error occurs.
3848 */
3849static u64 rbd_snap_id_by_name(struct rbd_device *rbd_dev, const char *name)
3850{
3851 if (rbd_dev->image_format == 1)
3852 return rbd_v1_snap_id_by_name(rbd_dev, name);
3853
3854 return rbd_v2_snap_id_by_name(rbd_dev, name);
3855}
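
/*
 * The format split above reflects where snapshot names live: format 1
 * images keep every name in the header's snap_names blob, so lookup
 * is a local string scan, while format 2 images store names on the
 * OSDs, so each candidate id costs a "get_snapshot_name" method call.
 */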
3856
Alex Elder9e15b772012-10-30 19:40:33 -05003857/*
Alex Elder2e9f7f12013-04-26 09:43:48 -05003858 * When an rbd image has a parent image, it is identified by the
3859 * pool, image, and snapshot ids (not names). This function fills
3860 * in the names for those ids. (It's OK if we can't figure out the
3861 * name for an image id, but the pool and snapshot ids should always
3862 * exist and have names.) All names in an rbd spec are dynamically
3863 * allocated.
Alex Eldere1d42132013-04-25 23:15:08 -05003864 *
3865 * When an image being mapped (not a parent) is probed, we have the
3866 * pool name and pool id, image name and image id, and the snapshot
3867 * name. The only thing we're missing is the snapshot id.
Alex Elder9e15b772012-10-30 19:40:33 -05003868 */
Alex Elder2e9f7f12013-04-26 09:43:48 -05003869static int rbd_dev_spec_update(struct rbd_device *rbd_dev)
Alex Elder9e15b772012-10-30 19:40:33 -05003870{
Alex Elder2e9f7f12013-04-26 09:43:48 -05003871 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
3872 struct rbd_spec *spec = rbd_dev->spec;
3873 const char *pool_name;
3874 const char *image_name;
3875 const char *snap_name;
Alex Elder9e15b772012-10-30 19:40:33 -05003876 int ret;
3877
Alex Eldere1d42132013-04-25 23:15:08 -05003878 /*
3879 * An image being mapped will have the pool name (etc.), but
3880 * we need to look up the snapshot id.
3881 */
Alex Elder2e9f7f12013-04-26 09:43:48 -05003882 if (spec->pool_name) {
3883 if (strcmp(spec->snap_name, RBD_SNAP_HEAD_NAME)) {
Alex Elder2ad3d712013-04-30 00:44:33 -05003884 u64 snap_id;
Alex Eldere1d42132013-04-25 23:15:08 -05003885
Alex Elder2ad3d712013-04-30 00:44:33 -05003886 snap_id = rbd_snap_id_by_name(rbd_dev, spec->snap_name);
3887 if (snap_id == CEPH_NOSNAP)
Alex Eldere1d42132013-04-25 23:15:08 -05003888 return -ENOENT;
Alex Elder2ad3d712013-04-30 00:44:33 -05003889 spec->snap_id = snap_id;
Alex Eldere1d42132013-04-25 23:15:08 -05003890 } else {
Alex Elder2e9f7f12013-04-26 09:43:48 -05003891 spec->snap_id = CEPH_NOSNAP;
Alex Eldere1d42132013-04-25 23:15:08 -05003892 }
3893
3894 return 0;
3895 }
Alex Elder9e15b772012-10-30 19:40:33 -05003896
Alex Elder2e9f7f12013-04-26 09:43:48 -05003897 /* Get the pool name; we have to make our own copy of this */
Alex Elder9e15b772012-10-30 19:40:33 -05003898
Alex Elder2e9f7f12013-04-26 09:43:48 -05003899 pool_name = ceph_pg_pool_name_by_id(osdc->osdmap, spec->pool_id);
3900 if (!pool_name) {
3901 rbd_warn(rbd_dev, "no pool with id %llu", spec->pool_id);
Alex Elder935dc892012-11-01 10:17:15 -05003902 return -EIO;
3903 }
Alex Elder2e9f7f12013-04-26 09:43:48 -05003904 pool_name = kstrdup(pool_name, GFP_KERNEL);
3905 if (!pool_name)
Alex Elder9e15b772012-10-30 19:40:33 -05003906 return -ENOMEM;
3907
3908 /* Fetch the image name; tolerate failure here */
3909
Alex Elder2e9f7f12013-04-26 09:43:48 -05003910 image_name = rbd_dev_image_name(rbd_dev);
3911 if (!image_name)
Alex Elder06ecc6c2012-11-01 10:17:15 -05003912 rbd_warn(rbd_dev, "unable to get image name");
Alex Elder9e15b772012-10-30 19:40:33 -05003913
Alex Elder2e9f7f12013-04-26 09:43:48 -05003914 /* Look up the snapshot name, and make a copy */
Alex Elder9e15b772012-10-30 19:40:33 -05003915
Alex Elder2e9f7f12013-04-26 09:43:48 -05003916 snap_name = rbd_snap_name(rbd_dev, spec->snap_id);
3917 if (!snap_name) {
Alex Elder2e9f7f12013-04-26 09:43:48 -05003918 ret = -ENOMEM;
Alex Elder9e15b772012-10-30 19:40:33 -05003919 goto out_err;
Alex Elder2e9f7f12013-04-26 09:43:48 -05003920 }
3921
3922 spec->pool_name = pool_name;
3923 spec->image_name = image_name;
3924 spec->snap_name = snap_name;
Alex Elder9e15b772012-10-30 19:40:33 -05003925
3926 return 0;
3927out_err:
Alex Elder2e9f7f12013-04-26 09:43:48 -05003928 kfree(image_name);
3929 kfree(pool_name);
Alex Elder9e15b772012-10-30 19:40:33 -05003930
3931 return ret;
3932}
3933
Alex Eldercc4a38bd2013-04-30 00:44:33 -05003934static int rbd_dev_v2_snap_context(struct rbd_device *rbd_dev)
Alex Elder35d489f2012-07-03 16:01:19 -05003935{
3936 size_t size;
3937 int ret;
3938 void *reply_buf;
3939 void *p;
3940 void *end;
3941 u64 seq;
3942 u32 snap_count;
3943 struct ceph_snap_context *snapc;
3944 u32 i;
3945
3946 /*
3947 * We'll need room for the seq value (maximum snapshot id),
3948 * snapshot count, and array of that many snapshot ids.
3949 * For now we have a fixed upper limit on the number we're
3950 * prepared to receive.
3951 */
3952 size = sizeof (__le64) + sizeof (__le32) +
3953 RBD_MAX_SNAP_COUNT * sizeof (__le64);
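	/*
	 * With RBD_MAX_SNAP_COUNT of 510 this works out to
	 * 8 + 4 + 510 * 8 = 4092 bytes, so the largest reply we are
	 * prepared to accept fits in a single 4 KB page.
	 */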
3954 reply_buf = kzalloc(size, GFP_KERNEL);
3955 if (!reply_buf)
3956 return -ENOMEM;
3957
Alex Elder36be9a72013-01-19 00:30:28 -06003958 ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
Alex Elder41579762013-04-21 12:14:45 -05003959 "rbd", "get_snapcontext", NULL, 0,
Alex Eldere2a58ee2013-04-30 00:44:33 -05003960 reply_buf, size);
Alex Elder36be9a72013-01-19 00:30:28 -06003961 dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
Alex Elder35d489f2012-07-03 16:01:19 -05003962 if (ret < 0)
3963 goto out;
3964
Alex Elder35d489f2012-07-03 16:01:19 -05003965 p = reply_buf;
Alex Elder57385b52013-04-21 12:14:45 -05003966 end = reply_buf + ret;
3967 ret = -ERANGE;
Alex Elder35d489f2012-07-03 16:01:19 -05003968 ceph_decode_64_safe(&p, end, seq, out);
3969 ceph_decode_32_safe(&p, end, snap_count, out);
3970
3971 /*
3972 * Make sure the reported number of snapshot ids wouldn't go
3973 * beyond the end of our buffer. But before checking that,
3974 * make sure the computed size of the snapshot context we
3975 * allocate is representable in a size_t.
3976 */
3977 if (snap_count > (SIZE_MAX - sizeof (struct ceph_snap_context))
3978 / sizeof (u64)) {
3979 ret = -EINVAL;
3980 goto out;
3981 }
3982 if (!ceph_has_room(&p, end, snap_count * sizeof (__le64)))
3983 goto out;
Alex Elder468521c2013-04-26 09:43:47 -05003984 ret = 0;
Alex Elder35d489f2012-07-03 16:01:19 -05003985
Alex Elder812164f82013-04-30 00:44:32 -05003986 snapc = ceph_create_snap_context(snap_count, GFP_KERNEL);
Alex Elder35d489f2012-07-03 16:01:19 -05003987 if (!snapc) {
3988 ret = -ENOMEM;
3989 goto out;
3990 }
Alex Elder35d489f2012-07-03 16:01:19 -05003991 snapc->seq = seq;
Alex Elder35d489f2012-07-03 16:01:19 -05003992 for (i = 0; i < snap_count; i++)
3993 snapc->snaps[i] = ceph_decode_64(&p);
3994
3995 rbd_dev->header.snapc = snapc;
3996
3997 dout(" snap context seq = %llu, snap_count = %u\n",
Alex Elder57385b52013-04-21 12:14:45 -05003998 (unsigned long long)seq, (unsigned int)snap_count);
Alex Elder35d489f2012-07-03 16:01:19 -05003999out:
4000 kfree(reply_buf);
4001
Alex Elder57385b52013-04-21 12:14:45 -05004002 return ret;
Alex Elder35d489f2012-07-03 16:01:19 -05004003}
4004
Alex Elder54cac612013-04-30 00:44:33 -05004005static const char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev,
4006 u64 snap_id)
Alex Elderb8b1e2d2012-07-03 16:01:19 -05004007{
4008 size_t size;
4009 void *reply_buf;
Alex Elder54cac612013-04-30 00:44:33 -05004010 __le64 snapid;
Alex Elderb8b1e2d2012-07-03 16:01:19 -05004011 int ret;
4012 void *p;
4013 void *end;
Alex Elderb8b1e2d2012-07-03 16:01:19 -05004014 char *snap_name;
4015
4016 size = sizeof (__le32) + RBD_MAX_SNAP_NAME_LEN;
4017 reply_buf = kmalloc(size, GFP_KERNEL);
4018 if (!reply_buf)
4019 return ERR_PTR(-ENOMEM);
4020
Alex Elder54cac612013-04-30 00:44:33 -05004021 snapid = cpu_to_le64(snap_id);
Alex Elder36be9a72013-01-19 00:30:28 -06004022 ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
Alex Elderb8b1e2d2012-07-03 16:01:19 -05004023 "rbd", "get_snapshot_name",
Alex Elder54cac612013-04-30 00:44:33 -05004024 &snapid, sizeof (snapid),
Alex Eldere2a58ee2013-04-30 00:44:33 -05004025 reply_buf, size);
Alex Elder36be9a72013-01-19 00:30:28 -06004026 dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
Alex Elderf40eb342013-04-25 15:09:42 -05004027 if (ret < 0) {
4028 snap_name = ERR_PTR(ret);
Alex Elderb8b1e2d2012-07-03 16:01:19 -05004029 goto out;
Alex Elderf40eb342013-04-25 15:09:42 -05004030 }
Alex Elderb8b1e2d2012-07-03 16:01:19 -05004031
4032 p = reply_buf;
Alex Elderf40eb342013-04-25 15:09:42 -05004033 end = reply_buf + ret;
Alex Eldere5c35532012-10-25 23:34:41 -05004034 snap_name = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL);
Alex Elderf40eb342013-04-25 15:09:42 -05004035 if (IS_ERR(snap_name))
Alex Elderb8b1e2d2012-07-03 16:01:19 -05004036 goto out;
Alex Elderb8b1e2d2012-07-03 16:01:19 -05004037
Alex Elderf40eb342013-04-25 15:09:42 -05004038 dout(" snap_id 0x%016llx snap_name = %s\n",
Alex Elder54cac612013-04-30 00:44:33 -05004039 (unsigned long long)snap_id, snap_name);
Alex Elderb8b1e2d2012-07-03 16:01:19 -05004040out:
4041 kfree(reply_buf);
4042
Alex Elderf40eb342013-04-25 15:09:42 -05004043 return snap_name;
Alex Elderb8b1e2d2012-07-03 16:01:19 -05004044}
4045
Alex Eldercc4a38bd2013-04-30 00:44:33 -05004046static int rbd_dev_v2_refresh(struct rbd_device *rbd_dev)
Alex Elder117973f2012-08-31 17:29:55 -05004047{
4048 int ret;
Alex Elder117973f2012-08-31 17:29:55 -05004049
4050 down_write(&rbd_dev->header_rwsem);
4051
Alex Elder117973f2012-08-31 17:29:55 -05004052 ret = rbd_dev_v2_image_size(rbd_dev);
4053 if (ret)
4054 goto out;
Alex Elder117973f2012-08-31 17:29:55 -05004055 rbd_update_mapping_size(rbd_dev);
4056
Alex Eldercc4a38bd2013-04-30 00:44:33 -05004057 ret = rbd_dev_v2_snap_context(rbd_dev);
Alex Elder117973f2012-08-31 17:29:55 -05004058 dout("rbd_dev_v2_snap_context returned %d\n", ret);
4059 if (ret)
4060 goto out;
Alex Elder117973f2012-08-31 17:29:55 -05004061out:
4062 up_write(&rbd_dev->header_rwsem);
4063
4064 return ret;
4065}
4066
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004067static int rbd_bus_add_dev(struct rbd_device *rbd_dev)
4068{
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004069 struct device *dev;
Alex Eldercd789ab2012-08-30 00:16:38 -05004070 int ret;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004071
4072 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004073
Alex Eldercd789ab2012-08-30 00:16:38 -05004074 dev = &rbd_dev->dev;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004075 dev->bus = &rbd_bus_type;
4076 dev->type = &rbd_device_type;
4077 dev->parent = &rbd_root_dev;
Alex Elder200a6a82013-04-28 23:32:34 -05004078 dev->release = rbd_dev_device_release;
Alex Elderde71a292012-07-03 16:01:19 -05004079 dev_set_name(dev, "%d", rbd_dev->dev_id);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004080 ret = device_register(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004081
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004082 mutex_unlock(&ctl_mutex);
Alex Eldercd789ab2012-08-30 00:16:38 -05004083
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004084 return ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004085}
4086
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004087static void rbd_bus_del_dev(struct rbd_device *rbd_dev)
4088{
4089 device_unregister(&rbd_dev->dev);
4090}
4091
Alex Eldere2839302012-08-29 17:11:06 -05004092static atomic64_t rbd_dev_id_max = ATOMIC64_INIT(0);
Alex Elder1ddbe942012-01-29 13:57:44 -06004093
4094/*
Alex Elder499afd52012-02-02 08:13:29 -06004095 * Get a unique rbd identifier for the given new rbd_dev, and add
4096 * the rbd_dev to the global list. The minimum rbd id is 1.
Alex Elder1ddbe942012-01-29 13:57:44 -06004097 */
Alex Eldere2839302012-08-29 17:11:06 -05004098static void rbd_dev_id_get(struct rbd_device *rbd_dev)
Alex Elderb7f23c32012-01-29 13:57:43 -06004099{
Alex Eldere2839302012-08-29 17:11:06 -05004100 rbd_dev->dev_id = atomic64_inc_return(&rbd_dev_id_max);
Alex Elder499afd52012-02-02 08:13:29 -06004101
4102 spin_lock(&rbd_dev_list_lock);
4103 list_add_tail(&rbd_dev->node, &rbd_dev_list);
4104 spin_unlock(&rbd_dev_list_lock);
Alex Eldere2839302012-08-29 17:11:06 -05004105 dout("rbd_dev %p given dev id %llu\n", rbd_dev,
4106 (unsigned long long) rbd_dev->dev_id);
Alex Elder1ddbe942012-01-29 13:57:44 -06004107}
Alex Elderb7f23c32012-01-29 13:57:43 -06004108
Alex Elder1ddbe942012-01-29 13:57:44 -06004109/*
Alex Elder499afd52012-02-02 08:13:29 -06004110 * Remove an rbd_dev from the global list, and record that its
4111 * identifier is no longer in use.
Alex Elder1ddbe942012-01-29 13:57:44 -06004112 */
Alex Eldere2839302012-08-29 17:11:06 -05004113static void rbd_dev_id_put(struct rbd_device *rbd_dev)
Alex Elder1ddbe942012-01-29 13:57:44 -06004114{
Alex Elderd184f6b2012-01-29 13:57:44 -06004115 struct list_head *tmp;
Alex Elderde71a292012-07-03 16:01:19 -05004116 int rbd_id = rbd_dev->dev_id;
Alex Elderd184f6b2012-01-29 13:57:44 -06004117 int max_id;
4118
Alex Elderaafb2302012-09-06 16:00:54 -05004119 rbd_assert(rbd_id > 0);
Alex Elder499afd52012-02-02 08:13:29 -06004120
Alex Eldere2839302012-08-29 17:11:06 -05004121 dout("rbd_dev %p released dev id %llu\n", rbd_dev,
4122 (unsigned long long) rbd_dev->dev_id);
Alex Elder499afd52012-02-02 08:13:29 -06004123 spin_lock(&rbd_dev_list_lock);
4124 list_del_init(&rbd_dev->node);
Alex Elderd184f6b2012-01-29 13:57:44 -06004125
4126 /*
4127 * If the id being "put" is not the current maximum, there
4128 * is nothing special we need to do.
4129 */
Alex Eldere2839302012-08-29 17:11:06 -05004130 if (rbd_id != atomic64_read(&rbd_dev_id_max)) {
Alex Elderd184f6b2012-01-29 13:57:44 -06004131 spin_unlock(&rbd_dev_list_lock);
4132 return;
4133 }
4134
4135 /*
4136 * We need to update the current maximum id. Search the
4137 * list to find out what it is. We're more likely to find
4138 * the maximum at the end, so search the list backward.
4139 */
4140 max_id = 0;
4141 list_for_each_prev(tmp, &rbd_dev_list) {
4142 struct rbd_device *rbd_dev;
4143
4144 rbd_dev = list_entry(tmp, struct rbd_device, node);
Alex Elderb213e0b2012-10-10 21:19:13 -07004145 if (rbd_dev->dev_id > max_id)
4146 max_id = rbd_dev->dev_id;
Alex Elderd184f6b2012-01-29 13:57:44 -06004147 }
Alex Elder499afd52012-02-02 08:13:29 -06004148 spin_unlock(&rbd_dev_list_lock);
Alex Elderb7f23c32012-01-29 13:57:43 -06004149
Alex Elder1ddbe942012-01-29 13:57:44 -06004150 /*
Alex Eldere2839302012-08-29 17:11:06 -05004151 * The max id could have been updated by rbd_dev_id_get(), in
Alex Elderd184f6b2012-01-29 13:57:44 -06004152 * which case it now accurately reflects the new maximum.
4153 * Be careful not to overwrite the maximum value in that
4154 * case.
Alex Elder1ddbe942012-01-29 13:57:44 -06004155 */
Alex Eldere2839302012-08-29 17:11:06 -05004156 atomic64_cmpxchg(&rbd_dev_id_max, rbd_id, max_id);
4157 dout(" max dev id has been reset\n");
Alex Elderb7f23c32012-01-29 13:57:43 -06004158}
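/*
 * Worked example (editorial): suppose ids 1..3 are in use and id 3 is
 * being put.  If rbd_dev_id_get() hands out id 4 after the check above
 * but before the cmpxchg, rbd_dev_id_max is already 4, the
 * cmpxchg(3 -> 2) fails, and the concurrently-assigned maximum is
 * left intact.
 */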
4159
Alex Eldera725f65e2012-02-02 08:13:30 -06004160/*
Alex Eldere28fff262012-02-02 08:13:30 -06004161 * Skips over white space at *buf, and updates *buf to point to the
4162 * first found non-space character (if any). Returns the length of
Alex Elder593a9e72012-02-07 12:03:37 -06004163 * the token (string of non-white space characters) found. Note
4164 * that *buf must be terminated with '\0'.
Alex Eldere28fff262012-02-02 08:13:30 -06004165 */
4166static inline size_t next_token(const char **buf)
4167{
4168 /*
4169 * These are the characters that produce nonzero for
4170 * isspace() in the "C" and "POSIX" locales.
4171 */
4172 const char *spaces = " \f\n\r\t\v";
4173
4174 *buf += strspn(*buf, spaces); /* Find start of token */
4175
4176 return strcspn(*buf, spaces); /* Return token length */
4177}
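/*
 * Example (editorial): with *buf pointing at "  pool1 image1",
 * next_token() advances *buf past the spaces to "pool1 image1" and
 * returns 5, the length of "pool1"; *buf is left at the start of the
 * token, not past it.
 */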
4178
4179/*
4180 * Finds the next token in *buf, and if the provided token buffer is
4181 * big enough, copies the found token into it. The result, if
Alex Elder593a9e72012-02-07 12:03:37 -06004182 * copied, is guaranteed to be terminated with '\0'. Note that *buf
4183 * must be terminated with '\0' on entry.
Alex Eldere28fff262012-02-02 08:13:30 -06004184 *
4185 * Returns the length of the token found (not including the '\0').
4186 * Return value will be 0 if no token is found, and it will be >=
4187 * token_size if the token would not fit.
4188 *
Alex Elder593a9e72012-02-07 12:03:37 -06004189 * The *buf pointer will be updated to point beyond the end of the
Alex Eldere28fff262012-02-02 08:13:30 -06004190 * found token. Note that this occurs even if the token buffer is
4191 * too small to hold it.
4192 */
4193static inline size_t copy_token(const char **buf,
4194 char *token,
4195 size_t token_size)
4196{
4197 size_t len;
4198
4199 len = next_token(buf);
4200 if (len < token_size) {
4201 memcpy(token, *buf, len);
4202 *(token + len) = '\0';
4203 }
4204 *buf += len;
4205
4206 return len;
4207}
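/*
 * Example (editorial): copying the token "pool1" into a token buffer
 * of size 4 copies nothing (5 >= 4), but still returns 5 and advances
 * *buf past the token, so the caller can detect the truncation and
 * keep parsing.
 */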
4208
4209/*
Alex Elderea3352f2012-07-09 21:04:23 -05004210 * Finds the next token in *buf, dynamically allocates a buffer big
4211 * enough to hold a copy of it, and copies the token into the new
4212 * buffer. The copy is guaranteed to be terminated with '\0'. Note
4213 * that a duplicate buffer is created even for a zero-length token.
4214 *
4215 * Returns a pointer to the newly-allocated duplicate, or a null
4216 * pointer if memory for the duplicate was not available. If
4217 * the lenp argument is a non-null pointer, the length of the token
4218 * (not including the '\0') is returned in *lenp.
4219 *
4220 * If successful, the *buf pointer will be updated to point beyond
4221 * the end of the found token.
4222 *
4223 * Note: uses GFP_KERNEL for allocation.
4224 */
4225static inline char *dup_token(const char **buf, size_t *lenp)
4226{
4227 char *dup;
4228 size_t len;
4229
4230 len = next_token(buf);
Alex Elder4caf35f2012-11-01 08:39:27 -05004231 dup = kmemdup(*buf, len + 1, GFP_KERNEL);
Alex Elderea3352f2012-07-09 21:04:23 -05004232 if (!dup)
4233 return NULL;
Alex Elderea3352f2012-07-09 21:04:23 -05004234 *(dup + len) = '\0';
4235 *buf += len;
4236
4237 if (lenp)
4238 *lenp = len;
4239
4240 return dup;
4241}
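/*
 * Example (editorial): dup_token() on "rbd0 ro" returns a
 * GFP_KERNEL-allocated "rbd0" (and sets *lenp to 4 if lenp is
 * non-null); the caller must kfree() the result.
 */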
4242
4243/*
Alex Elder859c31d2012-10-25 23:34:42 -05004244 * Parse the options provided for an "rbd add" (i.e., rbd image
4245 * mapping) request. These arrive via a write to /sys/bus/rbd/add,
4246 * and the data written is passed here via a NUL-terminated buffer.
4247 * Returns 0 if successful or an error code otherwise.
Alex Elderd22f76e2012-07-12 10:46:35 -05004248 *
Alex Elder859c31d2012-10-25 23:34:42 -05004249 * The information extracted from these options is recorded in
4250 * the other parameters which return dynamically-allocated
4251 * structures:
4252 * ceph_opts
4253 * The address of a pointer that will refer to a ceph options
4254 * structure. Caller must release the returned pointer using
4255 * ceph_destroy_options() when it is no longer needed.
4256 * rbd_opts
4257 * Address of an rbd options pointer. Fully initialized by
4258 * this function; caller must release with kfree().
4259 * spec
4260 * Address of an rbd image specification pointer. Fully
4261 * initialized by this function based on parsed options.
4262 * Caller must release with rbd_spec_put().
4263 *
4264 * The options passed take this form:
4265 * <mon_addrs> <options> <pool_name> <image_name> [<snap_name>]
4266 * where:
4267 * <mon_addrs>
4268 * A comma-separated list of one or more monitor addresses.
4269 * A monitor address is an ip address, optionally followed
4270 * by a port number (separated by a colon).
4271 * I.e.: ip1[:port1][,ip2[:port2]...]
4272 * <options>
4273 * A comma-separated list of ceph and/or rbd options.
4274 * <pool_name>
4275 * The name of the rados pool containing the rbd image.
4276 * <image_name>
4277 * The name of the image in that pool to map.
4278 * <snap_name>
4279 * An optional snapshot name. If provided, the mapping will
4280 * present data from the image at the time that snapshot was
4281 * created. The image head is used if no snapshot name is
4282 * provided. Snapshot mappings are always read-only.
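 *
 * For example (monitor address and all names below are hypothetical):
 *
 *	1.2.3.4:6789 name=admin rbd myimage mysnap
 *
 * maps snapshot "mysnap" of image "myimage" in pool "rbd".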
Alex Eldera725f65e2012-02-02 08:13:30 -06004283 */
Alex Elder859c31d2012-10-25 23:34:42 -05004284static int rbd_add_parse_args(const char *buf,
Alex Elderdc79b112012-10-25 23:34:41 -05004285 struct ceph_options **ceph_opts,
Alex Elder859c31d2012-10-25 23:34:42 -05004286 struct rbd_options **opts,
4287 struct rbd_spec **rbd_spec)
Alex Eldera725f65e2012-02-02 08:13:30 -06004288{
Alex Elderd22f76e2012-07-12 10:46:35 -05004289 size_t len;
Alex Elder859c31d2012-10-25 23:34:42 -05004290 char *options;
Alex Elder0ddebc02012-10-25 23:34:41 -05004291 const char *mon_addrs;
Alex Elderecb4dc22013-04-26 09:43:47 -05004292 char *snap_name;
Alex Elder0ddebc02012-10-25 23:34:41 -05004293 size_t mon_addrs_size;
Alex Elder859c31d2012-10-25 23:34:42 -05004294 struct rbd_spec *spec = NULL;
Alex Elder4e9afeb2012-10-25 23:34:41 -05004295 struct rbd_options *rbd_opts = NULL;
Alex Elder859c31d2012-10-25 23:34:42 -05004296 struct ceph_options *copts;
Alex Elderdc79b112012-10-25 23:34:41 -05004297 int ret;
Alex Eldere28fff262012-02-02 08:13:30 -06004298
4299 /* The first four tokens are required */
4300
Alex Elder7ef32142012-02-02 08:13:30 -06004301 len = next_token(&buf);
Alex Elder4fb5d6712012-11-01 10:17:15 -05004302 if (!len) {
4303 rbd_warn(NULL, "no monitor address(es) provided");
4304 return -EINVAL;
4305 }
Alex Elder0ddebc02012-10-25 23:34:41 -05004306 mon_addrs = buf;
Alex Elderf28e5652012-10-25 23:34:41 -05004307 mon_addrs_size = len + 1;
Alex Elder7ef32142012-02-02 08:13:30 -06004308 buf += len;
Alex Eldera725f65e2012-02-02 08:13:30 -06004309
Alex Elderdc79b112012-10-25 23:34:41 -05004310 ret = -EINVAL;
Alex Elderf28e5652012-10-25 23:34:41 -05004311 options = dup_token(&buf, NULL);
4312 if (!options)
Alex Elderdc79b112012-10-25 23:34:41 -05004313 return -ENOMEM;
Alex Elder4fb5d6712012-11-01 10:17:15 -05004314 if (!*options) {
4315 rbd_warn(NULL, "no options provided");
4316 goto out_err;
4317 }
Alex Eldera725f65e2012-02-02 08:13:30 -06004318
Alex Elder859c31d2012-10-25 23:34:42 -05004319 spec = rbd_spec_alloc();
4320 if (!spec)
Alex Elderf28e5652012-10-25 23:34:41 -05004321 goto out_mem;
Alex Elder859c31d2012-10-25 23:34:42 -05004322
4323 spec->pool_name = dup_token(&buf, NULL);
4324 if (!spec->pool_name)
4325 goto out_mem;
Alex Elder4fb5d6712012-11-01 10:17:15 -05004326 if (!*spec->pool_name) {
4327 rbd_warn(NULL, "no pool name provided");
4328 goto out_err;
4329 }
Alex Eldere28fff262012-02-02 08:13:30 -06004330
Alex Elder69e7a022012-11-01 08:39:26 -05004331 spec->image_name = dup_token(&buf, NULL);
Alex Elder859c31d2012-10-25 23:34:42 -05004332 if (!spec->image_name)
Alex Elderf28e5652012-10-25 23:34:41 -05004333 goto out_mem;
Alex Elder4fb5d6712012-11-01 10:17:15 -05004334 if (!*spec->image_name) {
4335 rbd_warn(NULL, "no image name provided");
4336 goto out_err;
4337 }
Alex Eldere28fff262012-02-02 08:13:30 -06004338
Alex Elderf28e5652012-10-25 23:34:41 -05004339 /*
4340 * Snapshot name is optional; default is to use "-"
4341 * (indicating the head/no snapshot).
4342 */
Alex Elder3feeb8942012-08-31 17:29:52 -05004343 len = next_token(&buf);
Alex Elder820a5f32012-07-09 21:04:24 -05004344 if (!len) {
Alex Elder3feeb8942012-08-31 17:29:52 -05004345 buf = RBD_SNAP_HEAD_NAME; /* No snapshot supplied */
4346 len = sizeof (RBD_SNAP_HEAD_NAME) - 1;
Alex Elderf28e5652012-10-25 23:34:41 -05004347 } else if (len > RBD_MAX_SNAP_NAME_LEN) {
Alex Elderdc79b112012-10-25 23:34:41 -05004348 ret = -ENAMETOOLONG;
Alex Elderf28e5652012-10-25 23:34:41 -05004349 goto out_err;
Alex Elder849b4262012-07-09 21:04:24 -05004350 }
Alex Elderecb4dc22013-04-26 09:43:47 -05004351 snap_name = kmemdup(buf, len + 1, GFP_KERNEL);
4352 if (!snap_name)
Alex Elderf28e5652012-10-25 23:34:41 -05004353 goto out_mem;
Alex Elderecb4dc22013-04-26 09:43:47 -05004354 *(snap_name + len) = '\0';
4355 spec->snap_name = snap_name;
Alex Eldere5c35532012-10-25 23:34:41 -05004356
Alex Elder0ddebc02012-10-25 23:34:41 -05004357 /* Initialize all rbd options to the defaults */
Alex Eldere28fff262012-02-02 08:13:30 -06004358
Alex Elder4e9afeb2012-10-25 23:34:41 -05004359 rbd_opts = kzalloc(sizeof (*rbd_opts), GFP_KERNEL);
4360 if (!rbd_opts)
4361 goto out_mem;
4362
4363 rbd_opts->read_only = RBD_READ_ONLY_DEFAULT;
Alex Elderd22f76e2012-07-12 10:46:35 -05004364
Alex Elder859c31d2012-10-25 23:34:42 -05004365 copts = ceph_parse_options(options, mon_addrs,
Alex Elder0ddebc02012-10-25 23:34:41 -05004366 mon_addrs + mon_addrs_size - 1,
Alex Elder4e9afeb2012-10-25 23:34:41 -05004367 parse_rbd_opts_token, rbd_opts);
Alex Elder859c31d2012-10-25 23:34:42 -05004368 if (IS_ERR(copts)) {
4369 ret = PTR_ERR(copts);
Alex Elderdc79b112012-10-25 23:34:41 -05004370 goto out_err;
4371 }
Alex Elder859c31d2012-10-25 23:34:42 -05004372 kfree(options);
4373
4374 *ceph_opts = copts;
Alex Elder4e9afeb2012-10-25 23:34:41 -05004375 *opts = rbd_opts;
Alex Elder859c31d2012-10-25 23:34:42 -05004376 *rbd_spec = spec;
Alex Elder0ddebc02012-10-25 23:34:41 -05004377
Alex Elderdc79b112012-10-25 23:34:41 -05004378 return 0;
Alex Elderf28e5652012-10-25 23:34:41 -05004379out_mem:
Alex Elderdc79b112012-10-25 23:34:41 -05004380 ret = -ENOMEM;
Alex Elderd22f76e2012-07-12 10:46:35 -05004381out_err:
Alex Elder859c31d2012-10-25 23:34:42 -05004382 kfree(rbd_opts);
4383 rbd_spec_put(spec);
Alex Elderf28e5652012-10-25 23:34:41 -05004384 kfree(options);
Alex Elderd22f76e2012-07-12 10:46:35 -05004385
Alex Elderdc79b112012-10-25 23:34:41 -05004386 return ret;
Alex Eldera725f65e2012-02-02 08:13:30 -06004387}
4388
Alex Elder589d30e2012-07-10 20:30:11 -05004389/*
4390 * An rbd format 2 image has a unique identifier, distinct from the
4391 * name given to it by the user. Internally, that identifier is
4392 * what's used to specify the names of objects related to the image.
4393 *
4394 * A special "rbd id" object is used to map an rbd image name to its
4395 * id. If that object doesn't exist, then there is no v2 rbd image
4396 * with the supplied name.
4397 *
4398 * This function will record the given rbd_dev's image_id field if
4399 * it can be determined, and in that case will return 0. If any
4400 * errors occur a negative errno will be returned and the rbd_dev's
4401 * image_id field will be unchanged (and should be NULL).
4402 */
4403static int rbd_dev_image_id(struct rbd_device *rbd_dev)
4404{
4405 int ret;
4406 size_t size;
4407 char *object_name;
4408 void *response;
Alex Elderc0fba362013-04-25 23:15:08 -05004409 char *image_id;
Alex Elder2f82ee52012-10-30 19:40:33 -05004410
Alex Elder589d30e2012-07-10 20:30:11 -05004411 /*
Alex Elder2c0d0a12012-10-30 19:40:33 -05004412 * When probing a parent image, the image id is already
4413 * known (and the image name likely is not). There's no
Alex Elderc0fba362013-04-25 23:15:08 -05004414 * need to fetch the image id again in this case. We
4415 * do still need to set the image format though.
Alex Elder2c0d0a12012-10-30 19:40:33 -05004416 */
Alex Elderc0fba362013-04-25 23:15:08 -05004417 if (rbd_dev->spec->image_id) {
4418 rbd_dev->image_format = *rbd_dev->spec->image_id ? 2 : 1;
4419
Alex Elder2c0d0a12012-10-30 19:40:33 -05004420 return 0;
Alex Elderc0fba362013-04-25 23:15:08 -05004421 }
Alex Elder2c0d0a12012-10-30 19:40:33 -05004422
4423 /*
Alex Elder589d30e2012-07-10 20:30:11 -05004424 * First, see if the format 2 image id file exists, and if
4425 * so, get the image's persistent id from it.
4426 */
Alex Elder69e7a022012-11-01 08:39:26 -05004427 size = sizeof (RBD_ID_PREFIX) + strlen(rbd_dev->spec->image_name);
Alex Elder589d30e2012-07-10 20:30:11 -05004428 object_name = kmalloc(size, GFP_NOIO);
4429 if (!object_name)
4430 return -ENOMEM;
Alex Elder0d7dbfc2012-10-25 23:34:41 -05004431 sprintf(object_name, "%s%s", RBD_ID_PREFIX, rbd_dev->spec->image_name);
Alex Elder589d30e2012-07-10 20:30:11 -05004432 dout("rbd id object name is %s\n", object_name);
4433
4434 /* Response will be an encoded string, which includes a length */
4435
4436 size = sizeof (__le32) + RBD_IMAGE_ID_LEN_MAX;
4437 response = kzalloc(size, GFP_NOIO);
4438 if (!response) {
4439 ret = -ENOMEM;
4440 goto out;
4441 }
4442
Alex Elderc0fba362013-04-25 23:15:08 -05004443 /* If it doesn't exist we'll assume it's a format 1 image */
4444
Alex Elder36be9a72013-01-19 00:30:28 -06004445 ret = rbd_obj_method_sync(rbd_dev, object_name,
Alex Elder41579762013-04-21 12:14:45 -05004446 "rbd", "get_id", NULL, 0,
Alex Eldere2a58ee2013-04-30 00:44:33 -05004447 response, RBD_IMAGE_ID_LEN_MAX);
Alex Elder36be9a72013-01-19 00:30:28 -06004448 dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
Alex Elderc0fba362013-04-25 23:15:08 -05004449 if (ret == -ENOENT) {
4450 image_id = kstrdup("", GFP_KERNEL);
4451 ret = image_id ? 0 : -ENOMEM;
4452 if (!ret)
4453 rbd_dev->image_format = 1;
4454 } else if (ret > sizeof (__le32)) {
4455 void *p = response;
Alex Elder589d30e2012-07-10 20:30:11 -05004456
Alex Elderc0fba362013-04-25 23:15:08 -05004457 image_id = ceph_extract_encoded_string(&p, p + ret,
Alex Elder979ed482012-11-01 08:39:26 -05004458 NULL, GFP_NOIO);
Alex Elderc0fba362013-04-25 23:15:08 -05004459 ret = IS_ERR(image_id) ? PTR_ERR(image_id) : 0;
4460 if (!ret)
4461 rbd_dev->image_format = 2;
Alex Elder589d30e2012-07-10 20:30:11 -05004462 } else {
Alex Elderc0fba362013-04-25 23:15:08 -05004463 ret = -EINVAL;
4464 }
4465
4466 if (!ret) {
4467 rbd_dev->spec->image_id = image_id;
4468 dout("image_id is %s\n", image_id);
Alex Elder589d30e2012-07-10 20:30:11 -05004469 }
4470out:
4471 kfree(response);
4472 kfree(object_name);
4473
4474 return ret;
4475}
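/*
 * Editorial note: assuming the conventional RBD_ID_PREFIX of
 * "rbd_id.", probing an image named "foo" reads object "rbd_id.foo".
 * A format 2 image answers the get_id call with its id string, while
 * a format 1 image has no id object at all, so the -ENOENT above is
 * what selects format 1.
 */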
4476
Alex Elder6fd48b32013-04-28 23:32:34 -05004477/* Undo whatever state changes are made by v1 or v2 image probe */
4478
4479static void rbd_dev_unprobe(struct rbd_device *rbd_dev)
4480{
4481 struct rbd_image_header *header;
4482
4483 rbd_dev_remove_parent(rbd_dev);
4484 rbd_spec_put(rbd_dev->parent_spec);
4485 rbd_dev->parent_spec = NULL;
4486 rbd_dev->parent_overlap = 0;
4487
4488 /* Free dynamic fields from the header, then zero it out */
4489
4490 header = &rbd_dev->header;
Alex Elder812164f82013-04-30 00:44:32 -05004491 ceph_put_snap_context(header->snapc);
Alex Elder6fd48b32013-04-28 23:32:34 -05004492 kfree(header->snap_sizes);
4493 kfree(header->snap_names);
4494 kfree(header->object_prefix);
4495 memset(header, 0, sizeof (*header));
4496}
4497
Alex Eldera30b71b2012-07-10 20:30:11 -05004498static int rbd_dev_v1_probe(struct rbd_device *rbd_dev)
4499{
4500 int ret;
Alex Eldera30b71b2012-07-10 20:30:11 -05004501
4502 /* Populate rbd image metadata */
4503
4504 ret = rbd_read_header(rbd_dev, &rbd_dev->header);
4505 if (ret < 0)
4506 goto out_err;
Alex Elder86b00e02012-10-25 23:34:42 -05004507
4508 /* Version 1 images have no parent (no layering) */
4509
4510 rbd_dev->parent_spec = NULL;
4511 rbd_dev->parent_overlap = 0;
4512
Alex Eldera30b71b2012-07-10 20:30:11 -05004513 dout("discovered version 1 image, header name is %s\n",
4514 rbd_dev->header_name);
4515
4516 return 0;
4517
4518out_err:
4519 kfree(rbd_dev->header_name);
4520 rbd_dev->header_name = NULL;
Alex Elder0d7dbfc2012-10-25 23:34:41 -05004521 kfree(rbd_dev->spec->image_id);
4522 rbd_dev->spec->image_id = NULL;
Alex Eldera30b71b2012-07-10 20:30:11 -05004523
4524 return ret;
4525}
4526
4527static int rbd_dev_v2_probe(struct rbd_device *rbd_dev)
4528{
Alex Elder9d475de2012-07-03 16:01:19 -05004529 int ret;
Alex Eldera30b71b2012-07-10 20:30:11 -05004530
Alex Elder9d475de2012-07-03 16:01:19 -05004531 ret = rbd_dev_v2_image_size(rbd_dev);
Alex Elder57385b52013-04-21 12:14:45 -05004532 if (ret)
Alex Elder9d475de2012-07-03 16:01:19 -05004533 goto out_err;
Alex Elder1e130192012-07-03 16:01:19 -05004534
4535 /* Get the object prefix (a.k.a. block_name) for the image */
4536
4537 ret = rbd_dev_v2_object_prefix(rbd_dev);
Alex Elder57385b52013-04-21 12:14:45 -05004538 if (ret)
Alex Elder1e130192012-07-03 16:01:19 -05004539 goto out_err;
Alex Elderb1b54022012-07-03 16:01:19 -05004540
Alex Elderd8891402012-10-09 13:50:17 -07004541	/* Get and check the features for the image */
Alex Elderb1b54022012-07-03 16:01:19 -05004542
4543 ret = rbd_dev_v2_features(rbd_dev);
Alex Elder57385b52013-04-21 12:14:45 -05004544 if (ret)
Alex Elderb1b54022012-07-03 16:01:19 -05004545 goto out_err;
Alex Elder35d489f2012-07-03 16:01:19 -05004546
Alex Elder86b00e02012-10-25 23:34:42 -05004547 /* If the image supports layering, get the parent info */
4548
4549 if (rbd_dev->header.features & RBD_FEATURE_LAYERING) {
4550 ret = rbd_dev_v2_parent_info(rbd_dev);
Alex Elder57385b52013-04-21 12:14:45 -05004551 if (ret)
Alex Elder86b00e02012-10-25 23:34:42 -05004552 goto out_err;
Alex Elder96882f52013-04-30 00:44:32 -05004553
4554 /*
4555		 * Don't print a warning for parent images.  We can
4556		 * tell at this point because we won't know its pool
4557		 * name yet (just its pool id).
4558 */
4559 if (rbd_dev->spec->pool_name)
4560 rbd_warn(rbd_dev, "WARNING: kernel layering "
4561 "is EXPERIMENTAL!");
Alex Elder86b00e02012-10-25 23:34:42 -05004562 }
4563
Alex Eldercc070d52013-04-21 12:14:45 -05004564 /* If the image supports fancy striping, get its parameters */
4565
4566 if (rbd_dev->header.features & RBD_FEATURE_STRIPINGV2) {
4567 ret = rbd_dev_v2_striping_info(rbd_dev);
4568 if (ret < 0)
4569 goto out_err;
4570 }
4571
Alex Elder6e14b1a2012-07-03 16:01:19 -05004572 /* crypto and compression type aren't (yet) supported for v2 images */
Alex Elder35d489f2012-07-03 16:01:19 -05004573
Alex Elder6e14b1a2012-07-03 16:01:19 -05004574 rbd_dev->header.crypt_type = 0;
4575 rbd_dev->header.comp_type = 0;
4576
4577	/* Get the snapshot context */
4578
Alex Eldercc4a38bd2013-04-30 00:44:33 -05004579 ret = rbd_dev_v2_snap_context(rbd_dev);
Alex Elder35d489f2012-07-03 16:01:19 -05004580 if (ret)
4581 goto out_err;
Alex Elder6e14b1a2012-07-03 16:01:19 -05004582
Alex Eldera30b71b2012-07-10 20:30:11 -05004583 dout("discovered version 2 image, header name is %s\n",
4584 rbd_dev->header_name);
4585
Alex Elder35152972012-08-31 17:29:55 -05004586 return 0;
Alex Elder9d475de2012-07-03 16:01:19 -05004587out_err:
Alex Elder86b00e02012-10-25 23:34:42 -05004588 rbd_dev->parent_overlap = 0;
4589 rbd_spec_put(rbd_dev->parent_spec);
4590 rbd_dev->parent_spec = NULL;
Alex Elder9d475de2012-07-03 16:01:19 -05004591 kfree(rbd_dev->header_name);
4592 rbd_dev->header_name = NULL;
Alex Elder1e130192012-07-03 16:01:19 -05004593 kfree(rbd_dev->header.object_prefix);
4594 rbd_dev->header.object_prefix = NULL;
Alex Elder9d475de2012-07-03 16:01:19 -05004595
4596 return ret;
Alex Eldera30b71b2012-07-10 20:30:11 -05004597}
4598
Alex Elder124afba2013-04-26 15:44:36 -05004599static int rbd_dev_probe_parent(struct rbd_device *rbd_dev)
Alex Elder83a06262012-10-30 15:47:17 -05004600{
Alex Elder2f82ee52012-10-30 19:40:33 -05004601 struct rbd_device *parent = NULL;
Alex Elder124afba2013-04-26 15:44:36 -05004602 struct rbd_spec *parent_spec;
4603 struct rbd_client *rbdc;
4604 int ret;
4605
4606 if (!rbd_dev->parent_spec)
4607 return 0;
4608 /*
4609 * We need to pass a reference to the client and the parent
4610 * spec when creating the parent rbd_dev. Images related by
4611 * parent/child relationships always share both.
4612 */
4613 parent_spec = rbd_spec_get(rbd_dev->parent_spec);
4614 rbdc = __rbd_get_client(rbd_dev->rbd_client);
4615
4616 ret = -ENOMEM;
4617 parent = rbd_dev_create(rbdc, parent_spec);
4618 if (!parent)
4619 goto out_err;
4620
4621 ret = rbd_dev_image_probe(parent);
4622 if (ret < 0)
4623 goto out_err;
4624 rbd_dev->parent = parent;
4625
4626 return 0;
4627out_err:
4628 if (parent) {
4629 rbd_spec_put(rbd_dev->parent_spec);
4630 kfree(rbd_dev->header_name);
4631 rbd_dev_destroy(parent);
4632 } else {
4633 rbd_put_client(rbdc);
4634 rbd_spec_put(parent_spec);
4635 }
4636
4637 return ret;
4638}
4639
Alex Elder200a6a82013-04-28 23:32:34 -05004640static int rbd_dev_device_setup(struct rbd_device *rbd_dev)
Alex Elder124afba2013-04-26 15:44:36 -05004641{
Alex Elder83a06262012-10-30 15:47:17 -05004642 int ret;
Alex Elder83a06262012-10-30 15:47:17 -05004643
Alex Elderd1cf5782013-04-27 09:59:30 -05004644 ret = rbd_dev_mapping_set(rbd_dev);
Alex Elder83a06262012-10-30 15:47:17 -05004645 if (ret)
Alex Elder9bb81c92013-04-27 09:59:30 -05004646 return ret;
Alex Elder5de10f32013-04-26 15:44:37 -05004647
Alex Elder83a06262012-10-30 15:47:17 -05004648 /* generate unique id: find highest unique id, add one */
4649 rbd_dev_id_get(rbd_dev);
4650
4651 /* Fill in the device name, now that we have its id. */
4652 BUILD_BUG_ON(DEV_NAME_LEN
4653 < sizeof (RBD_DRV_NAME) + MAX_INT_FORMAT_WIDTH);
4654 sprintf(rbd_dev->name, "%s%d", RBD_DRV_NAME, rbd_dev->dev_id);
4655
4656 /* Get our block major device number. */
4657
4658 ret = register_blkdev(0, rbd_dev->name);
4659 if (ret < 0)
4660 goto err_out_id;
4661 rbd_dev->major = ret;
4662
4663 /* Set up the blkdev mapping. */
4664
4665 ret = rbd_init_disk(rbd_dev);
4666 if (ret)
4667 goto err_out_blkdev;
4668
4669 ret = rbd_bus_add_dev(rbd_dev);
4670 if (ret)
4671 goto err_out_disk;
4672
Alex Elder83a06262012-10-30 15:47:17 -05004673 /* Everything's ready. Announce the disk to the world. */
4674
Alex Elderb5156e72013-04-26 15:44:36 -05004675 set_capacity(rbd_dev->disk, rbd_dev->mapping.size / SECTOR_SIZE);
Alex Elder129b79d2013-04-26 15:44:36 -05004676 set_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags);
Alex Elder83a06262012-10-30 15:47:17 -05004677 add_disk(rbd_dev->disk);
4678
4679 pr_info("%s: added with size 0x%llx\n", rbd_dev->disk->disk_name,
4680 (unsigned long long) rbd_dev->mapping.size);
4681
4682 return ret;
Alex Elder2f82ee52012-10-30 19:40:33 -05004683
Alex Elder83a06262012-10-30 15:47:17 -05004684err_out_disk:
4685 rbd_free_disk(rbd_dev);
4686err_out_blkdev:
4687 unregister_blkdev(rbd_dev->major, rbd_dev->name);
4688err_out_id:
4689 rbd_dev_id_put(rbd_dev);
Alex Elderd1cf5782013-04-27 09:59:30 -05004690 rbd_dev_mapping_clear(rbd_dev);
Alex Elder83a06262012-10-30 15:47:17 -05004691
4692 return ret;
4693}
4694
Alex Elder332bb122013-04-27 09:59:30 -05004695static int rbd_dev_header_name(struct rbd_device *rbd_dev)
4696{
4697 struct rbd_spec *spec = rbd_dev->spec;
4698 size_t size;
4699
4700 /* Record the header object name for this rbd image. */
4701
4702 rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
4703
4704 if (rbd_dev->image_format == 1)
4705 size = strlen(spec->image_name) + sizeof (RBD_SUFFIX);
4706 else
4707 size = sizeof (RBD_HEADER_PREFIX) + strlen(spec->image_id);
4708
4709 rbd_dev->header_name = kmalloc(size, GFP_KERNEL);
4710 if (!rbd_dev->header_name)
4711 return -ENOMEM;
4712
4713 if (rbd_dev->image_format == 1)
4714 sprintf(rbd_dev->header_name, "%s%s",
4715 spec->image_name, RBD_SUFFIX);
4716 else
4717 sprintf(rbd_dev->header_name, "%s%s",
4718 RBD_HEADER_PREFIX, spec->image_id);
4719 return 0;
4720}
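/*
 * Editorial note: assuming the conventional RBD_SUFFIX ".rbd" and
 * RBD_HEADER_PREFIX "rbd_header.", an image named "foo" with image id
 * "1005abc" gets header object "foo.rbd" for format 1 or
 * "rbd_header.1005abc" for format 2.
 */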
4721
Alex Elder200a6a82013-04-28 23:32:34 -05004722static void rbd_dev_image_release(struct rbd_device *rbd_dev)
4723{
Alex Elder6fd48b32013-04-28 23:32:34 -05004724 int ret;
4725
Alex Elder6fd48b32013-04-28 23:32:34 -05004726 rbd_dev_unprobe(rbd_dev);
4727 ret = rbd_dev_header_watch_sync(rbd_dev, 0);
4728 if (ret)
4729 rbd_warn(rbd_dev, "failed to cancel watch event (%d)\n", ret);
Alex Elder200a6a82013-04-28 23:32:34 -05004730 kfree(rbd_dev->header_name);
Alex Elder6fd48b32013-04-28 23:32:34 -05004731 rbd_dev->header_name = NULL;
4732 rbd_dev->image_format = 0;
4733 kfree(rbd_dev->spec->image_id);
4734 rbd_dev->spec->image_id = NULL;
4735
Alex Elder200a6a82013-04-28 23:32:34 -05004736 rbd_dev_destroy(rbd_dev);
4737}
4738
Alex Eldera30b71b2012-07-10 20:30:11 -05004739/*
4740 * Probe for the existence of the header object for the given rbd
4741 * device. For format 2 images this includes determining the image
4742 * id.
4743 */
Alex Elder71f293e2013-04-26 09:43:48 -05004744static int rbd_dev_image_probe(struct rbd_device *rbd_dev)
Alex Eldera30b71b2012-07-10 20:30:11 -05004745{
4746 int ret;
Alex Elderb644de22013-04-27 09:59:31 -05004747 int tmp;
Alex Eldera30b71b2012-07-10 20:30:11 -05004748
4749 /*
4750 * Get the id from the image id object. If it's not a
4751 * format 2 image, we'll get ENOENT back, and we'll assume
4752 * it's a format 1 image.
4753 */
4754 ret = rbd_dev_image_id(rbd_dev);
4755 if (ret)
Alex Elderc0fba362013-04-25 23:15:08 -05004756 return ret;
4757 rbd_assert(rbd_dev->spec->image_id);
4758 rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
4759
Alex Elder332bb122013-04-27 09:59:30 -05004760 ret = rbd_dev_header_name(rbd_dev);
4761 if (ret)
4762 goto err_out_format;
4763
Alex Elderb644de22013-04-27 09:59:31 -05004764 ret = rbd_dev_header_watch_sync(rbd_dev, 1);
4765 if (ret)
4766 goto out_header_name;
4767
Alex Elderc0fba362013-04-25 23:15:08 -05004768 if (rbd_dev->image_format == 1)
Alex Eldera30b71b2012-07-10 20:30:11 -05004769 ret = rbd_dev_v1_probe(rbd_dev);
4770 else
4771 ret = rbd_dev_v2_probe(rbd_dev);
Alex Elder5655c4d2013-04-25 23:15:08 -05004772 if (ret)
Alex Elderb644de22013-04-27 09:59:31 -05004773 goto err_out_watch;
Alex Elder83a06262012-10-30 15:47:17 -05004774
Alex Elder9bb81c92013-04-27 09:59:30 -05004775 ret = rbd_dev_spec_update(rbd_dev);
4776 if (ret)
Alex Elder33dca392013-04-30 00:44:33 -05004777 goto err_out_probe;
Alex Elder9bb81c92013-04-27 09:59:30 -05004778
4779 ret = rbd_dev_probe_parent(rbd_dev);
Alex Elder6fd48b32013-04-28 23:32:34 -05004780 if (!ret)
4781 return 0;
Alex Elder83a06262012-10-30 15:47:17 -05004782
Alex Elder6fd48b32013-04-28 23:32:34 -05004783err_out_probe:
4784 rbd_dev_unprobe(rbd_dev);
Alex Elderb644de22013-04-27 09:59:31 -05004785err_out_watch:
4786 tmp = rbd_dev_header_watch_sync(rbd_dev, 0);
4787 if (tmp)
4788 rbd_warn(rbd_dev, "unable to tear down watch request\n");
Alex Elder332bb122013-04-27 09:59:30 -05004789out_header_name:
4790 kfree(rbd_dev->header_name);
4791 rbd_dev->header_name = NULL;
4792err_out_format:
4793 rbd_dev->image_format = 0;
Alex Elder5655c4d2013-04-25 23:15:08 -05004794 kfree(rbd_dev->spec->image_id);
4795 rbd_dev->spec->image_id = NULL;
4796
4797 dout("probe failed, returning %d\n", ret);
4798
4799 return ret;
Alex Eldera30b71b2012-07-10 20:30:11 -05004800}
4801
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07004802static ssize_t rbd_add(struct bus_type *bus,
4803 const char *buf,
4804 size_t count)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004805{
Alex Eldercb8627c2012-07-09 21:04:23 -05004806 struct rbd_device *rbd_dev = NULL;
Alex Elderdc79b112012-10-25 23:34:41 -05004807 struct ceph_options *ceph_opts = NULL;
Alex Elder4e9afeb2012-10-25 23:34:41 -05004808 struct rbd_options *rbd_opts = NULL;
Alex Elder859c31d2012-10-25 23:34:42 -05004809 struct rbd_spec *spec = NULL;
Alex Elder9d3997f2012-10-25 23:34:42 -05004810 struct rbd_client *rbdc;
Alex Elder27cc2592012-02-02 08:13:30 -06004811 struct ceph_osd_client *osdc;
4812 int rc = -ENOMEM;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004813
4814 if (!try_module_get(THIS_MODULE))
4815 return -ENODEV;
4816
Alex Eldera725f65e2012-02-02 08:13:30 -06004817 /* parse add command */
Alex Elder859c31d2012-10-25 23:34:42 -05004818 rc = rbd_add_parse_args(buf, &ceph_opts, &rbd_opts, &spec);
Alex Elderdc79b112012-10-25 23:34:41 -05004819 if (rc < 0)
Alex Elderbd4ba652012-10-25 23:34:42 -05004820 goto err_out_module;
Alex Eldera725f65e2012-02-02 08:13:30 -06004821
Alex Elder9d3997f2012-10-25 23:34:42 -05004822 rbdc = rbd_get_client(ceph_opts);
4823 if (IS_ERR(rbdc)) {
4824 rc = PTR_ERR(rbdc);
Alex Elder0ddebc02012-10-25 23:34:41 -05004825 goto err_out_args;
Alex Elder9d3997f2012-10-25 23:34:42 -05004826 }
Alex Elderc53d5892012-10-25 23:34:42 -05004827 ceph_opts = NULL; /* rbd_dev client now owns this */
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004828
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004829 /* pick the pool */
Alex Elder9d3997f2012-10-25 23:34:42 -05004830 osdc = &rbdc->client->osdc;
Alex Elder859c31d2012-10-25 23:34:42 -05004831 rc = ceph_pg_poolid_by_name(osdc->osdmap, spec->pool_name);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004832 if (rc < 0)
4833 goto err_out_client;
Alex Elderc0cd10db2013-04-26 09:43:47 -05004834 spec->pool_id = (u64)rc;
Alex Elder859c31d2012-10-25 23:34:42 -05004835
Alex Elder0903e872012-11-14 12:25:19 -06004836 /* The ceph file layout needs to fit pool id in 32 bits */
4837
Alex Elderc0cd10db2013-04-26 09:43:47 -05004838 if (spec->pool_id > (u64)U32_MAX) {
4839 rbd_warn(NULL, "pool id too large (%llu > %u)\n",
4840 (unsigned long long)spec->pool_id, U32_MAX);
Alex Elder0903e872012-11-14 12:25:19 -06004841 rc = -EIO;
4842 goto err_out_client;
4843 }
4844
Alex Elderc53d5892012-10-25 23:34:42 -05004845 rbd_dev = rbd_dev_create(rbdc, spec);
Alex Elderbd4ba652012-10-25 23:34:42 -05004846 if (!rbd_dev)
4847 goto err_out_client;
Alex Elderc53d5892012-10-25 23:34:42 -05004848 rbdc = NULL; /* rbd_dev now owns this */
4849 spec = NULL; /* rbd_dev now owns this */
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004850
Alex Elderbd4ba652012-10-25 23:34:42 -05004851 rbd_dev->mapping.read_only = rbd_opts->read_only;
Alex Elderc53d5892012-10-25 23:34:42 -05004852 kfree(rbd_opts);
4853 rbd_opts = NULL; /* done with this */
Alex Elderbd4ba652012-10-25 23:34:42 -05004854
Alex Elder71f293e2013-04-26 09:43:48 -05004855 rc = rbd_dev_image_probe(rbd_dev);
Alex Eldera30b71b2012-07-10 20:30:11 -05004856 if (rc < 0)
Alex Elderc53d5892012-10-25 23:34:42 -05004857 goto err_out_rbd_dev;
Alex Elder05fd6f62012-08-29 17:11:07 -05004858
Alex Elderb536f692013-04-28 23:32:34 -05004859 rc = rbd_dev_device_setup(rbd_dev);
4860 if (!rc)
4861 return count;
4862
4863 rbd_dev_image_release(rbd_dev);
Alex Elderc53d5892012-10-25 23:34:42 -05004864err_out_rbd_dev:
4865 rbd_dev_destroy(rbd_dev);
Alex Elderbd4ba652012-10-25 23:34:42 -05004866err_out_client:
Alex Elder9d3997f2012-10-25 23:34:42 -05004867 rbd_put_client(rbdc);
Alex Elder0ddebc02012-10-25 23:34:41 -05004868err_out_args:
Alex Elder78cea762012-10-25 23:34:41 -05004869 if (ceph_opts)
4870 ceph_destroy_options(ceph_opts);
Alex Elder4e9afeb2012-10-25 23:34:41 -05004871 kfree(rbd_opts);
Alex Elder859c31d2012-10-25 23:34:42 -05004872 rbd_spec_put(spec);
Alex Elderbd4ba652012-10-25 23:34:42 -05004873err_out_module:
4874 module_put(THIS_MODULE);
Alex Elder27cc2592012-02-02 08:13:30 -06004875
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004876 dout("Error adding device %s\n", buf);
Alex Elder27cc2592012-02-02 08:13:30 -06004877
Alex Elderc0cd10db2013-04-26 09:43:47 -05004878 return (ssize_t)rc;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004879}
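/*
 * Usage sketch (editorial; monitor address and names hypothetical):
 * a mapping request reaches rbd_add() via a write such as
 *
 *	# echo "1.2.3.4:6789 name=admin rbd myimage" > /sys/bus/rbd/add
 *
 * and a successful map is later undone through the matching
 * /sys/bus/rbd/remove attribute, handled by rbd_remove() below.
 */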
4880
Alex Elderde71a292012-07-03 16:01:19 -05004881static struct rbd_device *__rbd_get_dev(unsigned long dev_id)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004882{
4883 struct list_head *tmp;
4884 struct rbd_device *rbd_dev;
4885
Alex Eldere124a82f2012-01-29 13:57:44 -06004886 spin_lock(&rbd_dev_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004887 list_for_each(tmp, &rbd_dev_list) {
4888 rbd_dev = list_entry(tmp, struct rbd_device, node);
Alex Elderde71a292012-07-03 16:01:19 -05004889 if (rbd_dev->dev_id == dev_id) {
Alex Eldere124a82f2012-01-29 13:57:44 -06004890 spin_unlock(&rbd_dev_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004891 return rbd_dev;
Alex Eldere124a82f2012-01-29 13:57:44 -06004892 }
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004893 }
Alex Eldere124a82f2012-01-29 13:57:44 -06004894 spin_unlock(&rbd_dev_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004895 return NULL;
4896}
4897
Alex Elder200a6a82013-04-28 23:32:34 -05004898static void rbd_dev_device_release(struct device *dev)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004899{
Alex Elder593a9e72012-02-07 12:03:37 -06004900 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004901
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004902 rbd_free_disk(rbd_dev);
Alex Elder200a6a82013-04-28 23:32:34 -05004903 clear_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags);
4904 rbd_dev_clear_mapping(rbd_dev);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004905 unregister_blkdev(rbd_dev->major, rbd_dev->name);
Alex Elder200a6a82013-04-28 23:32:34 -05004906 rbd_dev->major = 0;
Alex Eldere2839302012-08-29 17:11:06 -05004907 rbd_dev_id_put(rbd_dev);
Alex Elderd1cf5782013-04-27 09:59:30 -05004908 rbd_dev_mapping_clear(rbd_dev);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004909}
4910
Alex Elder05a46af2013-04-26 15:44:36 -05004911static void rbd_dev_remove_parent(struct rbd_device *rbd_dev)
4912{
Alex Elderad945fc2013-04-26 15:44:36 -05004913 while (rbd_dev->parent) {
Alex Elder05a46af2013-04-26 15:44:36 -05004914 struct rbd_device *first = rbd_dev;
4915 struct rbd_device *second = first->parent;
4916 struct rbd_device *third;
4917
4918 /*
4919 * Follow to the parent with no grandparent and
4920 * remove it.
4921 */
4922 while (second && (third = second->parent)) {
4923 first = second;
4924 second = third;
4925 }
Alex Elderad945fc2013-04-26 15:44:36 -05004926 rbd_assert(second);
Alex Elder8ad42cd2013-04-28 23:32:34 -05004927 rbd_dev_image_release(second);
Alex Elderad945fc2013-04-26 15:44:36 -05004928 first->parent = NULL;
4929 first->parent_overlap = 0;
4930
4931 rbd_assert(first->parent_spec);
Alex Elder05a46af2013-04-26 15:44:36 -05004932 rbd_spec_put(first->parent_spec);
4933 first->parent_spec = NULL;
Alex Elder05a46af2013-04-26 15:44:36 -05004934 }
4935}
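/*
 * Editorial illustration: for a chain dev -> p1 -> p2 (p2 being the
 * base image), the loop above releases p2 first and then p1, always
 * detaching the deepest ancestor so a parent is never released
 * before its child.
 */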
4936
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004937static ssize_t rbd_remove(struct bus_type *bus,
4938 const char *buf,
4939 size_t count)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004940{
4941 struct rbd_device *rbd_dev = NULL;
Alex Elder0d8189e2013-04-27 09:59:30 -05004942 int target_id;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004943 unsigned long ul;
Alex Elder0d8189e2013-04-27 09:59:30 -05004944 int ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004945
Alex Elder0d8189e2013-04-27 09:59:30 -05004946 ret = strict_strtoul(buf, 10, &ul);
4947 if (ret)
4948 return ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004949
4950 /* convert to int; abort if we lost anything in the conversion */
4951 target_id = (int) ul;
4952 if (target_id != ul)
4953 return -EINVAL;
4954
4955 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
4956
4957 rbd_dev = __rbd_get_dev(target_id);
4958 if (!rbd_dev) {
4959 ret = -ENOENT;
4960 goto done;
4961 }
4962
Alex Eldera14ea262013-02-05 13:23:12 -06004963 spin_lock_irq(&rbd_dev->lock);
Alex Elderb82d1672013-01-14 12:43:31 -06004964 if (rbd_dev->open_count)
Alex Elder42382b72012-11-16 09:29:16 -06004965 ret = -EBUSY;
Alex Elderb82d1672013-01-14 12:43:31 -06004966 else
4967 set_bit(RBD_DEV_FLAG_REMOVING, &rbd_dev->flags);
Alex Eldera14ea262013-02-05 13:23:12 -06004968 spin_unlock_irq(&rbd_dev->lock);
Alex Elderb82d1672013-01-14 12:43:31 -06004969 if (ret < 0)
Alex Elder42382b72012-11-16 09:29:16 -06004970 goto done;
Alex Elder0d8189e2013-04-27 09:59:30 -05004971 ret = count;
Alex Elderb4808152013-04-26 15:44:36 -05004972 rbd_bus_del_dev(rbd_dev);
Alex Elder8ad42cd2013-04-28 23:32:34 -05004973 rbd_dev_image_release(rbd_dev);
Alex Elder79ab7552013-04-28 23:32:34 -05004974 module_put(THIS_MODULE);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004975done:
4976 mutex_unlock(&ctl_mutex);
Alex Elderaafb2302012-09-06 16:00:54 -05004977
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004978 return ret;
4979}
4980
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004981/*
4982 * create control files in sysfs
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004983 * /sys/bus/rbd/...
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004984 */
4985static int rbd_sysfs_init(void)
4986{
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004987 int ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004988
Alex Elderfed4c142012-02-07 12:03:36 -06004989 ret = device_register(&rbd_root_dev);
Alex Elder21079782012-01-24 10:08:36 -06004990 if (ret < 0)
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004991 return ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004992
Alex Elderfed4c142012-02-07 12:03:36 -06004993 ret = bus_register(&rbd_bus_type);
4994 if (ret < 0)
4995 device_unregister(&rbd_root_dev);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004996
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004997 return ret;
4998}
4999
5000static void rbd_sysfs_cleanup(void)
5001{
Yehuda Sadehdfc56062010-11-19 14:51:04 -08005002 bus_unregister(&rbd_bus_type);
Alex Elderfed4c142012-02-07 12:03:36 -06005003 device_unregister(&rbd_root_dev);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07005004}
5005
Alex Elder1c2a9df2013-05-01 12:43:03 -05005006static int rbd_slab_init(void)
5007{
5008 rbd_assert(!rbd_img_request_cache);
5009 rbd_img_request_cache = kmem_cache_create("rbd_img_request",
5010 sizeof (struct rbd_img_request),
5011 __alignof__(struct rbd_img_request),
5012 0, NULL);
Alex Elder868311b2013-05-01 12:43:03 -05005013 if (!rbd_img_request_cache)
5014 return -ENOMEM;
5015
5016 rbd_assert(!rbd_obj_request_cache);
5017 rbd_obj_request_cache = kmem_cache_create("rbd_obj_request",
5018 sizeof (struct rbd_obj_request),
5019 __alignof__(struct rbd_obj_request),
5020 0, NULL);
5021 if (rbd_obj_request_cache)
Alex Elder1c2a9df2013-05-01 12:43:03 -05005022 return 0;
5023
Alex Elder868311b2013-05-01 12:43:03 -05005024 kmem_cache_destroy(rbd_img_request_cache);
5025 rbd_img_request_cache = NULL;
5026
Alex Elder1c2a9df2013-05-01 12:43:03 -05005027 return -ENOMEM;
5028}
5029
5030static void rbd_slab_exit(void)
5031{
Alex Elder868311b2013-05-01 12:43:03 -05005032 rbd_assert(rbd_obj_request_cache);
5033 kmem_cache_destroy(rbd_obj_request_cache);
5034 rbd_obj_request_cache = NULL;
5035
Alex Elder1c2a9df2013-05-01 12:43:03 -05005036 rbd_assert(rbd_img_request_cache);
5037 kmem_cache_destroy(rbd_img_request_cache);
5038 rbd_img_request_cache = NULL;
5039}
5040
Alex Eldercc344fa2013-02-19 12:25:56 -06005041static int __init rbd_init(void)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07005042{
5043 int rc;
5044
Alex Elder1e32d342013-01-30 11:13:33 -06005045 if (!libceph_compatible(NULL)) {
5046 rbd_warn(NULL, "libceph incompatibility (quitting)");
5047
5048 return -EINVAL;
5049 }
Alex Elder1c2a9df2013-05-01 12:43:03 -05005050 rc = rbd_slab_init();
Yehuda Sadeh602adf42010-08-12 16:11:25 -07005051 if (rc)
5052 return rc;
Alex Elder1c2a9df2013-05-01 12:43:03 -05005053 rc = rbd_sysfs_init();
5054 if (rc)
5055 rbd_slab_exit();
5056 else
5057 pr_info("loaded " RBD_DRV_NAME_LONG "\n");
5058
5059 return rc;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07005060}
5061
Alex Eldercc344fa2013-02-19 12:25:56 -06005062static void __exit rbd_exit(void)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07005063{
5064 rbd_sysfs_cleanup();
Alex Elder1c2a9df2013-05-01 12:43:03 -05005065 rbd_slab_exit();
Yehuda Sadeh602adf42010-08-12 16:11:25 -07005066}
5067
5068module_init(rbd_init);
5069module_exit(rbd_exit);
5070
5071MODULE_AUTHOR("Sage Weil <sage@newdream.net>");
5072MODULE_AUTHOR("Yehuda Sadeh <yehuda@hq.newdream.net>");
5073MODULE_DESCRIPTION("rados block device");
5074
5075/* following authorship retained from original osdblk.c */
5076MODULE_AUTHOR("Jeff Garzik <jeff@garzik.org>");
5077
5078MODULE_LICENSE("GPL");