blob: bf836dea113ad8696bb471e4cadc8710d26c6de2 [file] [log] [blame]
Alex Eldere2a58ee2013-04-30 00:44:33 -05001
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002/*
3 rbd.c -- Export ceph rados objects as a Linux block device
4
5
6 based on drivers/block/osdblk.c:
7
8 Copyright 2009 Red Hat, Inc.
9
10 This program is free software; you can redistribute it and/or modify
11 it under the terms of the GNU General Public License as published by
12 the Free Software Foundation.
13
14 This program is distributed in the hope that it will be useful,
15 but WITHOUT ANY WARRANTY; without even the implied warranty of
16 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 GNU General Public License for more details.
18
19 You should have received a copy of the GNU General Public License
20 along with this program; see the file COPYING. If not, write to
21 the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
22
23
24
Yehuda Sadehdfc56062010-11-19 14:51:04 -080025 For usage instructions, please refer to:
Yehuda Sadeh602adf42010-08-12 16:11:25 -070026
Yehuda Sadehdfc56062010-11-19 14:51:04 -080027 Documentation/ABI/testing/sysfs-bus-rbd
Yehuda Sadeh602adf42010-08-12 16:11:25 -070028
29 */
30
31#include <linux/ceph/libceph.h>
32#include <linux/ceph/osd_client.h>
33#include <linux/ceph/mon_client.h>
34#include <linux/ceph/decode.h>
Yehuda Sadeh59c2be12011-03-21 15:10:11 -070035#include <linux/parser.h>
Yehuda Sadeh602adf42010-08-12 16:11:25 -070036
37#include <linux/kernel.h>
38#include <linux/device.h>
39#include <linux/module.h>
40#include <linux/fs.h>
41#include <linux/blkdev.h>
42
43#include "rbd_types.h"
44
Alex Elderaafb2302012-09-06 16:00:54 -050045#define RBD_DEBUG /* Activate rbd_assert() calls */
46
Alex Elder593a9e72012-02-07 12:03:37 -060047/*
48 * The basic unit of block I/O is a sector. It is interpreted in a
49 * number of contexts in Linux (blk, bio, genhd), but the default is
50 * universally 512 bytes. These symbols are just slightly more
51 * meaningful than the bare numbers they represent.
52 */
53#define SECTOR_SHIFT 9
54#define SECTOR_SIZE (1ULL << SECTOR_SHIFT)
55
Alex Elderf0f8cef2012-01-29 13:57:44 -060056#define RBD_DRV_NAME "rbd"
57#define RBD_DRV_NAME_LONG "rbd (rados block device)"
Yehuda Sadeh602adf42010-08-12 16:11:25 -070058
59#define RBD_MINORS_PER_MAJOR 256 /* max minors per blkdev */
60
Alex Elderd4b125e2012-07-03 16:01:19 -050061#define RBD_SNAP_DEV_NAME_PREFIX "snap_"
62#define RBD_MAX_SNAP_NAME_LEN \
63 (NAME_MAX - (sizeof (RBD_SNAP_DEV_NAME_PREFIX) - 1))
64
Alex Elder35d489f2012-07-03 16:01:19 -050065#define RBD_MAX_SNAP_COUNT 510 /* allows max snapc to fit in 4KB */
Yehuda Sadeh602adf42010-08-12 16:11:25 -070066
67#define RBD_SNAP_HEAD_NAME "-"
68
Alex Elder9682fc62013-04-30 00:44:33 -050069#define BAD_SNAP_INDEX U32_MAX /* invalid index into snap array */
70
Alex Elder9e15b772012-10-30 19:40:33 -050071/* This allows a single page to hold an image name sent by OSD */
72#define RBD_IMAGE_NAME_LEN_MAX (PAGE_SIZE - sizeof (__le32) - 1)
Alex Elder1e130192012-07-03 16:01:19 -050073#define RBD_IMAGE_ID_LEN_MAX 64
Alex Elder9e15b772012-10-30 19:40:33 -050074
Alex Elder1e130192012-07-03 16:01:19 -050075#define RBD_OBJ_PREFIX_LEN_MAX 64
Alex Elder589d30e2012-07-10 20:30:11 -050076
Alex Elderd8891402012-10-09 13:50:17 -070077/* Feature bits */
78
Alex Elder5cbf6f122013-04-11 09:29:48 -050079#define RBD_FEATURE_LAYERING (1<<0)
80#define RBD_FEATURE_STRIPINGV2 (1<<1)
81#define RBD_FEATURES_ALL \
82 (RBD_FEATURE_LAYERING | RBD_FEATURE_STRIPINGV2)
Alex Elderd8891402012-10-09 13:50:17 -070083
84/* Features supported by this (client software) implementation. */
85
Alex Elder770eba62012-10-25 23:34:40 -050086#define RBD_FEATURES_SUPPORTED (RBD_FEATURES_ALL)
Alex Elderd8891402012-10-09 13:50:17 -070087
Alex Elder81a89792012-02-02 08:13:30 -060088/*
89 * An RBD device name will be "rbd#", where the "rbd" comes from
90 * RBD_DRV_NAME above, and # is a unique integer identifier.
91 * MAX_INT_FORMAT_WIDTH is used in ensuring DEV_NAME_LEN is big
92 * enough to hold all possible device names.
93 */
Yehuda Sadeh602adf42010-08-12 16:11:25 -070094#define DEV_NAME_LEN 32
Alex Elder81a89792012-02-02 08:13:30 -060095#define MAX_INT_FORMAT_WIDTH ((5 * sizeof (int)) / 2 + 1)
Yehuda Sadeh602adf42010-08-12 16:11:25 -070096
97/*
98 * block device image metadata (in-memory version)
99 */
100struct rbd_image_header {
Alex Elderf84344f2012-08-31 17:29:51 -0500101 /* These four fields never change for a given rbd image */
Alex Elder849b4262012-07-09 21:04:24 -0500102 char *object_prefix;
Alex Elder34b13182012-07-13 20:35:12 -0500103 u64 features;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700104 __u8 obj_order;
105 __u8 crypt_type;
106 __u8 comp_type;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700107
Alex Elderf84344f2012-08-31 17:29:51 -0500108 /* The remaining fields need to be updated occasionally */
109 u64 image_size;
110 struct ceph_snap_context *snapc;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700111 char *snap_names;
112 u64 *snap_sizes;
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700113
Alex Elder500d0c02013-04-26 09:43:47 -0500114 u64 stripe_unit;
115 u64 stripe_count;
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700116};
117
Alex Elder0d7dbfc2012-10-25 23:34:41 -0500118/*
119 * An rbd image specification.
120 *
121 * The tuple (pool_id, image_id, snap_id) is sufficient to uniquely
Alex Elderc66c6e02012-11-01 08:39:26 -0500122 * identify an image. Each rbd_dev structure includes a pointer to
123 * an rbd_spec structure that encapsulates this identity.
124 *
125 * Each of the id's in an rbd_spec has an associated name. For a
126 * user-mapped image, the names are supplied and the id's associated
127 * with them are looked up. For a layered image, a parent image is
128 * defined by the tuple, and the names are looked up.
129 *
130 * An rbd_dev structure contains a parent_spec pointer which is
131 * non-null if the image it represents is a child in a layered
132 * image. This pointer will refer to the rbd_spec structure used
133 * by the parent rbd_dev for its own identity (i.e., the structure
134 * is shared between the parent and child).
135 *
136 * Since these structures are populated once, during the discovery
137 * phase of image construction, they are effectively immutable so
138 * we make no effort to synchronize access to them.
139 *
140 * Note that code herein does not assume the image name is known (it
141 * could be a null pointer).
Alex Elder0d7dbfc2012-10-25 23:34:41 -0500142 */
143struct rbd_spec {
144 u64 pool_id;
Alex Elderecb4dc222013-04-26 09:43:47 -0500145 const char *pool_name;
Alex Elder0d7dbfc2012-10-25 23:34:41 -0500146
Alex Elderecb4dc222013-04-26 09:43:47 -0500147 const char *image_id;
148 const char *image_name;
Alex Elder0d7dbfc2012-10-25 23:34:41 -0500149
150 u64 snap_id;
Alex Elderecb4dc222013-04-26 09:43:47 -0500151 const char *snap_name;
Alex Elder0d7dbfc2012-10-25 23:34:41 -0500152
153 struct kref kref;
154};
155
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700156/*
Alex Elderf0f8cef2012-01-29 13:57:44 -0600157 * an instance of the client. multiple devices may share an rbd client.
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700158 */
159struct rbd_client {
160 struct ceph_client *client;
161 struct kref kref;
162 struct list_head node;
163};
164
Alex Elderbf0d5f502012-11-22 00:00:08 -0600165struct rbd_img_request;
166typedef void (*rbd_img_callback_t)(struct rbd_img_request *);
167
168#define BAD_WHICH U32_MAX /* Good which or bad which, which? */
169
170struct rbd_obj_request;
171typedef void (*rbd_obj_callback_t)(struct rbd_obj_request *);
172
Alex Elder9969ebc2013-01-18 12:31:10 -0600173enum obj_request_type {
174 OBJ_REQUEST_NODATA, OBJ_REQUEST_BIO, OBJ_REQUEST_PAGES
175};
Alex Elderbf0d5f502012-11-22 00:00:08 -0600176
Alex Elder926f9b32013-02-11 12:33:24 -0600177enum obj_req_flags {
178 OBJ_REQ_DONE, /* completion flag: not done = 0, done = 1 */
Alex Elder6365d332013-02-11 12:33:24 -0600179 OBJ_REQ_IMG_DATA, /* object usage: standalone = 0, image = 1 */
Alex Elder5679c592013-02-11 12:33:24 -0600180 OBJ_REQ_KNOWN, /* EXISTS flag valid: no = 0, yes = 1 */
181 OBJ_REQ_EXISTS, /* target exists: no = 0, yes = 1 */
Alex Elder926f9b32013-02-11 12:33:24 -0600182};
183
Alex Elderbf0d5f502012-11-22 00:00:08 -0600184struct rbd_obj_request {
185 const char *object_name;
186 u64 offset; /* object start byte */
187 u64 length; /* bytes from offset */
Alex Elder926f9b32013-02-11 12:33:24 -0600188 unsigned long flags;
Alex Elderbf0d5f502012-11-22 00:00:08 -0600189
Alex Elderc5b5ef62013-02-11 12:33:24 -0600190 /*
191 * An object request associated with an image will have its
192 * img_data flag set; a standalone object request will not.
193 *
194 * A standalone object request will have which == BAD_WHICH
195 * and a null obj_request pointer.
196 *
197 * An object request initiated in support of a layered image
198 * object (to check for its existence before a write) will
199 * have which == BAD_WHICH and a non-null obj_request pointer.
200 *
201 * Finally, an object request for rbd image data will have
202 * which != BAD_WHICH, and will have a non-null img_request
203 * pointer. The value of which will be in the range
204 * 0..(img_request->obj_request_count-1).
205 */
206 union {
207 struct rbd_obj_request *obj_request; /* STAT op */
208 struct {
209 struct rbd_img_request *img_request;
210 u64 img_offset;
211 /* links for img_request->obj_requests list */
212 struct list_head links;
213 };
214 };
Alex Elderbf0d5f502012-11-22 00:00:08 -0600215 u32 which; /* posn image request list */
216
217 enum obj_request_type type;
Alex Elder788e2df2013-01-17 12:25:27 -0600218 union {
219 struct bio *bio_list;
220 struct {
221 struct page **pages;
222 u32 page_count;
223 };
224 };
Alex Elder0eefd472013-04-19 15:34:50 -0500225 struct page **copyup_pages;
Alex Elderbf0d5f502012-11-22 00:00:08 -0600226
227 struct ceph_osd_request *osd_req;
228
229 u64 xferred; /* bytes transferred */
Sage Weil1b83bef2013-02-25 16:11:12 -0800230 int result;
Alex Elderbf0d5f502012-11-22 00:00:08 -0600231
232 rbd_obj_callback_t callback;
Alex Elder788e2df2013-01-17 12:25:27 -0600233 struct completion completion;
Alex Elderbf0d5f502012-11-22 00:00:08 -0600234
235 struct kref kref;
236};
237
Alex Elder0c425242013-02-08 09:55:49 -0600238enum img_req_flags {
Alex Elder9849e982013-01-24 16:13:36 -0600239 IMG_REQ_WRITE, /* I/O direction: read = 0, write = 1 */
240 IMG_REQ_CHILD, /* initiator: block = 0, child image = 1 */
Alex Elderd0b2e942013-01-24 16:13:36 -0600241 IMG_REQ_LAYERED, /* ENOENT handling: normal = 0, layered = 1 */
Alex Elder0c425242013-02-08 09:55:49 -0600242};
243
Alex Elderbf0d5f502012-11-22 00:00:08 -0600244struct rbd_img_request {
Alex Elderbf0d5f502012-11-22 00:00:08 -0600245 struct rbd_device *rbd_dev;
246 u64 offset; /* starting image byte offset */
247 u64 length; /* byte count from offset */
Alex Elder0c425242013-02-08 09:55:49 -0600248 unsigned long flags;
Alex Elderbf0d5f502012-11-22 00:00:08 -0600249 union {
Alex Elder9849e982013-01-24 16:13:36 -0600250 u64 snap_id; /* for reads */
Alex Elderbf0d5f502012-11-22 00:00:08 -0600251 struct ceph_snap_context *snapc; /* for writes */
Alex Elder9849e982013-01-24 16:13:36 -0600252 };
253 union {
254 struct request *rq; /* block request */
255 struct rbd_obj_request *obj_request; /* obj req initiator */
Alex Elderbf0d5f502012-11-22 00:00:08 -0600256 };
Alex Elder3d7efd12013-04-19 15:34:50 -0500257 struct page **copyup_pages;
Alex Elderbf0d5f502012-11-22 00:00:08 -0600258 spinlock_t completion_lock;/* protects next_completion */
259 u32 next_completion;
260 rbd_img_callback_t callback;
Alex Elder55f27e02013-04-10 12:34:25 -0500261 u64 xferred;/* aggregate bytes transferred */
Alex Eldera5a337d2013-01-24 16:13:36 -0600262 int result; /* first nonzero obj_request result */
Alex Elderbf0d5f502012-11-22 00:00:08 -0600263
264 u32 obj_request_count;
265 struct list_head obj_requests; /* rbd_obj_request structs */
266
267 struct kref kref;
268};
269
270#define for_each_obj_request(ireq, oreq) \
Alex Elderef06f4d32013-02-08 09:55:48 -0600271 list_for_each_entry(oreq, &(ireq)->obj_requests, links)
Alex Elderbf0d5f502012-11-22 00:00:08 -0600272#define for_each_obj_request_from(ireq, oreq) \
Alex Elderef06f4d32013-02-08 09:55:48 -0600273 list_for_each_entry_from(oreq, &(ireq)->obj_requests, links)
Alex Elderbf0d5f502012-11-22 00:00:08 -0600274#define for_each_obj_request_safe(ireq, oreq, n) \
Alex Elderef06f4d32013-02-08 09:55:48 -0600275 list_for_each_entry_safe_reverse(oreq, n, &(ireq)->obj_requests, links)
Alex Elderbf0d5f502012-11-22 00:00:08 -0600276
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800277struct rbd_snap {
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800278 const char *name;
Josh Durgin3591538f2011-12-05 18:25:13 -0800279 u64 size;
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800280 struct list_head node;
281 u64 id;
Alex Elder34b13182012-07-13 20:35:12 -0500282 u64 features;
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800283};
284
Alex Elderf84344f2012-08-31 17:29:51 -0500285struct rbd_mapping {
Alex Elder99c1f082012-08-30 14:42:15 -0500286 u64 size;
Alex Elder34b13182012-07-13 20:35:12 -0500287 u64 features;
Alex Elderf84344f2012-08-31 17:29:51 -0500288 bool read_only;
289};
290
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700291/*
292 * a single device
293 */
294struct rbd_device {
Alex Elderde71a292012-07-03 16:01:19 -0500295 int dev_id; /* blkdev unique id */
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700296
297 int major; /* blkdev assigned major */
298 struct gendisk *disk; /* blkdev's gendisk and rq */
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700299
Alex Eldera30b71b2012-07-10 20:30:11 -0500300 u32 image_format; /* Either 1 or 2 */
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700301 struct rbd_client *rbd_client;
302
303 char name[DEV_NAME_LEN]; /* blkdev name, e.g. rbd3 */
304
Alex Elderb82d1672013-01-14 12:43:31 -0600305 spinlock_t lock; /* queue, flags, open_count */
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700306
307 struct rbd_image_header header;
Alex Elderb82d1672013-01-14 12:43:31 -0600308 unsigned long flags; /* possibly lock protected */
Alex Elder0d7dbfc2012-10-25 23:34:41 -0500309 struct rbd_spec *spec;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700310
Alex Elder0d7dbfc2012-10-25 23:34:41 -0500311 char *header_name;
Alex Elder971f8392012-10-25 23:34:41 -0500312
Alex Elder0903e872012-11-14 12:25:19 -0600313 struct ceph_file_layout layout;
314
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700315 struct ceph_osd_event *watch_event;
Alex Elder975241a2013-01-25 17:08:55 -0600316 struct rbd_obj_request *watch_request;
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700317
Alex Elder86b00e02012-10-25 23:34:42 -0500318 struct rbd_spec *parent_spec;
319 u64 parent_overlap;
Alex Elder2f82ee52012-10-30 19:40:33 -0500320 struct rbd_device *parent;
Alex Elder86b00e02012-10-25 23:34:42 -0500321
Josh Durginc6666012011-11-21 17:11:12 -0800322 /* protects updating the header */
323 struct rw_semaphore header_rwsem;
Alex Elderf84344f2012-08-31 17:29:51 -0500324
325 struct rbd_mapping mapping;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700326
327 struct list_head node;
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800328
329 /* list of snapshots */
330 struct list_head snaps;
331
332 /* sysfs related */
333 struct device dev;
Alex Elderb82d1672013-01-14 12:43:31 -0600334 unsigned long open_count; /* protected by lock */
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800335};
336
Alex Elderb82d1672013-01-14 12:43:31 -0600337/*
338 * Flag bits for rbd_dev->flags. If atomicity is required,
339 * rbd_dev->lock is used to protect access.
340 *
341 * Currently, only the "removing" flag (which is coupled with the
342 * "open_count" field) requires atomic access.
343 */
Alex Elder6d292902013-01-14 12:43:31 -0600344enum rbd_dev_flags {
345 RBD_DEV_FLAG_EXISTS, /* mapped snapshot has not been deleted */
Alex Elderb82d1672013-01-14 12:43:31 -0600346 RBD_DEV_FLAG_REMOVING, /* this mapping is being removed */
Alex Elder6d292902013-01-14 12:43:31 -0600347};
348
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700349static DEFINE_MUTEX(ctl_mutex); /* Serialize open/close/setup/teardown */
Alex Eldere124a82f2012-01-29 13:57:44 -0600350
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700351static LIST_HEAD(rbd_dev_list); /* devices */
Alex Eldere124a82f2012-01-29 13:57:44 -0600352static DEFINE_SPINLOCK(rbd_dev_list_lock);
353
Alex Elder432b8582012-01-29 13:57:44 -0600354static LIST_HEAD(rbd_client_list); /* clients */
355static DEFINE_SPINLOCK(rbd_client_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700356
Alex Elder3d7efd12013-04-19 15:34:50 -0500357static int rbd_img_request_submit(struct rbd_img_request *img_request);
358
Alex Elder304f6802012-08-31 17:29:52 -0500359static int rbd_dev_snaps_update(struct rbd_device *rbd_dev);
Alex Elder304f6802012-08-31 17:29:52 -0500360
Alex Elder200a6a82013-04-28 23:32:34 -0500361static void rbd_dev_device_release(struct device *dev);
Alex Elder6087b512013-04-25 15:09:41 -0500362static void rbd_snap_destroy(struct rbd_snap *snap);
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800363
Alex Elderf0f8cef2012-01-29 13:57:44 -0600364static ssize_t rbd_add(struct bus_type *bus, const char *buf,
365 size_t count);
366static ssize_t rbd_remove(struct bus_type *bus, const char *buf,
367 size_t count);
Alex Elder71f293e2013-04-26 09:43:48 -0500368static int rbd_dev_image_probe(struct rbd_device *rbd_dev);
Alex Elderf0f8cef2012-01-29 13:57:44 -0600369
370static struct bus_attribute rbd_bus_attrs[] = {
371 __ATTR(add, S_IWUSR, NULL, rbd_add),
372 __ATTR(remove, S_IWUSR, NULL, rbd_remove),
373 __ATTR_NULL
374};
375
376static struct bus_type rbd_bus_type = {
377 .name = "rbd",
378 .bus_attrs = rbd_bus_attrs,
379};
380
static void rbd_root_dev_release(struct device *dev)
{
	/*
	 * Nothing to free: rbd_root_dev is statically allocated.  An
	 * empty release callback silences the driver-core warning about
	 * devices without one.
	 */
}
384
385static struct device rbd_root_dev = {
386 .init_name = "rbd",
387 .release = rbd_root_dev_release,
388};
389
Alex Elder06ecc6c2012-11-01 10:17:15 -0500390static __printf(2, 3)
391void rbd_warn(struct rbd_device *rbd_dev, const char *fmt, ...)
392{
393 struct va_format vaf;
394 va_list args;
395
396 va_start(args, fmt);
397 vaf.fmt = fmt;
398 vaf.va = &args;
399
400 if (!rbd_dev)
401 printk(KERN_WARNING "%s: %pV\n", RBD_DRV_NAME, &vaf);
402 else if (rbd_dev->disk)
403 printk(KERN_WARNING "%s: %s: %pV\n",
404 RBD_DRV_NAME, rbd_dev->disk->disk_name, &vaf);
405 else if (rbd_dev->spec && rbd_dev->spec->image_name)
406 printk(KERN_WARNING "%s: image %s: %pV\n",
407 RBD_DRV_NAME, rbd_dev->spec->image_name, &vaf);
408 else if (rbd_dev->spec && rbd_dev->spec->image_id)
409 printk(KERN_WARNING "%s: id %s: %pV\n",
410 RBD_DRV_NAME, rbd_dev->spec->image_id, &vaf);
411 else /* punt */
412 printk(KERN_WARNING "%s: rbd_dev %p: %pV\n",
413 RBD_DRV_NAME, rbd_dev, &vaf);
414 va_end(args);
415}
416
#ifdef RBD_DEBUG
/*
 * Active only when RBD_DEBUG is defined (see top of file).  Wrapped
 * in do { } while (0) so the macro behaves as a single statement and
 * composes safely with if/else at the call site; the previous bare
 * "if" form was vulnerable to the dangling-else hazard.
 */
#define rbd_assert(expr)						\
		do {							\
			if (unlikely(!(expr))) {		\
				printk(KERN_ERR "\nAssertion failure in %s() " \
						"at line %d:\n\n" \
						"\trbd_assert(%s);\n\n", \
						__func__, __LINE__, #expr); \
				BUG();				\
			}					\
		} while (0)
#else /* !RBD_DEBUG */
# define rbd_assert(expr)	((void) 0)
#endif /* !RBD_DEBUG */
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800429
Alex Elderb454e362013-04-19 15:34:50 -0500430static int rbd_img_obj_request_submit(struct rbd_obj_request *obj_request);
Alex Elder05a46af2013-04-26 15:44:36 -0500431static void rbd_img_parent_read(struct rbd_obj_request *obj_request);
432static void rbd_dev_remove_parent(struct rbd_device *rbd_dev);
Alex Elder8b3e1a52013-01-24 16:13:36 -0600433
Alex Eldercc4a38bd2013-04-30 00:44:33 -0500434static int rbd_dev_refresh(struct rbd_device *rbd_dev);
435static int rbd_dev_v2_refresh(struct rbd_device *rbd_dev);
Alex Elder54cac612013-04-30 00:44:33 -0500436static const char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev,
437 u64 snap_id);
Alex Elder2ad3d712013-04-30 00:44:33 -0500438static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id,
439 u8 *order, u64 *snap_size);
440static int _rbd_dev_v2_snap_features(struct rbd_device *rbd_dev, u64 snap_id,
441 u64 *snap_features);
442static u64 rbd_snap_id_by_name(struct rbd_device *rbd_dev, const char *name);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700443
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700444static int rbd_open(struct block_device *bdev, fmode_t mode)
445{
Alex Elderf0f8cef2012-01-29 13:57:44 -0600446 struct rbd_device *rbd_dev = bdev->bd_disk->private_data;
Alex Elderb82d1672013-01-14 12:43:31 -0600447 bool removing = false;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700448
Alex Elderf84344f2012-08-31 17:29:51 -0500449 if ((mode & FMODE_WRITE) && rbd_dev->mapping.read_only)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700450 return -EROFS;
451
Alex Eldera14ea262013-02-05 13:23:12 -0600452 spin_lock_irq(&rbd_dev->lock);
Alex Elderb82d1672013-01-14 12:43:31 -0600453 if (test_bit(RBD_DEV_FLAG_REMOVING, &rbd_dev->flags))
454 removing = true;
455 else
456 rbd_dev->open_count++;
Alex Eldera14ea262013-02-05 13:23:12 -0600457 spin_unlock_irq(&rbd_dev->lock);
Alex Elderb82d1672013-01-14 12:43:31 -0600458 if (removing)
459 return -ENOENT;
460
Alex Elder42382b72012-11-16 09:29:16 -0600461 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
Alex Elderc3e946c2012-11-16 09:29:16 -0600462 (void) get_device(&rbd_dev->dev);
Alex Elderf84344f2012-08-31 17:29:51 -0500463 set_device_ro(bdev, rbd_dev->mapping.read_only);
Alex Elder42382b72012-11-16 09:29:16 -0600464 mutex_unlock(&ctl_mutex);
Alex Elder340c7a22012-08-10 13:12:07 -0700465
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700466 return 0;
467}
468
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800469static int rbd_release(struct gendisk *disk, fmode_t mode)
470{
471 struct rbd_device *rbd_dev = disk->private_data;
Alex Elderb82d1672013-01-14 12:43:31 -0600472 unsigned long open_count_before;
473
Alex Eldera14ea262013-02-05 13:23:12 -0600474 spin_lock_irq(&rbd_dev->lock);
Alex Elderb82d1672013-01-14 12:43:31 -0600475 open_count_before = rbd_dev->open_count--;
Alex Eldera14ea262013-02-05 13:23:12 -0600476 spin_unlock_irq(&rbd_dev->lock);
Alex Elderb82d1672013-01-14 12:43:31 -0600477 rbd_assert(open_count_before > 0);
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800478
Alex Elder42382b72012-11-16 09:29:16 -0600479 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
Alex Elderc3e946c2012-11-16 09:29:16 -0600480 put_device(&rbd_dev->dev);
Alex Elder42382b72012-11-16 09:29:16 -0600481 mutex_unlock(&ctl_mutex);
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800482
483 return 0;
484}
485
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700486static const struct block_device_operations rbd_bd_ops = {
487 .owner = THIS_MODULE,
488 .open = rbd_open,
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800489 .release = rbd_release,
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700490};
491
492/*
493 * Initialize an rbd client instance.
Alex Elder43ae4702012-07-03 16:01:18 -0500494 * We own *ceph_opts.
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700495 */
Alex Elderf8c38922012-08-10 13:12:07 -0700496static struct rbd_client *rbd_client_create(struct ceph_options *ceph_opts)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700497{
498 struct rbd_client *rbdc;
499 int ret = -ENOMEM;
500
Alex Elder37206ee2013-02-20 17:32:08 -0600501 dout("%s:\n", __func__);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700502 rbdc = kmalloc(sizeof(struct rbd_client), GFP_KERNEL);
503 if (!rbdc)
504 goto out_opt;
505
506 kref_init(&rbdc->kref);
507 INIT_LIST_HEAD(&rbdc->node);
508
Alex Elderbc534d82012-01-29 13:57:44 -0600509 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
510
Alex Elder43ae4702012-07-03 16:01:18 -0500511 rbdc->client = ceph_create_client(ceph_opts, rbdc, 0, 0);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700512 if (IS_ERR(rbdc->client))
Alex Elderbc534d82012-01-29 13:57:44 -0600513 goto out_mutex;
Alex Elder43ae4702012-07-03 16:01:18 -0500514 ceph_opts = NULL; /* Now rbdc->client is responsible for ceph_opts */
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700515
516 ret = ceph_open_session(rbdc->client);
517 if (ret < 0)
518 goto out_err;
519
Alex Elder432b8582012-01-29 13:57:44 -0600520 spin_lock(&rbd_client_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700521 list_add_tail(&rbdc->node, &rbd_client_list);
Alex Elder432b8582012-01-29 13:57:44 -0600522 spin_unlock(&rbd_client_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700523
Alex Elderbc534d82012-01-29 13:57:44 -0600524 mutex_unlock(&ctl_mutex);
Alex Elder37206ee2013-02-20 17:32:08 -0600525 dout("%s: rbdc %p\n", __func__, rbdc);
Alex Elderbc534d82012-01-29 13:57:44 -0600526
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700527 return rbdc;
528
529out_err:
530 ceph_destroy_client(rbdc->client);
Alex Elderbc534d82012-01-29 13:57:44 -0600531out_mutex:
532 mutex_unlock(&ctl_mutex);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700533 kfree(rbdc);
534out_opt:
Alex Elder43ae4702012-07-03 16:01:18 -0500535 if (ceph_opts)
536 ceph_destroy_options(ceph_opts);
Alex Elder37206ee2013-02-20 17:32:08 -0600537 dout("%s: error %d\n", __func__, ret);
538
Vasiliy Kulikov28f259b2010-09-26 12:59:37 +0400539 return ERR_PTR(ret);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700540}
541
Alex Elder2f82ee52012-10-30 19:40:33 -0500542static struct rbd_client *__rbd_get_client(struct rbd_client *rbdc)
543{
544 kref_get(&rbdc->kref);
545
546 return rbdc;
547}
548
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700549/*
Alex Elder1f7ba332012-08-10 13:12:07 -0700550 * Find a ceph client with specific addr and configuration. If
551 * found, bump its reference count.
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700552 */
Alex Elder1f7ba332012-08-10 13:12:07 -0700553static struct rbd_client *rbd_client_find(struct ceph_options *ceph_opts)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700554{
555 struct rbd_client *client_node;
Alex Elder1f7ba332012-08-10 13:12:07 -0700556 bool found = false;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700557
Alex Elder43ae4702012-07-03 16:01:18 -0500558 if (ceph_opts->flags & CEPH_OPT_NOSHARE)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700559 return NULL;
560
Alex Elder1f7ba332012-08-10 13:12:07 -0700561 spin_lock(&rbd_client_list_lock);
562 list_for_each_entry(client_node, &rbd_client_list, node) {
563 if (!ceph_compare_options(ceph_opts, client_node->client)) {
Alex Elder2f82ee52012-10-30 19:40:33 -0500564 __rbd_get_client(client_node);
565
Alex Elder1f7ba332012-08-10 13:12:07 -0700566 found = true;
567 break;
568 }
569 }
570 spin_unlock(&rbd_client_list_lock);
571
572 return found ? client_node : NULL;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700573}
574
575/*
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700576 * mount options
577 */
578enum {
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700579 Opt_last_int,
580 /* int args above */
581 Opt_last_string,
582 /* string args above */
Alex Eldercc0538b2012-08-10 13:12:07 -0700583 Opt_read_only,
584 Opt_read_write,
585 /* Boolean args above */
586 Opt_last_bool,
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700587};
588
Alex Elder43ae4702012-07-03 16:01:18 -0500589static match_table_t rbd_opts_tokens = {
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700590 /* int args above */
591 /* string args above */
Alex Elderbe466c12012-10-22 11:31:26 -0500592 {Opt_read_only, "read_only"},
Alex Eldercc0538b2012-08-10 13:12:07 -0700593 {Opt_read_only, "ro"}, /* Alternate spelling */
594 {Opt_read_write, "read_write"},
595 {Opt_read_write, "rw"}, /* Alternate spelling */
596 /* Boolean args above */
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700597 {-1, NULL}
598};
599
Alex Elder98571b52013-01-20 14:44:42 -0600600struct rbd_options {
601 bool read_only;
602};
603
604#define RBD_READ_ONLY_DEFAULT false
605
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700606static int parse_rbd_opts_token(char *c, void *private)
607{
Alex Elder43ae4702012-07-03 16:01:18 -0500608 struct rbd_options *rbd_opts = private;
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700609 substring_t argstr[MAX_OPT_ARGS];
610 int token, intval, ret;
611
Alex Elder43ae4702012-07-03 16:01:18 -0500612 token = match_token(c, rbd_opts_tokens, argstr);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700613 if (token < 0)
614 return -EINVAL;
615
616 if (token < Opt_last_int) {
617 ret = match_int(&argstr[0], &intval);
618 if (ret < 0) {
619 pr_err("bad mount option arg (not int) "
620 "at '%s'\n", c);
621 return ret;
622 }
623 dout("got int token %d val %d\n", token, intval);
624 } else if (token > Opt_last_int && token < Opt_last_string) {
625 dout("got string token %d val %s\n", token,
626 argstr[0].from);
Alex Eldercc0538b2012-08-10 13:12:07 -0700627 } else if (token > Opt_last_string && token < Opt_last_bool) {
628 dout("got Boolean token %d\n", token);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700629 } else {
630 dout("got token %d\n", token);
631 }
632
633 switch (token) {
Alex Eldercc0538b2012-08-10 13:12:07 -0700634 case Opt_read_only:
635 rbd_opts->read_only = true;
636 break;
637 case Opt_read_write:
638 rbd_opts->read_only = false;
639 break;
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700640 default:
Alex Elderaafb2302012-09-06 16:00:54 -0500641 rbd_assert(false);
642 break;
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700643 }
644 return 0;
645}
646
647/*
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700648 * Get a ceph client with specific addr and configuration, if one does
649 * not exist create it.
650 */
/*
 * Get a ceph client matching ceph_opts, creating one if no shared
 * client exists.  Either way ownership of ceph_opts is consumed:
 * it is destroyed here when reusing an existing client, or handed
 * to rbd_client_create() otherwise.
 */
static struct rbd_client *rbd_get_client(struct ceph_options *ceph_opts)
{
	struct rbd_client *rbdc = rbd_client_find(ceph_opts);

	if (rbdc)	/* using an existing client */
		ceph_destroy_options(ceph_opts);
	else
		rbdc = rbd_client_create(ceph_opts);

	return rbdc;
}
663
/*
 * Destroy ceph client
 *
 * Called via kref_put() when the last reference is dropped; takes
 * rbd_client_list_lock itself to unlink the client, so the caller
 * must NOT already hold it.
 */
669static void rbd_client_release(struct kref *kref)
670{
671 struct rbd_client *rbdc = container_of(kref, struct rbd_client, kref);
672
Alex Elder37206ee2013-02-20 17:32:08 -0600673 dout("%s: rbdc %p\n", __func__, rbdc);
Alex Eldercd9d9f52012-04-04 13:35:44 -0500674 spin_lock(&rbd_client_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700675 list_del(&rbdc->node);
Alex Eldercd9d9f52012-04-04 13:35:44 -0500676 spin_unlock(&rbd_client_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700677
678 ceph_destroy_client(rbdc->client);
679 kfree(rbdc);
680}
681
682/*
683 * Drop reference to ceph client node. If it's not referenced anymore, release
684 * it.
685 */
Alex Elder9d3997f2012-10-25 23:34:42 -0500686static void rbd_put_client(struct rbd_client *rbdc)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700687{
Alex Elderc53d5892012-10-25 23:34:42 -0500688 if (rbdc)
689 kref_put(&rbdc->kref, rbd_client_release);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700690}
691
Alex Eldera30b71b2012-07-10 20:30:11 -0500692static bool rbd_image_format_valid(u32 image_format)
693{
694 return image_format == 1 || image_format == 2;
695}
696
Alex Elder8e94af82012-07-25 09:32:40 -0500697static bool rbd_dev_ondisk_valid(struct rbd_image_header_ondisk *ondisk)
698{
Alex Elder103a1502012-08-02 11:29:45 -0500699 size_t size;
700 u32 snap_count;
701
702 /* The header has to start with the magic rbd header text */
703 if (memcmp(&ondisk->text, RBD_HEADER_TEXT, sizeof (RBD_HEADER_TEXT)))
704 return false;
705
Alex Elderdb2388b2012-10-20 22:17:27 -0500706 /* The bio layer requires at least sector-sized I/O */
707
708 if (ondisk->options.order < SECTOR_SHIFT)
709 return false;
710
711 /* If we use u64 in a few spots we may be able to loosen this */
712
713 if (ondisk->options.order > 8 * sizeof (int) - 1)
714 return false;
715
Alex Elder103a1502012-08-02 11:29:45 -0500716 /*
717 * The size of a snapshot header has to fit in a size_t, and
718 * that limits the number of snapshots.
719 */
720 snap_count = le32_to_cpu(ondisk->snap_count);
721 size = SIZE_MAX - sizeof (struct ceph_snap_context);
722 if (snap_count > size / sizeof (__le64))
723 return false;
724
725 /*
726 * Not only that, but the size of the entire the snapshot
727 * header must also be representable in a size_t.
728 */
729 size -= snap_count * sizeof (__le64);
730 if ((u64) size < le64_to_cpu(ondisk->snap_names_len))
731 return false;
732
733 return true;
Alex Elder8e94af82012-07-25 09:32:40 -0500734}
735
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700736/*
737 * Create a new header structure, translate header format from the on-disk
738 * header.
739 */
740static int rbd_header_from_disk(struct rbd_image_header *header,
Alex Elder4156d992012-08-02 11:29:46 -0500741 struct rbd_image_header_ondisk *ondisk)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700742{
Alex Elderccece232012-07-10 20:30:10 -0500743 u32 snap_count;
Alex Elder58c17b02012-08-23 23:22:06 -0500744 size_t len;
Alex Elderd2bb24e2012-07-26 23:37:14 -0500745 size_t size;
Alex Elder621901d2012-08-23 23:22:06 -0500746 u32 i;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700747
Alex Elder6a523252012-07-19 17:12:59 -0500748 memset(header, 0, sizeof (*header));
749
Alex Elder103a1502012-08-02 11:29:45 -0500750 snap_count = le32_to_cpu(ondisk->snap_count);
751
Alex Elder58c17b02012-08-23 23:22:06 -0500752 len = strnlen(ondisk->object_prefix, sizeof (ondisk->object_prefix));
753 header->object_prefix = kmalloc(len + 1, GFP_KERNEL);
Alex Elder6a523252012-07-19 17:12:59 -0500754 if (!header->object_prefix)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700755 return -ENOMEM;
Alex Elder58c17b02012-08-23 23:22:06 -0500756 memcpy(header->object_prefix, ondisk->object_prefix, len);
757 header->object_prefix[len] = '\0';
Alex Elder00f1f362012-02-07 12:03:36 -0600758
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700759 if (snap_count) {
Alex Elderf785cc12012-08-23 23:22:06 -0500760 u64 snap_names_len = le64_to_cpu(ondisk->snap_names_len);
761
Alex Elder621901d2012-08-23 23:22:06 -0500762 /* Save a copy of the snapshot names */
763
Alex Elderf785cc12012-08-23 23:22:06 -0500764 if (snap_names_len > (u64) SIZE_MAX)
765 return -EIO;
766 header->snap_names = kmalloc(snap_names_len, GFP_KERNEL);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700767 if (!header->snap_names)
Alex Elder6a523252012-07-19 17:12:59 -0500768 goto out_err;
Alex Elderf785cc12012-08-23 23:22:06 -0500769 /*
770 * Note that rbd_dev_v1_header_read() guarantees
771 * the ondisk buffer we're working with has
772 * snap_names_len bytes beyond the end of the
773 * snapshot id array, this memcpy() is safe.
774 */
775 memcpy(header->snap_names, &ondisk->snaps[snap_count],
776 snap_names_len);
Alex Elder6a523252012-07-19 17:12:59 -0500777
Alex Elder621901d2012-08-23 23:22:06 -0500778 /* Record each snapshot's size */
779
Alex Elderd2bb24e2012-07-26 23:37:14 -0500780 size = snap_count * sizeof (*header->snap_sizes);
781 header->snap_sizes = kmalloc(size, GFP_KERNEL);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700782 if (!header->snap_sizes)
Alex Elder6a523252012-07-19 17:12:59 -0500783 goto out_err;
Alex Elder621901d2012-08-23 23:22:06 -0500784 for (i = 0; i < snap_count; i++)
785 header->snap_sizes[i] =
786 le64_to_cpu(ondisk->snaps[i].image_size);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700787 } else {
788 header->snap_names = NULL;
789 header->snap_sizes = NULL;
790 }
Alex Elder849b4262012-07-09 21:04:24 -0500791
Alex Elder34b13182012-07-13 20:35:12 -0500792 header->features = 0; /* No features support in v1 images */
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700793 header->obj_order = ondisk->options.order;
794 header->crypt_type = ondisk->options.crypt_type;
795 header->comp_type = ondisk->options.comp_type;
Alex Elder6a523252012-07-19 17:12:59 -0500796
Alex Elder621901d2012-08-23 23:22:06 -0500797 /* Allocate and fill in the snapshot context */
798
Alex Elderf84344f2012-08-31 17:29:51 -0500799 header->image_size = le64_to_cpu(ondisk->image_size);
Alex Elder468521c2013-04-26 09:43:47 -0500800
Alex Elder812164f82013-04-30 00:44:32 -0500801 header->snapc = ceph_create_snap_context(snap_count, GFP_KERNEL);
Alex Elder6a523252012-07-19 17:12:59 -0500802 if (!header->snapc)
803 goto out_err;
Alex Elder505cbb92012-07-19 08:49:18 -0500804 header->snapc->seq = le64_to_cpu(ondisk->snap_seq);
Alex Elder621901d2012-08-23 23:22:06 -0500805 for (i = 0; i < snap_count; i++)
Alex Elder468521c2013-04-26 09:43:47 -0500806 header->snapc->snaps[i] = le64_to_cpu(ondisk->snaps[i].id);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700807
808 return 0;
809
Alex Elder6a523252012-07-19 17:12:59 -0500810out_err:
Alex Elder849b4262012-07-09 21:04:24 -0500811 kfree(header->snap_sizes);
Alex Elderccece232012-07-10 20:30:10 -0500812 header->snap_sizes = NULL;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700813 kfree(header->snap_names);
Alex Elderccece232012-07-10 20:30:10 -0500814 header->snap_names = NULL;
Alex Elder6a523252012-07-19 17:12:59 -0500815 kfree(header->object_prefix);
816 header->object_prefix = NULL;
Alex Elderccece232012-07-10 20:30:10 -0500817
Alex Elder00f1f362012-02-07 12:03:36 -0600818 return -ENOMEM;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700819}
820
Alex Elder9682fc62013-04-30 00:44:33 -0500821static const char *_rbd_dev_v1_snap_name(struct rbd_device *rbd_dev, u32 which)
822{
823 const char *snap_name;
824
825 rbd_assert(which < rbd_dev->header.snapc->num_snaps);
826
827 /* Skip over names until we find the one we are looking for */
828
829 snap_name = rbd_dev->header.snap_names;
830 while (which--)
831 snap_name += strlen(snap_name) + 1;
832
833 return kstrdup(snap_name, GFP_KERNEL);
834}
835
836static u32 rbd_dev_snap_index(struct rbd_device *rbd_dev, u64 snap_id)
837{
838 struct ceph_snap_context *snapc = rbd_dev->header.snapc;
839 u32 which;
840
841 for (which = 0; which < snapc->num_snaps; which++)
842 if (snapc->snaps[which] == snap_id)
843 return which;
844
845 return BAD_SNAP_INDEX;
846}
847
Alex Elder2ad3d712013-04-30 00:44:33 -0500848static const char *rbd_dev_v1_snap_name(struct rbd_device *rbd_dev,
849 u64 snap_id)
Alex Elder54cac612013-04-30 00:44:33 -0500850{
851 u32 which;
852
853 which = rbd_dev_snap_index(rbd_dev, snap_id);
854 if (which == BAD_SNAP_INDEX)
855 return NULL;
856
857 return _rbd_dev_v1_snap_name(rbd_dev, which);
858}
859
Alex Elder9e15b772012-10-30 19:40:33 -0500860static const char *rbd_snap_name(struct rbd_device *rbd_dev, u64 snap_id)
861{
Alex Elder9e15b772012-10-30 19:40:33 -0500862 if (snap_id == CEPH_NOSNAP)
863 return RBD_SNAP_HEAD_NAME;
864
Alex Elder54cac612013-04-30 00:44:33 -0500865 rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
866 if (rbd_dev->image_format == 1)
867 return rbd_dev_v1_snap_name(rbd_dev, snap_id);
Alex Elder9e15b772012-10-30 19:40:33 -0500868
Alex Elder54cac612013-04-30 00:44:33 -0500869 return rbd_dev_v2_snap_name(rbd_dev, snap_id);
Alex Elder9e15b772012-10-30 19:40:33 -0500870}
871
Alex Elder2ad3d712013-04-30 00:44:33 -0500872static int rbd_snap_size(struct rbd_device *rbd_dev, u64 snap_id,
873 u64 *snap_size)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700874{
Alex Elder2ad3d712013-04-30 00:44:33 -0500875 rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
876 if (snap_id == CEPH_NOSNAP) {
877 *snap_size = rbd_dev->header.image_size;
878 } else if (rbd_dev->image_format == 1) {
879 u32 which;
Alex Elder00f1f362012-02-07 12:03:36 -0600880
Alex Elder2ad3d712013-04-30 00:44:33 -0500881 which = rbd_dev_snap_index(rbd_dev, snap_id);
882 if (which == BAD_SNAP_INDEX)
883 return -ENOENT;
Alex Elder00f1f362012-02-07 12:03:36 -0600884
Alex Elder2ad3d712013-04-30 00:44:33 -0500885 *snap_size = rbd_dev->header.snap_sizes[which];
886 } else {
887 u64 size = 0;
888 int ret;
889
890 ret = _rbd_dev_v2_snap_size(rbd_dev, snap_id, NULL, &size);
891 if (ret)
892 return ret;
893
894 *snap_size = size;
895 }
896 return 0;
897}
898
899static int rbd_snap_features(struct rbd_device *rbd_dev, u64 snap_id,
900 u64 *snap_features)
901{
902 rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
903 if (snap_id == CEPH_NOSNAP) {
904 *snap_features = rbd_dev->header.features;
905 } else if (rbd_dev->image_format == 1) {
906 *snap_features = 0; /* No features for format 1 */
907 } else {
908 u64 features = 0;
909 int ret;
910
911 ret = _rbd_dev_v2_snap_features(rbd_dev, snap_id, &features);
912 if (ret)
913 return ret;
914
915 *snap_features = features;
916 }
917 return 0;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700918}
919
Alex Elderd1cf5782013-04-27 09:59:30 -0500920static int rbd_dev_mapping_set(struct rbd_device *rbd_dev)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700921{
Alex Elder2ad3d712013-04-30 00:44:33 -0500922 const char *snap_name = rbd_dev->spec->snap_name;
923 u64 snap_id;
924 u64 size = 0;
925 u64 features = 0;
926 int ret;
Alex Elder8b0241f2013-04-25 23:15:08 -0500927
Alex Elder2ad3d712013-04-30 00:44:33 -0500928 if (strcmp(snap_name, RBD_SNAP_HEAD_NAME)) {
929 snap_id = rbd_snap_id_by_name(rbd_dev, snap_name);
930 if (snap_id == CEPH_NOSNAP)
Alex Elder8b0241f2013-04-25 23:15:08 -0500931 return -ENOENT;
Alex Elder2ad3d712013-04-30 00:44:33 -0500932 } else {
933 snap_id = CEPH_NOSNAP;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700934 }
Alex Elder6d292902013-01-14 12:43:31 -0600935
Alex Elder2ad3d712013-04-30 00:44:33 -0500936 ret = rbd_snap_size(rbd_dev, snap_id, &size);
937 if (ret)
938 return ret;
939 ret = rbd_snap_features(rbd_dev, snap_id, &features);
940 if (ret)
941 return ret;
942
943 rbd_dev->mapping.size = size;
944 rbd_dev->mapping.features = features;
945
946 /* If we are mapping a snapshot it must be marked read-only */
947
948 if (snap_id != CEPH_NOSNAP)
949 rbd_dev->mapping.read_only = true;
950
Alex Elder8b0241f2013-04-25 23:15:08 -0500951 return 0;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700952}
953
/*
 * Reset the mapping to its unmapped state: zero size and features,
 * and read-only until a mapping is (re)established.
 */
static void rbd_dev_mapping_clear(struct rbd_device *rbd_dev)
{
	rbd_dev->mapping.size = 0;
	rbd_dev->mapping.features = 0;
	rbd_dev->mapping.read_only = true;
}
960
/*
 * Reset the mapping to its unmapped state.  This was a byte-for-byte
 * duplicate of rbd_dev_mapping_clear(); delegate to it so the two
 * cannot drift apart.
 */
static void rbd_dev_clear_mapping(struct rbd_device *rbd_dev)
{
	rbd_dev_mapping_clear(rbd_dev);
}
967
Alex Elder98571b52013-01-20 14:44:42 -0600968static const char *rbd_segment_name(struct rbd_device *rbd_dev, u64 offset)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700969{
Alex Elder65ccfe22012-08-09 10:33:26 -0700970 char *name;
971 u64 segment;
972 int ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700973
Alex Elder2fd82b92012-11-09 15:05:54 -0600974 name = kmalloc(MAX_OBJ_NAME_SIZE + 1, GFP_NOIO);
Alex Elder65ccfe22012-08-09 10:33:26 -0700975 if (!name)
976 return NULL;
977 segment = offset >> rbd_dev->header.obj_order;
Alex Elder2fd82b92012-11-09 15:05:54 -0600978 ret = snprintf(name, MAX_OBJ_NAME_SIZE + 1, "%s.%012llx",
Alex Elder65ccfe22012-08-09 10:33:26 -0700979 rbd_dev->header.object_prefix, segment);
Alex Elder2fd82b92012-11-09 15:05:54 -0600980 if (ret < 0 || ret > MAX_OBJ_NAME_SIZE) {
Alex Elder65ccfe22012-08-09 10:33:26 -0700981 pr_err("error formatting segment name for #%llu (%d)\n",
982 segment, ret);
983 kfree(name);
984 name = NULL;
985 }
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700986
Alex Elder65ccfe22012-08-09 10:33:26 -0700987 return name;
988}
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700989
Alex Elder65ccfe22012-08-09 10:33:26 -0700990static u64 rbd_segment_offset(struct rbd_device *rbd_dev, u64 offset)
991{
992 u64 segment_size = (u64) 1 << rbd_dev->header.obj_order;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700993
Alex Elder65ccfe22012-08-09 10:33:26 -0700994 return offset & (segment_size - 1);
995}
996
997static u64 rbd_segment_length(struct rbd_device *rbd_dev,
998 u64 offset, u64 length)
999{
1000 u64 segment_size = (u64) 1 << rbd_dev->header.obj_order;
1001
1002 offset &= segment_size - 1;
1003
Alex Elderaafb2302012-09-06 16:00:54 -05001004 rbd_assert(length <= U64_MAX - offset);
Alex Elder65ccfe22012-08-09 10:33:26 -07001005 if (offset + length > segment_size)
1006 length = segment_size - offset;
1007
1008 return length;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001009}
1010
1011/*
Josh Durgin029bcbd2011-07-22 11:35:23 -07001012 * returns the size of an object in the image
1013 */
1014static u64 rbd_obj_bytes(struct rbd_image_header *header)
1015{
1016 return 1 << header->obj_order;
1017}
1018
1019/*
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001020 * bio helpers
1021 */
1022
1023static void bio_chain_put(struct bio *chain)
1024{
1025 struct bio *tmp;
1026
1027 while (chain) {
1028 tmp = chain;
1029 chain = chain->bi_next;
1030 bio_put(tmp);
1031 }
1032}
1033
/*
 * zeros a bio chain, starting at specific offset
 *
 * All data from byte offset start_ofs (relative to the start of the
 * whole chain) through the end of the chain is cleared; bytes before
 * start_ofs are left untouched.
 */
static void zero_bio_chain(struct bio *chain, int start_ofs)
{
	struct bio_vec *bv;
	unsigned long flags;
	void *buf;
	int i;
	int pos = 0;	/* running byte position within the chain */

	while (chain) {
		bio_for_each_segment(bv, chain, i) {
			/* Does this segment extend past start_ofs? */
			if (pos + bv->bv_len > start_ofs) {
				/*
				 * Zero from start_ofs if it falls inside
				 * this segment, else from its beginning.
				 */
				int remainder = max(start_ofs - pos, 0);
				buf = bvec_kmap_irq(bv, &flags);
				memset(buf + remainder, 0,
				       bv->bv_len - remainder);
				bvec_kunmap_irq(buf, &flags);
			}
			pos += bv->bv_len;
		}

		chain = chain->bi_next;
	}
}
1060
/*
 * similar to zero_bio_chain(), zeros data defined by a page array,
 * starting at the given byte offset from the start of the array and
 * continuing up to the given end offset.  The pages array is
 * assumed to be big enough to hold all bytes up to the end.
 */
static void zero_pages(struct page **pages, u64 offset, u64 end)
{
	/* First page touched by the range */
	struct page **page = &pages[offset >> PAGE_SHIFT];

	rbd_assert(end > offset);
	rbd_assert(end - offset <= (u64)SIZE_MAX);
	while (offset < end) {
		size_t page_offset;
		size_t length;
		unsigned long flags;
		void *kaddr;

		/* Zero at most the remainder of the current page */
		page_offset = (size_t)(offset & ~PAGE_MASK);
		length = min(PAGE_SIZE - page_offset, (size_t)(end - offset));
		/* Atomic kmap with interrupts off while the page is mapped */
		local_irq_save(flags);
		kaddr = kmap_atomic(*page);
		memset(kaddr + page_offset, 0, length);
		kunmap_atomic(kaddr);
		local_irq_restore(flags);

		offset += length;
		page++;
	}
}
1091
/*
 * Clone a portion of a bio, starting at the given byte offset
 * and continuing for the number of bytes indicated.
 *
 * Returns a new bio covering exactly [offset, offset + len) of
 * bio_src's data, or NULL on invalid arguments or allocation
 * failure.  NOTE(review): bi_sector is derived via
 * offset >> SECTOR_SHIFT, which presumes offset is sector-aligned —
 * confirm against callers.
 */
static struct bio *bio_clone_range(struct bio *bio_src,
					unsigned int offset,
					unsigned int len,
					gfp_t gfpmask)
{
	struct bio_vec *bv;
	unsigned int resid;
	unsigned short idx;
	unsigned int voff;	/* offset of range start within segment idx */
	unsigned short end_idx;
	unsigned short vcnt;	/* number of segments in the clone */
	struct bio *bio;

	/* Handle the easy case for the caller */

	if (!offset && len == bio_src->bi_size)
		return bio_clone(bio_src, gfpmask);

	/* Reject empty or out-of-range requests */
	if (WARN_ON_ONCE(!len))
		return NULL;
	if (WARN_ON_ONCE(len > bio_src->bi_size))
		return NULL;
	if (WARN_ON_ONCE(offset > bio_src->bi_size - len))
		return NULL;

	/* Find first affected segment... */

	resid = offset;
	__bio_for_each_segment(bv, bio_src, idx, 0) {
		if (resid < bv->bv_len)
			break;
		resid -= bv->bv_len;
	}
	voff = resid;

	/* ...and the last affected segment */

	resid += len;
	__bio_for_each_segment(bv, bio_src, end_idx, idx) {
		if (resid <= bv->bv_len)
			break;
		resid -= bv->bv_len;
	}
	vcnt = end_idx - idx + 1;

	/* Build the clone */

	bio = bio_alloc(gfpmask, (unsigned int) vcnt);
	if (!bio)
		return NULL;	/* ENOMEM */

	bio->bi_bdev = bio_src->bi_bdev;
	bio->bi_sector = bio_src->bi_sector + (offset >> SECTOR_SHIFT);
	bio->bi_rw = bio_src->bi_rw;
	bio->bi_flags |= 1 << BIO_CLONED;

	/*
	 * Copy over our part of the bio_vec, then update the first
	 * and last (or only) entries.
	 */
	memcpy(&bio->bi_io_vec[0], &bio_src->bi_io_vec[idx],
			vcnt * sizeof (struct bio_vec));
	bio->bi_io_vec[0].bv_offset += voff;
	if (vcnt > 1) {
		/* First entry loses voff bytes; last keeps only resid */
		bio->bi_io_vec[0].bv_len -= voff;
		bio->bi_io_vec[vcnt - 1].bv_len = resid;
	} else {
		/* Single-segment clone covers exactly len bytes */
		bio->bi_io_vec[0].bv_len = len;
	}

	bio->bi_vcnt = vcnt;
	bio->bi_size = len;
	bio->bi_idx = 0;

	return bio;
}
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001172
/*
 * Clone a portion of a bio chain, starting at the given byte offset
 * into the first bio in the source chain and continuing for the
 * number of bytes indicated.  The result is another bio chain of
 * exactly the given length, or a null pointer on error.
 *
 * The bio_src and offset parameters are both in-out.  On entry they
 * refer to the first source bio and the offset into that bio where
 * the start of data to be cloned is located.
 *
 * On return, bio_src is updated to refer to the bio in the source
 * chain that contains first un-cloned byte, and *offset will
 * contain the offset of that byte within that bio.
 */
static struct bio *bio_chain_clone_range(struct bio **bio_src,
					unsigned int *offset,
					unsigned int len,
					gfp_t gfpmask)
{
	struct bio *bi = *bio_src;
	unsigned int off = *offset;
	struct bio *chain = NULL;
	struct bio **end;	/* where to link the next clone */

	/* Build up a chain of clone bios up to the limit */

	if (!bi || off >= bi->bi_size || !len)
		return NULL;	/* Nothing to clone */

	end = &chain;
	while (len) {
		unsigned int bi_size;
		struct bio *bio;

		/* Chain ended before covering len bytes: caller error */
		if (!bi) {
			rbd_warn(NULL, "bio_chain exhausted with %u left", len);
			goto out_err;	/* EINVAL; ran out of bio's */
		}
		/* Clone at most the remainder of the current bio */
		bi_size = min_t(unsigned int, bi->bi_size - off, len);
		bio = bio_clone_range(bi, off, bi_size, gfpmask);
		if (!bio)
			goto out_err;	/* ENOMEM */

		*end = bio;
		end = &bio->bi_next;

		/* Advance; step to the next source bio when consumed */
		off += bi_size;
		if (off == bi->bi_size) {
			bi = bi->bi_next;
			off = 0;
		}
		len -= bi_size;
	}
	*bio_src = bi;
	*offset = off;

	return chain;
out_err:
	/* Release any clones built so far */
	bio_chain_put(chain);

	return NULL;
}
1235
Alex Elder926f9b32013-02-11 12:33:24 -06001236/*
1237 * The default/initial value for all object request flags is 0. For
1238 * each flag, once its value is set to 1 it is never reset to 0
1239 * again.
1240 */
Alex Elder6365d332013-02-11 12:33:24 -06001241static void obj_request_img_data_set(struct rbd_obj_request *obj_request)
1242{
1243 if (test_and_set_bit(OBJ_REQ_IMG_DATA, &obj_request->flags)) {
Alex Elder6365d332013-02-11 12:33:24 -06001244 struct rbd_device *rbd_dev;
1245
Alex Elder57acbaa2013-02-11 12:33:24 -06001246 rbd_dev = obj_request->img_request->rbd_dev;
Alex Elder6365d332013-02-11 12:33:24 -06001247 rbd_warn(rbd_dev, "obj_request %p already marked img_data\n",
1248 obj_request);
1249 }
1250}
1251
/* Test the "carries image data" flag (full barrier before the read). */
static bool obj_request_img_data_test(struct rbd_obj_request *obj_request)
{
	smp_mb();
	return test_bit(OBJ_REQ_IMG_DATA, &obj_request->flags) != 0;
}
1257
Alex Elder57acbaa2013-02-11 12:33:24 -06001258static void obj_request_done_set(struct rbd_obj_request *obj_request)
1259{
1260 if (test_and_set_bit(OBJ_REQ_DONE, &obj_request->flags)) {
1261 struct rbd_device *rbd_dev = NULL;
1262
1263 if (obj_request_img_data_test(obj_request))
1264 rbd_dev = obj_request->img_request->rbd_dev;
1265 rbd_warn(rbd_dev, "obj_request %p already marked done\n",
1266 obj_request);
1267 }
1268}
1269
/* Test the "done" flag (full barrier before the read). */
static bool obj_request_done_test(struct rbd_obj_request *obj_request)
{
	smp_mb();
	return test_bit(OBJ_REQ_DONE, &obj_request->flags) != 0;
}
1275
/*
 * This sets the KNOWN flag after (possibly) setting the EXISTS
 * flag.  The latter is set based on the "exists" value provided.
 *
 * Note that for our purposes once an object exists it never goes
 * away again.  It's possible that the response from two existence
 * checks are separated by the creation of the target object, and
 * the first ("doesn't exist") response arrives *after* the second
 * ("does exist").  In that case we ignore the second one.
 */
static void obj_request_existence_set(struct rbd_obj_request *obj_request,
				bool exists)
{
	/* EXISTS must be set before KNOWN; the barrier publishes both */
	if (exists)
		set_bit(OBJ_REQ_EXISTS, &obj_request->flags);
	set_bit(OBJ_REQ_KNOWN, &obj_request->flags);
	smp_mb();
}
1294
/* Has an existence check for the object completed? (barriered read) */
static bool obj_request_known_test(struct rbd_obj_request *obj_request)
{
	smp_mb();
	return test_bit(OBJ_REQ_KNOWN, &obj_request->flags) != 0;
}
1300
/* Is the target object known to exist? (barriered read) */
static bool obj_request_exists_test(struct rbd_obj_request *obj_request)
{
	smp_mb();
	return test_bit(OBJ_REQ_EXISTS, &obj_request->flags) != 0;
}
1306
/* Take a reference on an object request. */
static void rbd_obj_request_get(struct rbd_obj_request *obj_request)
{
	dout("%s: obj %p (was %d)\n", __func__, obj_request,
		atomic_read(&obj_request->kref.refcount));
	kref_get(&obj_request->kref);
}
1313
static void rbd_obj_request_destroy(struct kref *kref);
/* Drop a reference on an object request; destroys it when last. */
static void rbd_obj_request_put(struct rbd_obj_request *obj_request)
{
	rbd_assert(obj_request != NULL);
	dout("%s: obj %p (was %d)\n", __func__, obj_request,
		atomic_read(&obj_request->kref.refcount));
	kref_put(&obj_request->kref, rbd_obj_request_destroy);
}
1322
/* Take a reference on an image request. */
static void rbd_img_request_get(struct rbd_img_request *img_request)
{
	dout("%s: img %p (was %d)\n", __func__, img_request,
		atomic_read(&img_request->kref.refcount));
	kref_get(&img_request->kref);
}
1329
static void rbd_img_request_destroy(struct kref *kref);
/* Drop a reference on an image request; destroys it when last. */
static void rbd_img_request_put(struct rbd_img_request *img_request)
{
	rbd_assert(img_request != NULL);
	dout("%s: img %p (was %d)\n", __func__, img_request,
		atomic_read(&img_request->kref.refcount));
	kref_put(&img_request->kref, rbd_img_request_destroy);
}
1338
/*
 * Append an object request to an image request's list.  The object
 * request must not already belong to an image request; its position
 * ("which") is the current count, so requests are numbered in the
 * order they are added.
 */
static inline void rbd_img_obj_request_add(struct rbd_img_request *img_request,
					struct rbd_obj_request *obj_request)
{
	rbd_assert(obj_request->img_request == NULL);

	/* Image request now owns object's original reference */
	obj_request->img_request = img_request;
	obj_request->which = img_request->obj_request_count;
	rbd_assert(!obj_request_img_data_test(obj_request));
	obj_request_img_data_set(obj_request);
	rbd_assert(obj_request->which != BAD_WHICH);
	img_request->obj_request_count++;
	list_add_tail(&obj_request->links, &img_request->obj_requests);
	dout("%s: img %p obj %p w=%u\n", __func__, img_request, obj_request,
		obj_request->which);
}
1355
/*
 * Remove an object request from its image request and drop the
 * reference the image request held.  The assertion that "which"
 * equals the decremented count means only the most recently added
 * object request may be removed (LIFO order).
 */
static inline void rbd_img_obj_request_del(struct rbd_img_request *img_request,
					struct rbd_obj_request *obj_request)
{
	rbd_assert(obj_request->which != BAD_WHICH);

	dout("%s: img %p obj %p w=%u\n", __func__, img_request, obj_request,
		obj_request->which);
	list_del(&obj_request->links);
	rbd_assert(img_request->obj_request_count > 0);
	img_request->obj_request_count--;
	rbd_assert(obj_request->which == img_request->obj_request_count);
	obj_request->which = BAD_WHICH;
	rbd_assert(obj_request_img_data_test(obj_request));
	rbd_assert(obj_request->img_request == img_request);
	obj_request->img_request = NULL;
	obj_request->callback = NULL;
	/* Drop the reference the image request was holding */
	rbd_obj_request_put(obj_request);
}
1374
1375static bool obj_request_type_valid(enum obj_request_type type)
1376{
1377 switch (type) {
Alex Elder9969ebc2013-01-18 12:31:10 -06001378 case OBJ_REQUEST_NODATA:
Alex Elderbf0d5f502012-11-22 00:00:08 -06001379 case OBJ_REQUEST_BIO:
Alex Elder788e2df2013-01-17 12:25:27 -06001380 case OBJ_REQUEST_PAGES:
Alex Elderbf0d5f502012-11-22 00:00:08 -06001381 return true;
1382 default:
1383 return false;
1384 }
1385}
1386
/*
 * Submit an object request's OSD request via the OSD client.
 * Returns the result of ceph_osdc_start_request().
 */
static int rbd_obj_request_submit(struct ceph_osd_client *osdc,
				struct rbd_obj_request *obj_request)
{
	dout("%s: osdc %p obj %p\n", __func__, osdc, obj_request);

	return ceph_osdc_start_request(osdc, obj_request->osd_req, false);
}
1394
1395static void rbd_img_request_complete(struct rbd_img_request *img_request)
1396{
Alex Elder55f27e02013-04-10 12:34:25 -05001397
Alex Elder37206ee2013-02-20 17:32:08 -06001398 dout("%s: img %p\n", __func__, img_request);
Alex Elder55f27e02013-04-10 12:34:25 -05001399
1400 /*
1401 * If no error occurred, compute the aggregate transfer
1402 * count for the image request. We could instead use
1403 * atomic64_cmpxchg() to update it as each object request
1404 * completes; not clear which way is better off hand.
1405 */
1406 if (!img_request->result) {
1407 struct rbd_obj_request *obj_request;
1408 u64 xferred = 0;
1409
1410 for_each_obj_request(img_request, obj_request)
1411 xferred += obj_request->xferred;
1412 img_request->xferred = xferred;
1413 }
1414
Alex Elderbf0d5f502012-11-22 00:00:08 -06001415 if (img_request->callback)
1416 img_request->callback(img_request);
1417 else
1418 rbd_img_request_put(img_request);
1419}
1420
/* Caller is responsible for rbd_obj_request_destroy(obj_request) */

/*
 * Wait (interruptibly) for an object request's completion to be
 * signalled.  Returns 0, or -ERESTARTSYS if interrupted.
 */
static int rbd_obj_request_wait(struct rbd_obj_request *obj_request)
{
	dout("%s: obj %p\n", __func__, obj_request);

	return wait_for_completion_interruptible(&obj_request->completion);
}
1429
/*
 * The default/initial value for all image request flags is 0.  Each
 * is conditionally set to 1 at image request initialization time
 * and currently never change thereafter.
 */
/* Mark an image request as a write; barrier publishes the flag. */
static void img_request_write_set(struct rbd_img_request *img_request)
{
	set_bit(IMG_REQ_WRITE, &img_request->flags);
	smp_mb();
}
1440
/* Is this image request a write? (barriered read) */
static bool img_request_write_test(struct rbd_img_request *img_request)
{
	smp_mb();
	return test_bit(IMG_REQ_WRITE, &img_request->flags) != 0;
}
1446
/* Mark an image request as a child (parent-image) request. */
static void img_request_child_set(struct rbd_img_request *img_request)
{
	set_bit(IMG_REQ_CHILD, &img_request->flags);
	smp_mb();
}
1452
/* Is this a child (parent-image) request? (barriered read) */
static bool img_request_child_test(struct rbd_img_request *img_request)
{
	smp_mb();
	return test_bit(IMG_REQ_CHILD, &img_request->flags) != 0;
}
1458
/* Mark an image request as involving a layered (cloned) image. */
static void img_request_layered_set(struct rbd_img_request *img_request)
{
	set_bit(IMG_REQ_LAYERED, &img_request->flags);
	smp_mb();
}
1464
/* Does this request involve a layered image? (barriered read) */
static bool img_request_layered_test(struct rbd_img_request *img_request)
{
	smp_mb();
	return test_bit(IMG_REQ_LAYERED, &img_request->flags) != 0;
}
1470
/*
 * Finish a read object request that belongs to an image request:
 * zero-fill holes and short reads, then mark the request done.
 */
static void
rbd_img_obj_request_read_callback(struct rbd_obj_request *obj_request)
{
	u64 xferred = obj_request->xferred;
	u64 length = obj_request->length;

	dout("%s: obj %p img %p result %d %llu/%llu\n", __func__,
		obj_request, obj_request->img_request, obj_request->result,
		xferred, length);
	/*
	 * ENOENT means a hole in the image.  We zero-fill the
	 * entire length of the request.  A short read also implies
	 * zero-fill to the end of the request.  Either way we
	 * update the xferred count to indicate the whole request
	 * was satisfied.
	 */
	rbd_assert(obj_request->type != OBJ_REQUEST_NODATA);
	if (obj_request->result == -ENOENT) {
		/* Hole: zero the whole request and report success */
		if (obj_request->type == OBJ_REQUEST_BIO)
			zero_bio_chain(obj_request->bio_list, 0);
		else
			zero_pages(obj_request->pages, 0, length);
		obj_request->result = 0;
		obj_request->xferred = length;
	} else if (xferred < length && !obj_request->result) {
		/* Short read: zero everything past what was transferred */
		if (obj_request->type == OBJ_REQUEST_BIO)
			zero_bio_chain(obj_request->bio_list, xferred);
		else
			zero_pages(obj_request->pages, xferred, length);
		obj_request->xferred = length;
	}
	obj_request_done_set(obj_request);
}
1504
Alex Elderbf0d5f502012-11-22 00:00:08 -06001505static void rbd_obj_request_complete(struct rbd_obj_request *obj_request)
1506{
Alex Elder37206ee2013-02-20 17:32:08 -06001507 dout("%s: obj %p cb %p\n", __func__, obj_request,
1508 obj_request->callback);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001509 if (obj_request->callback)
1510 obj_request->callback(obj_request);
Alex Elder788e2df2013-01-17 12:25:27 -06001511 else
1512 complete_all(&obj_request->completion);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001513}
1514
/* Completion handler for osd ops that need no result processing */
static void rbd_osd_trivial_callback(struct rbd_obj_request *obj_request)
{
	dout("%s: obj %p\n", __func__, obj_request);
	obj_request_done_set(obj_request);
}
1520
/*
 * Completion handler for osd read ops.  A read that is part of a
 * layered image request and hits a hole (-ENOENT) below the
 * parent overlap is re-issued against the parent image; any other
 * image-request read gets zero-fill fixups via
 * rbd_img_obj_request_read_callback(); a standalone read is
 * simply marked done.
 */
static void rbd_osd_read_callback(struct rbd_obj_request *obj_request)
{
	struct rbd_img_request *img_request = NULL;
	struct rbd_device *rbd_dev = NULL;
	bool layered = false;

	/* img_request/rbd_dev only exist for image data requests */
	if (obj_request_img_data_test(obj_request)) {
		img_request = obj_request->img_request;
		layered = img_request && img_request_layered_test(img_request);
		rbd_dev = img_request->rbd_dev;
	}

	dout("%s: obj %p img %p result %d %llu/%llu\n", __func__,
		obj_request, img_request, obj_request->result,
		obj_request->xferred, obj_request->length);
	/* "layered" implies img_request (and hence rbd_dev) is non-NULL */
	if (layered && obj_request->result == -ENOENT &&
			obj_request->img_offset < rbd_dev->parent_overlap)
		rbd_img_parent_read(obj_request);
	else if (img_request)
		rbd_img_obj_request_read_callback(obj_request);
	else
		obj_request_done_set(obj_request);
}
1544
/* Completion handler for osd write ops */
static void rbd_osd_write_callback(struct rbd_obj_request *obj_request)
{
	dout("%s: obj %p result %d %llu\n", __func__, obj_request,
		obj_request->result, obj_request->length);
	/*
	 * There is no such thing as a successful short write.  Set
	 * it to our originally-requested length.
	 */
	obj_request->xferred = obj_request->length;
	obj_request_done_set(obj_request);
}
1556
/*
 * For a simple stat call there's nothing to do.  We'll do more if
 * this is part of a write sequence for a layered image.
 */
static void rbd_osd_stat_callback(struct rbd_obj_request *obj_request)
{
	dout("%s: obj %p\n", __func__, obj_request);
	obj_request_done_set(obj_request);
}
1566
/*
 * Completion callback attached to every osd request issued by rbd.
 * Records the request result, derives the transfer count from the
 * first op's reply length, and dispatches on the first op's opcode
 * to a per-op handler.  If the handler marks the object request
 * done, completion is signalled from here.
 */
static void rbd_osd_req_callback(struct ceph_osd_request *osd_req,
				struct ceph_msg *msg)
{
	struct rbd_obj_request *obj_request = osd_req->r_priv;
	u16 opcode;

	dout("%s: osd_req %p msg %p\n", __func__, osd_req, msg);
	rbd_assert(osd_req == obj_request->osd_req);
	/* Image data requests have a slot ("which") in an image request */
	if (obj_request_img_data_test(obj_request)) {
		rbd_assert(obj_request->img_request);
		rbd_assert(obj_request->which != BAD_WHICH);
	} else {
		rbd_assert(obj_request->which == BAD_WHICH);
	}

	if (osd_req->r_result < 0)
		obj_request->result = osd_req->r_result;

	/* rbd issues at most two ops per request (copyup + write) */
	BUG_ON(osd_req->r_num_ops > 2);

	/*
	 * We support a 64-bit length, but ultimately it has to be
	 * passed to blk_end_request(), which takes an unsigned int.
	 */
	obj_request->xferred = osd_req->r_reply_op_len[0];
	rbd_assert(obj_request->xferred < (u64)UINT_MAX);
	/* Dispatch based on the first (possibly only) op */
	opcode = osd_req->r_ops[0].op;
	switch (opcode) {
	case CEPH_OSD_OP_READ:
		rbd_osd_read_callback(obj_request);
		break;
	case CEPH_OSD_OP_WRITE:
		rbd_osd_write_callback(obj_request);
		break;
	case CEPH_OSD_OP_STAT:
		rbd_osd_stat_callback(obj_request);
		break;
	case CEPH_OSD_OP_CALL:
	case CEPH_OSD_OP_NOTIFY_ACK:
	case CEPH_OSD_OP_WATCH:
		rbd_osd_trivial_callback(obj_request);
		break;
	default:
		rbd_warn(NULL, "%s: unsupported op %hu\n",
			obj_request->object_name, (unsigned short) opcode);
		break;
	}

	/* The per-op callback decides whether the request is done */
	if (obj_request_done_test(obj_request))
		rbd_obj_request_complete(obj_request);
}
1618
Alex Elder9d4df012013-04-19 15:34:50 -05001619static void rbd_osd_req_format_read(struct rbd_obj_request *obj_request)
Alex Elder430c28c2013-04-03 21:32:51 -05001620{
1621 struct rbd_img_request *img_request = obj_request->img_request;
Alex Elder8c042b02013-04-03 01:28:58 -05001622 struct ceph_osd_request *osd_req = obj_request->osd_req;
Alex Elder9d4df012013-04-19 15:34:50 -05001623 u64 snap_id;
Alex Elder430c28c2013-04-03 21:32:51 -05001624
Alex Elder8c042b02013-04-03 01:28:58 -05001625 rbd_assert(osd_req != NULL);
Alex Elder430c28c2013-04-03 21:32:51 -05001626
Alex Elder9d4df012013-04-19 15:34:50 -05001627 snap_id = img_request ? img_request->snap_id : CEPH_NOSNAP;
Alex Elder8c042b02013-04-03 01:28:58 -05001628 ceph_osdc_build_request(osd_req, obj_request->offset,
Alex Elder9d4df012013-04-19 15:34:50 -05001629 NULL, snap_id, NULL);
1630}
1631
1632static void rbd_osd_req_format_write(struct rbd_obj_request *obj_request)
1633{
1634 struct rbd_img_request *img_request = obj_request->img_request;
1635 struct ceph_osd_request *osd_req = obj_request->osd_req;
1636 struct ceph_snap_context *snapc;
1637 struct timespec mtime = CURRENT_TIME;
1638
1639 rbd_assert(osd_req != NULL);
1640
1641 snapc = img_request ? img_request->snapc : NULL;
1642 ceph_osdc_build_request(osd_req, obj_request->offset,
1643 snapc, CEPH_NOSNAP, &mtime);
Alex Elder430c28c2013-04-03 21:32:51 -05001644}
1645
Alex Elderbf0d5f502012-11-22 00:00:08 -06001646static struct ceph_osd_request *rbd_osd_req_create(
1647 struct rbd_device *rbd_dev,
1648 bool write_request,
Alex Elder430c28c2013-04-03 21:32:51 -05001649 struct rbd_obj_request *obj_request)
Alex Elderbf0d5f502012-11-22 00:00:08 -06001650{
Alex Elderbf0d5f502012-11-22 00:00:08 -06001651 struct ceph_snap_context *snapc = NULL;
1652 struct ceph_osd_client *osdc;
1653 struct ceph_osd_request *osd_req;
Alex Elderbf0d5f502012-11-22 00:00:08 -06001654
Alex Elder6365d332013-02-11 12:33:24 -06001655 if (obj_request_img_data_test(obj_request)) {
1656 struct rbd_img_request *img_request = obj_request->img_request;
1657
Alex Elder0c425242013-02-08 09:55:49 -06001658 rbd_assert(write_request ==
1659 img_request_write_test(img_request));
1660 if (write_request)
Alex Elderbf0d5f502012-11-22 00:00:08 -06001661 snapc = img_request->snapc;
Alex Elderbf0d5f502012-11-22 00:00:08 -06001662 }
1663
1664 /* Allocate and initialize the request, for the single op */
1665
1666 osdc = &rbd_dev->rbd_client->client->osdc;
1667 osd_req = ceph_osdc_alloc_request(osdc, snapc, 1, false, GFP_ATOMIC);
1668 if (!osd_req)
1669 return NULL; /* ENOMEM */
Alex Elderbf0d5f502012-11-22 00:00:08 -06001670
Alex Elder430c28c2013-04-03 21:32:51 -05001671 if (write_request)
Alex Elderbf0d5f502012-11-22 00:00:08 -06001672 osd_req->r_flags = CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK;
Alex Elder430c28c2013-04-03 21:32:51 -05001673 else
Alex Elderbf0d5f502012-11-22 00:00:08 -06001674 osd_req->r_flags = CEPH_OSD_FLAG_READ;
Alex Elderbf0d5f502012-11-22 00:00:08 -06001675
1676 osd_req->r_callback = rbd_osd_req_callback;
1677 osd_req->r_priv = obj_request;
1678
1679 osd_req->r_oid_len = strlen(obj_request->object_name);
1680 rbd_assert(osd_req->r_oid_len < sizeof (osd_req->r_oid));
1681 memcpy(osd_req->r_oid, obj_request->object_name, osd_req->r_oid_len);
1682
1683 osd_req->r_file_layout = rbd_dev->layout; /* struct */
1684
Alex Elderbf0d5f502012-11-22 00:00:08 -06001685 return osd_req;
1686}
1687
Alex Elder0eefd472013-04-19 15:34:50 -05001688/*
1689 * Create a copyup osd request based on the information in the
1690 * object request supplied. A copyup request has two osd ops,
1691 * a copyup method call, and a "normal" write request.
1692 */
1693static struct ceph_osd_request *
1694rbd_osd_req_create_copyup(struct rbd_obj_request *obj_request)
1695{
1696 struct rbd_img_request *img_request;
1697 struct ceph_snap_context *snapc;
1698 struct rbd_device *rbd_dev;
1699 struct ceph_osd_client *osdc;
1700 struct ceph_osd_request *osd_req;
1701
1702 rbd_assert(obj_request_img_data_test(obj_request));
1703 img_request = obj_request->img_request;
1704 rbd_assert(img_request);
1705 rbd_assert(img_request_write_test(img_request));
1706
1707 /* Allocate and initialize the request, for the two ops */
1708
1709 snapc = img_request->snapc;
1710 rbd_dev = img_request->rbd_dev;
1711 osdc = &rbd_dev->rbd_client->client->osdc;
1712 osd_req = ceph_osdc_alloc_request(osdc, snapc, 2, false, GFP_ATOMIC);
1713 if (!osd_req)
1714 return NULL; /* ENOMEM */
1715
1716 osd_req->r_flags = CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK;
1717 osd_req->r_callback = rbd_osd_req_callback;
1718 osd_req->r_priv = obj_request;
1719
1720 osd_req->r_oid_len = strlen(obj_request->object_name);
1721 rbd_assert(osd_req->r_oid_len < sizeof (osd_req->r_oid));
1722 memcpy(osd_req->r_oid, obj_request->object_name, osd_req->r_oid_len);
1723
1724 osd_req->r_file_layout = rbd_dev->layout; /* struct */
1725
1726 return osd_req;
1727}
1728
1729
/* Drop rbd's reference to an osd request */
static void rbd_osd_req_destroy(struct ceph_osd_request *osd_req)
{
	ceph_osdc_put_request(osd_req);
}
1734
1735/* object_name is assumed to be a non-null pointer and NUL-terminated */
1736
1737static struct rbd_obj_request *rbd_obj_request_create(const char *object_name,
1738 u64 offset, u64 length,
1739 enum obj_request_type type)
1740{
1741 struct rbd_obj_request *obj_request;
1742 size_t size;
1743 char *name;
1744
1745 rbd_assert(obj_request_type_valid(type));
1746
1747 size = strlen(object_name) + 1;
1748 obj_request = kzalloc(sizeof (*obj_request) + size, GFP_KERNEL);
1749 if (!obj_request)
1750 return NULL;
1751
1752 name = (char *)(obj_request + 1);
1753 obj_request->object_name = memcpy(name, object_name, size);
1754 obj_request->offset = offset;
1755 obj_request->length = length;
Alex Elder926f9b32013-02-11 12:33:24 -06001756 obj_request->flags = 0;
Alex Elderbf0d5f502012-11-22 00:00:08 -06001757 obj_request->which = BAD_WHICH;
1758 obj_request->type = type;
1759 INIT_LIST_HEAD(&obj_request->links);
Alex Elder788e2df2013-01-17 12:25:27 -06001760 init_completion(&obj_request->completion);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001761 kref_init(&obj_request->kref);
1762
Alex Elder37206ee2013-02-20 17:32:08 -06001763 dout("%s: \"%s\" %llu/%llu %d -> obj %p\n", __func__, object_name,
1764 offset, length, (int)type, obj_request);
1765
Alex Elderbf0d5f502012-11-22 00:00:08 -06001766 return obj_request;
1767}
1768
1769static void rbd_obj_request_destroy(struct kref *kref)
1770{
1771 struct rbd_obj_request *obj_request;
1772
1773 obj_request = container_of(kref, struct rbd_obj_request, kref);
1774
Alex Elder37206ee2013-02-20 17:32:08 -06001775 dout("%s: obj %p\n", __func__, obj_request);
1776
Alex Elderbf0d5f502012-11-22 00:00:08 -06001777 rbd_assert(obj_request->img_request == NULL);
1778 rbd_assert(obj_request->which == BAD_WHICH);
1779
1780 if (obj_request->osd_req)
1781 rbd_osd_req_destroy(obj_request->osd_req);
1782
1783 rbd_assert(obj_request_type_valid(obj_request->type));
1784 switch (obj_request->type) {
Alex Elder9969ebc2013-01-18 12:31:10 -06001785 case OBJ_REQUEST_NODATA:
1786 break; /* Nothing to do */
Alex Elderbf0d5f502012-11-22 00:00:08 -06001787 case OBJ_REQUEST_BIO:
1788 if (obj_request->bio_list)
1789 bio_chain_put(obj_request->bio_list);
1790 break;
Alex Elder788e2df2013-01-17 12:25:27 -06001791 case OBJ_REQUEST_PAGES:
1792 if (obj_request->pages)
1793 ceph_release_page_vector(obj_request->pages,
1794 obj_request->page_count);
1795 break;
Alex Elderbf0d5f502012-11-22 00:00:08 -06001796 }
1797
1798 kfree(obj_request);
1799}
1800
1801/*
1802 * Caller is responsible for filling in the list of object requests
1803 * that comprises the image request, and the Linux request pointer
1804 * (if there is one).
1805 */
Alex Eldercc344fa2013-02-19 12:25:56 -06001806static struct rbd_img_request *rbd_img_request_create(
1807 struct rbd_device *rbd_dev,
Alex Elderbf0d5f502012-11-22 00:00:08 -06001808 u64 offset, u64 length,
Alex Elder9849e982013-01-24 16:13:36 -06001809 bool write_request,
1810 bool child_request)
Alex Elderbf0d5f502012-11-22 00:00:08 -06001811{
1812 struct rbd_img_request *img_request;
Alex Elderbf0d5f502012-11-22 00:00:08 -06001813
1814 img_request = kmalloc(sizeof (*img_request), GFP_ATOMIC);
1815 if (!img_request)
1816 return NULL;
1817
1818 if (write_request) {
1819 down_read(&rbd_dev->header_rwsem);
Alex Elder812164f82013-04-30 00:44:32 -05001820 ceph_get_snap_context(rbd_dev->header.snapc);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001821 up_read(&rbd_dev->header_rwsem);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001822 }
1823
1824 img_request->rq = NULL;
1825 img_request->rbd_dev = rbd_dev;
1826 img_request->offset = offset;
1827 img_request->length = length;
Alex Elder0c425242013-02-08 09:55:49 -06001828 img_request->flags = 0;
1829 if (write_request) {
1830 img_request_write_set(img_request);
Alex Elder468521c2013-04-26 09:43:47 -05001831 img_request->snapc = rbd_dev->header.snapc;
Alex Elder0c425242013-02-08 09:55:49 -06001832 } else {
Alex Elderbf0d5f502012-11-22 00:00:08 -06001833 img_request->snap_id = rbd_dev->spec->snap_id;
Alex Elder0c425242013-02-08 09:55:49 -06001834 }
Alex Elder9849e982013-01-24 16:13:36 -06001835 if (child_request)
1836 img_request_child_set(img_request);
Alex Elderd0b2e942013-01-24 16:13:36 -06001837 if (rbd_dev->parent_spec)
1838 img_request_layered_set(img_request);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001839 spin_lock_init(&img_request->completion_lock);
1840 img_request->next_completion = 0;
1841 img_request->callback = NULL;
Alex Eldera5a337d2013-01-24 16:13:36 -06001842 img_request->result = 0;
Alex Elderbf0d5f502012-11-22 00:00:08 -06001843 img_request->obj_request_count = 0;
1844 INIT_LIST_HEAD(&img_request->obj_requests);
1845 kref_init(&img_request->kref);
1846
1847 rbd_img_request_get(img_request); /* Avoid a warning */
1848 rbd_img_request_put(img_request); /* TEMPORARY */
1849
Alex Elder37206ee2013-02-20 17:32:08 -06001850 dout("%s: rbd_dev %p %s %llu/%llu -> img %p\n", __func__, rbd_dev,
1851 write_request ? "write" : "read", offset, length,
1852 img_request);
1853
Alex Elderbf0d5f502012-11-22 00:00:08 -06001854 return img_request;
1855}
1856
1857static void rbd_img_request_destroy(struct kref *kref)
1858{
1859 struct rbd_img_request *img_request;
1860 struct rbd_obj_request *obj_request;
1861 struct rbd_obj_request *next_obj_request;
1862
1863 img_request = container_of(kref, struct rbd_img_request, kref);
1864
Alex Elder37206ee2013-02-20 17:32:08 -06001865 dout("%s: img %p\n", __func__, img_request);
1866
Alex Elderbf0d5f502012-11-22 00:00:08 -06001867 for_each_obj_request_safe(img_request, obj_request, next_obj_request)
1868 rbd_img_obj_request_del(img_request, obj_request);
Alex Elder25dcf952013-01-25 17:08:55 -06001869 rbd_assert(img_request->obj_request_count == 0);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001870
Alex Elder0c425242013-02-08 09:55:49 -06001871 if (img_request_write_test(img_request))
Alex Elder812164f82013-04-30 00:44:32 -05001872 ceph_put_snap_context(img_request->snapc);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001873
Alex Elder8b3e1a52013-01-24 16:13:36 -06001874 if (img_request_child_test(img_request))
1875 rbd_obj_request_put(img_request->obj_request);
1876
Alex Elderbf0d5f502012-11-22 00:00:08 -06001877 kfree(img_request);
1878}
1879
/*
 * Finish one object request belonging to an image request: fold
 * its result into the image request, drop ownership of any page
 * array, and either complete the corresponding portion of the
 * block-layer request or (for a child image request) determine
 * whether object requests remain.  Returns true if the image
 * request is not yet fully completed.
 */
static bool rbd_img_obj_end_request(struct rbd_obj_request *obj_request)
{
	struct rbd_img_request *img_request;
	unsigned int xferred;
	int result;
	bool more;

	rbd_assert(obj_request_img_data_test(obj_request));
	img_request = obj_request->img_request;

	/* blk_end_request() below takes an unsigned int count */
	rbd_assert(obj_request->xferred <= (u64)UINT_MAX);
	xferred = (unsigned int)obj_request->xferred;
	result = obj_request->result;
	if (result) {
		struct rbd_device *rbd_dev = img_request->rbd_dev;

		rbd_warn(rbd_dev, "%s %llx at %llx (%llx)\n",
			img_request_write_test(img_request) ? "write" : "read",
			obj_request->length, obj_request->img_offset,
			obj_request->offset);
		rbd_warn(rbd_dev, "  result %d xferred %x\n",
			result, xferred);
		/* Only the first error is recorded for the image request */
		if (!img_request->result)
			img_request->result = result;
	}

	/* Image object requests don't own their page array */

	if (obj_request->type == OBJ_REQUEST_PAGES) {
		obj_request->pages = NULL;
		obj_request->page_count = 0;
	}

	if (img_request_child_test(img_request)) {
		/* A child image request has no block-layer request */
		rbd_assert(img_request->obj_request != NULL);
		more = obj_request->which < img_request->obj_request_count - 1;
	} else {
		rbd_assert(img_request->rq != NULL);
		more = blk_end_request(img_request->rq, result, xferred);
	}

	return more;
}
1923
/*
 * Per-object completion callback for image requests.  Object
 * requests may complete in any order, but their portions of the
 * block-layer request must be completed in order.  Under the
 * image request's completion lock, advance next_completion past
 * every consecutively-completed object request starting from the
 * current one; an out-of-order completion just records itself
 * (via its done flag) and returns.
 */
static void rbd_img_obj_callback(struct rbd_obj_request *obj_request)
{
	struct rbd_img_request *img_request;
	u32 which = obj_request->which;
	bool more = true;

	rbd_assert(obj_request_img_data_test(obj_request));
	img_request = obj_request->img_request;

	dout("%s: img %p obj %p\n", __func__, img_request, obj_request);
	rbd_assert(img_request != NULL);
	rbd_assert(img_request->obj_request_count > 0);
	rbd_assert(which != BAD_WHICH);
	rbd_assert(which < img_request->obj_request_count);
	rbd_assert(which >= img_request->next_completion);

	spin_lock_irq(&img_request->completion_lock);
	/* Out of order; an earlier request's completion will reach us */
	if (which != img_request->next_completion)
		goto out;

	for_each_obj_request_from(img_request, obj_request) {
		rbd_assert(more);
		rbd_assert(which < img_request->obj_request_count);

		if (!obj_request_done_test(obj_request))
			break;
		more = rbd_img_obj_end_request(obj_request);
		which++;
	}

	/* "more" is false exactly when all object requests are done */
	rbd_assert(more ^ (which == img_request->obj_request_count));
	img_request->next_completion = which;
out:
	spin_unlock_irq(&img_request->completion_lock);

	if (!more)
		rbd_img_request_complete(img_request);
}
1962
Alex Elderf1a47392013-04-19 15:34:50 -05001963/*
1964 * Split up an image request into one or more object requests, each
1965 * to a different object. The "type" parameter indicates whether
1966 * "data_desc" is the pointer to the head of a list of bio
1967 * structures, or the base of a page array. In either case this
1968 * function assumes data_desc describes memory sufficient to hold
1969 * all data described by the image request.
1970 */
1971static int rbd_img_request_fill(struct rbd_img_request *img_request,
1972 enum obj_request_type type,
1973 void *data_desc)
Alex Elderbf0d5f502012-11-22 00:00:08 -06001974{
1975 struct rbd_device *rbd_dev = img_request->rbd_dev;
1976 struct rbd_obj_request *obj_request = NULL;
1977 struct rbd_obj_request *next_obj_request;
Alex Elder0c425242013-02-08 09:55:49 -06001978 bool write_request = img_request_write_test(img_request);
Alex Elderf1a47392013-04-19 15:34:50 -05001979 struct bio *bio_list;
1980 unsigned int bio_offset = 0;
1981 struct page **pages;
Alex Elder7da22d22013-01-24 16:13:36 -06001982 u64 img_offset;
Alex Elderbf0d5f502012-11-22 00:00:08 -06001983 u64 resid;
1984 u16 opcode;
1985
Alex Elderf1a47392013-04-19 15:34:50 -05001986 dout("%s: img %p type %d data_desc %p\n", __func__, img_request,
1987 (int)type, data_desc);
Alex Elder37206ee2013-02-20 17:32:08 -06001988
Alex Elder430c28c2013-04-03 21:32:51 -05001989 opcode = write_request ? CEPH_OSD_OP_WRITE : CEPH_OSD_OP_READ;
Alex Elder7da22d22013-01-24 16:13:36 -06001990 img_offset = img_request->offset;
Alex Elderbf0d5f502012-11-22 00:00:08 -06001991 resid = img_request->length;
Alex Elder4dda41d2013-02-20 21:59:33 -06001992 rbd_assert(resid > 0);
Alex Elderf1a47392013-04-19 15:34:50 -05001993
1994 if (type == OBJ_REQUEST_BIO) {
1995 bio_list = data_desc;
1996 rbd_assert(img_offset == bio_list->bi_sector << SECTOR_SHIFT);
1997 } else {
1998 rbd_assert(type == OBJ_REQUEST_PAGES);
1999 pages = data_desc;
2000 }
2001
Alex Elderbf0d5f502012-11-22 00:00:08 -06002002 while (resid) {
Alex Elder2fa12322013-04-05 01:27:12 -05002003 struct ceph_osd_request *osd_req;
Alex Elderbf0d5f502012-11-22 00:00:08 -06002004 const char *object_name;
Alex Elderbf0d5f502012-11-22 00:00:08 -06002005 u64 offset;
2006 u64 length;
2007
Alex Elder7da22d22013-01-24 16:13:36 -06002008 object_name = rbd_segment_name(rbd_dev, img_offset);
Alex Elderbf0d5f502012-11-22 00:00:08 -06002009 if (!object_name)
2010 goto out_unwind;
Alex Elder7da22d22013-01-24 16:13:36 -06002011 offset = rbd_segment_offset(rbd_dev, img_offset);
2012 length = rbd_segment_length(rbd_dev, img_offset, resid);
Alex Elderbf0d5f502012-11-22 00:00:08 -06002013 obj_request = rbd_obj_request_create(object_name,
Alex Elderf1a47392013-04-19 15:34:50 -05002014 offset, length, type);
Alex Elderbf0d5f502012-11-22 00:00:08 -06002015 kfree(object_name); /* object request has its own copy */
2016 if (!obj_request)
2017 goto out_unwind;
2018
Alex Elderf1a47392013-04-19 15:34:50 -05002019 if (type == OBJ_REQUEST_BIO) {
2020 unsigned int clone_size;
2021
2022 rbd_assert(length <= (u64)UINT_MAX);
2023 clone_size = (unsigned int)length;
2024 obj_request->bio_list =
2025 bio_chain_clone_range(&bio_list,
2026 &bio_offset,
2027 clone_size,
2028 GFP_ATOMIC);
2029 if (!obj_request->bio_list)
2030 goto out_partial;
2031 } else {
2032 unsigned int page_count;
2033
2034 obj_request->pages = pages;
2035 page_count = (u32)calc_pages_for(offset, length);
2036 obj_request->page_count = page_count;
2037 if ((offset + length) & ~PAGE_MASK)
2038 page_count--; /* more on last page */
2039 pages += page_count;
2040 }
Alex Elderbf0d5f502012-11-22 00:00:08 -06002041
Alex Elder2fa12322013-04-05 01:27:12 -05002042 osd_req = rbd_osd_req_create(rbd_dev, write_request,
2043 obj_request);
2044 if (!osd_req)
Alex Elderbf0d5f502012-11-22 00:00:08 -06002045 goto out_partial;
Alex Elder2fa12322013-04-05 01:27:12 -05002046 obj_request->osd_req = osd_req;
Alex Elder21692382013-04-05 01:27:12 -05002047 obj_request->callback = rbd_img_obj_callback;
Alex Elder430c28c2013-04-03 21:32:51 -05002048
Alex Elder2fa12322013-04-05 01:27:12 -05002049 osd_req_op_extent_init(osd_req, 0, opcode, offset, length,
2050 0, 0);
Alex Elderf1a47392013-04-19 15:34:50 -05002051 if (type == OBJ_REQUEST_BIO)
2052 osd_req_op_extent_osd_data_bio(osd_req, 0,
2053 obj_request->bio_list, length);
2054 else
2055 osd_req_op_extent_osd_data_pages(osd_req, 0,
2056 obj_request->pages, length,
2057 offset & ~PAGE_MASK, false, false);
Alex Elder9d4df012013-04-19 15:34:50 -05002058
2059 if (write_request)
2060 rbd_osd_req_format_write(obj_request);
2061 else
2062 rbd_osd_req_format_read(obj_request);
Alex Elder430c28c2013-04-03 21:32:51 -05002063
Alex Elder7da22d22013-01-24 16:13:36 -06002064 obj_request->img_offset = img_offset;
Alex Elderbf0d5f502012-11-22 00:00:08 -06002065 rbd_img_obj_request_add(img_request, obj_request);
2066
Alex Elder7da22d22013-01-24 16:13:36 -06002067 img_offset += length;
Alex Elderbf0d5f502012-11-22 00:00:08 -06002068 resid -= length;
2069 }
2070
2071 return 0;
2072
2073out_partial:
2074 rbd_obj_request_put(obj_request);
2075out_unwind:
2076 for_each_obj_request_safe(img_request, obj_request, next_obj_request)
2077 rbd_obj_request_put(obj_request);
2078
2079 return -ENOMEM;
2080}
2081
Alex Elder3d7efd12013-04-19 15:34:50 -05002082static void
Alex Elder0eefd472013-04-19 15:34:50 -05002083rbd_img_obj_copyup_callback(struct rbd_obj_request *obj_request)
2084{
2085 struct rbd_img_request *img_request;
2086 struct rbd_device *rbd_dev;
2087 u64 length;
2088 u32 page_count;
2089
2090 rbd_assert(obj_request->type == OBJ_REQUEST_BIO);
2091 rbd_assert(obj_request_img_data_test(obj_request));
2092 img_request = obj_request->img_request;
2093 rbd_assert(img_request);
2094
2095 rbd_dev = img_request->rbd_dev;
2096 rbd_assert(rbd_dev);
2097 length = (u64)1 << rbd_dev->header.obj_order;
2098 page_count = (u32)calc_pages_for(0, length);
2099
2100 rbd_assert(obj_request->copyup_pages);
2101 ceph_release_page_vector(obj_request->copyup_pages, page_count);
2102 obj_request->copyup_pages = NULL;
2103
2104 /*
2105 * We want the transfer count to reflect the size of the
2106 * original write request. There is no such thing as a
2107 * successful short write, so if the request was successful
2108 * we can just set it to the originally-requested length.
2109 */
2110 if (!obj_request->result)
2111 obj_request->xferred = obj_request->length;
2112
2113 /* Finish up with the normal image object callback */
2114
2115 rbd_img_obj_callback(obj_request);
2116}
2117
/*
 * Completion callback for the parent-image read issued on behalf
 * of a layered write whose target object does not exist.  Takes
 * over the page array holding the parent data, builds a two-op
 * copyup osd request (a "copyup" method call followed by the
 * original write) for the original object request, and submits
 * it.  On any failure the original request is completed with the
 * error recorded.
 */
static void
rbd_img_obj_parent_read_full_callback(struct rbd_img_request *img_request)
{
	struct rbd_obj_request *orig_request;
	struct ceph_osd_request *osd_req;
	struct ceph_osd_client *osdc;
	struct rbd_device *rbd_dev;
	struct page **pages;
	int result;
	u64 obj_size;
	u64 xferred;

	rbd_assert(img_request_child_test(img_request));

	/* First get what we need from the image request */

	pages = img_request->copyup_pages;
	rbd_assert(pages != NULL);
	img_request->copyup_pages = NULL;

	orig_request = img_request->obj_request;
	rbd_assert(orig_request != NULL);
	rbd_assert(orig_request->type == OBJ_REQUEST_BIO);
	result = img_request->result;
	obj_size = img_request->length;
	xferred = img_request->xferred;

	rbd_dev = img_request->rbd_dev;
	rbd_assert(rbd_dev);
	/* The parent read was sized to cover one full object */
	rbd_assert(obj_size == (u64)1 << rbd_dev->header.obj_order);

	/* Done with the parent read request itself */
	rbd_img_request_put(img_request);

	if (result)
		goto out_err;

	/* Allocate the new copyup osd request for the original request */

	result = -ENOMEM;
	/* The original osd request was released before the parent read */
	rbd_assert(!orig_request->osd_req);
	osd_req = rbd_osd_req_create_copyup(orig_request);
	if (!osd_req)
		goto out_err;
	orig_request->osd_req = osd_req;
	/* The original request now owns (and will free) the pages */
	orig_request->copyup_pages = pages;

	/* Initialize the copyup op */

	osd_req_op_cls_init(osd_req, 0, CEPH_OSD_OP_CALL, "rbd", "copyup");
	osd_req_op_cls_request_data_pages(osd_req, 0, pages, obj_size, 0,
						false, false);

	/* Then the original write request op */

	osd_req_op_extent_init(osd_req, 1, CEPH_OSD_OP_WRITE,
					orig_request->offset,
					orig_request->length, 0, 0);
	osd_req_op_extent_osd_data_bio(osd_req, 1, orig_request->bio_list,
					orig_request->length);

	rbd_osd_req_format_write(orig_request);

	/* All set, send it off. */

	orig_request->callback = rbd_img_obj_copyup_callback;
	osdc = &rbd_dev->rbd_client->client->osdc;
	result = rbd_obj_request_submit(osdc, orig_request);
	if (!result)
		return;
out_err:
	/* Record the error code and complete the request */

	orig_request->result = result;
	orig_request->xferred = 0;
	obj_request_done_set(orig_request);
	rbd_obj_request_complete(orig_request);
}
2195
/*
 * Read from the parent image the range of data that covers the
 * entire target of the given object request.  This is used for
 * satisfying a layered image write request when the target of an
 * object request from the image request does not exist.
 *
 * A page array big enough to hold the returned data is allocated
 * and supplied to rbd_img_request_fill() as the "data descriptor."
 * When the read completes, this page array will be transferred to
 * the original object request for the copyup operation.
 *
 * If an error occurs, record it as the result of the original
 * object request and mark it done so it gets completed.
 */
static int rbd_img_obj_parent_read_full(struct rbd_obj_request *obj_request)
{
	struct rbd_img_request *img_request = NULL;
	struct rbd_img_request *parent_request = NULL;
	struct rbd_device *rbd_dev;
	u64 img_offset;
	u64 length;
	struct page **pages = NULL;
	u32 page_count;
	int result;

	rbd_assert(obj_request_img_data_test(obj_request));
	rbd_assert(obj_request->type == OBJ_REQUEST_BIO);

	img_request = obj_request->img_request;
	rbd_assert(img_request != NULL);
	rbd_dev = img_request->rbd_dev;
	rbd_assert(rbd_dev->parent != NULL);

	/*
	 * First things first.  The original osd request is of no
	 * use to use any more, we'll need a new one that can hold
	 * the two ops in a copyup request.  We'll get that later,
	 * but for now we can release the old one.
	 */
	rbd_osd_req_destroy(obj_request->osd_req);
	obj_request->osd_req = NULL;

	/*
	 * Determine the byte range covered by the object in the
	 * child image to which the original request was to be sent.
	 */
	img_offset = obj_request->img_offset - obj_request->offset;
	length = (u64)1 << rbd_dev->header.obj_order;

	/*
	 * There is no defined parent data beyond the parent
	 * overlap, so limit what we read at that boundary if
	 * necessary.
	 */
	if (img_offset + length > rbd_dev->parent_overlap) {
		rbd_assert(img_offset < rbd_dev->parent_overlap);
		length = rbd_dev->parent_overlap - img_offset;
	}

	/*
	 * Allocate a page array big enough to receive the data read
	 * from the parent.
	 */
	page_count = (u32)calc_pages_for(0, length);
	pages = ceph_alloc_page_vector(page_count, GFP_KERNEL);
	if (IS_ERR(pages)) {
		result = PTR_ERR(pages);
		pages = NULL;
		goto out_err;
	}

	result = -ENOMEM;
	/* Read (not write) child request against the parent image */
	parent_request = rbd_img_request_create(rbd_dev->parent,
						img_offset, length,
						false, true);
	if (!parent_request)
		goto out_err;
	/* The parent read holds a reference on the original request */
	rbd_obj_request_get(obj_request);
	parent_request->obj_request = obj_request;

	result = rbd_img_request_fill(parent_request, OBJ_REQUEST_PAGES, pages);
	if (result)
		goto out_err;
	/* Ownership of the pages passes to the completion callback */
	parent_request->copyup_pages = pages;

	parent_request->callback = rbd_img_obj_parent_read_full_callback;
	result = rbd_img_request_submit(parent_request);
	if (!result)
		return 0;

	/* Submit failed; undo the page handoff and extra reference */
	parent_request->copyup_pages = NULL;
	parent_request->obj_request = NULL;
	rbd_obj_request_put(obj_request);
out_err:
	if (pages)
		ceph_release_page_vector(pages, page_count);
	if (parent_request)
		rbd_img_request_put(parent_request);
	obj_request->result = result;
	obj_request->xferred = 0;
	obj_request_done_set(obj_request);

	return result;
}
2300
Alex Elderc5b5ef62013-02-11 12:33:24 -06002301static void rbd_img_obj_exists_callback(struct rbd_obj_request *obj_request)
2302{
Alex Elderc5b5ef62013-02-11 12:33:24 -06002303 struct rbd_obj_request *orig_request;
2304 int result;
2305
2306 rbd_assert(!obj_request_img_data_test(obj_request));
2307
2308 /*
2309 * All we need from the object request is the original
2310 * request and the result of the STAT op. Grab those, then
2311 * we're done with the request.
2312 */
2313 orig_request = obj_request->obj_request;
2314 obj_request->obj_request = NULL;
2315 rbd_assert(orig_request);
2316 rbd_assert(orig_request->img_request);
2317
2318 result = obj_request->result;
2319 obj_request->result = 0;
2320
2321 dout("%s: obj %p for obj %p result %d %llu/%llu\n", __func__,
2322 obj_request, orig_request, result,
2323 obj_request->xferred, obj_request->length);
2324 rbd_obj_request_put(obj_request);
2325
2326 rbd_assert(orig_request);
2327 rbd_assert(orig_request->img_request);
Alex Elderc5b5ef62013-02-11 12:33:24 -06002328
2329 /*
2330 * Our only purpose here is to determine whether the object
2331 * exists, and we don't want to treat the non-existence as
2332 * an error. If something else comes back, transfer the
2333 * error to the original request and complete it now.
2334 */
2335 if (!result) {
2336 obj_request_existence_set(orig_request, true);
2337 } else if (result == -ENOENT) {
2338 obj_request_existence_set(orig_request, false);
2339 } else if (result) {
2340 orig_request->result = result;
Alex Elder3d7efd12013-04-19 15:34:50 -05002341 goto out;
Alex Elderc5b5ef62013-02-11 12:33:24 -06002342 }
2343
2344 /*
2345 * Resubmit the original request now that we have recorded
2346 * whether the target object exists.
2347 */
Alex Elderb454e362013-04-19 15:34:50 -05002348 orig_request->result = rbd_img_obj_request_submit(orig_request);
Alex Elder3d7efd12013-04-19 15:34:50 -05002349out:
Alex Elderc5b5ef62013-02-11 12:33:24 -06002350 if (orig_request->result)
2351 rbd_obj_request_complete(orig_request);
2352 rbd_obj_request_put(orig_request);
2353}
2354
2355static int rbd_img_obj_exists_submit(struct rbd_obj_request *obj_request)
2356{
2357 struct rbd_obj_request *stat_request;
2358 struct rbd_device *rbd_dev;
2359 struct ceph_osd_client *osdc;
2360 struct page **pages = NULL;
2361 u32 page_count;
2362 size_t size;
2363 int ret;
2364
2365 /*
2366 * The response data for a STAT call consists of:
2367 * le64 length;
2368 * struct {
2369 * le32 tv_sec;
2370 * le32 tv_nsec;
2371 * } mtime;
2372 */
2373 size = sizeof (__le64) + sizeof (__le32) + sizeof (__le32);
2374 page_count = (u32)calc_pages_for(0, size);
2375 pages = ceph_alloc_page_vector(page_count, GFP_KERNEL);
2376 if (IS_ERR(pages))
2377 return PTR_ERR(pages);
2378
2379 ret = -ENOMEM;
2380 stat_request = rbd_obj_request_create(obj_request->object_name, 0, 0,
2381 OBJ_REQUEST_PAGES);
2382 if (!stat_request)
2383 goto out;
2384
2385 rbd_obj_request_get(obj_request);
2386 stat_request->obj_request = obj_request;
2387 stat_request->pages = pages;
2388 stat_request->page_count = page_count;
2389
2390 rbd_assert(obj_request->img_request);
2391 rbd_dev = obj_request->img_request->rbd_dev;
2392 stat_request->osd_req = rbd_osd_req_create(rbd_dev, false,
2393 stat_request);
2394 if (!stat_request->osd_req)
2395 goto out;
2396 stat_request->callback = rbd_img_obj_exists_callback;
2397
2398 osd_req_op_init(stat_request->osd_req, 0, CEPH_OSD_OP_STAT);
2399 osd_req_op_raw_data_in_pages(stat_request->osd_req, 0, pages, size, 0,
2400 false, false);
Alex Elder9d4df012013-04-19 15:34:50 -05002401 rbd_osd_req_format_read(stat_request);
Alex Elderc5b5ef62013-02-11 12:33:24 -06002402
2403 osdc = &rbd_dev->rbd_client->client->osdc;
2404 ret = rbd_obj_request_submit(osdc, stat_request);
2405out:
2406 if (ret)
2407 rbd_obj_request_put(obj_request);
2408
2409 return ret;
2410}
2411
/*
 * Submit one object request belonging to an image request.  Simple
 * requests (reads, non-layered writes, writes beyond the parent
 * overlap, and writes to objects known to exist) go straight to the
 * osd.  A layered write to an object whose existence is unknown is
 * routed through a STAT probe; one known not to exist is routed
 * through a full read of the parent range for a later copyup.
 */
static int rbd_img_obj_request_submit(struct rbd_obj_request *obj_request)
{
	struct rbd_img_request *img_request;
	struct rbd_device *rbd_dev;
	bool known;

	rbd_assert(obj_request_img_data_test(obj_request));

	img_request = obj_request->img_request;
	rbd_assert(img_request);
	rbd_dev = img_request->rbd_dev;

	/*
	 * Only writes to layered images need special handling.
	 * Reads and non-layered writes are simple object requests.
	 * Layered writes that start beyond the end of the overlap
	 * with the parent have no parent data, so they too are
	 * simple object requests.  Finally, if the target object is
	 * known to already exist, its parent data has already been
	 * copied, so a write to the object can also be handled as a
	 * simple object request.
	 */
	if (!img_request_write_test(img_request) ||
		!img_request_layered_test(img_request) ||
		rbd_dev->parent_overlap <= obj_request->img_offset ||
		((known = obj_request_known_test(obj_request)) &&
			obj_request_exists_test(obj_request))) {

		struct rbd_device *rbd_dev;	/* shadows the outer rbd_dev */
		struct ceph_osd_client *osdc;

		rbd_dev = obj_request->img_request->rbd_dev;
		osdc = &rbd_dev->rbd_client->client->osdc;

		return rbd_obj_request_submit(osdc, obj_request);
	}

	/*
	 * It's a layered write.  The target object might exist but
	 * we may not know that yet.  If we know it doesn't exist,
	 * start by reading the data for the full target object from
	 * the parent so we can use it for a copyup to the target.
	 *
	 * Note: reaching this point means every disjunct above was
	 * false, so the embedded assignment to "known" was in fact
	 * evaluated -- "known" is not read uninitialized here.
	 */
	if (known)
		return rbd_img_obj_parent_read_full(obj_request);

	/* We don't know whether the target exists.  Go find out. */

	return rbd_img_obj_exists_submit(obj_request);
}
2462
Alex Elderbf0d5f502012-11-22 00:00:08 -06002463static int rbd_img_request_submit(struct rbd_img_request *img_request)
2464{
Alex Elderbf0d5f502012-11-22 00:00:08 -06002465 struct rbd_obj_request *obj_request;
Alex Elder46faeed2013-04-10 17:47:46 -05002466 struct rbd_obj_request *next_obj_request;
Alex Elderbf0d5f502012-11-22 00:00:08 -06002467
Alex Elder37206ee2013-02-20 17:32:08 -06002468 dout("%s: img %p\n", __func__, img_request);
Alex Elder46faeed2013-04-10 17:47:46 -05002469 for_each_obj_request_safe(img_request, obj_request, next_obj_request) {
Alex Elderbf0d5f502012-11-22 00:00:08 -06002470 int ret;
2471
Alex Elderb454e362013-04-19 15:34:50 -05002472 ret = rbd_img_obj_request_submit(obj_request);
Alex Elderbf0d5f502012-11-22 00:00:08 -06002473 if (ret)
2474 return ret;
Alex Elderbf0d5f502012-11-22 00:00:08 -06002475 }
2476
2477 return 0;
2478}
2479
/*
 * Completion callback for a child image request issued by
 * rbd_img_parent_read().  Propagates the result to the original
 * object request, clipping the transfer count at the parent overlap
 * so data beyond the overlap gets zeroed by the read callback.
 */
static void rbd_img_parent_read_callback(struct rbd_img_request *img_request)
{
	struct rbd_obj_request *obj_request;
	struct rbd_device *rbd_dev;
	u64 obj_end;

	rbd_assert(img_request_child_test(img_request));

	obj_request = img_request->obj_request;
	rbd_assert(obj_request);
	rbd_assert(obj_request->img_request);

	obj_request->result = img_request->result;
	if (obj_request->result)
		goto out;

	/*
	 * We need to zero anything beyond the parent overlap
	 * boundary.  Since rbd_img_obj_request_read_callback()
	 * will zero anything beyond the end of a short read, an
	 * easy way to do this is to pretend the data from the
	 * parent came up short--ending at the overlap boundary.
	 */
	rbd_assert(obj_request->img_offset < U64_MAX - obj_request->length);
	obj_end = obj_request->img_offset + obj_request->length;
	rbd_dev = obj_request->img_request->rbd_dev;
	if (obj_end > rbd_dev->parent_overlap) {
		u64 xferred = 0;

		/* Only the portion below the overlap holds real data. */
		if (obj_request->img_offset < rbd_dev->parent_overlap)
			xferred = rbd_dev->parent_overlap -
					obj_request->img_offset;

		obj_request->xferred = min(img_request->xferred, xferred);
	} else {
		obj_request->xferred = img_request->xferred;
	}
out:
	rbd_img_obj_request_read_callback(obj_request);
	rbd_obj_request_complete(obj_request);
}
2521
/*
 * Satisfy an object read whose target object does not exist (result
 * was -ENOENT) by issuing an equivalent read to the parent image.
 * On failure the error is recorded on the object request and it is
 * marked done.
 */
static void rbd_img_parent_read(struct rbd_obj_request *obj_request)
{
	struct rbd_device *rbd_dev;
	struct rbd_img_request *img_request;
	int result;

	rbd_assert(obj_request_img_data_test(obj_request));
	rbd_assert(obj_request->img_request != NULL);
	rbd_assert(obj_request->result == (s32) -ENOENT);
	rbd_assert(obj_request->type == OBJ_REQUEST_BIO);

	rbd_dev = obj_request->img_request->rbd_dev;
	rbd_assert(rbd_dev->parent != NULL);
	/* rbd_read_finish(obj_request, obj_request->length); */
	/* Build a child (read-only) image request against the parent. */
	img_request = rbd_img_request_create(rbd_dev->parent,
						obj_request->img_offset,
						obj_request->length,
						false, true);
	result = -ENOMEM;
	if (!img_request)
		goto out_err;

	/* The child request holds a reference to the original request. */
	rbd_obj_request_get(obj_request);
	img_request->obj_request = obj_request;

	/* Reuse the original request's bio chain as the data target. */
	result = rbd_img_request_fill(img_request, OBJ_REQUEST_BIO,
					obj_request->bio_list);
	if (result)
		goto out_err;

	img_request->callback = rbd_img_parent_read_callback;
	result = rbd_img_request_submit(img_request);
	if (result)
		goto out_err;

	return;
out_err:
	if (img_request)
		rbd_img_request_put(img_request);
	obj_request->result = result;
	obj_request->xferred = 0;
	obj_request_done_set(obj_request);
}
2565
Alex Eldercc4a38bd2013-04-30 00:44:33 -05002566static int rbd_obj_notify_ack(struct rbd_device *rbd_dev, u64 notify_id)
Alex Elderb8d70032012-11-30 17:53:04 -06002567{
2568 struct rbd_obj_request *obj_request;
Alex Elder21692382013-04-05 01:27:12 -05002569 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
Alex Elderb8d70032012-11-30 17:53:04 -06002570 int ret;
2571
2572 obj_request = rbd_obj_request_create(rbd_dev->header_name, 0, 0,
2573 OBJ_REQUEST_NODATA);
2574 if (!obj_request)
2575 return -ENOMEM;
2576
2577 ret = -ENOMEM;
Alex Elder430c28c2013-04-03 21:32:51 -05002578 obj_request->osd_req = rbd_osd_req_create(rbd_dev, false, obj_request);
Alex Elderb8d70032012-11-30 17:53:04 -06002579 if (!obj_request->osd_req)
2580 goto out;
Alex Elder21692382013-04-05 01:27:12 -05002581 obj_request->callback = rbd_obj_request_put;
Alex Elderb8d70032012-11-30 17:53:04 -06002582
Alex Elderc99d2d42013-04-05 01:27:11 -05002583 osd_req_op_watch_init(obj_request->osd_req, 0, CEPH_OSD_OP_NOTIFY_ACK,
Alex Eldercc4a38bd2013-04-30 00:44:33 -05002584 notify_id, 0, 0);
Alex Elder9d4df012013-04-19 15:34:50 -05002585 rbd_osd_req_format_read(obj_request);
Alex Elder430c28c2013-04-03 21:32:51 -05002586
Alex Elderb8d70032012-11-30 17:53:04 -06002587 ret = rbd_obj_request_submit(osdc, obj_request);
Alex Elderb8d70032012-11-30 17:53:04 -06002588out:
Alex Eldercf81b602013-01-17 12:18:46 -06002589 if (ret)
2590 rbd_obj_request_put(obj_request);
Alex Elderb8d70032012-11-30 17:53:04 -06002591
2592 return ret;
2593}
2594
2595static void rbd_watch_cb(u64 ver, u64 notify_id, u8 opcode, void *data)
2596{
2597 struct rbd_device *rbd_dev = (struct rbd_device *)data;
Alex Elderb8d70032012-11-30 17:53:04 -06002598
2599 if (!rbd_dev)
2600 return;
2601
Alex Elder37206ee2013-02-20 17:32:08 -06002602 dout("%s: \"%s\" notify_id %llu opcode %u\n", __func__,
Alex Eldercc4a38bd2013-04-30 00:44:33 -05002603 rbd_dev->header_name, (unsigned long long)notify_id,
2604 (unsigned int)opcode);
2605 (void)rbd_dev_refresh(rbd_dev);
Alex Elderb8d70032012-11-30 17:53:04 -06002606
Alex Eldercc4a38bd2013-04-30 00:44:33 -05002607 rbd_obj_notify_ack(rbd_dev, notify_id);
Alex Elderb8d70032012-11-30 17:53:04 -06002608}
2609
/*
 * Request sync osd watch/unwatch.  The value of "start" determines
 * whether a watch request is being initiated or torn down.
 *
 * Start: creates the watch event, registers a lingering WATCH
 * request, and stashes it in rbd_dev->watch_request.
 * Teardown: unregisters the lingering request, sends the unwatch,
 * drops the retained reference, and cancels the event.
 */
static int rbd_dev_header_watch_sync(struct rbd_device *rbd_dev, int start)
{
	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
	struct rbd_obj_request *obj_request;
	int ret;

	/* Exactly one of event/request must already exist iff tearing down */
	rbd_assert(start ^ !!rbd_dev->watch_event);
	rbd_assert(start ^ !!rbd_dev->watch_request);

	if (start) {
		ret = ceph_osdc_create_event(osdc, rbd_watch_cb, rbd_dev,
						&rbd_dev->watch_event);
		if (ret < 0)
			return ret;
		rbd_assert(rbd_dev->watch_event != NULL);
	}

	ret = -ENOMEM;
	obj_request = rbd_obj_request_create(rbd_dev->header_name, 0, 0,
							OBJ_REQUEST_NODATA);
	if (!obj_request)
		goto out_cancel;

	obj_request->osd_req = rbd_osd_req_create(rbd_dev, true, obj_request);
	if (!obj_request->osd_req)
		goto out_cancel;

	if (start)
		ceph_osdc_set_request_linger(osdc, obj_request->osd_req);
	else
		ceph_osdc_unregister_linger_request(osdc,
					rbd_dev->watch_request->osd_req);

	/* "start" doubles as the watch flag: 1 = watch, 0 = unwatch */
	osd_req_op_watch_init(obj_request->osd_req, 0, CEPH_OSD_OP_WATCH,
				rbd_dev->watch_event->cookie, 0, start);
	rbd_osd_req_format_write(obj_request);

	ret = rbd_obj_request_submit(osdc, obj_request);
	if (ret)
		goto out_cancel;
	ret = rbd_obj_request_wait(obj_request);
	if (ret)
		goto out_cancel;
	ret = obj_request->result;
	if (ret)
		goto out_cancel;

	/*
	 * A watch request is set to linger, so the underlying osd
	 * request won't go away until we unregister it.  We retain
	 * a pointer to the object request during that time (in
	 * rbd_dev->watch_request), so we'll keep a reference to
	 * it.  We'll drop that reference (below) after we've
	 * unregistered it.
	 */
	if (start) {
		rbd_dev->watch_request = obj_request;

		return 0;
	}

	/* We have successfully torn down the watch request */

	rbd_obj_request_put(rbd_dev->watch_request);
	rbd_dev->watch_request = NULL;
out_cancel:
	/* Cancel the event if we're tearing down, or on error */
	ceph_osdc_cancel_event(rbd_dev->watch_event);
	rbd_dev->watch_event = NULL;
	if (obj_request)
		rbd_obj_request_put(obj_request);

	return ret;
}
2688
/*
 * Synchronous osd object method call.  Returns the number of bytes
 * placed into the inbound buffer, or a negative error code.
 */
static int rbd_obj_method_sync(struct rbd_device *rbd_dev,
			     const char *object_name,
			     const char *class_name,
			     const char *method_name,
			     const void *outbound,
			     size_t outbound_size,
			     void *inbound,
			     size_t inbound_size)
{
	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
	struct rbd_obj_request *obj_request;
	struct page **pages;
	u32 page_count;
	int ret;

	/*
	 * Method calls are ultimately read operations.  The result
	 * should be placed into the inbound buffer provided.  They
	 * also supply outbound data--parameters for the object
	 * method.  Currently if this is present it will be a
	 * snapshot id.
	 */
	page_count = (u32)calc_pages_for(0, inbound_size);
	pages = ceph_alloc_page_vector(page_count, GFP_KERNEL);
	if (IS_ERR(pages))
		return PTR_ERR(pages);

	ret = -ENOMEM;
	obj_request = rbd_obj_request_create(object_name, 0, inbound_size,
							OBJ_REQUEST_PAGES);
	if (!obj_request)
		goto out;

	/* From here on, putting obj_request also releases the pages. */
	obj_request->pages = pages;
	obj_request->page_count = page_count;

	obj_request->osd_req = rbd_osd_req_create(rbd_dev, false, obj_request);
	if (!obj_request->osd_req)
		goto out;

	osd_req_op_cls_init(obj_request->osd_req, 0, CEPH_OSD_OP_CALL,
					class_name, method_name);
	if (outbound_size) {
		struct ceph_pagelist *pagelist;

		pagelist = kmalloc(sizeof (*pagelist), GFP_NOFS);
		if (!pagelist)
			goto out;

		ceph_pagelist_init(pagelist);
		/* NOTE(review): ceph_pagelist_append() return is ignored;
		 * an allocation failure here would go unnoticed -- verify. */
		ceph_pagelist_append(pagelist, outbound, outbound_size);
		osd_req_op_cls_request_data_pagelist(obj_request->osd_req, 0,
						pagelist);
	}
	osd_req_op_cls_response_data_pages(obj_request->osd_req, 0,
					obj_request->pages, inbound_size,
					0, false, false);
	rbd_osd_req_format_read(obj_request);

	ret = rbd_obj_request_submit(osdc, obj_request);
	if (ret)
		goto out;
	ret = rbd_obj_request_wait(obj_request);
	if (ret)
		goto out;

	ret = obj_request->result;
	if (ret < 0)
		goto out;

	/* Success: return byte count, after copying data to the caller. */
	rbd_assert(obj_request->xferred < (u64)INT_MAX);
	ret = (int)obj_request->xferred;
	ceph_copy_from_page_vector(pages, inbound, 0, obj_request->xferred);
out:
	if (obj_request)
		rbd_obj_request_put(obj_request);
	else
		ceph_release_page_vector(pages, page_count);

	return ret;
}
2774
/*
 * Block-layer request function.  Pulls requests off the queue and
 * turns each into an rbd image request.  The queue lock is held on
 * entry; it is dropped while building/submitting each request and
 * reacquired before the next iteration (see the sparse annotations).
 */
static void rbd_request_fn(struct request_queue *q)
		__releases(q->queue_lock) __acquires(q->queue_lock)
{
	struct rbd_device *rbd_dev = q->queuedata;
	bool read_only = rbd_dev->mapping.read_only;
	struct request *rq;
	int result;

	while ((rq = blk_fetch_request(q))) {
		bool write_request = rq_data_dir(rq) == WRITE;
		struct rbd_img_request *img_request;
		u64 offset;
		u64 length;

		/* Ignore any non-FS requests that filter through. */

		if (rq->cmd_type != REQ_TYPE_FS) {
			dout("%s: non-fs request type %d\n", __func__,
				(int) rq->cmd_type);
			__blk_end_request_all(rq, 0);
			continue;
		}

		/* Ignore/skip any zero-length requests */

		offset = (u64) blk_rq_pos(rq) << SECTOR_SHIFT;
		length = (u64) blk_rq_bytes(rq);

		if (!length) {
			dout("%s: zero-length request\n", __func__);
			__blk_end_request_all(rq, 0);
			continue;
		}

		/* Work below may sleep/allocate; drop the queue lock. */
		spin_unlock_irq(q->queue_lock);

		/* Disallow writes to a read-only device */

		if (write_request) {
			result = -EROFS;
			if (read_only)
				goto end_request;
			rbd_assert(rbd_dev->spec->snap_id == CEPH_NOSNAP);
		}

		/*
		 * Quit early if the mapped snapshot no longer
		 * exists.  It's still possible the snapshot will
		 * have disappeared by the time our request arrives
		 * at the osd, but there's no sense in sending it if
		 * we already know.
		 */
		if (!test_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags)) {
			dout("request for non-existent snapshot");
			rbd_assert(rbd_dev->spec->snap_id != CEPH_NOSNAP);
			result = -ENXIO;
			goto end_request;
		}

		/* Reject ranges that would wrap past the end of u64 space. */
		result = -EINVAL;
		if (offset && length > U64_MAX - offset + 1) {
			rbd_warn(rbd_dev, "bad request range (%llu~%llu)\n",
				offset, length);
			goto end_request;	/* Shouldn't happen */
		}

		result = -ENOMEM;
		img_request = rbd_img_request_create(rbd_dev, offset, length,
							write_request, false);
		if (!img_request)
			goto end_request;

		img_request->rq = rq;

		result = rbd_img_request_fill(img_request, OBJ_REQUEST_BIO,
						rq->bio);
		if (!result)
			result = rbd_img_request_submit(img_request);
		if (result)
			rbd_img_request_put(img_request);
end_request:
		spin_lock_irq(q->queue_lock);
		if (result < 0) {
			rbd_warn(rbd_dev, "%s %llx at %llx result %d\n",
				write_request ? "write" : "read",
				length, offset, result);

			/* Complete the request with the error; on success the
			 * image request's completion will end it instead. */
			__blk_end_request_all(rq, result);
		}
	}
}
2866
/*
 * a queue callback. Makes sure that we don't create a bio that spans across
 * multiple osd objects. One exception would be with a single page bios,
 * which we handle later at bio_chain_clone_range()
 */
static int rbd_merge_bvec(struct request_queue *q, struct bvec_merge_data *bmd,
			  struct bio_vec *bvec)
{
	struct rbd_device *rbd_dev = q->queuedata;
	sector_t sector_offset;
	sector_t sectors_per_obj;
	sector_t obj_sector_offset;
	int ret;

	/*
	 * Find how far into its rbd object the partition-relative
	 * bio start sector is to offset relative to the enclosing
	 * device.
	 */
	sector_offset = get_start_sect(bmd->bi_bdev) + bmd->bi_sector;
	sectors_per_obj = 1 << (rbd_dev->header.obj_order - SECTOR_SHIFT);
	/* sectors_per_obj is a power of two, so masking == modulo */
	obj_sector_offset = sector_offset & (sectors_per_obj - 1);

	/*
	 * Compute the number of bytes from that offset to the end
	 * of the object.  Account for what's already used by the bio.
	 */
	ret = (int) (sectors_per_obj - obj_sector_offset) << SECTOR_SHIFT;
	if (ret > bmd->bi_size)
		ret -= bmd->bi_size;
	else
		ret = 0;

	/*
	 * Don't send back more than was asked for.  And if the bio
	 * was empty, let the whole thing through because:  "Note
	 * that a block device *must* allow a single page to be
	 * added to an empty bio."
	 */
	rbd_assert(bvec->bv_len <= PAGE_SIZE);
	if (ret > (int) bvec->bv_len || !bmd->bi_size)
		ret = (int) bvec->bv_len;

	return ret;
}
2912
2913static void rbd_free_disk(struct rbd_device *rbd_dev)
2914{
2915 struct gendisk *disk = rbd_dev->disk;
2916
2917 if (!disk)
2918 return;
2919
Alex Eldera0cab922013-04-25 23:15:08 -05002920 rbd_dev->disk = NULL;
2921 if (disk->flags & GENHD_FL_UP) {
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002922 del_gendisk(disk);
Alex Eldera0cab922013-04-25 23:15:08 -05002923 if (disk->queue)
2924 blk_cleanup_queue(disk->queue);
2925 }
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002926 put_disk(disk);
2927}
2928
Alex Elder788e2df2013-01-17 12:25:27 -06002929static int rbd_obj_read_sync(struct rbd_device *rbd_dev,
2930 const char *object_name,
Alex Elder7097f8d2013-04-30 00:44:33 -05002931 u64 offset, u64 length, void *buf)
Alex Elder788e2df2013-01-17 12:25:27 -06002932
2933{
Alex Elder21692382013-04-05 01:27:12 -05002934 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
Alex Elder788e2df2013-01-17 12:25:27 -06002935 struct rbd_obj_request *obj_request;
Alex Elder788e2df2013-01-17 12:25:27 -06002936 struct page **pages = NULL;
2937 u32 page_count;
Alex Elder1ceae7e2013-02-06 13:11:38 -06002938 size_t size;
Alex Elder788e2df2013-01-17 12:25:27 -06002939 int ret;
2940
2941 page_count = (u32) calc_pages_for(offset, length);
2942 pages = ceph_alloc_page_vector(page_count, GFP_KERNEL);
2943 if (IS_ERR(pages))
2944 ret = PTR_ERR(pages);
2945
2946 ret = -ENOMEM;
2947 obj_request = rbd_obj_request_create(object_name, offset, length,
Alex Elder36be9a72013-01-19 00:30:28 -06002948 OBJ_REQUEST_PAGES);
Alex Elder788e2df2013-01-17 12:25:27 -06002949 if (!obj_request)
2950 goto out;
2951
2952 obj_request->pages = pages;
2953 obj_request->page_count = page_count;
2954
Alex Elder430c28c2013-04-03 21:32:51 -05002955 obj_request->osd_req = rbd_osd_req_create(rbd_dev, false, obj_request);
Alex Elder788e2df2013-01-17 12:25:27 -06002956 if (!obj_request->osd_req)
2957 goto out;
2958
Alex Elderc99d2d42013-04-05 01:27:11 -05002959 osd_req_op_extent_init(obj_request->osd_req, 0, CEPH_OSD_OP_READ,
2960 offset, length, 0, 0);
Alex Elder406e2c92013-04-15 14:50:36 -05002961 osd_req_op_extent_osd_data_pages(obj_request->osd_req, 0,
Alex Eldera4ce40a2013-04-05 01:27:12 -05002962 obj_request->pages,
Alex Elder44cd1882013-04-05 01:27:12 -05002963 obj_request->length,
2964 obj_request->offset & ~PAGE_MASK,
2965 false, false);
Alex Elder9d4df012013-04-19 15:34:50 -05002966 rbd_osd_req_format_read(obj_request);
Alex Elder430c28c2013-04-03 21:32:51 -05002967
Alex Elder788e2df2013-01-17 12:25:27 -06002968 ret = rbd_obj_request_submit(osdc, obj_request);
2969 if (ret)
2970 goto out;
2971 ret = rbd_obj_request_wait(obj_request);
2972 if (ret)
2973 goto out;
2974
2975 ret = obj_request->result;
2976 if (ret < 0)
2977 goto out;
Alex Elder1ceae7e2013-02-06 13:11:38 -06002978
2979 rbd_assert(obj_request->xferred <= (u64) SIZE_MAX);
2980 size = (size_t) obj_request->xferred;
Alex Elder903bb322013-02-06 13:11:38 -06002981 ceph_copy_from_page_vector(pages, buf, 0, size);
Alex Elder7097f8d2013-04-30 00:44:33 -05002982 rbd_assert(size <= (size_t)INT_MAX);
2983 ret = (int)size;
Alex Elder788e2df2013-01-17 12:25:27 -06002984out:
2985 if (obj_request)
2986 rbd_obj_request_put(obj_request);
2987 else
2988 ceph_release_page_vector(pages, page_count);
2989
2990 return ret;
2991}
2992
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002993/*
Alex Elder4156d992012-08-02 11:29:46 -05002994 * Read the complete header for the given rbd device.
2995 *
2996 * Returns a pointer to a dynamically-allocated buffer containing
2997 * the complete and validated header. Caller can pass the address
2998 * of a variable that will be filled in with the version of the
2999 * header object at the time it was read.
3000 *
3001 * Returns a pointer-coded errno if a failure occurs.
3002 */
3003static struct rbd_image_header_ondisk *
Alex Elder7097f8d2013-04-30 00:44:33 -05003004rbd_dev_v1_header_read(struct rbd_device *rbd_dev)
Alex Elder4156d992012-08-02 11:29:46 -05003005{
3006 struct rbd_image_header_ondisk *ondisk = NULL;
3007 u32 snap_count = 0;
3008 u64 names_size = 0;
3009 u32 want_count;
3010 int ret;
3011
3012 /*
3013 * The complete header will include an array of its 64-bit
3014 * snapshot ids, followed by the names of those snapshots as
3015 * a contiguous block of NUL-terminated strings. Note that
3016 * the number of snapshots could change by the time we read
3017 * it in, in which case we re-read it.
3018 */
3019 do {
3020 size_t size;
3021
3022 kfree(ondisk);
3023
3024 size = sizeof (*ondisk);
3025 size += snap_count * sizeof (struct rbd_image_snap_ondisk);
3026 size += names_size;
3027 ondisk = kmalloc(size, GFP_KERNEL);
3028 if (!ondisk)
3029 return ERR_PTR(-ENOMEM);
3030
Alex Elder788e2df2013-01-17 12:25:27 -06003031 ret = rbd_obj_read_sync(rbd_dev, rbd_dev->header_name,
Alex Elder7097f8d2013-04-30 00:44:33 -05003032 0, size, ondisk);
Alex Elder4156d992012-08-02 11:29:46 -05003033 if (ret < 0)
3034 goto out_err;
Alex Elderc0cd10db2013-04-26 09:43:47 -05003035 if ((size_t)ret < size) {
Alex Elder4156d992012-08-02 11:29:46 -05003036 ret = -ENXIO;
Alex Elder06ecc6c2012-11-01 10:17:15 -05003037 rbd_warn(rbd_dev, "short header read (want %zd got %d)",
3038 size, ret);
Alex Elder4156d992012-08-02 11:29:46 -05003039 goto out_err;
3040 }
3041 if (!rbd_dev_ondisk_valid(ondisk)) {
3042 ret = -ENXIO;
Alex Elder06ecc6c2012-11-01 10:17:15 -05003043 rbd_warn(rbd_dev, "invalid header");
Alex Elder4156d992012-08-02 11:29:46 -05003044 goto out_err;
3045 }
3046
3047 names_size = le64_to_cpu(ondisk->snap_names_len);
3048 want_count = snap_count;
3049 snap_count = le32_to_cpu(ondisk->snap_count);
3050 } while (snap_count != want_count);
3051
3052 return ondisk;
3053
3054out_err:
3055 kfree(ondisk);
3056
3057 return ERR_PTR(ret);
3058}
3059
/*
 * (Re)read the on-disk format-1 header and translate it into the
 * in-core rbd_image_header representation.  On success the caller
 * owns the dynamically-allocated fields of *header.
 */
static int rbd_read_header(struct rbd_device *rbd_dev,
			   struct rbd_image_header *header)
{
	struct rbd_image_header_ondisk *ondisk;
	int ret;

	ondisk = rbd_dev_v1_header_read(rbd_dev);
	if (IS_ERR(ondisk))
		return PTR_ERR(ondisk);
	ret = rbd_header_from_disk(header, ondisk);
	kfree(ondisk);	/* the on-disk copy is no longer needed */

	return ret;
}
3077
Alex Elder41f38c22012-10-25 23:34:40 -05003078static void rbd_remove_all_snaps(struct rbd_device *rbd_dev)
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003079{
3080 struct rbd_snap *snap;
Alex Eldera0593292012-07-19 09:09:27 -05003081 struct rbd_snap *next;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003082
Alex Elder6087b512013-04-25 15:09:41 -05003083 list_for_each_entry_safe(snap, next, &rbd_dev->snaps, node) {
3084 list_del(&snap->node);
3085 rbd_snap_destroy(snap);
3086 }
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003087}
3088
Alex Elder94785542012-10-09 13:50:17 -07003089static void rbd_update_mapping_size(struct rbd_device *rbd_dev)
3090{
Alex Elder0d7dbfc2012-10-25 23:34:41 -05003091 if (rbd_dev->spec->snap_id != CEPH_NOSNAP)
Alex Elder94785542012-10-09 13:50:17 -07003092 return;
3093
Alex Eldere28626a2013-04-26 15:44:35 -05003094 if (rbd_dev->mapping.size != rbd_dev->header.image_size) {
3095 sector_t size;
3096
3097 rbd_dev->mapping.size = rbd_dev->header.image_size;
3098 size = (sector_t)rbd_dev->mapping.size / SECTOR_SIZE;
3099 dout("setting size to %llu sectors", (unsigned long long)size);
3100 set_capacity(rbd_dev->disk, size);
3101 }
Alex Elder94785542012-10-09 13:50:17 -07003102}
3103
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003104/*
3105 * only read the first part of the ondisk header, without the snaps info
3106 */
Alex Eldercc4a38bd2013-04-30 00:44:33 -05003107static int rbd_dev_v1_refresh(struct rbd_device *rbd_dev)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003108{
3109 int ret;
3110 struct rbd_image_header h;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003111
3112 ret = rbd_read_header(rbd_dev, &h);
3113 if (ret < 0)
3114 return ret;
3115
Josh Durgina51aa0c2011-12-05 10:35:04 -08003116 down_write(&rbd_dev->header_rwsem);
3117
Alex Elder94785542012-10-09 13:50:17 -07003118 /* Update image size, and check for resize of mapped image */
3119 rbd_dev->header.image_size = h.image_size;
3120 rbd_update_mapping_size(rbd_dev);
Sage Weil9db4b3e2011-04-19 22:49:06 -07003121
Alex Elder849b4262012-07-09 21:04:24 -05003122 /* rbd_dev->header.object_prefix shouldn't change */
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003123 kfree(rbd_dev->header.snap_sizes);
Alex Elder849b4262012-07-09 21:04:24 -05003124 kfree(rbd_dev->header.snap_names);
Josh Durgind1d25642011-12-05 14:03:05 -08003125 /* osd requests may still refer to snapc */
Alex Elder812164f82013-04-30 00:44:32 -05003126 ceph_put_snap_context(rbd_dev->header.snapc);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003127
Josh Durgin93a24e02011-12-05 10:41:28 -08003128 rbd_dev->header.image_size = h.image_size;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003129 rbd_dev->header.snapc = h.snapc;
3130 rbd_dev->header.snap_names = h.snap_names;
3131 rbd_dev->header.snap_sizes = h.snap_sizes;
Alex Elder849b4262012-07-09 21:04:24 -05003132 /* Free the extra copy of the object prefix */
Alex Elderc0cd10db2013-04-26 09:43:47 -05003133 if (strcmp(rbd_dev->header.object_prefix, h.object_prefix))
3134 rbd_warn(rbd_dev, "object prefix changed (ignoring)");
Alex Elder849b4262012-07-09 21:04:24 -05003135 kfree(h.object_prefix);
3136
Alex Elder304f6802012-08-31 17:29:52 -05003137 ret = rbd_dev_snaps_update(rbd_dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003138
Josh Durginc6666012011-11-21 17:11:12 -08003139 up_write(&rbd_dev->header_rwsem);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003140
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003141 return ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003142}
3143
Alex Eldercc4a38bd2013-04-30 00:44:33 -05003144static int rbd_dev_refresh(struct rbd_device *rbd_dev)
Alex Elder1fe5e992012-07-25 09:32:41 -05003145{
Alex Eldera3fbe5d2013-04-30 00:44:32 -05003146 u64 image_size;
Alex Elder1fe5e992012-07-25 09:32:41 -05003147 int ret;
3148
Alex Elder117973f2012-08-31 17:29:55 -05003149 rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
Alex Eldera3fbe5d2013-04-30 00:44:32 -05003150 image_size = rbd_dev->header.image_size;
Alex Elder1fe5e992012-07-25 09:32:41 -05003151 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
Alex Elder117973f2012-08-31 17:29:55 -05003152 if (rbd_dev->image_format == 1)
Alex Eldercc4a38bd2013-04-30 00:44:33 -05003153 ret = rbd_dev_v1_refresh(rbd_dev);
Alex Elder117973f2012-08-31 17:29:55 -05003154 else
Alex Eldercc4a38bd2013-04-30 00:44:33 -05003155 ret = rbd_dev_v2_refresh(rbd_dev);
Alex Elder1fe5e992012-07-25 09:32:41 -05003156 mutex_unlock(&ctl_mutex);
Alex Elder522a0cc2013-04-25 15:09:41 -05003157 if (ret)
3158 rbd_warn(rbd_dev, "got notification but failed to "
3159 " update snaps: %d\n", ret);
Alex Eldera3fbe5d2013-04-30 00:44:32 -05003160 if (image_size != rbd_dev->header.image_size)
3161 revalidate_disk(rbd_dev->disk);
Alex Elder1fe5e992012-07-25 09:32:41 -05003162
3163 return ret;
3164}
3165
/*
 * Allocate and configure the gendisk and request queue for an rbd
 * device.  On success rbd_dev->disk is set and 0 is returned; on
 * failure -ENOMEM is returned and nothing is left allocated.
 * Note the disk is not yet added (no add_disk() here).
 */
static int rbd_init_disk(struct rbd_device *rbd_dev)
{
	struct gendisk *disk;
	struct request_queue *q;
	u64 segment_size;

	/* create gendisk info */
	disk = alloc_disk(RBD_MINORS_PER_MAJOR);
	if (!disk)
		return -ENOMEM;

	snprintf(disk->disk_name, sizeof(disk->disk_name), RBD_DRV_NAME "%d",
		 rbd_dev->dev_id);
	disk->major = rbd_dev->major;
	disk->first_minor = 0;
	disk->fops = &rbd_bd_ops;
	disk->private_data = rbd_dev;

	q = blk_init_queue(rbd_request_fn, &rbd_dev->lock);
	if (!q)
		goto out_disk;

	/* We use the default size, but let's be explicit about it. */
	blk_queue_physical_block_size(q, SECTOR_SIZE);

	/* set io sizes to object size */
	segment_size = rbd_obj_bytes(&rbd_dev->header);
	blk_queue_max_hw_sectors(q, segment_size / SECTOR_SIZE);
	blk_queue_max_segment_size(q, segment_size);
	blk_queue_io_min(q, segment_size);
	blk_queue_io_opt(q, segment_size);

	blk_queue_merge_bvec(q, rbd_merge_bvec);
	disk->queue = q;

	q->queuedata = rbd_dev;

	rbd_dev->disk = disk;

	return 0;
out_disk:
	/* queue allocation failed; release the disk we allocated */
	put_disk(disk);

	return -ENOMEM;
}
3211
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003212/*
3213 sysfs
3214*/
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003215
/* Map an embedded struct device back to its containing rbd_device. */
static struct rbd_device *dev_to_rbd_dev(struct device *dev)
{
	return container_of(dev, struct rbd_device, dev);
}
3220
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003221static ssize_t rbd_size_show(struct device *dev,
3222 struct device_attribute *attr, char *buf)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003223{
Alex Elder593a9e72012-02-07 12:03:37 -06003224 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003225
Alex Elderfc71d832013-04-26 15:44:36 -05003226 return sprintf(buf, "%llu\n",
3227 (unsigned long long)rbd_dev->mapping.size);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003228}
3229
Alex Elder34b13182012-07-13 20:35:12 -05003230/*
3231 * Note this shows the features for whatever's mapped, which is not
3232 * necessarily the base image.
3233 */
3234static ssize_t rbd_features_show(struct device *dev,
3235 struct device_attribute *attr, char *buf)
3236{
3237 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
3238
3239 return sprintf(buf, "0x%016llx\n",
Alex Elderfc71d832013-04-26 15:44:36 -05003240 (unsigned long long)rbd_dev->mapping.features);
Alex Elder34b13182012-07-13 20:35:12 -05003241}
3242
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003243static ssize_t rbd_major_show(struct device *dev,
3244 struct device_attribute *attr, char *buf)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003245{
Alex Elder593a9e72012-02-07 12:03:37 -06003246 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003247
Alex Elderfc71d832013-04-26 15:44:36 -05003248 if (rbd_dev->major)
3249 return sprintf(buf, "%d\n", rbd_dev->major);
3250
3251 return sprintf(buf, "(none)\n");
3252
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003253}
3254
3255static ssize_t rbd_client_id_show(struct device *dev,
3256 struct device_attribute *attr, char *buf)
3257{
Alex Elder593a9e72012-02-07 12:03:37 -06003258 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003259
Alex Elder1dbb4392012-01-24 10:08:37 -06003260 return sprintf(buf, "client%lld\n",
3261 ceph_client_id(rbd_dev->rbd_client->client));
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003262}
3263
3264static ssize_t rbd_pool_show(struct device *dev,
3265 struct device_attribute *attr, char *buf)
3266{
Alex Elder593a9e72012-02-07 12:03:37 -06003267 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003268
Alex Elder0d7dbfc2012-10-25 23:34:41 -05003269 return sprintf(buf, "%s\n", rbd_dev->spec->pool_name);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003270}
3271
Alex Elder9bb2f332012-07-12 10:46:35 -05003272static ssize_t rbd_pool_id_show(struct device *dev,
3273 struct device_attribute *attr, char *buf)
3274{
3275 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
3276
Alex Elder0d7dbfc2012-10-25 23:34:41 -05003277 return sprintf(buf, "%llu\n",
Alex Elderfc71d832013-04-26 15:44:36 -05003278 (unsigned long long) rbd_dev->spec->pool_id);
Alex Elder9bb2f332012-07-12 10:46:35 -05003279}
3280
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003281static ssize_t rbd_name_show(struct device *dev,
3282 struct device_attribute *attr, char *buf)
3283{
Alex Elder593a9e72012-02-07 12:03:37 -06003284 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003285
Alex Eldera92ffdf2012-10-30 19:40:33 -05003286 if (rbd_dev->spec->image_name)
3287 return sprintf(buf, "%s\n", rbd_dev->spec->image_name);
3288
3289 return sprintf(buf, "(unknown)\n");
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003290}
3291
Alex Elder589d30e2012-07-10 20:30:11 -05003292static ssize_t rbd_image_id_show(struct device *dev,
3293 struct device_attribute *attr, char *buf)
3294{
3295 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
3296
Alex Elder0d7dbfc2012-10-25 23:34:41 -05003297 return sprintf(buf, "%s\n", rbd_dev->spec->image_id);
Alex Elder589d30e2012-07-10 20:30:11 -05003298}
3299
Alex Elder34b13182012-07-13 20:35:12 -05003300/*
3301 * Shows the name of the currently-mapped snapshot (or
3302 * RBD_SNAP_HEAD_NAME for the base image).
3303 */
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003304static ssize_t rbd_snap_show(struct device *dev,
3305 struct device_attribute *attr,
3306 char *buf)
3307{
Alex Elder593a9e72012-02-07 12:03:37 -06003308 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003309
Alex Elder0d7dbfc2012-10-25 23:34:41 -05003310 return sprintf(buf, "%s\n", rbd_dev->spec->snap_name);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003311}
3312
/*
 * For an rbd v2 image, shows the pool id, image id, and snapshot id
 * for the parent image. If there is no parent, simply shows
 * "(no parent image)".
 */
static ssize_t rbd_parent_show(struct device *dev,
			     struct device_attribute *attr,
			     char *buf)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
	struct rbd_spec *spec = rbd_dev->parent_spec;
	int count;
	char *bufp = buf;	/* advances past each formatted field */

	if (!spec)
		return sprintf(buf, "(no parent image)\n");

	count = sprintf(bufp, "pool_id %llu\npool_name %s\n",
			(unsigned long long) spec->pool_id, spec->pool_name);
	if (count < 0)
		return count;
	bufp += count;

	count = sprintf(bufp, "image_id %s\nimage_name %s\n", spec->image_id,
			spec->image_name ? spec->image_name : "(unknown)");
	if (count < 0)
		return count;
	bufp += count;

	count = sprintf(bufp, "snap_id %llu\nsnap_name %s\n",
			(unsigned long long) spec->snap_id, spec->snap_name);
	if (count < 0)
		return count;
	bufp += count;

	count = sprintf(bufp, "overlap %llu\n", rbd_dev->parent_overlap);
	if (count < 0)
		return count;
	bufp += count;

	/* total number of bytes written into buf */
	return (ssize_t) (bufp - buf);
}
3355
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003356static ssize_t rbd_image_refresh(struct device *dev,
3357 struct device_attribute *attr,
3358 const char *buf,
3359 size_t size)
3360{
Alex Elder593a9e72012-02-07 12:03:37 -06003361 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Alex Elderb8136232012-07-25 09:32:41 -05003362 int ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003363
Alex Eldercc4a38bd2013-04-30 00:44:33 -05003364 ret = rbd_dev_refresh(rbd_dev);
Alex Elderb8136232012-07-25 09:32:41 -05003365
3366 return ret < 0 ? ret : size;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003367}
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003368
/* sysfs attributes: all read-only except "refresh" (write-only trigger) */
static DEVICE_ATTR(size, S_IRUGO, rbd_size_show, NULL);
static DEVICE_ATTR(features, S_IRUGO, rbd_features_show, NULL);
static DEVICE_ATTR(major, S_IRUGO, rbd_major_show, NULL);
static DEVICE_ATTR(client_id, S_IRUGO, rbd_client_id_show, NULL);
static DEVICE_ATTR(pool, S_IRUGO, rbd_pool_show, NULL);
static DEVICE_ATTR(pool_id, S_IRUGO, rbd_pool_id_show, NULL);
static DEVICE_ATTR(name, S_IRUGO, rbd_name_show, NULL);
static DEVICE_ATTR(image_id, S_IRUGO, rbd_image_id_show, NULL);
static DEVICE_ATTR(refresh, S_IWUSR, NULL, rbd_image_refresh);
static DEVICE_ATTR(current_snap, S_IRUGO, rbd_snap_show, NULL);
static DEVICE_ATTR(parent, S_IRUGO, rbd_parent_show, NULL);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003380
/* NULL-terminated list of the attributes exposed per rbd device */
static struct attribute *rbd_attrs[] = {
	&dev_attr_size.attr,
	&dev_attr_features.attr,
	&dev_attr_major.attr,
	&dev_attr_client_id.attr,
	&dev_attr_pool.attr,
	&dev_attr_pool_id.attr,
	&dev_attr_name.attr,
	&dev_attr_image_id.attr,
	&dev_attr_current_snap.attr,
	&dev_attr_parent.attr,
	&dev_attr_refresh.attr,
	NULL
};
3395
/* Group the attributes so they are created with the device */
static struct attribute_group rbd_attr_group = {
	.attrs = rbd_attrs,
};

static const struct attribute_group *rbd_attr_groups[] = {
	&rbd_attr_group,
	NULL
};

/* No-op: rbd_device lifetime is managed separately from the sysfs dev */
static void rbd_sysfs_dev_release(struct device *dev)
{
}

static struct device_type rbd_device_type = {
	.name		= "rbd",
	.groups		= rbd_attr_groups,
	.release	= rbd_sysfs_dev_release,
};
3414
/* Take an additional reference on an rbd_spec; returns its argument. */
static struct rbd_spec *rbd_spec_get(struct rbd_spec *spec)
{
	kref_get(&spec->kref);

	return spec;
}
3421
static void rbd_spec_free(struct kref *kref);
/* Drop a reference on an rbd_spec; NULL is silently ignored. */
static void rbd_spec_put(struct rbd_spec *spec)
{
	if (spec)
		kref_put(&spec->kref, rbd_spec_free);
}
3428
3429static struct rbd_spec *rbd_spec_alloc(void)
3430{
3431 struct rbd_spec *spec;
3432
3433 spec = kzalloc(sizeof (*spec), GFP_KERNEL);
3434 if (!spec)
3435 return NULL;
3436 kref_init(&spec->kref);
3437
Alex Elder8b8fb992012-10-26 17:25:24 -05003438 return spec;
3439}
3440
3441static void rbd_spec_free(struct kref *kref)
3442{
3443 struct rbd_spec *spec = container_of(kref, struct rbd_spec, kref);
3444
3445 kfree(spec->pool_name);
3446 kfree(spec->image_id);
3447 kfree(spec->image_name);
3448 kfree(spec->snap_name);
3449 kfree(spec);
3450}
3451
/*
 * Allocate and initialize an rbd_device.  Takes over the caller's
 * references to the client and the spec (they are dropped again by
 * rbd_dev_destroy()).  Returns NULL on allocation failure.
 */
static struct rbd_device *rbd_dev_create(struct rbd_client *rbdc,
				struct rbd_spec *spec)
{
	struct rbd_device *rbd_dev;

	rbd_dev = kzalloc(sizeof (*rbd_dev), GFP_KERNEL);
	if (!rbd_dev)
		return NULL;

	spin_lock_init(&rbd_dev->lock);
	rbd_dev->flags = 0;
	INIT_LIST_HEAD(&rbd_dev->node);
	INIT_LIST_HEAD(&rbd_dev->snaps);
	init_rwsem(&rbd_dev->header_rwsem);

	rbd_dev->spec = spec;
	rbd_dev->rbd_client = rbdc;

	/* Initialize the layout used for all rbd requests */

	rbd_dev->layout.fl_stripe_unit = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
	rbd_dev->layout.fl_stripe_count = cpu_to_le32(1);
	rbd_dev->layout.fl_object_size = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
	rbd_dev->layout.fl_pg_pool = cpu_to_le32((u32) spec->pool_id);

	return rbd_dev;
}
3479
/* Undo rbd_dev_create(): drop client and spec references, free device. */
static void rbd_dev_destroy(struct rbd_device *rbd_dev)
{
	rbd_put_client(rbd_dev->rbd_client);
	rbd_spec_put(rbd_dev->spec);
	kfree(rbd_dev);
}
3486
/* Free a snapshot entry and the name string it owns. */
static void rbd_snap_destroy(struct rbd_snap *snap)
{
	kfree(snap->name);
	kfree(snap);
}
3492
/*
 * Allocate a new snapshot entry.  Takes ownership of snap_name (it is
 * stored, not copied, and later freed by rbd_snap_destroy()).
 * Returns a pointer-coded -ENOMEM on allocation failure.
 * NOTE(review): the rbd_dev argument is currently unused here.
 */
static struct rbd_snap *rbd_snap_create(struct rbd_device *rbd_dev,
					const char *snap_name,
					u64 snap_id, u64 snap_size,
					u64 snap_features)
{
	struct rbd_snap *snap;

	snap = kzalloc(sizeof (*snap), GFP_KERNEL);
	if (!snap)
		return ERR_PTR(-ENOMEM);

	snap->name = snap_name;
	snap->id = snap_id;
	snap->size = snap_size;
	snap->features = snap_features;

	return snap;
}
3511
/*
 * Returns a dynamically-allocated snapshot name if successful, or a
 * pointer-coded error otherwise.  Also fills in the snapshot's size
 * and feature mask (always 0 for format-1 images).
 */
static const char *rbd_dev_v1_snap_info(struct rbd_device *rbd_dev,
		u64 snap_id, u64 *snap_size, u64 *snap_features)
{
	const char *snap_name;
	u32 which;

	/* Translate the snapshot id into an index into the header arrays */
	which = rbd_dev_snap_index(rbd_dev, snap_id);
	if (which == BAD_SNAP_INDEX)
		return ERR_PTR(-ENOENT);
	snap_name = _rbd_dev_v1_snap_name(rbd_dev, which);
	if (!snap_name)
		return ERR_PTR(-ENOMEM);

	*snap_size = rbd_dev->header.snap_sizes[which];
	*snap_features = 0;	/* No features for v1 */

	return snap_name;
}
3534
/*
 * Get the size and object order for an image snapshot, or if
 * snap_id is CEPH_NOSNAP, gets this information for the base
 * image.  order may be NULL if the caller doesn't need it.
 */
static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id,
				u8 *order, u64 *snap_size)
{
	__le64 snapid = cpu_to_le64(snap_id);
	int ret;
	/* Wire format of the "get_size" class method reply */
	struct {
		u8 order;
		__le64 size;
	} __attribute__ ((packed)) size_buf = { 0 };

	ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
				"rbd", "get_size",
				&snapid, sizeof (snapid),
				&size_buf, sizeof (size_buf));
	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
	if (ret < 0)
		return ret;
	/* A short reply can't contain the fields we need */
	if (ret < sizeof (size_buf))
		return -ERANGE;

	if (order)
		*order = size_buf.order;
	*snap_size = le64_to_cpu(size_buf.size);

	dout("  snap_id 0x%016llx order = %u, snap_size = %llu\n",
		(unsigned long long)snap_id, (unsigned int)*order,
		(unsigned long long)*snap_size);

	return 0;
}
3570
/* Fetch the base image's object order and size into the header. */
static int rbd_dev_v2_image_size(struct rbd_device *rbd_dev)
{
	return _rbd_dev_v2_snap_size(rbd_dev, CEPH_NOSNAP,
					&rbd_dev->header.obj_order,
					&rbd_dev->header.image_size);
}
3577
/*
 * Fetch the image's object name prefix via the "get_object_prefix"
 * class method and store a dynamically-allocated copy in the header.
 */
static int rbd_dev_v2_object_prefix(struct rbd_device *rbd_dev)
{
	void *reply_buf;
	int ret;
	void *p;

	reply_buf = kzalloc(RBD_OBJ_PREFIX_LEN_MAX, GFP_KERNEL);
	if (!reply_buf)
		return -ENOMEM;

	ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
				"rbd", "get_object_prefix", NULL, 0,
				reply_buf, RBD_OBJ_PREFIX_LEN_MAX);
	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
	if (ret < 0)
		goto out;

	/* Reply is a length-prefixed string; ret bounds the valid bytes */
	p = reply_buf;
	rbd_dev->header.object_prefix = ceph_extract_encoded_string(&p,
						p + ret, NULL, GFP_NOIO);
	ret = 0;

	if (IS_ERR(rbd_dev->header.object_prefix)) {
		ret = PTR_ERR(rbd_dev->header.object_prefix);
		rbd_dev->header.object_prefix = NULL;
	} else {
		dout("  object_prefix = %s\n", rbd_dev->header.object_prefix);
	}
out:
	kfree(reply_buf);

	return ret;
}
3611
/*
 * Fetch the feature bits for the given snapshot (or the base image
 * for CEPH_NOSNAP).  Fails with -ENXIO if the image uses incompatible
 * features this driver does not support.
 */
static int _rbd_dev_v2_snap_features(struct rbd_device *rbd_dev, u64 snap_id,
		u64 *snap_features)
{
	__le64 snapid = cpu_to_le64(snap_id);
	/* Wire format of the "get_features" class method reply */
	struct {
		__le64 features;
		__le64 incompat;
	} __attribute__ ((packed)) features_buf = { 0 };
	u64 incompat;
	int ret;

	ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
				"rbd", "get_features",
				&snapid, sizeof (snapid),
				&features_buf, sizeof (features_buf));
	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
	if (ret < 0)
		return ret;
	if (ret < sizeof (features_buf))
		return -ERANGE;

	/* Refuse to map images requiring features we don't implement */
	incompat = le64_to_cpu(features_buf.incompat);
	if (incompat & ~RBD_FEATURES_SUPPORTED)
		return -ENXIO;

	*snap_features = le64_to_cpu(features_buf.features);

	dout("  snap_id 0x%016llx features = 0x%016llx incompat = 0x%016llx\n",
		(unsigned long long)snap_id,
		(unsigned long long)*snap_features,
		(unsigned long long)le64_to_cpu(features_buf.incompat));

	return 0;
}
3646
/* Fetch the base image's feature bits into the header. */
static int rbd_dev_v2_features(struct rbd_device *rbd_dev)
{
	return _rbd_dev_v2_snap_features(rbd_dev, CEPH_NOSNAP,
						&rbd_dev->header.features);
}
3652
/*
 * Query the "get_parent" class method and record the parent image's
 * spec (pool id, image id, snap id) and overlap in the rbd device.
 * An image with no parent succeeds and leaves parent_spec unset.
 */
static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev)
{
	struct rbd_spec *parent_spec;
	size_t size;
	void *reply_buf = NULL;
	__le64 snapid;
	void *p;
	void *end;
	char *image_id;
	u64 overlap;
	int ret;

	parent_spec = rbd_spec_alloc();
	if (!parent_spec)
		return -ENOMEM;

	/* Maximum encoded size of the reply we expect */
	size = sizeof (__le64) +				/* pool_id */
		sizeof (__le32) + RBD_IMAGE_ID_LEN_MAX +	/* image_id */
		sizeof (__le64) +				/* snap_id */
		sizeof (__le64);				/* overlap */
	reply_buf = kmalloc(size, GFP_KERNEL);
	if (!reply_buf) {
		ret = -ENOMEM;
		goto out_err;
	}

	snapid = cpu_to_le64(CEPH_NOSNAP);
	ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
				"rbd", "get_parent",
				&snapid, sizeof (snapid),
				reply_buf, size);
	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
	if (ret < 0)
		goto out_err;

	/* Decode the reply; ceph_decode_*_safe jump to out_err on underrun */
	p = reply_buf;
	end = reply_buf + ret;
	ret = -ERANGE;
	ceph_decode_64_safe(&p, end, parent_spec->pool_id, out_err);
	if (parent_spec->pool_id == CEPH_NOPOOL)
		goto out;	/* No parent?  No problem. */

	/* The ceph file layout needs to fit pool id in 32 bits */

	ret = -EIO;
	if (parent_spec->pool_id > (u64)U32_MAX) {
		rbd_warn(NULL, "parent pool id too large (%llu > %u)\n",
			(unsigned long long)parent_spec->pool_id, U32_MAX);
		goto out_err;
	}

	image_id = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL);
	if (IS_ERR(image_id)) {
		ret = PTR_ERR(image_id);
		goto out_err;
	}
	parent_spec->image_id = image_id;
	ceph_decode_64_safe(&p, end, parent_spec->snap_id, out_err);
	ceph_decode_64_safe(&p, end, overlap, out_err);

	rbd_dev->parent_overlap = overlap;
	rbd_dev->parent_spec = parent_spec;
	parent_spec = NULL;	/* rbd_dev now owns this */
out:
	ret = 0;
out_err:
	kfree(reply_buf);
	rbd_spec_put(parent_spec);	/* no-op if ownership transferred */

	return ret;
}
3724
/*
 * Fetch the image's stripe unit and count via "get_stripe_unit_count"
 * and record them in the header.  Only default striping (unit ==
 * object size, count == 1) is accepted; anything else is -EINVAL.
 */
static int rbd_dev_v2_striping_info(struct rbd_device *rbd_dev)
{
	/* Wire format of the "get_stripe_unit_count" reply */
	struct {
		__le64 stripe_unit;
		__le64 stripe_count;
	} __attribute__ ((packed)) striping_info_buf = { 0 };
	size_t size = sizeof (striping_info_buf);
	void *p;
	u64 obj_size;
	u64 stripe_unit;
	u64 stripe_count;
	int ret;

	ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
				"rbd", "get_stripe_unit_count", NULL, 0,
				(char *)&striping_info_buf, size);
	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
	if (ret < 0)
		return ret;
	if (ret < size)
		return -ERANGE;

	/*
	 * We don't actually support the "fancy striping" feature
	 * (STRIPINGV2) yet, but if the striping sizes are the
	 * defaults the behavior is the same as before.  So find
	 * out, and only fail if the image has non-default values.
	 */
	ret = -EINVAL;
	obj_size = (u64)1 << rbd_dev->header.obj_order;
	p = &striping_info_buf;
	stripe_unit = ceph_decode_64(&p);
	if (stripe_unit != obj_size) {
		rbd_warn(rbd_dev, "unsupported stripe unit "
				"(got %llu want %llu)",
				stripe_unit, obj_size);
		return -EINVAL;
	}
	stripe_count = ceph_decode_64(&p);
	if (stripe_count != 1) {
		rbd_warn(rbd_dev, "unsupported stripe count "
				"(got %llu want 1)", stripe_count);
		return -EINVAL;
	}
	rbd_dev->header.stripe_unit = stripe_unit;
	rbd_dev->header.stripe_count = stripe_count;

	return 0;
}
3774
/*
 * Look up the image name for this device's image id in the rbd
 * directory object ("dir_get_name").  Returns a dynamically-allocated
 * name on success, or NULL on any failure (name lookup is best-effort).
 */
static char *rbd_dev_image_name(struct rbd_device *rbd_dev)
{
	size_t image_id_size;
	char *image_id;
	void *p;
	void *end;
	size_t size;
	void *reply_buf = NULL;
	size_t len = 0;
	char *image_name = NULL;
	int ret;

	rbd_assert(!rbd_dev->spec->image_name);

	/* Build the request: a length-prefixed copy of the image id */
	len = strlen(rbd_dev->spec->image_id);
	image_id_size = sizeof (__le32) + len;
	image_id = kmalloc(image_id_size, GFP_KERNEL);
	if (!image_id)
		return NULL;

	p = image_id;
	end = image_id + image_id_size;
	ceph_encode_string(&p, end, rbd_dev->spec->image_id, (u32)len);

	size = sizeof (__le32) + RBD_IMAGE_NAME_LEN_MAX;
	reply_buf = kmalloc(size, GFP_KERNEL);
	if (!reply_buf)
		goto out;

	ret = rbd_obj_method_sync(rbd_dev, RBD_DIRECTORY,
				"rbd", "dir_get_name",
				image_id, image_id_size,
				reply_buf, size);
	if (ret < 0)
		goto out;
	p = reply_buf;
	end = reply_buf + ret;

	image_name = ceph_extract_encoded_string(&p, end, &len, GFP_KERNEL);
	if (IS_ERR(image_name))
		image_name = NULL;	/* extraction failure -> no name */
	else
		dout("%s: name is %s len is %zd\n", __func__, image_name, len);
out:
	kfree(reply_buf);
	kfree(image_id);

	return image_name;
}
3824
Alex Elder2ad3d712013-04-30 00:44:33 -05003825static u64 rbd_v1_snap_id_by_name(struct rbd_device *rbd_dev, const char *name)
3826{
3827 struct ceph_snap_context *snapc = rbd_dev->header.snapc;
3828 const char *snap_name;
3829 u32 which = 0;
3830
3831 /* Skip over names until we find the one we are looking for */
3832
3833 snap_name = rbd_dev->header.snap_names;
3834 while (which < snapc->num_snaps) {
3835 if (!strcmp(name, snap_name))
3836 return snapc->snaps[which];
3837 snap_name += strlen(snap_name) + 1;
3838 which++;
3839 }
3840 return CEPH_NOSNAP;
3841}
3842
3843static u64 rbd_v2_snap_id_by_name(struct rbd_device *rbd_dev, const char *name)
3844{
3845 struct ceph_snap_context *snapc = rbd_dev->header.snapc;
3846 u32 which;
3847 bool found = false;
3848 u64 snap_id;
3849
3850 for (which = 0; !found && which < snapc->num_snaps; which++) {
3851 const char *snap_name;
3852
3853 snap_id = snapc->snaps[which];
3854 snap_name = rbd_dev_v2_snap_name(rbd_dev, snap_id);
3855 if (IS_ERR(snap_name))
3856 break;
3857 found = !strcmp(name, snap_name);
3858 kfree(snap_name);
3859 }
3860 return found ? snap_id : CEPH_NOSNAP;
3861}
3862
3863/*
3864 * Assumes name is never RBD_SNAP_HEAD_NAME; returns CEPH_NOSNAP if
3865 * no snapshot by that name is found, or if an error occurs.
3866 */
3867static u64 rbd_snap_id_by_name(struct rbd_device *rbd_dev, const char *name)
3868{
3869 if (rbd_dev->image_format == 1)
3870 return rbd_v1_snap_id_by_name(rbd_dev, name);
3871
3872 return rbd_v2_snap_id_by_name(rbd_dev, name);
3873}
3874
Alex Elder9e15b772012-10-30 19:40:33 -05003875/*
Alex Elder2e9f7f12013-04-26 09:43:48 -05003876 * When an rbd image has a parent image, it is identified by the
3877 * pool, image, and snapshot ids (not names). This function fills
3878 * in the names for those ids. (It's OK if we can't figure out the
3879 * name for an image id, but the pool and snapshot ids should always
3880 * exist and have names.) All names in an rbd spec are dynamically
3881 * allocated.
Alex Eldere1d42132013-04-25 23:15:08 -05003882 *
3883 * When an image being mapped (not a parent) is probed, we have the
3884 * pool name and pool id, image name and image id, and the snapshot
3885 * name. The only thing we're missing is the snapshot id.
Alex Elder2e9f7f12013-04-26 09:43:48 -05003886 *
3887 * The set of snapshots for an image is not known until they have
3888 * been read by rbd_dev_snaps_update(), so we can't completely fill
3889 * in this information until after that has been called.
Alex Elder9e15b772012-10-30 19:40:33 -05003890 */
Alex Elder2e9f7f12013-04-26 09:43:48 -05003891static int rbd_dev_spec_update(struct rbd_device *rbd_dev)
Alex Elder9e15b772012-10-30 19:40:33 -05003892{
Alex Elder2e9f7f12013-04-26 09:43:48 -05003893 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
3894 struct rbd_spec *spec = rbd_dev->spec;
3895 const char *pool_name;
3896 const char *image_name;
3897 const char *snap_name;
Alex Elder9e15b772012-10-30 19:40:33 -05003898 int ret;
3899
Alex Eldere1d42132013-04-25 23:15:08 -05003900 /*
3901 * An image being mapped will have the pool name (etc.), but
3902 * we need to look up the snapshot id.
3903 */
Alex Elder2e9f7f12013-04-26 09:43:48 -05003904 if (spec->pool_name) {
3905 if (strcmp(spec->snap_name, RBD_SNAP_HEAD_NAME)) {
Alex Elder2ad3d712013-04-30 00:44:33 -05003906 u64 snap_id;
Alex Eldere1d42132013-04-25 23:15:08 -05003907
Alex Elder2ad3d712013-04-30 00:44:33 -05003908 snap_id = rbd_snap_id_by_name(rbd_dev, spec->snap_name);
3909 if (snap_id == CEPH_NOSNAP)
Alex Eldere1d42132013-04-25 23:15:08 -05003910 return -ENOENT;
Alex Elder2ad3d712013-04-30 00:44:33 -05003911 spec->snap_id = snap_id;
Alex Eldere1d42132013-04-25 23:15:08 -05003912 } else {
Alex Elder2e9f7f12013-04-26 09:43:48 -05003913 spec->snap_id = CEPH_NOSNAP;
Alex Eldere1d42132013-04-25 23:15:08 -05003914 }
3915
3916 return 0;
3917 }
Alex Elder9e15b772012-10-30 19:40:33 -05003918
Alex Elder2e9f7f12013-04-26 09:43:48 -05003919 /* Get the pool name; we have to make our own copy of this */
Alex Elder9e15b772012-10-30 19:40:33 -05003920
Alex Elder2e9f7f12013-04-26 09:43:48 -05003921 pool_name = ceph_pg_pool_name_by_id(osdc->osdmap, spec->pool_id);
3922 if (!pool_name) {
3923 rbd_warn(rbd_dev, "no pool with id %llu", spec->pool_id);
Alex Elder935dc892012-11-01 10:17:15 -05003924 return -EIO;
3925 }
Alex Elder2e9f7f12013-04-26 09:43:48 -05003926 pool_name = kstrdup(pool_name, GFP_KERNEL);
3927 if (!pool_name)
Alex Elder9e15b772012-10-30 19:40:33 -05003928 return -ENOMEM;
3929
3930 /* Fetch the image name; tolerate failure here */
3931
Alex Elder2e9f7f12013-04-26 09:43:48 -05003932 image_name = rbd_dev_image_name(rbd_dev);
3933 if (!image_name)
Alex Elder06ecc6c2012-11-01 10:17:15 -05003934 rbd_warn(rbd_dev, "unable to get image name");
Alex Elder9e15b772012-10-30 19:40:33 -05003935
Alex Elder2e9f7f12013-04-26 09:43:48 -05003936 /* Look up the snapshot name, and make a copy */
Alex Elder9e15b772012-10-30 19:40:33 -05003937
Alex Elder2e9f7f12013-04-26 09:43:48 -05003938 snap_name = rbd_snap_name(rbd_dev, spec->snap_id);
3939 if (!snap_name) {
Alex Elder2e9f7f12013-04-26 09:43:48 -05003940 ret = -ENOMEM;
Alex Elder9e15b772012-10-30 19:40:33 -05003941 goto out_err;
Alex Elder2e9f7f12013-04-26 09:43:48 -05003942 }
3943
3944 spec->pool_name = pool_name;
3945 spec->image_name = image_name;
3946 spec->snap_name = snap_name;
Alex Elder9e15b772012-10-30 19:40:33 -05003947
3948 return 0;
3949out_err:
Alex Elder2e9f7f12013-04-26 09:43:48 -05003950 kfree(image_name);
3951 kfree(pool_name);
Alex Elder9e15b772012-10-30 19:40:33 -05003952
3953 return ret;
3954}
3955
Alex Eldercc4a38bd2013-04-30 00:44:33 -05003956static int rbd_dev_v2_snap_context(struct rbd_device *rbd_dev)
Alex Elder35d489f2012-07-03 16:01:19 -05003957{
3958 size_t size;
3959 int ret;
3960 void *reply_buf;
3961 void *p;
3962 void *end;
3963 u64 seq;
3964 u32 snap_count;
3965 struct ceph_snap_context *snapc;
3966 u32 i;
3967
3968 /*
3969 * We'll need room for the seq value (maximum snapshot id),
3970 * snapshot count, and array of that many snapshot ids.
3971 * For now we have a fixed upper limit on the number we're
3972 * prepared to receive.
3973 */
3974 size = sizeof (__le64) + sizeof (__le32) +
3975 RBD_MAX_SNAP_COUNT * sizeof (__le64);
3976 reply_buf = kzalloc(size, GFP_KERNEL);
3977 if (!reply_buf)
3978 return -ENOMEM;
3979
Alex Elder36be9a72013-01-19 00:30:28 -06003980 ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
Alex Elder41579762013-04-21 12:14:45 -05003981 "rbd", "get_snapcontext", NULL, 0,
Alex Eldere2a58ee2013-04-30 00:44:33 -05003982 reply_buf, size);
Alex Elder36be9a72013-01-19 00:30:28 -06003983 dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
Alex Elder35d489f2012-07-03 16:01:19 -05003984 if (ret < 0)
3985 goto out;
3986
Alex Elder35d489f2012-07-03 16:01:19 -05003987 p = reply_buf;
Alex Elder57385b52013-04-21 12:14:45 -05003988 end = reply_buf + ret;
3989 ret = -ERANGE;
Alex Elder35d489f2012-07-03 16:01:19 -05003990 ceph_decode_64_safe(&p, end, seq, out);
3991 ceph_decode_32_safe(&p, end, snap_count, out);
3992
3993 /*
3994 * Make sure the reported number of snapshot ids wouldn't go
3995 * beyond the end of our buffer. But before checking that,
3996 * make sure the computed size of the snapshot context we
3997 * allocate is representable in a size_t.
3998 */
3999 if (snap_count > (SIZE_MAX - sizeof (struct ceph_snap_context))
4000 / sizeof (u64)) {
4001 ret = -EINVAL;
4002 goto out;
4003 }
4004 if (!ceph_has_room(&p, end, snap_count * sizeof (__le64)))
4005 goto out;
Alex Elder468521c2013-04-26 09:43:47 -05004006 ret = 0;
Alex Elder35d489f2012-07-03 16:01:19 -05004007
Alex Elder812164f82013-04-30 00:44:32 -05004008 snapc = ceph_create_snap_context(snap_count, GFP_KERNEL);
Alex Elder35d489f2012-07-03 16:01:19 -05004009 if (!snapc) {
4010 ret = -ENOMEM;
4011 goto out;
4012 }
Alex Elder35d489f2012-07-03 16:01:19 -05004013 snapc->seq = seq;
Alex Elder35d489f2012-07-03 16:01:19 -05004014 for (i = 0; i < snap_count; i++)
4015 snapc->snaps[i] = ceph_decode_64(&p);
4016
4017 rbd_dev->header.snapc = snapc;
4018
4019 dout(" snap context seq = %llu, snap_count = %u\n",
Alex Elder57385b52013-04-21 12:14:45 -05004020 (unsigned long long)seq, (unsigned int)snap_count);
Alex Elder35d489f2012-07-03 16:01:19 -05004021out:
4022 kfree(reply_buf);
4023
Alex Elder57385b52013-04-21 12:14:45 -05004024 return ret;
Alex Elder35d489f2012-07-03 16:01:19 -05004025}
4026
Alex Elder54cac612013-04-30 00:44:33 -05004027static const char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev,
4028 u64 snap_id)
Alex Elderb8b1e2d2012-07-03 16:01:19 -05004029{
4030 size_t size;
4031 void *reply_buf;
Alex Elder54cac612013-04-30 00:44:33 -05004032 __le64 snapid;
Alex Elderb8b1e2d2012-07-03 16:01:19 -05004033 int ret;
4034 void *p;
4035 void *end;
Alex Elderb8b1e2d2012-07-03 16:01:19 -05004036 char *snap_name;
4037
4038 size = sizeof (__le32) + RBD_MAX_SNAP_NAME_LEN;
4039 reply_buf = kmalloc(size, GFP_KERNEL);
4040 if (!reply_buf)
4041 return ERR_PTR(-ENOMEM);
4042
Alex Elder54cac612013-04-30 00:44:33 -05004043 snapid = cpu_to_le64(snap_id);
Alex Elder36be9a72013-01-19 00:30:28 -06004044 ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
Alex Elderb8b1e2d2012-07-03 16:01:19 -05004045 "rbd", "get_snapshot_name",
Alex Elder54cac612013-04-30 00:44:33 -05004046 &snapid, sizeof (snapid),
Alex Eldere2a58ee2013-04-30 00:44:33 -05004047 reply_buf, size);
Alex Elder36be9a72013-01-19 00:30:28 -06004048 dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
Alex Elderf40eb342013-04-25 15:09:42 -05004049 if (ret < 0) {
4050 snap_name = ERR_PTR(ret);
Alex Elderb8b1e2d2012-07-03 16:01:19 -05004051 goto out;
Alex Elderf40eb342013-04-25 15:09:42 -05004052 }
Alex Elderb8b1e2d2012-07-03 16:01:19 -05004053
4054 p = reply_buf;
Alex Elderf40eb342013-04-25 15:09:42 -05004055 end = reply_buf + ret;
Alex Eldere5c35532012-10-25 23:34:41 -05004056 snap_name = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL);
Alex Elderf40eb342013-04-25 15:09:42 -05004057 if (IS_ERR(snap_name))
Alex Elderb8b1e2d2012-07-03 16:01:19 -05004058 goto out;
Alex Elderb8b1e2d2012-07-03 16:01:19 -05004059
Alex Elderf40eb342013-04-25 15:09:42 -05004060 dout(" snap_id 0x%016llx snap_name = %s\n",
Alex Elder54cac612013-04-30 00:44:33 -05004061 (unsigned long long)snap_id, snap_name);
Alex Elderb8b1e2d2012-07-03 16:01:19 -05004062out:
4063 kfree(reply_buf);
4064
Alex Elderf40eb342013-04-25 15:09:42 -05004065 return snap_name;
Alex Elderb8b1e2d2012-07-03 16:01:19 -05004066}
4067
Alex Elder54cac612013-04-30 00:44:33 -05004068static const char *rbd_dev_v2_snap_info(struct rbd_device *rbd_dev,
4069 u64 snap_id, u64 *snap_size, u64 *snap_features)
Alex Elderb8b1e2d2012-07-03 16:01:19 -05004070{
Alex Elderacb1b6c2013-04-25 15:09:41 -05004071 u64 size;
4072 u64 features;
Alex Eldercb752232013-04-30 00:44:33 -05004073 const char *snap_name;
Alex Elderb8b1e2d2012-07-03 16:01:19 -05004074 int ret;
4075
Alex Elderacb1b6c2013-04-25 15:09:41 -05004076 ret = _rbd_dev_v2_snap_size(rbd_dev, snap_id, NULL, &size);
Alex Elderb8b1e2d2012-07-03 16:01:19 -05004077 if (ret)
Alex Elderacb1b6c2013-04-25 15:09:41 -05004078 goto out_err;
Alex Elderb8b1e2d2012-07-03 16:01:19 -05004079
Alex Elderacb1b6c2013-04-25 15:09:41 -05004080 ret = _rbd_dev_v2_snap_features(rbd_dev, snap_id, &features);
4081 if (ret)
4082 goto out_err;
4083
Alex Elder54cac612013-04-30 00:44:33 -05004084 snap_name = rbd_dev_v2_snap_name(rbd_dev, snap_id);
Alex Elderacb1b6c2013-04-25 15:09:41 -05004085 if (!IS_ERR(snap_name)) {
4086 *snap_size = size;
4087 *snap_features = features;
4088 }
4089
4090 return snap_name;
4091out_err:
4092 return ERR_PTR(ret);
Alex Elderb8b1e2d2012-07-03 16:01:19 -05004093}
4094
Alex Elder54cac612013-04-30 00:44:33 -05004095static const char *rbd_dev_snap_info(struct rbd_device *rbd_dev,
4096 u64 snap_id, u64 *snap_size, u64 *snap_features)
Alex Elderb8b1e2d2012-07-03 16:01:19 -05004097{
4098 if (rbd_dev->image_format == 1)
Alex Elder54cac612013-04-30 00:44:33 -05004099 return rbd_dev_v1_snap_info(rbd_dev, snap_id,
Alex Elderb8b1e2d2012-07-03 16:01:19 -05004100 snap_size, snap_features);
4101 if (rbd_dev->image_format == 2)
Alex Elder54cac612013-04-30 00:44:33 -05004102 return rbd_dev_v2_snap_info(rbd_dev, snap_id,
Alex Elderb8b1e2d2012-07-03 16:01:19 -05004103 snap_size, snap_features);
4104 return ERR_PTR(-EINVAL);
4105}
4106
Alex Eldercc4a38bd2013-04-30 00:44:33 -05004107static int rbd_dev_v2_refresh(struct rbd_device *rbd_dev)
Alex Elder117973f2012-08-31 17:29:55 -05004108{
4109 int ret;
Alex Elder117973f2012-08-31 17:29:55 -05004110
4111 down_write(&rbd_dev->header_rwsem);
4112
Alex Elder117973f2012-08-31 17:29:55 -05004113 ret = rbd_dev_v2_image_size(rbd_dev);
4114 if (ret)
4115 goto out;
Alex Elder117973f2012-08-31 17:29:55 -05004116 rbd_update_mapping_size(rbd_dev);
4117
Alex Eldercc4a38bd2013-04-30 00:44:33 -05004118 ret = rbd_dev_v2_snap_context(rbd_dev);
Alex Elder117973f2012-08-31 17:29:55 -05004119 dout("rbd_dev_v2_snap_context returned %d\n", ret);
4120 if (ret)
4121 goto out;
4122 ret = rbd_dev_snaps_update(rbd_dev);
4123 dout("rbd_dev_snaps_update returned %d\n", ret);
4124 if (ret)
4125 goto out;
Alex Elder117973f2012-08-31 17:29:55 -05004126out:
4127 up_write(&rbd_dev->header_rwsem);
4128
4129 return ret;
4130}
4131
Alex Elder9d475de2012-07-03 16:01:19 -05004132/*
Alex Elder35938152012-08-02 11:29:46 -05004133 * Scan the rbd device's current snapshot list and compare it to the
4134 * newly-received snapshot context. Remove any existing snapshots
4135 * not present in the new snapshot context. Add a new snapshot for
4136 * any snaphots in the snapshot context not in the current list.
4137 * And verify there are no changes to snapshots we already know
4138 * about.
4139 *
4140 * Assumes the snapshots in the snapshot context are sorted by
4141 * snapshot id, highest id first. (Snapshots in the rbd_dev's list
4142 * are also maintained in that order.)
Alex Elder522a0cc2013-04-25 15:09:41 -05004143 *
4144 * Note that any error occurs while updating the snapshot list
4145 * aborts the update, and the entire list is cleared. The snapshot
4146 * list becomes inconsistent at that point anyway, so it might as
4147 * well be empty.
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004148 */
Alex Elder304f6802012-08-31 17:29:52 -05004149static int rbd_dev_snaps_update(struct rbd_device *rbd_dev)
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004150{
Alex Elder35938152012-08-02 11:29:46 -05004151 struct ceph_snap_context *snapc = rbd_dev->header.snapc;
4152 const u32 snap_count = snapc->num_snaps;
Alex Elder35938152012-08-02 11:29:46 -05004153 struct list_head *head = &rbd_dev->snaps;
4154 struct list_head *links = head->next;
4155 u32 index = 0;
Alex Elder522a0cc2013-04-25 15:09:41 -05004156 int ret = 0;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004157
Alex Elder522a0cc2013-04-25 15:09:41 -05004158 dout("%s: snap count is %u\n", __func__, (unsigned int)snap_count);
Alex Elder35938152012-08-02 11:29:46 -05004159 while (index < snap_count || links != head) {
4160 u64 snap_id;
4161 struct rbd_snap *snap;
Alex Eldercb752232013-04-30 00:44:33 -05004162 const char *snap_name;
Alex Eldercd892122012-07-03 16:01:19 -05004163 u64 snap_size = 0;
4164 u64 snap_features = 0;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004165
Alex Elder35938152012-08-02 11:29:46 -05004166 snap_id = index < snap_count ? snapc->snaps[index]
4167 : CEPH_NOSNAP;
4168 snap = links != head ? list_entry(links, struct rbd_snap, node)
4169 : NULL;
Alex Elderaafb2302012-09-06 16:00:54 -05004170 rbd_assert(!snap || snap->id != CEPH_NOSNAP);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004171
Alex Elder35938152012-08-02 11:29:46 -05004172 if (snap_id == CEPH_NOSNAP || (snap && snap->id > snap_id)) {
4173 struct list_head *next = links->next;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004174
Alex Elder6d292902013-01-14 12:43:31 -06004175 /*
4176 * A previously-existing snapshot is not in
4177 * the new snap context.
4178 *
Alex Elder522a0cc2013-04-25 15:09:41 -05004179 * If the now-missing snapshot is the one
4180 * the image represents, clear its existence
4181 * flag so we can avoid sending any more
4182 * requests to it.
Alex Elder6d292902013-01-14 12:43:31 -06004183 */
Alex Elder0d7dbfc2012-10-25 23:34:41 -05004184 if (rbd_dev->spec->snap_id == snap->id)
Alex Elder6d292902013-01-14 12:43:31 -06004185 clear_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags);
Alex Elder3e83b652013-04-23 13:52:53 -05004186 dout("removing %ssnap id %llu\n",
Alex Elder0d7dbfc2012-10-25 23:34:41 -05004187 rbd_dev->spec->snap_id == snap->id ?
4188 "mapped " : "",
Alex Elder522a0cc2013-04-25 15:09:41 -05004189 (unsigned long long)snap->id);
Alex Elder6087b512013-04-25 15:09:41 -05004190
4191 list_del(&snap->node);
4192 rbd_snap_destroy(snap);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004193
Alex Elder35938152012-08-02 11:29:46 -05004194 /* Done with this list entry; advance */
4195
4196 links = next;
4197 continue;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004198 }
Alex Elder35938152012-08-02 11:29:46 -05004199
Alex Elder54cac612013-04-30 00:44:33 -05004200 snap_name = rbd_dev_snap_info(rbd_dev, snap_id,
Alex Elderb8b1e2d2012-07-03 16:01:19 -05004201 &snap_size, &snap_features);
Alex Elder522a0cc2013-04-25 15:09:41 -05004202 if (IS_ERR(snap_name)) {
4203 ret = PTR_ERR(snap_name);
4204 dout("failed to get snap info, error %d\n", ret);
4205 goto out_err;
4206 }
Alex Eldercd892122012-07-03 16:01:19 -05004207
Alex Elder522a0cc2013-04-25 15:09:41 -05004208 dout("entry %u: snap_id = %llu\n", (unsigned int)snap_count,
4209 (unsigned long long)snap_id);
Alex Elder35938152012-08-02 11:29:46 -05004210 if (!snap || (snap_id != CEPH_NOSNAP && snap->id < snap_id)) {
4211 struct rbd_snap *new_snap;
4212
4213 /* We haven't seen this snapshot before */
4214
Alex Elder6087b512013-04-25 15:09:41 -05004215 new_snap = rbd_snap_create(rbd_dev, snap_name,
Alex Eldercd892122012-07-03 16:01:19 -05004216 snap_id, snap_size, snap_features);
Alex Elder9fcbb802012-08-23 23:48:49 -05004217 if (IS_ERR(new_snap)) {
Alex Elder522a0cc2013-04-25 15:09:41 -05004218 ret = PTR_ERR(new_snap);
4219 dout(" failed to add dev, error %d\n", ret);
4220 goto out_err;
Alex Elder9fcbb802012-08-23 23:48:49 -05004221 }
Alex Elder35938152012-08-02 11:29:46 -05004222
4223 /* New goes before existing, or at end of list */
4224
Alex Elder9fcbb802012-08-23 23:48:49 -05004225 dout(" added dev%s\n", snap ? "" : " at end\n");
Alex Elder35938152012-08-02 11:29:46 -05004226 if (snap)
4227 list_add_tail(&new_snap->node, &snap->node);
4228 else
Alex Elder523f3252012-08-30 00:16:37 -05004229 list_add_tail(&new_snap->node, head);
Alex Elder35938152012-08-02 11:29:46 -05004230 } else {
4231 /* Already have this one */
4232
Alex Elder9fcbb802012-08-23 23:48:49 -05004233 dout(" already present\n");
4234
Alex Eldercd892122012-07-03 16:01:19 -05004235 rbd_assert(snap->size == snap_size);
Alex Elderaafb2302012-09-06 16:00:54 -05004236 rbd_assert(!strcmp(snap->name, snap_name));
Alex Eldercd892122012-07-03 16:01:19 -05004237 rbd_assert(snap->features == snap_features);
Alex Elder35938152012-08-02 11:29:46 -05004238
4239 /* Done with this list entry; advance */
4240
4241 links = links->next;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004242 }
Alex Elder35938152012-08-02 11:29:46 -05004243
4244 /* Advance to the next entry in the snapshot context */
4245
4246 index++;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004247 }
Alex Elder9fcbb802012-08-23 23:48:49 -05004248 dout("%s: done\n", __func__);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004249
4250 return 0;
Alex Elder522a0cc2013-04-25 15:09:41 -05004251out_err:
4252 rbd_remove_all_snaps(rbd_dev);
4253
4254 return ret;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004255}
4256
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004257static int rbd_bus_add_dev(struct rbd_device *rbd_dev)
4258{
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004259 struct device *dev;
Alex Eldercd789ab2012-08-30 00:16:38 -05004260 int ret;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004261
4262 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004263
Alex Eldercd789ab2012-08-30 00:16:38 -05004264 dev = &rbd_dev->dev;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004265 dev->bus = &rbd_bus_type;
4266 dev->type = &rbd_device_type;
4267 dev->parent = &rbd_root_dev;
Alex Elder200a6a82013-04-28 23:32:34 -05004268 dev->release = rbd_dev_device_release;
Alex Elderde71a292012-07-03 16:01:19 -05004269 dev_set_name(dev, "%d", rbd_dev->dev_id);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004270 ret = device_register(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004271
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004272 mutex_unlock(&ctl_mutex);
Alex Eldercd789ab2012-08-30 00:16:38 -05004273
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004274 return ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004275}
4276
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004277static void rbd_bus_del_dev(struct rbd_device *rbd_dev)
4278{
4279 device_unregister(&rbd_dev->dev);
4280}
4281
Alex Eldere2839302012-08-29 17:11:06 -05004282static atomic64_t rbd_dev_id_max = ATOMIC64_INIT(0);
Alex Elder1ddbe942012-01-29 13:57:44 -06004283
4284/*
Alex Elder499afd52012-02-02 08:13:29 -06004285 * Get a unique rbd identifier for the given new rbd_dev, and add
4286 * the rbd_dev to the global list. The minimum rbd id is 1.
Alex Elder1ddbe942012-01-29 13:57:44 -06004287 */
Alex Eldere2839302012-08-29 17:11:06 -05004288static void rbd_dev_id_get(struct rbd_device *rbd_dev)
Alex Elderb7f23c32012-01-29 13:57:43 -06004289{
Alex Eldere2839302012-08-29 17:11:06 -05004290 rbd_dev->dev_id = atomic64_inc_return(&rbd_dev_id_max);
Alex Elder499afd52012-02-02 08:13:29 -06004291
4292 spin_lock(&rbd_dev_list_lock);
4293 list_add_tail(&rbd_dev->node, &rbd_dev_list);
4294 spin_unlock(&rbd_dev_list_lock);
Alex Eldere2839302012-08-29 17:11:06 -05004295 dout("rbd_dev %p given dev id %llu\n", rbd_dev,
4296 (unsigned long long) rbd_dev->dev_id);
Alex Elder1ddbe942012-01-29 13:57:44 -06004297}
Alex Elderb7f23c32012-01-29 13:57:43 -06004298
Alex Elder1ddbe942012-01-29 13:57:44 -06004299/*
Alex Elder499afd52012-02-02 08:13:29 -06004300 * Remove an rbd_dev from the global list, and record that its
4301 * identifier is no longer in use.
Alex Elder1ddbe942012-01-29 13:57:44 -06004302 */
Alex Eldere2839302012-08-29 17:11:06 -05004303static void rbd_dev_id_put(struct rbd_device *rbd_dev)
Alex Elder1ddbe942012-01-29 13:57:44 -06004304{
Alex Elderd184f6b2012-01-29 13:57:44 -06004305 struct list_head *tmp;
Alex Elderde71a292012-07-03 16:01:19 -05004306 int rbd_id = rbd_dev->dev_id;
Alex Elderd184f6b2012-01-29 13:57:44 -06004307 int max_id;
4308
Alex Elderaafb2302012-09-06 16:00:54 -05004309 rbd_assert(rbd_id > 0);
Alex Elder499afd52012-02-02 08:13:29 -06004310
Alex Eldere2839302012-08-29 17:11:06 -05004311 dout("rbd_dev %p released dev id %llu\n", rbd_dev,
4312 (unsigned long long) rbd_dev->dev_id);
Alex Elder499afd52012-02-02 08:13:29 -06004313 spin_lock(&rbd_dev_list_lock);
4314 list_del_init(&rbd_dev->node);
Alex Elderd184f6b2012-01-29 13:57:44 -06004315
4316 /*
4317 * If the id being "put" is not the current maximum, there
4318 * is nothing special we need to do.
4319 */
Alex Eldere2839302012-08-29 17:11:06 -05004320 if (rbd_id != atomic64_read(&rbd_dev_id_max)) {
Alex Elderd184f6b2012-01-29 13:57:44 -06004321 spin_unlock(&rbd_dev_list_lock);
4322 return;
4323 }
4324
4325 /*
4326 * We need to update the current maximum id. Search the
4327 * list to find out what it is. We're more likely to find
4328 * the maximum at the end, so search the list backward.
4329 */
4330 max_id = 0;
4331 list_for_each_prev(tmp, &rbd_dev_list) {
4332 struct rbd_device *rbd_dev;
4333
4334 rbd_dev = list_entry(tmp, struct rbd_device, node);
Alex Elderb213e0b2012-10-10 21:19:13 -07004335 if (rbd_dev->dev_id > max_id)
4336 max_id = rbd_dev->dev_id;
Alex Elderd184f6b2012-01-29 13:57:44 -06004337 }
Alex Elder499afd52012-02-02 08:13:29 -06004338 spin_unlock(&rbd_dev_list_lock);
Alex Elderb7f23c32012-01-29 13:57:43 -06004339
Alex Elder1ddbe942012-01-29 13:57:44 -06004340 /*
Alex Eldere2839302012-08-29 17:11:06 -05004341 * The max id could have been updated by rbd_dev_id_get(), in
Alex Elderd184f6b2012-01-29 13:57:44 -06004342 * which case it now accurately reflects the new maximum.
4343 * Be careful not to overwrite the maximum value in that
4344 * case.
Alex Elder1ddbe942012-01-29 13:57:44 -06004345 */
Alex Eldere2839302012-08-29 17:11:06 -05004346 atomic64_cmpxchg(&rbd_dev_id_max, rbd_id, max_id);
4347 dout(" max dev id has been reset\n");
Alex Elderb7f23c32012-01-29 13:57:43 -06004348}
4349
Alex Eldera725f65e2012-02-02 08:13:30 -06004350/*
Alex Eldere28fff262012-02-02 08:13:30 -06004351 * Skips over white space at *buf, and updates *buf to point to the
4352 * first found non-space character (if any). Returns the length of
Alex Elder593a9e72012-02-07 12:03:37 -06004353 * the token (string of non-white space characters) found. Note
4354 * that *buf must be terminated with '\0'.
Alex Eldere28fff262012-02-02 08:13:30 -06004355 */
4356static inline size_t next_token(const char **buf)
4357{
4358 /*
4359 * These are the characters that produce nonzero for
4360 * isspace() in the "C" and "POSIX" locales.
4361 */
4362 const char *spaces = " \f\n\r\t\v";
4363
4364 *buf += strspn(*buf, spaces); /* Find start of token */
4365
4366 return strcspn(*buf, spaces); /* Return token length */
4367}
4368
4369/*
4370 * Finds the next token in *buf, and if the provided token buffer is
4371 * big enough, copies the found token into it. The result, if
Alex Elder593a9e72012-02-07 12:03:37 -06004372 * copied, is guaranteed to be terminated with '\0'. Note that *buf
4373 * must be terminated with '\0' on entry.
Alex Eldere28fff262012-02-02 08:13:30 -06004374 *
4375 * Returns the length of the token found (not including the '\0').
4376 * Return value will be 0 if no token is found, and it will be >=
4377 * token_size if the token would not fit.
4378 *
Alex Elder593a9e72012-02-07 12:03:37 -06004379 * The *buf pointer will be updated to point beyond the end of the
Alex Eldere28fff262012-02-02 08:13:30 -06004380 * found token. Note that this occurs even if the token buffer is
4381 * too small to hold it.
4382 */
4383static inline size_t copy_token(const char **buf,
4384 char *token,
4385 size_t token_size)
4386{
4387 size_t len;
4388
4389 len = next_token(buf);
4390 if (len < token_size) {
4391 memcpy(token, *buf, len);
4392 *(token + len) = '\0';
4393 }
4394 *buf += len;
4395
4396 return len;
4397}
4398
4399/*
Alex Elderea3352f2012-07-09 21:04:23 -05004400 * Finds the next token in *buf, dynamically allocates a buffer big
4401 * enough to hold a copy of it, and copies the token into the new
4402 * buffer. The copy is guaranteed to be terminated with '\0'. Note
4403 * that a duplicate buffer is created even for a zero-length token.
4404 *
4405 * Returns a pointer to the newly-allocated duplicate, or a null
4406 * pointer if memory for the duplicate was not available. If
4407 * the lenp argument is a non-null pointer, the length of the token
4408 * (not including the '\0') is returned in *lenp.
4409 *
4410 * If successful, the *buf pointer will be updated to point beyond
4411 * the end of the found token.
4412 *
4413 * Note: uses GFP_KERNEL for allocation.
4414 */
4415static inline char *dup_token(const char **buf, size_t *lenp)
4416{
4417 char *dup;
4418 size_t len;
4419
4420 len = next_token(buf);
Alex Elder4caf35f2012-11-01 08:39:27 -05004421 dup = kmemdup(*buf, len + 1, GFP_KERNEL);
Alex Elderea3352f2012-07-09 21:04:23 -05004422 if (!dup)
4423 return NULL;
Alex Elderea3352f2012-07-09 21:04:23 -05004424 *(dup + len) = '\0';
4425 *buf += len;
4426
4427 if (lenp)
4428 *lenp = len;
4429
4430 return dup;
4431}
4432
4433/*
Alex Elder859c31d2012-10-25 23:34:42 -05004434 * Parse the options provided for an "rbd add" (i.e., rbd image
4435 * mapping) request. These arrive via a write to /sys/bus/rbd/add,
4436 * and the data written is passed here via a NUL-terminated buffer.
4437 * Returns 0 if successful or an error code otherwise.
Alex Elderd22f76e2012-07-12 10:46:35 -05004438 *
Alex Elder859c31d2012-10-25 23:34:42 -05004439 * The information extracted from these options is recorded in
4440 * the other parameters which return dynamically-allocated
4441 * structures:
4442 * ceph_opts
4443 * The address of a pointer that will refer to a ceph options
4444 * structure. Caller must release the returned pointer using
4445 * ceph_destroy_options() when it is no longer needed.
4446 * rbd_opts
4447 * Address of an rbd options pointer. Fully initialized by
4448 * this function; caller must release with kfree().
4449 * spec
4450 * Address of an rbd image specification pointer. Fully
4451 * initialized by this function based on parsed options.
4452 * Caller must release with rbd_spec_put().
4453 *
4454 * The options passed take this form:
4455 * <mon_addrs> <options> <pool_name> <image_name> [<snap_id>]
4456 * where:
4457 * <mon_addrs>
4458 * A comma-separated list of one or more monitor addresses.
4459 * A monitor address is an ip address, optionally followed
4460 * by a port number (separated by a colon).
4461 * I.e.: ip1[:port1][,ip2[:port2]...]
4462 * <options>
4463 * A comma-separated list of ceph and/or rbd options.
4464 * <pool_name>
4465 * The name of the rados pool containing the rbd image.
4466 * <image_name>
4467 * The name of the image in that pool to map.
4468 * <snap_id>
4469 * An optional snapshot id. If provided, the mapping will
4470 * present data from the image at the time that snapshot was
4471 * created. The image head is used if no snapshot id is
4472 * provided. Snapshot mappings are always read-only.
Alex Eldera725f65e2012-02-02 08:13:30 -06004473 */
static int rbd_add_parse_args(const char *buf,
				struct ceph_options **ceph_opts,
				struct rbd_options **opts,
				struct rbd_spec **rbd_spec)
{
	size_t len;
	char *options;
	const char *mon_addrs;	/* points into buf; not copied */
	char *snap_name;
	size_t mon_addrs_size;
	struct rbd_spec *spec = NULL;
	struct rbd_options *rbd_opts = NULL;
	struct ceph_options *copts;
	int ret;

	/* The first four tokens are required */

	len = next_token(&buf);
	if (!len) {
		rbd_warn(NULL, "no monitor address(es) provided");
		return -EINVAL;
	}
	/*
	 * Record where the monitor addresses live in the caller's
	 * buffer; ceph_parse_options() is given the bounds below
	 * rather than a private copy.
	 */
	mon_addrs = buf;
	mon_addrs_size = len + 1;
	buf += len;

	/* Default errno for the out_err exits that follow */
	ret = -EINVAL;
	options = dup_token(&buf, NULL);
	if (!options)
		return -ENOMEM;
	if (!*options) {
		rbd_warn(NULL, "no options provided");
		goto out_err;
	}

	spec = rbd_spec_alloc();
	if (!spec)
		goto out_mem;

	spec->pool_name = dup_token(&buf, NULL);
	if (!spec->pool_name)
		goto out_mem;
	if (!*spec->pool_name) {
		rbd_warn(NULL, "no pool name provided");
		goto out_err;
	}

	spec->image_name = dup_token(&buf, NULL);
	if (!spec->image_name)
		goto out_mem;
	if (!*spec->image_name) {
		rbd_warn(NULL, "no image name provided");
		goto out_err;
	}

	/*
	 * Snapshot name is optional; default is to use "-"
	 * (indicating the head/no snapshot).
	 */
	len = next_token(&buf);
	if (!len) {
		buf = RBD_SNAP_HEAD_NAME; /* No snapshot supplied */
		len = sizeof (RBD_SNAP_HEAD_NAME) - 1;
	} else if (len > RBD_MAX_SNAP_NAME_LEN) {
		ret = -ENAMETOOLONG;
		goto out_err;
	}
	/*
	 * Copy len + 1 bytes, then force termination: the byte at
	 * buf[len] is a token delimiter (or NUL), not part of the
	 * name, and is overwritten with '\0' here.
	 */
	snap_name = kmemdup(buf, len + 1, GFP_KERNEL);
	if (!snap_name)
		goto out_mem;
	*(snap_name + len) = '\0';
	spec->snap_name = snap_name;

	/* Initialize all rbd options to the defaults */

	rbd_opts = kzalloc(sizeof (*rbd_opts), GFP_KERNEL);
	if (!rbd_opts)
		goto out_mem;

	rbd_opts->read_only = RBD_READ_ONLY_DEFAULT;

	/*
	 * parse_rbd_opts_token() consumes the rbd-specific options
	 * and fills in rbd_opts; the rest become ceph options.
	 */
	copts = ceph_parse_options(options, mon_addrs,
					mon_addrs + mon_addrs_size - 1,
					parse_rbd_opts_token, rbd_opts);
	if (IS_ERR(copts)) {
		ret = PTR_ERR(copts);
		goto out_err;
	}
	kfree(options);

	/* Success: hand all three allocations to the caller */

	*ceph_opts = copts;
	*opts = rbd_opts;
	*rbd_spec = spec;

	return 0;
out_mem:
	ret = -ENOMEM;
out_err:
	kfree(rbd_opts);
	rbd_spec_put(spec);
	kfree(options);

	return ret;
}
4578
Alex Elder589d30e2012-07-10 20:30:11 -05004579/*
4580 * An rbd format 2 image has a unique identifier, distinct from the
4581 * name given to it by the user. Internally, that identifier is
4582 * what's used to specify the names of objects related to the image.
4583 *
4584 * A special "rbd id" object is used to map an rbd image name to its
4585 * id. If that object doesn't exist, then there is no v2 rbd image
4586 * with the supplied name.
4587 *
4588 * This function will record the given rbd_dev's image_id field if
4589 * it can be determined, and in that case will return 0. If any
4590 * errors occur a negative errno will be returned and the rbd_dev's
4591 * image_id field will be unchanged (and should be NULL).
4592 */
4593static int rbd_dev_image_id(struct rbd_device *rbd_dev)
4594{
4595 int ret;
4596 size_t size;
4597 char *object_name;
4598 void *response;
Alex Elderc0fba362013-04-25 23:15:08 -05004599 char *image_id;
Alex Elder2f82ee52012-10-30 19:40:33 -05004600
Alex Elder589d30e2012-07-10 20:30:11 -05004601 /*
Alex Elder2c0d0a12012-10-30 19:40:33 -05004602 * When probing a parent image, the image id is already
4603 * known (and the image name likely is not). There's no
Alex Elderc0fba362013-04-25 23:15:08 -05004604 * need to fetch the image id again in this case. We
4605 * do still need to set the image format though.
Alex Elder2c0d0a12012-10-30 19:40:33 -05004606 */
Alex Elderc0fba362013-04-25 23:15:08 -05004607 if (rbd_dev->spec->image_id) {
4608 rbd_dev->image_format = *rbd_dev->spec->image_id ? 2 : 1;
4609
Alex Elder2c0d0a12012-10-30 19:40:33 -05004610 return 0;
Alex Elderc0fba362013-04-25 23:15:08 -05004611 }
Alex Elder2c0d0a12012-10-30 19:40:33 -05004612
4613 /*
Alex Elder589d30e2012-07-10 20:30:11 -05004614 * First, see if the format 2 image id file exists, and if
4615 * so, get the image's persistent id from it.
4616 */
Alex Elder69e7a022012-11-01 08:39:26 -05004617 size = sizeof (RBD_ID_PREFIX) + strlen(rbd_dev->spec->image_name);
Alex Elder589d30e2012-07-10 20:30:11 -05004618 object_name = kmalloc(size, GFP_NOIO);
4619 if (!object_name)
4620 return -ENOMEM;
Alex Elder0d7dbfc2012-10-25 23:34:41 -05004621 sprintf(object_name, "%s%s", RBD_ID_PREFIX, rbd_dev->spec->image_name);
Alex Elder589d30e2012-07-10 20:30:11 -05004622 dout("rbd id object name is %s\n", object_name);
4623
4624 /* Response will be an encoded string, which includes a length */
4625
4626 size = sizeof (__le32) + RBD_IMAGE_ID_LEN_MAX;
4627 response = kzalloc(size, GFP_NOIO);
4628 if (!response) {
4629 ret = -ENOMEM;
4630 goto out;
4631 }
4632
Alex Elderc0fba362013-04-25 23:15:08 -05004633 /* If it doesn't exist we'll assume it's a format 1 image */
4634
Alex Elder36be9a72013-01-19 00:30:28 -06004635 ret = rbd_obj_method_sync(rbd_dev, object_name,
Alex Elder41579762013-04-21 12:14:45 -05004636 "rbd", "get_id", NULL, 0,
Alex Eldere2a58ee2013-04-30 00:44:33 -05004637 response, RBD_IMAGE_ID_LEN_MAX);
Alex Elder36be9a72013-01-19 00:30:28 -06004638 dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
Alex Elderc0fba362013-04-25 23:15:08 -05004639 if (ret == -ENOENT) {
4640 image_id = kstrdup("", GFP_KERNEL);
4641 ret = image_id ? 0 : -ENOMEM;
4642 if (!ret)
4643 rbd_dev->image_format = 1;
4644 } else if (ret > sizeof (__le32)) {
4645 void *p = response;
Alex Elder589d30e2012-07-10 20:30:11 -05004646
Alex Elderc0fba362013-04-25 23:15:08 -05004647 image_id = ceph_extract_encoded_string(&p, p + ret,
Alex Elder979ed482012-11-01 08:39:26 -05004648 NULL, GFP_NOIO);
Alex Elderc0fba362013-04-25 23:15:08 -05004649 ret = IS_ERR(image_id) ? PTR_ERR(image_id) : 0;
4650 if (!ret)
4651 rbd_dev->image_format = 2;
Alex Elder589d30e2012-07-10 20:30:11 -05004652 } else {
Alex Elderc0fba362013-04-25 23:15:08 -05004653 ret = -EINVAL;
4654 }
4655
4656 if (!ret) {
4657 rbd_dev->spec->image_id = image_id;
4658 dout("image_id is %s\n", image_id);
Alex Elder589d30e2012-07-10 20:30:11 -05004659 }
4660out:
4661 kfree(response);
4662 kfree(object_name);
4663
4664 return ret;
4665}
4666
Alex Elder6fd48b32013-04-28 23:32:34 -05004667/* Undo whatever state changes are made by v1 or v2 image probe */
4668
4669static void rbd_dev_unprobe(struct rbd_device *rbd_dev)
4670{
4671 struct rbd_image_header *header;
4672
4673 rbd_dev_remove_parent(rbd_dev);
4674 rbd_spec_put(rbd_dev->parent_spec);
4675 rbd_dev->parent_spec = NULL;
4676 rbd_dev->parent_overlap = 0;
4677
4678 /* Free dynamic fields from the header, then zero it out */
4679
4680 header = &rbd_dev->header;
Alex Elder812164f82013-04-30 00:44:32 -05004681 ceph_put_snap_context(header->snapc);
Alex Elder6fd48b32013-04-28 23:32:34 -05004682 kfree(header->snap_sizes);
4683 kfree(header->snap_names);
4684 kfree(header->object_prefix);
4685 memset(header, 0, sizeof (*header));
4686}
4687
Alex Eldera30b71b2012-07-10 20:30:11 -05004688static int rbd_dev_v1_probe(struct rbd_device *rbd_dev)
4689{
4690 int ret;
Alex Eldera30b71b2012-07-10 20:30:11 -05004691
4692 /* Populate rbd image metadata */
4693
4694 ret = rbd_read_header(rbd_dev, &rbd_dev->header);
4695 if (ret < 0)
4696 goto out_err;
Alex Elder86b00e02012-10-25 23:34:42 -05004697
4698 /* Version 1 images have no parent (no layering) */
4699
4700 rbd_dev->parent_spec = NULL;
4701 rbd_dev->parent_overlap = 0;
4702
Alex Eldera30b71b2012-07-10 20:30:11 -05004703 dout("discovered version 1 image, header name is %s\n",
4704 rbd_dev->header_name);
4705
4706 return 0;
4707
4708out_err:
4709 kfree(rbd_dev->header_name);
4710 rbd_dev->header_name = NULL;
Alex Elder0d7dbfc2012-10-25 23:34:41 -05004711 kfree(rbd_dev->spec->image_id);
4712 rbd_dev->spec->image_id = NULL;
Alex Eldera30b71b2012-07-10 20:30:11 -05004713
4714 return ret;
4715}
4716
/*
 * Probe a format 2 image: fetch its size, object prefix, features,
 * optional parent and striping information, and snapshot context.
 * Returns 0 on success or a negative errno, in which case all
 * partially-initialized state gathered here is released.
 */
static int rbd_dev_v2_probe(struct rbd_device *rbd_dev)
{
	int ret;

	ret = rbd_dev_v2_image_size(rbd_dev);
	if (ret)
		goto out_err;

	/* Get the object prefix (a.k.a. block_name) for the image */

	ret = rbd_dev_v2_object_prefix(rbd_dev);
	if (ret)
		goto out_err;

	/* Get and check the features for the image */

	ret = rbd_dev_v2_features(rbd_dev);
	if (ret)
		goto out_err;

	/* If the image supports layering, get the parent info */

	if (rbd_dev->header.features & RBD_FEATURE_LAYERING) {
		ret = rbd_dev_v2_parent_info(rbd_dev);
		if (ret)
			goto out_err;

		/*
		 * Don't print a warning for parent images.  We can
		 * tell at this point because we won't know its pool
		 * name yet (just its pool id).
		 */
		if (rbd_dev->spec->pool_name)
			rbd_warn(rbd_dev, "WARNING: kernel layering "
					"is EXPERIMENTAL!");
	}

	/* If the image supports fancy striping, get its parameters */

	if (rbd_dev->header.features & RBD_FEATURE_STRIPINGV2) {
		ret = rbd_dev_v2_striping_info(rbd_dev);
		if (ret < 0)
			goto out_err;
	}

	/* crypto and compression type aren't (yet) supported for v2 images */

	rbd_dev->header.crypt_type = 0;
	rbd_dev->header.comp_type = 0;

	/* Get the snapshot context, plus the header version */

	ret = rbd_dev_v2_snap_context(rbd_dev);
	if (ret)
		goto out_err;

	dout("discovered version 2 image, header name is %s\n",
		rbd_dev->header_name);

	return 0;
out_err:
	/*
	 * Fields not yet filled in are NULL/0, so freeing and
	 * putting them unconditionally here is safe.
	 */
	rbd_dev->parent_overlap = 0;
	rbd_spec_put(rbd_dev->parent_spec);
	rbd_dev->parent_spec = NULL;
	kfree(rbd_dev->header_name);
	rbd_dev->header_name = NULL;
	kfree(rbd_dev->header.object_prefix);
	rbd_dev->header.object_prefix = NULL;

	return ret;
}
4788
Alex Elder124afba2013-04-26 15:44:36 -05004789static int rbd_dev_probe_parent(struct rbd_device *rbd_dev)
Alex Elder83a06262012-10-30 15:47:17 -05004790{
Alex Elder2f82ee52012-10-30 19:40:33 -05004791 struct rbd_device *parent = NULL;
Alex Elder124afba2013-04-26 15:44:36 -05004792 struct rbd_spec *parent_spec;
4793 struct rbd_client *rbdc;
4794 int ret;
4795
4796 if (!rbd_dev->parent_spec)
4797 return 0;
4798 /*
4799 * We need to pass a reference to the client and the parent
4800 * spec when creating the parent rbd_dev. Images related by
4801 * parent/child relationships always share both.
4802 */
4803 parent_spec = rbd_spec_get(rbd_dev->parent_spec);
4804 rbdc = __rbd_get_client(rbd_dev->rbd_client);
4805
4806 ret = -ENOMEM;
4807 parent = rbd_dev_create(rbdc, parent_spec);
4808 if (!parent)
4809 goto out_err;
4810
4811 ret = rbd_dev_image_probe(parent);
4812 if (ret < 0)
4813 goto out_err;
4814 rbd_dev->parent = parent;
4815
4816 return 0;
4817out_err:
4818 if (parent) {
4819 rbd_spec_put(rbd_dev->parent_spec);
4820 kfree(rbd_dev->header_name);
4821 rbd_dev_destroy(parent);
4822 } else {
4823 rbd_put_client(rbdc);
4824 rbd_spec_put(parent_spec);
4825 }
4826
4827 return ret;
4828}
4829
/*
 * Set up the Linux side of a mapped rbd device: assign an id and
 * name, grab a block major number, create the gendisk/queue, add
 * the device to the rbd bus, and finally announce the disk.  On
 * failure everything acquired here is unwound in reverse order.
 */
static int rbd_dev_device_setup(struct rbd_device *rbd_dev)
{
	int ret;

	ret = rbd_dev_mapping_set(rbd_dev);
	if (ret)
		return ret;

	/* generate unique id: find highest unique id, add one */
	rbd_dev_id_get(rbd_dev);

	/* Fill in the device name, now that we have its id. */
	BUILD_BUG_ON(DEV_NAME_LEN
			< sizeof (RBD_DRV_NAME) + MAX_INT_FORMAT_WIDTH);
	sprintf(rbd_dev->name, "%s%d", RBD_DRV_NAME, rbd_dev->dev_id);

	/* Get our block major device number. */

	ret = register_blkdev(0, rbd_dev->name);
	if (ret < 0)
		goto err_out_id;
	rbd_dev->major = ret;

	/* Set up the blkdev mapping. */

	ret = rbd_init_disk(rbd_dev);
	if (ret)
		goto err_out_blkdev;

	ret = rbd_bus_add_dev(rbd_dev);
	if (ret)
		goto err_out_disk;

	/* Everything's ready.  Announce the disk to the world. */

	set_capacity(rbd_dev->disk, rbd_dev->mapping.size / SECTOR_SIZE);
	set_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags);
	add_disk(rbd_dev->disk);

	pr_info("%s: added with size 0x%llx\n", rbd_dev->disk->disk_name,
		(unsigned long long) rbd_dev->mapping.size);

	/* ret is 0 here (last set by a successful rbd_bus_add_dev()) */
	return ret;

err_out_disk:
	rbd_free_disk(rbd_dev);
err_out_blkdev:
	unregister_blkdev(rbd_dev->major, rbd_dev->name);
err_out_id:
	rbd_dev_id_put(rbd_dev);
	rbd_dev_mapping_clear(rbd_dev);

	return ret;
}
4884
Alex Elder332bb122013-04-27 09:59:30 -05004885static int rbd_dev_header_name(struct rbd_device *rbd_dev)
4886{
4887 struct rbd_spec *spec = rbd_dev->spec;
4888 size_t size;
4889
4890 /* Record the header object name for this rbd image. */
4891
4892 rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
4893
4894 if (rbd_dev->image_format == 1)
4895 size = strlen(spec->image_name) + sizeof (RBD_SUFFIX);
4896 else
4897 size = sizeof (RBD_HEADER_PREFIX) + strlen(spec->image_id);
4898
4899 rbd_dev->header_name = kmalloc(size, GFP_KERNEL);
4900 if (!rbd_dev->header_name)
4901 return -ENOMEM;
4902
4903 if (rbd_dev->image_format == 1)
4904 sprintf(rbd_dev->header_name, "%s%s",
4905 spec->image_name, RBD_SUFFIX);
4906 else
4907 sprintf(rbd_dev->header_name, "%s%s",
4908 RBD_HEADER_PREFIX, spec->image_id);
4909 return 0;
4910}
4911
/*
 * Tear down everything rbd_dev_image_probe() set up, then destroy
 * the rbd_dev itself.  NOTE: rbd_dev is freed by the final
 * rbd_dev_destroy() call -- callers must not touch it afterwards.
 */
static void rbd_dev_image_release(struct rbd_device *rbd_dev)
{
	int ret;

	rbd_remove_all_snaps(rbd_dev);
	rbd_dev_unprobe(rbd_dev);
	/* Second argument 0 means "tear down the header watch" */
	ret = rbd_dev_header_watch_sync(rbd_dev, 0);
	if (ret)
		rbd_warn(rbd_dev, "failed to cancel watch event (%d)\n", ret);
	kfree(rbd_dev->header_name);
	rbd_dev->header_name = NULL;
	rbd_dev->image_format = 0;
	kfree(rbd_dev->spec->image_id);
	rbd_dev->spec->image_id = NULL;

	rbd_dev_destroy(rbd_dev);
}
4929
Alex Eldera30b71b2012-07-10 20:30:11 -05004930/*
4931 * Probe for the existence of the header object for the given rbd
4932 * device. For format 2 images this includes determining the image
4933 * id.
4934 */
static int rbd_dev_image_probe(struct rbd_device *rbd_dev)
{
	int ret;
	int tmp;

	/*
	 * Get the id from the image id object.  If it's not a
	 * format 2 image, we'll get ENOENT back, and we'll assume
	 * it's a format 1 image.
	 */
	ret = rbd_dev_image_id(rbd_dev);
	if (ret)
		return ret;
	rbd_assert(rbd_dev->spec->image_id);
	rbd_assert(rbd_image_format_valid(rbd_dev->image_format));

	ret = rbd_dev_header_name(rbd_dev);
	if (ret)
		goto err_out_format;

	/* Second argument 1 means "set up a header watch" */
	ret = rbd_dev_header_watch_sync(rbd_dev, 1);
	if (ret)
		goto out_header_name;

	if (rbd_dev->image_format == 1)
		ret = rbd_dev_v1_probe(rbd_dev);
	else
		ret = rbd_dev_v2_probe(rbd_dev);
	if (ret)
		goto err_out_watch;

	ret = rbd_dev_snaps_update(rbd_dev);
	if (ret)
		goto err_out_probe;

	ret = rbd_dev_spec_update(rbd_dev);
	if (ret)
		goto err_out_snaps;

	ret = rbd_dev_probe_parent(rbd_dev);
	if (!ret)
		return 0;

	/*
	 * Cumulative unwind: each label undoes one more probe step.
	 * The v1/v2 probe error paths already freed and NULLed
	 * header_name, so the kfree() at out_header_name is a
	 * harmless kfree(NULL) when arriving from err_out_watch.
	 */
err_out_snaps:
	rbd_remove_all_snaps(rbd_dev);
err_out_probe:
	rbd_dev_unprobe(rbd_dev);
err_out_watch:
	tmp = rbd_dev_header_watch_sync(rbd_dev, 0);
	if (tmp)
		rbd_warn(rbd_dev, "unable to tear down watch request\n");
out_header_name:
	kfree(rbd_dev->header_name);
	rbd_dev->header_name = NULL;
err_out_format:
	rbd_dev->image_format = 0;
	kfree(rbd_dev->spec->image_id);
	rbd_dev->spec->image_id = NULL;

	dout("probe failed, returning %d\n", ret);

	return ret;
}
4998
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07004999static ssize_t rbd_add(struct bus_type *bus,
5000 const char *buf,
5001 size_t count)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07005002{
Alex Eldercb8627c2012-07-09 21:04:23 -05005003 struct rbd_device *rbd_dev = NULL;
Alex Elderdc79b112012-10-25 23:34:41 -05005004 struct ceph_options *ceph_opts = NULL;
Alex Elder4e9afeb2012-10-25 23:34:41 -05005005 struct rbd_options *rbd_opts = NULL;
Alex Elder859c31d2012-10-25 23:34:42 -05005006 struct rbd_spec *spec = NULL;
Alex Elder9d3997f2012-10-25 23:34:42 -05005007 struct rbd_client *rbdc;
Alex Elder27cc2592012-02-02 08:13:30 -06005008 struct ceph_osd_client *osdc;
5009 int rc = -ENOMEM;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07005010
5011 if (!try_module_get(THIS_MODULE))
5012 return -ENODEV;
5013
Alex Eldera725f65e2012-02-02 08:13:30 -06005014 /* parse add command */
Alex Elder859c31d2012-10-25 23:34:42 -05005015 rc = rbd_add_parse_args(buf, &ceph_opts, &rbd_opts, &spec);
Alex Elderdc79b112012-10-25 23:34:41 -05005016 if (rc < 0)
Alex Elderbd4ba652012-10-25 23:34:42 -05005017 goto err_out_module;
Alex Eldera725f65e2012-02-02 08:13:30 -06005018
Alex Elder9d3997f2012-10-25 23:34:42 -05005019 rbdc = rbd_get_client(ceph_opts);
5020 if (IS_ERR(rbdc)) {
5021 rc = PTR_ERR(rbdc);
Alex Elder0ddebc02012-10-25 23:34:41 -05005022 goto err_out_args;
Alex Elder9d3997f2012-10-25 23:34:42 -05005023 }
Alex Elderc53d5892012-10-25 23:34:42 -05005024 ceph_opts = NULL; /* rbd_dev client now owns this */
Yehuda Sadeh602adf42010-08-12 16:11:25 -07005025
Yehuda Sadeh602adf42010-08-12 16:11:25 -07005026 /* pick the pool */
Alex Elder9d3997f2012-10-25 23:34:42 -05005027 osdc = &rbdc->client->osdc;
Alex Elder859c31d2012-10-25 23:34:42 -05005028 rc = ceph_pg_poolid_by_name(osdc->osdmap, spec->pool_name);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07005029 if (rc < 0)
5030 goto err_out_client;
Alex Elderc0cd10db2013-04-26 09:43:47 -05005031 spec->pool_id = (u64)rc;
Alex Elder859c31d2012-10-25 23:34:42 -05005032
Alex Elder0903e872012-11-14 12:25:19 -06005033 /* The ceph file layout needs to fit pool id in 32 bits */
5034
Alex Elderc0cd10db2013-04-26 09:43:47 -05005035 if (spec->pool_id > (u64)U32_MAX) {
5036 rbd_warn(NULL, "pool id too large (%llu > %u)\n",
5037 (unsigned long long)spec->pool_id, U32_MAX);
Alex Elder0903e872012-11-14 12:25:19 -06005038 rc = -EIO;
5039 goto err_out_client;
5040 }
5041
Alex Elderc53d5892012-10-25 23:34:42 -05005042 rbd_dev = rbd_dev_create(rbdc, spec);
Alex Elderbd4ba652012-10-25 23:34:42 -05005043 if (!rbd_dev)
5044 goto err_out_client;
Alex Elderc53d5892012-10-25 23:34:42 -05005045 rbdc = NULL; /* rbd_dev now owns this */
5046 spec = NULL; /* rbd_dev now owns this */
Yehuda Sadeh602adf42010-08-12 16:11:25 -07005047
Alex Elderbd4ba652012-10-25 23:34:42 -05005048 rbd_dev->mapping.read_only = rbd_opts->read_only;
Alex Elderc53d5892012-10-25 23:34:42 -05005049 kfree(rbd_opts);
5050 rbd_opts = NULL; /* done with this */
Alex Elderbd4ba652012-10-25 23:34:42 -05005051
Alex Elder71f293e2013-04-26 09:43:48 -05005052 rc = rbd_dev_image_probe(rbd_dev);
Alex Eldera30b71b2012-07-10 20:30:11 -05005053 if (rc < 0)
Alex Elderc53d5892012-10-25 23:34:42 -05005054 goto err_out_rbd_dev;
Alex Elder05fd6f62012-08-29 17:11:07 -05005055
Alex Elderb536f692013-04-28 23:32:34 -05005056 rc = rbd_dev_device_setup(rbd_dev);
5057 if (!rc)
5058 return count;
5059
5060 rbd_dev_image_release(rbd_dev);
Alex Elderc53d5892012-10-25 23:34:42 -05005061err_out_rbd_dev:
5062 rbd_dev_destroy(rbd_dev);
Alex Elderbd4ba652012-10-25 23:34:42 -05005063err_out_client:
Alex Elder9d3997f2012-10-25 23:34:42 -05005064 rbd_put_client(rbdc);
Alex Elder0ddebc02012-10-25 23:34:41 -05005065err_out_args:
Alex Elder78cea762012-10-25 23:34:41 -05005066 if (ceph_opts)
5067 ceph_destroy_options(ceph_opts);
Alex Elder4e9afeb2012-10-25 23:34:41 -05005068 kfree(rbd_opts);
Alex Elder859c31d2012-10-25 23:34:42 -05005069 rbd_spec_put(spec);
Alex Elderbd4ba652012-10-25 23:34:42 -05005070err_out_module:
5071 module_put(THIS_MODULE);
Alex Elder27cc2592012-02-02 08:13:30 -06005072
Yehuda Sadeh602adf42010-08-12 16:11:25 -07005073 dout("Error adding device %s\n", buf);
Alex Elder27cc2592012-02-02 08:13:30 -06005074
Alex Elderc0cd10db2013-04-26 09:43:47 -05005075 return (ssize_t)rc;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07005076}
5077
Alex Elderde71a292012-07-03 16:01:19 -05005078static struct rbd_device *__rbd_get_dev(unsigned long dev_id)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07005079{
5080 struct list_head *tmp;
5081 struct rbd_device *rbd_dev;
5082
Alex Eldere124a82f2012-01-29 13:57:44 -06005083 spin_lock(&rbd_dev_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07005084 list_for_each(tmp, &rbd_dev_list) {
5085 rbd_dev = list_entry(tmp, struct rbd_device, node);
Alex Elderde71a292012-07-03 16:01:19 -05005086 if (rbd_dev->dev_id == dev_id) {
Alex Eldere124a82f2012-01-29 13:57:44 -06005087 spin_unlock(&rbd_dev_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07005088 return rbd_dev;
Alex Eldere124a82f2012-01-29 13:57:44 -06005089 }
Yehuda Sadeh602adf42010-08-12 16:11:25 -07005090 }
Alex Eldere124a82f2012-01-29 13:57:44 -06005091 spin_unlock(&rbd_dev_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07005092 return NULL;
5093}
5094
/*
 * Device-model release callback: undoes what rbd_dev_device_setup()
 * did (disk, major number, device id, mapping), in reverse order.
 */
static void rbd_dev_device_release(struct device *dev)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);

	rbd_free_disk(rbd_dev);
	clear_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags);
	rbd_dev_clear_mapping(rbd_dev);
	unregister_blkdev(rbd_dev->major, rbd_dev->name);
	rbd_dev->major = 0;
	rbd_dev_id_put(rbd_dev);
	rbd_dev_mapping_clear(rbd_dev);
}
5107
/*
 * Tear down an image's whole chain of parent images, deepest
 * ancestor first: each pass of the outer loop walks to the last
 * device in the chain that still has a parent, releases that
 * parent, and unlinks it.  The loop ends when rbd_dev itself has
 * no parent left.
 */
static void rbd_dev_remove_parent(struct rbd_device *rbd_dev)
{
	while (rbd_dev->parent) {
		struct rbd_device *first = rbd_dev;
		struct rbd_device *second = first->parent;
		struct rbd_device *third;

		/*
		 * Follow to the parent with no grandparent and
		 * remove it.
		 */
		while (second && (third = second->parent)) {
			first = second;
			second = third;
		}
		rbd_assert(second);
		/* Releases and destroys the ancestor device */
		rbd_dev_image_release(second);
		first->parent = NULL;
		first->parent_overlap = 0;

		rbd_assert(first->parent_spec);
		rbd_spec_put(first->parent_spec);
		first->parent_spec = NULL;
	}
}
5133
Yehuda Sadehdfc56062010-11-19 14:51:04 -08005134static ssize_t rbd_remove(struct bus_type *bus,
5135 const char *buf,
5136 size_t count)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07005137{
5138 struct rbd_device *rbd_dev = NULL;
Alex Elder0d8189e2013-04-27 09:59:30 -05005139 int target_id;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07005140 unsigned long ul;
Alex Elder0d8189e2013-04-27 09:59:30 -05005141 int ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07005142
Alex Elder0d8189e2013-04-27 09:59:30 -05005143 ret = strict_strtoul(buf, 10, &ul);
5144 if (ret)
5145 return ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07005146
5147 /* convert to int; abort if we lost anything in the conversion */
5148 target_id = (int) ul;
5149 if (target_id != ul)
5150 return -EINVAL;
5151
5152 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
5153
5154 rbd_dev = __rbd_get_dev(target_id);
5155 if (!rbd_dev) {
5156 ret = -ENOENT;
5157 goto done;
5158 }
5159
Alex Eldera14ea262013-02-05 13:23:12 -06005160 spin_lock_irq(&rbd_dev->lock);
Alex Elderb82d1672013-01-14 12:43:31 -06005161 if (rbd_dev->open_count)
Alex Elder42382b72012-11-16 09:29:16 -06005162 ret = -EBUSY;
Alex Elderb82d1672013-01-14 12:43:31 -06005163 else
5164 set_bit(RBD_DEV_FLAG_REMOVING, &rbd_dev->flags);
Alex Eldera14ea262013-02-05 13:23:12 -06005165 spin_unlock_irq(&rbd_dev->lock);
Alex Elderb82d1672013-01-14 12:43:31 -06005166 if (ret < 0)
Alex Elder42382b72012-11-16 09:29:16 -06005167 goto done;
Alex Elder0d8189e2013-04-27 09:59:30 -05005168 ret = count;
Alex Elderb4808152013-04-26 15:44:36 -05005169 rbd_bus_del_dev(rbd_dev);
Alex Elder8ad42cd2013-04-28 23:32:34 -05005170 rbd_dev_image_release(rbd_dev);
Alex Elder79ab7552013-04-28 23:32:34 -05005171 module_put(THIS_MODULE);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07005172done:
5173 mutex_unlock(&ctl_mutex);
Alex Elderaafb2302012-09-06 16:00:54 -05005174
Yehuda Sadeh602adf42010-08-12 16:11:25 -07005175 return ret;
5176}
5177
Yehuda Sadeh602adf42010-08-12 16:11:25 -07005178/*
5179 * create control files in sysfs
Yehuda Sadehdfc56062010-11-19 14:51:04 -08005180 * /sys/bus/rbd/...
Yehuda Sadeh602adf42010-08-12 16:11:25 -07005181 */
5182static int rbd_sysfs_init(void)
5183{
Yehuda Sadehdfc56062010-11-19 14:51:04 -08005184 int ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07005185
Alex Elderfed4c142012-02-07 12:03:36 -06005186 ret = device_register(&rbd_root_dev);
Alex Elder21079782012-01-24 10:08:36 -06005187 if (ret < 0)
Yehuda Sadehdfc56062010-11-19 14:51:04 -08005188 return ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07005189
Alex Elderfed4c142012-02-07 12:03:36 -06005190 ret = bus_register(&rbd_bus_type);
5191 if (ret < 0)
5192 device_unregister(&rbd_root_dev);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07005193
Yehuda Sadeh602adf42010-08-12 16:11:25 -07005194 return ret;
5195}
5196
/* Unregister the bus and root device, in reverse of rbd_sysfs_init() */
static void rbd_sysfs_cleanup(void)
{
	bus_unregister(&rbd_bus_type);
	device_unregister(&rbd_root_dev);
}
5202
Alex Eldercc344fa2013-02-19 12:25:56 -06005203static int __init rbd_init(void)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07005204{
5205 int rc;
5206
Alex Elder1e32d342013-01-30 11:13:33 -06005207 if (!libceph_compatible(NULL)) {
5208 rbd_warn(NULL, "libceph incompatibility (quitting)");
5209
5210 return -EINVAL;
5211 }
Yehuda Sadeh602adf42010-08-12 16:11:25 -07005212 rc = rbd_sysfs_init();
5213 if (rc)
5214 return rc;
Alex Elderf0f8cef2012-01-29 13:57:44 -06005215 pr_info("loaded " RBD_DRV_NAME_LONG "\n");
Yehuda Sadeh602adf42010-08-12 16:11:25 -07005216 return 0;
5217}
5218
/* Module exit point: remove the sysfs interface */
static void __exit rbd_exit(void)
{
	rbd_sysfs_cleanup();
}
5223
/* Module entry/exit registration and metadata */
module_init(rbd_init);
module_exit(rbd_exit);

MODULE_AUTHOR("Sage Weil <sage@newdream.net>");
MODULE_AUTHOR("Yehuda Sadeh <yehuda@hq.newdream.net>");
MODULE_DESCRIPTION("rados block device");

/* following authorship retained from original osdblk.c */
MODULE_AUTHOR("Jeff Garzik <jeff@garzik.org>");

MODULE_LICENSE("GPL");