blob: c34719c917b1cb8a710fc0b9be12c2e3bfa6ae1d [file] [log] [blame]
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001/*
2 rbd.c -- Export ceph rados objects as a Linux block device
3
4
5 based on drivers/block/osdblk.c:
6
7 Copyright 2009 Red Hat, Inc.
8
9 This program is free software; you can redistribute it and/or modify
10 it under the terms of the GNU General Public License as published by
11 the Free Software Foundation.
12
13 This program is distributed in the hope that it will be useful,
14 but WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 GNU General Public License for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with this program; see the file COPYING. If not, write to
20 the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
21
22
23
Yehuda Sadehdfc56062010-11-19 14:51:04 -080024 For usage instructions, please refer to:
Yehuda Sadeh602adf42010-08-12 16:11:25 -070025
Yehuda Sadehdfc56062010-11-19 14:51:04 -080026 Documentation/ABI/testing/sysfs-bus-rbd
Yehuda Sadeh602adf42010-08-12 16:11:25 -070027
28 */
29
30#include <linux/ceph/libceph.h>
31#include <linux/ceph/osd_client.h>
32#include <linux/ceph/mon_client.h>
33#include <linux/ceph/decode.h>
Yehuda Sadeh59c2be12011-03-21 15:10:11 -070034#include <linux/parser.h>
Yehuda Sadeh602adf42010-08-12 16:11:25 -070035
36#include <linux/kernel.h>
37#include <linux/device.h>
38#include <linux/module.h>
39#include <linux/fs.h>
40#include <linux/blkdev.h>
41
42#include "rbd_types.h"
43
#define RBD_DEBUG	/* Activate rbd_assert() calls */

/*
 * The basic unit of block I/O is a sector.  It is interpreted in a
 * number of contexts in Linux (blk, bio, genhd), but the default is
 * universally 512 bytes.  These symbols are just slightly more
 * meaningful than the bare numbers they represent.
 */
#define	SECTOR_SHIFT	9
#define	SECTOR_SIZE	(1ULL << SECTOR_SHIFT)

#define RBD_DRV_NAME "rbd"
#define RBD_DRV_NAME_LONG "rbd (rados block device)"

#define RBD_MINORS_PER_MAJOR	256		/* max minors per blkdev */

#define RBD_SNAP_DEV_NAME_PREFIX	"snap_"
/* Snapshot device names get the prefix above; limit leaves room for it */
#define RBD_MAX_SNAP_NAME_LEN	\
			(NAME_MAX - (sizeof (RBD_SNAP_DEV_NAME_PREFIX) - 1))

#define RBD_MAX_SNAP_COUNT	510	/* allows max snapc to fit in 4KB */

#define RBD_SNAP_HEAD_NAME	"-"

/* This allows a single page to hold an image name sent by OSD */
#define RBD_IMAGE_NAME_LEN_MAX	(PAGE_SIZE - sizeof (__le32) - 1)
#define RBD_IMAGE_ID_LEN_MAX	64

#define RBD_OBJ_PREFIX_LEN_MAX	64

/* Feature bits */

#define RBD_FEATURE_LAYERING	(1<<0)
#define RBD_FEATURE_STRIPINGV2	(1<<1)
#define RBD_FEATURES_ALL \
	    (RBD_FEATURE_LAYERING | RBD_FEATURE_STRIPINGV2)

/*
 * Features supported by this (client software) implementation.
 * Note: currently zero, i.e. neither feature bit above is supported yet.
 */

#define RBD_FEATURES_SUPPORTED	(0)

/*
 * An RBD device name will be "rbd#", where the "rbd" comes from
 * RBD_DRV_NAME above, and # is a unique integer identifier.
 * MAX_INT_FORMAT_WIDTH is used in ensuring DEV_NAME_LEN is big
 * enough to hold all possible device names.
 */
#define DEV_NAME_LEN		32
#define MAX_INT_FORMAT_WIDTH	((5 * sizeof (int)) / 2 + 1)
/*
 * block device image metadata (in-memory version)
 */
struct rbd_image_header {
	/* These fields never change for a given rbd image */
	char *object_prefix;	/* prefix for data object names */
	u64 features;		/* feature bits (RBD_FEATURE_*) */
	__u8 obj_order;		/* log2 of the backing object size */
	__u8 crypt_type;	/* on-disk encryption type */
	__u8 comp_type;		/* on-disk compression type */

	/* The remaining fields need to be updated occasionally */
	u64 image_size;		/* image size, in bytes */
	struct ceph_snap_context *snapc;	/* snapshot context */
	char *snap_names;	/* snapshot names, NUL-terminated, adjacent */
	u64 *snap_sizes;	/* one size per snapshot */

	u64 obj_version;	/* version of the header object */
};
113
/*
 * An rbd image specification.
 *
 * The tuple (pool_id, image_id, snap_id) is sufficient to uniquely
 * identify an image.  Each rbd_dev structure includes a pointer to
 * an rbd_spec structure that encapsulates this identity.
 *
 * Each of the id's in an rbd_spec has an associated name.  For a
 * user-mapped image, the names are supplied and the id's associated
 * with them are looked up.  For a layered image, a parent image is
 * defined by the tuple, and the names are looked up.
 *
 * An rbd_dev structure contains a parent_spec pointer which is
 * non-null if the image it represents is a child in a layered
 * image.  This pointer will refer to the rbd_spec structure used
 * by the parent rbd_dev for its own identity (i.e., the structure
 * is shared between the parent and child).
 *
 * Since these structures are populated once, during the discovery
 * phase of image construction, they are effectively immutable so
 * we make no effort to synchronize access to them.
 *
 * Note that code herein does not assume the image name is known (it
 * could be a null pointer).
 */
struct rbd_spec {
	u64		pool_id;	/* rados pool id */
	char		*pool_name;	/* rados pool name */

	char		*image_id;	/* unique image id within pool */
	char		*image_name;	/* user-visible name (may be NULL) */

	u64		snap_id;	/* mapped snapshot id */
	char		*snap_name;	/* mapped snapshot name */

	struct kref	kref;		/* reference count (shared specs) */
};
151
/*
 * an instance of the client.  multiple devices may share an rbd client.
 */
struct rbd_client {
	struct ceph_client	*client;	/* underlying libceph client */
	struct kref		kref;		/* reference count */
	struct list_head	node;		/* entry in rbd_client_list */
};
160
/* Completion callback types for image and object requests */

struct rbd_img_request;
typedef void (*rbd_img_callback_t)(struct rbd_img_request *);

#define	BAD_WHICH	U32_MAX		/* Good which or bad which, which? */

struct rbd_obj_request;
typedef void (*rbd_obj_callback_t)(struct rbd_obj_request *);

/* How the data (if any) of an object request is carried */
enum obj_request_type {
	OBJ_REQUEST_NODATA, OBJ_REQUEST_BIO, OBJ_REQUEST_PAGES
};

/* Bit numbers for rbd_obj_request->flags */
enum obj_req_flags {
	OBJ_REQ_DONE,		/* completion flag: not done = 0, done = 1 */
	OBJ_REQ_IMG_DATA,	/* object usage: standalone = 0, image = 1 */
	OBJ_REQ_KNOWN,		/* EXISTS flag valid: no = 0, yes = 1 */
	OBJ_REQ_EXISTS,		/* target exists: no = 0, yes = 1 */
};
179
/*
 * A request against a single rados object.  Object requests either
 * stand alone or belong to an rbd_img_request (see the comment on
 * the union below for how to tell the cases apart).
 */
struct rbd_obj_request {
	const char		*object_name;	/* rados object name */
	u64			offset;		/* object start byte */
	u64			length;		/* bytes from offset */
	unsigned long		flags;		/* OBJ_REQ_* bit flags */

	/*
	 * An object request associated with an image will have its
	 * img_data flag set; a standalone object request will not.
	 *
	 * A standalone object request will have which == BAD_WHICH
	 * and a null obj_request pointer.
	 *
	 * An object request initiated in support of a layered image
	 * object (to check for its existence before a write) will
	 * have which == BAD_WHICH and a non-null obj_request pointer.
	 *
	 * Finally, an object request for rbd image data will have
	 * which != BAD_WHICH, and will have a non-null img_request
	 * pointer.  The value of which will be in the range
	 * 0..(img_request->obj_request_count-1).
	 */
	union {
		struct rbd_obj_request	*obj_request;	/* STAT op */
		struct {
			struct rbd_img_request	*img_request;
			u64			img_offset;
			/* links for img_request->obj_requests list */
			struct list_head	links;
		};
	};
	u32			which;		/* posn image request list */

	enum obj_request_type	type;		/* selects the union below */
	union {
		struct bio	*bio_list;	/* OBJ_REQUEST_BIO */
		struct {				/* OBJ_REQUEST_PAGES */
			struct page	**pages;
			u32		page_count;
		};
	};
	struct page		**copyup_pages;	/* parent data for copyup */

	struct ceph_osd_request	*osd_req;	/* underlying OSD request */

	u64			xferred;	/* bytes transferred */
	u64			version;	/* object version from OSD */
	int			result;		/* OSD request outcome */

	rbd_obj_callback_t	callback;	/* invoked on completion */
	struct completion	completion;	/* for synchronous waiters */

	struct kref		kref;		/* reference count */
};
234
/* Bit numbers for rbd_img_request->flags */
enum img_req_flags {
	IMG_REQ_WRITE,		/* I/O direction: read = 0, write = 1 */
	IMG_REQ_CHILD,		/* initiator: block = 0, child image = 1 */
	IMG_REQ_LAYERED,	/* ENOENT handling: normal = 0, layered = 1 */
};
240
/*
 * A request against an rbd image, spanning one or more objects.
 * Tracks the constituent rbd_obj_request structs and aggregates
 * their results.
 */
struct rbd_img_request {
	struct rbd_device	*rbd_dev;	/* device the request is for */
	u64			offset;	/* starting image byte offset */
	u64			length;	/* byte count from offset */
	unsigned long		flags;	/* IMG_REQ_* bit flags */
	union {
		u64			snap_id;	/* for reads */
		struct ceph_snap_context *snapc;	/* for writes */
	};
	union {
		struct request		*rq;		/* block request */
		struct rbd_obj_request	*obj_request;	/* obj req initiator */
	};
	struct page		**copyup_pages;	/* parent data for copyup */
	spinlock_t		completion_lock;/* protects next_completion */
	u32			next_completion;
	rbd_img_callback_t	callback;	/* invoked on completion */
	u64			xferred;/* aggregate bytes transferred */
	int			result;	/* first nonzero obj_request result */

	u32			obj_request_count;
	struct list_head	obj_requests;	/* rbd_obj_request structs */

	struct kref		kref;		/* reference count */
};
266
/* Iterators over an image request's object requests (links list) */
#define for_each_obj_request(ireq, oreq) \
	list_for_each_entry(oreq, &(ireq)->obj_requests, links)
#define for_each_obj_request_from(ireq, oreq) \
	list_for_each_entry_from(oreq, &(ireq)->obj_requests, links)
/* Safe against removal; note: walks the list in reverse */
#define for_each_obj_request_safe(ireq, oreq, n) \
	list_for_each_entry_safe_reverse(oreq, n, &(ireq)->obj_requests, links)
/* In-memory state for one snapshot of an image (also a sysfs device) */
struct rbd_snap {
	struct device		dev;		/* sysfs device */
	const char		*name;		/* snapshot name */
	u64			size;		/* image size at snapshot */
	struct list_head	node;		/* entry in rbd_dev->snaps */
	u64			id;		/* snapshot id */
	u64			features;	/* feature bits at snapshot */
};
282
/* Properties of the currently mapped image or snapshot */
struct rbd_mapping {
	u64                     size;		/* mapped size, in bytes */
	u64                     features;	/* feature bits of mapping */
	bool			read_only;	/* writes refused if set */
};
288
/*
 * a single device
 */
struct rbd_device {
	int			dev_id;		/* blkdev unique id */

	int			major;		/* blkdev assigned major */
	struct gendisk		*disk;		/* blkdev's gendisk and rq */

	u32			image_format;	/* Either 1 or 2 */
	struct rbd_client	*rbd_client;	/* possibly shared client */

	char			name[DEV_NAME_LEN]; /* blkdev name, e.g. rbd3 */

	spinlock_t		lock;		/* queue, flags, open_count */

	struct rbd_image_header	header;		/* in-memory image header */
	unsigned long		flags;		/* possibly lock protected */
	struct rbd_spec		*spec;		/* identity of this mapping */

	char			*header_name;	/* name of header object */

	struct ceph_file_layout	layout;		/* layout used for I/O */

	struct ceph_osd_event   *watch_event;	/* header watch event */
	struct rbd_obj_request	*watch_request;

	/* layered images: identity and extent of the parent, if any */
	struct rbd_spec		*parent_spec;
	u64			parent_overlap;
	struct rbd_device	*parent;

	/* protects updating the header */
	struct rw_semaphore     header_rwsem;

	struct rbd_mapping	mapping;	/* mapped size/features/ro */

	struct list_head	node;		/* entry in rbd_dev_list */

	/* list of snapshots */
	struct list_head	snaps;

	/* sysfs related */
	struct device		dev;
	unsigned long		open_count;	/* protected by lock */
};
334
/*
 * Flag bits for rbd_dev->flags.  If atomicity is required,
 * rbd_dev->lock is used to protect access.
 *
 * Currently, only the "removing" flag (which is coupled with the
 * "open_count" field) requires atomic access.
 */
enum rbd_dev_flags {
	RBD_DEV_FLAG_EXISTS,	/* mapped snapshot has not been deleted */
	RBD_DEV_FLAG_REMOVING,	/* this mapping is being removed */
};
346
static DEFINE_MUTEX(ctl_mutex);	  /* Serialize open/close/setup/teardown */

static LIST_HEAD(rbd_dev_list);    /* devices */
static DEFINE_SPINLOCK(rbd_dev_list_lock);

static LIST_HEAD(rbd_client_list);		/* clients */
static DEFINE_SPINLOCK(rbd_client_list_lock);

/* Forward declarations for functions defined later in this file */

static int rbd_img_request_submit(struct rbd_img_request *img_request);

static int rbd_dev_snaps_update(struct rbd_device *rbd_dev);
static int rbd_dev_snaps_register(struct rbd_device *rbd_dev);

static void rbd_dev_release(struct device *dev);
static void rbd_remove_snap_dev(struct rbd_snap *snap);

static ssize_t rbd_add(struct bus_type *bus, const char *buf,
		       size_t count);
static ssize_t rbd_remove(struct bus_type *bus, const char *buf,
			  size_t count);
static int rbd_dev_probe(struct rbd_device *rbd_dev);
Alex Elderf0f8cef2012-01-29 13:57:44 -0600368
/* sysfs bus attributes: /sys/bus/rbd/{add,remove}, write-only for root */
static struct bus_attribute rbd_bus_attrs[] = {
	__ATTR(add, S_IWUSR, NULL, rbd_add),
	__ATTR(remove, S_IWUSR, NULL, rbd_remove),
	__ATTR_NULL
};

static struct bus_type rbd_bus_type = {
	.name		= "rbd",
	.bus_attrs	= rbd_bus_attrs,
};
379
/* Release callback for the root device; nothing to free (static object) */
static void rbd_root_dev_release(struct device *dev)
{
}

/* Parent device for all rbd devices in sysfs */
static struct device rbd_root_dev = {
	.init_name =    "rbd",
	.release =      rbd_root_dev_release,
};
388
/*
 * Emit a warning, prefixed by the most specific identification
 * available for the device: disk name, then image name, then image
 * id, then the raw rbd_dev pointer.  rbd_dev may be NULL.
 */
static __printf(2, 3)
void rbd_warn(struct rbd_device *rbd_dev, const char *fmt, ...)
{
	struct va_format vaf;
	va_list args;

	va_start(args, fmt);
	vaf.fmt = fmt;
	vaf.va = &args;

	if (!rbd_dev)
		printk(KERN_WARNING "%s: %pV\n", RBD_DRV_NAME, &vaf);
	else if (rbd_dev->disk)
		printk(KERN_WARNING "%s: %s: %pV\n",
			RBD_DRV_NAME, rbd_dev->disk->disk_name, &vaf);
	else if (rbd_dev->spec && rbd_dev->spec->image_name)
		printk(KERN_WARNING "%s: image %s: %pV\n",
			RBD_DRV_NAME, rbd_dev->spec->image_name, &vaf);
	else if (rbd_dev->spec && rbd_dev->spec->image_id)
		printk(KERN_WARNING "%s: id %s: %pV\n",
			RBD_DRV_NAME, rbd_dev->spec->image_id, &vaf);
	else	/* punt */
		printk(KERN_WARNING "%s: rbd_dev %p: %pV\n",
			RBD_DRV_NAME, rbd_dev, &vaf);
	va_end(args);
}
415
#ifdef RBD_DEBUG
/*
 * NOTE(review): a bare-if macro like this is a dangling-else hazard
 * at call sites (an `else` after rbd_assert() binds to the macro's
 * `if`); the usual do { ... } while (0) wrapper would avoid it.
 */
#define rbd_assert(expr)						\
		if (unlikely(!(expr))) {				\
			printk(KERN_ERR "\nAssertion failure in %s() "	\
						"at line %d:\n\n"	\
					"\trbd_assert(%s);\n\n",	\
					__func__, __LINE__, #expr);	\
			BUG();						\
		}
#else /* !RBD_DEBUG */
#  define rbd_assert(expr)	((void) 0)
#endif /* !RBD_DEBUG */

static void rbd_img_parent_read(struct rbd_obj_request *obj_request);
static int rbd_img_obj_request_submit(struct rbd_obj_request *obj_request);

static int rbd_dev_refresh(struct rbd_device *rbd_dev, u64 *hver);
static int rbd_dev_v2_refresh(struct rbd_device *rbd_dev, u64 *hver);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700434
/*
 * Block device open.  Refuses a writable open of a read-only mapping
 * (-EROFS) and any open while the device is being removed (-ENOENT).
 * On success, bumps open_count (under rbd_dev->lock, coupled with the
 * REMOVING flag) and takes a reference on the sysfs device.
 */
static int rbd_open(struct block_device *bdev, fmode_t mode)
{
	struct rbd_device *rbd_dev = bdev->bd_disk->private_data;
	bool removing = false;

	if ((mode & FMODE_WRITE) && rbd_dev->mapping.read_only)
		return -EROFS;

	spin_lock_irq(&rbd_dev->lock);
	if (test_bit(RBD_DEV_FLAG_REMOVING, &rbd_dev->flags))
		removing = true;
	else
		rbd_dev->open_count++;
	spin_unlock_irq(&rbd_dev->lock);
	if (removing)
		return -ENOENT;

	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
	(void) get_device(&rbd_dev->dev);
	set_device_ro(bdev, rbd_dev->mapping.read_only);
	mutex_unlock(&ctl_mutex);

	return 0;
}
459
/*
 * Block device release.  Drops the open_count taken by rbd_open()
 * (asserting it was nonzero) and puts the sysfs device reference.
 */
static int rbd_release(struct gendisk *disk, fmode_t mode)
{
	struct rbd_device *rbd_dev = disk->private_data;
	unsigned long open_count_before;

	spin_lock_irq(&rbd_dev->lock);
	open_count_before = rbd_dev->open_count--;
	spin_unlock_irq(&rbd_dev->lock);
	rbd_assert(open_count_before > 0);

	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
	put_device(&rbd_dev->dev);
	mutex_unlock(&ctl_mutex);

	return 0;
}
476
/* Block device operations for /dev/rbd* */
static const struct block_device_operations rbd_bd_ops = {
	.owner			= THIS_MODULE,
	.open			= rbd_open,
	.release		= rbd_release,
};
482
483/*
484 * Initialize an rbd client instance.
Alex Elder43ae4702012-07-03 16:01:18 -0500485 * We own *ceph_opts.
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700486 */
Alex Elderf8c38922012-08-10 13:12:07 -0700487static struct rbd_client *rbd_client_create(struct ceph_options *ceph_opts)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700488{
489 struct rbd_client *rbdc;
490 int ret = -ENOMEM;
491
Alex Elder37206ee2013-02-20 17:32:08 -0600492 dout("%s:\n", __func__);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700493 rbdc = kmalloc(sizeof(struct rbd_client), GFP_KERNEL);
494 if (!rbdc)
495 goto out_opt;
496
497 kref_init(&rbdc->kref);
498 INIT_LIST_HEAD(&rbdc->node);
499
Alex Elderbc534d82012-01-29 13:57:44 -0600500 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
501
Alex Elder43ae4702012-07-03 16:01:18 -0500502 rbdc->client = ceph_create_client(ceph_opts, rbdc, 0, 0);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700503 if (IS_ERR(rbdc->client))
Alex Elderbc534d82012-01-29 13:57:44 -0600504 goto out_mutex;
Alex Elder43ae4702012-07-03 16:01:18 -0500505 ceph_opts = NULL; /* Now rbdc->client is responsible for ceph_opts */
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700506
507 ret = ceph_open_session(rbdc->client);
508 if (ret < 0)
509 goto out_err;
510
Alex Elder432b8582012-01-29 13:57:44 -0600511 spin_lock(&rbd_client_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700512 list_add_tail(&rbdc->node, &rbd_client_list);
Alex Elder432b8582012-01-29 13:57:44 -0600513 spin_unlock(&rbd_client_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700514
Alex Elderbc534d82012-01-29 13:57:44 -0600515 mutex_unlock(&ctl_mutex);
Alex Elder37206ee2013-02-20 17:32:08 -0600516 dout("%s: rbdc %p\n", __func__, rbdc);
Alex Elderbc534d82012-01-29 13:57:44 -0600517
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700518 return rbdc;
519
520out_err:
521 ceph_destroy_client(rbdc->client);
Alex Elderbc534d82012-01-29 13:57:44 -0600522out_mutex:
523 mutex_unlock(&ctl_mutex);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700524 kfree(rbdc);
525out_opt:
Alex Elder43ae4702012-07-03 16:01:18 -0500526 if (ceph_opts)
527 ceph_destroy_options(ceph_opts);
Alex Elder37206ee2013-02-20 17:32:08 -0600528 dout("%s: error %d\n", __func__, ret);
529
Vasiliy Kulikov28f259b2010-09-26 12:59:37 +0400530 return ERR_PTR(ret);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700531}
532
/* Take an extra reference on rbdc; caller must already hold one. */
static struct rbd_client *__rbd_get_client(struct rbd_client *rbdc)
{
	kref_get(&rbdc->kref);

	return rbdc;
}
539
/*
 * Find a ceph client with specific addr and configuration.  If
 * found, bump its reference count.  Returns NULL when no matching
 * client exists, or when sharing is disabled (CEPH_OPT_NOSHARE).
 */
static struct rbd_client *rbd_client_find(struct ceph_options *ceph_opts)
{
	struct rbd_client *client_node;
	bool found = false;

	if (ceph_opts->flags & CEPH_OPT_NOSHARE)
		return NULL;

	spin_lock(&rbd_client_list_lock);
	list_for_each_entry(client_node, &rbd_client_list, node) {
		if (!ceph_compare_options(ceph_opts, client_node->client)) {
			/* Reference taken before the lock is dropped */
			__rbd_get_client(client_node);

			found = true;
			break;
		}
	}
	spin_unlock(&rbd_client_list_lock);

	return found ? client_node : NULL;
}
565
/*
 * mount options
 *
 * Only Boolean options exist today; the Opt_last_int/Opt_last_string
 * sentinels mark where int and string options would go so that
 * parse_rbd_opts_token() can classify tokens by range.
 */
enum {
	Opt_last_int,
	/* int args above */
	Opt_last_string,
	/* string args above */
	Opt_read_only,
	Opt_read_write,
	/* Boolean args above */
	Opt_last_bool,
};

static match_table_t rbd_opts_tokens = {
	/* int args above */
	/* string args above */
	{Opt_read_only, "read_only"},
	{Opt_read_only, "ro"},		/* Alternate spelling */
	{Opt_read_write, "read_write"},
	{Opt_read_write, "rw"},		/* Alternate spelling */
	/* Boolean args above */
	{-1, NULL}
};

/* Parsed per-mapping options */
struct rbd_options {
	bool	read_only;
};

#define RBD_READ_ONLY_DEFAULT	false
596
/*
 * Parse a single mount-option token into *private (an rbd_options).
 * Returns 0 on success, -EINVAL for an unrecognized token, or a
 * negative error from match_int() for a malformed int argument.
 *
 * The int/string branches below only log; no such options are
 * currently defined (see the enum above), so the switch handles
 * just the Boolean read_only/read_write tokens.
 */
static int parse_rbd_opts_token(char *c, void *private)
{
	struct rbd_options *rbd_opts = private;
	substring_t argstr[MAX_OPT_ARGS];
	int token, intval, ret;

	token = match_token(c, rbd_opts_tokens, argstr);
	if (token < 0)
		return -EINVAL;

	if (token < Opt_last_int) {
		ret = match_int(&argstr[0], &intval);
		if (ret < 0) {
			pr_err("bad mount option arg (not int) "
			       "at '%s'\n", c);
			return ret;
		}
		dout("got int token %d val %d\n", token, intval);
	} else if (token > Opt_last_int && token < Opt_last_string) {
		dout("got string token %d val %s\n", token,
		     argstr[0].from);
	} else if (token > Opt_last_string && token < Opt_last_bool) {
		dout("got Boolean token %d\n", token);
	} else {
		dout("got token %d\n", token);
	}

	switch (token) {
	case Opt_read_only:
		rbd_opts->read_only = true;
		break;
	case Opt_read_write:
		rbd_opts->read_only = false;
		break;
	default:
		/* match_token() succeeded, so this can't happen */
		rbd_assert(false);
		break;
	}
	return 0;
}
637
/*
 * Get a ceph client with specific addr and configuration, if one does
 * not exist create it.  Consumes ceph_opts either way: a new client
 * takes ownership, while reuse of an existing client destroys them.
 */
static struct rbd_client *rbd_get_client(struct ceph_options *ceph_opts)
{
	struct rbd_client *client = rbd_client_find(ceph_opts);

	if (!client)
		return rbd_client_create(ceph_opts);

	/* using an existing client; its options are no longer needed */
	ceph_destroy_options(ceph_opts);

	return client;
}
654
/*
 * Destroy ceph client
 *
 * kref release callback: unlinks the client from rbd_client_list
 * (taking rbd_client_list_lock itself) and frees it.
 */
static void rbd_client_release(struct kref *kref)
{
	struct rbd_client *rbdc = container_of(kref, struct rbd_client, kref);

	dout("%s: rbdc %p\n", __func__, rbdc);
	spin_lock(&rbd_client_list_lock);
	list_del(&rbdc->node);
	spin_unlock(&rbd_client_list_lock);

	ceph_destroy_client(rbdc->client);
	kfree(rbdc);
}
672
/*
 * Drop reference to ceph client node. If it's not referenced anymore, release
 * it.  NULL is tolerated (the check is required: kref_put would
 * dereference rbdc).
 */
static void rbd_put_client(struct rbd_client *rbdc)
{
	if (rbdc)
		kref_put(&rbdc->kref, rbd_client_release);
}
682
Alex Eldera30b71b2012-07-10 20:30:11 -0500683static bool rbd_image_format_valid(u32 image_format)
684{
685 return image_format == 1 || image_format == 2;
686}
687
/*
 * Sanity-check an on-disk (format 1) image header fetched from an
 * OSD: magic text, object order bounds, and snapshot counts/sizes
 * that won't overflow size_t when the header is copied in-memory.
 */
static bool rbd_dev_ondisk_valid(struct rbd_image_header_ondisk *ondisk)
{
	size_t size;
	u32 snap_count;

	/* The header has to start with the magic rbd header text */
	if (memcmp(&ondisk->text, RBD_HEADER_TEXT, sizeof (RBD_HEADER_TEXT)))
		return false;

	/* The bio layer requires at least sector-sized I/O */

	if (ondisk->options.order < SECTOR_SHIFT)
		return false;

	/* If we use u64 in a few spots we may be able to loosen this */

	if (ondisk->options.order > 8 * sizeof (int) - 1)
		return false;

	/*
	 * The size of a snapshot header has to fit in a size_t, and
	 * that limits the number of snapshots.
	 */
	snap_count = le32_to_cpu(ondisk->snap_count);
	size = SIZE_MAX - sizeof (struct ceph_snap_context);
	if (snap_count > size / sizeof (__le64))
		return false;

	/*
	 * Not only that, but the size of the entire the snapshot
	 * header must also be representable in a size_t.
	 */
	size -= snap_count * sizeof (__le64);
	if ((u64) size < le64_to_cpu(ondisk->snap_names_len))
		return false;

	return true;
}
726
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700727/*
728 * Create a new header structure, translate header format from the on-disk
729 * header.
730 */
731static int rbd_header_from_disk(struct rbd_image_header *header,
Alex Elder4156d992012-08-02 11:29:46 -0500732 struct rbd_image_header_ondisk *ondisk)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700733{
Alex Elderccece232012-07-10 20:30:10 -0500734 u32 snap_count;
Alex Elder58c17b02012-08-23 23:22:06 -0500735 size_t len;
Alex Elderd2bb24e2012-07-26 23:37:14 -0500736 size_t size;
Alex Elder621901d2012-08-23 23:22:06 -0500737 u32 i;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700738
Alex Elder6a523252012-07-19 17:12:59 -0500739 memset(header, 0, sizeof (*header));
740
Alex Elder103a1502012-08-02 11:29:45 -0500741 snap_count = le32_to_cpu(ondisk->snap_count);
742
Alex Elder58c17b02012-08-23 23:22:06 -0500743 len = strnlen(ondisk->object_prefix, sizeof (ondisk->object_prefix));
744 header->object_prefix = kmalloc(len + 1, GFP_KERNEL);
Alex Elder6a523252012-07-19 17:12:59 -0500745 if (!header->object_prefix)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700746 return -ENOMEM;
Alex Elder58c17b02012-08-23 23:22:06 -0500747 memcpy(header->object_prefix, ondisk->object_prefix, len);
748 header->object_prefix[len] = '\0';
Alex Elder00f1f362012-02-07 12:03:36 -0600749
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700750 if (snap_count) {
Alex Elderf785cc12012-08-23 23:22:06 -0500751 u64 snap_names_len = le64_to_cpu(ondisk->snap_names_len);
752
Alex Elder621901d2012-08-23 23:22:06 -0500753 /* Save a copy of the snapshot names */
754
Alex Elderf785cc12012-08-23 23:22:06 -0500755 if (snap_names_len > (u64) SIZE_MAX)
756 return -EIO;
757 header->snap_names = kmalloc(snap_names_len, GFP_KERNEL);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700758 if (!header->snap_names)
Alex Elder6a523252012-07-19 17:12:59 -0500759 goto out_err;
Alex Elderf785cc12012-08-23 23:22:06 -0500760 /*
761 * Note that rbd_dev_v1_header_read() guarantees
762 * the ondisk buffer we're working with has
763 * snap_names_len bytes beyond the end of the
764 * snapshot id array, this memcpy() is safe.
765 */
766 memcpy(header->snap_names, &ondisk->snaps[snap_count],
767 snap_names_len);
Alex Elder6a523252012-07-19 17:12:59 -0500768
Alex Elder621901d2012-08-23 23:22:06 -0500769 /* Record each snapshot's size */
770
Alex Elderd2bb24e2012-07-26 23:37:14 -0500771 size = snap_count * sizeof (*header->snap_sizes);
772 header->snap_sizes = kmalloc(size, GFP_KERNEL);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700773 if (!header->snap_sizes)
Alex Elder6a523252012-07-19 17:12:59 -0500774 goto out_err;
Alex Elder621901d2012-08-23 23:22:06 -0500775 for (i = 0; i < snap_count; i++)
776 header->snap_sizes[i] =
777 le64_to_cpu(ondisk->snaps[i].image_size);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700778 } else {
Alex Elderccece232012-07-10 20:30:10 -0500779 WARN_ON(ondisk->snap_names_len);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700780 header->snap_names = NULL;
781 header->snap_sizes = NULL;
782 }
Alex Elder849b4262012-07-09 21:04:24 -0500783
Alex Elder34b13182012-07-13 20:35:12 -0500784 header->features = 0; /* No features support in v1 images */
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700785 header->obj_order = ondisk->options.order;
786 header->crypt_type = ondisk->options.crypt_type;
787 header->comp_type = ondisk->options.comp_type;
Alex Elder6a523252012-07-19 17:12:59 -0500788
Alex Elder621901d2012-08-23 23:22:06 -0500789 /* Allocate and fill in the snapshot context */
790
Alex Elderf84344f2012-08-31 17:29:51 -0500791 header->image_size = le64_to_cpu(ondisk->image_size);
Alex Elder6a523252012-07-19 17:12:59 -0500792 size = sizeof (struct ceph_snap_context);
793 size += snap_count * sizeof (header->snapc->snaps[0]);
794 header->snapc = kzalloc(size, GFP_KERNEL);
795 if (!header->snapc)
796 goto out_err;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700797
798 atomic_set(&header->snapc->nref, 1);
Alex Elder505cbb92012-07-19 08:49:18 -0500799 header->snapc->seq = le64_to_cpu(ondisk->snap_seq);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700800 header->snapc->num_snaps = snap_count;
Alex Elder621901d2012-08-23 23:22:06 -0500801 for (i = 0; i < snap_count; i++)
802 header->snapc->snaps[i] =
803 le64_to_cpu(ondisk->snaps[i].id);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700804
805 return 0;
806
Alex Elder6a523252012-07-19 17:12:59 -0500807out_err:
Alex Elder849b4262012-07-09 21:04:24 -0500808 kfree(header->snap_sizes);
Alex Elderccece232012-07-10 20:30:10 -0500809 header->snap_sizes = NULL;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700810 kfree(header->snap_names);
Alex Elderccece232012-07-10 20:30:10 -0500811 header->snap_names = NULL;
Alex Elder6a523252012-07-19 17:12:59 -0500812 kfree(header->object_prefix);
813 header->object_prefix = NULL;
Alex Elderccece232012-07-10 20:30:10 -0500814
Alex Elder00f1f362012-02-07 12:03:36 -0600815 return -ENOMEM;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700816}
817
Alex Elder9e15b772012-10-30 19:40:33 -0500818static const char *rbd_snap_name(struct rbd_device *rbd_dev, u64 snap_id)
819{
820 struct rbd_snap *snap;
821
822 if (snap_id == CEPH_NOSNAP)
823 return RBD_SNAP_HEAD_NAME;
824
825 list_for_each_entry(snap, &rbd_dev->snaps, node)
826 if (snap_id == snap->id)
827 return snap->name;
828
829 return NULL;
830}
831
Alex Elder8836b992012-08-30 14:42:15 -0500832static int snap_by_name(struct rbd_device *rbd_dev, const char *snap_name)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700833{
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700834
Alex Eldere86924a2012-07-10 20:30:11 -0500835 struct rbd_snap *snap;
Alex Elder00f1f362012-02-07 12:03:36 -0600836
Alex Eldere86924a2012-07-10 20:30:11 -0500837 list_for_each_entry(snap, &rbd_dev->snaps, node) {
838 if (!strcmp(snap_name, snap->name)) {
Alex Elder0d7dbfc2012-10-25 23:34:41 -0500839 rbd_dev->spec->snap_id = snap->id;
Alex Eldere86924a2012-07-10 20:30:11 -0500840 rbd_dev->mapping.size = snap->size;
Alex Elder34b13182012-07-13 20:35:12 -0500841 rbd_dev->mapping.features = snap->features;
Alex Elder00f1f362012-02-07 12:03:36 -0600842
Alex Eldere86924a2012-07-10 20:30:11 -0500843 return 0;
Alex Elder00f1f362012-02-07 12:03:36 -0600844 }
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700845 }
Alex Eldere86924a2012-07-10 20:30:11 -0500846
Alex Elder00f1f362012-02-07 12:03:36 -0600847 return -ENOENT;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700848}
849
Alex Elder819d52b2012-10-25 23:34:41 -0500850static int rbd_dev_set_mapping(struct rbd_device *rbd_dev)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700851{
Alex Elder78dc4472012-07-19 08:49:18 -0500852 int ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700853
Alex Elder0d7dbfc2012-10-25 23:34:41 -0500854 if (!memcmp(rbd_dev->spec->snap_name, RBD_SNAP_HEAD_NAME,
Josh Durgincc9d7342011-11-21 18:19:13 -0800855 sizeof (RBD_SNAP_HEAD_NAME))) {
Alex Elder0d7dbfc2012-10-25 23:34:41 -0500856 rbd_dev->spec->snap_id = CEPH_NOSNAP;
Alex Elder99c1f082012-08-30 14:42:15 -0500857 rbd_dev->mapping.size = rbd_dev->header.image_size;
Alex Elder34b13182012-07-13 20:35:12 -0500858 rbd_dev->mapping.features = rbd_dev->header.features;
Alex Eldere86924a2012-07-10 20:30:11 -0500859 ret = 0;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700860 } else {
Alex Elder0d7dbfc2012-10-25 23:34:41 -0500861 ret = snap_by_name(rbd_dev, rbd_dev->spec->snap_name);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700862 if (ret < 0)
863 goto done;
Alex Elderf84344f2012-08-31 17:29:51 -0500864 rbd_dev->mapping.read_only = true;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700865 }
Alex Elder6d292902013-01-14 12:43:31 -0600866 set_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags);
867
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700868done:
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700869 return ret;
870}
871
/*
 * Release every allocation made by rbd_header_from_disk(), leaving
 * each pointer NULL so a repeated call is a harmless no-op.
 */
static void rbd_header_free(struct rbd_image_header *header)
{
	kfree(header->object_prefix);
	header->object_prefix = NULL;
	kfree(header->snap_sizes);
	header->snap_sizes = NULL;
	kfree(header->snap_names);
	header->snap_names = NULL;
	/* The snap context is refcounted; this drops our reference */
	ceph_put_snap_context(header->snapc);
	header->snapc = NULL;
}
883
Alex Elder98571b52013-01-20 14:44:42 -0600884static const char *rbd_segment_name(struct rbd_device *rbd_dev, u64 offset)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700885{
Alex Elder65ccfe22012-08-09 10:33:26 -0700886 char *name;
887 u64 segment;
888 int ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700889
Alex Elder2fd82b92012-11-09 15:05:54 -0600890 name = kmalloc(MAX_OBJ_NAME_SIZE + 1, GFP_NOIO);
Alex Elder65ccfe22012-08-09 10:33:26 -0700891 if (!name)
892 return NULL;
893 segment = offset >> rbd_dev->header.obj_order;
Alex Elder2fd82b92012-11-09 15:05:54 -0600894 ret = snprintf(name, MAX_OBJ_NAME_SIZE + 1, "%s.%012llx",
Alex Elder65ccfe22012-08-09 10:33:26 -0700895 rbd_dev->header.object_prefix, segment);
Alex Elder2fd82b92012-11-09 15:05:54 -0600896 if (ret < 0 || ret > MAX_OBJ_NAME_SIZE) {
Alex Elder65ccfe22012-08-09 10:33:26 -0700897 pr_err("error formatting segment name for #%llu (%d)\n",
898 segment, ret);
899 kfree(name);
900 name = NULL;
901 }
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700902
Alex Elder65ccfe22012-08-09 10:33:26 -0700903 return name;
904}
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700905
Alex Elder65ccfe22012-08-09 10:33:26 -0700906static u64 rbd_segment_offset(struct rbd_device *rbd_dev, u64 offset)
907{
908 u64 segment_size = (u64) 1 << rbd_dev->header.obj_order;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700909
Alex Elder65ccfe22012-08-09 10:33:26 -0700910 return offset & (segment_size - 1);
911}
912
913static u64 rbd_segment_length(struct rbd_device *rbd_dev,
914 u64 offset, u64 length)
915{
916 u64 segment_size = (u64) 1 << rbd_dev->header.obj_order;
917
918 offset &= segment_size - 1;
919
Alex Elderaafb2302012-09-06 16:00:54 -0500920 rbd_assert(length <= U64_MAX - offset);
Alex Elder65ccfe22012-08-09 10:33:26 -0700921 if (offset + length > segment_size)
922 length = segment_size - offset;
923
924 return length;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700925}
926
927/*
Josh Durgin029bcbd2011-07-22 11:35:23 -0700928 * returns the size of an object in the image
929 */
930static u64 rbd_obj_bytes(struct rbd_image_header *header)
931{
932 return 1 << header->obj_order;
933}
934
935/*
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700936 * bio helpers
937 */
938
939static void bio_chain_put(struct bio *chain)
940{
941 struct bio *tmp;
942
943 while (chain) {
944 tmp = chain;
945 chain = chain->bi_next;
946 bio_put(tmp);
947 }
948}
949
950/*
951 * zeros a bio chain, starting at specific offset
952 */
953static void zero_bio_chain(struct bio *chain, int start_ofs)
954{
955 struct bio_vec *bv;
956 unsigned long flags;
957 void *buf;
958 int i;
959 int pos = 0;
960
961 while (chain) {
962 bio_for_each_segment(bv, chain, i) {
963 if (pos + bv->bv_len > start_ofs) {
964 int remainder = max(start_ofs - pos, 0);
965 buf = bvec_kmap_irq(bv, &flags);
966 memset(buf + remainder, 0,
967 bv->bv_len - remainder);
Dan Carpenter85b5aaa2010-10-11 21:15:11 +0200968 bvec_kunmap_irq(buf, &flags);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700969 }
970 pos += bv->bv_len;
971 }
972
973 chain = chain->bi_next;
974 }
975}
976
977/*
Alex Elderb9434c52013-04-19 15:34:50 -0500978 * similar to zero_bio_chain(), zeros data defined by a page array,
979 * starting at the given byte offset from the start of the array and
980 * continuing up to the given end offset. The pages array is
981 * assumed to be big enough to hold all bytes up to the end.
982 */
983static void zero_pages(struct page **pages, u64 offset, u64 end)
984{
985 struct page **page = &pages[offset >> PAGE_SHIFT];
986
987 rbd_assert(end > offset);
988 rbd_assert(end - offset <= (u64)SIZE_MAX);
989 while (offset < end) {
990 size_t page_offset;
991 size_t length;
992 unsigned long flags;
993 void *kaddr;
994
995 page_offset = (size_t)(offset & ~PAGE_MASK);
996 length = min(PAGE_SIZE - page_offset, (size_t)(end - offset));
997 local_irq_save(flags);
998 kaddr = kmap_atomic(*page);
999 memset(kaddr + page_offset, 0, length);
1000 kunmap_atomic(kaddr);
1001 local_irq_restore(flags);
1002
1003 offset += length;
1004 page++;
1005 }
1006}
1007
/*
 * Clone a portion of a bio, starting at the given byte offset
 * and continuing for the number of bytes indicated.  Returns the
 * new bio, or NULL on invalid arguments or allocation failure.
 */
static struct bio *bio_clone_range(struct bio *bio_src,
					unsigned int offset,
					unsigned int len,
					gfp_t gfpmask)
{
	struct bio_vec *bv;
	unsigned int resid;
	unsigned short idx;
	unsigned int voff;
	unsigned short end_idx;
	unsigned short vcnt;
	struct bio *bio;

	/* Handle the easy case for the caller */

	if (!offset && len == bio_src->bi_size)
		return bio_clone(bio_src, gfpmask);

	/* Reject a zero-length clone or one extending past the source */

	if (WARN_ON_ONCE(!len))
		return NULL;
	if (WARN_ON_ONCE(len > bio_src->bi_size))
		return NULL;
	if (WARN_ON_ONCE(offset > bio_src->bi_size - len))
		return NULL;

	/* Find first affected segment... */

	resid = offset;
	__bio_for_each_segment(bv, bio_src, idx, 0) {
		if (resid < bv->bv_len)
			break;
		resid -= bv->bv_len;
	}
	voff = resid;	/* byte offset of the clone within segment idx */

	/* ...and the last affected segment */

	resid += len;
	__bio_for_each_segment(bv, bio_src, end_idx, idx) {
		if (resid <= bv->bv_len)
			break;
		resid -= bv->bv_len;
	}
	vcnt = end_idx - idx + 1;	/* number of segments spanned */

	/* Build the clone */

	bio = bio_alloc(gfpmask, (unsigned int) vcnt);
	if (!bio)
		return NULL;	/* ENOMEM */

	bio->bi_bdev = bio_src->bi_bdev;
	bio->bi_sector = bio_src->bi_sector + (offset >> SECTOR_SHIFT);
	bio->bi_rw = bio_src->bi_rw;
	bio->bi_flags |= 1 << BIO_CLONED;

	/*
	 * Copy over our part of the bio_vec, then update the first
	 * and last (or only) entries.  After the loops above, resid
	 * holds the number of clone bytes falling in the last segment.
	 */
	memcpy(&bio->bi_io_vec[0], &bio_src->bi_io_vec[idx],
		vcnt * sizeof (struct bio_vec));
	bio->bi_io_vec[0].bv_offset += voff;
	if (vcnt > 1) {
		bio->bi_io_vec[0].bv_len -= voff;
		bio->bi_io_vec[vcnt - 1].bv_len = resid;
	} else {
		bio->bi_io_vec[0].bv_len = len;
	}

	bio->bi_vcnt = vcnt;
	bio->bi_size = len;
	bio->bi_idx = 0;

	return bio;
}
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001088
/*
 * Clone a portion of a bio chain, starting at the given byte offset
 * into the first bio in the source chain and continuing for the
 * number of bytes indicated.  The result is another bio chain of
 * exactly the given length, or a null pointer on error.
 *
 * The bio_src and offset parameters are both in-out.  On entry they
 * refer to the first source bio and the offset into that bio where
 * the start of data to be cloned is located.
 *
 * On return, bio_src is updated to refer to the bio in the source
 * chain that contains first un-cloned byte, and *offset will
 * contain the offset of that byte within that bio.
 */
static struct bio *bio_chain_clone_range(struct bio **bio_src,
					unsigned int *offset,
					unsigned int len,
					gfp_t gfpmask)
{
	struct bio *bi = *bio_src;
	unsigned int off = *offset;
	struct bio *chain = NULL;
	struct bio **end;	/* where to link the next clone */

	/* Build up a chain of clone bios up to the limit */

	if (!bi || off >= bi->bi_size || !len)
		return NULL;	/* Nothing to clone */

	end = &chain;
	while (len) {
		unsigned int bi_size;
		struct bio *bio;

		if (!bi) {
			/* Source chain ended before len bytes were cloned */
			rbd_warn(NULL, "bio_chain exhausted with %u left", len);
			goto out_err;	/* EINVAL; ran out of bio's */
		}
		/* Clone no more than remains of this bio, or of len */
		bi_size = min_t(unsigned int, bi->bi_size - off, len);
		bio = bio_clone_range(bi, off, bi_size, gfpmask);
		if (!bio)
			goto out_err;	/* ENOMEM */

		*end = bio;
		end = &bio->bi_next;

		/* Advance to the next source bio once this one is consumed */
		off += bi_size;
		if (off == bi->bi_size) {
			bi = bi->bi_next;
			off = 0;
		}
		len -= bi_size;
	}
	*bio_src = bi;
	*offset = off;

	return chain;
out_err:
	/* Release every clone built so far */
	bio_chain_put(chain);

	return NULL;
}
1151
Alex Elder926f9b32013-02-11 12:33:24 -06001152/*
1153 * The default/initial value for all object request flags is 0. For
1154 * each flag, once its value is set to 1 it is never reset to 0
1155 * again.
1156 */
Alex Elder6365d332013-02-11 12:33:24 -06001157static void obj_request_img_data_set(struct rbd_obj_request *obj_request)
1158{
1159 if (test_and_set_bit(OBJ_REQ_IMG_DATA, &obj_request->flags)) {
Alex Elder6365d332013-02-11 12:33:24 -06001160 struct rbd_device *rbd_dev;
1161
Alex Elder57acbaa2013-02-11 12:33:24 -06001162 rbd_dev = obj_request->img_request->rbd_dev;
Alex Elder6365d332013-02-11 12:33:24 -06001163 rbd_warn(rbd_dev, "obj_request %p already marked img_data\n",
1164 obj_request);
1165 }
1166}
1167
/* Return true once obj_request_img_data_set() has marked the request */
static bool obj_request_img_data_test(struct rbd_obj_request *obj_request)
{
	smp_mb();	/* order against the flag-setting path */
	return test_bit(OBJ_REQ_IMG_DATA, &obj_request->flags) != 0;
}
1173
Alex Elder57acbaa2013-02-11 12:33:24 -06001174static void obj_request_done_set(struct rbd_obj_request *obj_request)
1175{
1176 if (test_and_set_bit(OBJ_REQ_DONE, &obj_request->flags)) {
1177 struct rbd_device *rbd_dev = NULL;
1178
1179 if (obj_request_img_data_test(obj_request))
1180 rbd_dev = obj_request->img_request->rbd_dev;
1181 rbd_warn(rbd_dev, "obj_request %p already marked done\n",
1182 obj_request);
1183 }
1184}
1185
/* Return true once the object request has been marked done */
static bool obj_request_done_test(struct rbd_obj_request *obj_request)
{
	smp_mb();	/* order against obj_request_done_set() */
	return test_bit(OBJ_REQ_DONE, &obj_request->flags) != 0;
}
1191
Alex Elder5679c592013-02-11 12:33:24 -06001192/*
1193 * This sets the KNOWN flag after (possibly) setting the EXISTS
1194 * flag. The latter is set based on the "exists" value provided.
1195 *
1196 * Note that for our purposes once an object exists it never goes
1197 * away again. It's possible that the response from two existence
1198 * checks are separated by the creation of the target object, and
1199 * the first ("doesn't exist") response arrives *after* the second
1200 * ("does exist"). In that case we ignore the second one.
1201 */
static void obj_request_existence_set(struct rbd_obj_request *obj_request,
					bool exists)
{
	if (exists)
		set_bit(OBJ_REQ_EXISTS, &obj_request->flags);
	/* KNOWN is set regardless of the existence outcome */
	set_bit(OBJ_REQ_KNOWN, &obj_request->flags);
	smp_mb();	/* order the flag updates ahead of later tests */
}
1210
/* Return true once an existence check result has been recorded */
static bool obj_request_known_test(struct rbd_obj_request *obj_request)
{
	smp_mb();	/* order against obj_request_existence_set() */
	return test_bit(OBJ_REQ_KNOWN, &obj_request->flags) != 0;
}
1216
/* Return true if the target object is known to exist; only
 * meaningful once obj_request_known_test() is also true */
static bool obj_request_exists_test(struct rbd_obj_request *obj_request)
{
	smp_mb();	/* order against obj_request_existence_set() */
	return test_bit(OBJ_REQ_EXISTS, &obj_request->flags) != 0;
}
1222
/* Take a reference on an object request (logs the prior refcount) */
static void rbd_obj_request_get(struct rbd_obj_request *obj_request)
{
	dout("%s: obj %p (was %d)\n", __func__, obj_request,
		atomic_read(&obj_request->kref.refcount));
	kref_get(&obj_request->kref);
}
1229
static void rbd_obj_request_destroy(struct kref *kref);
/* Drop a reference on an object request; the last put destroys it */
static void rbd_obj_request_put(struct rbd_obj_request *obj_request)
{
	rbd_assert(obj_request != NULL);
	dout("%s: obj %p (was %d)\n", __func__, obj_request,
		atomic_read(&obj_request->kref.refcount));
	kref_put(&obj_request->kref, rbd_obj_request_destroy);
}
1238
/* Take a reference on an image request (logs the prior refcount) */
static void rbd_img_request_get(struct rbd_img_request *img_request)
{
	dout("%s: img %p (was %d)\n", __func__, img_request,
		atomic_read(&img_request->kref.refcount));
	kref_get(&img_request->kref);
}
1245
static void rbd_img_request_destroy(struct kref *kref);
/* Drop a reference on an image request; the last put destroys it */
static void rbd_img_request_put(struct rbd_img_request *img_request)
{
	rbd_assert(img_request != NULL);
	dout("%s: img %p (was %d)\n", __func__, img_request,
		atomic_read(&img_request->kref.refcount));
	kref_put(&img_request->kref, rbd_img_request_destroy);
}
1254
/*
 * Append an object request to an image request's list.  The image
 * request takes over the object's original reference, and the
 * object's "which" becomes its index within the image request.
 */
static inline void rbd_img_obj_request_add(struct rbd_img_request *img_request,
					struct rbd_obj_request *obj_request)
{
	rbd_assert(obj_request->img_request == NULL);

	/* Image request now owns object's original reference */
	obj_request->img_request = img_request;
	obj_request->which = img_request->obj_request_count;
	rbd_assert(!obj_request_img_data_test(obj_request));
	obj_request_img_data_set(obj_request);
	rbd_assert(obj_request->which != BAD_WHICH);
	img_request->obj_request_count++;
	list_add_tail(&obj_request->links, &img_request->obj_requests);
	dout("%s: img %p obj %p w=%u\n", __func__, img_request, obj_request,
		obj_request->which);
}
1271
/*
 * Remove an object request from its image request and drop the
 * image request's reference to it.  The asserts require that the
 * removed request be the most recently added one (its "which"
 * must equal obj_request_count after the decrement).
 */
static inline void rbd_img_obj_request_del(struct rbd_img_request *img_request,
					struct rbd_obj_request *obj_request)
{
	rbd_assert(obj_request->which != BAD_WHICH);

	dout("%s: img %p obj %p w=%u\n", __func__, img_request, obj_request,
		obj_request->which);
	list_del(&obj_request->links);
	rbd_assert(img_request->obj_request_count > 0);
	img_request->obj_request_count--;
	rbd_assert(obj_request->which == img_request->obj_request_count);
	obj_request->which = BAD_WHICH;
	rbd_assert(obj_request_img_data_test(obj_request));
	rbd_assert(obj_request->img_request == img_request);
	obj_request->img_request = NULL;
	obj_request->callback = NULL;
	rbd_obj_request_put(obj_request);
}
1290
1291static bool obj_request_type_valid(enum obj_request_type type)
1292{
1293 switch (type) {
Alex Elder9969ebc2013-01-18 12:31:10 -06001294 case OBJ_REQUEST_NODATA:
Alex Elderbf0d5f502012-11-22 00:00:08 -06001295 case OBJ_REQUEST_BIO:
Alex Elder788e2df2013-01-17 12:25:27 -06001296 case OBJ_REQUEST_PAGES:
Alex Elderbf0d5f502012-11-22 00:00:08 -06001297 return true;
1298 default:
1299 return false;
1300 }
1301}
1302
/* Hand an object request's prepared osd request to the osd client */
static int rbd_obj_request_submit(struct ceph_osd_client *osdc,
				struct rbd_obj_request *obj_request)
{
	dout("%s: osdc %p obj %p\n", __func__, osdc, obj_request);

	/* NOTE(review): third argument presumably "nofail" -- confirm
	 * against ceph_osdc_start_request() */
	return ceph_osdc_start_request(osdc, obj_request->osd_req, false);
}
1310
1311static void rbd_img_request_complete(struct rbd_img_request *img_request)
1312{
Alex Elder55f27e02013-04-10 12:34:25 -05001313
Alex Elder37206ee2013-02-20 17:32:08 -06001314 dout("%s: img %p\n", __func__, img_request);
Alex Elder55f27e02013-04-10 12:34:25 -05001315
1316 /*
1317 * If no error occurred, compute the aggregate transfer
1318 * count for the image request. We could instead use
1319 * atomic64_cmpxchg() to update it as each object request
1320 * completes; not clear which way is better off hand.
1321 */
1322 if (!img_request->result) {
1323 struct rbd_obj_request *obj_request;
1324 u64 xferred = 0;
1325
1326 for_each_obj_request(img_request, obj_request)
1327 xferred += obj_request->xferred;
1328 img_request->xferred = xferred;
1329 }
1330
Alex Elderbf0d5f502012-11-22 00:00:08 -06001331 if (img_request->callback)
1332 img_request->callback(img_request);
1333 else
1334 rbd_img_request_put(img_request);
1335}
1336
Alex Elder788e2df2013-01-17 12:25:27 -06001337/* Caller is responsible for rbd_obj_request_destroy(obj_request) */
1338
/* Block until the object request completes; interruptible, so a
 * nonzero return means a signal arrived before completion */
static int rbd_obj_request_wait(struct rbd_obj_request *obj_request)
{
	dout("%s: obj %p\n", __func__, obj_request);

	return wait_for_completion_interruptible(&obj_request->completion);
}
1345
Alex Elder0c425242013-02-08 09:55:49 -06001346/*
1347 * The default/initial value for all image request flags is 0. Each
1348 * is conditionally set to 1 at image request initialization time
1349 * and currently never change thereafter.
1350 */
/* Mark this image request as a write (set once, never cleared) */
static void img_request_write_set(struct rbd_img_request *img_request)
{
	set_bit(IMG_REQ_WRITE, &img_request->flags);
	smp_mb();	/* matching barrier in img_request_write_test() */
}
1356
/* Return true if the image request was marked as a write */
static bool img_request_write_test(struct rbd_img_request *img_request)
{
	smp_mb();	/* matching barrier in img_request_write_set() */
	return test_bit(IMG_REQ_WRITE, &img_request->flags) != 0;
}
1362
/* Mark this image request as a child request (set once, never cleared) */
static void img_request_child_set(struct rbd_img_request *img_request)
{
	set_bit(IMG_REQ_CHILD, &img_request->flags);
	smp_mb();	/* matching barrier in img_request_child_test() */
}
1368
/* Return true if the image request was marked as a child request */
static bool img_request_child_test(struct rbd_img_request *img_request)
{
	smp_mb();	/* matching barrier in img_request_child_set() */
	return test_bit(IMG_REQ_CHILD, &img_request->flags) != 0;
}
1374
/* Mark this image request as layered (set once, never cleared) */
static void img_request_layered_set(struct rbd_img_request *img_request)
{
	set_bit(IMG_REQ_LAYERED, &img_request->flags);
	smp_mb();	/* matching barrier in img_request_layered_test() */
}
1380
/* Return true if the image request was marked as layered */
static bool img_request_layered_test(struct rbd_img_request *img_request)
{
	smp_mb();	/* matching barrier in img_request_layered_set() */
	return test_bit(IMG_REQ_LAYERED, &img_request->flags) != 0;
}
1386
/* Finish an image-data read: zero-fill holes and short reads, then
 * mark the object request done */
static void
rbd_img_obj_request_read_callback(struct rbd_obj_request *obj_request)
{
	u64 xferred = obj_request->xferred;
	u64 length = obj_request->length;

	dout("%s: obj %p img %p result %d %llu/%llu\n", __func__,
		obj_request, obj_request->img_request, obj_request->result,
		xferred, length);
	/*
	 * ENOENT means a hole in the image. We zero-fill the
	 * entire length of the request. A short read also implies
	 * zero-fill to the end of the request. Either way we
	 * update the xferred count to indicate the whole request
	 * was satisfied.
	 */
	rbd_assert(obj_request->type != OBJ_REQUEST_NODATA);
	if (obj_request->result == -ENOENT) {
		/* Hole: zero the whole request (bio chain or page array) */
		if (obj_request->type == OBJ_REQUEST_BIO)
			zero_bio_chain(obj_request->bio_list, 0);
		else
			zero_pages(obj_request->pages, 0, length);
		obj_request->result = 0;
		obj_request->xferred = length;
	} else if (xferred < length && !obj_request->result) {
		/* Successful short read: zero from xferred to the end */
		if (obj_request->type == OBJ_REQUEST_BIO)
			zero_bio_chain(obj_request->bio_list, xferred);
		else
			zero_pages(obj_request->pages, xferred, length);
		obj_request->xferred = length;
	}
	obj_request_done_set(obj_request);
}
1420
/* Deliver an object request's completion: invoke its callback if it
 * has one, otherwise wake anyone in rbd_obj_request_wait() */
static void rbd_obj_request_complete(struct rbd_obj_request *obj_request)
{
	dout("%s: obj %p cb %p\n", __func__, obj_request,
		obj_request->callback);
	if (obj_request->callback)
		obj_request->callback(obj_request);
	else
		complete_all(&obj_request->completion);
}
1430
/* Generic osd op completion: no result processing, just mark done */
static void rbd_osd_trivial_callback(struct rbd_obj_request *obj_request)
{
	dout("%s: obj %p\n", __func__, obj_request);
	obj_request_done_set(obj_request);
}
1436
Alex Elderc47f9372013-02-26 14:23:07 -06001437static void rbd_osd_read_callback(struct rbd_obj_request *obj_request)
Alex Elderbf0d5f502012-11-22 00:00:08 -06001438{
Alex Elder57acbaa2013-02-11 12:33:24 -06001439 struct rbd_img_request *img_request = NULL;
1440 bool layered = false;
1441
1442 if (obj_request_img_data_test(obj_request)) {
1443 img_request = obj_request->img_request;
1444 layered = img_request && img_request_layered_test(img_request);
1445 } else {
1446 img_request = NULL;
1447 layered = false;
1448 }
Alex Elder8b3e1a52013-01-24 16:13:36 -06001449
1450 dout("%s: obj %p img %p result %d %llu/%llu\n", __func__,
1451 obj_request, img_request, obj_request->result,
1452 obj_request->xferred, obj_request->length);
1453 if (layered && obj_request->result == -ENOENT)
1454 rbd_img_parent_read(obj_request);
1455 else if (img_request)
Alex Elder6e2a4502013-03-27 09:16:30 -05001456 rbd_img_obj_request_read_callback(obj_request);
1457 else
1458 obj_request_done_set(obj_request);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001459}
1460
/* Completion handler for an osd write op */
static void rbd_osd_write_callback(struct rbd_obj_request *obj_request)
{
	dout("%s: obj %p result %d %llu\n", __func__, obj_request,
		obj_request->result, obj_request->length);
	/*
	 * There is no such thing as a successful short write. Set
	 * it to our originally-requested length.
	 */
	obj_request->xferred = obj_request->length;
	obj_request_done_set(obj_request);
}
1472
Alex Elderfbfab532013-02-08 09:55:48 -06001473/*
1474 * For a simple stat call there's nothing to do. We'll do more if
1475 * this is part of a write sequence for a layered image.
1476 */
Alex Elderc47f9372013-02-26 14:23:07 -06001477static void rbd_osd_stat_callback(struct rbd_obj_request *obj_request)
Alex Elderfbfab532013-02-08 09:55:48 -06001478{
Alex Elder37206ee2013-02-20 17:32:08 -06001479 dout("%s: obj %p\n", __func__, obj_request);
Alex Elderfbfab532013-02-08 09:55:48 -06001480 obj_request_done_set(obj_request);
1481}
1482
Alex Elderbf0d5f502012-11-22 00:00:08 -06001483static void rbd_osd_req_callback(struct ceph_osd_request *osd_req,
1484 struct ceph_msg *msg)
1485{
1486 struct rbd_obj_request *obj_request = osd_req->r_priv;
Alex Elderbf0d5f502012-11-22 00:00:08 -06001487 u16 opcode;
1488
Alex Elder37206ee2013-02-20 17:32:08 -06001489 dout("%s: osd_req %p msg %p\n", __func__, osd_req, msg);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001490 rbd_assert(osd_req == obj_request->osd_req);
Alex Elder57acbaa2013-02-11 12:33:24 -06001491 if (obj_request_img_data_test(obj_request)) {
1492 rbd_assert(obj_request->img_request);
1493 rbd_assert(obj_request->which != BAD_WHICH);
1494 } else {
1495 rbd_assert(obj_request->which == BAD_WHICH);
1496 }
Alex Elderbf0d5f502012-11-22 00:00:08 -06001497
Sage Weil1b83bef2013-02-25 16:11:12 -08001498 if (osd_req->r_result < 0)
1499 obj_request->result = osd_req->r_result;
Alex Elderbf0d5f502012-11-22 00:00:08 -06001500 obj_request->version = le64_to_cpu(osd_req->r_reassert_version.version);
1501
Alex Elder0eefd472013-04-19 15:34:50 -05001502 BUG_ON(osd_req->r_num_ops > 2);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001503
Alex Elderc47f9372013-02-26 14:23:07 -06001504 /*
1505 * We support a 64-bit length, but ultimately it has to be
1506 * passed to blk_end_request(), which takes an unsigned int.
1507 */
Sage Weil1b83bef2013-02-25 16:11:12 -08001508 obj_request->xferred = osd_req->r_reply_op_len[0];
Alex Elder8b3e1a52013-01-24 16:13:36 -06001509 rbd_assert(obj_request->xferred < (u64)UINT_MAX);
Alex Elder79528732013-04-03 21:32:51 -05001510 opcode = osd_req->r_ops[0].op;
Alex Elderbf0d5f502012-11-22 00:00:08 -06001511 switch (opcode) {
1512 case CEPH_OSD_OP_READ:
Alex Elderc47f9372013-02-26 14:23:07 -06001513 rbd_osd_read_callback(obj_request);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001514 break;
1515 case CEPH_OSD_OP_WRITE:
Alex Elderc47f9372013-02-26 14:23:07 -06001516 rbd_osd_write_callback(obj_request);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001517 break;
Alex Elderfbfab532013-02-08 09:55:48 -06001518 case CEPH_OSD_OP_STAT:
Alex Elderc47f9372013-02-26 14:23:07 -06001519 rbd_osd_stat_callback(obj_request);
Alex Elderfbfab532013-02-08 09:55:48 -06001520 break;
Alex Elder36be9a72013-01-19 00:30:28 -06001521 case CEPH_OSD_OP_CALL:
Alex Elderb8d70032012-11-30 17:53:04 -06001522 case CEPH_OSD_OP_NOTIFY_ACK:
Alex Elder9969ebc2013-01-18 12:31:10 -06001523 case CEPH_OSD_OP_WATCH:
Alex Elderc47f9372013-02-26 14:23:07 -06001524 rbd_osd_trivial_callback(obj_request);
Alex Elder9969ebc2013-01-18 12:31:10 -06001525 break;
Alex Elderbf0d5f502012-11-22 00:00:08 -06001526 default:
1527 rbd_warn(NULL, "%s: unsupported op %hu\n",
1528 obj_request->object_name, (unsigned short) opcode);
1529 break;
1530 }
1531
Alex Elder07741302013-02-05 23:41:50 -06001532 if (obj_request_done_test(obj_request))
Alex Elderbf0d5f502012-11-22 00:00:08 -06001533 rbd_obj_request_complete(obj_request);
1534}
1535
Alex Elder9d4df012013-04-19 15:34:50 -05001536static void rbd_osd_req_format_read(struct rbd_obj_request *obj_request)
Alex Elder430c28c2013-04-03 21:32:51 -05001537{
1538 struct rbd_img_request *img_request = obj_request->img_request;
Alex Elder8c042b02013-04-03 01:28:58 -05001539 struct ceph_osd_request *osd_req = obj_request->osd_req;
Alex Elder9d4df012013-04-19 15:34:50 -05001540 u64 snap_id;
Alex Elder430c28c2013-04-03 21:32:51 -05001541
Alex Elder8c042b02013-04-03 01:28:58 -05001542 rbd_assert(osd_req != NULL);
Alex Elder430c28c2013-04-03 21:32:51 -05001543
Alex Elder9d4df012013-04-19 15:34:50 -05001544 snap_id = img_request ? img_request->snap_id : CEPH_NOSNAP;
Alex Elder8c042b02013-04-03 01:28:58 -05001545 ceph_osdc_build_request(osd_req, obj_request->offset,
Alex Elder9d4df012013-04-19 15:34:50 -05001546 NULL, snap_id, NULL);
1547}
1548
1549static void rbd_osd_req_format_write(struct rbd_obj_request *obj_request)
1550{
1551 struct rbd_img_request *img_request = obj_request->img_request;
1552 struct ceph_osd_request *osd_req = obj_request->osd_req;
1553 struct ceph_snap_context *snapc;
1554 struct timespec mtime = CURRENT_TIME;
1555
1556 rbd_assert(osd_req != NULL);
1557
1558 snapc = img_request ? img_request->snapc : NULL;
1559 ceph_osdc_build_request(osd_req, obj_request->offset,
1560 snapc, CEPH_NOSNAP, &mtime);
Alex Elder430c28c2013-04-03 21:32:51 -05001561}
1562
Alex Elderbf0d5f502012-11-22 00:00:08 -06001563static struct ceph_osd_request *rbd_osd_req_create(
1564 struct rbd_device *rbd_dev,
1565 bool write_request,
Alex Elder430c28c2013-04-03 21:32:51 -05001566 struct rbd_obj_request *obj_request)
Alex Elderbf0d5f502012-11-22 00:00:08 -06001567{
Alex Elderbf0d5f502012-11-22 00:00:08 -06001568 struct ceph_snap_context *snapc = NULL;
1569 struct ceph_osd_client *osdc;
1570 struct ceph_osd_request *osd_req;
Alex Elderbf0d5f502012-11-22 00:00:08 -06001571
Alex Elder6365d332013-02-11 12:33:24 -06001572 if (obj_request_img_data_test(obj_request)) {
1573 struct rbd_img_request *img_request = obj_request->img_request;
1574
Alex Elder0c425242013-02-08 09:55:49 -06001575 rbd_assert(write_request ==
1576 img_request_write_test(img_request));
1577 if (write_request)
Alex Elderbf0d5f502012-11-22 00:00:08 -06001578 snapc = img_request->snapc;
Alex Elderbf0d5f502012-11-22 00:00:08 -06001579 }
1580
1581 /* Allocate and initialize the request, for the single op */
1582
1583 osdc = &rbd_dev->rbd_client->client->osdc;
1584 osd_req = ceph_osdc_alloc_request(osdc, snapc, 1, false, GFP_ATOMIC);
1585 if (!osd_req)
1586 return NULL; /* ENOMEM */
Alex Elderbf0d5f502012-11-22 00:00:08 -06001587
Alex Elder430c28c2013-04-03 21:32:51 -05001588 if (write_request)
Alex Elderbf0d5f502012-11-22 00:00:08 -06001589 osd_req->r_flags = CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK;
Alex Elder430c28c2013-04-03 21:32:51 -05001590 else
Alex Elderbf0d5f502012-11-22 00:00:08 -06001591 osd_req->r_flags = CEPH_OSD_FLAG_READ;
Alex Elderbf0d5f502012-11-22 00:00:08 -06001592
1593 osd_req->r_callback = rbd_osd_req_callback;
1594 osd_req->r_priv = obj_request;
1595
1596 osd_req->r_oid_len = strlen(obj_request->object_name);
1597 rbd_assert(osd_req->r_oid_len < sizeof (osd_req->r_oid));
1598 memcpy(osd_req->r_oid, obj_request->object_name, osd_req->r_oid_len);
1599
1600 osd_req->r_file_layout = rbd_dev->layout; /* struct */
1601
Alex Elderbf0d5f502012-11-22 00:00:08 -06001602 return osd_req;
1603}
1604
Alex Elder0eefd472013-04-19 15:34:50 -05001605/*
1606 * Create a copyup osd request based on the information in the
1607 * object request supplied. A copyup request has two osd ops,
1608 * a copyup method call, and a "normal" write request.
1609 */
1610static struct ceph_osd_request *
1611rbd_osd_req_create_copyup(struct rbd_obj_request *obj_request)
1612{
1613 struct rbd_img_request *img_request;
1614 struct ceph_snap_context *snapc;
1615 struct rbd_device *rbd_dev;
1616 struct ceph_osd_client *osdc;
1617 struct ceph_osd_request *osd_req;
1618
1619 rbd_assert(obj_request_img_data_test(obj_request));
1620 img_request = obj_request->img_request;
1621 rbd_assert(img_request);
1622 rbd_assert(img_request_write_test(img_request));
1623
1624 /* Allocate and initialize the request, for the two ops */
1625
1626 snapc = img_request->snapc;
1627 rbd_dev = img_request->rbd_dev;
1628 osdc = &rbd_dev->rbd_client->client->osdc;
1629 osd_req = ceph_osdc_alloc_request(osdc, snapc, 2, false, GFP_ATOMIC);
1630 if (!osd_req)
1631 return NULL; /* ENOMEM */
1632
1633 osd_req->r_flags = CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK;
1634 osd_req->r_callback = rbd_osd_req_callback;
1635 osd_req->r_priv = obj_request;
1636
1637 osd_req->r_oid_len = strlen(obj_request->object_name);
1638 rbd_assert(osd_req->r_oid_len < sizeof (osd_req->r_oid));
1639 memcpy(osd_req->r_oid, obj_request->object_name, osd_req->r_oid_len);
1640
1641 osd_req->r_file_layout = rbd_dev->layout; /* struct */
1642
1643 return osd_req;
1644}
1645
1646
Alex Elderbf0d5f502012-11-22 00:00:08 -06001647static void rbd_osd_req_destroy(struct ceph_osd_request *osd_req)
1648{
1649 ceph_osdc_put_request(osd_req);
1650}
1651
1652/* object_name is assumed to be a non-null pointer and NUL-terminated */
1653
1654static struct rbd_obj_request *rbd_obj_request_create(const char *object_name,
1655 u64 offset, u64 length,
1656 enum obj_request_type type)
1657{
1658 struct rbd_obj_request *obj_request;
1659 size_t size;
1660 char *name;
1661
1662 rbd_assert(obj_request_type_valid(type));
1663
1664 size = strlen(object_name) + 1;
1665 obj_request = kzalloc(sizeof (*obj_request) + size, GFP_KERNEL);
1666 if (!obj_request)
1667 return NULL;
1668
1669 name = (char *)(obj_request + 1);
1670 obj_request->object_name = memcpy(name, object_name, size);
1671 obj_request->offset = offset;
1672 obj_request->length = length;
Alex Elder926f9b32013-02-11 12:33:24 -06001673 obj_request->flags = 0;
Alex Elderbf0d5f502012-11-22 00:00:08 -06001674 obj_request->which = BAD_WHICH;
1675 obj_request->type = type;
1676 INIT_LIST_HEAD(&obj_request->links);
Alex Elder788e2df2013-01-17 12:25:27 -06001677 init_completion(&obj_request->completion);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001678 kref_init(&obj_request->kref);
1679
Alex Elder37206ee2013-02-20 17:32:08 -06001680 dout("%s: \"%s\" %llu/%llu %d -> obj %p\n", __func__, object_name,
1681 offset, length, (int)type, obj_request);
1682
Alex Elderbf0d5f502012-11-22 00:00:08 -06001683 return obj_request;
1684}
1685
1686static void rbd_obj_request_destroy(struct kref *kref)
1687{
1688 struct rbd_obj_request *obj_request;
1689
1690 obj_request = container_of(kref, struct rbd_obj_request, kref);
1691
Alex Elder37206ee2013-02-20 17:32:08 -06001692 dout("%s: obj %p\n", __func__, obj_request);
1693
Alex Elderbf0d5f502012-11-22 00:00:08 -06001694 rbd_assert(obj_request->img_request == NULL);
1695 rbd_assert(obj_request->which == BAD_WHICH);
1696
1697 if (obj_request->osd_req)
1698 rbd_osd_req_destroy(obj_request->osd_req);
1699
1700 rbd_assert(obj_request_type_valid(obj_request->type));
1701 switch (obj_request->type) {
Alex Elder9969ebc2013-01-18 12:31:10 -06001702 case OBJ_REQUEST_NODATA:
1703 break; /* Nothing to do */
Alex Elderbf0d5f502012-11-22 00:00:08 -06001704 case OBJ_REQUEST_BIO:
1705 if (obj_request->bio_list)
1706 bio_chain_put(obj_request->bio_list);
1707 break;
Alex Elder788e2df2013-01-17 12:25:27 -06001708 case OBJ_REQUEST_PAGES:
1709 if (obj_request->pages)
1710 ceph_release_page_vector(obj_request->pages,
1711 obj_request->page_count);
1712 break;
Alex Elderbf0d5f502012-11-22 00:00:08 -06001713 }
1714
1715 kfree(obj_request);
1716}
1717
1718/*
1719 * Caller is responsible for filling in the list of object requests
1720 * that comprises the image request, and the Linux request pointer
1721 * (if there is one).
1722 */
Alex Eldercc344fa2013-02-19 12:25:56 -06001723static struct rbd_img_request *rbd_img_request_create(
1724 struct rbd_device *rbd_dev,
Alex Elderbf0d5f502012-11-22 00:00:08 -06001725 u64 offset, u64 length,
Alex Elder9849e982013-01-24 16:13:36 -06001726 bool write_request,
1727 bool child_request)
Alex Elderbf0d5f502012-11-22 00:00:08 -06001728{
1729 struct rbd_img_request *img_request;
1730 struct ceph_snap_context *snapc = NULL;
1731
1732 img_request = kmalloc(sizeof (*img_request), GFP_ATOMIC);
1733 if (!img_request)
1734 return NULL;
1735
1736 if (write_request) {
1737 down_read(&rbd_dev->header_rwsem);
1738 snapc = ceph_get_snap_context(rbd_dev->header.snapc);
1739 up_read(&rbd_dev->header_rwsem);
1740 if (WARN_ON(!snapc)) {
1741 kfree(img_request);
1742 return NULL; /* Shouldn't happen */
1743 }
Alex Elder0c425242013-02-08 09:55:49 -06001744
Alex Elderbf0d5f502012-11-22 00:00:08 -06001745 }
1746
1747 img_request->rq = NULL;
1748 img_request->rbd_dev = rbd_dev;
1749 img_request->offset = offset;
1750 img_request->length = length;
Alex Elder0c425242013-02-08 09:55:49 -06001751 img_request->flags = 0;
1752 if (write_request) {
1753 img_request_write_set(img_request);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001754 img_request->snapc = snapc;
Alex Elder0c425242013-02-08 09:55:49 -06001755 } else {
Alex Elderbf0d5f502012-11-22 00:00:08 -06001756 img_request->snap_id = rbd_dev->spec->snap_id;
Alex Elder0c425242013-02-08 09:55:49 -06001757 }
Alex Elder9849e982013-01-24 16:13:36 -06001758 if (child_request)
1759 img_request_child_set(img_request);
Alex Elderd0b2e942013-01-24 16:13:36 -06001760 if (rbd_dev->parent_spec)
1761 img_request_layered_set(img_request);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001762 spin_lock_init(&img_request->completion_lock);
1763 img_request->next_completion = 0;
1764 img_request->callback = NULL;
Alex Eldera5a337d2013-01-24 16:13:36 -06001765 img_request->result = 0;
Alex Elderbf0d5f502012-11-22 00:00:08 -06001766 img_request->obj_request_count = 0;
1767 INIT_LIST_HEAD(&img_request->obj_requests);
1768 kref_init(&img_request->kref);
1769
1770 rbd_img_request_get(img_request); /* Avoid a warning */
1771 rbd_img_request_put(img_request); /* TEMPORARY */
1772
Alex Elder37206ee2013-02-20 17:32:08 -06001773 dout("%s: rbd_dev %p %s %llu/%llu -> img %p\n", __func__, rbd_dev,
1774 write_request ? "write" : "read", offset, length,
1775 img_request);
1776
Alex Elderbf0d5f502012-11-22 00:00:08 -06001777 return img_request;
1778}
1779
/*
 * Final teardown of an image request, called when its last kref
 * reference is dropped.  Detaches and releases all of its object
 * requests, drops the snapshot context (writes only), and for a
 * child request drops the reference on the originating object
 * request.
 */
static void rbd_img_request_destroy(struct kref *kref)
{
	struct rbd_img_request *img_request;
	struct rbd_obj_request *obj_request;
	struct rbd_obj_request *next_obj_request;

	img_request = container_of(kref, struct rbd_img_request, kref);

	dout("%s: img %p\n", __func__, img_request);

	for_each_obj_request_safe(img_request, obj_request, next_obj_request)
		rbd_img_obj_request_del(img_request, obj_request);
	rbd_assert(img_request->obj_request_count == 0);

	/* Only write requests hold a snapshot context reference */
	if (img_request_write_test(img_request))
		ceph_put_snap_context(img_request->snapc);

	/* A child request holds a reference on its parent's obj request */
	if (img_request_child_test(img_request))
		rbd_obj_request_put(img_request->obj_request);

	kfree(img_request);
}
1802
Alex Elder12178572013-02-08 09:55:49 -06001803static bool rbd_img_obj_end_request(struct rbd_obj_request *obj_request)
1804{
Alex Elder6365d332013-02-11 12:33:24 -06001805 struct rbd_img_request *img_request;
Alex Elder12178572013-02-08 09:55:49 -06001806 unsigned int xferred;
1807 int result;
Alex Elder8b3e1a52013-01-24 16:13:36 -06001808 bool more;
Alex Elder12178572013-02-08 09:55:49 -06001809
Alex Elder6365d332013-02-11 12:33:24 -06001810 rbd_assert(obj_request_img_data_test(obj_request));
1811 img_request = obj_request->img_request;
1812
Alex Elder12178572013-02-08 09:55:49 -06001813 rbd_assert(obj_request->xferred <= (u64)UINT_MAX);
1814 xferred = (unsigned int)obj_request->xferred;
1815 result = obj_request->result;
1816 if (result) {
1817 struct rbd_device *rbd_dev = img_request->rbd_dev;
1818
1819 rbd_warn(rbd_dev, "%s %llx at %llx (%llx)\n",
1820 img_request_write_test(img_request) ? "write" : "read",
1821 obj_request->length, obj_request->img_offset,
1822 obj_request->offset);
1823 rbd_warn(rbd_dev, " result %d xferred %x\n",
1824 result, xferred);
1825 if (!img_request->result)
1826 img_request->result = result;
1827 }
1828
Alex Elderf1a47392013-04-19 15:34:50 -05001829 /* Image object requests don't own their page array */
1830
1831 if (obj_request->type == OBJ_REQUEST_PAGES) {
1832 obj_request->pages = NULL;
1833 obj_request->page_count = 0;
1834 }
1835
Alex Elder8b3e1a52013-01-24 16:13:36 -06001836 if (img_request_child_test(img_request)) {
1837 rbd_assert(img_request->obj_request != NULL);
1838 more = obj_request->which < img_request->obj_request_count - 1;
1839 } else {
1840 rbd_assert(img_request->rq != NULL);
1841 more = blk_end_request(img_request->rq, result, xferred);
1842 }
1843
1844 return more;
Alex Elder12178572013-02-08 09:55:49 -06001845}
1846
Alex Elder21692382013-04-05 01:27:12 -05001847static void rbd_img_obj_callback(struct rbd_obj_request *obj_request)
1848{
1849 struct rbd_img_request *img_request;
1850 u32 which = obj_request->which;
1851 bool more = true;
1852
Alex Elder6365d332013-02-11 12:33:24 -06001853 rbd_assert(obj_request_img_data_test(obj_request));
Alex Elder21692382013-04-05 01:27:12 -05001854 img_request = obj_request->img_request;
1855
1856 dout("%s: img %p obj %p\n", __func__, img_request, obj_request);
1857 rbd_assert(img_request != NULL);
Alex Elder21692382013-04-05 01:27:12 -05001858 rbd_assert(img_request->obj_request_count > 0);
1859 rbd_assert(which != BAD_WHICH);
1860 rbd_assert(which < img_request->obj_request_count);
1861 rbd_assert(which >= img_request->next_completion);
1862
1863 spin_lock_irq(&img_request->completion_lock);
1864 if (which != img_request->next_completion)
1865 goto out;
1866
1867 for_each_obj_request_from(img_request, obj_request) {
Alex Elder21692382013-04-05 01:27:12 -05001868 rbd_assert(more);
1869 rbd_assert(which < img_request->obj_request_count);
1870
1871 if (!obj_request_done_test(obj_request))
1872 break;
Alex Elder12178572013-02-08 09:55:49 -06001873 more = rbd_img_obj_end_request(obj_request);
Alex Elder21692382013-04-05 01:27:12 -05001874 which++;
1875 }
1876
1877 rbd_assert(more ^ (which == img_request->obj_request_count));
1878 img_request->next_completion = which;
1879out:
1880 spin_unlock_irq(&img_request->completion_lock);
1881
1882 if (!more)
1883 rbd_img_request_complete(img_request);
1884}
1885
Alex Elderf1a47392013-04-19 15:34:50 -05001886/*
1887 * Split up an image request into one or more object requests, each
1888 * to a different object. The "type" parameter indicates whether
1889 * "data_desc" is the pointer to the head of a list of bio
1890 * structures, or the base of a page array. In either case this
1891 * function assumes data_desc describes memory sufficient to hold
1892 * all data described by the image request.
1893 */
1894static int rbd_img_request_fill(struct rbd_img_request *img_request,
1895 enum obj_request_type type,
1896 void *data_desc)
Alex Elderbf0d5f502012-11-22 00:00:08 -06001897{
1898 struct rbd_device *rbd_dev = img_request->rbd_dev;
1899 struct rbd_obj_request *obj_request = NULL;
1900 struct rbd_obj_request *next_obj_request;
Alex Elder0c425242013-02-08 09:55:49 -06001901 bool write_request = img_request_write_test(img_request);
Alex Elderf1a47392013-04-19 15:34:50 -05001902 struct bio *bio_list;
1903 unsigned int bio_offset = 0;
1904 struct page **pages;
Alex Elder7da22d22013-01-24 16:13:36 -06001905 u64 img_offset;
Alex Elderbf0d5f502012-11-22 00:00:08 -06001906 u64 resid;
1907 u16 opcode;
1908
Alex Elderf1a47392013-04-19 15:34:50 -05001909 dout("%s: img %p type %d data_desc %p\n", __func__, img_request,
1910 (int)type, data_desc);
Alex Elder37206ee2013-02-20 17:32:08 -06001911
Alex Elder430c28c2013-04-03 21:32:51 -05001912 opcode = write_request ? CEPH_OSD_OP_WRITE : CEPH_OSD_OP_READ;
Alex Elder7da22d22013-01-24 16:13:36 -06001913 img_offset = img_request->offset;
Alex Elderbf0d5f502012-11-22 00:00:08 -06001914 resid = img_request->length;
Alex Elder4dda41d2013-02-20 21:59:33 -06001915 rbd_assert(resid > 0);
Alex Elderf1a47392013-04-19 15:34:50 -05001916
1917 if (type == OBJ_REQUEST_BIO) {
1918 bio_list = data_desc;
1919 rbd_assert(img_offset == bio_list->bi_sector << SECTOR_SHIFT);
1920 } else {
1921 rbd_assert(type == OBJ_REQUEST_PAGES);
1922 pages = data_desc;
1923 }
1924
Alex Elderbf0d5f502012-11-22 00:00:08 -06001925 while (resid) {
Alex Elder2fa12322013-04-05 01:27:12 -05001926 struct ceph_osd_request *osd_req;
Alex Elderbf0d5f502012-11-22 00:00:08 -06001927 const char *object_name;
Alex Elderbf0d5f502012-11-22 00:00:08 -06001928 u64 offset;
1929 u64 length;
1930
Alex Elder7da22d22013-01-24 16:13:36 -06001931 object_name = rbd_segment_name(rbd_dev, img_offset);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001932 if (!object_name)
1933 goto out_unwind;
Alex Elder7da22d22013-01-24 16:13:36 -06001934 offset = rbd_segment_offset(rbd_dev, img_offset);
1935 length = rbd_segment_length(rbd_dev, img_offset, resid);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001936 obj_request = rbd_obj_request_create(object_name,
Alex Elderf1a47392013-04-19 15:34:50 -05001937 offset, length, type);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001938 kfree(object_name); /* object request has its own copy */
1939 if (!obj_request)
1940 goto out_unwind;
1941
Alex Elderf1a47392013-04-19 15:34:50 -05001942 if (type == OBJ_REQUEST_BIO) {
1943 unsigned int clone_size;
1944
1945 rbd_assert(length <= (u64)UINT_MAX);
1946 clone_size = (unsigned int)length;
1947 obj_request->bio_list =
1948 bio_chain_clone_range(&bio_list,
1949 &bio_offset,
1950 clone_size,
1951 GFP_ATOMIC);
1952 if (!obj_request->bio_list)
1953 goto out_partial;
1954 } else {
1955 unsigned int page_count;
1956
1957 obj_request->pages = pages;
1958 page_count = (u32)calc_pages_for(offset, length);
1959 obj_request->page_count = page_count;
1960 if ((offset + length) & ~PAGE_MASK)
1961 page_count--; /* more on last page */
1962 pages += page_count;
1963 }
Alex Elderbf0d5f502012-11-22 00:00:08 -06001964
Alex Elder2fa12322013-04-05 01:27:12 -05001965 osd_req = rbd_osd_req_create(rbd_dev, write_request,
1966 obj_request);
1967 if (!osd_req)
Alex Elderbf0d5f502012-11-22 00:00:08 -06001968 goto out_partial;
Alex Elder2fa12322013-04-05 01:27:12 -05001969 obj_request->osd_req = osd_req;
Alex Elder21692382013-04-05 01:27:12 -05001970 obj_request->callback = rbd_img_obj_callback;
Alex Elder430c28c2013-04-03 21:32:51 -05001971
Alex Elder2fa12322013-04-05 01:27:12 -05001972 osd_req_op_extent_init(osd_req, 0, opcode, offset, length,
1973 0, 0);
Alex Elderf1a47392013-04-19 15:34:50 -05001974 if (type == OBJ_REQUEST_BIO)
1975 osd_req_op_extent_osd_data_bio(osd_req, 0,
1976 obj_request->bio_list, length);
1977 else
1978 osd_req_op_extent_osd_data_pages(osd_req, 0,
1979 obj_request->pages, length,
1980 offset & ~PAGE_MASK, false, false);
Alex Elder9d4df012013-04-19 15:34:50 -05001981
1982 if (write_request)
1983 rbd_osd_req_format_write(obj_request);
1984 else
1985 rbd_osd_req_format_read(obj_request);
Alex Elder430c28c2013-04-03 21:32:51 -05001986
Alex Elder7da22d22013-01-24 16:13:36 -06001987 obj_request->img_offset = img_offset;
Alex Elderbf0d5f502012-11-22 00:00:08 -06001988 rbd_img_obj_request_add(img_request, obj_request);
1989
Alex Elder7da22d22013-01-24 16:13:36 -06001990 img_offset += length;
Alex Elderbf0d5f502012-11-22 00:00:08 -06001991 resid -= length;
1992 }
1993
1994 return 0;
1995
1996out_partial:
1997 rbd_obj_request_put(obj_request);
1998out_unwind:
1999 for_each_obj_request_safe(img_request, obj_request, next_obj_request)
2000 rbd_obj_request_put(obj_request);
2001
2002 return -ENOMEM;
2003}
2004
Alex Elder3d7efd12013-04-19 15:34:50 -05002005static void
Alex Elder0eefd472013-04-19 15:34:50 -05002006rbd_img_obj_copyup_callback(struct rbd_obj_request *obj_request)
2007{
2008 struct rbd_img_request *img_request;
2009 struct rbd_device *rbd_dev;
2010 u64 length;
2011 u32 page_count;
2012
2013 rbd_assert(obj_request->type == OBJ_REQUEST_BIO);
2014 rbd_assert(obj_request_img_data_test(obj_request));
2015 img_request = obj_request->img_request;
2016 rbd_assert(img_request);
2017
2018 rbd_dev = img_request->rbd_dev;
2019 rbd_assert(rbd_dev);
2020 length = (u64)1 << rbd_dev->header.obj_order;
2021 page_count = (u32)calc_pages_for(0, length);
2022
2023 rbd_assert(obj_request->copyup_pages);
2024 ceph_release_page_vector(obj_request->copyup_pages, page_count);
2025 obj_request->copyup_pages = NULL;
2026
2027 /*
2028 * We want the transfer count to reflect the size of the
2029 * original write request. There is no such thing as a
2030 * successful short write, so if the request was successful
2031 * we can just set it to the originally-requested length.
2032 */
2033 if (!obj_request->result)
2034 obj_request->xferred = obj_request->length;
2035
2036 /* Finish up with the normal image object callback */
2037
2038 rbd_img_obj_callback(obj_request);
2039}
2040
2041static void
Alex Elder3d7efd12013-04-19 15:34:50 -05002042rbd_img_obj_parent_read_full_callback(struct rbd_img_request *img_request)
2043{
2044 struct rbd_obj_request *orig_request;
Alex Elder0eefd472013-04-19 15:34:50 -05002045 struct ceph_osd_request *osd_req;
2046 struct ceph_osd_client *osdc;
2047 struct rbd_device *rbd_dev;
Alex Elder3d7efd12013-04-19 15:34:50 -05002048 struct page **pages;
Alex Elder3d7efd12013-04-19 15:34:50 -05002049 int result;
2050 u64 obj_size;
2051 u64 xferred;
2052
2053 rbd_assert(img_request_child_test(img_request));
2054
2055 /* First get what we need from the image request */
2056
2057 pages = img_request->copyup_pages;
2058 rbd_assert(pages != NULL);
2059 img_request->copyup_pages = NULL;
2060
2061 orig_request = img_request->obj_request;
2062 rbd_assert(orig_request != NULL);
Alex Elder0eefd472013-04-19 15:34:50 -05002063 rbd_assert(orig_request->type == OBJ_REQUEST_BIO);
Alex Elder3d7efd12013-04-19 15:34:50 -05002064 result = img_request->result;
2065 obj_size = img_request->length;
2066 xferred = img_request->xferred;
2067
Alex Elder0eefd472013-04-19 15:34:50 -05002068 rbd_dev = img_request->rbd_dev;
2069 rbd_assert(rbd_dev);
2070 rbd_assert(obj_size == (u64)1 << rbd_dev->header.obj_order);
2071
Alex Elder3d7efd12013-04-19 15:34:50 -05002072 rbd_img_request_put(img_request);
2073
Alex Elder0eefd472013-04-19 15:34:50 -05002074 if (result)
2075 goto out_err;
Alex Elder3d7efd12013-04-19 15:34:50 -05002076
Alex Elder0eefd472013-04-19 15:34:50 -05002077 /* Allocate the new copyup osd request for the original request */
Alex Elder3d7efd12013-04-19 15:34:50 -05002078
Alex Elder0eefd472013-04-19 15:34:50 -05002079 result = -ENOMEM;
2080 rbd_assert(!orig_request->osd_req);
2081 osd_req = rbd_osd_req_create_copyup(orig_request);
2082 if (!osd_req)
2083 goto out_err;
2084 orig_request->osd_req = osd_req;
2085 orig_request->copyup_pages = pages;
Alex Elder3d7efd12013-04-19 15:34:50 -05002086
Alex Elder0eefd472013-04-19 15:34:50 -05002087 /* Initialize the copyup op */
2088
2089 osd_req_op_cls_init(osd_req, 0, CEPH_OSD_OP_CALL, "rbd", "copyup");
2090 osd_req_op_cls_request_data_pages(osd_req, 0, pages, obj_size, 0,
2091 false, false);
2092
2093 /* Then the original write request op */
2094
2095 osd_req_op_extent_init(osd_req, 1, CEPH_OSD_OP_WRITE,
2096 orig_request->offset,
2097 orig_request->length, 0, 0);
2098 osd_req_op_extent_osd_data_bio(osd_req, 1, orig_request->bio_list,
2099 orig_request->length);
2100
2101 rbd_osd_req_format_write(orig_request);
2102
2103 /* All set, send it off. */
2104
2105 orig_request->callback = rbd_img_obj_copyup_callback;
2106 osdc = &rbd_dev->rbd_client->client->osdc;
2107 result = rbd_obj_request_submit(osdc, orig_request);
2108 if (!result)
2109 return;
2110out_err:
2111 /* Record the error code and complete the request */
2112
2113 orig_request->result = result;
2114 orig_request->xferred = 0;
2115 obj_request_done_set(orig_request);
2116 rbd_obj_request_complete(orig_request);
Alex Elder3d7efd12013-04-19 15:34:50 -05002117}
2118
2119/*
2120 * Read from the parent image the range of data that covers the
2121 * entire target of the given object request. This is used for
2122 * satisfying a layered image write request when the target of an
2123 * object request from the image request does not exist.
2124 *
2125 * A page array big enough to hold the returned data is allocated
2126 * and supplied to rbd_img_request_fill() as the "data descriptor."
2127 * When the read completes, this page array will be transferred to
2128 * the original object request for the copyup operation.
2129 *
2130 * If an error occurs, record it as the result of the original
2131 * object request and mark it done so it gets completed.
2132 */
2133static int rbd_img_obj_parent_read_full(struct rbd_obj_request *obj_request)
2134{
2135 struct rbd_img_request *img_request = NULL;
2136 struct rbd_img_request *parent_request = NULL;
2137 struct rbd_device *rbd_dev;
2138 u64 img_offset;
2139 u64 length;
2140 struct page **pages = NULL;
2141 u32 page_count;
2142 int result;
2143
2144 rbd_assert(obj_request_img_data_test(obj_request));
2145 rbd_assert(obj_request->type == OBJ_REQUEST_BIO);
2146
2147 img_request = obj_request->img_request;
2148 rbd_assert(img_request != NULL);
2149 rbd_dev = img_request->rbd_dev;
2150 rbd_assert(rbd_dev->parent != NULL);
2151
2152 /*
Alex Elder0eefd472013-04-19 15:34:50 -05002153 * First things first. The original osd request is of no
2154 * use to use any more, we'll need a new one that can hold
2155 * the two ops in a copyup request. We'll get that later,
2156 * but for now we can release the old one.
2157 */
2158 rbd_osd_req_destroy(obj_request->osd_req);
2159 obj_request->osd_req = NULL;
2160
2161 /*
Alex Elder3d7efd12013-04-19 15:34:50 -05002162 * Determine the byte range covered by the object in the
2163 * child image to which the original request was to be sent.
2164 */
2165 img_offset = obj_request->img_offset - obj_request->offset;
2166 length = (u64)1 << rbd_dev->header.obj_order;
2167
2168 /*
2169 * Allocate a page array big enough to receive the data read
2170 * from the parent.
2171 */
2172 page_count = (u32)calc_pages_for(0, length);
2173 pages = ceph_alloc_page_vector(page_count, GFP_KERNEL);
2174 if (IS_ERR(pages)) {
2175 result = PTR_ERR(pages);
2176 pages = NULL;
2177 goto out_err;
2178 }
2179
2180 result = -ENOMEM;
2181 parent_request = rbd_img_request_create(rbd_dev->parent,
2182 img_offset, length,
2183 false, true);
2184 if (!parent_request)
2185 goto out_err;
2186 rbd_obj_request_get(obj_request);
2187 parent_request->obj_request = obj_request;
2188
2189 result = rbd_img_request_fill(parent_request, OBJ_REQUEST_PAGES, pages);
2190 if (result)
2191 goto out_err;
2192 parent_request->copyup_pages = pages;
2193
2194 parent_request->callback = rbd_img_obj_parent_read_full_callback;
2195 result = rbd_img_request_submit(parent_request);
2196 if (!result)
2197 return 0;
2198
2199 parent_request->copyup_pages = NULL;
2200 parent_request->obj_request = NULL;
2201 rbd_obj_request_put(obj_request);
2202out_err:
2203 if (pages)
2204 ceph_release_page_vector(pages, page_count);
2205 if (parent_request)
2206 rbd_img_request_put(parent_request);
2207 obj_request->result = result;
2208 obj_request->xferred = 0;
2209 obj_request_done_set(obj_request);
2210
2211 return result;
2212}
2213
Alex Elderc5b5ef62013-02-11 12:33:24 -06002214static void rbd_img_obj_exists_callback(struct rbd_obj_request *obj_request)
2215{
Alex Elderc5b5ef62013-02-11 12:33:24 -06002216 struct rbd_obj_request *orig_request;
2217 int result;
2218
2219 rbd_assert(!obj_request_img_data_test(obj_request));
2220
2221 /*
2222 * All we need from the object request is the original
2223 * request and the result of the STAT op. Grab those, then
2224 * we're done with the request.
2225 */
2226 orig_request = obj_request->obj_request;
2227 obj_request->obj_request = NULL;
2228 rbd_assert(orig_request);
2229 rbd_assert(orig_request->img_request);
2230
2231 result = obj_request->result;
2232 obj_request->result = 0;
2233
2234 dout("%s: obj %p for obj %p result %d %llu/%llu\n", __func__,
2235 obj_request, orig_request, result,
2236 obj_request->xferred, obj_request->length);
2237 rbd_obj_request_put(obj_request);
2238
2239 rbd_assert(orig_request);
2240 rbd_assert(orig_request->img_request);
Alex Elderc5b5ef62013-02-11 12:33:24 -06002241
2242 /*
2243 * Our only purpose here is to determine whether the object
2244 * exists, and we don't want to treat the non-existence as
2245 * an error. If something else comes back, transfer the
2246 * error to the original request and complete it now.
2247 */
2248 if (!result) {
2249 obj_request_existence_set(orig_request, true);
2250 } else if (result == -ENOENT) {
2251 obj_request_existence_set(orig_request, false);
2252 } else if (result) {
2253 orig_request->result = result;
Alex Elder3d7efd12013-04-19 15:34:50 -05002254 goto out;
Alex Elderc5b5ef62013-02-11 12:33:24 -06002255 }
2256
2257 /*
2258 * Resubmit the original request now that we have recorded
2259 * whether the target object exists.
2260 */
Alex Elderb454e362013-04-19 15:34:50 -05002261 orig_request->result = rbd_img_obj_request_submit(orig_request);
Alex Elder3d7efd12013-04-19 15:34:50 -05002262out:
Alex Elderc5b5ef62013-02-11 12:33:24 -06002263 if (orig_request->result)
2264 rbd_obj_request_complete(orig_request);
2265 rbd_obj_request_put(orig_request);
2266}
2267
2268static int rbd_img_obj_exists_submit(struct rbd_obj_request *obj_request)
2269{
2270 struct rbd_obj_request *stat_request;
2271 struct rbd_device *rbd_dev;
2272 struct ceph_osd_client *osdc;
2273 struct page **pages = NULL;
2274 u32 page_count;
2275 size_t size;
2276 int ret;
2277
2278 /*
2279 * The response data for a STAT call consists of:
2280 * le64 length;
2281 * struct {
2282 * le32 tv_sec;
2283 * le32 tv_nsec;
2284 * } mtime;
2285 */
2286 size = sizeof (__le64) + sizeof (__le32) + sizeof (__le32);
2287 page_count = (u32)calc_pages_for(0, size);
2288 pages = ceph_alloc_page_vector(page_count, GFP_KERNEL);
2289 if (IS_ERR(pages))
2290 return PTR_ERR(pages);
2291
2292 ret = -ENOMEM;
2293 stat_request = rbd_obj_request_create(obj_request->object_name, 0, 0,
2294 OBJ_REQUEST_PAGES);
2295 if (!stat_request)
2296 goto out;
2297
2298 rbd_obj_request_get(obj_request);
2299 stat_request->obj_request = obj_request;
2300 stat_request->pages = pages;
2301 stat_request->page_count = page_count;
2302
2303 rbd_assert(obj_request->img_request);
2304 rbd_dev = obj_request->img_request->rbd_dev;
2305 stat_request->osd_req = rbd_osd_req_create(rbd_dev, false,
2306 stat_request);
2307 if (!stat_request->osd_req)
2308 goto out;
2309 stat_request->callback = rbd_img_obj_exists_callback;
2310
2311 osd_req_op_init(stat_request->osd_req, 0, CEPH_OSD_OP_STAT);
2312 osd_req_op_raw_data_in_pages(stat_request->osd_req, 0, pages, size, 0,
2313 false, false);
Alex Elder9d4df012013-04-19 15:34:50 -05002314 rbd_osd_req_format_read(stat_request);
Alex Elderc5b5ef62013-02-11 12:33:24 -06002315
2316 osdc = &rbd_dev->rbd_client->client->osdc;
2317 ret = rbd_obj_request_submit(osdc, stat_request);
2318out:
2319 if (ret)
2320 rbd_obj_request_put(obj_request);
2321
2322 return ret;
2323}
2324
Alex Elderb454e362013-04-19 15:34:50 -05002325static int rbd_img_obj_request_submit(struct rbd_obj_request *obj_request)
2326{
2327 struct rbd_img_request *img_request;
Alex Elder3d7efd12013-04-19 15:34:50 -05002328 bool known;
Alex Elderb454e362013-04-19 15:34:50 -05002329
2330 rbd_assert(obj_request_img_data_test(obj_request));
2331
2332 img_request = obj_request->img_request;
2333 rbd_assert(img_request);
2334
Alex Elderb454e362013-04-19 15:34:50 -05002335 /*
2336 * Only layered writes need special handling. If it's not a
2337 * layered write, or it is a layered write but we know the
2338 * target object exists, it's no different from any other
2339 * object request.
2340 */
2341 if (!img_request_write_test(img_request) ||
2342 !img_request_layered_test(img_request) ||
Alex Elder3d7efd12013-04-19 15:34:50 -05002343 ((known = obj_request_known_test(obj_request)) &&
2344 obj_request_exists_test(obj_request))) {
Alex Elderb454e362013-04-19 15:34:50 -05002345
2346 struct rbd_device *rbd_dev;
2347 struct ceph_osd_client *osdc;
2348
2349 rbd_dev = obj_request->img_request->rbd_dev;
2350 osdc = &rbd_dev->rbd_client->client->osdc;
2351
2352 return rbd_obj_request_submit(osdc, obj_request);
2353 }
2354
2355 /*
Alex Elder3d7efd12013-04-19 15:34:50 -05002356 * It's a layered write. The target object might exist but
2357 * we may not know that yet. If we know it doesn't exist,
2358 * start by reading the data for the full target object from
2359 * the parent so we can use it for a copyup to the target.
Alex Elderb454e362013-04-19 15:34:50 -05002360 */
Alex Elder3d7efd12013-04-19 15:34:50 -05002361 if (known)
2362 return rbd_img_obj_parent_read_full(obj_request);
2363
2364 /* We don't know whether the target exists. Go find out. */
Alex Elderb454e362013-04-19 15:34:50 -05002365
2366 return rbd_img_obj_exists_submit(obj_request);
2367}
2368
Alex Elderbf0d5f502012-11-22 00:00:08 -06002369static int rbd_img_request_submit(struct rbd_img_request *img_request)
2370{
Alex Elderbf0d5f502012-11-22 00:00:08 -06002371 struct rbd_obj_request *obj_request;
Alex Elder46faeed2013-04-10 17:47:46 -05002372 struct rbd_obj_request *next_obj_request;
Alex Elderbf0d5f502012-11-22 00:00:08 -06002373
Alex Elder37206ee2013-02-20 17:32:08 -06002374 dout("%s: img %p\n", __func__, img_request);
Alex Elder46faeed2013-04-10 17:47:46 -05002375 for_each_obj_request_safe(img_request, obj_request, next_obj_request) {
Alex Elderbf0d5f502012-11-22 00:00:08 -06002376 int ret;
2377
Alex Elderb454e362013-04-19 15:34:50 -05002378 ret = rbd_img_obj_request_submit(obj_request);
Alex Elderbf0d5f502012-11-22 00:00:08 -06002379 if (ret)
2380 return ret;
Alex Elderbf0d5f502012-11-22 00:00:08 -06002381 }
2382
2383 return 0;
2384}
2385
/*
 * Completion callback for a child image request issued against the
 * parent image by rbd_img_parent_read().  Copies the child's outcome
 * into the originating object request and completes it.
 */
static void rbd_img_parent_read_callback(struct rbd_img_request *img_request)
{
	struct rbd_obj_request *obj_request;

	/* Only child (parent-read) image requests arrive here */
	rbd_assert(img_request_child_test(img_request));

	/* The object request that spawned us rides in ->obj_request */
	obj_request = img_request->obj_request;
	rbd_assert(obj_request != NULL);
	obj_request->result = img_request->result;
	obj_request->xferred = img_request->xferred;

	/*
	 * Let the read fixup helper post-process the data (see
	 * rbd_img_obj_request_read_callback()), then complete the
	 * original object request.
	 */
	rbd_img_obj_request_read_callback(obj_request);
	rbd_obj_request_complete(obj_request);
}
2400
/*
 * Satisfy an object read from the parent image.  Called when a read
 * from a layered image's target object came back -ENOENT: the data
 * (if any) lives in the parent, so issue a child image request for
 * the same extent against the parent.
 *
 * Completion is reported via rbd_img_parent_read_callback(); on
 * setup failure the object request is completed here with the error.
 */
static void rbd_img_parent_read(struct rbd_obj_request *obj_request)
{
	struct rbd_device *rbd_dev;
	struct rbd_img_request *img_request;
	int result;

	rbd_assert(obj_request_img_data_test(obj_request));
	rbd_assert(obj_request->img_request != NULL);
	/* Only a missing target object (-ENOENT) routes a read here */
	rbd_assert(obj_request->result == (s32) -ENOENT);
	rbd_assert(obj_request->type == OBJ_REQUEST_BIO);

	rbd_dev = obj_request->img_request->rbd_dev;
	rbd_assert(rbd_dev->parent != NULL);
	/* rbd_read_finish(obj_request, obj_request->length); */
	/* Build a read-only child image request against the parent */
	img_request = rbd_img_request_create(rbd_dev->parent,
						obj_request->img_offset,
						obj_request->length,
						false, true);
	result = -ENOMEM;
	if (!img_request)
		goto out_err;

	/* The child image request pins the originating object request */
	rbd_obj_request_get(obj_request);
	img_request->obj_request = obj_request;

	/* Reuse the original bio chain as the child's data buffer */
	result = rbd_img_request_fill(img_request, OBJ_REQUEST_BIO,
					obj_request->bio_list);
	if (result)
		goto out_err;

	img_request->callback = rbd_img_parent_read_callback;
	result = rbd_img_request_submit(img_request);
	if (result)
		goto out_err;

	return;
out_err:
	/*
	 * NOTE(review): this relies on rbd_img_request_put() dropping
	 * the reference taken on obj_request via ->obj_request when
	 * one was taken above -- confirm in its definition.
	 */
	if (img_request)
		rbd_img_request_put(img_request);
	obj_request->result = result;
	obj_request->xferred = 0;
	obj_request_done_set(obj_request);
}
2444
/*
 * Acknowledge a watch notification on the header object.  The ack
 * is fire-and-forget: the request's completion callback is simply
 * rbd_obj_request_put, so the request releases itself and nothing
 * waits on it.
 *
 * Returns 0 if the ack was submitted, or a negative errno.
 */
static int rbd_obj_notify_ack(struct rbd_device *rbd_dev,
				u64 ver, u64 notify_id)
{
	struct rbd_obj_request *obj_request;
	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
	int ret;

	obj_request = rbd_obj_request_create(rbd_dev->header_name, 0, 0,
							OBJ_REQUEST_NODATA);
	if (!obj_request)
		return -ENOMEM;

	ret = -ENOMEM;
	obj_request->osd_req = rbd_osd_req_create(rbd_dev, false, obj_request);
	if (!obj_request->osd_req)
		goto out;
	/* Completion just drops the request's own reference */
	obj_request->callback = rbd_obj_request_put;

	osd_req_op_watch_init(obj_request->osd_req, 0, CEPH_OSD_OP_NOTIFY_ACK,
					notify_id, ver, 0);
	rbd_osd_req_format_read(obj_request);

	ret = rbd_obj_request_submit(osdc, obj_request);
out:
	/* On error the callback never runs; release the request here */
	if (ret)
		rbd_obj_request_put(obj_request);

	return ret;
}
2474
2475static void rbd_watch_cb(u64 ver, u64 notify_id, u8 opcode, void *data)
2476{
2477 struct rbd_device *rbd_dev = (struct rbd_device *)data;
2478 u64 hver;
2479 int rc;
2480
2481 if (!rbd_dev)
2482 return;
2483
Alex Elder37206ee2013-02-20 17:32:08 -06002484 dout("%s: \"%s\" notify_id %llu opcode %u\n", __func__,
Alex Elderb8d70032012-11-30 17:53:04 -06002485 rbd_dev->header_name, (unsigned long long) notify_id,
2486 (unsigned int) opcode);
2487 rc = rbd_dev_refresh(rbd_dev, &hver);
2488 if (rc)
2489 rbd_warn(rbd_dev, "got notification but failed to "
2490 " update snaps: %d\n", rc);
2491
Alex Eldercf81b602013-01-17 12:18:46 -06002492 rbd_obj_notify_ack(rbd_dev, hver, notify_id);
Alex Elderb8d70032012-11-30 17:53:04 -06002493}
2494
/*
 * Request sync osd watch/unwatch.  The value of "start" determines
 * whether a watch request is being initiated (non-zero) or torn
 * down (zero).  Waits until the osd has acknowledged the request.
 *
 * Returns 0 on success or a negative errno.
 */
static int rbd_dev_header_watch_sync(struct rbd_device *rbd_dev, int start)
{
	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
	struct rbd_obj_request *obj_request;
	int ret;

	/*
	 * Invariant: a watch event and a lingering watch request
	 * exist exactly when a watch is active, so starting requires
	 * both absent and tearing down requires both present.
	 */
	rbd_assert(start ^ !!rbd_dev->watch_event);
	rbd_assert(start ^ !!rbd_dev->watch_request);

	if (start) {
		/* Register the notification callback before watching */
		ret = ceph_osdc_create_event(osdc, rbd_watch_cb, rbd_dev,
						&rbd_dev->watch_event);
		if (ret < 0)
			return ret;
		rbd_assert(rbd_dev->watch_event != NULL);
	}

	ret = -ENOMEM;
	obj_request = rbd_obj_request_create(rbd_dev->header_name, 0, 0,
							OBJ_REQUEST_NODATA);
	if (!obj_request)
		goto out_cancel;

	obj_request->osd_req = rbd_osd_req_create(rbd_dev, true, obj_request);
	if (!obj_request->osd_req)
		goto out_cancel;

	/*
	 * A new watch must linger at the osd so notifications keep
	 * arriving; teardown instead unregisters the previously
	 * lingering request.
	 */
	if (start)
		ceph_osdc_set_request_linger(osdc, obj_request->osd_req);
	else
		ceph_osdc_unregister_linger_request(osdc,
					rbd_dev->watch_request->osd_req);

	osd_req_op_watch_init(obj_request->osd_req, 0, CEPH_OSD_OP_WATCH,
				rbd_dev->watch_event->cookie,
				rbd_dev->header.obj_version, start);
	rbd_osd_req_format_write(obj_request);

	/* Submit, then wait for completion and check the osd result */
	ret = rbd_obj_request_submit(osdc, obj_request);
	if (ret)
		goto out_cancel;
	ret = rbd_obj_request_wait(obj_request);
	if (ret)
		goto out_cancel;
	ret = obj_request->result;
	if (ret)
		goto out_cancel;

	/*
	 * A watch request is set to linger, so the underlying osd
	 * request won't go away until we unregister it.  We retain
	 * a pointer to the object request during that time (in
	 * rbd_dev->watch_request), so we'll keep a reference to
	 * it.  We'll drop that reference (below) after we've
	 * unregistered it.
	 */
	if (start) {
		rbd_dev->watch_request = obj_request;

		return 0;
	}

	/* We have successfully torn down the watch request */

	rbd_obj_request_put(rbd_dev->watch_request);
	rbd_dev->watch_request = NULL;
out_cancel:
	/* Cancel the event if we're tearing down, or on error */
	ceph_osdc_cancel_event(rbd_dev->watch_event);
	rbd_dev->watch_event = NULL;
	if (obj_request)
		rbd_obj_request_put(obj_request);

	return ret;
}
2574
/*
 * Synchronous osd object method call.  Invokes the rados class
 * method class_name.method_name on the named object, sending
 * @outbound (if any) as the request payload and copying up to
 * @inbound_size reply bytes into @inbound.  If @version is non-NULL
 * the object version is returned through it.
 *
 * Returns 0 on success or a negative errno.
 */
static int rbd_obj_method_sync(struct rbd_device *rbd_dev,
			     const char *object_name,
			     const char *class_name,
			     const char *method_name,
			     const char *outbound,
			     size_t outbound_size,
			     char *inbound,
			     size_t inbound_size,
			     u64 *version)
{
	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
	struct rbd_obj_request *obj_request;
	struct page **pages;
	u32 page_count;
	int ret;

	/*
	 * Method calls are ultimately read operations.  The result
	 * should be placed into the inbound buffer provided.  They
	 * also supply outbound data--parameters for the object
	 * method.  Currently if this is present it will be a
	 * snapshot id.
	 */
	page_count = (u32) calc_pages_for(0, inbound_size);
	pages = ceph_alloc_page_vector(page_count, GFP_KERNEL);
	if (IS_ERR(pages))
		return PTR_ERR(pages);

	ret = -ENOMEM;
	obj_request = rbd_obj_request_create(object_name, 0, inbound_size,
							OBJ_REQUEST_PAGES);
	if (!obj_request)
		goto out;

	/* The request now owns the pages (released with the request) */
	obj_request->pages = pages;
	obj_request->page_count = page_count;

	obj_request->osd_req = rbd_osd_req_create(rbd_dev, false, obj_request);
	if (!obj_request->osd_req)
		goto out;

	osd_req_op_cls_init(obj_request->osd_req, 0, CEPH_OSD_OP_CALL,
					class_name, method_name);
	if (outbound_size) {
		struct ceph_pagelist *pagelist;

		pagelist = kmalloc(sizeof (*pagelist), GFP_NOFS);
		if (!pagelist)
			goto out;	/* ret is still -ENOMEM here */

		ceph_pagelist_init(pagelist);
		/* NOTE(review): append return value is ignored -- confirm
		 * it cannot fail for this usage. */
		ceph_pagelist_append(pagelist, outbound, outbound_size);
		osd_req_op_cls_request_data_pagelist(obj_request->osd_req, 0,
						pagelist);
	}
	osd_req_op_cls_response_data_pages(obj_request->osd_req, 0,
					obj_request->pages, inbound_size,
					0, false, false);
	rbd_osd_req_format_read(obj_request);

	/* Submit and wait for the reply */
	ret = rbd_obj_request_submit(osdc, obj_request);
	if (ret)
		goto out;
	ret = rbd_obj_request_wait(obj_request);
	if (ret)
		goto out;

	ret = obj_request->result;
	if (ret < 0)
		goto out;
	ret = 0;
	/* Copy however many bytes the method actually returned */
	ceph_copy_from_page_vector(pages, inbound, 0, obj_request->xferred);
	if (version)
		*version = obj_request->version;
out:
	/* The request owns the pages once created; otherwise free them */
	if (obj_request)
		rbd_obj_request_put(obj_request);
	else
		ceph_release_page_vector(pages, page_count);

	return ret;
}
2660
/*
 * Block-layer request function.  Drains the request queue, turning
 * each filesystem request into an rbd image request.  The queue
 * lock is held on entry; it is dropped while a request is set up
 * and submitted, and re-acquired before completing it or fetching
 * the next one (hence the __releases/__acquires annotations).
 */
static void rbd_request_fn(struct request_queue *q)
		__releases(q->queue_lock) __acquires(q->queue_lock)
{
	struct rbd_device *rbd_dev = q->queuedata;
	bool read_only = rbd_dev->mapping.read_only;
	struct request *rq;
	int result;

	while ((rq = blk_fetch_request(q))) {
		bool write_request = rq_data_dir(rq) == WRITE;
		struct rbd_img_request *img_request;
		u64 offset;
		u64 length;

		/* Ignore any non-FS requests that filter through. */

		if (rq->cmd_type != REQ_TYPE_FS) {
			dout("%s: non-fs request type %d\n", __func__,
				(int) rq->cmd_type);
			__blk_end_request_all(rq, 0);
			continue;
		}

		/* Ignore/skip any zero-length requests */

		offset = (u64) blk_rq_pos(rq) << SECTOR_SHIFT;
		length = (u64) blk_rq_bytes(rq);

		if (!length) {
			dout("%s: zero-length request\n", __func__);
			__blk_end_request_all(rq, 0);
			continue;
		}

		/* Drop the queue lock for the (possibly slow) setup work */
		spin_unlock_irq(q->queue_lock);

		/* Disallow writes to a read-only device */

		if (write_request) {
			result = -EROFS;
			if (read_only)
				goto end_request;
			/* Writes are only legal against the base image */
			rbd_assert(rbd_dev->spec->snap_id == CEPH_NOSNAP);
		}

		/*
		 * Quit early if the mapped snapshot no longer
		 * exists.  It's still possible the snapshot will
		 * have disappeared by the time our request arrives
		 * at the osd, but there's no sense in sending it if
		 * we already know.
		 */
		if (!test_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags)) {
			dout("request for non-existent snapshot");
			rbd_assert(rbd_dev->spec->snap_id != CEPH_NOSNAP);
			result = -ENXIO;
			goto end_request;
		}

		/* Guard against offset+length wrapping past U64_MAX */
		result = -EINVAL;
		if (WARN_ON(offset && length > U64_MAX - offset + 1))
			goto end_request;	/* Shouldn't happen */

		result = -ENOMEM;
		img_request = rbd_img_request_create(rbd_dev, offset, length,
							write_request, false);
		if (!img_request)
			goto end_request;

		/* Remember the request so completion can end it */
		img_request->rq = rq;

		result = rbd_img_request_fill(img_request, OBJ_REQUEST_BIO,
						rq->bio);
		if (!result)
			result = rbd_img_request_submit(img_request);
		if (result)
			rbd_img_request_put(img_request);
end_request:
		/* Re-take the queue lock before touching the queue again */
		spin_lock_irq(q->queue_lock);
		if (result < 0) {
			rbd_warn(rbd_dev, "%s %llx at %llx result %d\n",
				write_request ? "write" : "read",
				length, offset, result);

			__blk_end_request_all(rq, result);
		}
	}
}
2749
/*
 * A queue merge_bvec callback.  Tells the block layer how many
 * bytes of @bvec may be added to the bio described by @bmd so the
 * bio never spans multiple rbd (osd) objects.  One exception would
 * be with single-page bios, which we handle later at
 * bio_chain_clone_range().
 *
 * Returns the number of bytes of bvec that may be merged (possibly
 * 0, meaning the bio must be submitted as-is).
 */
static int rbd_merge_bvec(struct request_queue *q, struct bvec_merge_data *bmd,
			  struct bio_vec *bvec)
{
	struct rbd_device *rbd_dev = q->queuedata;
	sector_t sector_offset;
	sector_t sectors_per_obj;
	sector_t obj_sector_offset;
	int ret;

	/*
	 * Find how far into its rbd object the partition-relative
	 * bio start sector is to offset relative to the enclosing
	 * device.
	 */
	sector_offset = get_start_sect(bmd->bi_bdev) + bmd->bi_sector;
	sectors_per_obj = 1 << (rbd_dev->header.obj_order - SECTOR_SHIFT);
	obj_sector_offset = sector_offset & (sectors_per_obj - 1);

	/*
	 * Compute the number of bytes from that offset to the end
	 * of the object.  Account for what's already used by the bio.
	 */
	ret = (int) (sectors_per_obj - obj_sector_offset) << SECTOR_SHIFT;
	if (ret > bmd->bi_size)
		ret -= bmd->bi_size;
	else
		ret = 0;

	/*
	 * Don't send back more than was asked for.  And if the bio
	 * was empty, let the whole thing through because:  "Note
	 * that a block device *must* allow a single page to be
	 * added to an empty bio."
	 */
	rbd_assert(bvec->bv_len <= PAGE_SIZE);
	if (ret > (int) bvec->bv_len || !bmd->bi_size)
		ret = (int) bvec->bv_len;

	return ret;
}
2795
2796static void rbd_free_disk(struct rbd_device *rbd_dev)
2797{
2798 struct gendisk *disk = rbd_dev->disk;
2799
2800 if (!disk)
2801 return;
2802
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002803 if (disk->flags & GENHD_FL_UP)
2804 del_gendisk(disk);
2805 if (disk->queue)
2806 blk_cleanup_queue(disk->queue);
2807 put_disk(disk);
2808}
2809
Alex Elder788e2df2013-01-17 12:25:27 -06002810static int rbd_obj_read_sync(struct rbd_device *rbd_dev,
2811 const char *object_name,
2812 u64 offset, u64 length,
2813 char *buf, u64 *version)
2814
2815{
Alex Elder21692382013-04-05 01:27:12 -05002816 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
Alex Elder788e2df2013-01-17 12:25:27 -06002817 struct rbd_obj_request *obj_request;
Alex Elder788e2df2013-01-17 12:25:27 -06002818 struct page **pages = NULL;
2819 u32 page_count;
Alex Elder1ceae7e2013-02-06 13:11:38 -06002820 size_t size;
Alex Elder788e2df2013-01-17 12:25:27 -06002821 int ret;
2822
2823 page_count = (u32) calc_pages_for(offset, length);
2824 pages = ceph_alloc_page_vector(page_count, GFP_KERNEL);
2825 if (IS_ERR(pages))
2826 ret = PTR_ERR(pages);
2827
2828 ret = -ENOMEM;
2829 obj_request = rbd_obj_request_create(object_name, offset, length,
Alex Elder36be9a72013-01-19 00:30:28 -06002830 OBJ_REQUEST_PAGES);
Alex Elder788e2df2013-01-17 12:25:27 -06002831 if (!obj_request)
2832 goto out;
2833
2834 obj_request->pages = pages;
2835 obj_request->page_count = page_count;
2836
Alex Elder430c28c2013-04-03 21:32:51 -05002837 obj_request->osd_req = rbd_osd_req_create(rbd_dev, false, obj_request);
Alex Elder788e2df2013-01-17 12:25:27 -06002838 if (!obj_request->osd_req)
2839 goto out;
2840
Alex Elderc99d2d42013-04-05 01:27:11 -05002841 osd_req_op_extent_init(obj_request->osd_req, 0, CEPH_OSD_OP_READ,
2842 offset, length, 0, 0);
Alex Elder406e2c92013-04-15 14:50:36 -05002843 osd_req_op_extent_osd_data_pages(obj_request->osd_req, 0,
Alex Eldera4ce40a2013-04-05 01:27:12 -05002844 obj_request->pages,
Alex Elder44cd1882013-04-05 01:27:12 -05002845 obj_request->length,
2846 obj_request->offset & ~PAGE_MASK,
2847 false, false);
Alex Elder9d4df012013-04-19 15:34:50 -05002848 rbd_osd_req_format_read(obj_request);
Alex Elder430c28c2013-04-03 21:32:51 -05002849
Alex Elder788e2df2013-01-17 12:25:27 -06002850 ret = rbd_obj_request_submit(osdc, obj_request);
2851 if (ret)
2852 goto out;
2853 ret = rbd_obj_request_wait(obj_request);
2854 if (ret)
2855 goto out;
2856
2857 ret = obj_request->result;
2858 if (ret < 0)
2859 goto out;
Alex Elder1ceae7e2013-02-06 13:11:38 -06002860
2861 rbd_assert(obj_request->xferred <= (u64) SIZE_MAX);
2862 size = (size_t) obj_request->xferred;
Alex Elder903bb322013-02-06 13:11:38 -06002863 ceph_copy_from_page_vector(pages, buf, 0, size);
Alex Elder23ed6e12013-02-06 13:11:38 -06002864 rbd_assert(size <= (size_t) INT_MAX);
2865 ret = (int) size;
Alex Elder788e2df2013-01-17 12:25:27 -06002866 if (version)
2867 *version = obj_request->version;
2868out:
2869 if (obj_request)
2870 rbd_obj_request_put(obj_request);
2871 else
2872 ceph_release_page_vector(pages, page_count);
2873
2874 return ret;
2875}
2876
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002877/*
Alex Elder4156d992012-08-02 11:29:46 -05002878 * Read the complete header for the given rbd device.
2879 *
2880 * Returns a pointer to a dynamically-allocated buffer containing
2881 * the complete and validated header. Caller can pass the address
2882 * of a variable that will be filled in with the version of the
2883 * header object at the time it was read.
2884 *
2885 * Returns a pointer-coded errno if a failure occurs.
2886 */
2887static struct rbd_image_header_ondisk *
2888rbd_dev_v1_header_read(struct rbd_device *rbd_dev, u64 *version)
2889{
2890 struct rbd_image_header_ondisk *ondisk = NULL;
2891 u32 snap_count = 0;
2892 u64 names_size = 0;
2893 u32 want_count;
2894 int ret;
2895
2896 /*
2897 * The complete header will include an array of its 64-bit
2898 * snapshot ids, followed by the names of those snapshots as
2899 * a contiguous block of NUL-terminated strings. Note that
2900 * the number of snapshots could change by the time we read
2901 * it in, in which case we re-read it.
2902 */
2903 do {
2904 size_t size;
2905
2906 kfree(ondisk);
2907
2908 size = sizeof (*ondisk);
2909 size += snap_count * sizeof (struct rbd_image_snap_ondisk);
2910 size += names_size;
2911 ondisk = kmalloc(size, GFP_KERNEL);
2912 if (!ondisk)
2913 return ERR_PTR(-ENOMEM);
2914
Alex Elder788e2df2013-01-17 12:25:27 -06002915 ret = rbd_obj_read_sync(rbd_dev, rbd_dev->header_name,
Alex Elder4156d992012-08-02 11:29:46 -05002916 0, size,
2917 (char *) ondisk, version);
Alex Elder4156d992012-08-02 11:29:46 -05002918 if (ret < 0)
2919 goto out_err;
2920 if (WARN_ON((size_t) ret < size)) {
2921 ret = -ENXIO;
Alex Elder06ecc6c2012-11-01 10:17:15 -05002922 rbd_warn(rbd_dev, "short header read (want %zd got %d)",
2923 size, ret);
Alex Elder4156d992012-08-02 11:29:46 -05002924 goto out_err;
2925 }
2926 if (!rbd_dev_ondisk_valid(ondisk)) {
2927 ret = -ENXIO;
Alex Elder06ecc6c2012-11-01 10:17:15 -05002928 rbd_warn(rbd_dev, "invalid header");
Alex Elder4156d992012-08-02 11:29:46 -05002929 goto out_err;
2930 }
2931
2932 names_size = le64_to_cpu(ondisk->snap_names_len);
2933 want_count = snap_count;
2934 snap_count = le32_to_cpu(ondisk->snap_count);
2935 } while (snap_count != want_count);
2936
2937 return ondisk;
2938
2939out_err:
2940 kfree(ondisk);
2941
2942 return ERR_PTR(ret);
2943}
2944
/*
 * Re-read the image's on-disk header.
 */
2948static int rbd_read_header(struct rbd_device *rbd_dev,
2949 struct rbd_image_header *header)
2950{
Alex Elder4156d992012-08-02 11:29:46 -05002951 struct rbd_image_header_ondisk *ondisk;
2952 u64 ver = 0;
2953 int ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002954
Alex Elder4156d992012-08-02 11:29:46 -05002955 ondisk = rbd_dev_v1_header_read(rbd_dev, &ver);
2956 if (IS_ERR(ondisk))
2957 return PTR_ERR(ondisk);
2958 ret = rbd_header_from_disk(header, ondisk);
2959 if (ret >= 0)
2960 header->obj_version = ver;
2961 kfree(ondisk);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002962
Alex Elder4156d992012-08-02 11:29:46 -05002963 return ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002964}
2965
Alex Elder41f38c22012-10-25 23:34:40 -05002966static void rbd_remove_all_snaps(struct rbd_device *rbd_dev)
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002967{
2968 struct rbd_snap *snap;
Alex Eldera0593292012-07-19 09:09:27 -05002969 struct rbd_snap *next;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002970
Alex Eldera0593292012-07-19 09:09:27 -05002971 list_for_each_entry_safe(snap, next, &rbd_dev->snaps, node)
Alex Elder41f38c22012-10-25 23:34:40 -05002972 rbd_remove_snap_dev(snap);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002973}
2974
Alex Elder94785542012-10-09 13:50:17 -07002975static void rbd_update_mapping_size(struct rbd_device *rbd_dev)
2976{
2977 sector_t size;
2978
Alex Elder0d7dbfc2012-10-25 23:34:41 -05002979 if (rbd_dev->spec->snap_id != CEPH_NOSNAP)
Alex Elder94785542012-10-09 13:50:17 -07002980 return;
2981
2982 size = (sector_t) rbd_dev->header.image_size / SECTOR_SIZE;
2983 dout("setting size to %llu sectors", (unsigned long long) size);
2984 rbd_dev->mapping.size = (u64) size;
2985 set_capacity(rbd_dev->disk, size);
2986}
2987
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002988/*
2989 * only read the first part of the ondisk header, without the snaps info
2990 */
Alex Elder117973f2012-08-31 17:29:55 -05002991static int rbd_dev_v1_refresh(struct rbd_device *rbd_dev, u64 *hver)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002992{
2993 int ret;
2994 struct rbd_image_header h;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002995
2996 ret = rbd_read_header(rbd_dev, &h);
2997 if (ret < 0)
2998 return ret;
2999
Josh Durgina51aa0c2011-12-05 10:35:04 -08003000 down_write(&rbd_dev->header_rwsem);
3001
Alex Elder94785542012-10-09 13:50:17 -07003002 /* Update image size, and check for resize of mapped image */
3003 rbd_dev->header.image_size = h.image_size;
3004 rbd_update_mapping_size(rbd_dev);
Sage Weil9db4b3e2011-04-19 22:49:06 -07003005
Alex Elder849b4262012-07-09 21:04:24 -05003006 /* rbd_dev->header.object_prefix shouldn't change */
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003007 kfree(rbd_dev->header.snap_sizes);
Alex Elder849b4262012-07-09 21:04:24 -05003008 kfree(rbd_dev->header.snap_names);
Josh Durgind1d25642011-12-05 14:03:05 -08003009 /* osd requests may still refer to snapc */
3010 ceph_put_snap_context(rbd_dev->header.snapc);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003011
Alex Elderb8136232012-07-25 09:32:41 -05003012 if (hver)
3013 *hver = h.obj_version;
Josh Durgina71b8912011-12-05 18:10:44 -08003014 rbd_dev->header.obj_version = h.obj_version;
Josh Durgin93a24e02011-12-05 10:41:28 -08003015 rbd_dev->header.image_size = h.image_size;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003016 rbd_dev->header.snapc = h.snapc;
3017 rbd_dev->header.snap_names = h.snap_names;
3018 rbd_dev->header.snap_sizes = h.snap_sizes;
Alex Elder849b4262012-07-09 21:04:24 -05003019 /* Free the extra copy of the object prefix */
3020 WARN_ON(strcmp(rbd_dev->header.object_prefix, h.object_prefix));
3021 kfree(h.object_prefix);
3022
Alex Elder304f6802012-08-31 17:29:52 -05003023 ret = rbd_dev_snaps_update(rbd_dev);
3024 if (!ret)
3025 ret = rbd_dev_snaps_register(rbd_dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003026
Josh Durginc6666012011-11-21 17:11:12 -08003027 up_write(&rbd_dev->header_rwsem);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003028
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003029 return ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003030}
3031
/*
 * Re-read the image header, dispatching on the image format (1 or
 * 2).  Serialized by ctl_mutex.  Afterwards the block device size
 * is revalidated so user space sees any resize.  If @hver is
 * non-NULL the header object version is returned through it.
 *
 * Returns 0 on success or a negative errno.
 */
static int rbd_dev_refresh(struct rbd_device *rbd_dev, u64 *hver)
{
	int ret;

	rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
	if (rbd_dev->image_format == 1)
		ret = rbd_dev_v1_refresh(rbd_dev, hver);
	else
		ret = rbd_dev_v2_refresh(rbd_dev, hver);
	mutex_unlock(&ctl_mutex);
	/* Propagate a possible size change to the block layer */
	revalidate_disk(rbd_dev->disk);

	return ret;
}
3047
/*
 * Allocate and configure the gendisk and request queue for a mapped
 * rbd image.  On success rbd_dev->disk is set and its capacity
 * reflects the current mapping size.  The disk is not made visible
 * here (no add_disk() call in this function).
 *
 * Returns 0 on success, -ENOMEM if either allocation fails.
 */
static int rbd_init_disk(struct rbd_device *rbd_dev)
{
        struct gendisk *disk;
        struct request_queue *q;
        u64 segment_size;

        /* create gendisk info */
        disk = alloc_disk(RBD_MINORS_PER_MAJOR);
        if (!disk)
                return -ENOMEM;

        snprintf(disk->disk_name, sizeof(disk->disk_name), RBD_DRV_NAME "%d",
                 rbd_dev->dev_id);
        disk->major = rbd_dev->major;
        disk->first_minor = 0;
        disk->fops = &rbd_bd_ops;
        disk->private_data = rbd_dev;

        /* Request queue shares the device's lock with the request fn */
        q = blk_init_queue(rbd_request_fn, &rbd_dev->lock);
        if (!q)
                goto out_disk;

        /* We use the default size, but let's be explicit about it. */
        blk_queue_physical_block_size(q, SECTOR_SIZE);

        /* set io sizes to object size */
        segment_size = rbd_obj_bytes(&rbd_dev->header);
        blk_queue_max_hw_sectors(q, segment_size / SECTOR_SIZE);
        blk_queue_max_segment_size(q, segment_size);
        blk_queue_io_min(q, segment_size);
        blk_queue_io_opt(q, segment_size);

        /* Keep bios from spanning rados object boundaries */
        blk_queue_merge_bvec(q, rbd_merge_bvec);
        disk->queue = q;

        q->queuedata = rbd_dev;

        rbd_dev->disk = disk;

        /* mapping.size is in bytes; capacity is in 512-byte sectors */
        set_capacity(rbd_dev->disk, rbd_dev->mapping.size / SECTOR_SIZE);

        return 0;
out_disk:
        put_disk(disk);

        return -ENOMEM;
}
3095
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003096/*
3097 sysfs
3098*/
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003099
/* Map a sysfs struct device back to its containing rbd_device. */
static struct rbd_device *dev_to_rbd_dev(struct device *dev)
{
        return container_of(dev, struct rbd_device, dev);
}
3104
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003105static ssize_t rbd_size_show(struct device *dev,
3106 struct device_attribute *attr, char *buf)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003107{
Alex Elder593a9e72012-02-07 12:03:37 -06003108 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Josh Durgina51aa0c2011-12-05 10:35:04 -08003109 sector_t size;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003110
Josh Durgina51aa0c2011-12-05 10:35:04 -08003111 down_read(&rbd_dev->header_rwsem);
3112 size = get_capacity(rbd_dev->disk);
3113 up_read(&rbd_dev->header_rwsem);
3114
3115 return sprintf(buf, "%llu\n", (unsigned long long) size * SECTOR_SIZE);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003116}
3117
Alex Elder34b13182012-07-13 20:35:12 -05003118/*
3119 * Note this shows the features for whatever's mapped, which is not
3120 * necessarily the base image.
3121 */
3122static ssize_t rbd_features_show(struct device *dev,
3123 struct device_attribute *attr, char *buf)
3124{
3125 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
3126
3127 return sprintf(buf, "0x%016llx\n",
3128 (unsigned long long) rbd_dev->mapping.features);
3129}
3130
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003131static ssize_t rbd_major_show(struct device *dev,
3132 struct device_attribute *attr, char *buf)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003133{
Alex Elder593a9e72012-02-07 12:03:37 -06003134 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003135
3136 return sprintf(buf, "%d\n", rbd_dev->major);
3137}
3138
3139static ssize_t rbd_client_id_show(struct device *dev,
3140 struct device_attribute *attr, char *buf)
3141{
Alex Elder593a9e72012-02-07 12:03:37 -06003142 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003143
Alex Elder1dbb4392012-01-24 10:08:37 -06003144 return sprintf(buf, "client%lld\n",
3145 ceph_client_id(rbd_dev->rbd_client->client));
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003146}
3147
3148static ssize_t rbd_pool_show(struct device *dev,
3149 struct device_attribute *attr, char *buf)
3150{
Alex Elder593a9e72012-02-07 12:03:37 -06003151 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003152
Alex Elder0d7dbfc2012-10-25 23:34:41 -05003153 return sprintf(buf, "%s\n", rbd_dev->spec->pool_name);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003154}
3155
Alex Elder9bb2f332012-07-12 10:46:35 -05003156static ssize_t rbd_pool_id_show(struct device *dev,
3157 struct device_attribute *attr, char *buf)
3158{
3159 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
3160
Alex Elder0d7dbfc2012-10-25 23:34:41 -05003161 return sprintf(buf, "%llu\n",
3162 (unsigned long long) rbd_dev->spec->pool_id);
Alex Elder9bb2f332012-07-12 10:46:35 -05003163}
3164
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003165static ssize_t rbd_name_show(struct device *dev,
3166 struct device_attribute *attr, char *buf)
3167{
Alex Elder593a9e72012-02-07 12:03:37 -06003168 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003169
Alex Eldera92ffdf2012-10-30 19:40:33 -05003170 if (rbd_dev->spec->image_name)
3171 return sprintf(buf, "%s\n", rbd_dev->spec->image_name);
3172
3173 return sprintf(buf, "(unknown)\n");
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003174}
3175
Alex Elder589d30e2012-07-10 20:30:11 -05003176static ssize_t rbd_image_id_show(struct device *dev,
3177 struct device_attribute *attr, char *buf)
3178{
3179 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
3180
Alex Elder0d7dbfc2012-10-25 23:34:41 -05003181 return sprintf(buf, "%s\n", rbd_dev->spec->image_id);
Alex Elder589d30e2012-07-10 20:30:11 -05003182}
3183
Alex Elder34b13182012-07-13 20:35:12 -05003184/*
3185 * Shows the name of the currently-mapped snapshot (or
3186 * RBD_SNAP_HEAD_NAME for the base image).
3187 */
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003188static ssize_t rbd_snap_show(struct device *dev,
3189 struct device_attribute *attr,
3190 char *buf)
3191{
Alex Elder593a9e72012-02-07 12:03:37 -06003192 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003193
Alex Elder0d7dbfc2012-10-25 23:34:41 -05003194 return sprintf(buf, "%s\n", rbd_dev->spec->snap_name);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003195}
3196
/*
 * For an rbd v2 image, shows the pool id, image id, and snapshot id
 * for the parent image.  If there is no parent, simply shows
 * "(no parent image)".
 */
static ssize_t rbd_parent_show(struct device *dev,
                             struct device_attribute *attr,
                             char *buf)
{
        struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
        struct rbd_spec *spec = rbd_dev->parent_spec;
        int count;
        char *bufp = buf;       /* write position, advanced after each field */

        if (!spec)
                return sprintf(buf, "(no parent image)\n");

        count = sprintf(bufp, "pool_id %llu\npool_name %s\n",
                        (unsigned long long) spec->pool_id, spec->pool_name);
        if (count < 0)
                return count;
        bufp += count;

        /* A parent's image name may be unknown; see rbd_dev_image_name() */
        count = sprintf(bufp, "image_id %s\nimage_name %s\n", spec->image_id,
                        spec->image_name ? spec->image_name : "(unknown)");
        if (count < 0)
                return count;
        bufp += count;

        count = sprintf(bufp, "snap_id %llu\nsnap_name %s\n",
                        (unsigned long long) spec->snap_id, spec->snap_name);
        if (count < 0)
                return count;
        bufp += count;

        count = sprintf(bufp, "overlap %llu\n", rbd_dev->parent_overlap);
        if (count < 0)
                return count;
        bufp += count;

        /* Total number of bytes written into buf */
        return (ssize_t) (bufp - buf);
}
3239
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003240static ssize_t rbd_image_refresh(struct device *dev,
3241 struct device_attribute *attr,
3242 const char *buf,
3243 size_t size)
3244{
Alex Elder593a9e72012-02-07 12:03:37 -06003245 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Alex Elderb8136232012-07-25 09:32:41 -05003246 int ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003247
Alex Elder117973f2012-08-31 17:29:55 -05003248 ret = rbd_dev_refresh(rbd_dev, NULL);
Alex Elderb8136232012-07-25 09:32:41 -05003249
3250 return ret < 0 ? ret : size;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003251}
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003252
/* sysfs attributes exported for each mapped rbd device */
static DEVICE_ATTR(size, S_IRUGO, rbd_size_show, NULL);
static DEVICE_ATTR(features, S_IRUGO, rbd_features_show, NULL);
static DEVICE_ATTR(major, S_IRUGO, rbd_major_show, NULL);
static DEVICE_ATTR(client_id, S_IRUGO, rbd_client_id_show, NULL);
static DEVICE_ATTR(pool, S_IRUGO, rbd_pool_show, NULL);
static DEVICE_ATTR(pool_id, S_IRUGO, rbd_pool_id_show, NULL);
static DEVICE_ATTR(name, S_IRUGO, rbd_name_show, NULL);
static DEVICE_ATTR(image_id, S_IRUGO, rbd_image_id_show, NULL);
static DEVICE_ATTR(refresh, S_IWUSR, NULL, rbd_image_refresh);
static DEVICE_ATTR(current_snap, S_IRUGO, rbd_snap_show, NULL);
static DEVICE_ATTR(parent, S_IRUGO, rbd_parent_show, NULL);

static struct attribute *rbd_attrs[] = {
        &dev_attr_size.attr,
        &dev_attr_features.attr,
        &dev_attr_major.attr,
        &dev_attr_client_id.attr,
        &dev_attr_pool.attr,
        &dev_attr_pool_id.attr,
        &dev_attr_name.attr,
        &dev_attr_image_id.attr,
        &dev_attr_current_snap.attr,
        &dev_attr_parent.attr,
        &dev_attr_refresh.attr,
        NULL
};

static struct attribute_group rbd_attr_group = {
        .attrs = rbd_attrs,
};

static const struct attribute_group *rbd_attr_groups[] = {
        &rbd_attr_group,
        NULL
};

/*
 * Empty release: the embedded struct device's lifetime is tied to the
 * enclosing rbd_device, which is freed by rbd_dev_destroy().
 */
static void rbd_sysfs_dev_release(struct device *dev)
{
}

static struct device_type rbd_device_type = {
        .name           = "rbd",
        .groups         = rbd_attr_groups,
        .release        = rbd_sysfs_dev_release,
};
3298
3299
3300/*
3301 sysfs - snapshots
3302*/
3303
3304static ssize_t rbd_snap_size_show(struct device *dev,
3305 struct device_attribute *attr,
3306 char *buf)
3307{
3308 struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
3309
Josh Durgin3591538f2011-12-05 18:25:13 -08003310 return sprintf(buf, "%llu\n", (unsigned long long)snap->size);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003311}
3312
3313static ssize_t rbd_snap_id_show(struct device *dev,
3314 struct device_attribute *attr,
3315 char *buf)
3316{
3317 struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
3318
Josh Durgin3591538f2011-12-05 18:25:13 -08003319 return sprintf(buf, "%llu\n", (unsigned long long)snap->id);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003320}
3321
Alex Elder34b13182012-07-13 20:35:12 -05003322static ssize_t rbd_snap_features_show(struct device *dev,
3323 struct device_attribute *attr,
3324 char *buf)
3325{
3326 struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
3327
3328 return sprintf(buf, "0x%016llx\n",
3329 (unsigned long long) snap->features);
3330}
3331
/* sysfs attributes exported for each snapshot of a mapped image */
static DEVICE_ATTR(snap_size, S_IRUGO, rbd_snap_size_show, NULL);
static DEVICE_ATTR(snap_id, S_IRUGO, rbd_snap_id_show, NULL);
static DEVICE_ATTR(snap_features, S_IRUGO, rbd_snap_features_show, NULL);

static struct attribute *rbd_snap_attrs[] = {
        &dev_attr_snap_size.attr,
        &dev_attr_snap_id.attr,
        &dev_attr_snap_features.attr,
        NULL,
};

static struct attribute_group rbd_snap_attr_group = {
        .attrs = rbd_snap_attrs,
};

/*
 * Device release callback for a snapshot: frees the rbd_snap and the
 * snapshot name it owns.  Invoked when the last reference to the
 * snapshot's struct device is dropped.
 */
static void rbd_snap_dev_release(struct device *dev)
{
        struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
        kfree(snap->name);
        kfree(snap);
}

static const struct attribute_group *rbd_snap_attr_groups[] = {
        &rbd_snap_attr_group,
        NULL
};

static struct device_type rbd_snap_device_type = {
        .groups = rbd_snap_attr_groups,
        .release = rbd_snap_dev_release,
};
3363
/*
 * Take an additional reference on an image spec.  The caller must
 * already hold a reference.  Returns @spec for convenience.
 */
static struct rbd_spec *rbd_spec_get(struct rbd_spec *spec)
{
        kref_get(&spec->kref);

        return spec;
}
3370
static void rbd_spec_free(struct kref *kref);
/* Drop a reference on a spec; a NULL spec is allowed and ignored. */
static void rbd_spec_put(struct rbd_spec *spec)
{
        if (spec)
                kref_put(&spec->kref, rbd_spec_free);
}
3377
3378static struct rbd_spec *rbd_spec_alloc(void)
3379{
3380 struct rbd_spec *spec;
3381
3382 spec = kzalloc(sizeof (*spec), GFP_KERNEL);
3383 if (!spec)
3384 return NULL;
3385 kref_init(&spec->kref);
3386
Alex Elder8b8fb992012-10-26 17:25:24 -05003387 return spec;
3388}
3389
/*
 * kref release callback: free a spec along with all the name strings
 * it owns.  Called only via rbd_spec_put() on the last reference.
 */
static void rbd_spec_free(struct kref *kref)
{
        struct rbd_spec *spec = container_of(kref, struct rbd_spec, kref);

        kfree(spec->pool_name);
        kfree(spec->image_id);
        kfree(spec->image_name);
        kfree(spec->snap_name);
        kfree(spec);
}
3400
/*
 * Allocate and initialize an rbd_device.  Takes over the caller's
 * references to @rbdc and @spec; both are released again by
 * rbd_dev_destroy().  Returns NULL on allocation failure.
 */
static struct rbd_device *rbd_dev_create(struct rbd_client *rbdc,
                                struct rbd_spec *spec)
{
        struct rbd_device *rbd_dev;

        rbd_dev = kzalloc(sizeof (*rbd_dev), GFP_KERNEL);
        if (!rbd_dev)
                return NULL;

        spin_lock_init(&rbd_dev->lock);
        rbd_dev->flags = 0;
        INIT_LIST_HEAD(&rbd_dev->node);
        INIT_LIST_HEAD(&rbd_dev->snaps);
        init_rwsem(&rbd_dev->header_rwsem);

        rbd_dev->spec = spec;
        rbd_dev->rbd_client = rbdc;

        /* Initialize the layout used for all rbd requests */

        rbd_dev->layout.fl_stripe_unit = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
        rbd_dev->layout.fl_stripe_count = cpu_to_le32(1);
        rbd_dev->layout.fl_object_size = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
        /* pool id must fit in 32 bits for the file layout (checked at probe) */
        rbd_dev->layout.fl_pg_pool = cpu_to_le32((u32) spec->pool_id);

        return rbd_dev;
}
3428
/*
 * Tear down an rbd_device, dropping the client and spec references
 * that were handed over at rbd_dev_create() time, along with any
 * parent spec and header object name.
 */
static void rbd_dev_destroy(struct rbd_device *rbd_dev)
{
        rbd_spec_put(rbd_dev->parent_spec);
        kfree(rbd_dev->header_name);
        rbd_put_client(rbd_dev->rbd_client);
        rbd_spec_put(rbd_dev->spec);
        kfree(rbd_dev);
}
3437
/*
 * Report whether a snapshot's sysfs device has been registered.  The
 * device type is only assigned at registration time, so it must agree
 * with device_is_registered(); "!ret ^ reg" asserts ret == reg.
 */
static bool rbd_snap_registered(struct rbd_snap *snap)
{
        bool ret = snap->dev.type == &rbd_snap_device_type;
        bool reg = device_is_registered(&snap->dev);

        rbd_assert(!ret ^ reg);

        return ret;
}
3447
/*
 * Unlink a snapshot from its device's snapshot list, and unregister
 * its sysfs device if one was registered (unregistration ultimately
 * frees the rbd_snap via rbd_snap_dev_release()).
 */
static void rbd_remove_snap_dev(struct rbd_snap *snap)
{
        list_del(&snap->node);
        if (device_is_registered(&snap->dev))
                device_unregister(&snap->dev);
}
3454
/*
 * Register a snapshot's sysfs device under @parent.  The sysfs name
 * is the snapshot name prefixed with RBD_SNAP_DEV_NAME_PREFIX.
 * Returns the device_register() result.
 */
static int rbd_register_snap_dev(struct rbd_snap *snap,
                                 struct device *parent)
{
        struct device *dev = &snap->dev;
        int ret;

        dev->type = &rbd_snap_device_type;
        dev->parent = parent;
        dev->release = rbd_snap_dev_release;
        dev_set_name(dev, "%s%s", RBD_SNAP_DEV_NAME_PREFIX, snap->name);
        dout("%s: registering device for snapshot %s\n", __func__, snap->name);

        ret = device_register(dev);

        return ret;
}
3471
Alex Elder4e891e02012-07-10 20:30:10 -05003472static struct rbd_snap *__rbd_add_snap_dev(struct rbd_device *rbd_dev,
Alex Elderc8d18422012-07-10 20:30:11 -05003473 const char *snap_name,
Alex Elder34b13182012-07-13 20:35:12 -05003474 u64 snap_id, u64 snap_size,
3475 u64 snap_features)
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003476{
Alex Elder4e891e02012-07-10 20:30:10 -05003477 struct rbd_snap *snap;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003478 int ret;
Alex Elder4e891e02012-07-10 20:30:10 -05003479
3480 snap = kzalloc(sizeof (*snap), GFP_KERNEL);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003481 if (!snap)
Alex Elder4e891e02012-07-10 20:30:10 -05003482 return ERR_PTR(-ENOMEM);
3483
3484 ret = -ENOMEM;
Alex Elderc8d18422012-07-10 20:30:11 -05003485 snap->name = kstrdup(snap_name, GFP_KERNEL);
Alex Elder4e891e02012-07-10 20:30:10 -05003486 if (!snap->name)
3487 goto err;
3488
Alex Elderc8d18422012-07-10 20:30:11 -05003489 snap->id = snap_id;
3490 snap->size = snap_size;
Alex Elder34b13182012-07-13 20:35:12 -05003491 snap->features = snap_features;
Alex Elder4e891e02012-07-10 20:30:10 -05003492
3493 return snap;
3494
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003495err:
3496 kfree(snap->name);
3497 kfree(snap);
Alex Elder4e891e02012-07-10 20:30:10 -05003498
3499 return ERR_PTR(ret);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003500}
3501
Alex Eldercd892122012-07-03 16:01:19 -05003502static char *rbd_dev_v1_snap_info(struct rbd_device *rbd_dev, u32 which,
3503 u64 *snap_size, u64 *snap_features)
3504{
3505 char *snap_name;
3506
3507 rbd_assert(which < rbd_dev->header.snapc->num_snaps);
3508
3509 *snap_size = rbd_dev->header.snap_sizes[which];
3510 *snap_features = 0; /* No features for v1 */
3511
3512 /* Skip over names until we find the one we are looking for */
3513
3514 snap_name = rbd_dev->header.snap_names;
3515 while (which--)
3516 snap_name += strlen(snap_name) + 1;
3517
3518 return snap_name;
3519}
3520
/*
 * Get the size and object order for an image snapshot, or if
 * snap_id is CEPH_NOSNAP, gets this information for the base
 * image.
 *
 * Calls the "get_size" rbd class method on the image's header
 * object; results are returned through @order and @snap_size.
 * Returns 0 on success or a negative errno.
 */
static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id,
                                u8 *order, u64 *snap_size)
{
        __le64 snapid = cpu_to_le64(snap_id);
        int ret;
        struct {
                u8 order;
                __le64 size;
        } __attribute__ ((packed)) size_buf = { 0 };

        ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
                                "rbd", "get_size",
                                (char *) &snapid, sizeof (snapid),
                                (char *) &size_buf, sizeof (size_buf), NULL);
        dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
        if (ret < 0)
                return ret;

        *order = size_buf.order;
        *snap_size = le64_to_cpu(size_buf.size);

        dout(" snap_id 0x%016llx order = %u, snap_size = %llu\n",
                (unsigned long long) snap_id, (unsigned int) *order,
                (unsigned long long) *snap_size);

        return 0;
}
3553
/* Fetch size and object order for the base image (CEPH_NOSNAP). */
static int rbd_dev_v2_image_size(struct rbd_device *rbd_dev)
{
        return _rbd_dev_v2_snap_size(rbd_dev, CEPH_NOSNAP,
                                        &rbd_dev->header.obj_order,
                                        &rbd_dev->header.image_size);
}
3560
/*
 * Fetch the object name prefix for a format 2 image via the
 * "get_object_prefix" class method and store a copy in
 * rbd_dev->header.object_prefix (left NULL on failure).
 * Returns 0 on success or a negative errno.
 */
static int rbd_dev_v2_object_prefix(struct rbd_device *rbd_dev)
{
        void *reply_buf;
        int ret;
        void *p;

        reply_buf = kzalloc(RBD_OBJ_PREFIX_LEN_MAX, GFP_KERNEL);
        if (!reply_buf)
                return -ENOMEM;

        ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
                                "rbd", "get_object_prefix",
                                NULL, 0,
                                reply_buf, RBD_OBJ_PREFIX_LEN_MAX, NULL);
        dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
        if (ret < 0)
                goto out;

        /* Decode the length-prefixed string into its own allocation */
        p = reply_buf;
        rbd_dev->header.object_prefix = ceph_extract_encoded_string(&p,
                                                p + RBD_OBJ_PREFIX_LEN_MAX,
                                                NULL, GFP_NOIO);

        if (IS_ERR(rbd_dev->header.object_prefix)) {
                ret = PTR_ERR(rbd_dev->header.object_prefix);
                rbd_dev->header.object_prefix = NULL;
        } else {
                dout(" object_prefix = %s\n", rbd_dev->header.object_prefix);
        }

out:
        kfree(reply_buf);

        return ret;
}
3596
/*
 * Get the feature bits in effect for the given snapshot (or for the
 * base image if snap_id is CEPH_NOSNAP) via the "get_features" class
 * method.  Returns -ENXIO if the image requires ("incompat") features
 * this client does not support; otherwise 0 or a negative errno.
 */
static int _rbd_dev_v2_snap_features(struct rbd_device *rbd_dev, u64 snap_id,
                u64 *snap_features)
{
        __le64 snapid = cpu_to_le64(snap_id);
        struct {
                __le64 features;
                __le64 incompat;
        } features_buf = { 0 };
        u64 incompat;
        int ret;

        ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
                                "rbd", "get_features",
                                (char *) &snapid, sizeof (snapid),
                                (char *) &features_buf, sizeof (features_buf),
                                NULL);
        dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
        if (ret < 0)
                return ret;

        /* Refuse to map an image needing features we don't implement */
        incompat = le64_to_cpu(features_buf.incompat);
        if (incompat & ~RBD_FEATURES_SUPPORTED)
                return -ENXIO;

        *snap_features = le64_to_cpu(features_buf.features);

        dout(" snap_id 0x%016llx features = 0x%016llx incompat = 0x%016llx\n",
                (unsigned long long) snap_id,
                (unsigned long long) *snap_features,
                (unsigned long long) le64_to_cpu(features_buf.incompat));

        return 0;
}
3630
/* Fetch the feature bits for the base image (CEPH_NOSNAP). */
static int rbd_dev_v2_features(struct rbd_device *rbd_dev)
{
        return _rbd_dev_v2_snap_features(rbd_dev, CEPH_NOSNAP,
                                                &rbd_dev->header.features);
}
3636
/*
 * Fetch layering (parent) information for a format 2 image via the
 * "get_parent" class method: parent pool id, image id, snapshot id,
 * and the overlap size.  If the image has no parent (pool id is
 * CEPH_NOPOOL) this succeeds and leaves rbd_dev->parent_spec NULL.
 * On success the new spec's ownership passes to rbd_dev.
 * Returns 0 on success or a negative errno.
 */
static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev)
{
        struct rbd_spec *parent_spec;
        size_t size;
        void *reply_buf = NULL;
        __le64 snapid;
        void *p;
        void *end;
        char *image_id;
        u64 overlap;
        int ret;

        parent_spec = rbd_spec_alloc();
        if (!parent_spec)
                return -ENOMEM;

        /* Worst-case encoded reply size */
        size = sizeof (__le64) +                        /* pool_id */
                sizeof (__le32) + RBD_IMAGE_ID_LEN_MAX + /* image_id */
                sizeof (__le64) +                       /* snap_id */
                sizeof (__le64);                        /* overlap */
        reply_buf = kmalloc(size, GFP_KERNEL);
        if (!reply_buf) {
                ret = -ENOMEM;
                goto out_err;
        }

        snapid = cpu_to_le64(CEPH_NOSNAP);
        ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
                                "rbd", "get_parent",
                                (char *) &snapid, sizeof (snapid),
                                (char *) reply_buf, size, NULL);
        dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
        if (ret < 0)
                goto out_err;

        /* -ERANGE results if any of the _safe decodes runs short */
        ret = -ERANGE;
        p = reply_buf;
        end = (char *) reply_buf + size;
        ceph_decode_64_safe(&p, end, parent_spec->pool_id, out_err);
        if (parent_spec->pool_id == CEPH_NOPOOL)
                goto out;       /* No parent?  No problem. */

        /* The ceph file layout needs to fit pool id in 32 bits */

        ret = -EIO;
        if (WARN_ON(parent_spec->pool_id > (u64) U32_MAX))
                goto out;

        image_id = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL);
        if (IS_ERR(image_id)) {
                ret = PTR_ERR(image_id);
                goto out_err;
        }
        parent_spec->image_id = image_id;
        ceph_decode_64_safe(&p, end, parent_spec->snap_id, out_err);
        ceph_decode_64_safe(&p, end, overlap, out_err);

        rbd_dev->parent_overlap = overlap;
        rbd_dev->parent_spec = parent_spec;
        parent_spec = NULL;     /* rbd_dev now owns this */
out:
        ret = 0;
out_err:
        kfree(reply_buf);
        rbd_spec_put(parent_spec);      /* no-op once ownership transferred */

        return ret;
}
3705
/*
 * Look up an image's name in the pool's RBD_DIRECTORY object given
 * its image id, using the "dir_get_name" class method.  Returns a
 * kmalloc'd name string (caller frees) or NULL on any failure;
 * callers treat the name as optional.
 */
static char *rbd_dev_image_name(struct rbd_device *rbd_dev)
{
        size_t image_id_size;
        char *image_id;
        void *p;
        void *end;
        size_t size;
        void *reply_buf = NULL;
        size_t len = 0;
        char *image_name = NULL;
        int ret;

        rbd_assert(!rbd_dev->spec->image_name);

        /* Encode the image id as a length-prefixed string argument */

        len = strlen(rbd_dev->spec->image_id);
        image_id_size = sizeof (__le32) + len;
        image_id = kmalloc(image_id_size, GFP_KERNEL);
        if (!image_id)
                return NULL;

        p = image_id;
        end = (char *) image_id + image_id_size;
        ceph_encode_string(&p, end, rbd_dev->spec->image_id, (u32) len);

        size = sizeof (__le32) + RBD_IMAGE_NAME_LEN_MAX;
        reply_buf = kmalloc(size, GFP_KERNEL);
        if (!reply_buf)
                goto out;

        ret = rbd_obj_method_sync(rbd_dev, RBD_DIRECTORY,
                                "rbd", "dir_get_name",
                                image_id, image_id_size,
                                (char *) reply_buf, size, NULL);
        if (ret < 0)
                goto out;
        p = reply_buf;
        end = (char *) reply_buf + size;
        image_name = ceph_extract_encoded_string(&p, end, &len, GFP_KERNEL);
        if (IS_ERR(image_name))
                image_name = NULL;      /* best effort; NULL means unknown */
        else
                dout("%s: name is %s len is %zd\n", __func__, image_name, len);
out:
        kfree(reply_buf);
        kfree(image_id);

        return image_name;
}
3754
3755/*
3756 * When a parent image gets probed, we only have the pool, image,
3757 * and snapshot ids but not the names of any of them. This call
3758 * is made later to fill in those names. It has to be done after
3759 * rbd_dev_snaps_update() has completed because some of the
3760 * information (in particular, snapshot name) is not available
3761 * until then.
3762 */
3763static int rbd_dev_probe_update_spec(struct rbd_device *rbd_dev)
3764{
3765 struct ceph_osd_client *osdc;
3766 const char *name;
3767 void *reply_buf = NULL;
3768 int ret;
3769
3770 if (rbd_dev->spec->pool_name)
3771 return 0; /* Already have the names */
3772
3773 /* Look up the pool name */
3774
3775 osdc = &rbd_dev->rbd_client->client->osdc;
3776 name = ceph_pg_pool_name_by_id(osdc->osdmap, rbd_dev->spec->pool_id);
Alex Elder935dc892012-11-01 10:17:15 -05003777 if (!name) {
3778 rbd_warn(rbd_dev, "there is no pool with id %llu",
3779 rbd_dev->spec->pool_id); /* Really a BUG() */
3780 return -EIO;
3781 }
Alex Elder9e15b772012-10-30 19:40:33 -05003782
3783 rbd_dev->spec->pool_name = kstrdup(name, GFP_KERNEL);
3784 if (!rbd_dev->spec->pool_name)
3785 return -ENOMEM;
3786
3787 /* Fetch the image name; tolerate failure here */
3788
3789 name = rbd_dev_image_name(rbd_dev);
Alex Elder69e7a022012-11-01 08:39:26 -05003790 if (name)
Alex Elder9e15b772012-10-30 19:40:33 -05003791 rbd_dev->spec->image_name = (char *) name;
Alex Elder69e7a022012-11-01 08:39:26 -05003792 else
Alex Elder06ecc6c2012-11-01 10:17:15 -05003793 rbd_warn(rbd_dev, "unable to get image name");
Alex Elder9e15b772012-10-30 19:40:33 -05003794
3795 /* Look up the snapshot name. */
3796
3797 name = rbd_snap_name(rbd_dev, rbd_dev->spec->snap_id);
3798 if (!name) {
Alex Elder935dc892012-11-01 10:17:15 -05003799 rbd_warn(rbd_dev, "no snapshot with id %llu",
3800 rbd_dev->spec->snap_id); /* Really a BUG() */
Alex Elder9e15b772012-10-30 19:40:33 -05003801 ret = -EIO;
3802 goto out_err;
3803 }
3804 rbd_dev->spec->snap_name = kstrdup(name, GFP_KERNEL);
3805 if(!rbd_dev->spec->snap_name)
3806 goto out_err;
3807
3808 return 0;
3809out_err:
3810 kfree(reply_buf);
3811 kfree(rbd_dev->spec->pool_name);
3812 rbd_dev->spec->pool_name = NULL;
3813
3814 return ret;
3815}
3816
Alex Elder6e14b1a2012-07-03 16:01:19 -05003817static int rbd_dev_v2_snap_context(struct rbd_device *rbd_dev, u64 *ver)
Alex Elder35d489f2012-07-03 16:01:19 -05003818{
3819 size_t size;
3820 int ret;
3821 void *reply_buf;
3822 void *p;
3823 void *end;
3824 u64 seq;
3825 u32 snap_count;
3826 struct ceph_snap_context *snapc;
3827 u32 i;
3828
3829 /*
3830 * We'll need room for the seq value (maximum snapshot id),
3831 * snapshot count, and array of that many snapshot ids.
3832 * For now we have a fixed upper limit on the number we're
3833 * prepared to receive.
3834 */
3835 size = sizeof (__le64) + sizeof (__le32) +
3836 RBD_MAX_SNAP_COUNT * sizeof (__le64);
3837 reply_buf = kzalloc(size, GFP_KERNEL);
3838 if (!reply_buf)
3839 return -ENOMEM;
3840
Alex Elder36be9a72013-01-19 00:30:28 -06003841 ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
Alex Elder35d489f2012-07-03 16:01:19 -05003842 "rbd", "get_snapcontext",
3843 NULL, 0,
Alex Elder07b23912012-11-09 08:43:16 -06003844 reply_buf, size, ver);
Alex Elder36be9a72013-01-19 00:30:28 -06003845 dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
Alex Elder35d489f2012-07-03 16:01:19 -05003846 if (ret < 0)
3847 goto out;
3848
3849 ret = -ERANGE;
3850 p = reply_buf;
3851 end = (char *) reply_buf + size;
3852 ceph_decode_64_safe(&p, end, seq, out);
3853 ceph_decode_32_safe(&p, end, snap_count, out);
3854
3855 /*
3856 * Make sure the reported number of snapshot ids wouldn't go
3857 * beyond the end of our buffer. But before checking that,
3858 * make sure the computed size of the snapshot context we
3859 * allocate is representable in a size_t.
3860 */
3861 if (snap_count > (SIZE_MAX - sizeof (struct ceph_snap_context))
3862 / sizeof (u64)) {
3863 ret = -EINVAL;
3864 goto out;
3865 }
3866 if (!ceph_has_room(&p, end, snap_count * sizeof (__le64)))
3867 goto out;
3868
3869 size = sizeof (struct ceph_snap_context) +
3870 snap_count * sizeof (snapc->snaps[0]);
3871 snapc = kmalloc(size, GFP_KERNEL);
3872 if (!snapc) {
3873 ret = -ENOMEM;
3874 goto out;
3875 }
3876
3877 atomic_set(&snapc->nref, 1);
3878 snapc->seq = seq;
3879 snapc->num_snaps = snap_count;
3880 for (i = 0; i < snap_count; i++)
3881 snapc->snaps[i] = ceph_decode_64(&p);
3882
3883 rbd_dev->header.snapc = snapc;
3884
3885 dout(" snap context seq = %llu, snap_count = %u\n",
3886 (unsigned long long) seq, (unsigned int) snap_count);
3887
3888out:
3889 kfree(reply_buf);
3890
3891 return 0;
3892}
3893
Alex Elderb8b1e2d2012-07-03 16:01:19 -05003894static char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev, u32 which)
3895{
3896 size_t size;
3897 void *reply_buf;
3898 __le64 snap_id;
3899 int ret;
3900 void *p;
3901 void *end;
Alex Elderb8b1e2d2012-07-03 16:01:19 -05003902 char *snap_name;
3903
3904 size = sizeof (__le32) + RBD_MAX_SNAP_NAME_LEN;
3905 reply_buf = kmalloc(size, GFP_KERNEL);
3906 if (!reply_buf)
3907 return ERR_PTR(-ENOMEM);
3908
3909 snap_id = cpu_to_le64(rbd_dev->header.snapc->snaps[which]);
Alex Elder36be9a72013-01-19 00:30:28 -06003910 ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
Alex Elderb8b1e2d2012-07-03 16:01:19 -05003911 "rbd", "get_snapshot_name",
3912 (char *) &snap_id, sizeof (snap_id),
Alex Elder07b23912012-11-09 08:43:16 -06003913 reply_buf, size, NULL);
Alex Elder36be9a72013-01-19 00:30:28 -06003914 dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
Alex Elderb8b1e2d2012-07-03 16:01:19 -05003915 if (ret < 0)
3916 goto out;
3917
3918 p = reply_buf;
3919 end = (char *) reply_buf + size;
Alex Eldere5c35532012-10-25 23:34:41 -05003920 snap_name = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL);
Alex Elderb8b1e2d2012-07-03 16:01:19 -05003921 if (IS_ERR(snap_name)) {
3922 ret = PTR_ERR(snap_name);
3923 goto out;
3924 } else {
3925 dout(" snap_id 0x%016llx snap_name = %s\n",
3926 (unsigned long long) le64_to_cpu(snap_id), snap_name);
3927 }
3928 kfree(reply_buf);
3929
3930 return snap_name;
3931out:
3932 kfree(reply_buf);
3933
3934 return ERR_PTR(ret);
3935}
3936
3937static char *rbd_dev_v2_snap_info(struct rbd_device *rbd_dev, u32 which,
3938 u64 *snap_size, u64 *snap_features)
3939{
Alex Eldere0b49862013-01-09 14:44:18 -06003940 u64 snap_id;
Alex Elderb8b1e2d2012-07-03 16:01:19 -05003941 u8 order;
3942 int ret;
3943
3944 snap_id = rbd_dev->header.snapc->snaps[which];
3945 ret = _rbd_dev_v2_snap_size(rbd_dev, snap_id, &order, snap_size);
3946 if (ret)
3947 return ERR_PTR(ret);
3948 ret = _rbd_dev_v2_snap_features(rbd_dev, snap_id, snap_features);
3949 if (ret)
3950 return ERR_PTR(ret);
3951
3952 return rbd_dev_v2_snap_name(rbd_dev, which);
3953}
3954
3955static char *rbd_dev_snap_info(struct rbd_device *rbd_dev, u32 which,
3956 u64 *snap_size, u64 *snap_features)
3957{
3958 if (rbd_dev->image_format == 1)
3959 return rbd_dev_v1_snap_info(rbd_dev, which,
3960 snap_size, snap_features);
3961 if (rbd_dev->image_format == 2)
3962 return rbd_dev_v2_snap_info(rbd_dev, which,
3963 snap_size, snap_features);
3964 return ERR_PTR(-EINVAL);
3965}
3966
Alex Elder117973f2012-08-31 17:29:55 -05003967static int rbd_dev_v2_refresh(struct rbd_device *rbd_dev, u64 *hver)
3968{
3969 int ret;
3970 __u8 obj_order;
3971
3972 down_write(&rbd_dev->header_rwsem);
3973
3974 /* Grab old order first, to see if it changes */
3975
3976 obj_order = rbd_dev->header.obj_order,
3977 ret = rbd_dev_v2_image_size(rbd_dev);
3978 if (ret)
3979 goto out;
3980 if (rbd_dev->header.obj_order != obj_order) {
3981 ret = -EIO;
3982 goto out;
3983 }
3984 rbd_update_mapping_size(rbd_dev);
3985
3986 ret = rbd_dev_v2_snap_context(rbd_dev, hver);
3987 dout("rbd_dev_v2_snap_context returned %d\n", ret);
3988 if (ret)
3989 goto out;
3990 ret = rbd_dev_snaps_update(rbd_dev);
3991 dout("rbd_dev_snaps_update returned %d\n", ret);
3992 if (ret)
3993 goto out;
3994 ret = rbd_dev_snaps_register(rbd_dev);
3995 dout("rbd_dev_snaps_register returned %d\n", ret);
3996out:
3997 up_write(&rbd_dev->header_rwsem);
3998
3999 return ret;
4000}
4001
Alex Elder9d475de2012-07-03 16:01:19 -05004002/*
Alex Elder35938152012-08-02 11:29:46 -05004003 * Scan the rbd device's current snapshot list and compare it to the
4004 * newly-received snapshot context. Remove any existing snapshots
4005 * not present in the new snapshot context. Add a new snapshot for
4006 * any snaphots in the snapshot context not in the current list.
4007 * And verify there are no changes to snapshots we already know
4008 * about.
4009 *
4010 * Assumes the snapshots in the snapshot context are sorted by
4011 * snapshot id, highest id first. (Snapshots in the rbd_dev's list
4012 * are also maintained in that order.)
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004013 */
Alex Elder304f6802012-08-31 17:29:52 -05004014static int rbd_dev_snaps_update(struct rbd_device *rbd_dev)
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004015{
Alex Elder35938152012-08-02 11:29:46 -05004016 struct ceph_snap_context *snapc = rbd_dev->header.snapc;
4017 const u32 snap_count = snapc->num_snaps;
Alex Elder35938152012-08-02 11:29:46 -05004018 struct list_head *head = &rbd_dev->snaps;
4019 struct list_head *links = head->next;
4020 u32 index = 0;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004021
Alex Elder9fcbb802012-08-23 23:48:49 -05004022 dout("%s: snap count is %u\n", __func__, (unsigned int) snap_count);
Alex Elder35938152012-08-02 11:29:46 -05004023 while (index < snap_count || links != head) {
4024 u64 snap_id;
4025 struct rbd_snap *snap;
Alex Eldercd892122012-07-03 16:01:19 -05004026 char *snap_name;
4027 u64 snap_size = 0;
4028 u64 snap_features = 0;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004029
Alex Elder35938152012-08-02 11:29:46 -05004030 snap_id = index < snap_count ? snapc->snaps[index]
4031 : CEPH_NOSNAP;
4032 snap = links != head ? list_entry(links, struct rbd_snap, node)
4033 : NULL;
Alex Elderaafb2302012-09-06 16:00:54 -05004034 rbd_assert(!snap || snap->id != CEPH_NOSNAP);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004035
Alex Elder35938152012-08-02 11:29:46 -05004036 if (snap_id == CEPH_NOSNAP || (snap && snap->id > snap_id)) {
4037 struct list_head *next = links->next;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004038
Alex Elder6d292902013-01-14 12:43:31 -06004039 /*
4040 * A previously-existing snapshot is not in
4041 * the new snap context.
4042 *
4043 * If the now missing snapshot is the one the
4044 * image is mapped to, clear its exists flag
4045 * so we can avoid sending any more requests
4046 * to it.
4047 */
Alex Elder0d7dbfc2012-10-25 23:34:41 -05004048 if (rbd_dev->spec->snap_id == snap->id)
Alex Elder6d292902013-01-14 12:43:31 -06004049 clear_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags);
Alex Elder41f38c22012-10-25 23:34:40 -05004050 rbd_remove_snap_dev(snap);
Alex Elder9fcbb802012-08-23 23:48:49 -05004051 dout("%ssnap id %llu has been removed\n",
Alex Elder0d7dbfc2012-10-25 23:34:41 -05004052 rbd_dev->spec->snap_id == snap->id ?
4053 "mapped " : "",
Alex Elder9fcbb802012-08-23 23:48:49 -05004054 (unsigned long long) snap->id);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004055
Alex Elder35938152012-08-02 11:29:46 -05004056 /* Done with this list entry; advance */
4057
4058 links = next;
4059 continue;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004060 }
Alex Elder35938152012-08-02 11:29:46 -05004061
Alex Elderb8b1e2d2012-07-03 16:01:19 -05004062 snap_name = rbd_dev_snap_info(rbd_dev, index,
4063 &snap_size, &snap_features);
Alex Eldercd892122012-07-03 16:01:19 -05004064 if (IS_ERR(snap_name))
4065 return PTR_ERR(snap_name);
4066
Alex Elder9fcbb802012-08-23 23:48:49 -05004067 dout("entry %u: snap_id = %llu\n", (unsigned int) snap_count,
4068 (unsigned long long) snap_id);
Alex Elder35938152012-08-02 11:29:46 -05004069 if (!snap || (snap_id != CEPH_NOSNAP && snap->id < snap_id)) {
4070 struct rbd_snap *new_snap;
4071
4072 /* We haven't seen this snapshot before */
4073
Alex Elderc8d18422012-07-10 20:30:11 -05004074 new_snap = __rbd_add_snap_dev(rbd_dev, snap_name,
Alex Eldercd892122012-07-03 16:01:19 -05004075 snap_id, snap_size, snap_features);
Alex Elder9fcbb802012-08-23 23:48:49 -05004076 if (IS_ERR(new_snap)) {
4077 int err = PTR_ERR(new_snap);
4078
4079 dout(" failed to add dev, error %d\n", err);
4080
4081 return err;
4082 }
Alex Elder35938152012-08-02 11:29:46 -05004083
4084 /* New goes before existing, or at end of list */
4085
Alex Elder9fcbb802012-08-23 23:48:49 -05004086 dout(" added dev%s\n", snap ? "" : " at end\n");
Alex Elder35938152012-08-02 11:29:46 -05004087 if (snap)
4088 list_add_tail(&new_snap->node, &snap->node);
4089 else
Alex Elder523f3252012-08-30 00:16:37 -05004090 list_add_tail(&new_snap->node, head);
Alex Elder35938152012-08-02 11:29:46 -05004091 } else {
4092 /* Already have this one */
4093
Alex Elder9fcbb802012-08-23 23:48:49 -05004094 dout(" already present\n");
4095
Alex Eldercd892122012-07-03 16:01:19 -05004096 rbd_assert(snap->size == snap_size);
Alex Elderaafb2302012-09-06 16:00:54 -05004097 rbd_assert(!strcmp(snap->name, snap_name));
Alex Eldercd892122012-07-03 16:01:19 -05004098 rbd_assert(snap->features == snap_features);
Alex Elder35938152012-08-02 11:29:46 -05004099
4100 /* Done with this list entry; advance */
4101
4102 links = links->next;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004103 }
Alex Elder35938152012-08-02 11:29:46 -05004104
4105 /* Advance to the next entry in the snapshot context */
4106
4107 index++;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004108 }
Alex Elder9fcbb802012-08-23 23:48:49 -05004109 dout("%s: done\n", __func__);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004110
4111 return 0;
4112}
4113
Alex Elder304f6802012-08-31 17:29:52 -05004114/*
4115 * Scan the list of snapshots and register the devices for any that
4116 * have not already been registered.
4117 */
4118static int rbd_dev_snaps_register(struct rbd_device *rbd_dev)
4119{
4120 struct rbd_snap *snap;
4121 int ret = 0;
4122
Alex Elder37206ee2013-02-20 17:32:08 -06004123 dout("%s:\n", __func__);
Alex Elder86ff77b2012-08-31 17:29:53 -05004124 if (WARN_ON(!device_is_registered(&rbd_dev->dev)))
4125 return -EIO;
Alex Elder304f6802012-08-31 17:29:52 -05004126
4127 list_for_each_entry(snap, &rbd_dev->snaps, node) {
4128 if (!rbd_snap_registered(snap)) {
4129 ret = rbd_register_snap_dev(snap, &rbd_dev->dev);
4130 if (ret < 0)
4131 break;
4132 }
4133 }
4134 dout("%s: returning %d\n", __func__, ret);
4135
4136 return ret;
4137}
4138
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004139static int rbd_bus_add_dev(struct rbd_device *rbd_dev)
4140{
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004141 struct device *dev;
Alex Eldercd789ab2012-08-30 00:16:38 -05004142 int ret;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004143
4144 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004145
Alex Eldercd789ab2012-08-30 00:16:38 -05004146 dev = &rbd_dev->dev;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004147 dev->bus = &rbd_bus_type;
4148 dev->type = &rbd_device_type;
4149 dev->parent = &rbd_root_dev;
4150 dev->release = rbd_dev_release;
Alex Elderde71a292012-07-03 16:01:19 -05004151 dev_set_name(dev, "%d", rbd_dev->dev_id);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004152 ret = device_register(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004153
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004154 mutex_unlock(&ctl_mutex);
Alex Eldercd789ab2012-08-30 00:16:38 -05004155
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004156 return ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004157}
4158
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004159static void rbd_bus_del_dev(struct rbd_device *rbd_dev)
4160{
4161 device_unregister(&rbd_dev->dev);
4162}
4163
Alex Eldere2839302012-08-29 17:11:06 -05004164static atomic64_t rbd_dev_id_max = ATOMIC64_INIT(0);
Alex Elder1ddbe942012-01-29 13:57:44 -06004165
4166/*
Alex Elder499afd52012-02-02 08:13:29 -06004167 * Get a unique rbd identifier for the given new rbd_dev, and add
4168 * the rbd_dev to the global list. The minimum rbd id is 1.
Alex Elder1ddbe942012-01-29 13:57:44 -06004169 */
Alex Eldere2839302012-08-29 17:11:06 -05004170static void rbd_dev_id_get(struct rbd_device *rbd_dev)
Alex Elderb7f23c32012-01-29 13:57:43 -06004171{
Alex Eldere2839302012-08-29 17:11:06 -05004172 rbd_dev->dev_id = atomic64_inc_return(&rbd_dev_id_max);
Alex Elder499afd52012-02-02 08:13:29 -06004173
4174 spin_lock(&rbd_dev_list_lock);
4175 list_add_tail(&rbd_dev->node, &rbd_dev_list);
4176 spin_unlock(&rbd_dev_list_lock);
Alex Eldere2839302012-08-29 17:11:06 -05004177 dout("rbd_dev %p given dev id %llu\n", rbd_dev,
4178 (unsigned long long) rbd_dev->dev_id);
Alex Elder1ddbe942012-01-29 13:57:44 -06004179}
Alex Elderb7f23c32012-01-29 13:57:43 -06004180
Alex Elder1ddbe942012-01-29 13:57:44 -06004181/*
Alex Elder499afd52012-02-02 08:13:29 -06004182 * Remove an rbd_dev from the global list, and record that its
4183 * identifier is no longer in use.
Alex Elder1ddbe942012-01-29 13:57:44 -06004184 */
Alex Eldere2839302012-08-29 17:11:06 -05004185static void rbd_dev_id_put(struct rbd_device *rbd_dev)
Alex Elder1ddbe942012-01-29 13:57:44 -06004186{
Alex Elderd184f6b2012-01-29 13:57:44 -06004187 struct list_head *tmp;
Alex Elderde71a292012-07-03 16:01:19 -05004188 int rbd_id = rbd_dev->dev_id;
Alex Elderd184f6b2012-01-29 13:57:44 -06004189 int max_id;
4190
Alex Elderaafb2302012-09-06 16:00:54 -05004191 rbd_assert(rbd_id > 0);
Alex Elder499afd52012-02-02 08:13:29 -06004192
Alex Eldere2839302012-08-29 17:11:06 -05004193 dout("rbd_dev %p released dev id %llu\n", rbd_dev,
4194 (unsigned long long) rbd_dev->dev_id);
Alex Elder499afd52012-02-02 08:13:29 -06004195 spin_lock(&rbd_dev_list_lock);
4196 list_del_init(&rbd_dev->node);
Alex Elderd184f6b2012-01-29 13:57:44 -06004197
4198 /*
4199 * If the id being "put" is not the current maximum, there
4200 * is nothing special we need to do.
4201 */
Alex Eldere2839302012-08-29 17:11:06 -05004202 if (rbd_id != atomic64_read(&rbd_dev_id_max)) {
Alex Elderd184f6b2012-01-29 13:57:44 -06004203 spin_unlock(&rbd_dev_list_lock);
4204 return;
4205 }
4206
4207 /*
4208 * We need to update the current maximum id. Search the
4209 * list to find out what it is. We're more likely to find
4210 * the maximum at the end, so search the list backward.
4211 */
4212 max_id = 0;
4213 list_for_each_prev(tmp, &rbd_dev_list) {
4214 struct rbd_device *rbd_dev;
4215
4216 rbd_dev = list_entry(tmp, struct rbd_device, node);
Alex Elderb213e0b2012-10-10 21:19:13 -07004217 if (rbd_dev->dev_id > max_id)
4218 max_id = rbd_dev->dev_id;
Alex Elderd184f6b2012-01-29 13:57:44 -06004219 }
Alex Elder499afd52012-02-02 08:13:29 -06004220 spin_unlock(&rbd_dev_list_lock);
Alex Elderb7f23c32012-01-29 13:57:43 -06004221
Alex Elder1ddbe942012-01-29 13:57:44 -06004222 /*
Alex Eldere2839302012-08-29 17:11:06 -05004223 * The max id could have been updated by rbd_dev_id_get(), in
Alex Elderd184f6b2012-01-29 13:57:44 -06004224 * which case it now accurately reflects the new maximum.
4225 * Be careful not to overwrite the maximum value in that
4226 * case.
Alex Elder1ddbe942012-01-29 13:57:44 -06004227 */
Alex Eldere2839302012-08-29 17:11:06 -05004228 atomic64_cmpxchg(&rbd_dev_id_max, rbd_id, max_id);
4229 dout(" max dev id has been reset\n");
Alex Elderb7f23c32012-01-29 13:57:43 -06004230}
4231
Alex Eldera725f65e2012-02-02 08:13:30 -06004232/*
Alex Eldere28fff262012-02-02 08:13:30 -06004233 * Skips over white space at *buf, and updates *buf to point to the
4234 * first found non-space character (if any). Returns the length of
Alex Elder593a9e72012-02-07 12:03:37 -06004235 * the token (string of non-white space characters) found. Note
4236 * that *buf must be terminated with '\0'.
Alex Eldere28fff262012-02-02 08:13:30 -06004237 */
/*
 * Advance *buf past any leading whitespace and return the length of
 * the token (maximal run of non-whitespace) now at *buf.  *buf must
 * be NUL-terminated; a return of 0 means no token remains.
 */
static inline size_t next_token(const char **buf)
{
	/* Characters for which isspace() is nonzero in "C"/"POSIX" locales */
	static const char spaces[] = " \f\n\r\t\v";
	const char *p = *buf;

	p += strspn(p, spaces);		/* skip leading whitespace */
	*buf = p;

	return strcspn(p, spaces);	/* length of token at p */
}
4250
4251/*
4252 * Finds the next token in *buf, and if the provided token buffer is
4253 * big enough, copies the found token into it. The result, if
Alex Elder593a9e72012-02-07 12:03:37 -06004254 * copied, is guaranteed to be terminated with '\0'. Note that *buf
4255 * must be terminated with '\0' on entry.
Alex Eldere28fff262012-02-02 08:13:30 -06004256 *
4257 * Returns the length of the token found (not including the '\0').
4258 * Return value will be 0 if no token is found, and it will be >=
4259 * token_size if the token would not fit.
4260 *
Alex Elder593a9e72012-02-07 12:03:37 -06004261 * The *buf pointer will be updated to point beyond the end of the
Alex Eldere28fff262012-02-02 08:13:30 -06004262 * found token. Note that this occurs even if the token buffer is
4263 * too small to hold it.
4264 */
/*
 * Copy the next token from *buf into @token if it fits (result is
 * then NUL-terminated).  Returns the token length regardless; a
 * return >= token_size means the token was too big and not copied.
 * *buf is advanced past the token either way.
 */
static inline size_t copy_token(const char **buf,
				char *token,
				size_t token_size)
{
	size_t len = next_token(buf);

	if (len < token_size) {
		memcpy(token, *buf, len);
		token[len] = '\0';
	}
	*buf += len;

	return len;
}
4280
4281/*
Alex Elderea3352f2012-07-09 21:04:23 -05004282 * Finds the next token in *buf, dynamically allocates a buffer big
4283 * enough to hold a copy of it, and copies the token into the new
4284 * buffer. The copy is guaranteed to be terminated with '\0'. Note
4285 * that a duplicate buffer is created even for a zero-length token.
4286 *
4287 * Returns a pointer to the newly-allocated duplicate, or a null
4288 * pointer if memory for the duplicate was not available. If
4289 * the lenp argument is a non-null pointer, the length of the token
4290 * (not including the '\0') is returned in *lenp.
4291 *
4292 * If successful, the *buf pointer will be updated to point beyond
4293 * the end of the found token.
4294 *
4295 * Note: uses GFP_KERNEL for allocation.
4296 */
4297static inline char *dup_token(const char **buf, size_t *lenp)
4298{
4299 char *dup;
4300 size_t len;
4301
4302 len = next_token(buf);
Alex Elder4caf35f2012-11-01 08:39:27 -05004303 dup = kmemdup(*buf, len + 1, GFP_KERNEL);
Alex Elderea3352f2012-07-09 21:04:23 -05004304 if (!dup)
4305 return NULL;
Alex Elderea3352f2012-07-09 21:04:23 -05004306 *(dup + len) = '\0';
4307 *buf += len;
4308
4309 if (lenp)
4310 *lenp = len;
4311
4312 return dup;
4313}
4314
4315/*
Alex Elder859c31d2012-10-25 23:34:42 -05004316 * Parse the options provided for an "rbd add" (i.e., rbd image
4317 * mapping) request. These arrive via a write to /sys/bus/rbd/add,
4318 * and the data written is passed here via a NUL-terminated buffer.
4319 * Returns 0 if successful or an error code otherwise.
Alex Elderd22f76e2012-07-12 10:46:35 -05004320 *
Alex Elder859c31d2012-10-25 23:34:42 -05004321 * The information extracted from these options is recorded in
4322 * the other parameters which return dynamically-allocated
4323 * structures:
4324 * ceph_opts
4325 * The address of a pointer that will refer to a ceph options
4326 * structure. Caller must release the returned pointer using
4327 * ceph_destroy_options() when it is no longer needed.
4328 * rbd_opts
4329 * Address of an rbd options pointer. Fully initialized by
4330 * this function; caller must release with kfree().
4331 * spec
4332 * Address of an rbd image specification pointer. Fully
4333 * initialized by this function based on parsed options.
4334 * Caller must release with rbd_spec_put().
4335 *
4336 * The options passed take this form:
4337 * <mon_addrs> <options> <pool_name> <image_name> [<snap_id>]
4338 * where:
4339 * <mon_addrs>
4340 * A comma-separated list of one or more monitor addresses.
4341 * A monitor address is an ip address, optionally followed
4342 * by a port number (separated by a colon).
4343 * I.e.: ip1[:port1][,ip2[:port2]...]
4344 * <options>
4345 * A comma-separated list of ceph and/or rbd options.
4346 * <pool_name>
4347 * The name of the rados pool containing the rbd image.
4348 * <image_name>
4349 * The name of the image in that pool to map.
4350 * <snap_id>
4351 * An optional snapshot id. If provided, the mapping will
4352 * present data from the image at the time that snapshot was
4353 * created. The image head is used if no snapshot id is
4354 * provided. Snapshot mappings are always read-only.
Alex Eldera725f65e2012-02-02 08:13:30 -06004355 */
Alex Elder859c31d2012-10-25 23:34:42 -05004356static int rbd_add_parse_args(const char *buf,
Alex Elderdc79b112012-10-25 23:34:41 -05004357 struct ceph_options **ceph_opts,
Alex Elder859c31d2012-10-25 23:34:42 -05004358 struct rbd_options **opts,
4359 struct rbd_spec **rbd_spec)
Alex Eldera725f65e2012-02-02 08:13:30 -06004360{
Alex Elderd22f76e2012-07-12 10:46:35 -05004361 size_t len;
Alex Elder859c31d2012-10-25 23:34:42 -05004362 char *options;
Alex Elder0ddebc02012-10-25 23:34:41 -05004363 const char *mon_addrs;
4364 size_t mon_addrs_size;
Alex Elder859c31d2012-10-25 23:34:42 -05004365 struct rbd_spec *spec = NULL;
Alex Elder4e9afeb2012-10-25 23:34:41 -05004366 struct rbd_options *rbd_opts = NULL;
Alex Elder859c31d2012-10-25 23:34:42 -05004367 struct ceph_options *copts;
Alex Elderdc79b112012-10-25 23:34:41 -05004368 int ret;
Alex Eldere28fff262012-02-02 08:13:30 -06004369
4370 /* The first four tokens are required */
4371
Alex Elder7ef32142012-02-02 08:13:30 -06004372 len = next_token(&buf);
Alex Elder4fb5d6712012-11-01 10:17:15 -05004373 if (!len) {
4374 rbd_warn(NULL, "no monitor address(es) provided");
4375 return -EINVAL;
4376 }
Alex Elder0ddebc02012-10-25 23:34:41 -05004377 mon_addrs = buf;
Alex Elderf28e5652012-10-25 23:34:41 -05004378 mon_addrs_size = len + 1;
Alex Elder7ef32142012-02-02 08:13:30 -06004379 buf += len;
Alex Eldera725f65e2012-02-02 08:13:30 -06004380
Alex Elderdc79b112012-10-25 23:34:41 -05004381 ret = -EINVAL;
Alex Elderf28e5652012-10-25 23:34:41 -05004382 options = dup_token(&buf, NULL);
4383 if (!options)
Alex Elderdc79b112012-10-25 23:34:41 -05004384 return -ENOMEM;
Alex Elder4fb5d6712012-11-01 10:17:15 -05004385 if (!*options) {
4386 rbd_warn(NULL, "no options provided");
4387 goto out_err;
4388 }
Alex Eldera725f65e2012-02-02 08:13:30 -06004389
Alex Elder859c31d2012-10-25 23:34:42 -05004390 spec = rbd_spec_alloc();
4391 if (!spec)
Alex Elderf28e5652012-10-25 23:34:41 -05004392 goto out_mem;
Alex Elder859c31d2012-10-25 23:34:42 -05004393
4394 spec->pool_name = dup_token(&buf, NULL);
4395 if (!spec->pool_name)
4396 goto out_mem;
Alex Elder4fb5d6712012-11-01 10:17:15 -05004397 if (!*spec->pool_name) {
4398 rbd_warn(NULL, "no pool name provided");
4399 goto out_err;
4400 }
Alex Eldere28fff262012-02-02 08:13:30 -06004401
Alex Elder69e7a022012-11-01 08:39:26 -05004402 spec->image_name = dup_token(&buf, NULL);
Alex Elder859c31d2012-10-25 23:34:42 -05004403 if (!spec->image_name)
Alex Elderf28e5652012-10-25 23:34:41 -05004404 goto out_mem;
Alex Elder4fb5d6712012-11-01 10:17:15 -05004405 if (!*spec->image_name) {
4406 rbd_warn(NULL, "no image name provided");
4407 goto out_err;
4408 }
Alex Eldere28fff262012-02-02 08:13:30 -06004409
Alex Elderf28e5652012-10-25 23:34:41 -05004410 /*
4411 * Snapshot name is optional; default is to use "-"
4412 * (indicating the head/no snapshot).
4413 */
Alex Elder3feeb8942012-08-31 17:29:52 -05004414 len = next_token(&buf);
Alex Elder820a5f32012-07-09 21:04:24 -05004415 if (!len) {
Alex Elder3feeb8942012-08-31 17:29:52 -05004416 buf = RBD_SNAP_HEAD_NAME; /* No snapshot supplied */
4417 len = sizeof (RBD_SNAP_HEAD_NAME) - 1;
Alex Elderf28e5652012-10-25 23:34:41 -05004418 } else if (len > RBD_MAX_SNAP_NAME_LEN) {
Alex Elderdc79b112012-10-25 23:34:41 -05004419 ret = -ENAMETOOLONG;
Alex Elderf28e5652012-10-25 23:34:41 -05004420 goto out_err;
Alex Elder849b4262012-07-09 21:04:24 -05004421 }
Alex Elder4caf35f2012-11-01 08:39:27 -05004422 spec->snap_name = kmemdup(buf, len + 1, GFP_KERNEL);
Alex Elder859c31d2012-10-25 23:34:42 -05004423 if (!spec->snap_name)
Alex Elderf28e5652012-10-25 23:34:41 -05004424 goto out_mem;
Alex Elder859c31d2012-10-25 23:34:42 -05004425 *(spec->snap_name + len) = '\0';
Alex Eldere5c35532012-10-25 23:34:41 -05004426
Alex Elder0ddebc02012-10-25 23:34:41 -05004427 /* Initialize all rbd options to the defaults */
Alex Eldere28fff262012-02-02 08:13:30 -06004428
Alex Elder4e9afeb2012-10-25 23:34:41 -05004429 rbd_opts = kzalloc(sizeof (*rbd_opts), GFP_KERNEL);
4430 if (!rbd_opts)
4431 goto out_mem;
4432
4433 rbd_opts->read_only = RBD_READ_ONLY_DEFAULT;
Alex Elderd22f76e2012-07-12 10:46:35 -05004434
Alex Elder859c31d2012-10-25 23:34:42 -05004435 copts = ceph_parse_options(options, mon_addrs,
Alex Elder0ddebc02012-10-25 23:34:41 -05004436 mon_addrs + mon_addrs_size - 1,
Alex Elder4e9afeb2012-10-25 23:34:41 -05004437 parse_rbd_opts_token, rbd_opts);
Alex Elder859c31d2012-10-25 23:34:42 -05004438 if (IS_ERR(copts)) {
4439 ret = PTR_ERR(copts);
Alex Elderdc79b112012-10-25 23:34:41 -05004440 goto out_err;
4441 }
Alex Elder859c31d2012-10-25 23:34:42 -05004442 kfree(options);
4443
4444 *ceph_opts = copts;
Alex Elder4e9afeb2012-10-25 23:34:41 -05004445 *opts = rbd_opts;
Alex Elder859c31d2012-10-25 23:34:42 -05004446 *rbd_spec = spec;
Alex Elder0ddebc02012-10-25 23:34:41 -05004447
Alex Elderdc79b112012-10-25 23:34:41 -05004448 return 0;
Alex Elderf28e5652012-10-25 23:34:41 -05004449out_mem:
Alex Elderdc79b112012-10-25 23:34:41 -05004450 ret = -ENOMEM;
Alex Elderd22f76e2012-07-12 10:46:35 -05004451out_err:
Alex Elder859c31d2012-10-25 23:34:42 -05004452 kfree(rbd_opts);
4453 rbd_spec_put(spec);
Alex Elderf28e5652012-10-25 23:34:41 -05004454 kfree(options);
Alex Elderd22f76e2012-07-12 10:46:35 -05004455
Alex Elderdc79b112012-10-25 23:34:41 -05004456 return ret;
Alex Eldera725f65e2012-02-02 08:13:30 -06004457}
4458
Alex Elder589d30e2012-07-10 20:30:11 -05004459/*
4460 * An rbd format 2 image has a unique identifier, distinct from the
4461 * name given to it by the user. Internally, that identifier is
4462 * what's used to specify the names of objects related to the image.
4463 *
4464 * A special "rbd id" object is used to map an rbd image name to its
4465 * id. If that object doesn't exist, then there is no v2 rbd image
4466 * with the supplied name.
4467 *
4468 * This function will record the given rbd_dev's image_id field if
4469 * it can be determined, and in that case will return 0. If any
4470 * errors occur a negative errno will be returned and the rbd_dev's
4471 * image_id field will be unchanged (and should be NULL).
4472 */
4473static int rbd_dev_image_id(struct rbd_device *rbd_dev)
4474{
4475 int ret;
4476 size_t size;
4477 char *object_name;
4478 void *response;
4479 void *p;
4480
Alex Elder2f82ee52012-10-30 19:40:33 -05004481 /* If we already have it we don't need to look it up */
4482
4483 if (rbd_dev->spec->image_id)
4484 return 0;
4485
Alex Elder589d30e2012-07-10 20:30:11 -05004486 /*
Alex Elder2c0d0a12012-10-30 19:40:33 -05004487 * When probing a parent image, the image id is already
4488 * known (and the image name likely is not). There's no
4489 * need to fetch the image id again in this case.
4490 */
4491 if (rbd_dev->spec->image_id)
4492 return 0;
4493
4494 /*
Alex Elder589d30e2012-07-10 20:30:11 -05004495 * First, see if the format 2 image id file exists, and if
4496 * so, get the image's persistent id from it.
4497 */
Alex Elder69e7a022012-11-01 08:39:26 -05004498 size = sizeof (RBD_ID_PREFIX) + strlen(rbd_dev->spec->image_name);
Alex Elder589d30e2012-07-10 20:30:11 -05004499 object_name = kmalloc(size, GFP_NOIO);
4500 if (!object_name)
4501 return -ENOMEM;
Alex Elder0d7dbfc2012-10-25 23:34:41 -05004502 sprintf(object_name, "%s%s", RBD_ID_PREFIX, rbd_dev->spec->image_name);
Alex Elder589d30e2012-07-10 20:30:11 -05004503 dout("rbd id object name is %s\n", object_name);
4504
4505 /* Response will be an encoded string, which includes a length */
4506
4507 size = sizeof (__le32) + RBD_IMAGE_ID_LEN_MAX;
4508 response = kzalloc(size, GFP_NOIO);
4509 if (!response) {
4510 ret = -ENOMEM;
4511 goto out;
4512 }
4513
Alex Elder36be9a72013-01-19 00:30:28 -06004514 ret = rbd_obj_method_sync(rbd_dev, object_name,
Alex Elder589d30e2012-07-10 20:30:11 -05004515 "rbd", "get_id",
4516 NULL, 0,
Alex Elder07b23912012-11-09 08:43:16 -06004517 response, RBD_IMAGE_ID_LEN_MAX, NULL);
Alex Elder36be9a72013-01-19 00:30:28 -06004518 dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
Alex Elder589d30e2012-07-10 20:30:11 -05004519 if (ret < 0)
4520 goto out;
4521
4522 p = response;
Alex Elder0d7dbfc2012-10-25 23:34:41 -05004523 rbd_dev->spec->image_id = ceph_extract_encoded_string(&p,
Alex Elder589d30e2012-07-10 20:30:11 -05004524 p + RBD_IMAGE_ID_LEN_MAX,
Alex Elder979ed482012-11-01 08:39:26 -05004525 NULL, GFP_NOIO);
Alex Elder0d7dbfc2012-10-25 23:34:41 -05004526 if (IS_ERR(rbd_dev->spec->image_id)) {
4527 ret = PTR_ERR(rbd_dev->spec->image_id);
4528 rbd_dev->spec->image_id = NULL;
Alex Elder589d30e2012-07-10 20:30:11 -05004529 } else {
Alex Elder0d7dbfc2012-10-25 23:34:41 -05004530 dout("image_id is %s\n", rbd_dev->spec->image_id);
Alex Elder589d30e2012-07-10 20:30:11 -05004531 }
4532out:
4533 kfree(response);
4534 kfree(object_name);
4535
4536 return ret;
4537}
4538
Alex Eldera30b71b2012-07-10 20:30:11 -05004539static int rbd_dev_v1_probe(struct rbd_device *rbd_dev)
4540{
4541 int ret;
4542 size_t size;
4543
4544 /* Version 1 images have no id; empty string is used */
4545
Alex Elder0d7dbfc2012-10-25 23:34:41 -05004546 rbd_dev->spec->image_id = kstrdup("", GFP_KERNEL);
4547 if (!rbd_dev->spec->image_id)
Alex Eldera30b71b2012-07-10 20:30:11 -05004548 return -ENOMEM;
Alex Eldera30b71b2012-07-10 20:30:11 -05004549
4550 /* Record the header object name for this rbd image. */
4551
Alex Elder69e7a022012-11-01 08:39:26 -05004552 size = strlen(rbd_dev->spec->image_name) + sizeof (RBD_SUFFIX);
Alex Eldera30b71b2012-07-10 20:30:11 -05004553 rbd_dev->header_name = kmalloc(size, GFP_KERNEL);
4554 if (!rbd_dev->header_name) {
4555 ret = -ENOMEM;
4556 goto out_err;
4557 }
Alex Elder0d7dbfc2012-10-25 23:34:41 -05004558 sprintf(rbd_dev->header_name, "%s%s",
4559 rbd_dev->spec->image_name, RBD_SUFFIX);
Alex Eldera30b71b2012-07-10 20:30:11 -05004560
4561 /* Populate rbd image metadata */
4562
4563 ret = rbd_read_header(rbd_dev, &rbd_dev->header);
4564 if (ret < 0)
4565 goto out_err;
Alex Elder86b00e02012-10-25 23:34:42 -05004566
4567 /* Version 1 images have no parent (no layering) */
4568
4569 rbd_dev->parent_spec = NULL;
4570 rbd_dev->parent_overlap = 0;
4571
Alex Eldera30b71b2012-07-10 20:30:11 -05004572 rbd_dev->image_format = 1;
4573
4574 dout("discovered version 1 image, header name is %s\n",
4575 rbd_dev->header_name);
4576
4577 return 0;
4578
4579out_err:
4580 kfree(rbd_dev->header_name);
4581 rbd_dev->header_name = NULL;
Alex Elder0d7dbfc2012-10-25 23:34:41 -05004582 kfree(rbd_dev->spec->image_id);
4583 rbd_dev->spec->image_id = NULL;
Alex Eldera30b71b2012-07-10 20:30:11 -05004584
4585 return ret;
4586}
4587
/*
 * Probe an image known to be rbd format 2: build the header object
 * name from the (already-known) image id, then fetch the image's
 * size/order, object prefix, features, optional parent info, and
 * snapshot context.  Returns 0 on success or a negative errno; on
 * failure everything allocated or referenced here is released.
 */
static int rbd_dev_v2_probe(struct rbd_device *rbd_dev)
{
	size_t size;
	int ret;
	u64 ver = 0;

	/*
	 * Image id was filled in by the caller.  Record the header
	 * object name for this rbd image.
	 */
	size = sizeof (RBD_HEADER_PREFIX) + strlen(rbd_dev->spec->image_id);
	rbd_dev->header_name = kmalloc(size, GFP_KERNEL);
	if (!rbd_dev->header_name)
		return -ENOMEM;
	sprintf(rbd_dev->header_name, "%s%s",
		RBD_HEADER_PREFIX, rbd_dev->spec->image_id);

	/* Get the size and object order for the image */

	ret = rbd_dev_v2_image_size(rbd_dev);
	if (ret < 0)
		goto out_err;

	/* Get the object prefix (a.k.a. block_name) for the image */

	ret = rbd_dev_v2_object_prefix(rbd_dev);
	if (ret < 0)
		goto out_err;

	/* Get and check the features for the image */

	ret = rbd_dev_v2_features(rbd_dev);
	if (ret < 0)
		goto out_err;

	/* If the image supports layering, get the parent info */

	if (rbd_dev->header.features & RBD_FEATURE_LAYERING) {
		ret = rbd_dev_v2_parent_info(rbd_dev);
		if (ret < 0)
			goto out_err;
	}

	/* crypto and compression type aren't (yet) supported for v2 images */

	rbd_dev->header.crypt_type = 0;
	rbd_dev->header.comp_type = 0;

	/* Get the snapshot context, plus the header version */

	ret = rbd_dev_v2_snap_context(rbd_dev, &ver);
	if (ret)
		goto out_err;
	rbd_dev->header.obj_version = ver;

	rbd_dev->image_format = 2;

	dout("discovered version 2 image, header name is %s\n",
		rbd_dev->header_name);

	return 0;
out_err:
	/* parent_spec may be NULL here; rbd_spec_put() handles that */
	rbd_dev->parent_overlap = 0;
	rbd_spec_put(rbd_dev->parent_spec);
	rbd_dev->parent_spec = NULL;
	kfree(rbd_dev->header_name);
	rbd_dev->header_name = NULL;
	kfree(rbd_dev->header.object_prefix);
	rbd_dev->header.object_prefix = NULL;

	return ret;
}
4660
/*
 * Second half of the probe: the image header has been read (format 1
 * or 2); now update the snapshot list, resolve names in the spec, set
 * up the mapping, allocate a device id and block major, create the
 * gendisk and register with sysfs, recursively probe a parent image
 * if this one is layered, and finally register snapshots, start the
 * header watch, and announce the disk.
 *
 * Note the unusual label order below: once rbd_bus_add_dev() has
 * succeeded, cleanup is the job of the sysfs code, so the err_out_bus
 * path calls rbd_bus_del_dev() and returns immediately rather than
 * falling through to the earlier-stage labels beneath it.
 */
static int rbd_dev_probe_finish(struct rbd_device *rbd_dev)
{
	struct rbd_device *parent = NULL;
	struct rbd_spec *parent_spec = NULL;
	struct rbd_client *rbdc = NULL;
	int ret;

	/* no need to lock here, as rbd_dev is not registered yet */
	ret = rbd_dev_snaps_update(rbd_dev);
	if (ret)
		return ret;

	ret = rbd_dev_probe_update_spec(rbd_dev);
	if (ret)
		goto err_out_snaps;

	ret = rbd_dev_set_mapping(rbd_dev);
	if (ret)
		goto err_out_snaps;

	/* generate unique id: find highest unique id, add one */
	rbd_dev_id_get(rbd_dev);

	/* Fill in the device name, now that we have its id. */
	BUILD_BUG_ON(DEV_NAME_LEN
			< sizeof (RBD_DRV_NAME) + MAX_INT_FORMAT_WIDTH);
	sprintf(rbd_dev->name, "%s%d", RBD_DRV_NAME, rbd_dev->dev_id);

	/* Get our block major device number. */

	ret = register_blkdev(0, rbd_dev->name);
	if (ret < 0)
		goto err_out_id;
	rbd_dev->major = ret;

	/* Set up the blkdev mapping. */

	ret = rbd_init_disk(rbd_dev);
	if (ret)
		goto err_out_blkdev;

	ret = rbd_bus_add_dev(rbd_dev);
	if (ret)
		goto err_out_disk;

	/*
	 * At this point cleanup in the event of an error is the job
	 * of the sysfs code (initiated by rbd_bus_del_dev()).
	 */
	/* Probe the parent if there is one */

	if (rbd_dev->parent_spec) {
		/*
		 * We need to pass a reference to the client and the
		 * parent spec when creating the parent rbd_dev.
		 * Images related by parent/child relationships
		 * always share both.
		 */
		parent_spec = rbd_spec_get(rbd_dev->parent_spec);
		rbdc = __rbd_get_client(rbd_dev->rbd_client);

		parent = rbd_dev_create(rbdc, parent_spec);
		if (!parent) {
			ret = -ENOMEM;
			goto err_out_spec;
		}
		rbdc = NULL;		/* parent now owns reference */
		parent_spec = NULL;	/* parent now owns reference */
		/* recursive: the parent may itself have a parent */
		ret = rbd_dev_probe(parent);
		if (ret < 0)
			goto err_out_parent;
		rbd_dev->parent = parent;
	}

	down_write(&rbd_dev->header_rwsem);
	ret = rbd_dev_snaps_register(rbd_dev);
	up_write(&rbd_dev->header_rwsem);
	if (ret)
		goto err_out_bus;

	/* start watching for changes to the header object */
	ret = rbd_dev_header_watch_sync(rbd_dev, 1);
	if (ret)
		goto err_out_bus;

	/* Everything's ready.  Announce the disk to the world. */

	add_disk(rbd_dev->disk);

	pr_info("%s: added with size 0x%llx\n", rbd_dev->disk->disk_name,
		(unsigned long long) rbd_dev->mapping.size);

	return ret;

err_out_parent:
	rbd_dev_destroy(parent);
err_out_spec:
	rbd_spec_put(parent_spec);
	rbd_put_client(rbdc);
err_out_bus:
	/* this will also clean up rest of rbd_dev stuff */

	rbd_bus_del_dev(rbd_dev);

	return ret;
err_out_disk:
	rbd_free_disk(rbd_dev);
err_out_blkdev:
	unregister_blkdev(rbd_dev->major, rbd_dev->name);
err_out_id:
	rbd_dev_id_put(rbd_dev);
err_out_snaps:
	rbd_remove_all_snaps(rbd_dev);

	return ret;
}
4776
Alex Eldera30b71b2012-07-10 20:30:11 -05004777/*
4778 * Probe for the existence of the header object for the given rbd
4779 * device. For format 2 images this includes determining the image
4780 * id.
4781 */
4782static int rbd_dev_probe(struct rbd_device *rbd_dev)
4783{
4784 int ret;
4785
4786 /*
4787 * Get the id from the image id object. If it's not a
4788 * format 2 image, we'll get ENOENT back, and we'll assume
4789 * it's a format 1 image.
4790 */
4791 ret = rbd_dev_image_id(rbd_dev);
4792 if (ret)
4793 ret = rbd_dev_v1_probe(rbd_dev);
4794 else
4795 ret = rbd_dev_v2_probe(rbd_dev);
Alex Elder83a06262012-10-30 15:47:17 -05004796 if (ret) {
Alex Eldera30b71b2012-07-10 20:30:11 -05004797 dout("probe failed, returning %d\n", ret);
4798
Alex Elder83a06262012-10-30 15:47:17 -05004799 return ret;
4800 }
4801
4802 ret = rbd_dev_probe_finish(rbd_dev);
4803 if (ret)
4804 rbd_header_free(&rbd_dev->header);
4805
Alex Eldera30b71b2012-07-10 20:30:11 -05004806 return ret;
4807}
4808
/*
 * Handle a write to /sys/bus/rbd/add: parse the "add" command in buf,
 * connect to the cluster, look up the pool, create the rbd_device and
 * probe it.  Returns count on success or a negative errno.
 *
 * Ownership handoffs are explicit below: once a pointer is handed to
 * rbd_dev_create() (or the client takes ceph_opts) the local is set
 * to NULL so the error paths don't free it twice.
 */
static ssize_t rbd_add(struct bus_type *bus,
		       const char *buf,
		       size_t count)
{
	struct rbd_device *rbd_dev = NULL;
	struct ceph_options *ceph_opts = NULL;
	struct rbd_options *rbd_opts = NULL;
	struct rbd_spec *spec = NULL;
	struct rbd_client *rbdc;
	struct ceph_osd_client *osdc;
	int rc = -ENOMEM;

	/* pin the module while a device exists; dropped on release */
	if (!try_module_get(THIS_MODULE))
		return -ENODEV;

	/* parse add command */
	rc = rbd_add_parse_args(buf, &ceph_opts, &rbd_opts, &spec);
	if (rc < 0)
		goto err_out_module;

	rbdc = rbd_get_client(ceph_opts);
	if (IS_ERR(rbdc)) {
		rc = PTR_ERR(rbdc);
		goto err_out_args;
	}
	ceph_opts = NULL;	/* rbd_dev client now owns this */

	/* pick the pool */
	osdc = &rbdc->client->osdc;
	rc = ceph_pg_poolid_by_name(osdc->osdmap, spec->pool_name);
	if (rc < 0)
		goto err_out_client;
	spec->pool_id = (u64) rc;

	/* The ceph file layout needs to fit pool id in 32 bits */

	if (WARN_ON(spec->pool_id > (u64) U32_MAX)) {
		rc = -EIO;
		goto err_out_client;
	}

	rbd_dev = rbd_dev_create(rbdc, spec);
	if (!rbd_dev)
		goto err_out_client;	/* rc is still -ENOMEM here */
	rbdc = NULL;		/* rbd_dev now owns this */
	spec = NULL;		/* rbd_dev now owns this */

	rbd_dev->mapping.read_only = rbd_opts->read_only;
	kfree(rbd_opts);
	rbd_opts = NULL;	/* done with this */

	rc = rbd_dev_probe(rbd_dev);
	if (rc < 0)
		goto err_out_rbd_dev;

	return count;
err_out_rbd_dev:
	rbd_dev_destroy(rbd_dev);
err_out_client:
	rbd_put_client(rbdc);
err_out_args:
	if (ceph_opts)
		ceph_destroy_options(ceph_opts);
	kfree(rbd_opts);
	rbd_spec_put(spec);
err_out_module:
	module_put(THIS_MODULE);

	dout("Error adding device %s\n", buf);

	return (ssize_t) rc;
}
4881
Alex Elderde71a292012-07-03 16:01:19 -05004882static struct rbd_device *__rbd_get_dev(unsigned long dev_id)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004883{
4884 struct list_head *tmp;
4885 struct rbd_device *rbd_dev;
4886
Alex Eldere124a82f2012-01-29 13:57:44 -06004887 spin_lock(&rbd_dev_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004888 list_for_each(tmp, &rbd_dev_list) {
4889 rbd_dev = list_entry(tmp, struct rbd_device, node);
Alex Elderde71a292012-07-03 16:01:19 -05004890 if (rbd_dev->dev_id == dev_id) {
Alex Eldere124a82f2012-01-29 13:57:44 -06004891 spin_unlock(&rbd_dev_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004892 return rbd_dev;
Alex Eldere124a82f2012-01-29 13:57:44 -06004893 }
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004894 }
Alex Eldere124a82f2012-01-29 13:57:44 -06004895 spin_unlock(&rbd_dev_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004896 return NULL;
4897}
4898
/*
 * Device-model release callback: tear down an rbd device once sysfs
 * has dropped its last reference.  Order matters: stop the header
 * watch first, then release the disk and block major, then the header
 * contents, then the id and the rbd_dev itself, and finally drop the
 * module reference taken in rbd_add().
 */
static void rbd_dev_release(struct device *dev)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);

	/* tear down the header watch if one was established */
	if (rbd_dev->watch_event)
		rbd_dev_header_watch_sync(rbd_dev, 0);

	/* clean up and free blkdev */
	rbd_free_disk(rbd_dev);
	unregister_blkdev(rbd_dev->major, rbd_dev->name);

	/* release allocated disk header fields */
	rbd_header_free(&rbd_dev->header);

	/* done with the id, and with the rbd_dev */
	rbd_dev_id_put(rbd_dev);
	rbd_assert(rbd_dev->rbd_client != NULL);
	rbd_dev_destroy(rbd_dev);

	/* release module ref */
	module_put(THIS_MODULE);
}
4921
/*
 * Remove one rbd device: drop its snapshots first, then delete it
 * from the bus, which triggers the rest of the teardown via the
 * device release callback (rbd_dev_release()).
 */
static void __rbd_remove(struct rbd_device *rbd_dev)
{
	rbd_remove_all_snaps(rbd_dev);
	rbd_bus_del_dev(rbd_dev);
}
4927
/*
 * Handle a write to /sys/bus/rbd/remove: buf holds the decimal id of
 * the device to remove.  Fails with -ENOENT if no such device exists
 * and -EBUSY if the device is still open; otherwise marks it REMOVING
 * (so no new opens succeed), tears down its chain of parent images,
 * and removes the device itself.  Returns count on success.
 */
static ssize_t rbd_remove(struct bus_type *bus,
			  const char *buf,
			  size_t count)
{
	struct rbd_device *rbd_dev = NULL;
	int target_id, rc;
	unsigned long ul;
	int ret = count;

	rc = strict_strtoul(buf, 10, &ul);
	if (rc)
		return rc;

	/* convert to int; abort if we lost anything in the conversion */
	target_id = (int) ul;
	if (target_id != ul)
		return -EINVAL;

	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);

	rbd_dev = __rbd_get_dev(target_id);
	if (!rbd_dev) {
		ret = -ENOENT;
		goto done;
	}

	/* refuse removal while open; otherwise block further opens */
	spin_lock_irq(&rbd_dev->lock);
	if (rbd_dev->open_count)
		ret = -EBUSY;
	else
		set_bit(RBD_DEV_FLAG_REMOVING, &rbd_dev->flags);
	spin_unlock_irq(&rbd_dev->lock);
	if (ret < 0)
		goto done;

	/*
	 * Tear down the ancestry bottom-up: repeatedly walk to the
	 * oldest ancestor (the parent with no grandparent), remove
	 * it, and detach it from its child, until no parent remains.
	 * NOTE(review): assumes ->parent is non-NULL whenever
	 * ->parent_spec is set — verify against the probe path.
	 */
	while (rbd_dev->parent_spec) {
		struct rbd_device *first = rbd_dev;
		struct rbd_device *second = first->parent;
		struct rbd_device *third;

		/*
		 * Follow to the parent with no grandparent and
		 * remove it.
		 */
		while (second && (third = second->parent)) {
			first = second;
			second = third;
		}
		__rbd_remove(second);
		rbd_spec_put(first->parent_spec);
		first->parent_spec = NULL;
		first->parent_overlap = 0;
		first->parent = NULL;
	}
	__rbd_remove(rbd_dev);

done:
	mutex_unlock(&ctl_mutex);

	return ret;
}
4989
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004990/*
4991 * create control files in sysfs
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004992 * /sys/bus/rbd/...
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004993 */
4994static int rbd_sysfs_init(void)
4995{
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004996 int ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004997
Alex Elderfed4c142012-02-07 12:03:36 -06004998 ret = device_register(&rbd_root_dev);
Alex Elder21079782012-01-24 10:08:36 -06004999 if (ret < 0)
Yehuda Sadehdfc56062010-11-19 14:51:04 -08005000 return ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07005001
Alex Elderfed4c142012-02-07 12:03:36 -06005002 ret = bus_register(&rbd_bus_type);
5003 if (ret < 0)
5004 device_unregister(&rbd_root_dev);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07005005
Yehuda Sadeh602adf42010-08-12 16:11:25 -07005006 return ret;
5007}
5008
/*
 * Undo rbd_sysfs_init(), unregistering in the reverse order of
 * registration: bus first, then the root device.
 */
static void rbd_sysfs_cleanup(void)
{
	bus_unregister(&rbd_bus_type);
	device_unregister(&rbd_root_dev);
}
5014
Alex Eldercc344fa2013-02-19 12:25:56 -06005015static int __init rbd_init(void)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07005016{
5017 int rc;
5018
Alex Elder1e32d342013-01-30 11:13:33 -06005019 if (!libceph_compatible(NULL)) {
5020 rbd_warn(NULL, "libceph incompatibility (quitting)");
5021
5022 return -EINVAL;
5023 }
Yehuda Sadeh602adf42010-08-12 16:11:25 -07005024 rc = rbd_sysfs_init();
5025 if (rc)
5026 return rc;
Alex Elderf0f8cef2012-01-29 13:57:44 -06005027 pr_info("loaded " RBD_DRV_NAME_LONG "\n");
Yehuda Sadeh602adf42010-08-12 16:11:25 -07005028 return 0;
5029}
5030
/* Module exit: remove the sysfs bus and root device. */
static void __exit rbd_exit(void)
{
	rbd_sysfs_cleanup();
}
5035
/* Module entry/exit points and metadata */
module_init(rbd_init);
module_exit(rbd_exit);

MODULE_AUTHOR("Sage Weil <sage@newdream.net>");
MODULE_AUTHOR("Yehuda Sadeh <yehuda@hq.newdream.net>");
MODULE_DESCRIPTION("rados block device");

/* following authorship retained from original osdblk.c */
MODULE_AUTHOR("Jeff Garzik <jeff@garzik.org>");

MODULE_LICENSE("GPL");