/*
   rbd.c -- Export ceph rados objects as a Linux block device


   based on drivers/block/osdblk.c:

   Copyright 2009 Red Hat, Inc.

   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; see the file COPYING.  If not, write to
   the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.



   For usage instructions, please refer to:

		 Documentation/ABI/testing/sysfs-bus-rbd

 */

#include <linux/ceph/libceph.h>
#include <linux/ceph/osd_client.h>
#include <linux/ceph/mon_client.h>
#include <linux/ceph/decode.h>
#include <linux/parser.h>

#include <linux/kernel.h>
#include <linux/device.h>
#include <linux/module.h>
#include <linux/fs.h>
#include <linux/blkdev.h>

#include "rbd_types.h"

#define RBD_DEBUG	/* Activate rbd_assert() calls */

/*
 * The basic unit of block I/O is a sector.  It is interpreted in a
 * number of contexts in Linux (blk, bio, genhd), but the default is
 * universally 512 bytes.  These symbols are just slightly more
 * meaningful than the bare numbers they represent.
 */
#define	SECTOR_SHIFT	9
#define	SECTOR_SIZE	(1ULL << SECTOR_SHIFT)

#define RBD_DRV_NAME "rbd"
#define RBD_DRV_NAME_LONG "rbd (rados block device)"

#define RBD_MINORS_PER_MAJOR	256		/* max minors per blkdev */

#define RBD_SNAP_DEV_NAME_PREFIX	"snap_"
#define RBD_MAX_SNAP_NAME_LEN	\
			(NAME_MAX - (sizeof (RBD_SNAP_DEV_NAME_PREFIX) - 1))

#define RBD_MAX_SNAP_COUNT	510	/* allows max snapc to fit in 4KB */
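/*
 * Note: 510 snapshot ids at 8 bytes each is 4080 bytes, which plus
 * the snapshot context's seq and count fields keeps an encoded
 * snapshot context within a single 4KB page; presumably that is the
 * limit the comment above refers to.
 */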

#define RBD_SNAP_HEAD_NAME	"-"

/* This allows a single page to hold an image name sent by OSD */
#define RBD_IMAGE_NAME_LEN_MAX	(PAGE_SIZE - sizeof (__le32) - 1)
#define RBD_IMAGE_ID_LEN_MAX	64

#define RBD_OBJ_PREFIX_LEN_MAX	64

/* Feature bits */

#define RBD_FEATURE_LAYERING	(1<<0)
#define RBD_FEATURE_STRIPINGV2	(1<<1)
#define RBD_FEATURES_ALL \
	    (RBD_FEATURE_LAYERING | RBD_FEATURE_STRIPINGV2)

/* Features supported by this (client software) implementation. */

#define RBD_FEATURES_SUPPORTED	(RBD_FEATURES_ALL)

/*
 * An RBD device name will be "rbd#", where the "rbd" comes from
 * RBD_DRV_NAME above, and # is a unique integer identifier.
 * MAX_INT_FORMAT_WIDTH is used in ensuring DEV_NAME_LEN is big
 * enough to hold all possible device names.
 */
#define DEV_NAME_LEN		32
#define MAX_INT_FORMAT_WIDTH	((5 * sizeof (int)) / 2 + 1)
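/*
 * Note: each byte of an int contributes at most log10(256) < 2.5
 * decimal digits, so (5 * sizeof (int)) / 2 digits always suffice;
 * the "+ 1" leaves room for a minus sign.  For a 4-byte int this
 * yields 11, exactly enough for "-2147483648".
 */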

/*
 * block device image metadata (in-memory version)
 */
struct rbd_image_header {
	/* These fields never change for a given rbd image */
	char *object_prefix;
	u64 features;
	__u8 obj_order;
	__u8 crypt_type;
	__u8 comp_type;

	/* The remaining fields need to be updated occasionally */
	u64 image_size;
	struct ceph_snap_context *snapc;
	char *snap_names;
	u64 *snap_sizes;

	u64 stripe_unit;
	u64 stripe_count;
};

/*
 * An rbd image specification.
 *
 * The tuple (pool_id, image_id, snap_id) is sufficient to uniquely
 * identify an image.  Each rbd_dev structure includes a pointer to
 * an rbd_spec structure that encapsulates this identity.
 *
 * Each of the id's in an rbd_spec has an associated name.  For a
 * user-mapped image, the names are supplied and the id's associated
 * with them are looked up.  For a layered image, a parent image is
 * defined by the tuple, and the names are looked up.
 *
 * An rbd_dev structure contains a parent_spec pointer which is
 * non-null if the image it represents is a child in a layered
 * image.  This pointer will refer to the rbd_spec structure used
 * by the parent rbd_dev for its own identity (i.e., the structure
 * is shared between the parent and child).
 *
 * Since these structures are populated once, during the discovery
 * phase of image construction, they are effectively immutable so
 * we make no effort to synchronize access to them.
 *
 * Note that code herein does not assume the image name is known (it
 * could be a null pointer).
 */
struct rbd_spec {
	u64 pool_id;
	const char *pool_name;

	const char *image_id;
	const char *image_name;

	u64 snap_id;
	const char *snap_name;

	struct kref kref;
};

/*
 * an instance of the client.  multiple devices may share an rbd client.
 */
struct rbd_client {
	struct ceph_client *client;
	struct kref kref;
	struct list_head node;
};

struct rbd_img_request;
typedef void (*rbd_img_callback_t)(struct rbd_img_request *);

#define	BAD_WHICH	U32_MAX		/* Good which or bad which, which? */

struct rbd_obj_request;
typedef void (*rbd_obj_callback_t)(struct rbd_obj_request *);

enum obj_request_type {
	OBJ_REQUEST_NODATA, OBJ_REQUEST_BIO, OBJ_REQUEST_PAGES
};

enum obj_req_flags {
	OBJ_REQ_DONE,		/* completion flag: not done = 0, done = 1 */
	OBJ_REQ_IMG_DATA,	/* object usage: standalone = 0, image = 1 */
	OBJ_REQ_KNOWN,		/* EXISTS flag valid: no = 0, yes = 1 */
	OBJ_REQ_EXISTS,		/* target exists: no = 0, yes = 1 */
};

struct rbd_obj_request {
	const char *object_name;
	u64 offset;		/* object start byte */
	u64 length;		/* bytes from offset */
	unsigned long flags;

	/*
	 * An object request associated with an image will have its
	 * img_data flag set; a standalone object request will not.
	 *
	 * A standalone object request will have which == BAD_WHICH
	 * and a null obj_request pointer.
	 *
	 * An object request initiated in support of a layered image
	 * object (to check for its existence before a write) will
	 * have which == BAD_WHICH and a non-null obj_request pointer.
	 *
	 * Finally, an object request for rbd image data will have
	 * which != BAD_WHICH, and will have a non-null img_request
	 * pointer.  The value of which will be in the range
	 * 0..(img_request->obj_request_count-1).
	 */
	union {
		struct rbd_obj_request	*obj_request;	/* STAT op */
		struct {
			struct rbd_img_request	*img_request;
			u64			img_offset;
			/* links for img_request->obj_requests list */
			struct list_head	links;
		};
	};
	u32 which;		/* position in image request list */

	enum obj_request_type type;
	union {
		struct bio	*bio_list;
		struct {
			struct page	**pages;
			u32		page_count;
		};
	};
	struct page	**copyup_pages;

	struct ceph_osd_request	*osd_req;

	u64 xferred;		/* bytes transferred */
	u64 version;
	int result;

	rbd_obj_callback_t callback;
	struct completion completion;

	struct kref kref;
};

enum img_req_flags {
	IMG_REQ_WRITE,		/* I/O direction: read = 0, write = 1 */
	IMG_REQ_CHILD,		/* initiator: block = 0, child image = 1 */
	IMG_REQ_LAYERED,	/* ENOENT handling: normal = 0, layered = 1 */
};

struct rbd_img_request {
	struct rbd_device	*rbd_dev;
	u64			offset;	/* starting image byte offset */
	u64			length;	/* byte count from offset */
	unsigned long		flags;
	union {
		u64			snap_id;	/* for reads */
		struct ceph_snap_context *snapc;	/* for writes */
	};
	union {
		struct request		*rq;		/* block request */
		struct rbd_obj_request	*obj_request;	/* obj req initiator */
	};
	struct page		**copyup_pages;
	spinlock_t		completion_lock;/* protects next_completion */
	u32			next_completion;
	rbd_img_callback_t	callback;
	u64			xferred;/* aggregate bytes transferred */
	int			result;	/* first nonzero obj_request result */

	u32			obj_request_count;
	struct list_head	obj_requests;	/* rbd_obj_request structs */

	struct kref		kref;
};

#define for_each_obj_request(ireq, oreq) \
	list_for_each_entry(oreq, &(ireq)->obj_requests, links)
#define for_each_obj_request_from(ireq, oreq) \
	list_for_each_entry_from(oreq, &(ireq)->obj_requests, links)
#define for_each_obj_request_safe(ireq, oreq, n) \
	list_for_each_entry_safe_reverse(oreq, n, &(ireq)->obj_requests, links)
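/*
 * Note that for_each_obj_request_safe() walks the list in reverse:
 * rbd_img_obj_request_del() below asserts that a request may only be
 * removed from the tail of its image request's list, so teardown has
 * to proceed from the tail back toward the head.
 */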

struct rbd_snap {
	const char		*name;
	u64			size;
	struct list_head	node;
	u64			id;
	u64			features;
};

struct rbd_mapping {
	u64			size;
	u64			features;
	bool			read_only;
};

/*
 * a single device
 */
struct rbd_device {
	int			dev_id;		/* blkdev unique id */

	int			major;		/* blkdev assigned major */
	struct gendisk		*disk;		/* blkdev's gendisk and rq */

	u32			image_format;	/* Either 1 or 2 */
	struct rbd_client	*rbd_client;

	char			name[DEV_NAME_LEN]; /* blkdev name, e.g. rbd3 */

	spinlock_t		lock;		/* queue, flags, open_count */

	struct rbd_image_header	header;
	unsigned long		flags;		/* possibly lock protected */
	struct rbd_spec		*spec;

	char			*header_name;

	struct ceph_file_layout	layout;

	struct ceph_osd_event	*watch_event;
	struct rbd_obj_request	*watch_request;

	struct rbd_spec		*parent_spec;
	u64			parent_overlap;
	struct rbd_device	*parent;

	/* protects updating the header */
	struct rw_semaphore	header_rwsem;

	struct rbd_mapping	mapping;

	struct list_head	node;

	/* list of snapshots */
	struct list_head	snaps;

	/* sysfs related */
	struct device		dev;
	unsigned long		open_count;	/* protected by lock */
};

/*
 * Flag bits for rbd_dev->flags.  If atomicity is required,
 * rbd_dev->lock is used to protect access.
 *
 * Currently, only the "removing" flag (which is coupled with the
 * "open_count" field) requires atomic access.
 */
enum rbd_dev_flags {
	RBD_DEV_FLAG_EXISTS,	/* mapped snapshot has not been deleted */
	RBD_DEV_FLAG_REMOVING,	/* this mapping is being removed */
};

static DEFINE_MUTEX(ctl_mutex);	  /* Serialize open/close/setup/teardown */

static LIST_HEAD(rbd_dev_list);    /* devices */
static DEFINE_SPINLOCK(rbd_dev_list_lock);

static LIST_HEAD(rbd_client_list);		/* clients */
static DEFINE_SPINLOCK(rbd_client_list_lock);

static int rbd_img_request_submit(struct rbd_img_request *img_request);

static int rbd_dev_snaps_update(struct rbd_device *rbd_dev);

static void rbd_dev_device_release(struct device *dev);
static void rbd_snap_destroy(struct rbd_snap *snap);

static ssize_t rbd_add(struct bus_type *bus, const char *buf,
		       size_t count);
static ssize_t rbd_remove(struct bus_type *bus, const char *buf,
			  size_t count);
static int rbd_dev_image_probe(struct rbd_device *rbd_dev);

static struct bus_attribute rbd_bus_attrs[] = {
	__ATTR(add, S_IWUSR, NULL, rbd_add),
	__ATTR(remove, S_IWUSR, NULL, rbd_remove),
	__ATTR_NULL
};

static struct bus_type rbd_bus_type = {
	.name		= "rbd",
	.bus_attrs	= rbd_bus_attrs,
};

static void rbd_root_dev_release(struct device *dev)
{
}

static struct device rbd_root_dev = {
	.init_name =    "rbd",
	.release =      rbd_root_dev_release,
};

static __printf(2, 3)
void rbd_warn(struct rbd_device *rbd_dev, const char *fmt, ...)
{
	struct va_format vaf;
	va_list args;

	va_start(args, fmt);
	vaf.fmt = fmt;
	vaf.va = &args;

	if (!rbd_dev)
		printk(KERN_WARNING "%s: %pV\n", RBD_DRV_NAME, &vaf);
	else if (rbd_dev->disk)
		printk(KERN_WARNING "%s: %s: %pV\n",
			RBD_DRV_NAME, rbd_dev->disk->disk_name, &vaf);
	else if (rbd_dev->spec && rbd_dev->spec->image_name)
		printk(KERN_WARNING "%s: image %s: %pV\n",
			RBD_DRV_NAME, rbd_dev->spec->image_name, &vaf);
	else if (rbd_dev->spec && rbd_dev->spec->image_id)
		printk(KERN_WARNING "%s: id %s: %pV\n",
			RBD_DRV_NAME, rbd_dev->spec->image_id, &vaf);
	else	/* punt */
		printk(KERN_WARNING "%s: rbd_dev %p: %pV\n",
			RBD_DRV_NAME, rbd_dev, &vaf);
	va_end(args);
}

#ifdef RBD_DEBUG
#define rbd_assert(expr)						\
		if (unlikely(!(expr))) {				\
			printk(KERN_ERR "\nAssertion failure in %s() "	\
						"at line %d:\n\n"	\
					"\trbd_assert(%s);\n\n",	\
					__func__, __LINE__, #expr);	\
			BUG();						\
		}
#else /* !RBD_DEBUG */
#  define rbd_assert(expr)	((void) 0)
#endif /* !RBD_DEBUG */

static int rbd_img_obj_request_submit(struct rbd_obj_request *obj_request);
static void rbd_img_parent_read(struct rbd_obj_request *obj_request);
static void rbd_dev_remove_parent(struct rbd_device *rbd_dev);

static int rbd_dev_refresh(struct rbd_device *rbd_dev);
static int rbd_dev_v2_refresh(struct rbd_device *rbd_dev);

static int rbd_open(struct block_device *bdev, fmode_t mode)
{
	struct rbd_device *rbd_dev = bdev->bd_disk->private_data;
	bool removing = false;

	if ((mode & FMODE_WRITE) && rbd_dev->mapping.read_only)
		return -EROFS;

	spin_lock_irq(&rbd_dev->lock);
	if (test_bit(RBD_DEV_FLAG_REMOVING, &rbd_dev->flags))
		removing = true;
	else
		rbd_dev->open_count++;
	spin_unlock_irq(&rbd_dev->lock);
	if (removing)
		return -ENOENT;

	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
	(void) get_device(&rbd_dev->dev);
	set_device_ro(bdev, rbd_dev->mapping.read_only);
	mutex_unlock(&ctl_mutex);

	return 0;
}

static int rbd_release(struct gendisk *disk, fmode_t mode)
{
	struct rbd_device *rbd_dev = disk->private_data;
	unsigned long open_count_before;

	spin_lock_irq(&rbd_dev->lock);
	open_count_before = rbd_dev->open_count--;
	spin_unlock_irq(&rbd_dev->lock);
	rbd_assert(open_count_before > 0);

	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
	put_device(&rbd_dev->dev);
	mutex_unlock(&ctl_mutex);

	return 0;
}

static const struct block_device_operations rbd_bd_ops = {
	.owner			= THIS_MODULE,
	.open			= rbd_open,
	.release		= rbd_release,
};

/*
 * Initialize an rbd client instance.
 * We own *ceph_opts.
 */
static struct rbd_client *rbd_client_create(struct ceph_options *ceph_opts)
{
	struct rbd_client *rbdc;
	int ret = -ENOMEM;

	dout("%s:\n", __func__);
	rbdc = kmalloc(sizeof(struct rbd_client), GFP_KERNEL);
	if (!rbdc)
		goto out_opt;

	kref_init(&rbdc->kref);
	INIT_LIST_HEAD(&rbdc->node);

	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);

	rbdc->client = ceph_create_client(ceph_opts, rbdc, 0, 0);
	if (IS_ERR(rbdc->client))
		goto out_mutex;
	ceph_opts = NULL;	/* Now rbdc->client is responsible for ceph_opts */

	ret = ceph_open_session(rbdc->client);
	if (ret < 0)
		goto out_err;

	spin_lock(&rbd_client_list_lock);
	list_add_tail(&rbdc->node, &rbd_client_list);
	spin_unlock(&rbd_client_list_lock);

	mutex_unlock(&ctl_mutex);
	dout("%s: rbdc %p\n", __func__, rbdc);

	return rbdc;

out_err:
	ceph_destroy_client(rbdc->client);
out_mutex:
	mutex_unlock(&ctl_mutex);
	kfree(rbdc);
out_opt:
	if (ceph_opts)
		ceph_destroy_options(ceph_opts);
	dout("%s: error %d\n", __func__, ret);

	return ERR_PTR(ret);
}

static struct rbd_client *__rbd_get_client(struct rbd_client *rbdc)
{
	kref_get(&rbdc->kref);

	return rbdc;
}

/*
 * Find a ceph client with specific addr and configuration.  If
 * found, bump its reference count.
 */
static struct rbd_client *rbd_client_find(struct ceph_options *ceph_opts)
{
	struct rbd_client *client_node;
	bool found = false;

	if (ceph_opts->flags & CEPH_OPT_NOSHARE)
		return NULL;

	spin_lock(&rbd_client_list_lock);
	list_for_each_entry(client_node, &rbd_client_list, node) {
		if (!ceph_compare_options(ceph_opts, client_node->client)) {
			__rbd_get_client(client_node);

			found = true;
			break;
		}
	}
	spin_unlock(&rbd_client_list_lock);

	return found ? client_node : NULL;
}

/*
 * mount options
 */
enum {
	Opt_last_int,
	/* int args above */
	Opt_last_string,
	/* string args above */
	Opt_read_only,
	Opt_read_write,
	/* Boolean args above */
	Opt_last_bool,
};

static match_table_t rbd_opts_tokens = {
	/* int args above */
	/* string args above */
	{Opt_read_only, "read_only"},
	{Opt_read_only, "ro"},		/* Alternate spelling */
	{Opt_read_write, "read_write"},
	{Opt_read_write, "rw"},		/* Alternate spelling */
	/* Boolean args above */
	{-1, NULL}
};

struct rbd_options {
	bool	read_only;
};

#define RBD_READ_ONLY_DEFAULT	false

static int parse_rbd_opts_token(char *c, void *private)
{
	struct rbd_options *rbd_opts = private;
	substring_t argstr[MAX_OPT_ARGS];
	int token, intval, ret;

	token = match_token(c, rbd_opts_tokens, argstr);
	if (token < 0)
		return -EINVAL;

	if (token < Opt_last_int) {
		ret = match_int(&argstr[0], &intval);
		if (ret < 0) {
			pr_err("bad mount option arg (not int) "
			       "at '%s'\n", c);
			return ret;
		}
		dout("got int token %d val %d\n", token, intval);
	} else if (token > Opt_last_int && token < Opt_last_string) {
		dout("got string token %d val %s\n", token,
		     argstr[0].from);
	} else if (token > Opt_last_string && token < Opt_last_bool) {
		dout("got Boolean token %d\n", token);
	} else {
		dout("got token %d\n", token);
	}

	switch (token) {
	case Opt_read_only:
		rbd_opts->read_only = true;
		break;
	case Opt_read_write:
		rbd_opts->read_only = false;
		break;
	default:
		rbd_assert(false);
		break;
	}
	return 0;
}
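
/*
 * parse_rbd_opts_token() is not called directly; it is (presumably)
 * handed to ceph_parse_options() as the callback for tokens the ceph
 * layer itself does not recognize, with a struct rbd_options as the
 * private pointer.
 */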

/*
 * Get a ceph client with specific addr and configuration; if one
 * does not exist, create it.
 */
static struct rbd_client *rbd_get_client(struct ceph_options *ceph_opts)
{
	struct rbd_client *rbdc;

	rbdc = rbd_client_find(ceph_opts);
	if (rbdc)	/* using an existing client */
		ceph_destroy_options(ceph_opts);
	else
		rbdc = rbd_client_create(ceph_opts);

	return rbdc;
}

/*
 * Destroy ceph client
 *
 * Caller must hold rbd_client_list_lock.
 */
static void rbd_client_release(struct kref *kref)
{
	struct rbd_client *rbdc = container_of(kref, struct rbd_client, kref);

	dout("%s: rbdc %p\n", __func__, rbdc);
	spin_lock(&rbd_client_list_lock);
	list_del(&rbdc->node);
	spin_unlock(&rbd_client_list_lock);

	ceph_destroy_client(rbdc->client);
	kfree(rbdc);
}

/*
 * Drop reference to ceph client node. If it's not referenced anymore,
 * release it.
 */
static void rbd_put_client(struct rbd_client *rbdc)
{
	if (rbdc)
		kref_put(&rbdc->kref, rbd_client_release);
}

static bool rbd_image_format_valid(u32 image_format)
{
	return image_format == 1 || image_format == 2;
}

static bool rbd_dev_ondisk_valid(struct rbd_image_header_ondisk *ondisk)
{
	size_t size;
	u32 snap_count;

	/* The header has to start with the magic rbd header text */
	if (memcmp(&ondisk->text, RBD_HEADER_TEXT, sizeof (RBD_HEADER_TEXT)))
		return false;

	/* The bio layer requires at least sector-sized I/O */

	if (ondisk->options.order < SECTOR_SHIFT)
		return false;

	/* If we use u64 in a few spots we may be able to loosen this */

	if (ondisk->options.order > 8 * sizeof (int) - 1)
		return false;

	/*
	 * The size of a snapshot header has to fit in a size_t, and
	 * that limits the number of snapshots.
	 */
	snap_count = le32_to_cpu(ondisk->snap_count);
	size = SIZE_MAX - sizeof (struct ceph_snap_context);
	if (snap_count > size / sizeof (__le64))
		return false;

	/*
	 * Not only that, but the size of the entire snapshot
	 * header must also be representable in a size_t.
	 */
	size -= snap_count * sizeof (__le64);
	if ((u64) size < le64_to_cpu(ondisk->snap_names_len))
		return false;

	return true;
}
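
/*
 * Note the arithmetic above: starting from SIZE_MAX minus the size of
 * the snapshot context structure, the first check bounds the id array
 * (snap_count * 8 bytes) and the second bounds the name data, so the
 * ids, names and context header together always total less than
 * SIZE_MAX and later size_t arithmetic on the header cannot overflow.
 */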

/*
 * Create a new header structure, translate header format from the on-disk
 * header.
 */
static int rbd_header_from_disk(struct rbd_image_header *header,
				 struct rbd_image_header_ondisk *ondisk)
{
	u32 snap_count;
	size_t len;
	size_t size;
	u32 i;

	memset(header, 0, sizeof (*header));

	snap_count = le32_to_cpu(ondisk->snap_count);

	len = strnlen(ondisk->object_prefix, sizeof (ondisk->object_prefix));
	header->object_prefix = kmalloc(len + 1, GFP_KERNEL);
	if (!header->object_prefix)
		return -ENOMEM;
	memcpy(header->object_prefix, ondisk->object_prefix, len);
	header->object_prefix[len] = '\0';

	if (snap_count) {
		u64 snap_names_len = le64_to_cpu(ondisk->snap_names_len);

		/* Save a copy of the snapshot names */

		if (snap_names_len > (u64) SIZE_MAX)
			return -EIO;
		header->snap_names = kmalloc(snap_names_len, GFP_KERNEL);
		if (!header->snap_names)
			goto out_err;
		/*
		 * Note that rbd_dev_v1_header_read() guarantees the
		 * ondisk buffer we're working with has snap_names_len
		 * bytes beyond the end of the snapshot id array, so
		 * this memcpy() is safe.
		 */
		memcpy(header->snap_names, &ondisk->snaps[snap_count],
			snap_names_len);

		/* Record each snapshot's size */

		size = snap_count * sizeof (*header->snap_sizes);
		header->snap_sizes = kmalloc(size, GFP_KERNEL);
		if (!header->snap_sizes)
			goto out_err;
		for (i = 0; i < snap_count; i++)
			header->snap_sizes[i] =
				le64_to_cpu(ondisk->snaps[i].image_size);
	} else {
		header->snap_names = NULL;
		header->snap_sizes = NULL;
	}

	header->features = 0;	/* No features support in v1 images */
	header->obj_order = ondisk->options.order;
	header->crypt_type = ondisk->options.crypt_type;
	header->comp_type = ondisk->options.comp_type;

	/* Allocate and fill in the snapshot context */

	header->image_size = le64_to_cpu(ondisk->image_size);

	header->snapc = ceph_create_snap_context(snap_count, GFP_KERNEL);
	if (!header->snapc)
		goto out_err;
	header->snapc->seq = le64_to_cpu(ondisk->snap_seq);
	for (i = 0; i < snap_count; i++)
		header->snapc->snaps[i] = le64_to_cpu(ondisk->snaps[i].id);

	return 0;

out_err:
	kfree(header->snap_sizes);
	header->snap_sizes = NULL;
	kfree(header->snap_names);
	header->snap_names = NULL;
	kfree(header->object_prefix);
	header->object_prefix = NULL;

	return -ENOMEM;
}

static const char *rbd_snap_name(struct rbd_device *rbd_dev, u64 snap_id)
{
	struct rbd_snap *snap;

	if (snap_id == CEPH_NOSNAP)
		return RBD_SNAP_HEAD_NAME;

	list_for_each_entry(snap, &rbd_dev->snaps, node)
		if (snap_id == snap->id)
			return snap->name;

	return NULL;
}

static struct rbd_snap *snap_by_name(struct rbd_device *rbd_dev,
					const char *snap_name)
{
	struct rbd_snap *snap;

	list_for_each_entry(snap, &rbd_dev->snaps, node)
		if (!strcmp(snap_name, snap->name))
			return snap;

	return NULL;
}

static int rbd_dev_mapping_set(struct rbd_device *rbd_dev)
{
	if (!memcmp(rbd_dev->spec->snap_name, RBD_SNAP_HEAD_NAME,
		    sizeof (RBD_SNAP_HEAD_NAME))) {
		rbd_dev->mapping.size = rbd_dev->header.image_size;
		rbd_dev->mapping.features = rbd_dev->header.features;
	} else {
		struct rbd_snap *snap;

		snap = snap_by_name(rbd_dev, rbd_dev->spec->snap_name);
		if (!snap)
			return -ENOENT;
		rbd_dev->mapping.size = snap->size;
		rbd_dev->mapping.features = snap->features;
		rbd_dev->mapping.read_only = true;
	}

	return 0;
}

static void rbd_dev_mapping_clear(struct rbd_device *rbd_dev)
{
	rbd_dev->mapping.size = 0;
	rbd_dev->mapping.features = 0;
	rbd_dev->mapping.read_only = true;
}

static void rbd_dev_clear_mapping(struct rbd_device *rbd_dev)
{
	rbd_dev->mapping.size = 0;
	rbd_dev->mapping.features = 0;
	rbd_dev->mapping.read_only = true;
}

static const char *rbd_segment_name(struct rbd_device *rbd_dev, u64 offset)
{
	char *name;
	u64 segment;
	int ret;

	name = kmalloc(MAX_OBJ_NAME_SIZE + 1, GFP_NOIO);
	if (!name)
		return NULL;
	segment = offset >> rbd_dev->header.obj_order;
	ret = snprintf(name, MAX_OBJ_NAME_SIZE + 1, "%s.%012llx",
			rbd_dev->header.object_prefix, segment);
	if (ret < 0 || ret > MAX_OBJ_NAME_SIZE) {
		pr_err("error formatting segment name for #%llu (%d)\n",
			segment, ret);
		kfree(name);
		name = NULL;
	}

	return name;
}

static u64 rbd_segment_offset(struct rbd_device *rbd_dev, u64 offset)
{
	u64 segment_size = (u64) 1 << rbd_dev->header.obj_order;

	return offset & (segment_size - 1);
}

static u64 rbd_segment_length(struct rbd_device *rbd_dev,
				u64 offset, u64 length)
{
	u64 segment_size = (u64) 1 << rbd_dev->header.obj_order;

	offset &= segment_size - 1;

	rbd_assert(length <= U64_MAX - offset);
	if (offset + length > segment_size)
		length = segment_size - offset;

	return length;
}
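
/*
 * Worked example: with obj_order == 22 (4 MiB objects), image offset
 * 0x500000 falls in segment 1 at object offset 0x100000, and a 4 MiB
 * request starting there is clipped by rbd_segment_length() to
 * 0x300000 bytes, the remainder of that segment.
 */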

/*
 * returns the size of an object in the image
 */
static u64 rbd_obj_bytes(struct rbd_image_header *header)
{
	return 1 << header->obj_order;
}

/*
 * bio helpers
 */

static void bio_chain_put(struct bio *chain)
{
	struct bio *tmp;

	while (chain) {
		tmp = chain;
		chain = chain->bi_next;
		bio_put(tmp);
	}
}

/*
 * zeros a bio chain, starting at specific offset
 */
static void zero_bio_chain(struct bio *chain, int start_ofs)
{
	struct bio_vec *bv;
	unsigned long flags;
	void *buf;
	int i;
	int pos = 0;

	while (chain) {
		bio_for_each_segment(bv, chain, i) {
			if (pos + bv->bv_len > start_ofs) {
				int remainder = max(start_ofs - pos, 0);
				buf = bvec_kmap_irq(bv, &flags);
				memset(buf + remainder, 0,
				       bv->bv_len - remainder);
				bvec_kunmap_irq(buf, &flags);
			}
			pos += bv->bv_len;
		}

		chain = chain->bi_next;
	}
}

/*
 * similar to zero_bio_chain(), zeros data defined by a page array,
 * starting at the given byte offset from the start of the array and
 * continuing up to the given end offset.  The pages array is
 * assumed to be big enough to hold all bytes up to the end.
 */
static void zero_pages(struct page **pages, u64 offset, u64 end)
{
	struct page **page = &pages[offset >> PAGE_SHIFT];

	rbd_assert(end > offset);
	rbd_assert(end - offset <= (u64)SIZE_MAX);
	while (offset < end) {
		size_t page_offset;
		size_t length;
		unsigned long flags;
		void *kaddr;

		page_offset = (size_t)(offset & ~PAGE_MASK);
		length = min(PAGE_SIZE - page_offset, (size_t)(end - offset));
		local_irq_save(flags);
		kaddr = kmap_atomic(*page);
		memset(kaddr + page_offset, 0, length);
		kunmap_atomic(kaddr);
		local_irq_restore(flags);

		offset += length;
		page++;
	}
}

/*
 * Clone a portion of a bio, starting at the given byte offset
 * and continuing for the number of bytes indicated.
 */
static struct bio *bio_clone_range(struct bio *bio_src,
					unsigned int offset,
					unsigned int len,
					gfp_t gfpmask)
{
	struct bio_vec *bv;
	unsigned int resid;
	unsigned short idx;
	unsigned int voff;
	unsigned short end_idx;
	unsigned short vcnt;
	struct bio *bio;

	/* Handle the easy case for the caller */

	if (!offset && len == bio_src->bi_size)
		return bio_clone(bio_src, gfpmask);

	if (WARN_ON_ONCE(!len))
		return NULL;
	if (WARN_ON_ONCE(len > bio_src->bi_size))
		return NULL;
	if (WARN_ON_ONCE(offset > bio_src->bi_size - len))
		return NULL;

	/* Find first affected segment... */

	resid = offset;
	__bio_for_each_segment(bv, bio_src, idx, 0) {
		if (resid < bv->bv_len)
			break;
		resid -= bv->bv_len;
	}
	voff = resid;

	/* ...and the last affected segment */

	resid += len;
	__bio_for_each_segment(bv, bio_src, end_idx, idx) {
		if (resid <= bv->bv_len)
			break;
		resid -= bv->bv_len;
	}
	vcnt = end_idx - idx + 1;

	/* Build the clone */

	bio = bio_alloc(gfpmask, (unsigned int) vcnt);
	if (!bio)
		return NULL;	/* ENOMEM */

	bio->bi_bdev = bio_src->bi_bdev;
	bio->bi_sector = bio_src->bi_sector + (offset >> SECTOR_SHIFT);
	bio->bi_rw = bio_src->bi_rw;
	bio->bi_flags |= 1 << BIO_CLONED;

	/*
	 * Copy over our part of the bio_vec, then update the first
	 * and last (or only) entries.
	 */
	memcpy(&bio->bi_io_vec[0], &bio_src->bi_io_vec[idx],
			vcnt * sizeof (struct bio_vec));
	bio->bi_io_vec[0].bv_offset += voff;
	if (vcnt > 1) {
		bio->bi_io_vec[0].bv_len -= voff;
		bio->bi_io_vec[vcnt - 1].bv_len = resid;
	} else {
		bio->bi_io_vec[0].bv_len = len;
	}

	bio->bi_vcnt = vcnt;
	bio->bi_size = len;
	bio->bi_idx = 0;

	return bio;
}

/*
 * Clone a portion of a bio chain, starting at the given byte offset
 * into the first bio in the source chain and continuing for the
 * number of bytes indicated.  The result is another bio chain of
 * exactly the given length, or a null pointer on error.
 *
 * The bio_src and offset parameters are both in-out.  On entry they
 * refer to the first source bio and the offset into that bio where
 * the start of data to be cloned is located.
 *
 * On return, bio_src is updated to refer to the bio in the source
 * chain that contains the first un-cloned byte, and *offset will
 * contain the offset of that byte within that bio.
 */
static struct bio *bio_chain_clone_range(struct bio **bio_src,
					unsigned int *offset,
					unsigned int len,
					gfp_t gfpmask)
{
	struct bio *bi = *bio_src;
	unsigned int off = *offset;
	struct bio *chain = NULL;
	struct bio **end;

	/* Build up a chain of clone bios up to the limit */

	if (!bi || off >= bi->bi_size || !len)
		return NULL;		/* Nothing to clone */

	end = &chain;
	while (len) {
		unsigned int bi_size;
		struct bio *bio;

		if (!bi) {
			rbd_warn(NULL, "bio_chain exhausted with %u left", len);
			goto out_err;	/* EINVAL; ran out of bio's */
		}
		bi_size = min_t(unsigned int, bi->bi_size - off, len);
		bio = bio_clone_range(bi, off, bi_size, gfpmask);
		if (!bio)
			goto out_err;	/* ENOMEM */

		*end = bio;
		end = &bio->bi_next;

		off += bi_size;
		if (off == bi->bi_size) {
			bi = bi->bi_next;
			off = 0;
		}
		len -= bi_size;
	}
	*bio_src = bi;
	*offset = off;

	return chain;
out_err:
	bio_chain_put(chain);

	return NULL;
}
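
/*
 * Usage sketch (the actual caller lives further down in this file):
 * image I/O is carved into per-object pieces, roughly
 *
 *	length = rbd_segment_length(rbd_dev, img_offset, resid);
 *	bio = bio_chain_clone_range(&bio_list, &bio_offset, length, gfpmask);
 *
 * with img_offset advanced and resid reduced by length on each pass;
 * bio_list and bio_offset are updated in place by the callee.
 */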

/*
 * The default/initial value for all object request flags is 0.  For
 * each flag, once its value is set to 1 it is never reset to 0
 * again.
 */
static void obj_request_img_data_set(struct rbd_obj_request *obj_request)
{
	if (test_and_set_bit(OBJ_REQ_IMG_DATA, &obj_request->flags)) {
		struct rbd_device *rbd_dev;

		rbd_dev = obj_request->img_request->rbd_dev;
		rbd_warn(rbd_dev, "obj_request %p already marked img_data\n",
			obj_request);
	}
}

static bool obj_request_img_data_test(struct rbd_obj_request *obj_request)
{
	smp_mb();
	return test_bit(OBJ_REQ_IMG_DATA, &obj_request->flags) != 0;
}

static void obj_request_done_set(struct rbd_obj_request *obj_request)
{
	if (test_and_set_bit(OBJ_REQ_DONE, &obj_request->flags)) {
		struct rbd_device *rbd_dev = NULL;

		if (obj_request_img_data_test(obj_request))
			rbd_dev = obj_request->img_request->rbd_dev;
		rbd_warn(rbd_dev, "obj_request %p already marked done\n",
			obj_request);
	}
}

static bool obj_request_done_test(struct rbd_obj_request *obj_request)
{
	smp_mb();
	return test_bit(OBJ_REQ_DONE, &obj_request->flags) != 0;
}

/*
 * This sets the KNOWN flag after (possibly) setting the EXISTS
 * flag.  The latter is set based on the "exists" value provided.
 *
 * Note that for our purposes once an object exists it never goes
 * away again.  It's possible that the responses to two existence
 * checks are separated by the creation of the target object, so
 * the first ("doesn't exist") response arrives *after* the second
 * ("does exist").  In that case we ignore the second one.
 */
static void obj_request_existence_set(struct rbd_obj_request *obj_request,
				bool exists)
{
	if (exists)
		set_bit(OBJ_REQ_EXISTS, &obj_request->flags);
	set_bit(OBJ_REQ_KNOWN, &obj_request->flags);
	smp_mb();
}

static bool obj_request_known_test(struct rbd_obj_request *obj_request)
{
	smp_mb();
	return test_bit(OBJ_REQ_KNOWN, &obj_request->flags) != 0;
}

static bool obj_request_exists_test(struct rbd_obj_request *obj_request)
{
	smp_mb();
	return test_bit(OBJ_REQ_EXISTS, &obj_request->flags) != 0;
}
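
/*
 * These flag helpers are used without any lock; the smp_mb() calls
 * presumably stand in for the ordering a lock would otherwise
 * provide, so that a flag set on one CPU becomes visible in a
 * predictable order to _test() callers on another.
 */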
1211
Alex Elderbf0d5f502012-11-22 00:00:08 -06001212static void rbd_obj_request_get(struct rbd_obj_request *obj_request)
1213{
Alex Elder37206ee2013-02-20 17:32:08 -06001214 dout("%s: obj %p (was %d)\n", __func__, obj_request,
1215 atomic_read(&obj_request->kref.refcount));
Alex Elderbf0d5f502012-11-22 00:00:08 -06001216 kref_get(&obj_request->kref);
1217}
1218
1219static void rbd_obj_request_destroy(struct kref *kref);
1220static void rbd_obj_request_put(struct rbd_obj_request *obj_request)
1221{
1222 rbd_assert(obj_request != NULL);
Alex Elder37206ee2013-02-20 17:32:08 -06001223 dout("%s: obj %p (was %d)\n", __func__, obj_request,
1224 atomic_read(&obj_request->kref.refcount));
Alex Elderbf0d5f502012-11-22 00:00:08 -06001225 kref_put(&obj_request->kref, rbd_obj_request_destroy);
1226}
1227
1228static void rbd_img_request_get(struct rbd_img_request *img_request)
1229{
Alex Elder37206ee2013-02-20 17:32:08 -06001230 dout("%s: img %p (was %d)\n", __func__, img_request,
1231 atomic_read(&img_request->kref.refcount));
Alex Elderbf0d5f502012-11-22 00:00:08 -06001232 kref_get(&img_request->kref);
1233}
1234
1235static void rbd_img_request_destroy(struct kref *kref);
1236static void rbd_img_request_put(struct rbd_img_request *img_request)
1237{
1238 rbd_assert(img_request != NULL);
Alex Elder37206ee2013-02-20 17:32:08 -06001239 dout("%s: img %p (was %d)\n", __func__, img_request,
1240 atomic_read(&img_request->kref.refcount));
Alex Elderbf0d5f502012-11-22 00:00:08 -06001241 kref_put(&img_request->kref, rbd_img_request_destroy);
1242}
1243
1244static inline void rbd_img_obj_request_add(struct rbd_img_request *img_request,
1245 struct rbd_obj_request *obj_request)
1246{
	rbd_assert(obj_request->img_request == NULL);

	/* Image request now owns object's original reference */
	obj_request->img_request = img_request;
	obj_request->which = img_request->obj_request_count;
	rbd_assert(!obj_request_img_data_test(obj_request));
	obj_request_img_data_set(obj_request);
	rbd_assert(obj_request->which != BAD_WHICH);
	img_request->obj_request_count++;
	list_add_tail(&obj_request->links, &img_request->obj_requests);
	dout("%s: img %p obj %p w=%u\n", __func__, img_request, obj_request,
		obj_request->which);
}

static inline void rbd_img_obj_request_del(struct rbd_img_request *img_request,
					struct rbd_obj_request *obj_request)
{
	rbd_assert(obj_request->which != BAD_WHICH);

	dout("%s: img %p obj %p w=%u\n", __func__, img_request, obj_request,
		obj_request->which);
	list_del(&obj_request->links);
	rbd_assert(img_request->obj_request_count > 0);
	img_request->obj_request_count--;
	rbd_assert(obj_request->which == img_request->obj_request_count);
	obj_request->which = BAD_WHICH;
	rbd_assert(obj_request_img_data_test(obj_request));
	rbd_assert(obj_request->img_request == img_request);
	obj_request->img_request = NULL;
	obj_request->callback = NULL;
	rbd_obj_request_put(obj_request);
}

static bool obj_request_type_valid(enum obj_request_type type)
{
	switch (type) {
	case OBJ_REQUEST_NODATA:
	case OBJ_REQUEST_BIO:
	case OBJ_REQUEST_PAGES:
		return true;
	default:
		return false;
	}
}

static int rbd_obj_request_submit(struct ceph_osd_client *osdc,
				struct rbd_obj_request *obj_request)
{
	dout("%s: osdc %p obj %p\n", __func__, osdc, obj_request);

	return ceph_osdc_start_request(osdc, obj_request->osd_req, false);
}

static void rbd_img_request_complete(struct rbd_img_request *img_request)
{
	dout("%s: img %p\n", __func__, img_request);

	/*
	 * If no error occurred, compute the aggregate transfer
	 * count for the image request.  We could instead use
	 * atomic64_cmpxchg() to update it as each object request
	 * completes; not clear which way is better off hand.
	 */
	if (!img_request->result) {
		struct rbd_obj_request *obj_request;
		u64 xferred = 0;

		for_each_obj_request(img_request, obj_request)
			xferred += obj_request->xferred;
		img_request->xferred = xferred;
	}

	if (img_request->callback)
		img_request->callback(img_request);
	else
		rbd_img_request_put(img_request);
}

/* Caller is responsible for rbd_obj_request_destroy(obj_request) */

static int rbd_obj_request_wait(struct rbd_obj_request *obj_request)
{
	dout("%s: obj %p\n", __func__, obj_request);

	return wait_for_completion_interruptible(&obj_request->completion);
}

/*
 * The default/initial value for all image request flags is 0.  Each
 * is conditionally set to 1 at image request initialization time
 * and currently never changes thereafter.
 */
static void img_request_write_set(struct rbd_img_request *img_request)
{
	set_bit(IMG_REQ_WRITE, &img_request->flags);
	smp_mb();
}

static bool img_request_write_test(struct rbd_img_request *img_request)
{
	smp_mb();
	return test_bit(IMG_REQ_WRITE, &img_request->flags) != 0;
}

static void img_request_child_set(struct rbd_img_request *img_request)
{
	set_bit(IMG_REQ_CHILD, &img_request->flags);
	smp_mb();
}

static bool img_request_child_test(struct rbd_img_request *img_request)
{
	smp_mb();
	return test_bit(IMG_REQ_CHILD, &img_request->flags) != 0;
}

static void img_request_layered_set(struct rbd_img_request *img_request)
{
	set_bit(IMG_REQ_LAYERED, &img_request->flags);
	smp_mb();
}

static bool img_request_layered_test(struct rbd_img_request *img_request)
{
	smp_mb();
	return test_bit(IMG_REQ_LAYERED, &img_request->flags) != 0;
}

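/*
 * Note: each setter above pairs the atomic set_bit() with a full
 * memory barrier, and each test issues a barrier before reading,
 * so a flag set while the image request is being initialized is
 * seen by any thread that tests it later.  A minimal usage sketch
 * (hypothetical caller, error handling omitted):
 *
 *	img_request_write_set(img_request);
 *	...
 *	if (img_request_write_test(img_request))
 *		snapc = img_request->snapc;	// writes carry a snap context
 */
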
static void
rbd_img_obj_request_read_callback(struct rbd_obj_request *obj_request)
{
	u64 xferred = obj_request->xferred;
	u64 length = obj_request->length;

	dout("%s: obj %p img %p result %d %llu/%llu\n", __func__,
		obj_request, obj_request->img_request, obj_request->result,
		xferred, length);
	/*
	 * ENOENT means a hole in the image.  We zero-fill the
	 * entire length of the request.  A short read also implies
	 * zero-fill to the end of the request.  Either way we
	 * update the xferred count to indicate the whole request
	 * was satisfied.
	 */
	rbd_assert(obj_request->type != OBJ_REQUEST_NODATA);
	if (obj_request->result == -ENOENT) {
		if (obj_request->type == OBJ_REQUEST_BIO)
			zero_bio_chain(obj_request->bio_list, 0);
		else
			zero_pages(obj_request->pages, 0, length);
		obj_request->result = 0;
		obj_request->xferred = length;
	} else if (xferred < length && !obj_request->result) {
		if (obj_request->type == OBJ_REQUEST_BIO)
			zero_bio_chain(obj_request->bio_list, xferred);
		else
			zero_pages(obj_request->pages, xferred, length);
		obj_request->xferred = length;
	}
	obj_request_done_set(obj_request);
}

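/*
 * For illustration (numbers are hypothetical): a 4096-byte read
 * that returns -ENOENT is zero-filled over bytes 0..4095 and
 * reported as a full 4096-byte transfer; a read that comes up
 * short at xferred == 1024 is zero-filled over bytes 1024..4095
 * and likewise reported as 4096 bytes, since a hole in the image
 * is defined to read as zeros.
 */
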
static void rbd_obj_request_complete(struct rbd_obj_request *obj_request)
{
	dout("%s: obj %p cb %p\n", __func__, obj_request,
		obj_request->callback);
	if (obj_request->callback)
		obj_request->callback(obj_request);
	else
		complete_all(&obj_request->completion);
}

static void rbd_osd_trivial_callback(struct rbd_obj_request *obj_request)
{
	dout("%s: obj %p\n", __func__, obj_request);
	obj_request_done_set(obj_request);
}

static void rbd_osd_read_callback(struct rbd_obj_request *obj_request)
{
	struct rbd_img_request *img_request = NULL;
	struct rbd_device *rbd_dev = NULL;
	bool layered = false;

	if (obj_request_img_data_test(obj_request)) {
		img_request = obj_request->img_request;
		layered = img_request && img_request_layered_test(img_request);
		rbd_dev = img_request->rbd_dev;
	}

	dout("%s: obj %p img %p result %d %llu/%llu\n", __func__,
		obj_request, img_request, obj_request->result,
		obj_request->xferred, obj_request->length);
	if (layered && obj_request->result == -ENOENT &&
			obj_request->img_offset < rbd_dev->parent_overlap)
		rbd_img_parent_read(obj_request);
	else if (img_request)
		rbd_img_obj_request_read_callback(obj_request);
	else
		obj_request_done_set(obj_request);
}

static void rbd_osd_write_callback(struct rbd_obj_request *obj_request)
{
	dout("%s: obj %p result %d %llu\n", __func__, obj_request,
		obj_request->result, obj_request->length);
	/*
	 * There is no such thing as a successful short write.  Set
	 * it to our originally-requested length.
	 */
	obj_request->xferred = obj_request->length;
	obj_request_done_set(obj_request);
}

/*
 * For a simple stat call there's nothing to do.  We'll do more if
 * this is part of a write sequence for a layered image.
 */
static void rbd_osd_stat_callback(struct rbd_obj_request *obj_request)
{
	dout("%s: obj %p\n", __func__, obj_request);
	obj_request_done_set(obj_request);
}

static void rbd_osd_req_callback(struct ceph_osd_request *osd_req,
				struct ceph_msg *msg)
{
	struct rbd_obj_request *obj_request = osd_req->r_priv;
	u16 opcode;

	dout("%s: osd_req %p msg %p\n", __func__, osd_req, msg);
	rbd_assert(osd_req == obj_request->osd_req);
	if (obj_request_img_data_test(obj_request)) {
		rbd_assert(obj_request->img_request);
		rbd_assert(obj_request->which != BAD_WHICH);
	} else {
		rbd_assert(obj_request->which == BAD_WHICH);
	}

	if (osd_req->r_result < 0)
		obj_request->result = osd_req->r_result;
	obj_request->version = le64_to_cpu(osd_req->r_reassert_version.version);

	BUG_ON(osd_req->r_num_ops > 2);

	/*
	 * We support a 64-bit length, but ultimately it has to be
	 * passed to blk_end_request(), which takes an unsigned int.
	 */
	obj_request->xferred = osd_req->r_reply_op_len[0];
	rbd_assert(obj_request->xferred < (u64)UINT_MAX);
	opcode = osd_req->r_ops[0].op;
	switch (opcode) {
	case CEPH_OSD_OP_READ:
		rbd_osd_read_callback(obj_request);
		break;
	case CEPH_OSD_OP_WRITE:
		rbd_osd_write_callback(obj_request);
		break;
	case CEPH_OSD_OP_STAT:
		rbd_osd_stat_callback(obj_request);
		break;
	case CEPH_OSD_OP_CALL:
	case CEPH_OSD_OP_NOTIFY_ACK:
	case CEPH_OSD_OP_WATCH:
		rbd_osd_trivial_callback(obj_request);
		break;
	default:
		rbd_warn(NULL, "%s: unsupported op %hu\n",
			obj_request->object_name, (unsigned short) opcode);
		break;
	}

	if (obj_request_done_test(obj_request))
		rbd_obj_request_complete(obj_request);
}

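/*
 * Note: completion is dispatched on the opcode of the *first* op
 * in the request.  A two-op copyup request (CALL followed by
 * WRITE) therefore lands in the CEPH_OSD_OP_CALL arm and takes the
 * trivial callback; the per-object callback installed on the
 * request does the rest.
 */
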
static void rbd_osd_req_format_read(struct rbd_obj_request *obj_request)
{
	struct rbd_img_request *img_request = obj_request->img_request;
	struct ceph_osd_request *osd_req = obj_request->osd_req;
	u64 snap_id;

	rbd_assert(osd_req != NULL);

	snap_id = img_request ? img_request->snap_id : CEPH_NOSNAP;
	ceph_osdc_build_request(osd_req, obj_request->offset,
			NULL, snap_id, NULL);
}

static void rbd_osd_req_format_write(struct rbd_obj_request *obj_request)
{
	struct rbd_img_request *img_request = obj_request->img_request;
	struct ceph_osd_request *osd_req = obj_request->osd_req;
	struct ceph_snap_context *snapc;
	struct timespec mtime = CURRENT_TIME;

	rbd_assert(osd_req != NULL);

	snapc = img_request ? img_request->snapc : NULL;
	ceph_osdc_build_request(osd_req, obj_request->offset,
			snapc, CEPH_NOSNAP, &mtime);
}

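/*
 * The two formatters above differ only in the snapshot arguments
 * handed to ceph_osdc_build_request(): a read names the snapshot
 * id to read from (CEPH_NOSNAP for the head), while a write
 * supplies the snapshot context and a modification time instead.
 */
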
static struct ceph_osd_request *rbd_osd_req_create(
					struct rbd_device *rbd_dev,
					bool write_request,
					struct rbd_obj_request *obj_request)
{
	struct ceph_snap_context *snapc = NULL;
	struct ceph_osd_client *osdc;
	struct ceph_osd_request *osd_req;

	if (obj_request_img_data_test(obj_request)) {
		struct rbd_img_request *img_request = obj_request->img_request;

		rbd_assert(write_request ==
				img_request_write_test(img_request));
		if (write_request)
			snapc = img_request->snapc;
	}

	/* Allocate and initialize the request, for the single op */

	osdc = &rbd_dev->rbd_client->client->osdc;
	osd_req = ceph_osdc_alloc_request(osdc, snapc, 1, false, GFP_ATOMIC);
	if (!osd_req)
		return NULL;	/* ENOMEM */

	if (write_request)
		osd_req->r_flags = CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK;
	else
		osd_req->r_flags = CEPH_OSD_FLAG_READ;

	osd_req->r_callback = rbd_osd_req_callback;
	osd_req->r_priv = obj_request;

	osd_req->r_oid_len = strlen(obj_request->object_name);
	rbd_assert(osd_req->r_oid_len < sizeof (osd_req->r_oid));
	memcpy(osd_req->r_oid, obj_request->object_name, osd_req->r_oid_len);

	osd_req->r_file_layout = rbd_dev->layout;	/* struct */

	return osd_req;
}

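/*
 * A condensed sketch of the life cycle callers in this file give a
 * single-op request (error handling omitted; rbd_obj_notify_ack()
 * below is a complete, real example of this shape):
 *
 *	obj_request = rbd_obj_request_create(name, 0, 0,
 *						OBJ_REQUEST_NODATA);
 *	obj_request->osd_req = rbd_osd_req_create(rbd_dev, false,
 *						obj_request);
 *	osd_req_op_watch_init(obj_request->osd_req, 0,
 *				CEPH_OSD_OP_NOTIFY_ACK, notify_id, 0, 0);
 *	rbd_osd_req_format_read(obj_request);
 *	ret = rbd_obj_request_submit(osdc, obj_request);
 */
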
/*
 * Create a copyup osd request based on the information in the
 * object request supplied.  A copyup request has two osd ops:
 * a copyup method call and a "normal" write request.
 */
static struct ceph_osd_request *
rbd_osd_req_create_copyup(struct rbd_obj_request *obj_request)
{
	struct rbd_img_request *img_request;
	struct ceph_snap_context *snapc;
	struct rbd_device *rbd_dev;
	struct ceph_osd_client *osdc;
	struct ceph_osd_request *osd_req;

	rbd_assert(obj_request_img_data_test(obj_request));
	img_request = obj_request->img_request;
	rbd_assert(img_request);
	rbd_assert(img_request_write_test(img_request));

	/* Allocate and initialize the request, for the two ops */

	snapc = img_request->snapc;
	rbd_dev = img_request->rbd_dev;
	osdc = &rbd_dev->rbd_client->client->osdc;
	osd_req = ceph_osdc_alloc_request(osdc, snapc, 2, false, GFP_ATOMIC);
	if (!osd_req)
		return NULL;	/* ENOMEM */

	osd_req->r_flags = CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK;
	osd_req->r_callback = rbd_osd_req_callback;
	osd_req->r_priv = obj_request;

	osd_req->r_oid_len = strlen(obj_request->object_name);
	rbd_assert(osd_req->r_oid_len < sizeof (osd_req->r_oid));
	memcpy(osd_req->r_oid, obj_request->object_name, osd_req->r_oid_len);

	osd_req->r_file_layout = rbd_dev->layout;	/* struct */

	return osd_req;
}

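/*
 * The two ops themselves are filled in later, by
 * rbd_img_obj_parent_read_full_callback(): op 0 becomes the
 * "rbd"/"copyup" method call carrying the parent data, and op 1
 * becomes the originally-requested write.
 */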

static void rbd_osd_req_destroy(struct ceph_osd_request *osd_req)
{
	ceph_osdc_put_request(osd_req);
}

/* object_name is assumed to be a non-null pointer and NUL-terminated */

static struct rbd_obj_request *rbd_obj_request_create(const char *object_name,
						u64 offset, u64 length,
						enum obj_request_type type)
{
	struct rbd_obj_request *obj_request;
	size_t size;
	char *name;

	rbd_assert(obj_request_type_valid(type));

	size = strlen(object_name) + 1;
	obj_request = kzalloc(sizeof (*obj_request) + size, GFP_KERNEL);
	if (!obj_request)
		return NULL;

	name = (char *)(obj_request + 1);
	obj_request->object_name = memcpy(name, object_name, size);
	obj_request->offset = offset;
	obj_request->length = length;
	obj_request->flags = 0;
	obj_request->which = BAD_WHICH;
	obj_request->type = type;
	INIT_LIST_HEAD(&obj_request->links);
	init_completion(&obj_request->completion);
	kref_init(&obj_request->kref);

	dout("%s: \"%s\" %llu/%llu %d -> obj %p\n", __func__, object_name,
		offset, length, (int)type, obj_request);

	return obj_request;
}

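/*
 * Note the allocation trick above: the object name is copied into
 * the same kzalloc() block, immediately after the request
 * structure, so the single kfree() in rbd_obj_request_destroy()
 * releases both.
 */
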
static void rbd_obj_request_destroy(struct kref *kref)
{
	struct rbd_obj_request *obj_request;

	obj_request = container_of(kref, struct rbd_obj_request, kref);

	dout("%s: obj %p\n", __func__, obj_request);

	rbd_assert(obj_request->img_request == NULL);
	rbd_assert(obj_request->which == BAD_WHICH);

	if (obj_request->osd_req)
		rbd_osd_req_destroy(obj_request->osd_req);

	rbd_assert(obj_request_type_valid(obj_request->type));
	switch (obj_request->type) {
	case OBJ_REQUEST_NODATA:
		break;		/* Nothing to do */
	case OBJ_REQUEST_BIO:
		if (obj_request->bio_list)
			bio_chain_put(obj_request->bio_list);
		break;
	case OBJ_REQUEST_PAGES:
		if (obj_request->pages)
			ceph_release_page_vector(obj_request->pages,
						obj_request->page_count);
		break;
	}

	kfree(obj_request);
}

/*
 * Caller is responsible for filling in the list of object requests
 * that comprises the image request, and the Linux request pointer
 * (if there is one).
 */
static struct rbd_img_request *rbd_img_request_create(
					struct rbd_device *rbd_dev,
					u64 offset, u64 length,
					bool write_request,
					bool child_request)
{
	struct rbd_img_request *img_request;

	img_request = kmalloc(sizeof (*img_request), GFP_ATOMIC);
	if (!img_request)
		return NULL;

	if (write_request) {
		down_read(&rbd_dev->header_rwsem);
		ceph_get_snap_context(rbd_dev->header.snapc);
		up_read(&rbd_dev->header_rwsem);
	}

	img_request->rq = NULL;
	img_request->rbd_dev = rbd_dev;
	img_request->offset = offset;
	img_request->length = length;
	img_request->flags = 0;
	if (write_request) {
		img_request_write_set(img_request);
		img_request->snapc = rbd_dev->header.snapc;
	} else {
		img_request->snap_id = rbd_dev->spec->snap_id;
	}
	if (child_request)
		img_request_child_set(img_request);
	if (rbd_dev->parent_spec)
		img_request_layered_set(img_request);
	spin_lock_init(&img_request->completion_lock);
	img_request->next_completion = 0;
	img_request->callback = NULL;
	img_request->result = 0;
	img_request->obj_request_count = 0;
	INIT_LIST_HEAD(&img_request->obj_requests);
	kref_init(&img_request->kref);

	rbd_img_request_get(img_request);	/* Avoid a warning */
	rbd_img_request_put(img_request);	/* TEMPORARY */

	dout("%s: rbd_dev %p %s %llu/%llu -> img %p\n", __func__, rbd_dev,
		write_request ? "write" : "read", offset, length,
		img_request);

	return img_request;
}

static void rbd_img_request_destroy(struct kref *kref)
{
	struct rbd_img_request *img_request;
	struct rbd_obj_request *obj_request;
	struct rbd_obj_request *next_obj_request;

	img_request = container_of(kref, struct rbd_img_request, kref);

	dout("%s: img %p\n", __func__, img_request);

	for_each_obj_request_safe(img_request, obj_request, next_obj_request)
		rbd_img_obj_request_del(img_request, obj_request);
	rbd_assert(img_request->obj_request_count == 0);

	if (img_request_write_test(img_request))
		ceph_put_snap_context(img_request->snapc);

	if (img_request_child_test(img_request))
		rbd_obj_request_put(img_request->obj_request);

	kfree(img_request);
}

static bool rbd_img_obj_end_request(struct rbd_obj_request *obj_request)
{
	struct rbd_img_request *img_request;
	unsigned int xferred;
	int result;
	bool more;

	rbd_assert(obj_request_img_data_test(obj_request));
	img_request = obj_request->img_request;

	rbd_assert(obj_request->xferred <= (u64)UINT_MAX);
	xferred = (unsigned int)obj_request->xferred;
	result = obj_request->result;
	if (result) {
		struct rbd_device *rbd_dev = img_request->rbd_dev;

		rbd_warn(rbd_dev, "%s %llx at %llx (%llx)\n",
			img_request_write_test(img_request) ? "write" : "read",
			obj_request->length, obj_request->img_offset,
			obj_request->offset);
		rbd_warn(rbd_dev, "  result %d xferred %x\n",
			result, xferred);
		if (!img_request->result)
			img_request->result = result;
	}

	/* Image object requests don't own their page array */

	if (obj_request->type == OBJ_REQUEST_PAGES) {
		obj_request->pages = NULL;
		obj_request->page_count = 0;
	}

	if (img_request_child_test(img_request)) {
		rbd_assert(img_request->obj_request != NULL);
		more = obj_request->which < img_request->obj_request_count - 1;
	} else {
		rbd_assert(img_request->rq != NULL);
		more = blk_end_request(img_request->rq, result, xferred);
	}

	return more;
}

static void rbd_img_obj_callback(struct rbd_obj_request *obj_request)
{
	struct rbd_img_request *img_request;
	u32 which = obj_request->which;
	bool more = true;

	rbd_assert(obj_request_img_data_test(obj_request));
	img_request = obj_request->img_request;

	dout("%s: img %p obj %p\n", __func__, img_request, obj_request);
	rbd_assert(img_request != NULL);
	rbd_assert(img_request->obj_request_count > 0);
	rbd_assert(which != BAD_WHICH);
	rbd_assert(which < img_request->obj_request_count);
	rbd_assert(which >= img_request->next_completion);

	spin_lock_irq(&img_request->completion_lock);
	if (which != img_request->next_completion)
		goto out;

	for_each_obj_request_from(img_request, obj_request) {
		rbd_assert(more);
		rbd_assert(which < img_request->obj_request_count);

		if (!obj_request_done_test(obj_request))
			break;
		more = rbd_img_obj_end_request(obj_request);
		which++;
	}

	rbd_assert(more ^ (which == img_request->obj_request_count));
	img_request->next_completion = which;
out:
	spin_unlock_irq(&img_request->completion_lock);

	if (!more)
		rbd_img_request_complete(img_request);
}

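/*
 * In other words, object requests may complete in any order, but
 * they are retired strictly in submission order: a completion that
 * arrives out of order merely marks its request done, and the run
 * of finished requests is ended later, once the request at
 * next_completion catches up with it.
 */
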
/*
 * Split up an image request into one or more object requests, each
 * to a different object.  The "type" parameter indicates whether
 * "data_desc" is the pointer to the head of a list of bio
 * structures, or the base of a page array.  In either case this
 * function assumes data_desc describes memory sufficient to hold
 * all data described by the image request.
 */
static int rbd_img_request_fill(struct rbd_img_request *img_request,
					enum obj_request_type type,
					void *data_desc)
{
	struct rbd_device *rbd_dev = img_request->rbd_dev;
	struct rbd_obj_request *obj_request = NULL;
	struct rbd_obj_request *next_obj_request;
	bool write_request = img_request_write_test(img_request);
	struct bio *bio_list;
	unsigned int bio_offset = 0;
	struct page **pages;
	u64 img_offset;
	u64 resid;
	u16 opcode;

	dout("%s: img %p type %d data_desc %p\n", __func__, img_request,
		(int)type, data_desc);

	opcode = write_request ? CEPH_OSD_OP_WRITE : CEPH_OSD_OP_READ;
	img_offset = img_request->offset;
	resid = img_request->length;
	rbd_assert(resid > 0);

	if (type == OBJ_REQUEST_BIO) {
		bio_list = data_desc;
		rbd_assert(img_offset == bio_list->bi_sector << SECTOR_SHIFT);
	} else {
		rbd_assert(type == OBJ_REQUEST_PAGES);
		pages = data_desc;
	}

	while (resid) {
		struct ceph_osd_request *osd_req;
		const char *object_name;
		u64 offset;
		u64 length;

		object_name = rbd_segment_name(rbd_dev, img_offset);
		if (!object_name)
			goto out_unwind;
		offset = rbd_segment_offset(rbd_dev, img_offset);
		length = rbd_segment_length(rbd_dev, img_offset, resid);
		obj_request = rbd_obj_request_create(object_name,
						offset, length, type);
		kfree(object_name);	/* object request has its own copy */
		if (!obj_request)
			goto out_unwind;

		if (type == OBJ_REQUEST_BIO) {
			unsigned int clone_size;

			rbd_assert(length <= (u64)UINT_MAX);
			clone_size = (unsigned int)length;
			obj_request->bio_list =
					bio_chain_clone_range(&bio_list,
								&bio_offset,
								clone_size,
								GFP_ATOMIC);
			if (!obj_request->bio_list)
				goto out_partial;
		} else {
			unsigned int page_count;

			obj_request->pages = pages;
			page_count = (u32)calc_pages_for(offset, length);
			obj_request->page_count = page_count;
			if ((offset + length) & ~PAGE_MASK)
				page_count--;	/* more on last page */
			pages += page_count;
		}

		osd_req = rbd_osd_req_create(rbd_dev, write_request,
						obj_request);
		if (!osd_req)
			goto out_partial;
		obj_request->osd_req = osd_req;
		obj_request->callback = rbd_img_obj_callback;

		osd_req_op_extent_init(osd_req, 0, opcode, offset, length,
						0, 0);
		if (type == OBJ_REQUEST_BIO)
			osd_req_op_extent_osd_data_bio(osd_req, 0,
					obj_request->bio_list, length);
		else
			osd_req_op_extent_osd_data_pages(osd_req, 0,
					obj_request->pages, length,
					offset & ~PAGE_MASK, false, false);

		if (write_request)
			rbd_osd_req_format_write(obj_request);
		else
			rbd_osd_req_format_read(obj_request);

		obj_request->img_offset = img_offset;
		rbd_img_obj_request_add(img_request, obj_request);

		img_offset += length;
		resid -= length;
	}

	return 0;

out_partial:
	rbd_obj_request_put(obj_request);
out_unwind:
	for_each_obj_request_safe(img_request, obj_request, next_obj_request)
		rbd_obj_request_put(obj_request);

	return -ENOMEM;
}

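/*
 * A worked example with hypothetical geometry: given 4 MiB backing
 * objects (obj_order 22), an 8 MiB image request starting at image
 * offset 6 MiB splits into three object requests:
 *
 *	object 1, offset 2 MiB, length 2 MiB
 *	object 2, offset 0,     length 4 MiB
 *	object 3, offset 0,     length 2 MiB
 *
 * Each loop iteration consumes min(resid, space remaining in the
 * current object), so only the first and last pieces can be
 * partial-object requests.
 */
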
static void
rbd_img_obj_copyup_callback(struct rbd_obj_request *obj_request)
{
	struct rbd_img_request *img_request;
	struct rbd_device *rbd_dev;
	u64 length;
	u32 page_count;

	rbd_assert(obj_request->type == OBJ_REQUEST_BIO);
	rbd_assert(obj_request_img_data_test(obj_request));
	img_request = obj_request->img_request;
	rbd_assert(img_request);

	rbd_dev = img_request->rbd_dev;
	rbd_assert(rbd_dev);
	length = (u64)1 << rbd_dev->header.obj_order;
	page_count = (u32)calc_pages_for(0, length);

	rbd_assert(obj_request->copyup_pages);
	ceph_release_page_vector(obj_request->copyup_pages, page_count);
	obj_request->copyup_pages = NULL;

	/*
	 * We want the transfer count to reflect the size of the
	 * original write request.  There is no such thing as a
	 * successful short write, so if the request was successful
	 * we can just set it to the originally-requested length.
	 */
	if (!obj_request->result)
		obj_request->xferred = obj_request->length;

	/* Finish up with the normal image object callback */

	rbd_img_obj_callback(obj_request);
}

static void
rbd_img_obj_parent_read_full_callback(struct rbd_img_request *img_request)
{
	struct rbd_obj_request *orig_request;
	struct ceph_osd_request *osd_req;
	struct ceph_osd_client *osdc;
	struct rbd_device *rbd_dev;
	struct page **pages;
	int result;
	u64 obj_size;
	u64 xferred;

	rbd_assert(img_request_child_test(img_request));

	/* First get what we need from the image request */

	pages = img_request->copyup_pages;
	rbd_assert(pages != NULL);
	img_request->copyup_pages = NULL;

	orig_request = img_request->obj_request;
	rbd_assert(orig_request != NULL);
	rbd_assert(orig_request->type == OBJ_REQUEST_BIO);
	result = img_request->result;
	obj_size = img_request->length;
	xferred = img_request->xferred;

	rbd_dev = img_request->rbd_dev;
	rbd_assert(rbd_dev);
	rbd_assert(obj_size == (u64)1 << rbd_dev->header.obj_order);

	rbd_img_request_put(img_request);

	if (result)
		goto out_err;

	/* Allocate the new copyup osd request for the original request */

	result = -ENOMEM;
	rbd_assert(!orig_request->osd_req);
	osd_req = rbd_osd_req_create_copyup(orig_request);
	if (!osd_req)
		goto out_err;
	orig_request->osd_req = osd_req;
	orig_request->copyup_pages = pages;

	/* Initialize the copyup op */

	osd_req_op_cls_init(osd_req, 0, CEPH_OSD_OP_CALL, "rbd", "copyup");
	osd_req_op_cls_request_data_pages(osd_req, 0, pages, obj_size, 0,
						false, false);

	/* Then the original write request op */

	osd_req_op_extent_init(osd_req, 1, CEPH_OSD_OP_WRITE,
				orig_request->offset,
				orig_request->length, 0, 0);
	osd_req_op_extent_osd_data_bio(osd_req, 1, orig_request->bio_list,
					orig_request->length);

	rbd_osd_req_format_write(orig_request);

	/* All set, send it off. */

	orig_request->callback = rbd_img_obj_copyup_callback;
	osdc = &rbd_dev->rbd_client->client->osdc;
	result = rbd_obj_request_submit(osdc, orig_request);
	if (!result)
		return;
out_err:
	/* Record the error code and complete the request */

	orig_request->result = result;
	orig_request->xferred = 0;
	obj_request_done_set(orig_request);
	rbd_obj_request_complete(orig_request);
}

/*
 * Read from the parent image the range of data that covers the
 * entire target of the given object request.  This is used for
 * satisfying a layered image write request when the target of an
 * object request from the image request does not exist.
 *
 * A page array big enough to hold the returned data is allocated
 * and supplied to rbd_img_request_fill() as the "data descriptor."
 * When the read completes, this page array will be transferred to
 * the original object request for the copyup operation.
 *
 * If an error occurs, record it as the result of the original
 * object request and mark it done so it gets completed.
 */
static int rbd_img_obj_parent_read_full(struct rbd_obj_request *obj_request)
{
	struct rbd_img_request *img_request = NULL;
	struct rbd_img_request *parent_request = NULL;
	struct rbd_device *rbd_dev;
	u64 img_offset;
	u64 length;
	struct page **pages = NULL;
	u32 page_count;
	int result;

	rbd_assert(obj_request_img_data_test(obj_request));
	rbd_assert(obj_request->type == OBJ_REQUEST_BIO);

	img_request = obj_request->img_request;
	rbd_assert(img_request != NULL);
	rbd_dev = img_request->rbd_dev;
	rbd_assert(rbd_dev->parent != NULL);

	/*
	 * First things first.  The original osd request is of no
	 * use to us any more, so we'll need a new one that can hold
	 * the two ops in a copyup request.  We'll get that later,
	 * but for now we can release the old one.
	 */
	rbd_osd_req_destroy(obj_request->osd_req);
	obj_request->osd_req = NULL;

	/*
	 * Determine the byte range covered by the object in the
	 * child image to which the original request was to be sent.
	 */
	img_offset = obj_request->img_offset - obj_request->offset;
	length = (u64)1 << rbd_dev->header.obj_order;

	/*
	 * There is no defined parent data beyond the parent
	 * overlap, so limit what we read at that boundary if
	 * necessary.
	 */
	if (img_offset + length > rbd_dev->parent_overlap) {
		rbd_assert(img_offset < rbd_dev->parent_overlap);
		length = rbd_dev->parent_overlap - img_offset;
	}

	/*
	 * Allocate a page array big enough to receive the data read
	 * from the parent.
	 */
	page_count = (u32)calc_pages_for(0, length);
	pages = ceph_alloc_page_vector(page_count, GFP_KERNEL);
	if (IS_ERR(pages)) {
		result = PTR_ERR(pages);
		pages = NULL;
		goto out_err;
	}

	result = -ENOMEM;
	parent_request = rbd_img_request_create(rbd_dev->parent,
						img_offset, length,
						false, true);
	if (!parent_request)
		goto out_err;
	rbd_obj_request_get(obj_request);
	parent_request->obj_request = obj_request;

	result = rbd_img_request_fill(parent_request, OBJ_REQUEST_PAGES, pages);
	if (result)
		goto out_err;
	parent_request->copyup_pages = pages;

	parent_request->callback = rbd_img_obj_parent_read_full_callback;
	result = rbd_img_request_submit(parent_request);
	if (!result)
		return 0;

	parent_request->copyup_pages = NULL;
	parent_request->obj_request = NULL;
	rbd_obj_request_put(obj_request);
out_err:
	if (pages)
		ceph_release_page_vector(pages, page_count);
	if (parent_request)
		rbd_img_request_put(parent_request);
	obj_request->result = result;
	obj_request->xferred = 0;
	obj_request_done_set(obj_request);

	return result;
}

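/*
 * To summarize the copyup path: a layered write whose target
 * object does not exist first reads the covering range from the
 * parent (above), then reissues the write as a two-op request --
 * the "rbd"/"copyup" class method carrying the parent data,
 * followed by the original write -- so the object is populated and
 * modified in one osd request.
 */
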
static void rbd_img_obj_exists_callback(struct rbd_obj_request *obj_request)
{
	struct rbd_obj_request *orig_request;
	int result;

	rbd_assert(!obj_request_img_data_test(obj_request));

	/*
	 * All we need from the object request is the original
	 * request and the result of the STAT op.  Grab those, then
	 * we're done with the request.
	 */
	orig_request = obj_request->obj_request;
	obj_request->obj_request = NULL;
	rbd_assert(orig_request);
	rbd_assert(orig_request->img_request);

	result = obj_request->result;
	obj_request->result = 0;

	dout("%s: obj %p for obj %p result %d %llu/%llu\n", __func__,
		obj_request, orig_request, result,
		obj_request->xferred, obj_request->length);
	rbd_obj_request_put(obj_request);

	rbd_assert(orig_request);
	rbd_assert(orig_request->img_request);

	/*
	 * Our only purpose here is to determine whether the object
	 * exists, and we don't want to treat the non-existence as
	 * an error.  If something else comes back, transfer the
	 * error to the original request and complete it now.
	 */
	if (!result) {
		obj_request_existence_set(orig_request, true);
	} else if (result == -ENOENT) {
		obj_request_existence_set(orig_request, false);
	} else if (result) {
		orig_request->result = result;
		goto out;
	}

	/*
	 * Resubmit the original request now that we have recorded
	 * whether the target object exists.
	 */
	orig_request->result = rbd_img_obj_request_submit(orig_request);
out:
	if (orig_request->result)
		rbd_obj_request_complete(orig_request);
	rbd_obj_request_put(orig_request);
}

static int rbd_img_obj_exists_submit(struct rbd_obj_request *obj_request)
{
	struct rbd_obj_request *stat_request;
	struct rbd_device *rbd_dev;
	struct ceph_osd_client *osdc;
	struct page **pages = NULL;
	u32 page_count;
	size_t size;
	int ret;

	/*
	 * The response data for a STAT call consists of:
	 *     le64 length;
	 *     struct {
	 *         le32 tv_sec;
	 *         le32 tv_nsec;
	 *     } mtime;
	 */
	size = sizeof (__le64) + sizeof (__le32) + sizeof (__le32);
	page_count = (u32)calc_pages_for(0, size);
	pages = ceph_alloc_page_vector(page_count, GFP_KERNEL);
	if (IS_ERR(pages))
		return PTR_ERR(pages);

	ret = -ENOMEM;
	stat_request = rbd_obj_request_create(obj_request->object_name, 0, 0,
						OBJ_REQUEST_PAGES);
	if (!stat_request)
		goto out;

	rbd_obj_request_get(obj_request);
	stat_request->obj_request = obj_request;
	stat_request->pages = pages;
	stat_request->page_count = page_count;

	rbd_assert(obj_request->img_request);
	rbd_dev = obj_request->img_request->rbd_dev;
	stat_request->osd_req = rbd_osd_req_create(rbd_dev, false,
						stat_request);
	if (!stat_request->osd_req)
		goto out;
	stat_request->callback = rbd_img_obj_exists_callback;

	osd_req_op_init(stat_request->osd_req, 0, CEPH_OSD_OP_STAT);
	osd_req_op_raw_data_in_pages(stat_request->osd_req, 0, pages, size, 0,
					false, false);
	rbd_osd_req_format_read(stat_request);

	osdc = &rbd_dev->rbd_client->client->osdc;
	ret = rbd_obj_request_submit(osdc, stat_request);
out:
	if (ret)
		rbd_obj_request_put(obj_request);

	return ret;
}

static int rbd_img_obj_request_submit(struct rbd_obj_request *obj_request)
{
	struct rbd_img_request *img_request;
	struct rbd_device *rbd_dev;
	bool known;

	rbd_assert(obj_request_img_data_test(obj_request));

	img_request = obj_request->img_request;
	rbd_assert(img_request);
	rbd_dev = img_request->rbd_dev;

	/*
	 * Only writes to layered images need special handling.
	 * Reads and non-layered writes are simple object requests.
	 * Layered writes that start beyond the end of the overlap
	 * with the parent have no parent data, so they too are
	 * simple object requests.  Finally, if the target object is
	 * known to already exist, its parent data has already been
	 * copied, so a write to the object can also be handled as a
	 * simple object request.
	 */
	if (!img_request_write_test(img_request) ||
		!img_request_layered_test(img_request) ||
		rbd_dev->parent_overlap <= obj_request->img_offset ||
		((known = obj_request_known_test(obj_request)) &&
			obj_request_exists_test(obj_request))) {

		struct rbd_device *rbd_dev;
		struct ceph_osd_client *osdc;

		rbd_dev = obj_request->img_request->rbd_dev;
		osdc = &rbd_dev->rbd_client->client->osdc;

		return rbd_obj_request_submit(osdc, obj_request);
	}

	/*
	 * It's a layered write.  The target object might exist but
	 * we may not know that yet.  If we know it doesn't exist,
	 * start by reading the data for the full target object from
	 * the parent so we can use it for a copyup to the target.
	 */
	if (known)
		return rbd_img_obj_parent_read_full(obj_request);

	/* We don't know whether the target exists.  Go find out. */

	return rbd_img_obj_exists_submit(obj_request);
}

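/*
 * The dispatch above, as a decision table:
 *
 *	read, or non-layered write ............. submit directly
 *	layered write beyond parent overlap .... submit directly
 *	layered write, target known to exist ... submit directly
 *	layered write, known not to exist ...... parent read + copyup
 *	layered write, existence unknown ....... STAT first, then resubmit
 */
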
static int rbd_img_request_submit(struct rbd_img_request *img_request)
{
	struct rbd_obj_request *obj_request;
	struct rbd_obj_request *next_obj_request;

	dout("%s: img %p\n", __func__, img_request);
	for_each_obj_request_safe(img_request, obj_request, next_obj_request) {
		int ret;

		ret = rbd_img_obj_request_submit(obj_request);
		if (ret)
			return ret;
	}

	return 0;
}

static void rbd_img_parent_read_callback(struct rbd_img_request *img_request)
{
	struct rbd_obj_request *obj_request;
	struct rbd_device *rbd_dev;
	u64 obj_end;

	rbd_assert(img_request_child_test(img_request));

	obj_request = img_request->obj_request;
	rbd_assert(obj_request);
	rbd_assert(obj_request->img_request);

	obj_request->result = img_request->result;
	if (obj_request->result)
		goto out;

	/*
	 * We need to zero anything beyond the parent overlap
	 * boundary.  Since rbd_img_obj_request_read_callback()
	 * will zero anything beyond the end of a short read, an
	 * easy way to do this is to pretend the data from the
	 * parent came up short--ending at the overlap boundary.
	 */
	rbd_assert(obj_request->img_offset < U64_MAX - obj_request->length);
	obj_end = obj_request->img_offset + obj_request->length;
	rbd_dev = obj_request->img_request->rbd_dev;
	if (obj_end > rbd_dev->parent_overlap) {
		u64 xferred = 0;

		if (obj_request->img_offset < rbd_dev->parent_overlap)
			xferred = rbd_dev->parent_overlap -
					obj_request->img_offset;

		obj_request->xferred = min(img_request->xferred, xferred);
	} else {
		obj_request->xferred = img_request->xferred;
	}
out:
	rbd_img_obj_request_read_callback(obj_request);
	rbd_obj_request_complete(obj_request);
}

static void rbd_img_parent_read(struct rbd_obj_request *obj_request)
{
	struct rbd_device *rbd_dev;
	struct rbd_img_request *img_request;
	int result;

	rbd_assert(obj_request_img_data_test(obj_request));
	rbd_assert(obj_request->img_request != NULL);
	rbd_assert(obj_request->result == (s32) -ENOENT);
	rbd_assert(obj_request->type == OBJ_REQUEST_BIO);

	rbd_dev = obj_request->img_request->rbd_dev;
	rbd_assert(rbd_dev->parent != NULL);
	/* rbd_read_finish(obj_request, obj_request->length); */
	img_request = rbd_img_request_create(rbd_dev->parent,
						obj_request->img_offset,
						obj_request->length,
						false, true);
	result = -ENOMEM;
	if (!img_request)
		goto out_err;

	rbd_obj_request_get(obj_request);
	img_request->obj_request = obj_request;

	result = rbd_img_request_fill(img_request, OBJ_REQUEST_BIO,
					obj_request->bio_list);
	if (result)
		goto out_err;

	img_request->callback = rbd_img_parent_read_callback;
	result = rbd_img_request_submit(img_request);
	if (result)
		goto out_err;

	return;
out_err:
	if (img_request)
		rbd_img_request_put(img_request);
	obj_request->result = result;
	obj_request->xferred = 0;
	obj_request_done_set(obj_request);
}

static int rbd_obj_notify_ack(struct rbd_device *rbd_dev, u64 notify_id)
{
	struct rbd_obj_request *obj_request;
	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
	int ret;

	obj_request = rbd_obj_request_create(rbd_dev->header_name, 0, 0,
						OBJ_REQUEST_NODATA);
	if (!obj_request)
		return -ENOMEM;

	ret = -ENOMEM;
	obj_request->osd_req = rbd_osd_req_create(rbd_dev, false, obj_request);
	if (!obj_request->osd_req)
		goto out;
	obj_request->callback = rbd_obj_request_put;

	osd_req_op_watch_init(obj_request->osd_req, 0, CEPH_OSD_OP_NOTIFY_ACK,
					notify_id, 0, 0);
	rbd_osd_req_format_read(obj_request);

	ret = rbd_obj_request_submit(osdc, obj_request);
out:
	if (ret)
		rbd_obj_request_put(obj_request);

	return ret;
}

static void rbd_watch_cb(u64 ver, u64 notify_id, u8 opcode, void *data)
{
	struct rbd_device *rbd_dev = (struct rbd_device *)data;

	if (!rbd_dev)
		return;

	dout("%s: \"%s\" notify_id %llu opcode %u\n", __func__,
		rbd_dev->header_name, (unsigned long long)notify_id,
		(unsigned int)opcode);
	(void)rbd_dev_refresh(rbd_dev);

	rbd_obj_notify_ack(rbd_dev, notify_id);
}

/*
 * Request sync osd watch/unwatch.  The value of "start" determines
 * whether a watch request is being initiated or torn down.
 */
static int rbd_dev_header_watch_sync(struct rbd_device *rbd_dev, int start)
{
	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
	struct rbd_obj_request *obj_request;
	int ret;

	rbd_assert(start ^ !!rbd_dev->watch_event);
	rbd_assert(start ^ !!rbd_dev->watch_request);

	if (start) {
		ret = ceph_osdc_create_event(osdc, rbd_watch_cb, rbd_dev,
						&rbd_dev->watch_event);
		if (ret < 0)
			return ret;
		rbd_assert(rbd_dev->watch_event != NULL);
	}

	ret = -ENOMEM;
	obj_request = rbd_obj_request_create(rbd_dev->header_name, 0, 0,
						OBJ_REQUEST_NODATA);
	if (!obj_request)
		goto out_cancel;

	obj_request->osd_req = rbd_osd_req_create(rbd_dev, true, obj_request);
	if (!obj_request->osd_req)
		goto out_cancel;

	if (start)
		ceph_osdc_set_request_linger(osdc, obj_request->osd_req);
	else
		ceph_osdc_unregister_linger_request(osdc,
					rbd_dev->watch_request->osd_req);

	osd_req_op_watch_init(obj_request->osd_req, 0, CEPH_OSD_OP_WATCH,
				rbd_dev->watch_event->cookie, 0, start);
	rbd_osd_req_format_write(obj_request);

	ret = rbd_obj_request_submit(osdc, obj_request);
	if (ret)
		goto out_cancel;
	ret = rbd_obj_request_wait(obj_request);
	if (ret)
		goto out_cancel;
	ret = obj_request->result;
	if (ret)
		goto out_cancel;

	/*
	 * A watch request is set to linger, so the underlying osd
	 * request won't go away until we unregister it.  We retain
	 * a pointer to the object request during that time (in
	 * rbd_dev->watch_request), so we'll keep a reference to
	 * it.  We'll drop that reference (below) after we've
	 * unregistered it.
	 */
	if (start) {
		rbd_dev->watch_request = obj_request;

		return 0;
	}

	/* We have successfully torn down the watch request */

	rbd_obj_request_put(rbd_dev->watch_request);
	rbd_dev->watch_request = NULL;
out_cancel:
	/* Cancel the event if we're tearing down, or on error */
	ceph_osdc_cancel_event(rbd_dev->watch_event);
	rbd_dev->watch_event = NULL;
	if (obj_request)
		rbd_obj_request_put(obj_request);

	return ret;
}

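/*
 * A caller would typically bracket the mapped lifetime of the
 * device with this helper (a sketch; the actual call sites live
 * elsewhere in this file):
 *
 *	ret = rbd_dev_header_watch_sync(rbd_dev, 1);	// probe: start
 *	...
 *	ret = rbd_dev_header_watch_sync(rbd_dev, 0);	// teardown: stop
 *
 * While the watch is registered, header changes on the osd trigger
 * rbd_watch_cb(), which refreshes the device and acks the notify.
 */
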
Alex Elder36be9a72013-01-19 00:30:28 -06002595/*
Alex Elderf40eb342013-04-25 15:09:42 -05002596 * Synchronous osd object method call. Returns the number of bytes
2597 * returned in the outbound buffer, or a negative error code.
Alex Elder36be9a72013-01-19 00:30:28 -06002598 */
2599static int rbd_obj_method_sync(struct rbd_device *rbd_dev,
2600 const char *object_name,
2601 const char *class_name,
2602 const char *method_name,
Alex Elder41579762013-04-21 12:14:45 -05002603 const void *outbound,
Alex Elder36be9a72013-01-19 00:30:28 -06002604 size_t outbound_size,
Alex Elder41579762013-04-21 12:14:45 -05002605 void *inbound,
Alex Eldere2a58ee2013-04-30 00:44:33 -05002606 size_t inbound_size)
Alex Elder36be9a72013-01-19 00:30:28 -06002607{
Alex Elder21692382013-04-05 01:27:12 -05002608 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
Alex Elder36be9a72013-01-19 00:30:28 -06002609 struct rbd_obj_request *obj_request;
Alex Elder36be9a72013-01-19 00:30:28 -06002610 struct page **pages;
2611 u32 page_count;
2612 int ret;
2613
2614 /*
Alex Elder6010a452013-04-05 01:27:11 -05002615 * Method calls are ultimately read operations. The result
2616 * should placed into the inbound buffer provided. They
2617 * also supply outbound data--parameters for the object
2618 * method. Currently if this is present it will be a
2619 * snapshot id.
Alex Elder36be9a72013-01-19 00:30:28 -06002620 */
Alex Elder57385b52013-04-21 12:14:45 -05002621 page_count = (u32)calc_pages_for(0, inbound_size);
Alex Elder36be9a72013-01-19 00:30:28 -06002622 pages = ceph_alloc_page_vector(page_count, GFP_KERNEL);
2623 if (IS_ERR(pages))
2624 return PTR_ERR(pages);
2625
2626 ret = -ENOMEM;
Alex Elder6010a452013-04-05 01:27:11 -05002627 obj_request = rbd_obj_request_create(object_name, 0, inbound_size,
Alex Elder36be9a72013-01-19 00:30:28 -06002628 OBJ_REQUEST_PAGES);
2629 if (!obj_request)
2630 goto out;
2631
2632 obj_request->pages = pages;
2633 obj_request->page_count = page_count;
2634
Alex Elder430c28c2013-04-03 21:32:51 -05002635 obj_request->osd_req = rbd_osd_req_create(rbd_dev, false, obj_request);
Alex Elder36be9a72013-01-19 00:30:28 -06002636 if (!obj_request->osd_req)
2637 goto out;
2638
Alex Elderc99d2d42013-04-05 01:27:11 -05002639 osd_req_op_cls_init(obj_request->osd_req, 0, CEPH_OSD_OP_CALL,
Alex Elder04017e22013-04-05 14:46:02 -05002640 class_name, method_name);
2641 if (outbound_size) {
2642 struct ceph_pagelist *pagelist;
2643
2644 pagelist = kmalloc(sizeof (*pagelist), GFP_NOFS);
2645 if (!pagelist)
2646 goto out;
2647
2648 ceph_pagelist_init(pagelist);
2649 ceph_pagelist_append(pagelist, outbound, outbound_size);
2650 osd_req_op_cls_request_data_pagelist(obj_request->osd_req, 0,
2651 pagelist);
2652 }
Alex Eldera4ce40a2013-04-05 01:27:12 -05002653 osd_req_op_cls_response_data_pages(obj_request->osd_req, 0,
2654 obj_request->pages, inbound_size,
Alex Elder44cd1882013-04-05 01:27:12 -05002655 0, false, false);
Alex Elder9d4df012013-04-19 15:34:50 -05002656 rbd_osd_req_format_read(obj_request);
Alex Elder430c28c2013-04-03 21:32:51 -05002657
Alex Elder36be9a72013-01-19 00:30:28 -06002658 ret = rbd_obj_request_submit(osdc, obj_request);
2659 if (ret)
2660 goto out;
2661 ret = rbd_obj_request_wait(obj_request);
2662 if (ret)
2663 goto out;
2664
2665 ret = obj_request->result;
2666 if (ret < 0)
2667 goto out;
Alex Elder57385b52013-04-21 12:14:45 -05002668
2669 rbd_assert(obj_request->xferred < (u64)INT_MAX);
2670 ret = (int)obj_request->xferred;
Alex Elder903bb322013-02-06 13:11:38 -06002671 ceph_copy_from_page_vector(pages, inbound, 0, obj_request->xferred);
Alex Elder36be9a72013-01-19 00:30:28 -06002672out:
2673 if (obj_request)
2674 rbd_obj_request_put(obj_request);
2675 else
2676 ceph_release_page_vector(pages, page_count);
2677
2678 return ret;
2679}
2680
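/*
 * Example (a sketch, mirroring the call made in _rbd_dev_v2_snap_size()
 * below): fetching an image's size with the "get_size" class method:
 *
 *	__le64 snapid = cpu_to_le64(CEPH_NOSNAP);
 *	struct {
 *		u8 order;
 *		__le64 size;
 *	} __attribute__ ((packed)) size_buf = { 0 };
 *
 *	ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
 *				"rbd", "get_size",
 *				&snapid, sizeof (snapid),
 *				&size_buf, sizeof (size_buf));
 */

/*
 * The request_fn for the rbd block device.  Pulls requests off the
 * queue, maps each one to an image request, and submits it.  The
 * queue lock is dropped while a request is built and submitted, and
 * reacquired before it is completed or the next one is fetched.
 */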
Alex Elderbf0d5f502012-11-22 00:00:08 -06002681static void rbd_request_fn(struct request_queue *q)
Alex Eldercc344fa2013-02-19 12:25:56 -06002682 __releases(q->queue_lock) __acquires(q->queue_lock)
Alex Elderbf0d5f502012-11-22 00:00:08 -06002683{
2684 struct rbd_device *rbd_dev = q->queuedata;
2685 bool read_only = rbd_dev->mapping.read_only;
2686 struct request *rq;
2687 int result;
2688
2689 while ((rq = blk_fetch_request(q))) {
2690 bool write_request = rq_data_dir(rq) == WRITE;
2691 struct rbd_img_request *img_request;
2692 u64 offset;
2693 u64 length;
2694
2695 /* Ignore any non-FS requests that filter through. */
2696
2697 if (rq->cmd_type != REQ_TYPE_FS) {
Alex Elder4dda41d2013-02-20 21:59:33 -06002698 dout("%s: non-fs request type %d\n", __func__,
2699 (int) rq->cmd_type);
2700 __blk_end_request_all(rq, 0);
2701 continue;
2702 }
2703
2704 /* Ignore/skip any zero-length requests */
2705
2706 offset = (u64) blk_rq_pos(rq) << SECTOR_SHIFT;
2707 length = (u64) blk_rq_bytes(rq);
2708
2709 if (!length) {
2710 dout("%s: zero-length request\n", __func__);
Alex Elderbf0d5f502012-11-22 00:00:08 -06002711 __blk_end_request_all(rq, 0);
2712 continue;
2713 }
2714
2715 spin_unlock_irq(q->queue_lock);
2716
2717 /* Disallow writes to a read-only device */
2718
2719 if (write_request) {
2720 result = -EROFS;
2721 if (read_only)
2722 goto end_request;
2723 rbd_assert(rbd_dev->spec->snap_id == CEPH_NOSNAP);
2724 }
2725
Alex Elder6d292902013-01-14 12:43:31 -06002726 /*
2727 * Quit early if the mapped snapshot no longer
2728 * exists. It's still possible the snapshot will
2729 * have disappeared by the time our request arrives
2730 * at the osd, but there's no sense in sending it if
2731 * we already know.
2732 */
2733 if (!test_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags)) {
Alex Elderbf0d5f502012-11-22 00:00:08 -06002734 dout("request for non-existent snapshot");
2735 rbd_assert(rbd_dev->spec->snap_id != CEPH_NOSNAP);
2736 result = -ENXIO;
2737 goto end_request;
2738 }
2739
Alex Elderbf0d5f502012-11-22 00:00:08 -06002740 result = -EINVAL;
Alex Elderc0cd10db2013-04-26 09:43:47 -05002741 if (offset && length > U64_MAX - offset + 1) {
2742 rbd_warn(rbd_dev, "bad request range (%llu~%llu)\n",
2743 offset, length);
Alex Elderbf0d5f502012-11-22 00:00:08 -06002744 goto end_request; /* Shouldn't happen */
Alex Elderc0cd10db2013-04-26 09:43:47 -05002745 }
Alex Elderbf0d5f502012-11-22 00:00:08 -06002746
2747 result = -ENOMEM;
2748 img_request = rbd_img_request_create(rbd_dev, offset, length,
Alex Elder9849e982013-01-24 16:13:36 -06002749 write_request, false);
Alex Elderbf0d5f502012-11-22 00:00:08 -06002750 if (!img_request)
2751 goto end_request;
2752
2753 img_request->rq = rq;
2754
Alex Elderf1a47392013-04-19 15:34:50 -05002755 result = rbd_img_request_fill(img_request, OBJ_REQUEST_BIO,
2756 rq->bio);
Alex Elderbf0d5f502012-11-22 00:00:08 -06002757 if (!result)
2758 result = rbd_img_request_submit(img_request);
2759 if (result)
2760 rbd_img_request_put(img_request);
2761end_request:
2762 spin_lock_irq(q->queue_lock);
2763 if (result < 0) {
Alex Elder7da22d22013-01-24 16:13:36 -06002764 rbd_warn(rbd_dev, "%s %llx at %llx result %d\n",
2765 write_request ? "write" : "read",
2766 length, offset, result);
2767
Alex Elderbf0d5f502012-11-22 00:00:08 -06002768 __blk_end_request_all(rq, result);
2769 }
2770 }
2771}
2772
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002773/*
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002774 * A queue callback.  Makes sure that we don't create a bio that spans
2775 * multiple osd objects.  One exception is a single-page bio,
Alex Elderf7760da2012-10-20 22:17:27 -05002776 * which we handle later in bio_chain_clone_range().
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002777 */
2778static int rbd_merge_bvec(struct request_queue *q, struct bvec_merge_data *bmd,
2779 struct bio_vec *bvec)
2780{
2781 struct rbd_device *rbd_dev = q->queuedata;
Alex Eldere5cfeed22012-10-20 22:17:27 -05002782 sector_t sector_offset;
2783 sector_t sectors_per_obj;
2784 sector_t obj_sector_offset;
2785 int ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002786
Alex Eldere5cfeed22012-10-20 22:17:27 -05002787 /*
2788	 * Convert the partition-relative bio start sector to be
2789	 * relative to the enclosing device, then find how far into
2790	 * its rbd object that sector falls.
2791 */
2792 sector_offset = get_start_sect(bmd->bi_bdev) + bmd->bi_sector;
2793 sectors_per_obj = 1 << (rbd_dev->header.obj_order - SECTOR_SHIFT);
2794 obj_sector_offset = sector_offset & (sectors_per_obj - 1);
Alex Elder593a9e72012-02-07 12:03:37 -06002795
Alex Eldere5cfeed22012-10-20 22:17:27 -05002796 /*
2797 * Compute the number of bytes from that offset to the end
2798 * of the object. Account for what's already used by the bio.
2799 */
2800 ret = (int) (sectors_per_obj - obj_sector_offset) << SECTOR_SHIFT;
2801 if (ret > bmd->bi_size)
2802 ret -= bmd->bi_size;
2803 else
2804 ret = 0;
2805
2806 /*
2807 * Don't send back more than was asked for. And if the bio
2808 * was empty, let the whole thing through because: "Note
2809 * that a block device *must* allow a single page to be
2810 * added to an empty bio."
2811 */
2812 rbd_assert(bvec->bv_len <= PAGE_SIZE);
2813 if (ret > (int) bvec->bv_len || !bmd->bi_size)
2814 ret = (int) bvec->bv_len;
2815
2816 return ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002817}
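
/*
 * Worked example for rbd_merge_bvec() above (a sketch, assuming the
 * default 4 MB objects, so obj_order is 22): sectors_per_obj is
 * 1 << (22 - 9) = 8192.  A bio starting at device-relative sector
 * 8000 has 192 sectors (98304 bytes) left before its object ends.
 * If the bio already holds 64 KB, at most 98304 - 65536 = 32768
 * more bytes fit, so a bvec is accepted only up to that limit
 * (or in full, if the bio is still empty).
 */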
2818
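/*
 * Tear down the gendisk for the mapped image, along with its
 * request queue if the disk was ever activated.
 */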
2819static void rbd_free_disk(struct rbd_device *rbd_dev)
2820{
2821 struct gendisk *disk = rbd_dev->disk;
2822
2823 if (!disk)
2824 return;
2825
Alex Eldera0cab922013-04-25 23:15:08 -05002826 rbd_dev->disk = NULL;
2827 if (disk->flags & GENHD_FL_UP) {
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002828 del_gendisk(disk);
Alex Eldera0cab922013-04-25 23:15:08 -05002829 if (disk->queue)
2830 blk_cleanup_queue(disk->queue);
2831 }
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002832 put_disk(disk);
2833}
2834
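/*
 * Synchronously read @length bytes starting at @offset from the osd
 * object named @object_name into @buf.  Returns the number of bytes
 * read on success, or a negative error code.
 */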
Alex Elder788e2df2013-01-17 12:25:27 -06002835static int rbd_obj_read_sync(struct rbd_device *rbd_dev,
2836 const char *object_name,
Alex Elder7097f8d2013-04-30 00:44:33 -05002837 u64 offset, u64 length, void *buf)
Alex Elder788e2df2013-01-17 12:25:27 -06002838
2839{
Alex Elder21692382013-04-05 01:27:12 -05002840 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
Alex Elder788e2df2013-01-17 12:25:27 -06002841 struct rbd_obj_request *obj_request;
Alex Elder788e2df2013-01-17 12:25:27 -06002842 struct page **pages = NULL;
2843 u32 page_count;
Alex Elder1ceae7e2013-02-06 13:11:38 -06002844 size_t size;
Alex Elder788e2df2013-01-17 12:25:27 -06002845 int ret;
2846
2847 page_count = (u32) calc_pages_for(offset, length);
2848 pages = ceph_alloc_page_vector(page_count, GFP_KERNEL);
2849	if (IS_ERR(pages))
2850		return PTR_ERR(pages);
2851
2852 ret = -ENOMEM;
2853 obj_request = rbd_obj_request_create(object_name, offset, length,
Alex Elder36be9a72013-01-19 00:30:28 -06002854 OBJ_REQUEST_PAGES);
Alex Elder788e2df2013-01-17 12:25:27 -06002855 if (!obj_request)
2856 goto out;
2857
2858 obj_request->pages = pages;
2859 obj_request->page_count = page_count;
2860
Alex Elder430c28c2013-04-03 21:32:51 -05002861 obj_request->osd_req = rbd_osd_req_create(rbd_dev, false, obj_request);
Alex Elder788e2df2013-01-17 12:25:27 -06002862 if (!obj_request->osd_req)
2863 goto out;
2864
Alex Elderc99d2d42013-04-05 01:27:11 -05002865 osd_req_op_extent_init(obj_request->osd_req, 0, CEPH_OSD_OP_READ,
2866 offset, length, 0, 0);
Alex Elder406e2c92013-04-15 14:50:36 -05002867 osd_req_op_extent_osd_data_pages(obj_request->osd_req, 0,
Alex Eldera4ce40a2013-04-05 01:27:12 -05002868 obj_request->pages,
Alex Elder44cd1882013-04-05 01:27:12 -05002869 obj_request->length,
2870 obj_request->offset & ~PAGE_MASK,
2871 false, false);
Alex Elder9d4df012013-04-19 15:34:50 -05002872 rbd_osd_req_format_read(obj_request);
Alex Elder430c28c2013-04-03 21:32:51 -05002873
Alex Elder788e2df2013-01-17 12:25:27 -06002874 ret = rbd_obj_request_submit(osdc, obj_request);
2875 if (ret)
2876 goto out;
2877 ret = rbd_obj_request_wait(obj_request);
2878 if (ret)
2879 goto out;
2880
2881 ret = obj_request->result;
2882 if (ret < 0)
2883 goto out;
Alex Elder1ceae7e2013-02-06 13:11:38 -06002884
2885 rbd_assert(obj_request->xferred <= (u64) SIZE_MAX);
2886 size = (size_t) obj_request->xferred;
Alex Elder903bb322013-02-06 13:11:38 -06002887 ceph_copy_from_page_vector(pages, buf, 0, size);
Alex Elder7097f8d2013-04-30 00:44:33 -05002888 rbd_assert(size <= (size_t)INT_MAX);
2889 ret = (int)size;
Alex Elder788e2df2013-01-17 12:25:27 -06002890out:
2891 if (obj_request)
2892 rbd_obj_request_put(obj_request);
2893 else
2894 ceph_release_page_vector(pages, page_count);
2895
2896 return ret;
2897}
2898
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002899/*
Alex Elder4156d992012-08-02 11:29:46 -05002900 * Read the complete header for the given rbd device.
2901 *
2902 * Returns a pointer to a dynamically-allocated buffer containing
2903 * the complete and validated header.
2906 *
2907 * Returns a pointer-coded errno if a failure occurs.
2908 */
2909static struct rbd_image_header_ondisk *
Alex Elder7097f8d2013-04-30 00:44:33 -05002910rbd_dev_v1_header_read(struct rbd_device *rbd_dev)
Alex Elder4156d992012-08-02 11:29:46 -05002911{
2912 struct rbd_image_header_ondisk *ondisk = NULL;
2913 u32 snap_count = 0;
2914 u64 names_size = 0;
2915 u32 want_count;
2916 int ret;
2917
2918 /*
2919 * The complete header will include an array of its 64-bit
2920 * snapshot ids, followed by the names of those snapshots as
2921 * a contiguous block of NUL-terminated strings. Note that
2922 * the number of snapshots could change by the time we read
2923 * it in, in which case we re-read it.
2924 */
2925 do {
2926 size_t size;
2927
2928 kfree(ondisk);
2929
2930 size = sizeof (*ondisk);
2931 size += snap_count * sizeof (struct rbd_image_snap_ondisk);
2932 size += names_size;
2933 ondisk = kmalloc(size, GFP_KERNEL);
2934 if (!ondisk)
2935 return ERR_PTR(-ENOMEM);
2936
Alex Elder788e2df2013-01-17 12:25:27 -06002937 ret = rbd_obj_read_sync(rbd_dev, rbd_dev->header_name,
Alex Elder7097f8d2013-04-30 00:44:33 -05002938 0, size, ondisk);
Alex Elder4156d992012-08-02 11:29:46 -05002939 if (ret < 0)
2940 goto out_err;
Alex Elderc0cd10db2013-04-26 09:43:47 -05002941 if ((size_t)ret < size) {
Alex Elder4156d992012-08-02 11:29:46 -05002942 ret = -ENXIO;
Alex Elder06ecc6c2012-11-01 10:17:15 -05002943			rbd_warn(rbd_dev, "short header read (want %zu got %d)",
2944 size, ret);
Alex Elder4156d992012-08-02 11:29:46 -05002945 goto out_err;
2946 }
2947 if (!rbd_dev_ondisk_valid(ondisk)) {
2948 ret = -ENXIO;
Alex Elder06ecc6c2012-11-01 10:17:15 -05002949 rbd_warn(rbd_dev, "invalid header");
Alex Elder4156d992012-08-02 11:29:46 -05002950 goto out_err;
2951 }
2952
2953 names_size = le64_to_cpu(ondisk->snap_names_len);
2954 want_count = snap_count;
2955 snap_count = le32_to_cpu(ondisk->snap_count);
2956 } while (snap_count != want_count);
2957
2958 return ondisk;
2959
2960out_err:
2961 kfree(ondisk);
2962
2963 return ERR_PTR(ret);
2964}
2965
2966/*
2967 * Read the on-disk header and fill in the in-core image header.
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002968 */
2969static int rbd_read_header(struct rbd_device *rbd_dev,
2970 struct rbd_image_header *header)
2971{
Alex Elder4156d992012-08-02 11:29:46 -05002972 struct rbd_image_header_ondisk *ondisk;
Alex Elder4156d992012-08-02 11:29:46 -05002973 int ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002974
Alex Elder7097f8d2013-04-30 00:44:33 -05002975 ondisk = rbd_dev_v1_header_read(rbd_dev);
Alex Elder4156d992012-08-02 11:29:46 -05002976 if (IS_ERR(ondisk))
2977 return PTR_ERR(ondisk);
2978 ret = rbd_header_from_disk(header, ondisk);
Alex Elder4156d992012-08-02 11:29:46 -05002979 kfree(ondisk);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002980
Alex Elder4156d992012-08-02 11:29:46 -05002981 return ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002982}
2983
Alex Elder41f38c22012-10-25 23:34:40 -05002984static void rbd_remove_all_snaps(struct rbd_device *rbd_dev)
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002985{
2986 struct rbd_snap *snap;
Alex Eldera0593292012-07-19 09:09:27 -05002987 struct rbd_snap *next;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002988
Alex Elder6087b512013-04-25 15:09:41 -05002989 list_for_each_entry_safe(snap, next, &rbd_dev->snaps, node) {
2990 list_del(&snap->node);
2991 rbd_snap_destroy(snap);
2992 }
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002993}
2994
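/*
 * For a mapping of the base image, propagate a changed image size
 * to the mapping and to the gendisk capacity.  Mapped snapshots
 * never change size, so they are left alone.
 */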
Alex Elder94785542012-10-09 13:50:17 -07002995static void rbd_update_mapping_size(struct rbd_device *rbd_dev)
2996{
Alex Elder0d7dbfc2012-10-25 23:34:41 -05002997 if (rbd_dev->spec->snap_id != CEPH_NOSNAP)
Alex Elder94785542012-10-09 13:50:17 -07002998 return;
2999
Alex Eldere28626a2013-04-26 15:44:35 -05003000 if (rbd_dev->mapping.size != rbd_dev->header.image_size) {
3001 sector_t size;
3002
3003 rbd_dev->mapping.size = rbd_dev->header.image_size;
3004 size = (sector_t)rbd_dev->mapping.size / SECTOR_SIZE;
3005 dout("setting size to %llu sectors", (unsigned long long)size);
3006 set_capacity(rbd_dev->disk, size);
3007 }
Alex Elder94785542012-10-09 13:50:17 -07003008}
3009
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003010/*
3011 * only read the first part of the ondisk header, without the snaps info
3012 */
Alex Eldercc4a38bd2013-04-30 00:44:33 -05003013static int rbd_dev_v1_refresh(struct rbd_device *rbd_dev)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003014{
3015 int ret;
3016 struct rbd_image_header h;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003017
3018 ret = rbd_read_header(rbd_dev, &h);
3019 if (ret < 0)
3020 return ret;
3021
Josh Durgina51aa0c2011-12-05 10:35:04 -08003022 down_write(&rbd_dev->header_rwsem);
3023
Alex Elder94785542012-10-09 13:50:17 -07003024 /* Update image size, and check for resize of mapped image */
3025 rbd_dev->header.image_size = h.image_size;
3026 rbd_update_mapping_size(rbd_dev);
Sage Weil9db4b3e2011-04-19 22:49:06 -07003027
Alex Elder849b4262012-07-09 21:04:24 -05003028 /* rbd_dev->header.object_prefix shouldn't change */
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003029 kfree(rbd_dev->header.snap_sizes);
Alex Elder849b4262012-07-09 21:04:24 -05003030 kfree(rbd_dev->header.snap_names);
Josh Durgind1d25642011-12-05 14:03:05 -08003031 /* osd requests may still refer to snapc */
Alex Elder812164f82013-04-30 00:44:32 -05003032 ceph_put_snap_context(rbd_dev->header.snapc);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003033
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003035 rbd_dev->header.snapc = h.snapc;
3036 rbd_dev->header.snap_names = h.snap_names;
3037 rbd_dev->header.snap_sizes = h.snap_sizes;
Alex Elder849b4262012-07-09 21:04:24 -05003038 /* Free the extra copy of the object prefix */
Alex Elderc0cd10db2013-04-26 09:43:47 -05003039 if (strcmp(rbd_dev->header.object_prefix, h.object_prefix))
3040 rbd_warn(rbd_dev, "object prefix changed (ignoring)");
Alex Elder849b4262012-07-09 21:04:24 -05003041 kfree(h.object_prefix);
3042
Alex Elder304f6802012-08-31 17:29:52 -05003043 ret = rbd_dev_snaps_update(rbd_dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003044
Josh Durginc6666012011-11-21 17:11:12 -08003045 up_write(&rbd_dev->header_rwsem);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003046
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003047 return ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003048}
3049
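/*
 * Re-read the image header (format 1 or 2) and bring the in-core
 * snapshot list up to date, revalidating the disk if the image
 * size changed.  Called for header change notifications and for
 * explicit sysfs refresh requests.
 */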
Alex Eldercc4a38bd2013-04-30 00:44:33 -05003050static int rbd_dev_refresh(struct rbd_device *rbd_dev)
Alex Elder1fe5e992012-07-25 09:32:41 -05003051{
Alex Eldera3fbe5d2013-04-30 00:44:32 -05003052 u64 image_size;
Alex Elder1fe5e992012-07-25 09:32:41 -05003053 int ret;
3054
Alex Elder117973f2012-08-31 17:29:55 -05003055 rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
Alex Eldera3fbe5d2013-04-30 00:44:32 -05003056 image_size = rbd_dev->header.image_size;
Alex Elder1fe5e992012-07-25 09:32:41 -05003057 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
Alex Elder117973f2012-08-31 17:29:55 -05003058 if (rbd_dev->image_format == 1)
Alex Eldercc4a38bd2013-04-30 00:44:33 -05003059 ret = rbd_dev_v1_refresh(rbd_dev);
Alex Elder117973f2012-08-31 17:29:55 -05003060 else
Alex Eldercc4a38bd2013-04-30 00:44:33 -05003061 ret = rbd_dev_v2_refresh(rbd_dev);
Alex Elder1fe5e992012-07-25 09:32:41 -05003062 mutex_unlock(&ctl_mutex);
Alex Elder522a0cc2013-04-25 15:09:41 -05003063 if (ret)
3064		rbd_warn(rbd_dev, "got notification but failed to "
3065			"update snaps: %d\n", ret);
Alex Eldera3fbe5d2013-04-30 00:44:32 -05003066 if (image_size != rbd_dev->header.image_size)
3067 revalidate_disk(rbd_dev->disk);
Alex Elder1fe5e992012-07-25 09:32:41 -05003068
3069 return ret;
3070}
3071
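/*
 * Create and initialize the gendisk and request queue for the
 * mapped image.  Queue limits are set so a single request never
 * spans rbd objects.
 */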
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003072static int rbd_init_disk(struct rbd_device *rbd_dev)
3073{
3074 struct gendisk *disk;
3075 struct request_queue *q;
Alex Elder593a9e72012-02-07 12:03:37 -06003076 u64 segment_size;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003077
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003078 /* create gendisk info */
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003079 disk = alloc_disk(RBD_MINORS_PER_MAJOR);
3080 if (!disk)
Alex Elder1fcdb8a2012-08-29 17:11:06 -05003081 return -ENOMEM;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003082
Alex Elderf0f8cef2012-01-29 13:57:44 -06003083 snprintf(disk->disk_name, sizeof(disk->disk_name), RBD_DRV_NAME "%d",
Alex Elderde71a292012-07-03 16:01:19 -05003084 rbd_dev->dev_id);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003085 disk->major = rbd_dev->major;
3086 disk->first_minor = 0;
3087 disk->fops = &rbd_bd_ops;
3088 disk->private_data = rbd_dev;
3089
Alex Elderbf0d5f502012-11-22 00:00:08 -06003090 q = blk_init_queue(rbd_request_fn, &rbd_dev->lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003091 if (!q)
3092 goto out_disk;
Josh Durgin029bcbd2011-07-22 11:35:23 -07003093
Alex Elder593a9e72012-02-07 12:03:37 -06003094 /* We use the default size, but let's be explicit about it. */
3095 blk_queue_physical_block_size(q, SECTOR_SIZE);
3096
Josh Durgin029bcbd2011-07-22 11:35:23 -07003097 /* set io sizes to object size */
Alex Elder593a9e72012-02-07 12:03:37 -06003098 segment_size = rbd_obj_bytes(&rbd_dev->header);
3099 blk_queue_max_hw_sectors(q, segment_size / SECTOR_SIZE);
3100 blk_queue_max_segment_size(q, segment_size);
3101 blk_queue_io_min(q, segment_size);
3102 blk_queue_io_opt(q, segment_size);
Josh Durgin029bcbd2011-07-22 11:35:23 -07003103
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003104 blk_queue_merge_bvec(q, rbd_merge_bvec);
3105 disk->queue = q;
3106
3107 q->queuedata = rbd_dev;
3108
3109 rbd_dev->disk = disk;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003110
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003111 return 0;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003112out_disk:
3113 put_disk(disk);
Alex Elder1fcdb8a2012-08-29 17:11:06 -05003114
3115 return -ENOMEM;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003116}
3117
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003118/*
3119 sysfs
3120*/
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003121
Alex Elder593a9e72012-02-07 12:03:37 -06003122static struct rbd_device *dev_to_rbd_dev(struct device *dev)
3123{
3124 return container_of(dev, struct rbd_device, dev);
3125}
3126
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003127static ssize_t rbd_size_show(struct device *dev,
3128 struct device_attribute *attr, char *buf)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003129{
Alex Elder593a9e72012-02-07 12:03:37 -06003130 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003131
Alex Elderfc71d832013-04-26 15:44:36 -05003132 return sprintf(buf, "%llu\n",
3133 (unsigned long long)rbd_dev->mapping.size);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003134}
3135
Alex Elder34b13182012-07-13 20:35:12 -05003136/*
3137 * Note this shows the features for whatever's mapped, which is not
3138 * necessarily the base image.
3139 */
3140static ssize_t rbd_features_show(struct device *dev,
3141 struct device_attribute *attr, char *buf)
3142{
3143 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
3144
3145 return sprintf(buf, "0x%016llx\n",
Alex Elderfc71d832013-04-26 15:44:36 -05003146 (unsigned long long)rbd_dev->mapping.features);
Alex Elder34b13182012-07-13 20:35:12 -05003147}
3148
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003149static ssize_t rbd_major_show(struct device *dev,
3150 struct device_attribute *attr, char *buf)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003151{
Alex Elder593a9e72012-02-07 12:03:37 -06003152 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003153
Alex Elderfc71d832013-04-26 15:44:36 -05003154 if (rbd_dev->major)
3155 return sprintf(buf, "%d\n", rbd_dev->major);
3156
3157 return sprintf(buf, "(none)\n");
3158
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003159}
3160
3161static ssize_t rbd_client_id_show(struct device *dev,
3162 struct device_attribute *attr, char *buf)
3163{
Alex Elder593a9e72012-02-07 12:03:37 -06003164 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003165
Alex Elder1dbb4392012-01-24 10:08:37 -06003166 return sprintf(buf, "client%lld\n",
3167 ceph_client_id(rbd_dev->rbd_client->client));
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003168}
3169
3170static ssize_t rbd_pool_show(struct device *dev,
3171 struct device_attribute *attr, char *buf)
3172{
Alex Elder593a9e72012-02-07 12:03:37 -06003173 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003174
Alex Elder0d7dbfc2012-10-25 23:34:41 -05003175 return sprintf(buf, "%s\n", rbd_dev->spec->pool_name);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003176}
3177
Alex Elder9bb2f332012-07-12 10:46:35 -05003178static ssize_t rbd_pool_id_show(struct device *dev,
3179 struct device_attribute *attr, char *buf)
3180{
3181 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
3182
Alex Elder0d7dbfc2012-10-25 23:34:41 -05003183 return sprintf(buf, "%llu\n",
Alex Elderfc71d832013-04-26 15:44:36 -05003184 (unsigned long long) rbd_dev->spec->pool_id);
Alex Elder9bb2f332012-07-12 10:46:35 -05003185}
3186
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003187static ssize_t rbd_name_show(struct device *dev,
3188 struct device_attribute *attr, char *buf)
3189{
Alex Elder593a9e72012-02-07 12:03:37 -06003190 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003191
Alex Eldera92ffdf2012-10-30 19:40:33 -05003192 if (rbd_dev->spec->image_name)
3193 return sprintf(buf, "%s\n", rbd_dev->spec->image_name);
3194
3195 return sprintf(buf, "(unknown)\n");
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003196}
3197
Alex Elder589d30e2012-07-10 20:30:11 -05003198static ssize_t rbd_image_id_show(struct device *dev,
3199 struct device_attribute *attr, char *buf)
3200{
3201 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
3202
Alex Elder0d7dbfc2012-10-25 23:34:41 -05003203 return sprintf(buf, "%s\n", rbd_dev->spec->image_id);
Alex Elder589d30e2012-07-10 20:30:11 -05003204}
3205
Alex Elder34b13182012-07-13 20:35:12 -05003206/*
3207 * Shows the name of the currently-mapped snapshot (or
3208 * RBD_SNAP_HEAD_NAME for the base image).
3209 */
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003210static ssize_t rbd_snap_show(struct device *dev,
3211 struct device_attribute *attr,
3212 char *buf)
3213{
Alex Elder593a9e72012-02-07 12:03:37 -06003214 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003215
Alex Elder0d7dbfc2012-10-25 23:34:41 -05003216 return sprintf(buf, "%s\n", rbd_dev->spec->snap_name);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003217}
3218
Alex Elder86b00e02012-10-25 23:34:42 -05003219/*
3220 * For an rbd v2 image, shows the pool id, image id, and snapshot id
3221 * for the parent image. If there is no parent, simply shows
3222 * "(no parent image)".
3223 */
3224static ssize_t rbd_parent_show(struct device *dev,
3225 struct device_attribute *attr,
3226 char *buf)
3227{
3228 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
3229 struct rbd_spec *spec = rbd_dev->parent_spec;
3230 int count;
3231 char *bufp = buf;
3232
3233 if (!spec)
3234 return sprintf(buf, "(no parent image)\n");
3235
3236 count = sprintf(bufp, "pool_id %llu\npool_name %s\n",
3237 (unsigned long long) spec->pool_id, spec->pool_name);
3238 if (count < 0)
3239 return count;
3240 bufp += count;
3241
3242 count = sprintf(bufp, "image_id %s\nimage_name %s\n", spec->image_id,
3243 spec->image_name ? spec->image_name : "(unknown)");
3244 if (count < 0)
3245 return count;
3246 bufp += count;
3247
3248 count = sprintf(bufp, "snap_id %llu\nsnap_name %s\n",
3249 (unsigned long long) spec->snap_id, spec->snap_name);
3250 if (count < 0)
3251 return count;
3252 bufp += count;
3253
3254 count = sprintf(bufp, "overlap %llu\n", rbd_dev->parent_overlap);
3255 if (count < 0)
3256 return count;
3257 bufp += count;
3258
3259 return (ssize_t) (bufp - buf);
3260}
3261
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003262static ssize_t rbd_image_refresh(struct device *dev,
3263 struct device_attribute *attr,
3264 const char *buf,
3265 size_t size)
3266{
Alex Elder593a9e72012-02-07 12:03:37 -06003267 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Alex Elderb8136232012-07-25 09:32:41 -05003268 int ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003269
Alex Eldercc4a38bd2013-04-30 00:44:33 -05003270 ret = rbd_dev_refresh(rbd_dev);
Alex Elderb8136232012-07-25 09:32:41 -05003271
3272 return ret < 0 ? ret : size;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003273}
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003274
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003275static DEVICE_ATTR(size, S_IRUGO, rbd_size_show, NULL);
Alex Elder34b13182012-07-13 20:35:12 -05003276static DEVICE_ATTR(features, S_IRUGO, rbd_features_show, NULL);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003277static DEVICE_ATTR(major, S_IRUGO, rbd_major_show, NULL);
3278static DEVICE_ATTR(client_id, S_IRUGO, rbd_client_id_show, NULL);
3279static DEVICE_ATTR(pool, S_IRUGO, rbd_pool_show, NULL);
Alex Elder9bb2f332012-07-12 10:46:35 -05003280static DEVICE_ATTR(pool_id, S_IRUGO, rbd_pool_id_show, NULL);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003281static DEVICE_ATTR(name, S_IRUGO, rbd_name_show, NULL);
Alex Elder589d30e2012-07-10 20:30:11 -05003282static DEVICE_ATTR(image_id, S_IRUGO, rbd_image_id_show, NULL);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003283static DEVICE_ATTR(refresh, S_IWUSR, NULL, rbd_image_refresh);
3284static DEVICE_ATTR(current_snap, S_IRUGO, rbd_snap_show, NULL);
Alex Elder86b00e02012-10-25 23:34:42 -05003285static DEVICE_ATTR(parent, S_IRUGO, rbd_parent_show, NULL);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003286
3287static struct attribute *rbd_attrs[] = {
3288 &dev_attr_size.attr,
Alex Elder34b13182012-07-13 20:35:12 -05003289 &dev_attr_features.attr,
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003290 &dev_attr_major.attr,
3291 &dev_attr_client_id.attr,
3292 &dev_attr_pool.attr,
Alex Elder9bb2f332012-07-12 10:46:35 -05003293 &dev_attr_pool_id.attr,
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003294 &dev_attr_name.attr,
Alex Elder589d30e2012-07-10 20:30:11 -05003295 &dev_attr_image_id.attr,
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003296 &dev_attr_current_snap.attr,
Alex Elder86b00e02012-10-25 23:34:42 -05003297 &dev_attr_parent.attr,
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003298 &dev_attr_refresh.attr,
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003299 NULL
3300};
3301
3302static struct attribute_group rbd_attr_group = {
3303 .attrs = rbd_attrs,
3304};
3305
3306static const struct attribute_group *rbd_attr_groups[] = {
3307 &rbd_attr_group,
3308 NULL
3309};
3310
3311static void rbd_sysfs_dev_release(struct device *dev)
3312{
3313}
3314
3315static struct device_type rbd_device_type = {
3316 .name = "rbd",
3317 .groups = rbd_attr_groups,
3318 .release = rbd_sysfs_dev_release,
3319};
3320
Alex Elder8b8fb992012-10-26 17:25:24 -05003321static struct rbd_spec *rbd_spec_get(struct rbd_spec *spec)
3322{
3323 kref_get(&spec->kref);
3324
3325 return spec;
3326}
3327
3328static void rbd_spec_free(struct kref *kref);
3329static void rbd_spec_put(struct rbd_spec *spec)
3330{
3331 if (spec)
3332 kref_put(&spec->kref, rbd_spec_free);
3333}
3334
3335static struct rbd_spec *rbd_spec_alloc(void)
3336{
3337 struct rbd_spec *spec;
3338
3339 spec = kzalloc(sizeof (*spec), GFP_KERNEL);
3340 if (!spec)
3341 return NULL;
3342 kref_init(&spec->kref);
3343
Alex Elder8b8fb992012-10-26 17:25:24 -05003344 return spec;
3345}
3346
3347static void rbd_spec_free(struct kref *kref)
3348{
3349 struct rbd_spec *spec = container_of(kref, struct rbd_spec, kref);
3350
3351 kfree(spec->pool_name);
3352 kfree(spec->image_id);
3353 kfree(spec->image_name);
3354 kfree(spec->snap_name);
3355 kfree(spec);
3356}
3357
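/*
 * Create an rbd device structure.  The new device takes over the
 * caller's references to the client and the spec; both are dropped
 * again by rbd_dev_destroy().
 */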
Alex Eldercc344fa2013-02-19 12:25:56 -06003358static struct rbd_device *rbd_dev_create(struct rbd_client *rbdc,
Alex Elderc53d5892012-10-25 23:34:42 -05003359 struct rbd_spec *spec)
3360{
3361 struct rbd_device *rbd_dev;
3362
3363 rbd_dev = kzalloc(sizeof (*rbd_dev), GFP_KERNEL);
3364 if (!rbd_dev)
3365 return NULL;
3366
3367 spin_lock_init(&rbd_dev->lock);
Alex Elder6d292902013-01-14 12:43:31 -06003368 rbd_dev->flags = 0;
Alex Elderc53d5892012-10-25 23:34:42 -05003369 INIT_LIST_HEAD(&rbd_dev->node);
3370 INIT_LIST_HEAD(&rbd_dev->snaps);
3371 init_rwsem(&rbd_dev->header_rwsem);
3372
3373 rbd_dev->spec = spec;
3374 rbd_dev->rbd_client = rbdc;
3375
Alex Elder0903e872012-11-14 12:25:19 -06003376 /* Initialize the layout used for all rbd requests */
3377
3378 rbd_dev->layout.fl_stripe_unit = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
3379 rbd_dev->layout.fl_stripe_count = cpu_to_le32(1);
3380 rbd_dev->layout.fl_object_size = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
3381 rbd_dev->layout.fl_pg_pool = cpu_to_le32((u32) spec->pool_id);
3382
Alex Elderc53d5892012-10-25 23:34:42 -05003383 return rbd_dev;
3384}
3385
3386static void rbd_dev_destroy(struct rbd_device *rbd_dev)
3387{
Alex Elderc53d5892012-10-25 23:34:42 -05003388 rbd_put_client(rbd_dev->rbd_client);
3389 rbd_spec_put(rbd_dev->spec);
3390 kfree(rbd_dev);
3391}
3392
Alex Elder6087b512013-04-25 15:09:41 -05003393static void rbd_snap_destroy(struct rbd_snap *snap)
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003394{
Alex Elder3e83b652013-04-23 13:52:53 -05003395 kfree(snap->name);
3396 kfree(snap);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003397}
3398
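/*
 * Create a snapshot entry.  The new entry takes ownership of the
 * dynamically-allocated snapshot name, which is freed again by
 * rbd_snap_destroy().
 */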
Alex Elder6087b512013-04-25 15:09:41 -05003399static struct rbd_snap *rbd_snap_create(struct rbd_device *rbd_dev,
Alex Elderc8d18422012-07-10 20:30:11 -05003400 const char *snap_name,
Alex Elder34b13182012-07-13 20:35:12 -05003401 u64 snap_id, u64 snap_size,
3402 u64 snap_features)
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003403{
Alex Elder4e891e02012-07-10 20:30:10 -05003404 struct rbd_snap *snap;
Alex Elder4e891e02012-07-10 20:30:10 -05003405
3406 snap = kzalloc(sizeof (*snap), GFP_KERNEL);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003407 if (!snap)
Alex Elder4e891e02012-07-10 20:30:10 -05003408 return ERR_PTR(-ENOMEM);
3409
Alex Elder6e584f52013-04-25 15:09:42 -05003410 snap->name = snap_name;
Alex Elderc8d18422012-07-10 20:30:11 -05003411 snap->id = snap_id;
3412 snap->size = snap_size;
Alex Elder34b13182012-07-13 20:35:12 -05003413 snap->features = snap_features;
Alex Elder4e891e02012-07-10 20:30:10 -05003414
3415 return snap;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003416}
3417
Alex Elder6e584f52013-04-25 15:09:42 -05003418/*
3419 * Returns a dynamically-allocated snapshot name if successful, or a
3420 * pointer-coded error otherwise.
3421 */
Alex Eldercb752232013-04-30 00:44:33 -05003422static const char *rbd_dev_v1_snap_info(struct rbd_device *rbd_dev, u32 which,
Alex Eldercd892122012-07-03 16:01:19 -05003423 u64 *snap_size, u64 *snap_features)
3424{
Alex Eldercb752232013-04-30 00:44:33 -05003425 const char *snap_name;
Alex Elder6e584f52013-04-25 15:09:42 -05003426 int i;
Alex Eldercd892122012-07-03 16:01:19 -05003427
3428 rbd_assert(which < rbd_dev->header.snapc->num_snaps);
3429
Alex Eldercd892122012-07-03 16:01:19 -05003430 /* Skip over names until we find the one we are looking for */
3431
3432 snap_name = rbd_dev->header.snap_names;
Alex Elder6e584f52013-04-25 15:09:42 -05003433 for (i = 0; i < which; i++)
Alex Eldercd892122012-07-03 16:01:19 -05003434 snap_name += strlen(snap_name) + 1;
3435
Alex Elder6e584f52013-04-25 15:09:42 -05003436 snap_name = kstrdup(snap_name, GFP_KERNEL);
3437 if (!snap_name)
3438 return ERR_PTR(-ENOMEM);
3439
3440 *snap_size = rbd_dev->header.snap_sizes[which];
3441 *snap_features = 0; /* No features for v1 */
3442
Alex Eldercd892122012-07-03 16:01:19 -05003443 return snap_name;
3444}
3445
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003446/*
Alex Elder9d475de2012-07-03 16:01:19 -05003447 * Get the size and object order for an image snapshot, or if
3448 * snap_id is CEPH_NOSNAP, gets this information for the base
3449 * image.
3450 */
3451static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id,
3452 u8 *order, u64 *snap_size)
3453{
3454 __le64 snapid = cpu_to_le64(snap_id);
3455 int ret;
3456 struct {
3457 u8 order;
3458 __le64 size;
3459 } __attribute__ ((packed)) size_buf = { 0 };
3460
Alex Elder36be9a72013-01-19 00:30:28 -06003461 ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
Alex Elder9d475de2012-07-03 16:01:19 -05003462 "rbd", "get_size",
Alex Elder41579762013-04-21 12:14:45 -05003463 &snapid, sizeof (snapid),
Alex Eldere2a58ee2013-04-30 00:44:33 -05003464 &size_buf, sizeof (size_buf));
Alex Elder36be9a72013-01-19 00:30:28 -06003465 dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
Alex Elder9d475de2012-07-03 16:01:19 -05003466 if (ret < 0)
3467 return ret;
Alex Elder57385b52013-04-21 12:14:45 -05003468 if (ret < sizeof (size_buf))
3469 return -ERANGE;
Alex Elder9d475de2012-07-03 16:01:19 -05003470
Alex Elderc86f86e2013-04-25 15:09:41 -05003471 if (order)
3472 *order = size_buf.order;
Alex Elder9d475de2012-07-03 16:01:19 -05003473 *snap_size = le64_to_cpu(size_buf.size);
3474
3475 dout(" snap_id 0x%016llx order = %u, snap_size = %llu\n",
Alex Elder57385b52013-04-21 12:14:45 -05003476 (unsigned long long)snap_id, (unsigned int)*order,
3477 (unsigned long long)*snap_size);
Alex Elder9d475de2012-07-03 16:01:19 -05003478
3479 return 0;
3480}
3481
3482static int rbd_dev_v2_image_size(struct rbd_device *rbd_dev)
3483{
3484 return _rbd_dev_v2_snap_size(rbd_dev, CEPH_NOSNAP,
3485 &rbd_dev->header.obj_order,
3486 &rbd_dev->header.image_size);
3487}
3488
Alex Elder1e130192012-07-03 16:01:19 -05003489static int rbd_dev_v2_object_prefix(struct rbd_device *rbd_dev)
3490{
3491 void *reply_buf;
3492 int ret;
3493 void *p;
3494
3495 reply_buf = kzalloc(RBD_OBJ_PREFIX_LEN_MAX, GFP_KERNEL);
3496 if (!reply_buf)
3497 return -ENOMEM;
3498
Alex Elder36be9a72013-01-19 00:30:28 -06003499 ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
Alex Elder41579762013-04-21 12:14:45 -05003500 "rbd", "get_object_prefix", NULL, 0,
Alex Eldere2a58ee2013-04-30 00:44:33 -05003501 reply_buf, RBD_OBJ_PREFIX_LEN_MAX);
Alex Elder36be9a72013-01-19 00:30:28 -06003502 dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
Alex Elder1e130192012-07-03 16:01:19 -05003503 if (ret < 0)
3504 goto out;
3505
3506 p = reply_buf;
3507 rbd_dev->header.object_prefix = ceph_extract_encoded_string(&p,
Alex Elder57385b52013-04-21 12:14:45 -05003508 p + ret, NULL, GFP_NOIO);
3509 ret = 0;
Alex Elder1e130192012-07-03 16:01:19 -05003510
3511 if (IS_ERR(rbd_dev->header.object_prefix)) {
3512 ret = PTR_ERR(rbd_dev->header.object_prefix);
3513 rbd_dev->header.object_prefix = NULL;
3514 } else {
3515 dout(" object_prefix = %s\n", rbd_dev->header.object_prefix);
3516 }
Alex Elder1e130192012-07-03 16:01:19 -05003517out:
3518 kfree(reply_buf);
3519
3520 return ret;
3521}
3522
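/*
 * Get the feature bits for an image snapshot, or for the base
 * image if snap_id is CEPH_NOSNAP.  Returns -ENXIO if the image
 * uses incompatible features this client does not support.
 */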
Alex Elderb1b54022012-07-03 16:01:19 -05003523static int _rbd_dev_v2_snap_features(struct rbd_device *rbd_dev, u64 snap_id,
3524 u64 *snap_features)
3525{
3526 __le64 snapid = cpu_to_le64(snap_id);
3527 struct {
3528 __le64 features;
3529 __le64 incompat;
Alex Elder41579762013-04-21 12:14:45 -05003530 } __attribute__ ((packed)) features_buf = { 0 };
Alex Elderd8891402012-10-09 13:50:17 -07003531 u64 incompat;
Alex Elderb1b54022012-07-03 16:01:19 -05003532 int ret;
3533
Alex Elder36be9a72013-01-19 00:30:28 -06003534 ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
Alex Elderb1b54022012-07-03 16:01:19 -05003535 "rbd", "get_features",
Alex Elder41579762013-04-21 12:14:45 -05003536 &snapid, sizeof (snapid),
Alex Eldere2a58ee2013-04-30 00:44:33 -05003537 &features_buf, sizeof (features_buf));
Alex Elder36be9a72013-01-19 00:30:28 -06003538 dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
Alex Elderb1b54022012-07-03 16:01:19 -05003539 if (ret < 0)
3540 return ret;
Alex Elder57385b52013-04-21 12:14:45 -05003541 if (ret < sizeof (features_buf))
3542 return -ERANGE;
Alex Elderd8891402012-10-09 13:50:17 -07003543
3544 incompat = le64_to_cpu(features_buf.incompat);
Alex Elder5cbf6f122013-04-11 09:29:48 -05003545 if (incompat & ~RBD_FEATURES_SUPPORTED)
Alex Elderb8f5c6e2012-11-01 08:39:26 -05003546 return -ENXIO;
Alex Elderd8891402012-10-09 13:50:17 -07003547
Alex Elderb1b54022012-07-03 16:01:19 -05003548 *snap_features = le64_to_cpu(features_buf.features);
3549
3550 dout(" snap_id 0x%016llx features = 0x%016llx incompat = 0x%016llx\n",
Alex Elder57385b52013-04-21 12:14:45 -05003551 (unsigned long long)snap_id,
3552 (unsigned long long)*snap_features,
3553 (unsigned long long)le64_to_cpu(features_buf.incompat));
Alex Elderb1b54022012-07-03 16:01:19 -05003554
3555 return 0;
3556}
3557
3558static int rbd_dev_v2_features(struct rbd_device *rbd_dev)
3559{
3560 return _rbd_dev_v2_snap_features(rbd_dev, CEPH_NOSNAP,
3561 &rbd_dev->header.features);
3562}
3563
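/*
 * Fetch and decode the layering information for an image: the
 * parent spec (pool, image, and snapshot ids) and the overlap
 * with the parent.  An image with no parent is not an error.
 */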
Alex Elder86b00e02012-10-25 23:34:42 -05003564static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev)
3565{
3566 struct rbd_spec *parent_spec;
3567 size_t size;
3568 void *reply_buf = NULL;
3569 __le64 snapid;
3570 void *p;
3571 void *end;
3572 char *image_id;
3573 u64 overlap;
Alex Elder86b00e02012-10-25 23:34:42 -05003574 int ret;
3575
3576 parent_spec = rbd_spec_alloc();
3577 if (!parent_spec)
3578 return -ENOMEM;
3579
3580 size = sizeof (__le64) + /* pool_id */
3581 sizeof (__le32) + RBD_IMAGE_ID_LEN_MAX + /* image_id */
3582 sizeof (__le64) + /* snap_id */
3583 sizeof (__le64); /* overlap */
3584 reply_buf = kmalloc(size, GFP_KERNEL);
3585 if (!reply_buf) {
3586 ret = -ENOMEM;
3587 goto out_err;
3588 }
3589
3590 snapid = cpu_to_le64(CEPH_NOSNAP);
Alex Elder36be9a72013-01-19 00:30:28 -06003591 ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
Alex Elder86b00e02012-10-25 23:34:42 -05003592 "rbd", "get_parent",
Alex Elder41579762013-04-21 12:14:45 -05003593 &snapid, sizeof (snapid),
Alex Eldere2a58ee2013-04-30 00:44:33 -05003594 reply_buf, size);
Alex Elder36be9a72013-01-19 00:30:28 -06003595 dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
Alex Elder86b00e02012-10-25 23:34:42 -05003596 if (ret < 0)
3597 goto out_err;
3598
Alex Elder86b00e02012-10-25 23:34:42 -05003599 p = reply_buf;
Alex Elder57385b52013-04-21 12:14:45 -05003600 end = reply_buf + ret;
3601 ret = -ERANGE;
Alex Elder86b00e02012-10-25 23:34:42 -05003602 ceph_decode_64_safe(&p, end, parent_spec->pool_id, out_err);
3603 if (parent_spec->pool_id == CEPH_NOPOOL)
3604 goto out; /* No parent? No problem. */
3605
Alex Elder0903e872012-11-14 12:25:19 -06003606 /* The ceph file layout needs to fit pool id in 32 bits */
3607
3608 ret = -EIO;
Alex Elderc0cd10db2013-04-26 09:43:47 -05003609 if (parent_spec->pool_id > (u64)U32_MAX) {
3610 rbd_warn(NULL, "parent pool id too large (%llu > %u)\n",
3611 (unsigned long long)parent_spec->pool_id, U32_MAX);
Alex Elder57385b52013-04-21 12:14:45 -05003612 goto out_err;
Alex Elderc0cd10db2013-04-26 09:43:47 -05003613 }
Alex Elder0903e872012-11-14 12:25:19 -06003614
Alex Elder979ed482012-11-01 08:39:26 -05003615 image_id = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL);
Alex Elder86b00e02012-10-25 23:34:42 -05003616 if (IS_ERR(image_id)) {
3617 ret = PTR_ERR(image_id);
3618 goto out_err;
3619 }
3620 parent_spec->image_id = image_id;
3621 ceph_decode_64_safe(&p, end, parent_spec->snap_id, out_err);
3622 ceph_decode_64_safe(&p, end, overlap, out_err);
3623
3624 rbd_dev->parent_overlap = overlap;
3625 rbd_dev->parent_spec = parent_spec;
3626 parent_spec = NULL; /* rbd_dev now owns this */
3627out:
3628 ret = 0;
3629out_err:
3630 kfree(reply_buf);
3631 rbd_spec_put(parent_spec);
3632
3633 return ret;
3634}
3635
Alex Eldercc070d52013-04-21 12:14:45 -05003636static int rbd_dev_v2_striping_info(struct rbd_device *rbd_dev)
3637{
3638 struct {
3639 __le64 stripe_unit;
3640 __le64 stripe_count;
3641 } __attribute__ ((packed)) striping_info_buf = { 0 };
3642 size_t size = sizeof (striping_info_buf);
3643 void *p;
3644 u64 obj_size;
3645 u64 stripe_unit;
3646 u64 stripe_count;
3647 int ret;
3648
3649 ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
3650 "rbd", "get_stripe_unit_count", NULL, 0,
Alex Eldere2a58ee2013-04-30 00:44:33 -05003651 (char *)&striping_info_buf, size);
Alex Eldercc070d52013-04-21 12:14:45 -05003652 dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
3653 if (ret < 0)
3654 return ret;
3655 if (ret < size)
3656 return -ERANGE;
3657
3658 /*
3659 * We don't actually support the "fancy striping" feature
3660 * (STRIPINGV2) yet, but if the striping sizes are the
3661 * defaults the behavior is the same as before. So find
3662 * out, and only fail if the image has non-default values.
3663 */
3664 ret = -EINVAL;
3665 obj_size = (u64)1 << rbd_dev->header.obj_order;
3666 p = &striping_info_buf;
3667 stripe_unit = ceph_decode_64(&p);
3668 if (stripe_unit != obj_size) {
3669 rbd_warn(rbd_dev, "unsupported stripe unit "
3670 "(got %llu want %llu)",
3671 stripe_unit, obj_size);
3672 return -EINVAL;
3673 }
3674 stripe_count = ceph_decode_64(&p);
3675 if (stripe_count != 1) {
3676 rbd_warn(rbd_dev, "unsupported stripe count "
3677 "(got %llu want 1)", stripe_count);
3678 return -EINVAL;
3679 }
Alex Elder500d0c02013-04-26 09:43:47 -05003680 rbd_dev->header.stripe_unit = stripe_unit;
3681 rbd_dev->header.stripe_count = stripe_count;
Alex Eldercc070d52013-04-21 12:14:45 -05003682
3683 return 0;
3684}
3685
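/*
 * Look up the name for the image id in the pool's rbd directory
 * object.  Returns a dynamically-allocated image name, or null
 * on any failure; callers tolerate a missing name.
 */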
Alex Elder9e15b772012-10-30 19:40:33 -05003686static char *rbd_dev_image_name(struct rbd_device *rbd_dev)
3687{
3688 size_t image_id_size;
3689 char *image_id;
3690 void *p;
3691 void *end;
3692 size_t size;
3693 void *reply_buf = NULL;
3694 size_t len = 0;
3695 char *image_name = NULL;
3696 int ret;
3697
3698 rbd_assert(!rbd_dev->spec->image_name);
3699
Alex Elder69e7a022012-11-01 08:39:26 -05003700 len = strlen(rbd_dev->spec->image_id);
3701 image_id_size = sizeof (__le32) + len;
Alex Elder9e15b772012-10-30 19:40:33 -05003702 image_id = kmalloc(image_id_size, GFP_KERNEL);
3703 if (!image_id)
3704 return NULL;
3705
3706 p = image_id;
Alex Elder41579762013-04-21 12:14:45 -05003707 end = image_id + image_id_size;
Alex Elder57385b52013-04-21 12:14:45 -05003708 ceph_encode_string(&p, end, rbd_dev->spec->image_id, (u32)len);
Alex Elder9e15b772012-10-30 19:40:33 -05003709
3710 size = sizeof (__le32) + RBD_IMAGE_NAME_LEN_MAX;
3711 reply_buf = kmalloc(size, GFP_KERNEL);
3712 if (!reply_buf)
3713 goto out;
3714
Alex Elder36be9a72013-01-19 00:30:28 -06003715 ret = rbd_obj_method_sync(rbd_dev, RBD_DIRECTORY,
Alex Elder9e15b772012-10-30 19:40:33 -05003716 "rbd", "dir_get_name",
3717 image_id, image_id_size,
Alex Eldere2a58ee2013-04-30 00:44:33 -05003718 reply_buf, size);
Alex Elder9e15b772012-10-30 19:40:33 -05003719 if (ret < 0)
3720 goto out;
3721 p = reply_buf;
Alex Elderf40eb342013-04-25 15:09:42 -05003722 end = reply_buf + ret;
3723
Alex Elder9e15b772012-10-30 19:40:33 -05003724 image_name = ceph_extract_encoded_string(&p, end, &len, GFP_KERNEL);
3725 if (IS_ERR(image_name))
3726 image_name = NULL;
3727 else
3728 dout("%s: name is %s len is %zd\n", __func__, image_name, len);
3729out:
3730 kfree(reply_buf);
3731 kfree(image_id);
3732
3733 return image_name;
3734}
3735
3736/*
Alex Elder2e9f7f12013-04-26 09:43:48 -05003737 * When an rbd image has a parent image, it is identified by the
3738 * pool, image, and snapshot ids (not names). This function fills
3739 * in the names for those ids. (It's OK if we can't figure out the
3740 * name for an image id, but the pool and snapshot ids should always
3741 * exist and have names.) All names in an rbd spec are dynamically
3742 * allocated.
Alex Eldere1d42132013-04-25 23:15:08 -05003743 *
3744 * When an image being mapped (not a parent) is probed, we have the
3745 * pool name and pool id, image name and image id, and the snapshot
3746 * name. The only thing we're missing is the snapshot id.
Alex Elder2e9f7f12013-04-26 09:43:48 -05003747 *
3748 * The set of snapshots for an image is not known until they have
3749 * been read by rbd_dev_snaps_update(), so we can't completely fill
3750 * in this information until after that has been called.
Alex Elder9e15b772012-10-30 19:40:33 -05003751 */
Alex Elder2e9f7f12013-04-26 09:43:48 -05003752static int rbd_dev_spec_update(struct rbd_device *rbd_dev)
Alex Elder9e15b772012-10-30 19:40:33 -05003753{
Alex Elder2e9f7f12013-04-26 09:43:48 -05003754 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
3755 struct rbd_spec *spec = rbd_dev->spec;
3756 const char *pool_name;
3757 const char *image_name;
3758 const char *snap_name;
Alex Elder9e15b772012-10-30 19:40:33 -05003759 int ret;
3760
Alex Eldere1d42132013-04-25 23:15:08 -05003761 /*
3762 * An image being mapped will have the pool name (etc.), but
3763 * we need to look up the snapshot id.
3764 */
Alex Elder2e9f7f12013-04-26 09:43:48 -05003765 if (spec->pool_name) {
3766 if (strcmp(spec->snap_name, RBD_SNAP_HEAD_NAME)) {
Alex Eldere1d42132013-04-25 23:15:08 -05003767 struct rbd_snap *snap;
3768
Alex Elder2e9f7f12013-04-26 09:43:48 -05003769 snap = snap_by_name(rbd_dev, spec->snap_name);
Alex Eldere1d42132013-04-25 23:15:08 -05003770 if (!snap)
3771 return -ENOENT;
Alex Elder2e9f7f12013-04-26 09:43:48 -05003772 spec->snap_id = snap->id;
Alex Eldere1d42132013-04-25 23:15:08 -05003773 } else {
Alex Elder2e9f7f12013-04-26 09:43:48 -05003774 spec->snap_id = CEPH_NOSNAP;
Alex Eldere1d42132013-04-25 23:15:08 -05003775 }
3776
3777 return 0;
3778 }
Alex Elder9e15b772012-10-30 19:40:33 -05003779
Alex Elder2e9f7f12013-04-26 09:43:48 -05003780 /* Get the pool name; we have to make our own copy of this */
Alex Elder9e15b772012-10-30 19:40:33 -05003781
Alex Elder2e9f7f12013-04-26 09:43:48 -05003782 pool_name = ceph_pg_pool_name_by_id(osdc->osdmap, spec->pool_id);
3783 if (!pool_name) {
3784 rbd_warn(rbd_dev, "no pool with id %llu", spec->pool_id);
Alex Elder935dc892012-11-01 10:17:15 -05003785 return -EIO;
3786 }
Alex Elder2e9f7f12013-04-26 09:43:48 -05003787 pool_name = kstrdup(pool_name, GFP_KERNEL);
3788 if (!pool_name)
Alex Elder9e15b772012-10-30 19:40:33 -05003789 return -ENOMEM;
3790
3791 /* Fetch the image name; tolerate failure here */
3792
Alex Elder2e9f7f12013-04-26 09:43:48 -05003793 image_name = rbd_dev_image_name(rbd_dev);
3794 if (!image_name)
Alex Elder06ecc6c2012-11-01 10:17:15 -05003795 rbd_warn(rbd_dev, "unable to get image name");
Alex Elder9e15b772012-10-30 19:40:33 -05003796
Alex Elder2e9f7f12013-04-26 09:43:48 -05003797 /* Look up the snapshot name, and make a copy */
Alex Elder9e15b772012-10-30 19:40:33 -05003798
Alex Elder2e9f7f12013-04-26 09:43:48 -05003799 snap_name = rbd_snap_name(rbd_dev, spec->snap_id);
3800 if (!snap_name) {
3801 rbd_warn(rbd_dev, "no snapshot with id %llu", spec->snap_id);
Alex Elder9e15b772012-10-30 19:40:33 -05003802 ret = -EIO;
3803 goto out_err;
3804 }
Alex Elder2e9f7f12013-04-26 09:43:48 -05003805 snap_name = kstrdup(snap_name, GFP_KERNEL);
3806 if (!snap_name) {
3807 ret = -ENOMEM;
Alex Elder9e15b772012-10-30 19:40:33 -05003808 goto out_err;
Alex Elder2e9f7f12013-04-26 09:43:48 -05003809 }
3810
3811 spec->pool_name = pool_name;
3812 spec->image_name = image_name;
3813 spec->snap_name = snap_name;
Alex Elder9e15b772012-10-30 19:40:33 -05003814
3815 return 0;
3816out_err:
Alex Elder2e9f7f12013-04-26 09:43:48 -05003817 kfree(image_name);
3818 kfree(pool_name);
Alex Elder9e15b772012-10-30 19:40:33 -05003819
3820 return ret;
3821}
3822
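/*
 * Fetch and decode the image's snapshot context--the maximum
 * snapshot id ("seq") and the array of snapshot ids--using the
 * "get_snapcontext" object method.
 */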
Alex Eldercc4a38bd2013-04-30 00:44:33 -05003823static int rbd_dev_v2_snap_context(struct rbd_device *rbd_dev)
Alex Elder35d489f2012-07-03 16:01:19 -05003824{
3825 size_t size;
3826 int ret;
3827 void *reply_buf;
3828 void *p;
3829 void *end;
3830 u64 seq;
3831 u32 snap_count;
3832 struct ceph_snap_context *snapc;
3833 u32 i;
3834
3835 /*
3836 * We'll need room for the seq value (maximum snapshot id),
3837 * snapshot count, and array of that many snapshot ids.
3838 * For now we have a fixed upper limit on the number we're
3839 * prepared to receive.
3840 */
3841 size = sizeof (__le64) + sizeof (__le32) +
3842 RBD_MAX_SNAP_COUNT * sizeof (__le64);
3843 reply_buf = kzalloc(size, GFP_KERNEL);
3844 if (!reply_buf)
3845 return -ENOMEM;
3846
Alex Elder36be9a72013-01-19 00:30:28 -06003847 ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
Alex Elder41579762013-04-21 12:14:45 -05003848 "rbd", "get_snapcontext", NULL, 0,
Alex Eldere2a58ee2013-04-30 00:44:33 -05003849 reply_buf, size);
Alex Elder36be9a72013-01-19 00:30:28 -06003850 dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
Alex Elder35d489f2012-07-03 16:01:19 -05003851 if (ret < 0)
3852 goto out;
3853
Alex Elder35d489f2012-07-03 16:01:19 -05003854 p = reply_buf;
Alex Elder57385b52013-04-21 12:14:45 -05003855 end = reply_buf + ret;
3856 ret = -ERANGE;
Alex Elder35d489f2012-07-03 16:01:19 -05003857 ceph_decode_64_safe(&p, end, seq, out);
3858 ceph_decode_32_safe(&p, end, snap_count, out);
3859
3860 /*
3861 * Make sure the reported number of snapshot ids wouldn't go
3862 * beyond the end of our buffer. But before checking that,
3863 * make sure the computed size of the snapshot context we
3864 * allocate is representable in a size_t.
3865 */
3866 if (snap_count > (SIZE_MAX - sizeof (struct ceph_snap_context))
3867 / sizeof (u64)) {
3868 ret = -EINVAL;
3869 goto out;
3870 }
3871 if (!ceph_has_room(&p, end, snap_count * sizeof (__le64)))
3872 goto out;
Alex Elder468521c2013-04-26 09:43:47 -05003873 ret = 0;
Alex Elder35d489f2012-07-03 16:01:19 -05003874
Alex Elder812164f82013-04-30 00:44:32 -05003875 snapc = ceph_create_snap_context(snap_count, GFP_KERNEL);
Alex Elder35d489f2012-07-03 16:01:19 -05003876 if (!snapc) {
3877 ret = -ENOMEM;
3878 goto out;
3879 }
Alex Elder35d489f2012-07-03 16:01:19 -05003880 snapc->seq = seq;
Alex Elder35d489f2012-07-03 16:01:19 -05003881 for (i = 0; i < snap_count; i++)
3882 snapc->snaps[i] = ceph_decode_64(&p);
3883
3884 rbd_dev->header.snapc = snapc;
3885
3886 dout(" snap context seq = %llu, snap_count = %u\n",
Alex Elder57385b52013-04-21 12:14:45 -05003887 (unsigned long long)seq, (unsigned int)snap_count);
Alex Elder35d489f2012-07-03 16:01:19 -05003888out:
3889 kfree(reply_buf);
3890
Alex Elder57385b52013-04-21 12:14:45 -05003891 return ret;
Alex Elder35d489f2012-07-03 16:01:19 -05003892}
3893
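/*
 * Fetch a single snapshot's name using the "get_snapshot_name"
 * object method.  Returns a dynamically-allocated name, or a
 * pointer-coded error.
 */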
Alex Eldercb752232013-04-30 00:44:33 -05003894static const char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev, u32 which)
Alex Elderb8b1e2d2012-07-03 16:01:19 -05003895{
3896 size_t size;
3897 void *reply_buf;
3898 __le64 snap_id;
3899 int ret;
3900 void *p;
3901 void *end;
Alex Elderb8b1e2d2012-07-03 16:01:19 -05003902 char *snap_name;
3903
3904 size = sizeof (__le32) + RBD_MAX_SNAP_NAME_LEN;
3905 reply_buf = kmalloc(size, GFP_KERNEL);
3906 if (!reply_buf)
3907 return ERR_PTR(-ENOMEM);
3908
Alex Elderacb1b6c2013-04-25 15:09:41 -05003909 rbd_assert(which < rbd_dev->header.snapc->num_snaps);
Alex Elderb8b1e2d2012-07-03 16:01:19 -05003910 snap_id = cpu_to_le64(rbd_dev->header.snapc->snaps[which]);
Alex Elder36be9a72013-01-19 00:30:28 -06003911 ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
Alex Elderb8b1e2d2012-07-03 16:01:19 -05003912 "rbd", "get_snapshot_name",
Alex Elder41579762013-04-21 12:14:45 -05003913 &snap_id, sizeof (snap_id),
Alex Eldere2a58ee2013-04-30 00:44:33 -05003914 reply_buf, size);
Alex Elder36be9a72013-01-19 00:30:28 -06003915 dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
Alex Elderf40eb342013-04-25 15:09:42 -05003916 if (ret < 0) {
3917 snap_name = ERR_PTR(ret);
Alex Elderb8b1e2d2012-07-03 16:01:19 -05003918 goto out;
Alex Elderf40eb342013-04-25 15:09:42 -05003919 }
Alex Elderb8b1e2d2012-07-03 16:01:19 -05003920
3921 p = reply_buf;
Alex Elderf40eb342013-04-25 15:09:42 -05003922 end = reply_buf + ret;
Alex Eldere5c35532012-10-25 23:34:41 -05003923 snap_name = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL);
Alex Elderf40eb342013-04-25 15:09:42 -05003924 if (IS_ERR(snap_name))
Alex Elderb8b1e2d2012-07-03 16:01:19 -05003925 goto out;
Alex Elderb8b1e2d2012-07-03 16:01:19 -05003926
Alex Elderf40eb342013-04-25 15:09:42 -05003927 dout(" snap_id 0x%016llx snap_name = %s\n",
3928 (unsigned long long)le64_to_cpu(snap_id), snap_name);
Alex Elderb8b1e2d2012-07-03 16:01:19 -05003929out:
3930 kfree(reply_buf);
3931
Alex Elderf40eb342013-04-25 15:09:42 -05003932 return snap_name;
Alex Elderb8b1e2d2012-07-03 16:01:19 -05003933}
3934
Alex Eldercb752232013-04-30 00:44:33 -05003935static const char *rbd_dev_v2_snap_info(struct rbd_device *rbd_dev, u32 which,
Alex Elderb8b1e2d2012-07-03 16:01:19 -05003936 u64 *snap_size, u64 *snap_features)
3937{
Alex Eldere0b49862013-01-09 14:44:18 -06003938 u64 snap_id;
Alex Elderacb1b6c2013-04-25 15:09:41 -05003939 u64 size;
3940 u64 features;
Alex Eldercb752232013-04-30 00:44:33 -05003941 const char *snap_name;
Alex Elderb8b1e2d2012-07-03 16:01:19 -05003942 int ret;
3943
Alex Elderacb1b6c2013-04-25 15:09:41 -05003944 rbd_assert(which < rbd_dev->header.snapc->num_snaps);
Alex Elderb8b1e2d2012-07-03 16:01:19 -05003945 snap_id = rbd_dev->header.snapc->snaps[which];
Alex Elderacb1b6c2013-04-25 15:09:41 -05003946 ret = _rbd_dev_v2_snap_size(rbd_dev, snap_id, NULL, &size);
Alex Elderb8b1e2d2012-07-03 16:01:19 -05003947 if (ret)
Alex Elderacb1b6c2013-04-25 15:09:41 -05003948 goto out_err;
Alex Elderb8b1e2d2012-07-03 16:01:19 -05003949
Alex Elderacb1b6c2013-04-25 15:09:41 -05003950 ret = _rbd_dev_v2_snap_features(rbd_dev, snap_id, &features);
3951 if (ret)
3952 goto out_err;
3953
3954 snap_name = rbd_dev_v2_snap_name(rbd_dev, which);
3955 if (!IS_ERR(snap_name)) {
3956 *snap_size = size;
3957 *snap_features = features;
3958 }
3959
3960 return snap_name;
3961out_err:
3962 return ERR_PTR(ret);
Alex Elderb8b1e2d2012-07-03 16:01:19 -05003963}
3964
Alex Eldercb752232013-04-30 00:44:33 -05003965static const char *rbd_dev_snap_info(struct rbd_device *rbd_dev, u32 which,
Alex Elderb8b1e2d2012-07-03 16:01:19 -05003966 u64 *snap_size, u64 *snap_features)
3967{
3968 if (rbd_dev->image_format == 1)
3969 return rbd_dev_v1_snap_info(rbd_dev, which,
3970 snap_size, snap_features);
3971 if (rbd_dev->image_format == 2)
3972 return rbd_dev_v2_snap_info(rbd_dev, which,
3973 snap_size, snap_features);
3974 return ERR_PTR(-EINVAL);
3975}
3976
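/*
 * Re-read the size and snapshot context for a format 2 image,
 * then update the in-core snapshot list to match.
 */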
Alex Eldercc4a38bd2013-04-30 00:44:33 -05003977static int rbd_dev_v2_refresh(struct rbd_device *rbd_dev)
Alex Elder117973f2012-08-31 17:29:55 -05003978{
3979 int ret;
Alex Elder117973f2012-08-31 17:29:55 -05003980
3981 down_write(&rbd_dev->header_rwsem);
3982
Alex Elder117973f2012-08-31 17:29:55 -05003983 ret = rbd_dev_v2_image_size(rbd_dev);
3984 if (ret)
3985 goto out;
Alex Elder117973f2012-08-31 17:29:55 -05003986 rbd_update_mapping_size(rbd_dev);
3987
Alex Eldercc4a38bd2013-04-30 00:44:33 -05003988 ret = rbd_dev_v2_snap_context(rbd_dev);
Alex Elder117973f2012-08-31 17:29:55 -05003989 dout("rbd_dev_v2_snap_context returned %d\n", ret);
3990 if (ret)
3991 goto out;
3992 ret = rbd_dev_snaps_update(rbd_dev);
3993 dout("rbd_dev_snaps_update returned %d\n", ret);
3994 if (ret)
3995 goto out;
Alex Elder117973f2012-08-31 17:29:55 -05003996out:
3997 up_write(&rbd_dev->header_rwsem);
3998
3999 return ret;
4000}

/*
 * Scan the rbd device's current snapshot list and compare it to the
 * newly-received snapshot context.  Remove any existing snapshots
 * not present in the new snapshot context.  Add a new snapshot for
 * any snapshots in the snapshot context not in the current list.
 * And verify there are no changes to snapshots we already know
 * about.
 *
 * Assumes the snapshots in the snapshot context are sorted by
 * snapshot id, highest id first.  (Snapshots in the rbd_dev's list
 * are also maintained in that order.)
 *
 * Note that any error that occurs while updating the snapshot list
 * aborts the update, and the entire list is cleared.  The snapshot
 * list becomes inconsistent at that point anyway, so it might as
 * well be empty.
 */
static int rbd_dev_snaps_update(struct rbd_device *rbd_dev)
{
	struct ceph_snap_context *snapc = rbd_dev->header.snapc;
	const u32 snap_count = snapc->num_snaps;
	struct list_head *head = &rbd_dev->snaps;
	struct list_head *links = head->next;
	u32 index = 0;
	int ret = 0;

	dout("%s: snap count is %u\n", __func__, (unsigned int)snap_count);
	while (index < snap_count || links != head) {
		u64 snap_id;
		struct rbd_snap *snap;
		const char *snap_name;
		u64 snap_size = 0;
		u64 snap_features = 0;

		snap_id = index < snap_count ? snapc->snaps[index]
					     : CEPH_NOSNAP;
		snap = links != head ? list_entry(links, struct rbd_snap, node)
				     : NULL;
		rbd_assert(!snap || snap->id != CEPH_NOSNAP);

		if (snap_id == CEPH_NOSNAP || (snap && snap->id > snap_id)) {
			struct list_head *next = links->next;

			/*
			 * A previously-existing snapshot is not in
			 * the new snap context.
			 *
			 * If the now-missing snapshot is the one
			 * the image represents, clear its existence
			 * flag so we can avoid sending any more
			 * requests to it.
			 */
			if (rbd_dev->spec->snap_id == snap->id)
				clear_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags);
			dout("removing %ssnap id %llu\n",
				rbd_dev->spec->snap_id == snap->id ?
							"mapped " : "",
				(unsigned long long)snap->id);

			list_del(&snap->node);
			rbd_snap_destroy(snap);

			/* Done with this list entry; advance */

			links = next;
			continue;
		}

		snap_name = rbd_dev_snap_info(rbd_dev, index,
					&snap_size, &snap_features);
		if (IS_ERR(snap_name)) {
			ret = PTR_ERR(snap_name);
			dout("failed to get snap info, error %d\n", ret);
			goto out_err;
		}

		dout("entry %u: snap_id = %llu\n", (unsigned int)index,
			(unsigned long long)snap_id);
		if (!snap || (snap_id != CEPH_NOSNAP && snap->id < snap_id)) {
			struct rbd_snap *new_snap;

			/* We haven't seen this snapshot before */

			new_snap = rbd_snap_create(rbd_dev, snap_name,
					snap_id, snap_size, snap_features);
			if (IS_ERR(new_snap)) {
				ret = PTR_ERR(new_snap);
				dout(" failed to add dev, error %d\n", ret);
				goto out_err;
			}

			/* New goes before existing, or at end of list */

			dout(" added dev%s\n", snap ? "" : " at end");
			if (snap)
				list_add_tail(&new_snap->node, &snap->node);
			else
				list_add_tail(&new_snap->node, head);
		} else {
			/* Already have this one */

			dout(" already present\n");

			rbd_assert(snap->size == snap_size);
			rbd_assert(!strcmp(snap->name, snap_name));
			rbd_assert(snap->features == snap_features);

			/* Done with this list entry; advance */

			links = links->next;
		}

		/* Advance to the next entry in the snapshot context */

		index++;
	}
	dout("%s: done\n", __func__);

	return 0;
out_err:
	rbd_remove_all_snaps(rbd_dev);

	return ret;
}
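
/*
 * Worked example of the merge above (snapshot ids are hypothetical,
 * not from the driver): if the device's list holds snapshots
 * {5, 3, 1} and the new snapshot context holds {5, 4, 1}, the loop
 * keeps 5 (present in both), deletes 3 (absent from the context),
 * creates 4 (new, inserted before 1 to preserve highest-id-first
 * order), and keeps 1.  A mismatch in size, name, or features for a
 * snapshot present in both lists trips the rbd_assert() calls
 * instead.
 */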

static int rbd_bus_add_dev(struct rbd_device *rbd_dev)
{
	struct device *dev;
	int ret;

	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);

	dev = &rbd_dev->dev;
	dev->bus = &rbd_bus_type;
	dev->type = &rbd_device_type;
	dev->parent = &rbd_root_dev;
	dev->release = rbd_dev_device_release;
	dev_set_name(dev, "%d", rbd_dev->dev_id);
	ret = device_register(dev);

	mutex_unlock(&ctl_mutex);

	return ret;
}

static void rbd_bus_del_dev(struct rbd_device *rbd_dev)
{
	device_unregister(&rbd_dev->dev);
}

static atomic64_t rbd_dev_id_max = ATOMIC64_INIT(0);

/*
 * Get a unique rbd identifier for the given new rbd_dev, and add
 * the rbd_dev to the global list.  The minimum rbd id is 1.
 */
static void rbd_dev_id_get(struct rbd_device *rbd_dev)
{
	rbd_dev->dev_id = atomic64_inc_return(&rbd_dev_id_max);

	spin_lock(&rbd_dev_list_lock);
	list_add_tail(&rbd_dev->node, &rbd_dev_list);
	spin_unlock(&rbd_dev_list_lock);
	dout("rbd_dev %p given dev id %llu\n", rbd_dev,
		(unsigned long long) rbd_dev->dev_id);
}

/*
 * Remove an rbd_dev from the global list, and record that its
 * identifier is no longer in use.
 */
static void rbd_dev_id_put(struct rbd_device *rbd_dev)
{
	struct list_head *tmp;
	int rbd_id = rbd_dev->dev_id;
	int max_id;

	rbd_assert(rbd_id > 0);

	dout("rbd_dev %p released dev id %llu\n", rbd_dev,
		(unsigned long long) rbd_dev->dev_id);
	spin_lock(&rbd_dev_list_lock);
	list_del_init(&rbd_dev->node);

	/*
	 * If the id being "put" is not the current maximum, there
	 * is nothing special we need to do.
	 */
	if (rbd_id != atomic64_read(&rbd_dev_id_max)) {
		spin_unlock(&rbd_dev_list_lock);
		return;
	}

	/*
	 * We need to update the current maximum id.  Search the
	 * list to find out what it is.  We're more likely to find
	 * the maximum at the end, so search the list backward.
	 */
	max_id = 0;
	list_for_each_prev(tmp, &rbd_dev_list) {
		struct rbd_device *rbd_dev;

		rbd_dev = list_entry(tmp, struct rbd_device, node);
		if (rbd_dev->dev_id > max_id)
			max_id = rbd_dev->dev_id;
	}
	spin_unlock(&rbd_dev_list_lock);

	/*
	 * The max id could have been updated by rbd_dev_id_get(), in
	 * which case it now accurately reflects the new maximum.
	 * Be careful not to overwrite the maximum value in that
	 * case.
	 */
	atomic64_cmpxchg(&rbd_dev_id_max, rbd_id, max_id);
	dout(" max dev id has been reset\n");
}
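
/*
 * Example of the race the cmpxchg above tolerates (ids are
 * hypothetical): devices {1, 2, 3} exist and id 3 is put.  The
 * backward scan finds max_id == 2, but if a concurrent
 * rbd_dev_id_get() has already advanced rbd_dev_id_max from 3 to 4,
 * the cmpxchg (expecting 3) fails and the newer maximum 4 is left
 * intact.
 */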

/*
 * Skips over white space at *buf, and updates *buf to point to the
 * first found non-space character (if any).  Returns the length of
 * the token (string of non-white space characters) found.  Note
 * that *buf must be terminated with '\0'.
 */
static inline size_t next_token(const char **buf)
{
	/*
	 * These are the characters that produce nonzero for
	 * isspace() in the "C" and "POSIX" locales.
	 */
	const char *spaces = " \f\n\r\t\v";

	*buf += strspn(*buf, spaces);	/* Find start of token */

	return strcspn(*buf, spaces);	/* Return token length */
}

/*
 * Finds the next token in *buf, and if the provided token buffer is
 * big enough, copies the found token into it.  The result, if
 * copied, is guaranteed to be terminated with '\0'.  Note that *buf
 * must be terminated with '\0' on entry.
 *
 * Returns the length of the token found (not including the '\0').
 * Return value will be 0 if no token is found, and it will be >=
 * token_size if the token would not fit.
 *
 * The *buf pointer will be updated to point beyond the end of the
 * found token.  Note that this occurs even if the token buffer is
 * too small to hold it.
 */
static inline size_t copy_token(const char **buf,
				char *token,
				size_t token_size)
{
	size_t len;

	len = next_token(buf);
	if (len < token_size) {
		memcpy(token, *buf, len);
		*(token + len) = '\0';
	}
	*buf += len;

	return len;
}

/*
 * Finds the next token in *buf, dynamically allocates a buffer big
 * enough to hold a copy of it, and copies the token into the new
 * buffer.  The copy is guaranteed to be terminated with '\0'.  Note
 * that a duplicate buffer is created even for a zero-length token.
 *
 * Returns a pointer to the newly-allocated duplicate, or a null
 * pointer if memory for the duplicate was not available.  If
 * the lenp argument is a non-null pointer, the length of the token
 * (not including the '\0') is returned in *lenp.
 *
 * If successful, the *buf pointer will be updated to point beyond
 * the end of the found token.
 *
 * Note: uses GFP_KERNEL for allocation.
 */
static inline char *dup_token(const char **buf, size_t *lenp)
{
	char *dup;
	size_t len;

	len = next_token(buf);
	dup = kmemdup(*buf, len + 1, GFP_KERNEL);
	if (!dup)
		return NULL;
	*(dup + len) = '\0';
	*buf += len;

	if (lenp)
		*lenp = len;

	return dup;
}
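
/*
 * Usage sketch for the token helpers above (input is hypothetical):
 *
 *	const char *buf = "  rbd myimage";
 *	char *pool = dup_token(&buf, NULL);	// "rbd"; buf -> " myimage"
 *	char *image = dup_token(&buf, NULL);	// "myimage"; buf -> ""
 *
 * Each duplicate is allocated with GFP_KERNEL and must be released
 * with kfree().
 */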

/*
 * Parse the options provided for an "rbd add" (i.e., rbd image
 * mapping) request.  These arrive via a write to /sys/bus/rbd/add,
 * and the data written is passed here via a NUL-terminated buffer.
 * Returns 0 if successful or an error code otherwise.
 *
 * The information extracted from these options is recorded in
 * the other parameters which return dynamically-allocated
 * structures:
 *  ceph_opts
 *      The address of a pointer that will refer to a ceph options
 *      structure.  Caller must release the returned pointer using
 *      ceph_destroy_options() when it is no longer needed.
 *  rbd_opts
 *      Address of an rbd options pointer.  Fully initialized by
 *      this function; caller must release with kfree().
 *  spec
 *      Address of an rbd image specification pointer.  Fully
 *      initialized by this function based on parsed options.
 *      Caller must release with rbd_spec_put().
 *
 * The options passed take this form:
 *  <mon_addrs> <options> <pool_name> <image_name> [<snap_name>]
 * where:
 *  <mon_addrs>
 *      A comma-separated list of one or more monitor addresses.
 *      A monitor address is an ip address, optionally followed
 *      by a port number (separated by a colon).
 *      I.e.:  ip1[:port1][,ip2[:port2]...]
 *  <options>
 *      A comma-separated list of ceph and/or rbd options.
 *  <pool_name>
 *      The name of the rados pool containing the rbd image.
 *  <image_name>
 *      The name of the image in that pool to map.
 *  <snap_name>
 *      An optional snapshot name.  If provided, the mapping will
 *      present data from the image at the time that snapshot was
 *      created.  The image head is used if no snapshot name is
 *      provided.  Snapshot mappings are always read-only.
 */
static int rbd_add_parse_args(const char *buf,
				struct ceph_options **ceph_opts,
				struct rbd_options **opts,
				struct rbd_spec **rbd_spec)
{
	size_t len;
	char *options;
	const char *mon_addrs;
	char *snap_name;
	size_t mon_addrs_size;
	struct rbd_spec *spec = NULL;
	struct rbd_options *rbd_opts = NULL;
	struct ceph_options *copts;
	int ret;

	/* The first four tokens are required */

	len = next_token(&buf);
	if (!len) {
		rbd_warn(NULL, "no monitor address(es) provided");
		return -EINVAL;
	}
	mon_addrs = buf;
	mon_addrs_size = len + 1;
	buf += len;

	ret = -EINVAL;
	options = dup_token(&buf, NULL);
	if (!options)
		return -ENOMEM;
	if (!*options) {
		rbd_warn(NULL, "no options provided");
		goto out_err;
	}

	spec = rbd_spec_alloc();
	if (!spec)
		goto out_mem;

	spec->pool_name = dup_token(&buf, NULL);
	if (!spec->pool_name)
		goto out_mem;
	if (!*spec->pool_name) {
		rbd_warn(NULL, "no pool name provided");
		goto out_err;
	}

	spec->image_name = dup_token(&buf, NULL);
	if (!spec->image_name)
		goto out_mem;
	if (!*spec->image_name) {
		rbd_warn(NULL, "no image name provided");
		goto out_err;
	}

	/*
	 * Snapshot name is optional; default is to use "-"
	 * (indicating the head/no snapshot).
	 */
	len = next_token(&buf);
	if (!len) {
		buf = RBD_SNAP_HEAD_NAME; /* No snapshot supplied */
		len = sizeof (RBD_SNAP_HEAD_NAME) - 1;
	} else if (len > RBD_MAX_SNAP_NAME_LEN) {
		ret = -ENAMETOOLONG;
		goto out_err;
	}
	snap_name = kmemdup(buf, len + 1, GFP_KERNEL);
	if (!snap_name)
		goto out_mem;
	*(snap_name + len) = '\0';
	spec->snap_name = snap_name;

	/* Initialize all rbd options to the defaults */

	rbd_opts = kzalloc(sizeof (*rbd_opts), GFP_KERNEL);
	if (!rbd_opts)
		goto out_mem;

	rbd_opts->read_only = RBD_READ_ONLY_DEFAULT;

	copts = ceph_parse_options(options, mon_addrs,
					mon_addrs + mon_addrs_size - 1,
					parse_rbd_opts_token, rbd_opts);
	if (IS_ERR(copts)) {
		ret = PTR_ERR(copts);
		goto out_err;
	}
	kfree(options);

	*ceph_opts = copts;
	*opts = rbd_opts;
	*rbd_spec = spec;

	return 0;
out_mem:
	ret = -ENOMEM;
out_err:
	kfree(rbd_opts);
	rbd_spec_put(spec);
	kfree(options);

	return ret;
}
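
/*
 * Example of a string this parser accepts (the monitor address and
 * all names are hypothetical):
 *
 *	echo "1.2.3.4:6789 name=admin,read_only rbd myimage mysnap" \
 *		> /sys/bus/rbd/add
 *
 * This requests a read-only mapping of snapshot "mysnap" of image
 * "myimage" in pool "rbd", using the monitor at 1.2.3.4:6789.
 */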

/*
 * An rbd format 2 image has a unique identifier, distinct from the
 * name given to it by the user.  Internally, that identifier is
 * what's used to specify the names of objects related to the image.
 *
 * A special "rbd id" object is used to map an rbd image name to its
 * id.  If that object doesn't exist, then there is no v2 rbd image
 * with the supplied name.
 *
 * This function will record the given rbd_dev's image_id field if
 * it can be determined, and in that case will return 0.  If any
 * errors occur a negative errno will be returned and the rbd_dev's
 * image_id field will be unchanged (and should be NULL).
 */
static int rbd_dev_image_id(struct rbd_device *rbd_dev)
{
	int ret;
	size_t size;
	char *object_name;
	void *response;
	char *image_id;

	/*
	 * When probing a parent image, the image id is already
	 * known (and the image name likely is not).  There's no
	 * need to fetch the image id again in this case.  We
	 * do still need to set the image format though.
	 */
	if (rbd_dev->spec->image_id) {
		rbd_dev->image_format = *rbd_dev->spec->image_id ? 2 : 1;

		return 0;
	}

	/*
	 * First, see if the format 2 image id file exists, and if
	 * so, get the image's persistent id from it.
	 */
	size = sizeof (RBD_ID_PREFIX) + strlen(rbd_dev->spec->image_name);
	object_name = kmalloc(size, GFP_NOIO);
	if (!object_name)
		return -ENOMEM;
	sprintf(object_name, "%s%s", RBD_ID_PREFIX, rbd_dev->spec->image_name);
	dout("rbd id object name is %s\n", object_name);

	/* Response will be an encoded string, which includes a length */

	size = sizeof (__le32) + RBD_IMAGE_ID_LEN_MAX;
	response = kzalloc(size, GFP_NOIO);
	if (!response) {
		ret = -ENOMEM;
		goto out;
	}

	/* If it doesn't exist we'll assume it's a format 1 image */

	ret = rbd_obj_method_sync(rbd_dev, object_name,
				"rbd", "get_id", NULL, 0,
				response, RBD_IMAGE_ID_LEN_MAX);
	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
	if (ret == -ENOENT) {
		image_id = kstrdup("", GFP_KERNEL);
		ret = image_id ? 0 : -ENOMEM;
		if (!ret)
			rbd_dev->image_format = 1;
	} else if (ret > sizeof (__le32)) {
		void *p = response;

		image_id = ceph_extract_encoded_string(&p, p + ret,
						NULL, GFP_NOIO);
		ret = IS_ERR(image_id) ? PTR_ERR(image_id) : 0;
		if (!ret)
			rbd_dev->image_format = 2;
	} else {
		ret = -EINVAL;
	}

	if (!ret) {
		rbd_dev->spec->image_id = image_id;
		dout("image_id is %s\n", image_id);
	}
out:
	kfree(response);
	kfree(object_name);

	return ret;
}
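
/*
 * Illustration (the image name is hypothetical): probing image "foo"
 * reads the object named RBD_ID_PREFIX "foo" (i.e. "rbd_id.foo").
 * If its "get_id" method returns a string, "foo" is a format 2 image
 * and that string becomes its image id; an -ENOENT reply means it is
 * treated as a format 1 image with an empty image id.
 */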

/* Undo whatever state changes are made by v1 or v2 image probe */

static void rbd_dev_unprobe(struct rbd_device *rbd_dev)
{
	struct rbd_image_header *header;

	rbd_dev_remove_parent(rbd_dev);
	rbd_spec_put(rbd_dev->parent_spec);
	rbd_dev->parent_spec = NULL;
	rbd_dev->parent_overlap = 0;

	/* Free dynamic fields from the header, then zero it out */

	header = &rbd_dev->header;
	ceph_put_snap_context(header->snapc);
	kfree(header->snap_sizes);
	kfree(header->snap_names);
	kfree(header->object_prefix);
	memset(header, 0, sizeof (*header));
}

static int rbd_dev_v1_probe(struct rbd_device *rbd_dev)
{
	int ret;

	/* Populate rbd image metadata */

	ret = rbd_read_header(rbd_dev, &rbd_dev->header);
	if (ret < 0)
		goto out_err;

	/* Version 1 images have no parent (no layering) */

	rbd_dev->parent_spec = NULL;
	rbd_dev->parent_overlap = 0;

	dout("discovered version 1 image, header name is %s\n",
		rbd_dev->header_name);

	return 0;

out_err:
	kfree(rbd_dev->header_name);
	rbd_dev->header_name = NULL;
	kfree(rbd_dev->spec->image_id);
	rbd_dev->spec->image_id = NULL;

	return ret;
}

static int rbd_dev_v2_probe(struct rbd_device *rbd_dev)
{
	int ret;

	ret = rbd_dev_v2_image_size(rbd_dev);
	if (ret)
		goto out_err;

	/* Get the object prefix (a.k.a. block_name) for the image */

	ret = rbd_dev_v2_object_prefix(rbd_dev);
	if (ret)
		goto out_err;

	/* Get and check the features for the image */

	ret = rbd_dev_v2_features(rbd_dev);
	if (ret)
		goto out_err;

	/* If the image supports layering, get the parent info */

	if (rbd_dev->header.features & RBD_FEATURE_LAYERING) {
		ret = rbd_dev_v2_parent_info(rbd_dev);
		if (ret)
			goto out_err;

		/*
		 * Don't print a warning for parent images.  We can
		 * tell at this point because we won't know its pool
		 * name yet (just its pool id).
		 */
		if (rbd_dev->spec->pool_name)
			rbd_warn(rbd_dev, "WARNING: kernel layering "
					"is EXPERIMENTAL!");
	}

	/* If the image supports fancy striping, get its parameters */

	if (rbd_dev->header.features & RBD_FEATURE_STRIPINGV2) {
		ret = rbd_dev_v2_striping_info(rbd_dev);
		if (ret < 0)
			goto out_err;
	}

	/* crypto and compression type aren't (yet) supported for v2 images */

	rbd_dev->header.crypt_type = 0;
	rbd_dev->header.comp_type = 0;

	/* Get the snapshot context */

	ret = rbd_dev_v2_snap_context(rbd_dev);
	if (ret)
		goto out_err;

	dout("discovered version 2 image, header name is %s\n",
		rbd_dev->header_name);

	return 0;
out_err:
	rbd_dev->parent_overlap = 0;
	rbd_spec_put(rbd_dev->parent_spec);
	rbd_dev->parent_spec = NULL;
	kfree(rbd_dev->header_name);
	rbd_dev->header_name = NULL;
	kfree(rbd_dev->header.object_prefix);
	rbd_dev->header.object_prefix = NULL;

	return ret;
}

static int rbd_dev_probe_parent(struct rbd_device *rbd_dev)
{
	struct rbd_device *parent = NULL;
	struct rbd_spec *parent_spec;
	struct rbd_client *rbdc;
	int ret;

	if (!rbd_dev->parent_spec)
		return 0;
	/*
	 * We need to pass a reference to the client and the parent
	 * spec when creating the parent rbd_dev.  Images related by
	 * parent/child relationships always share both.
	 */
	parent_spec = rbd_spec_get(rbd_dev->parent_spec);
	rbdc = __rbd_get_client(rbd_dev->rbd_client);

	ret = -ENOMEM;
	parent = rbd_dev_create(rbdc, parent_spec);
	if (!parent)
		goto out_err;

	ret = rbd_dev_image_probe(parent);
	if (ret < 0)
		goto out_err;
	rbd_dev->parent = parent;

	return 0;
out_err:
	if (parent) {
		rbd_spec_put(rbd_dev->parent_spec);
		kfree(rbd_dev->header_name);
		rbd_dev_destroy(parent);
	} else {
		rbd_put_client(rbdc);
		rbd_spec_put(parent_spec);
	}

	return ret;
}

static int rbd_dev_device_setup(struct rbd_device *rbd_dev)
{
	int ret;

	ret = rbd_dev_mapping_set(rbd_dev);
	if (ret)
		return ret;

	/* generate unique id: find highest unique id, add one */
	rbd_dev_id_get(rbd_dev);

	/* Fill in the device name, now that we have its id. */
	BUILD_BUG_ON(DEV_NAME_LEN
			< sizeof (RBD_DRV_NAME) + MAX_INT_FORMAT_WIDTH);
	sprintf(rbd_dev->name, "%s%d", RBD_DRV_NAME, rbd_dev->dev_id);

	/* Get our block major device number. */

	ret = register_blkdev(0, rbd_dev->name);
	if (ret < 0)
		goto err_out_id;
	rbd_dev->major = ret;

	/* Set up the blkdev mapping. */

	ret = rbd_init_disk(rbd_dev);
	if (ret)
		goto err_out_blkdev;

	ret = rbd_bus_add_dev(rbd_dev);
	if (ret)
		goto err_out_disk;

	/* Everything's ready.  Announce the disk to the world. */

	set_capacity(rbd_dev->disk, rbd_dev->mapping.size / SECTOR_SIZE);
	set_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags);
	add_disk(rbd_dev->disk);

	pr_info("%s: added with size 0x%llx\n", rbd_dev->disk->disk_name,
		(unsigned long long) rbd_dev->mapping.size);

	return ret;

err_out_disk:
	rbd_free_disk(rbd_dev);
err_out_blkdev:
	unregister_blkdev(rbd_dev->major, rbd_dev->name);
err_out_id:
	rbd_dev_id_put(rbd_dev);
	rbd_dev_mapping_clear(rbd_dev);

	return ret;
}

static int rbd_dev_header_name(struct rbd_device *rbd_dev)
{
	struct rbd_spec *spec = rbd_dev->spec;
	size_t size;

	/* Record the header object name for this rbd image. */

	rbd_assert(rbd_image_format_valid(rbd_dev->image_format));

	if (rbd_dev->image_format == 1)
		size = strlen(spec->image_name) + sizeof (RBD_SUFFIX);
	else
		size = sizeof (RBD_HEADER_PREFIX) + strlen(spec->image_id);

	rbd_dev->header_name = kmalloc(size, GFP_KERNEL);
	if (!rbd_dev->header_name)
		return -ENOMEM;

	if (rbd_dev->image_format == 1)
		sprintf(rbd_dev->header_name, "%s%s",
			spec->image_name, RBD_SUFFIX);
	else
		sprintf(rbd_dev->header_name, "%s%s",
			RBD_HEADER_PREFIX, spec->image_id);
	return 0;
}
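
/*
 * Illustration (names hypothetical): a format 1 image "foo" keeps
 * its header in the object "foo" RBD_SUFFIX ("foo.rbd"), while a
 * format 2 image with id "1234" uses RBD_HEADER_PREFIX "1234"
 * ("rbd_header.1234").
 */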

static void rbd_dev_image_release(struct rbd_device *rbd_dev)
{
	int ret;

	rbd_remove_all_snaps(rbd_dev);
	rbd_dev_unprobe(rbd_dev);
	ret = rbd_dev_header_watch_sync(rbd_dev, 0);
	if (ret)
		rbd_warn(rbd_dev, "failed to cancel watch event (%d)\n", ret);
	kfree(rbd_dev->header_name);
	rbd_dev->header_name = NULL;
	rbd_dev->image_format = 0;
	kfree(rbd_dev->spec->image_id);
	rbd_dev->spec->image_id = NULL;

	rbd_dev_destroy(rbd_dev);
}

/*
 * Probe for the existence of the header object for the given rbd
 * device.  For format 2 images this includes determining the image
 * id.
 */
static int rbd_dev_image_probe(struct rbd_device *rbd_dev)
{
	int ret;
	int tmp;

	/*
	 * Get the id from the image id object.  If it's not a
	 * format 2 image, we'll get ENOENT back, and we'll assume
	 * it's a format 1 image.
	 */
	ret = rbd_dev_image_id(rbd_dev);
	if (ret)
		return ret;
	rbd_assert(rbd_dev->spec->image_id);
	rbd_assert(rbd_image_format_valid(rbd_dev->image_format));

	ret = rbd_dev_header_name(rbd_dev);
	if (ret)
		goto err_out_format;

	ret = rbd_dev_header_watch_sync(rbd_dev, 1);
	if (ret)
		goto out_header_name;

	if (rbd_dev->image_format == 1)
		ret = rbd_dev_v1_probe(rbd_dev);
	else
		ret = rbd_dev_v2_probe(rbd_dev);
	if (ret)
		goto err_out_watch;

	ret = rbd_dev_snaps_update(rbd_dev);
	if (ret)
		goto err_out_probe;

	ret = rbd_dev_spec_update(rbd_dev);
	if (ret)
		goto err_out_snaps;

	ret = rbd_dev_probe_parent(rbd_dev);
	if (!ret)
		return 0;

err_out_snaps:
	rbd_remove_all_snaps(rbd_dev);
err_out_probe:
	rbd_dev_unprobe(rbd_dev);
err_out_watch:
	tmp = rbd_dev_header_watch_sync(rbd_dev, 0);
	if (tmp)
		rbd_warn(rbd_dev, "unable to tear down watch request\n");
out_header_name:
	kfree(rbd_dev->header_name);
	rbd_dev->header_name = NULL;
err_out_format:
	rbd_dev->image_format = 0;
	kfree(rbd_dev->spec->image_id);
	rbd_dev->spec->image_id = NULL;

	dout("probe failed, returning %d\n", ret);

	return ret;
}

static ssize_t rbd_add(struct bus_type *bus,
		       const char *buf,
		       size_t count)
{
	struct rbd_device *rbd_dev = NULL;
	struct ceph_options *ceph_opts = NULL;
	struct rbd_options *rbd_opts = NULL;
	struct rbd_spec *spec = NULL;
	struct rbd_client *rbdc;
	struct ceph_osd_client *osdc;
	int rc = -ENOMEM;

	if (!try_module_get(THIS_MODULE))
		return -ENODEV;

	/* parse add command */
	rc = rbd_add_parse_args(buf, &ceph_opts, &rbd_opts, &spec);
	if (rc < 0)
		goto err_out_module;

	rbdc = rbd_get_client(ceph_opts);
	if (IS_ERR(rbdc)) {
		rc = PTR_ERR(rbdc);
		goto err_out_args;
	}
	ceph_opts = NULL;	/* rbd_dev client now owns this */

	/* pick the pool */
	osdc = &rbdc->client->osdc;
	rc = ceph_pg_poolid_by_name(osdc->osdmap, spec->pool_name);
	if (rc < 0)
		goto err_out_client;
	spec->pool_id = (u64)rc;

	/* The ceph file layout needs to fit pool id in 32 bits */

	if (spec->pool_id > (u64)U32_MAX) {
		rbd_warn(NULL, "pool id too large (%llu > %u)\n",
				(unsigned long long)spec->pool_id, U32_MAX);
		rc = -EIO;
		goto err_out_client;
	}

	rbd_dev = rbd_dev_create(rbdc, spec);
	if (!rbd_dev)
		goto err_out_client;
	rbdc = NULL;		/* rbd_dev now owns this */
	spec = NULL;		/* rbd_dev now owns this */

	rbd_dev->mapping.read_only = rbd_opts->read_only;
	kfree(rbd_opts);
	rbd_opts = NULL;	/* done with this */

	rc = rbd_dev_image_probe(rbd_dev);
	if (rc < 0)
		goto err_out_rbd_dev;

	rc = rbd_dev_device_setup(rbd_dev);
	if (!rc)
		return count;

	/*
	 * rbd_dev_image_release() destroys rbd_dev, so don't fall
	 * through to rbd_dev_destroy() below.
	 */
	rbd_dev_image_release(rbd_dev);
	goto err_out_module;

err_out_rbd_dev:
	rbd_dev_destroy(rbd_dev);
err_out_client:
	rbd_put_client(rbdc);
err_out_args:
	if (ceph_opts)
		ceph_destroy_options(ceph_opts);
	kfree(rbd_opts);
	rbd_spec_put(spec);
err_out_module:
	module_put(THIS_MODULE);

	dout("Error adding device %s\n", buf);

	return (ssize_t)rc;
}

static struct rbd_device *__rbd_get_dev(unsigned long dev_id)
{
	struct list_head *tmp;
	struct rbd_device *rbd_dev;

	spin_lock(&rbd_dev_list_lock);
	list_for_each(tmp, &rbd_dev_list) {
		rbd_dev = list_entry(tmp, struct rbd_device, node);
		if (rbd_dev->dev_id == dev_id) {
			spin_unlock(&rbd_dev_list_lock);
			return rbd_dev;
		}
	}
	spin_unlock(&rbd_dev_list_lock);
	return NULL;
}

static void rbd_dev_device_release(struct device *dev)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);

	rbd_free_disk(rbd_dev);
	clear_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags);
	rbd_dev_clear_mapping(rbd_dev);
	unregister_blkdev(rbd_dev->major, rbd_dev->name);
	rbd_dev->major = 0;
	rbd_dev_id_put(rbd_dev);
	rbd_dev_mapping_clear(rbd_dev);
}

static void rbd_dev_remove_parent(struct rbd_device *rbd_dev)
{
	while (rbd_dev->parent) {
		struct rbd_device *first = rbd_dev;
		struct rbd_device *second = first->parent;
		struct rbd_device *third;

		/*
		 * Follow to the parent with no grandparent and
		 * remove it.
		 */
		while (second && (third = second->parent)) {
			first = second;
			second = third;
		}
		rbd_assert(second);
		rbd_dev_image_release(second);
		first->parent = NULL;
		first->parent_overlap = 0;

		rbd_assert(first->parent_spec);
		rbd_spec_put(first->parent_spec);
		first->parent_spec = NULL;
	}
}
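
/*
 * For a hypothetical layering chain dev -> p1 -> p2 (p2 has no
 * parent), each pass of the outer loop above releases the deepest
 * ancestor first: p2 is released and detached from p1, then p1 is
 * released and detached from dev, leaving dev parentless.
 */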

static ssize_t rbd_remove(struct bus_type *bus,
			  const char *buf,
			  size_t count)
{
	struct rbd_device *rbd_dev = NULL;
	int target_id;
	unsigned long ul;
	int ret;

	ret = strict_strtoul(buf, 10, &ul);
	if (ret)
		return ret;

	/* convert to int; abort if we lost anything in the conversion */
	target_id = (int) ul;
	if (target_id != ul)
		return -EINVAL;

	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);

	rbd_dev = __rbd_get_dev(target_id);
	if (!rbd_dev) {
		ret = -ENOENT;
		goto done;
	}

	spin_lock_irq(&rbd_dev->lock);
	if (rbd_dev->open_count)
		ret = -EBUSY;
	else
		set_bit(RBD_DEV_FLAG_REMOVING, &rbd_dev->flags);
	spin_unlock_irq(&rbd_dev->lock);
	if (ret < 0)
		goto done;
	ret = count;
	rbd_bus_del_dev(rbd_dev);
	rbd_dev_image_release(rbd_dev);
	module_put(THIS_MODULE);
done:
	mutex_unlock(&ctl_mutex);

	return ret;
}

/*
 * create control files in sysfs
 * /sys/bus/rbd/...
 */
static int rbd_sysfs_init(void)
{
	int ret;

	ret = device_register(&rbd_root_dev);
	if (ret < 0)
		return ret;

	ret = bus_register(&rbd_bus_type);
	if (ret < 0)
		device_unregister(&rbd_root_dev);

	return ret;
}

static void rbd_sysfs_cleanup(void)
{
	bus_unregister(&rbd_bus_type);
	device_unregister(&rbd_root_dev);
}

static int __init rbd_init(void)
{
	int rc;

	if (!libceph_compatible(NULL)) {
		rbd_warn(NULL, "libceph incompatibility (quitting)");

		return -EINVAL;
	}
	rc = rbd_sysfs_init();
	if (rc)
		return rc;
	pr_info("loaded " RBD_DRV_NAME_LONG "\n");
	return 0;
}

static void __exit rbd_exit(void)
{
	rbd_sysfs_cleanup();
}

module_init(rbd_init);
module_exit(rbd_exit);

MODULE_AUTHOR("Sage Weil <sage@newdream.net>");
MODULE_AUTHOR("Yehuda Sadeh <yehuda@hq.newdream.net>");
MODULE_DESCRIPTION("rados block device");

/* following authorship retained from original osdblk.c */
MODULE_AUTHOR("Jeff Garzik <jeff@garzik.org>");

MODULE_LICENSE("GPL");