blob: 8c90a39c2a91144704933975108fcdcb27207aa9 [file] [log] [blame]
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001/*
2 rbd.c -- Export ceph rados objects as a Linux block device
3
4
5 based on drivers/block/osdblk.c:
6
7 Copyright 2009 Red Hat, Inc.
8
9 This program is free software; you can redistribute it and/or modify
10 it under the terms of the GNU General Public License as published by
11 the Free Software Foundation.
12
13 This program is distributed in the hope that it will be useful,
14 but WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 GNU General Public License for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with this program; see the file COPYING. If not, write to
20 the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
21
22
23
Yehuda Sadehdfc56062010-11-19 14:51:04 -080024 For usage instructions, please refer to:
Yehuda Sadeh602adf42010-08-12 16:11:25 -070025
Yehuda Sadehdfc56062010-11-19 14:51:04 -080026 Documentation/ABI/testing/sysfs-bus-rbd
Yehuda Sadeh602adf42010-08-12 16:11:25 -070027
28 */
29
30#include <linux/ceph/libceph.h>
31#include <linux/ceph/osd_client.h>
32#include <linux/ceph/mon_client.h>
33#include <linux/ceph/decode.h>
Yehuda Sadeh59c2be12011-03-21 15:10:11 -070034#include <linux/parser.h>
Yehuda Sadeh602adf42010-08-12 16:11:25 -070035
36#include <linux/kernel.h>
37#include <linux/device.h>
38#include <linux/module.h>
39#include <linux/fs.h>
40#include <linux/blkdev.h>
41
42#include "rbd_types.h"
43
Alex Elderaafb2302012-09-06 16:00:54 -050044#define RBD_DEBUG /* Activate rbd_assert() calls */
45
Alex Elder593a9e72012-02-07 12:03:37 -060046/*
47 * The basic unit of block I/O is a sector. It is interpreted in a
48 * number of contexts in Linux (blk, bio, genhd), but the default is
49 * universally 512 bytes. These symbols are just slightly more
50 * meaningful than the bare numbers they represent.
51 */
52#define SECTOR_SHIFT 9
53#define SECTOR_SIZE (1ULL << SECTOR_SHIFT)
54
Alex Elder2647ba32012-11-19 22:55:21 -060055/* It might be useful to have these defined elsewhere */
Alex Elderdf111be2012-08-09 10:33:26 -070056
Alex Elder2647ba32012-11-19 22:55:21 -060057#define U8_MAX ((u8) (~0U))
58#define U16_MAX ((u16) (~0U))
59#define U32_MAX ((u32) (~0U))
60#define U64_MAX ((u64) (~0ULL))
Alex Elderdf111be2012-08-09 10:33:26 -070061
Alex Elderf0f8cef2012-01-29 13:57:44 -060062#define RBD_DRV_NAME "rbd"
63#define RBD_DRV_NAME_LONG "rbd (rados block device)"
Yehuda Sadeh602adf42010-08-12 16:11:25 -070064
65#define RBD_MINORS_PER_MAJOR 256 /* max minors per blkdev */
66
Alex Elderd4b125e2012-07-03 16:01:19 -050067#define RBD_SNAP_DEV_NAME_PREFIX "snap_"
68#define RBD_MAX_SNAP_NAME_LEN \
69 (NAME_MAX - (sizeof (RBD_SNAP_DEV_NAME_PREFIX) - 1))
70
Alex Elder35d489f2012-07-03 16:01:19 -050071#define RBD_MAX_SNAP_COUNT 510 /* allows max snapc to fit in 4KB */
Yehuda Sadeh602adf42010-08-12 16:11:25 -070072
73#define RBD_SNAP_HEAD_NAME "-"
74
Alex Elder9e15b772012-10-30 19:40:33 -050075/* This allows a single page to hold an image name sent by OSD */
76#define RBD_IMAGE_NAME_LEN_MAX (PAGE_SIZE - sizeof (__le32) - 1)
Alex Elder1e130192012-07-03 16:01:19 -050077#define RBD_IMAGE_ID_LEN_MAX 64
Alex Elder9e15b772012-10-30 19:40:33 -050078
Alex Elder1e130192012-07-03 16:01:19 -050079#define RBD_OBJ_PREFIX_LEN_MAX 64
Alex Elder589d30e2012-07-10 20:30:11 -050080
Alex Elderd8891402012-10-09 13:50:17 -070081/* Feature bits */
82
83#define RBD_FEATURE_LAYERING 1
84
85/* Features supported by this (client software) implementation. */
86
87#define RBD_FEATURES_ALL (0)
88
Alex Elder81a89792012-02-02 08:13:30 -060089/*
90 * An RBD device name will be "rbd#", where the "rbd" comes from
91 * RBD_DRV_NAME above, and # is a unique integer identifier.
92 * MAX_INT_FORMAT_WIDTH is used in ensuring DEV_NAME_LEN is big
93 * enough to hold all possible device names.
94 */
Yehuda Sadeh602adf42010-08-12 16:11:25 -070095#define DEV_NAME_LEN 32
Alex Elder81a89792012-02-02 08:13:30 -060096#define MAX_INT_FORMAT_WIDTH ((5 * sizeof (int)) / 2 + 1)
Yehuda Sadeh602adf42010-08-12 16:11:25 -070097
/*
 * block device image metadata (in-memory version)
 */
struct rbd_image_header {
	/* These fields never change for a given rbd image */
	char *object_prefix;	/* prefix used to name data objects */
	u64 features;		/* RBD_FEATURE_* bits; always 0 for v1 images */
	__u8 obj_order;		/* presumably object size is 1 << obj_order — TODO confirm */
	__u8 crypt_type;
	__u8 comp_type;

	/* The remaining fields need to be updated occasionally */
	u64 image_size;		/* current image size, in bytes */
	struct ceph_snap_context *snapc;	/* allocated in rbd_header_from_disk() */
	char *snap_names;	/* all snapshot names, concatenated */
	u64 *snap_sizes;	/* one size per snapshot */

	u64 obj_version;
};
117
Alex Elder0d7dbfc2012-10-25 23:34:41 -0500118/*
119 * An rbd image specification.
120 *
121 * The tuple (pool_id, image_id, snap_id) is sufficient to uniquely
Alex Elderc66c6e02012-11-01 08:39:26 -0500122 * identify an image. Each rbd_dev structure includes a pointer to
123 * an rbd_spec structure that encapsulates this identity.
124 *
125 * Each of the id's in an rbd_spec has an associated name. For a
126 * user-mapped image, the names are supplied and the id's associated
127 * with them are looked up. For a layered image, a parent image is
128 * defined by the tuple, and the names are looked up.
129 *
130 * An rbd_dev structure contains a parent_spec pointer which is
131 * non-null if the image it represents is a child in a layered
132 * image. This pointer will refer to the rbd_spec structure used
133 * by the parent rbd_dev for its own identity (i.e., the structure
134 * is shared between the parent and child).
135 *
136 * Since these structures are populated once, during the discovery
137 * phase of image construction, they are effectively immutable so
138 * we make no effort to synchronize access to them.
139 *
140 * Note that code herein does not assume the image name is known (it
141 * could be a null pointer).
Alex Elder0d7dbfc2012-10-25 23:34:41 -0500142 */
struct rbd_spec {
	u64 pool_id;
	char *pool_name;

	char *image_id;
	char *image_name;	/* may be a null pointer */

	u64 snap_id;		/* CEPH_NOSNAP when the head is mapped */
	char *snap_name;

	struct kref kref;	/* spec may be shared by parent and child */
};
155
/*
 * an instance of the client. multiple devices may share an rbd client.
 */
struct rbd_client {
	struct ceph_client *client;
	struct kref kref;		/* dropped via rbd_put_client() */
	struct list_head node;		/* entry in rbd_client_list */
};
164
Alex Elderbf0d5f502012-11-22 00:00:08 -0600165struct rbd_img_request;
166typedef void (*rbd_img_callback_t)(struct rbd_img_request *);
167
168#define BAD_WHICH U32_MAX /* Good which or bad which, which? */
169
170struct rbd_obj_request;
171typedef void (*rbd_obj_callback_t)(struct rbd_obj_request *);
172
Alex Elder9969ebc2013-01-18 12:31:10 -0600173enum obj_request_type {
174 OBJ_REQUEST_NODATA, OBJ_REQUEST_BIO, OBJ_REQUEST_PAGES
175};
Alex Elderbf0d5f502012-11-22 00:00:08 -0600176
/* A request aimed at a single osd object */
struct rbd_obj_request {
	const char *object_name;	/* name of the target osd object */
	u64 offset;		/* object start byte */
	u64 length;		/* bytes from offset */

	struct rbd_img_request *img_request;	/* parent image request */
	struct list_head links;	/* img_request->obj_requests */
	u32 which;		/* posn image request list */

	enum obj_request_type type;	/* selects which union arm is valid */
	union {
		struct bio *bio_list;		/* OBJ_REQUEST_BIO */
		struct {			/* OBJ_REQUEST_PAGES */
			struct page **pages;
			u32 page_count;
		};
	};

	struct ceph_osd_request *osd_req;

	u64 xferred;		/* bytes transferred */
	u64 version;
	s32 result;
	atomic_t done;		/* completion flag — semantics set elsewhere; verify */

	rbd_obj_callback_t callback;
	struct completion completion;	/* for synchronous waiters */

	struct kref kref;
};
207
/* A request covering a span of an rbd image, split into object requests */
struct rbd_img_request {
	struct request *rq;		/* originating block-layer request */
	struct rbd_device *rbd_dev;
	u64 offset;	/* starting image byte offset */
	u64 length;	/* byte count from offset */
	bool write_request;	/* false for read */
	union {
		struct ceph_snap_context *snapc;	/* for writes */
		u64 snap_id;		/* for reads */
	};
	spinlock_t completion_lock;/* protects next_completion */
	u32 next_completion;
	rbd_img_callback_t callback;

	u32 obj_request_count;
	struct list_head obj_requests;	/* rbd_obj_request structs */

	struct kref kref;
};
227
228#define for_each_obj_request(ireq, oreq) \
229 list_for_each_entry(oreq, &ireq->obj_requests, links)
230#define for_each_obj_request_from(ireq, oreq) \
231 list_for_each_entry_from(oreq, &ireq->obj_requests, links)
232#define for_each_obj_request_safe(ireq, oreq, n) \
233 list_for_each_entry_safe_reverse(oreq, n, &ireq->obj_requests, links)
234
/* In-memory record of one image snapshot */
struct rbd_snap {
	struct device dev;		/* sysfs entry for this snapshot */
	const char *name;
	u64 size;			/* image size at this snapshot */
	struct list_head node;		/* entry in rbd_dev->snaps */
	u64 id;
	u64 features;			/* RBD_FEATURE_* bits at this snapshot */
};
243
/* State of whatever (head or snapshot) is currently mapped */
struct rbd_mapping {
	u64 size;		/* mapped size, in bytes */
	u64 features;
	bool read_only;		/* always true for snapshot mappings */
};
249
/*
 * a single device — one mapped rbd image (or snapshot thereof)
 */
struct rbd_device {
	int dev_id;		/* blkdev unique id */

	int major;		/* blkdev assigned major */
	struct gendisk *disk;		/* blkdev's gendisk and rq */

	u32 image_format;	/* Either 1 or 2 */
	struct rbd_client *rbd_client;

	char name[DEV_NAME_LEN]; /* blkdev name, e.g. rbd3 */

	spinlock_t lock;		/* queue lock */

	struct rbd_image_header header;
	unsigned long flags;		/* bits from enum rbd_dev_flags */
	struct rbd_spec *spec;		/* identity of the mapped image */

	char *header_name;		/* name of the image's header object */

	struct ceph_file_layout layout;

	struct ceph_osd_event *watch_event;
	struct rbd_obj_request *watch_request;

	/* non-null iff this image is a child in a layered image */
	struct rbd_spec *parent_spec;
	u64 parent_overlap;

	/* protects updating the header */
	struct rw_semaphore header_rwsem;

	struct rbd_mapping mapping;	/* state of what is mapped */

	struct list_head node;		/* entry in rbd_dev_list */

	/* list of snapshots */
	struct list_head snaps;

	/* sysfs related */
	struct device dev;
	unsigned long open_count;	/* guarded by ctl_mutex */
};
294
/* Flag bits for rbd_dev->flags */

enum rbd_dev_flags {
	RBD_DEV_FLAG_EXISTS,	/* mapped snapshot has not been deleted;
				 * set in rbd_dev_set_mapping() */
};
300
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700301static DEFINE_MUTEX(ctl_mutex); /* Serialize open/close/setup/teardown */
Alex Eldere124a82f2012-01-29 13:57:44 -0600302
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700303static LIST_HEAD(rbd_dev_list); /* devices */
Alex Eldere124a82f2012-01-29 13:57:44 -0600304static DEFINE_SPINLOCK(rbd_dev_list_lock);
305
Alex Elder432b8582012-01-29 13:57:44 -0600306static LIST_HEAD(rbd_client_list); /* clients */
307static DEFINE_SPINLOCK(rbd_client_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700308
Alex Elder304f6802012-08-31 17:29:52 -0500309static int rbd_dev_snaps_update(struct rbd_device *rbd_dev);
310static int rbd_dev_snaps_register(struct rbd_device *rbd_dev);
311
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800312static void rbd_dev_release(struct device *dev);
Alex Elder41f38c22012-10-25 23:34:40 -0500313static void rbd_remove_snap_dev(struct rbd_snap *snap);
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800314
Alex Elderf0f8cef2012-01-29 13:57:44 -0600315static ssize_t rbd_add(struct bus_type *bus, const char *buf,
316 size_t count);
317static ssize_t rbd_remove(struct bus_type *bus, const char *buf,
318 size_t count);
319
/*
 * The rbd bus exposes write-only "add" and "remove" attributes in
 * sysfs; writing to them maps or unmaps an image.
 */
static struct bus_attribute rbd_bus_attrs[] = {
	__ATTR(add, S_IWUSR, NULL, rbd_add),
	__ATTR(remove, S_IWUSR, NULL, rbd_remove),
	__ATTR_NULL
};

static struct bus_type rbd_bus_type = {
	.name		= "rbd",
	.bus_attrs	= rbd_bus_attrs,
};

/* No-op release: rbd_root_dev is statically allocated, nothing to free */
static void rbd_root_dev_release(struct device *dev)
{
}

/* Parent device for all rbd devices in sysfs */
static struct device rbd_root_dev = {
	.init_name =	"rbd",
	.release =	rbd_root_dev_release,
};
339
/*
 * Emit a KERN_WARNING message tagged with the most specific identity
 * available for @rbd_dev: disk name, then image name, then image id,
 * then the raw pointer.  @rbd_dev may be NULL for driver-wide warnings.
 */
static __printf(2, 3)
void rbd_warn(struct rbd_device *rbd_dev, const char *fmt, ...)
{
	struct va_format vaf;
	va_list args;

	va_start(args, fmt);
	vaf.fmt = fmt;
	vaf.va = &args;

	if (!rbd_dev)
		printk(KERN_WARNING "%s: %pV\n", RBD_DRV_NAME, &vaf);
	else if (rbd_dev->disk)
		printk(KERN_WARNING "%s: %s: %pV\n",
			RBD_DRV_NAME, rbd_dev->disk->disk_name, &vaf);
	else if (rbd_dev->spec && rbd_dev->spec->image_name)
		printk(KERN_WARNING "%s: image %s: %pV\n",
			RBD_DRV_NAME, rbd_dev->spec->image_name, &vaf);
	else if (rbd_dev->spec && rbd_dev->spec->image_id)
		printk(KERN_WARNING "%s: id %s: %pV\n",
			RBD_DRV_NAME, rbd_dev->spec->image_id, &vaf);
	else	/* punt */
		printk(KERN_WARNING "%s: rbd_dev %p: %pV\n",
			RBD_DRV_NAME, rbd_dev, &vaf);
	va_end(args);
}
366
#ifdef RBD_DEBUG
/*
 * Wrapped in do { ... } while (0) so rbd_assert() expands to a single
 * statement and composes safely with unbraced if/else bodies (the old
 * bare-if form had a dangling-else hazard).
 */
#define rbd_assert(expr)						\
		do {							\
			if (unlikely(!(expr))) {			\
				printk(KERN_ERR "\nAssertion failure in %s() " \
							"at line %d:\n\n" \
						"\trbd_assert(%s);\n\n", \
						__func__, __LINE__, #expr); \
				BUG();					\
			}						\
		} while (0)
#else /* !RBD_DEBUG */
#  define rbd_assert(expr)	((void) 0)
#endif /* !RBD_DEBUG */
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800379
Alex Elder117973f2012-08-31 17:29:55 -0500380static int rbd_dev_refresh(struct rbd_device *rbd_dev, u64 *hver);
381static int rbd_dev_v2_refresh(struct rbd_device *rbd_dev, u64 *hver);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700382
/*
 * Open the rbd block device.  Write opens of a read-only mapping are
 * refused with -EROFS.  Takes a device reference and bumps open_count
 * under ctl_mutex.
 */
static int rbd_open(struct block_device *bdev, fmode_t mode)
{
	struct rbd_device *rbd_dev = bdev->bd_disk->private_data;

	if ((mode & FMODE_WRITE) && rbd_dev->mapping.read_only)
		return -EROFS;

	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
	(void) get_device(&rbd_dev->dev);	/* dropped in rbd_release() */
	set_device_ro(bdev, rbd_dev->mapping.read_only);
	rbd_dev->open_count++;
	mutex_unlock(&ctl_mutex);

	return 0;
}
398
/*
 * Release the block device: drop the open count and the device
 * reference taken in rbd_open().  Serialized by ctl_mutex.
 */
static int rbd_release(struct gendisk *disk, fmode_t mode)
{
	struct rbd_device *rbd_dev = disk->private_data;

	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
	rbd_assert(rbd_dev->open_count > 0);
	rbd_dev->open_count--;
	put_device(&rbd_dev->dev);	/* ref taken in rbd_open() */
	mutex_unlock(&ctl_mutex);

	return 0;
}
411
/* Block device operations: rbd supports only open and release */
static const struct block_device_operations rbd_bd_ops = {
	.owner			= THIS_MODULE,
	.open			= rbd_open,
	.release		= rbd_release,
};
417
418/*
419 * Initialize an rbd client instance.
Alex Elder43ae4702012-07-03 16:01:18 -0500420 * We own *ceph_opts.
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700421 */
Alex Elderf8c38922012-08-10 13:12:07 -0700422static struct rbd_client *rbd_client_create(struct ceph_options *ceph_opts)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700423{
424 struct rbd_client *rbdc;
425 int ret = -ENOMEM;
426
427 dout("rbd_client_create\n");
428 rbdc = kmalloc(sizeof(struct rbd_client), GFP_KERNEL);
429 if (!rbdc)
430 goto out_opt;
431
432 kref_init(&rbdc->kref);
433 INIT_LIST_HEAD(&rbdc->node);
434
Alex Elderbc534d82012-01-29 13:57:44 -0600435 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
436
Alex Elder43ae4702012-07-03 16:01:18 -0500437 rbdc->client = ceph_create_client(ceph_opts, rbdc, 0, 0);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700438 if (IS_ERR(rbdc->client))
Alex Elderbc534d82012-01-29 13:57:44 -0600439 goto out_mutex;
Alex Elder43ae4702012-07-03 16:01:18 -0500440 ceph_opts = NULL; /* Now rbdc->client is responsible for ceph_opts */
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700441
442 ret = ceph_open_session(rbdc->client);
443 if (ret < 0)
444 goto out_err;
445
Alex Elder432b8582012-01-29 13:57:44 -0600446 spin_lock(&rbd_client_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700447 list_add_tail(&rbdc->node, &rbd_client_list);
Alex Elder432b8582012-01-29 13:57:44 -0600448 spin_unlock(&rbd_client_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700449
Alex Elderbc534d82012-01-29 13:57:44 -0600450 mutex_unlock(&ctl_mutex);
451
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700452 dout("rbd_client_create created %p\n", rbdc);
453 return rbdc;
454
455out_err:
456 ceph_destroy_client(rbdc->client);
Alex Elderbc534d82012-01-29 13:57:44 -0600457out_mutex:
458 mutex_unlock(&ctl_mutex);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700459 kfree(rbdc);
460out_opt:
Alex Elder43ae4702012-07-03 16:01:18 -0500461 if (ceph_opts)
462 ceph_destroy_options(ceph_opts);
Vasiliy Kulikov28f259b2010-09-26 12:59:37 +0400463 return ERR_PTR(ret);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700464}
465
466/*
Alex Elder1f7ba332012-08-10 13:12:07 -0700467 * Find a ceph client with specific addr and configuration. If
468 * found, bump its reference count.
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700469 */
Alex Elder1f7ba332012-08-10 13:12:07 -0700470static struct rbd_client *rbd_client_find(struct ceph_options *ceph_opts)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700471{
472 struct rbd_client *client_node;
Alex Elder1f7ba332012-08-10 13:12:07 -0700473 bool found = false;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700474
Alex Elder43ae4702012-07-03 16:01:18 -0500475 if (ceph_opts->flags & CEPH_OPT_NOSHARE)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700476 return NULL;
477
Alex Elder1f7ba332012-08-10 13:12:07 -0700478 spin_lock(&rbd_client_list_lock);
479 list_for_each_entry(client_node, &rbd_client_list, node) {
480 if (!ceph_compare_options(ceph_opts, client_node->client)) {
481 kref_get(&client_node->kref);
482 found = true;
483 break;
484 }
485 }
486 spin_unlock(&rbd_client_list_lock);
487
488 return found ? client_node : NULL;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700489}
490
/*
 * mount options
 *
 * The Opt_last_* markers partition the token space by argument type:
 * int args, then string args, then Boolean flags.  No int or string
 * options are currently defined.
 */
enum {
	Opt_last_int,
	/* int args above */
	Opt_last_string,
	/* string args above */
	Opt_read_only,
	Opt_read_write,
	/* Boolean args above */
	Opt_last_bool,
};

static match_table_t rbd_opts_tokens = {
	/* int args above */
	/* string args above */
	{Opt_read_only, "read_only"},
	{Opt_read_only, "ro"},		/* Alternate spelling */
	{Opt_read_write, "read_write"},
	{Opt_read_write, "rw"},		/* Alternate spelling */
	/* Boolean args above */
	{-1, NULL}
};

/* Parsed rbd-specific mount options (see parse_rbd_opts_token()) */
struct rbd_options {
	bool	read_only;
};

#define RBD_READ_ONLY_DEFAULT	false
521
/*
 * Parse a single mount option token.  @private points at the
 * struct rbd_options being filled in.  Int and string values are
 * only logged (no such options exist yet); Boolean tokens set or
 * clear read_only.  Returns 0 on success, negative errno on a
 * malformed or unrecognized option.
 */
static int parse_rbd_opts_token(char *c, void *private)
{
	struct rbd_options *rbd_opts = private;
	substring_t argstr[MAX_OPT_ARGS];
	int token, intval, ret;

	token = match_token(c, rbd_opts_tokens, argstr);
	if (token < 0)
		return -EINVAL;

	if (token < Opt_last_int) {
		ret = match_int(&argstr[0], &intval);
		if (ret < 0) {
			pr_err("bad mount option arg (not int) "
			       "at '%s'\n", c);
			return ret;
		}
		dout("got int token %d val %d\n", token, intval);
	} else if (token > Opt_last_int && token < Opt_last_string) {
		dout("got string token %d val %s\n", token,
		     argstr[0].from);
	} else if (token > Opt_last_string && token < Opt_last_bool) {
		dout("got Boolean token %d\n", token);
	} else {
		dout("got token %d\n", token);
	}

	switch (token) {
	case Opt_read_only:
		rbd_opts->read_only = true;
		break;
	case Opt_read_write:
		rbd_opts->read_only = false;
		break;
	default:
		/* match_token() returned a token we don't handle */
		rbd_assert(false);
		break;
	}
	return 0;
}
562
/*
 * Get a ceph client with specific addr and configuration, if one does
 * not exist create it.  Either way, this function consumes *ceph_opts.
 */
static struct rbd_client *rbd_get_client(struct ceph_options *ceph_opts)
{
	struct rbd_client *rbdc = rbd_client_find(ceph_opts);

	if (rbdc) {
		/* Sharing an existing client; the options are unneeded */
		ceph_destroy_options(ceph_opts);
		return rbdc;
	}

	return rbd_client_create(ceph_opts);
}
579
/*
 * Destroy ceph client
 *
 * Final kref release handler.  Takes rbd_client_list_lock itself to
 * unlink the client from rbd_client_list, so the caller must NOT
 * already hold that lock.
 */
static void rbd_client_release(struct kref *kref)
{
	struct rbd_client *rbdc = container_of(kref, struct rbd_client, kref);

	dout("rbd_release_client %p\n", rbdc);
	spin_lock(&rbd_client_list_lock);
	list_del(&rbdc->node);
	spin_unlock(&rbd_client_list_lock);

	ceph_destroy_client(rbdc->client);
	kfree(rbdc);
}
597
598/*
599 * Drop reference to ceph client node. If it's not referenced anymore, release
600 * it.
601 */
Alex Elder9d3997f2012-10-25 23:34:42 -0500602static void rbd_put_client(struct rbd_client *rbdc)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700603{
Alex Elderc53d5892012-10-25 23:34:42 -0500604 if (rbdc)
605 kref_put(&rbdc->kref, rbd_client_release);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700606}
607
Alex Eldera30b71b2012-07-10 20:30:11 -0500608static bool rbd_image_format_valid(u32 image_format)
609{
610 return image_format == 1 || image_format == 2;
611}
612
/*
 * Sanity-check an on-disk (format 1) image header: verify the magic
 * text and reject order/snapshot-count values that would overflow the
 * in-memory header representation.  Returns true if the header looks
 * usable.
 */
static bool rbd_dev_ondisk_valid(struct rbd_image_header_ondisk *ondisk)
{
	size_t size;
	u32 snap_count;

	/* The header has to start with the magic rbd header text */
	if (memcmp(&ondisk->text, RBD_HEADER_TEXT, sizeof (RBD_HEADER_TEXT)))
		return false;

	/* The bio layer requires at least sector-sized I/O */

	if (ondisk->options.order < SECTOR_SHIFT)
		return false;

	/* If we use u64 in a few spots we may be able to loosen this */

	if (ondisk->options.order > 8 * sizeof (int) - 1)
		return false;

	/*
	 * The size of a snapshot header has to fit in a size_t, and
	 * that limits the number of snapshots.
	 */
	snap_count = le32_to_cpu(ondisk->snap_count);
	size = SIZE_MAX - sizeof (struct ceph_snap_context);
	if (snap_count > size / sizeof (__le64))
		return false;

	/*
	 * Not only that, but the size of the entire snapshot
	 * header must also be representable in a size_t.
	 */
	size -= snap_count * sizeof (__le64);
	if ((u64) size < le64_to_cpu(ondisk->snap_names_len))
		return false;

	return true;
}
651
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700652/*
653 * Create a new header structure, translate header format from the on-disk
654 * header.
655 */
656static int rbd_header_from_disk(struct rbd_image_header *header,
Alex Elder4156d992012-08-02 11:29:46 -0500657 struct rbd_image_header_ondisk *ondisk)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700658{
Alex Elderccece232012-07-10 20:30:10 -0500659 u32 snap_count;
Alex Elder58c17b02012-08-23 23:22:06 -0500660 size_t len;
Alex Elderd2bb24e2012-07-26 23:37:14 -0500661 size_t size;
Alex Elder621901d2012-08-23 23:22:06 -0500662 u32 i;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700663
Alex Elder6a523252012-07-19 17:12:59 -0500664 memset(header, 0, sizeof (*header));
665
Alex Elder103a1502012-08-02 11:29:45 -0500666 snap_count = le32_to_cpu(ondisk->snap_count);
667
Alex Elder58c17b02012-08-23 23:22:06 -0500668 len = strnlen(ondisk->object_prefix, sizeof (ondisk->object_prefix));
669 header->object_prefix = kmalloc(len + 1, GFP_KERNEL);
Alex Elder6a523252012-07-19 17:12:59 -0500670 if (!header->object_prefix)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700671 return -ENOMEM;
Alex Elder58c17b02012-08-23 23:22:06 -0500672 memcpy(header->object_prefix, ondisk->object_prefix, len);
673 header->object_prefix[len] = '\0';
Alex Elder00f1f362012-02-07 12:03:36 -0600674
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700675 if (snap_count) {
Alex Elderf785cc12012-08-23 23:22:06 -0500676 u64 snap_names_len = le64_to_cpu(ondisk->snap_names_len);
677
Alex Elder621901d2012-08-23 23:22:06 -0500678 /* Save a copy of the snapshot names */
679
Alex Elderf785cc12012-08-23 23:22:06 -0500680 if (snap_names_len > (u64) SIZE_MAX)
681 return -EIO;
682 header->snap_names = kmalloc(snap_names_len, GFP_KERNEL);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700683 if (!header->snap_names)
Alex Elder6a523252012-07-19 17:12:59 -0500684 goto out_err;
Alex Elderf785cc12012-08-23 23:22:06 -0500685 /*
686 * Note that rbd_dev_v1_header_read() guarantees
687 * the ondisk buffer we're working with has
688 * snap_names_len bytes beyond the end of the
689 * snapshot id array, this memcpy() is safe.
690 */
691 memcpy(header->snap_names, &ondisk->snaps[snap_count],
692 snap_names_len);
Alex Elder6a523252012-07-19 17:12:59 -0500693
Alex Elder621901d2012-08-23 23:22:06 -0500694 /* Record each snapshot's size */
695
Alex Elderd2bb24e2012-07-26 23:37:14 -0500696 size = snap_count * sizeof (*header->snap_sizes);
697 header->snap_sizes = kmalloc(size, GFP_KERNEL);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700698 if (!header->snap_sizes)
Alex Elder6a523252012-07-19 17:12:59 -0500699 goto out_err;
Alex Elder621901d2012-08-23 23:22:06 -0500700 for (i = 0; i < snap_count; i++)
701 header->snap_sizes[i] =
702 le64_to_cpu(ondisk->snaps[i].image_size);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700703 } else {
Alex Elderccece232012-07-10 20:30:10 -0500704 WARN_ON(ondisk->snap_names_len);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700705 header->snap_names = NULL;
706 header->snap_sizes = NULL;
707 }
Alex Elder849b4262012-07-09 21:04:24 -0500708
Alex Elder34b13182012-07-13 20:35:12 -0500709 header->features = 0; /* No features support in v1 images */
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700710 header->obj_order = ondisk->options.order;
711 header->crypt_type = ondisk->options.crypt_type;
712 header->comp_type = ondisk->options.comp_type;
Alex Elder6a523252012-07-19 17:12:59 -0500713
Alex Elder621901d2012-08-23 23:22:06 -0500714 /* Allocate and fill in the snapshot context */
715
Alex Elderf84344f2012-08-31 17:29:51 -0500716 header->image_size = le64_to_cpu(ondisk->image_size);
Alex Elder6a523252012-07-19 17:12:59 -0500717 size = sizeof (struct ceph_snap_context);
718 size += snap_count * sizeof (header->snapc->snaps[0]);
719 header->snapc = kzalloc(size, GFP_KERNEL);
720 if (!header->snapc)
721 goto out_err;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700722
723 atomic_set(&header->snapc->nref, 1);
Alex Elder505cbb92012-07-19 08:49:18 -0500724 header->snapc->seq = le64_to_cpu(ondisk->snap_seq);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700725 header->snapc->num_snaps = snap_count;
Alex Elder621901d2012-08-23 23:22:06 -0500726 for (i = 0; i < snap_count; i++)
727 header->snapc->snaps[i] =
728 le64_to_cpu(ondisk->snaps[i].id);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700729
730 return 0;
731
Alex Elder6a523252012-07-19 17:12:59 -0500732out_err:
Alex Elder849b4262012-07-09 21:04:24 -0500733 kfree(header->snap_sizes);
Alex Elderccece232012-07-10 20:30:10 -0500734 header->snap_sizes = NULL;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700735 kfree(header->snap_names);
Alex Elderccece232012-07-10 20:30:10 -0500736 header->snap_names = NULL;
Alex Elder6a523252012-07-19 17:12:59 -0500737 kfree(header->object_prefix);
738 header->object_prefix = NULL;
Alex Elderccece232012-07-10 20:30:10 -0500739
Alex Elder00f1f362012-02-07 12:03:36 -0600740 return -ENOMEM;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700741}
742
Alex Elder9e15b772012-10-30 19:40:33 -0500743static const char *rbd_snap_name(struct rbd_device *rbd_dev, u64 snap_id)
744{
745 struct rbd_snap *snap;
746
747 if (snap_id == CEPH_NOSNAP)
748 return RBD_SNAP_HEAD_NAME;
749
750 list_for_each_entry(snap, &rbd_dev->snaps, node)
751 if (snap_id == snap->id)
752 return snap->name;
753
754 return NULL;
755}
756
Alex Elder8836b992012-08-30 14:42:15 -0500757static int snap_by_name(struct rbd_device *rbd_dev, const char *snap_name)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700758{
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700759
Alex Eldere86924a2012-07-10 20:30:11 -0500760 struct rbd_snap *snap;
Alex Elder00f1f362012-02-07 12:03:36 -0600761
Alex Eldere86924a2012-07-10 20:30:11 -0500762 list_for_each_entry(snap, &rbd_dev->snaps, node) {
763 if (!strcmp(snap_name, snap->name)) {
Alex Elder0d7dbfc2012-10-25 23:34:41 -0500764 rbd_dev->spec->snap_id = snap->id;
Alex Eldere86924a2012-07-10 20:30:11 -0500765 rbd_dev->mapping.size = snap->size;
Alex Elder34b13182012-07-13 20:35:12 -0500766 rbd_dev->mapping.features = snap->features;
Alex Elder00f1f362012-02-07 12:03:36 -0600767
Alex Eldere86924a2012-07-10 20:30:11 -0500768 return 0;
Alex Elder00f1f362012-02-07 12:03:36 -0600769 }
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700770 }
Alex Eldere86924a2012-07-10 20:30:11 -0500771
Alex Elder00f1f362012-02-07 12:03:36 -0600772 return -ENOENT;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700773}
774
Alex Elder819d52b2012-10-25 23:34:41 -0500775static int rbd_dev_set_mapping(struct rbd_device *rbd_dev)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700776{
Alex Elder78dc4472012-07-19 08:49:18 -0500777 int ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700778
Alex Elder0d7dbfc2012-10-25 23:34:41 -0500779 if (!memcmp(rbd_dev->spec->snap_name, RBD_SNAP_HEAD_NAME,
Josh Durgincc9d7342011-11-21 18:19:13 -0800780 sizeof (RBD_SNAP_HEAD_NAME))) {
Alex Elder0d7dbfc2012-10-25 23:34:41 -0500781 rbd_dev->spec->snap_id = CEPH_NOSNAP;
Alex Elder99c1f082012-08-30 14:42:15 -0500782 rbd_dev->mapping.size = rbd_dev->header.image_size;
Alex Elder34b13182012-07-13 20:35:12 -0500783 rbd_dev->mapping.features = rbd_dev->header.features;
Alex Eldere86924a2012-07-10 20:30:11 -0500784 ret = 0;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700785 } else {
Alex Elder0d7dbfc2012-10-25 23:34:41 -0500786 ret = snap_by_name(rbd_dev, rbd_dev->spec->snap_name);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700787 if (ret < 0)
788 goto done;
Alex Elderf84344f2012-08-31 17:29:51 -0500789 rbd_dev->mapping.read_only = true;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700790 }
Alex Elder6d292902013-01-14 12:43:31 -0600791 set_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags);
792
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700793done:
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700794 return ret;
795}
796
797static void rbd_header_free(struct rbd_image_header *header)
798{
Alex Elder849b4262012-07-09 21:04:24 -0500799 kfree(header->object_prefix);
Alex Elderd78fd7a2012-07-26 23:37:14 -0500800 header->object_prefix = NULL;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700801 kfree(header->snap_sizes);
Alex Elderd78fd7a2012-07-26 23:37:14 -0500802 header->snap_sizes = NULL;
Alex Elder849b4262012-07-09 21:04:24 -0500803 kfree(header->snap_names);
Alex Elderd78fd7a2012-07-26 23:37:14 -0500804 header->snap_names = NULL;
Josh Durgind1d25642011-12-05 14:03:05 -0800805 ceph_put_snap_context(header->snapc);
Alex Elderd78fd7a2012-07-26 23:37:14 -0500806 header->snapc = NULL;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700807}
808
Alex Elder98571b52013-01-20 14:44:42 -0600809static const char *rbd_segment_name(struct rbd_device *rbd_dev, u64 offset)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700810{
Alex Elder65ccfe22012-08-09 10:33:26 -0700811 char *name;
812 u64 segment;
813 int ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700814
Alex Elder2fd82b92012-11-09 15:05:54 -0600815 name = kmalloc(MAX_OBJ_NAME_SIZE + 1, GFP_NOIO);
Alex Elder65ccfe22012-08-09 10:33:26 -0700816 if (!name)
817 return NULL;
818 segment = offset >> rbd_dev->header.obj_order;
Alex Elder2fd82b92012-11-09 15:05:54 -0600819 ret = snprintf(name, MAX_OBJ_NAME_SIZE + 1, "%s.%012llx",
Alex Elder65ccfe22012-08-09 10:33:26 -0700820 rbd_dev->header.object_prefix, segment);
Alex Elder2fd82b92012-11-09 15:05:54 -0600821 if (ret < 0 || ret > MAX_OBJ_NAME_SIZE) {
Alex Elder65ccfe22012-08-09 10:33:26 -0700822 pr_err("error formatting segment name for #%llu (%d)\n",
823 segment, ret);
824 kfree(name);
825 name = NULL;
826 }
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700827
Alex Elder65ccfe22012-08-09 10:33:26 -0700828 return name;
829}
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700830
Alex Elder65ccfe22012-08-09 10:33:26 -0700831static u64 rbd_segment_offset(struct rbd_device *rbd_dev, u64 offset)
832{
833 u64 segment_size = (u64) 1 << rbd_dev->header.obj_order;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700834
Alex Elder65ccfe22012-08-09 10:33:26 -0700835 return offset & (segment_size - 1);
836}
837
838static u64 rbd_segment_length(struct rbd_device *rbd_dev,
839 u64 offset, u64 length)
840{
841 u64 segment_size = (u64) 1 << rbd_dev->header.obj_order;
842
843 offset &= segment_size - 1;
844
Alex Elderaafb2302012-09-06 16:00:54 -0500845 rbd_assert(length <= U64_MAX - offset);
Alex Elder65ccfe22012-08-09 10:33:26 -0700846 if (offset + length > segment_size)
847 length = segment_size - offset;
848
849 return length;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700850}
851
852/*
Josh Durgin029bcbd2011-07-22 11:35:23 -0700853 * returns the size of an object in the image
854 */
855static u64 rbd_obj_bytes(struct rbd_image_header *header)
856{
857 return 1 << header->obj_order;
858}
859
860/*
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700861 * bio helpers
862 */
863
864static void bio_chain_put(struct bio *chain)
865{
866 struct bio *tmp;
867
868 while (chain) {
869 tmp = chain;
870 chain = chain->bi_next;
871 bio_put(tmp);
872 }
873}
874
875/*
876 * zeros a bio chain, starting at specific offset
877 */
static void zero_bio_chain(struct bio *chain, int start_ofs)
{
	struct bio_vec *bv;
	unsigned long flags;
	void *buf;
	int i;
	int pos = 0;	/* running byte position within the whole chain */

	/*
	 * Walk every segment of every bio in the chain; once the
	 * running position reaches start_ofs, clear each segment's
	 * data from that point to its end.
	 */
	while (chain) {
		bio_for_each_segment(bv, chain, i) {
			if (pos + bv->bv_len > start_ofs) {
				/* Zero from start_ofs (or the segment
				 * start, if we're already past it). */
				int remainder = max(start_ofs - pos, 0);
				buf = bvec_kmap_irq(bv, &flags);
				memset(buf + remainder, 0,
				       bv->bv_len - remainder);
				bvec_kunmap_irq(buf, &flags);
			}
			pos += bv->bv_len;
		}

		chain = chain->bi_next;
	}
}
901
902/*
Alex Elderf7760da2012-10-20 22:17:27 -0500903 * Clone a portion of a bio, starting at the given byte offset
904 * and continuing for the number of bytes indicated.
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700905 */
Alex Elderf7760da2012-10-20 22:17:27 -0500906static struct bio *bio_clone_range(struct bio *bio_src,
907 unsigned int offset,
908 unsigned int len,
909 gfp_t gfpmask)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700910{
Alex Elderf7760da2012-10-20 22:17:27 -0500911 struct bio_vec *bv;
912 unsigned int resid;
913 unsigned short idx;
914 unsigned int voff;
915 unsigned short end_idx;
916 unsigned short vcnt;
917 struct bio *bio;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700918
Alex Elderf7760da2012-10-20 22:17:27 -0500919 /* Handle the easy case for the caller */
920
921 if (!offset && len == bio_src->bi_size)
922 return bio_clone(bio_src, gfpmask);
923
924 if (WARN_ON_ONCE(!len))
925 return NULL;
926 if (WARN_ON_ONCE(len > bio_src->bi_size))
927 return NULL;
928 if (WARN_ON_ONCE(offset > bio_src->bi_size - len))
929 return NULL;
930
931 /* Find first affected segment... */
932
933 resid = offset;
934 __bio_for_each_segment(bv, bio_src, idx, 0) {
935 if (resid < bv->bv_len)
936 break;
937 resid -= bv->bv_len;
938 }
939 voff = resid;
940
941 /* ...and the last affected segment */
942
943 resid += len;
944 __bio_for_each_segment(bv, bio_src, end_idx, idx) {
945 if (resid <= bv->bv_len)
946 break;
947 resid -= bv->bv_len;
948 }
949 vcnt = end_idx - idx + 1;
950
951 /* Build the clone */
952
953 bio = bio_alloc(gfpmask, (unsigned int) vcnt);
954 if (!bio)
955 return NULL; /* ENOMEM */
956
957 bio->bi_bdev = bio_src->bi_bdev;
958 bio->bi_sector = bio_src->bi_sector + (offset >> SECTOR_SHIFT);
959 bio->bi_rw = bio_src->bi_rw;
960 bio->bi_flags |= 1 << BIO_CLONED;
961
962 /*
963 * Copy over our part of the bio_vec, then update the first
964 * and last (or only) entries.
965 */
966 memcpy(&bio->bi_io_vec[0], &bio_src->bi_io_vec[idx],
967 vcnt * sizeof (struct bio_vec));
968 bio->bi_io_vec[0].bv_offset += voff;
969 if (vcnt > 1) {
970 bio->bi_io_vec[0].bv_len -= voff;
971 bio->bi_io_vec[vcnt - 1].bv_len = resid;
972 } else {
973 bio->bi_io_vec[0].bv_len = len;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700974 }
975
Alex Elderf7760da2012-10-20 22:17:27 -0500976 bio->bi_vcnt = vcnt;
977 bio->bi_size = len;
978 bio->bi_idx = 0;
Alex Elder542582f2012-08-09 10:33:25 -0700979
Alex Elderf7760da2012-10-20 22:17:27 -0500980 return bio;
981}
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700982
Alex Elderf7760da2012-10-20 22:17:27 -0500983/*
984 * Clone a portion of a bio chain, starting at the given byte offset
985 * into the first bio in the source chain and continuing for the
986 * number of bytes indicated. The result is another bio chain of
987 * exactly the given length, or a null pointer on error.
988 *
989 * The bio_src and offset parameters are both in-out. On entry they
990 * refer to the first source bio and the offset into that bio where
991 * the start of data to be cloned is located.
992 *
993 * On return, bio_src is updated to refer to the bio in the source
994 * chain that contains first un-cloned byte, and *offset will
995 * contain the offset of that byte within that bio.
996 */
static struct bio *bio_chain_clone_range(struct bio **bio_src,
					unsigned int *offset,
					unsigned int len,
					gfp_t gfpmask)
{
	struct bio *bi = *bio_src;
	unsigned int off = *offset;
	struct bio *chain = NULL;
	struct bio **end;	/* tail link of the chain being built */

	/* Build up a chain of clone bios up to the limit */

	if (!bi || off >= bi->bi_size || !len)
		return NULL;		/* Nothing to clone */

	end = &chain;
	while (len) {
		unsigned int bi_size;
		struct bio *bio;

		if (!bi) {
			/* Caller asked for more bytes than the chain holds */
			rbd_warn(NULL, "bio_chain exhausted with %u left", len);
			goto out_err;	/* EINVAL; ran out of bio's */
		}
		/* Clone at most the rest of this bio, at most len bytes */
		bi_size = min_t(unsigned int, bi->bi_size - off, len);
		bio = bio_clone_range(bi, off, bi_size, gfpmask);
		if (!bio)
			goto out_err;	/* ENOMEM */

		/* Append the clone to the tail of the result chain */
		*end = bio;
		end = &bio->bi_next;

		off += bi_size;
		if (off == bi->bi_size) {
			/* Source bio fully consumed; move to the next one */
			bi = bi->bi_next;
			off = 0;
		}
		len -= bi_size;
	}
	/* Report back where the next clone should start */
	*bio_src = bi;
	*offset = off;

	return chain;
out_err:
	bio_chain_put(chain);

	return NULL;
}
1045
Alex Elderbf0d5f502012-11-22 00:00:08 -06001046static void rbd_obj_request_get(struct rbd_obj_request *obj_request)
1047{
1048 kref_get(&obj_request->kref);
1049}
1050
static void rbd_obj_request_destroy(struct kref *kref);
/* Drop a reference; the request is destroyed when the last one is dropped */
static void rbd_obj_request_put(struct rbd_obj_request *obj_request)
{
	rbd_assert(obj_request != NULL);
	kref_put(&obj_request->kref, rbd_obj_request_destroy);
}
1057
/* Take a reference on an image request (paired with rbd_img_request_put()) */
static void rbd_img_request_get(struct rbd_img_request *img_request)
{
	kref_get(&img_request->kref);
}
1062
static void rbd_img_request_destroy(struct kref *kref);
/* Drop a reference; the request is destroyed when the last one is dropped */
static void rbd_img_request_put(struct rbd_img_request *img_request)
{
	rbd_assert(img_request != NULL);
	kref_put(&img_request->kref, rbd_img_request_destroy);
}
1069
/*
 * Add an object request to the tail of an image request's list,
 * taking a reference on the object request on the image request's
 * behalf.  The request's position in the list is recorded in
 * ->which (so completions can be processed in order).
 */
static inline void rbd_img_obj_request_add(struct rbd_img_request *img_request,
					struct rbd_obj_request *obj_request)
{
	rbd_assert(obj_request->img_request == NULL);

	rbd_obj_request_get(obj_request);
	obj_request->img_request = img_request;
	obj_request->which = img_request->obj_request_count;
	rbd_assert(obj_request->which != BAD_WHICH);
	img_request->obj_request_count++;
	list_add_tail(&obj_request->links, &img_request->obj_requests);
}
1082
/*
 * Remove an object request from its image request's list and drop
 * the reference taken by rbd_img_obj_request_add().  The ->which
 * assertion means only the highest-numbered (most recently added)
 * request may be removed, so callers must unwind in reverse order.
 */
static inline void rbd_img_obj_request_del(struct rbd_img_request *img_request,
					struct rbd_obj_request *obj_request)
{
	rbd_assert(obj_request->which != BAD_WHICH);

	list_del(&obj_request->links);
	rbd_assert(img_request->obj_request_count > 0);
	img_request->obj_request_count--;
	rbd_assert(obj_request->which == img_request->obj_request_count);
	obj_request->which = BAD_WHICH;
	rbd_assert(obj_request->img_request == img_request);
	obj_request->img_request = NULL;
	obj_request->callback = NULL;
	rbd_obj_request_put(obj_request);
}
1098
1099static bool obj_request_type_valid(enum obj_request_type type)
1100{
1101 switch (type) {
Alex Elder9969ebc2013-01-18 12:31:10 -06001102 case OBJ_REQUEST_NODATA:
Alex Elderbf0d5f502012-11-22 00:00:08 -06001103 case OBJ_REQUEST_BIO:
Alex Elder788e2df2013-01-17 12:25:27 -06001104 case OBJ_REQUEST_PAGES:
Alex Elderbf0d5f502012-11-22 00:00:08 -06001105 return true;
1106 default:
1107 return false;
1108 }
1109}
1110
Alex Elder8d23bf22012-11-19 22:55:21 -06001111struct ceph_osd_req_op *rbd_osd_req_op_create(u16 opcode, ...)
1112{
1113 struct ceph_osd_req_op *op;
1114 va_list args;
Alex Elder2647ba32012-11-19 22:55:21 -06001115 size_t size;
Alex Elder8d23bf22012-11-19 22:55:21 -06001116
1117 op = kzalloc(sizeof (*op), GFP_NOIO);
1118 if (!op)
1119 return NULL;
1120 op->op = opcode;
1121 va_start(args, opcode);
1122 switch (opcode) {
1123 case CEPH_OSD_OP_READ:
1124 case CEPH_OSD_OP_WRITE:
1125 /* rbd_osd_req_op_create(READ, offset, length) */
1126 /* rbd_osd_req_op_create(WRITE, offset, length) */
1127 op->extent.offset = va_arg(args, u64);
1128 op->extent.length = va_arg(args, u64);
1129 if (opcode == CEPH_OSD_OP_WRITE)
1130 op->payload_len = op->extent.length;
1131 break;
Alex Elder2647ba32012-11-19 22:55:21 -06001132 case CEPH_OSD_OP_CALL:
1133 /* rbd_osd_req_op_create(CALL, class, method, data, datalen) */
1134 op->cls.class_name = va_arg(args, char *);
1135 size = strlen(op->cls.class_name);
1136 rbd_assert(size <= (size_t) U8_MAX);
1137 op->cls.class_len = size;
1138 op->payload_len = size;
1139
1140 op->cls.method_name = va_arg(args, char *);
1141 size = strlen(op->cls.method_name);
1142 rbd_assert(size <= (size_t) U8_MAX);
1143 op->cls.method_len = size;
1144 op->payload_len += size;
1145
1146 op->cls.argc = 0;
1147 op->cls.indata = va_arg(args, void *);
1148 size = va_arg(args, size_t);
1149 rbd_assert(size <= (size_t) U32_MAX);
1150 op->cls.indata_len = (u32) size;
1151 op->payload_len += size;
1152 break;
Alex Elder5efea492012-11-19 22:55:21 -06001153 case CEPH_OSD_OP_NOTIFY_ACK:
1154 case CEPH_OSD_OP_WATCH:
1155 /* rbd_osd_req_op_create(NOTIFY_ACK, cookie, version) */
1156 /* rbd_osd_req_op_create(WATCH, cookie, version, flag) */
1157 op->watch.cookie = va_arg(args, u64);
1158 op->watch.ver = va_arg(args, u64);
1159 op->watch.ver = cpu_to_le64(op->watch.ver);
1160 if (opcode == CEPH_OSD_OP_WATCH && va_arg(args, int))
1161 op->watch.flag = (u8) 1;
1162 break;
Alex Elder8d23bf22012-11-19 22:55:21 -06001163 default:
1164 rbd_warn(NULL, "unsupported opcode %hu\n", opcode);
1165 kfree(op);
1166 op = NULL;
1167 break;
1168 }
1169 va_end(args);
1170
1171 return op;
1172}
1173
/* Free an op allocated by rbd_osd_req_op_create() (NULL is allowed) */
static void rbd_osd_req_op_destroy(struct ceph_osd_req_op *op)
{
	kfree(op);
}
1178
Alex Elderbf0d5f502012-11-22 00:00:08 -06001179static int rbd_obj_request_submit(struct ceph_osd_client *osdc,
1180 struct rbd_obj_request *obj_request)
1181{
1182 return ceph_osdc_start_request(osdc, obj_request->osd_req, false);
1183}
1184
/*
 * Complete an image request:  invoke its callback if one was set,
 * otherwise just drop the reference held on the caller's behalf.
 */
static void rbd_img_request_complete(struct rbd_img_request *img_request)
{
	if (img_request->callback)
		img_request->callback(img_request);
	else
		rbd_img_request_put(img_request);
}
1192
Alex Elder788e2df2013-01-17 12:25:27 -06001193/* Caller is responsible for rbd_obj_request_destroy(obj_request) */
1194
1195static int rbd_obj_request_wait(struct rbd_obj_request *obj_request)
1196{
1197 return wait_for_completion_interruptible(&obj_request->completion);
1198}
1199
Alex Elder9969ebc2013-01-18 12:31:10 -06001200static void rbd_osd_trivial_callback(struct rbd_obj_request *obj_request,
1201 struct ceph_osd_op *op)
1202{
1203 atomic_set(&obj_request->done, 1);
1204}
1205
Alex Elderbf0d5f502012-11-22 00:00:08 -06001206static void rbd_obj_request_complete(struct rbd_obj_request *obj_request)
1207{
1208 if (obj_request->callback)
1209 obj_request->callback(obj_request);
Alex Elder788e2df2013-01-17 12:25:27 -06001210 else
1211 complete_all(&obj_request->completion);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001212}
1213
Alex Elderbf0d5f502012-11-22 00:00:08 -06001214static void rbd_osd_read_callback(struct rbd_obj_request *obj_request,
1215 struct ceph_osd_op *op)
1216{
1217 u64 xferred;
1218
1219 /*
1220 * We support a 64-bit length, but ultimately it has to be
1221 * passed to blk_end_request(), which takes an unsigned int.
1222 */
1223 xferred = le64_to_cpu(op->extent.length);
1224 rbd_assert(xferred < (u64) UINT_MAX);
1225 if (obj_request->result == (s32) -ENOENT) {
1226 zero_bio_chain(obj_request->bio_list, 0);
1227 obj_request->result = 0;
1228 } else if (xferred < obj_request->length && !obj_request->result) {
1229 zero_bio_chain(obj_request->bio_list, xferred);
1230 xferred = obj_request->length;
1231 }
1232 obj_request->xferred = xferred;
1233 atomic_set(&obj_request->done, 1);
1234}
1235
/* Op-specific callback for writes:  record how much was transferred */
static void rbd_osd_write_callback(struct rbd_obj_request *obj_request,
				struct ceph_osd_op *op)
{
	obj_request->xferred = le64_to_cpu(op->extent.length);
	atomic_set(&obj_request->done, 1);
}
1242
/*
 * Completion callback for every rbd osd request.  Decodes the
 * (single) op in the reply message, dispatches to the per-opcode
 * handler, and completes the object request if the handler marked
 * it done.
 */
static void rbd_osd_req_callback(struct ceph_osd_request *osd_req,
				struct ceph_msg *msg)
{
	struct rbd_obj_request *obj_request = osd_req->r_priv;
	struct ceph_osd_reply_head *reply_head;
	struct ceph_osd_op *op;
	u32 num_ops;
	u16 opcode;

	rbd_assert(osd_req == obj_request->osd_req);
	/* A request is part of an image request xor standalone */
	rbd_assert(!!obj_request->img_request ^
				(obj_request->which == BAD_WHICH));

	/* Default transfer count; read/write handlers may override it */
	obj_request->xferred = le32_to_cpu(msg->hdr.data_len);
	reply_head = msg->front.iov_base;
	obj_request->result = (s32) le32_to_cpu(reply_head->result);
	obj_request->version = le64_to_cpu(osd_req->r_reassert_version.version);

	/* All requests currently carry exactly one op */
	num_ops = le32_to_cpu(reply_head->num_ops);
	WARN_ON(num_ops != 1);	/* For now */

	op = &reply_head->ops[0];
	opcode = le16_to_cpu(op->op);
	switch (opcode) {
	case CEPH_OSD_OP_READ:
		rbd_osd_read_callback(obj_request, op);
		break;
	case CEPH_OSD_OP_WRITE:
		rbd_osd_write_callback(obj_request, op);
		break;
	case CEPH_OSD_OP_CALL:
	case CEPH_OSD_OP_NOTIFY_ACK:
	case CEPH_OSD_OP_WATCH:
		rbd_osd_trivial_callback(obj_request, op);
		break;
	default:
		rbd_warn(NULL, "%s: unsupported op %hu\n",
			obj_request->object_name, (unsigned short) opcode);
		break;
	}

	if (atomic_read(&obj_request->done))
		rbd_obj_request_complete(obj_request);
}
1287
/*
 * Allocate and initialize an osd request carrying a single op for
 * the given object request.  Writes get the image's snapshot
 * context and a modification time; reads instead carry the snapshot
 * id being read.  Returns NULL on allocation failure.
 */
static struct ceph_osd_request *rbd_osd_req_create(
					struct rbd_device *rbd_dev,
					bool write_request,
					struct rbd_obj_request *obj_request,
					struct ceph_osd_req_op *op)
{
	struct rbd_img_request *img_request = obj_request->img_request;
	struct ceph_snap_context *snapc = NULL;
	struct ceph_osd_client *osdc;
	struct ceph_osd_request *osd_req;
	struct timespec now;
	struct timespec *mtime;
	u64 snap_id = CEPH_NOSNAP;
	u64 offset = obj_request->offset;
	u64 length = obj_request->length;

	if (img_request) {
		rbd_assert(img_request->write_request == write_request);
		if (img_request->write_request)
			snapc = img_request->snapc;
		else
			snap_id = img_request->snap_id;
	}

	/* Allocate and initialize the request, for the single op */

	osdc = &rbd_dev->rbd_client->client->osdc;
	osd_req = ceph_osdc_alloc_request(osdc, snapc, 1, false, GFP_ATOMIC);
	if (!osd_req)
		return NULL;	/* ENOMEM */

	/* Attach the request's data according to its type */
	rbd_assert(obj_request_type_valid(obj_request->type));
	switch (obj_request->type) {
	case OBJ_REQUEST_NODATA:
		break;		/* Nothing to do */
	case OBJ_REQUEST_BIO:
		rbd_assert(obj_request->bio_list != NULL);
		osd_req->r_bio = obj_request->bio_list;
		bio_get(osd_req->r_bio);
		/* osd client requires "num pages" even for bio */
		osd_req->r_num_pages = calc_pages_for(offset, length);
		break;
	case OBJ_REQUEST_PAGES:
		osd_req->r_pages = obj_request->pages;
		osd_req->r_num_pages = obj_request->page_count;
		osd_req->r_page_alignment = offset & ~PAGE_MASK;
		break;
	}

	if (write_request) {
		osd_req->r_flags = CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK;
		now = CURRENT_TIME;
		mtime = &now;
	} else {
		osd_req->r_flags = CEPH_OSD_FLAG_READ;
		mtime = NULL;	/* not needed for reads */
		offset = 0;	/* These are not used... */
		length = 0;	/* ...for osd read requests */
	}

	osd_req->r_callback = rbd_osd_req_callback;
	osd_req->r_priv = obj_request;

	/* Target object name (bounded by the osd request's oid buffer) */
	osd_req->r_oid_len = strlen(obj_request->object_name);
	rbd_assert(osd_req->r_oid_len < sizeof (osd_req->r_oid));
	memcpy(osd_req->r_oid, obj_request->object_name, osd_req->r_oid_len);

	osd_req->r_file_layout = rbd_dev->layout;	/* struct */

	/* osd_req will get its own reference to snapc (if non-null) */

	ceph_osdc_build_request(osd_req, offset, length, 1, op,
				snapc, snap_id, mtime);

	return osd_req;
}
1364
/* Drop the reference held on an osd request created for rbd */
static void rbd_osd_req_destroy(struct ceph_osd_request *osd_req)
{
	ceph_osdc_put_request(osd_req);
}
1369
1370/* object_name is assumed to be a non-null pointer and NUL-terminated */
1371
1372static struct rbd_obj_request *rbd_obj_request_create(const char *object_name,
1373 u64 offset, u64 length,
1374 enum obj_request_type type)
1375{
1376 struct rbd_obj_request *obj_request;
1377 size_t size;
1378 char *name;
1379
1380 rbd_assert(obj_request_type_valid(type));
1381
1382 size = strlen(object_name) + 1;
1383 obj_request = kzalloc(sizeof (*obj_request) + size, GFP_KERNEL);
1384 if (!obj_request)
1385 return NULL;
1386
1387 name = (char *)(obj_request + 1);
1388 obj_request->object_name = memcpy(name, object_name, size);
1389 obj_request->offset = offset;
1390 obj_request->length = length;
1391 obj_request->which = BAD_WHICH;
1392 obj_request->type = type;
1393 INIT_LIST_HEAD(&obj_request->links);
1394 atomic_set(&obj_request->done, 0);
Alex Elder788e2df2013-01-17 12:25:27 -06001395 init_completion(&obj_request->completion);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001396 kref_init(&obj_request->kref);
1397
1398 return obj_request;
1399}
1400
/*
 * Final teardown of an object request, called when its last
 * reference is dropped.  The request must already have been removed
 * from any image request (see the assertions below).
 */
static void rbd_obj_request_destroy(struct kref *kref)
{
	struct rbd_obj_request *obj_request;

	obj_request = container_of(kref, struct rbd_obj_request, kref);

	rbd_assert(obj_request->img_request == NULL);
	rbd_assert(obj_request->which == BAD_WHICH);

	if (obj_request->osd_req)
		rbd_osd_req_destroy(obj_request->osd_req);

	/* Release whatever data the request carried */
	rbd_assert(obj_request_type_valid(obj_request->type));
	switch (obj_request->type) {
	case OBJ_REQUEST_NODATA:
		break;		/* Nothing to do */
	case OBJ_REQUEST_BIO:
		if (obj_request->bio_list)
			bio_chain_put(obj_request->bio_list);
		break;
	case OBJ_REQUEST_PAGES:
		if (obj_request->pages)
			ceph_release_page_vector(obj_request->pages,
						obj_request->page_count);
		break;
	}

	/* The object name was allocated along with the request itself */
	kfree(obj_request);
}
1430
1431/*
1432 * Caller is responsible for filling in the list of object requests
1433 * that comprises the image request, and the Linux request pointer
1434 * (if there is one).
1435 */
1436struct rbd_img_request *rbd_img_request_create(struct rbd_device *rbd_dev,
1437 u64 offset, u64 length,
1438 bool write_request)
1439{
1440 struct rbd_img_request *img_request;
1441 struct ceph_snap_context *snapc = NULL;
1442
1443 img_request = kmalloc(sizeof (*img_request), GFP_ATOMIC);
1444 if (!img_request)
1445 return NULL;
1446
1447 if (write_request) {
1448 down_read(&rbd_dev->header_rwsem);
1449 snapc = ceph_get_snap_context(rbd_dev->header.snapc);
1450 up_read(&rbd_dev->header_rwsem);
1451 if (WARN_ON(!snapc)) {
1452 kfree(img_request);
1453 return NULL; /* Shouldn't happen */
1454 }
1455 }
1456
1457 img_request->rq = NULL;
1458 img_request->rbd_dev = rbd_dev;
1459 img_request->offset = offset;
1460 img_request->length = length;
1461 img_request->write_request = write_request;
1462 if (write_request)
1463 img_request->snapc = snapc;
1464 else
1465 img_request->snap_id = rbd_dev->spec->snap_id;
1466 spin_lock_init(&img_request->completion_lock);
1467 img_request->next_completion = 0;
1468 img_request->callback = NULL;
1469 img_request->obj_request_count = 0;
1470 INIT_LIST_HEAD(&img_request->obj_requests);
1471 kref_init(&img_request->kref);
1472
1473 rbd_img_request_get(img_request); /* Avoid a warning */
1474 rbd_img_request_put(img_request); /* TEMPORARY */
1475
1476 return img_request;
1477}
1478
/*
 * Final teardown of an image request, called when its last
 * reference is dropped.  Removes (and releases) all of its object
 * requests and drops the snapshot context taken for writes.
 */
static void rbd_img_request_destroy(struct kref *kref)
{
	struct rbd_img_request *img_request;
	struct rbd_obj_request *obj_request;
	struct rbd_obj_request *next_obj_request;

	img_request = container_of(kref, struct rbd_img_request, kref);

	for_each_obj_request_safe(img_request, obj_request, next_obj_request)
		rbd_img_obj_request_del(img_request, obj_request);
	rbd_assert(img_request->obj_request_count == 0);

	if (img_request->write_request)
		ceph_put_snap_context(img_request->snapc);

	kfree(img_request);
}
1496
1497static int rbd_img_request_fill_bio(struct rbd_img_request *img_request,
1498 struct bio *bio_list)
1499{
1500 struct rbd_device *rbd_dev = img_request->rbd_dev;
1501 struct rbd_obj_request *obj_request = NULL;
1502 struct rbd_obj_request *next_obj_request;
1503 unsigned int bio_offset;
1504 u64 image_offset;
1505 u64 resid;
1506 u16 opcode;
1507
1508 opcode = img_request->write_request ? CEPH_OSD_OP_WRITE
1509 : CEPH_OSD_OP_READ;
1510 bio_offset = 0;
1511 image_offset = img_request->offset;
1512 rbd_assert(image_offset == bio_list->bi_sector << SECTOR_SHIFT);
1513 resid = img_request->length;
1514 while (resid) {
1515 const char *object_name;
1516 unsigned int clone_size;
1517 struct ceph_osd_req_op *op;
1518 u64 offset;
1519 u64 length;
1520
1521 object_name = rbd_segment_name(rbd_dev, image_offset);
1522 if (!object_name)
1523 goto out_unwind;
1524 offset = rbd_segment_offset(rbd_dev, image_offset);
1525 length = rbd_segment_length(rbd_dev, image_offset, resid);
1526 obj_request = rbd_obj_request_create(object_name,
1527 offset, length,
1528 OBJ_REQUEST_BIO);
1529 kfree(object_name); /* object request has its own copy */
1530 if (!obj_request)
1531 goto out_unwind;
1532
1533 rbd_assert(length <= (u64) UINT_MAX);
1534 clone_size = (unsigned int) length;
1535 obj_request->bio_list = bio_chain_clone_range(&bio_list,
1536 &bio_offset, clone_size,
1537 GFP_ATOMIC);
1538 if (!obj_request->bio_list)
1539 goto out_partial;
1540
1541 /*
1542 * Build up the op to use in building the osd
1543 * request. Note that the contents of the op are
1544 * copied by rbd_osd_req_create().
1545 */
1546 op = rbd_osd_req_op_create(opcode, offset, length);
1547 if (!op)
1548 goto out_partial;
1549 obj_request->osd_req = rbd_osd_req_create(rbd_dev,
1550 img_request->write_request,
1551 obj_request, op);
1552 rbd_osd_req_op_destroy(op);
1553 if (!obj_request->osd_req)
1554 goto out_partial;
1555 /* status and version are initially zero-filled */
1556
1557 rbd_img_obj_request_add(img_request, obj_request);
1558
1559 image_offset += length;
1560 resid -= length;
1561 }
1562
1563 return 0;
1564
1565out_partial:
1566 rbd_obj_request_put(obj_request);
1567out_unwind:
1568 for_each_obj_request_safe(img_request, obj_request, next_obj_request)
1569 rbd_obj_request_put(obj_request);
1570
1571 return -ENOMEM;
1572}
1573
/*
 * Per-object completion callback for requests belonging to an image
 * request.  Object requests can finish in any order, but results
 * must be fed to blk_end_request() in order; completions are
 * therefore accounted starting at ->next_completion, stopping at
 * the first request not yet done.
 */
static void rbd_img_obj_callback(struct rbd_obj_request *obj_request)
{
	struct rbd_img_request *img_request;
	u32 which = obj_request->which;
	bool more = true;

	img_request = obj_request->img_request;
	rbd_assert(img_request != NULL);
	rbd_assert(img_request->rq != NULL);
	rbd_assert(which != BAD_WHICH);
	rbd_assert(which < img_request->obj_request_count);
	rbd_assert(which >= img_request->next_completion);

	spin_lock_irq(&img_request->completion_lock);
	/* An earlier request is still outstanding; process later */
	if (which != img_request->next_completion)
		goto out;

	for_each_obj_request_from(img_request, obj_request) {
		unsigned int xferred;
		int result;

		rbd_assert(more);
		rbd_assert(which < img_request->obj_request_count);

		if (!atomic_read(&obj_request->done))
			break;

		rbd_assert(obj_request->xferred <= (u64) UINT_MAX);
		xferred = (unsigned int) obj_request->xferred;
		result = (int) obj_request->result;
		if (result)
			rbd_warn(NULL, "obj_request %s result %d xferred %u\n",
				img_request->write_request ? "write" : "read",
				result, xferred);

		more = blk_end_request(img_request->rq, result, xferred);
		which++;
	}
	/* Either more requests remain, or we completed the whole set */
	rbd_assert(more ^ (which == img_request->obj_request_count));
	img_request->next_completion = which;
out:
	spin_unlock_irq(&img_request->completion_lock);

	if (!more)
		rbd_img_request_complete(img_request);
}
1620
/*
 * Submit each of an image request's object requests to the osd
 * client.  Returns the first submission error, if any; requests
 * submitted before the failure are not recalled.
 */
static int rbd_img_request_submit(struct rbd_img_request *img_request)
{
	struct rbd_device *rbd_dev = img_request->rbd_dev;
	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
	struct rbd_obj_request *obj_request;

	for_each_obj_request(img_request, obj_request) {
		int ret;

		obj_request->callback = rbd_img_obj_callback;
		ret = rbd_obj_request_submit(osdc, obj_request);
		if (ret)
			return ret;
		/*
		 * The image request has its own reference to each
		 * of its object requests, so we can safely drop the
		 * initial one here.
		 */
		rbd_obj_request_put(obj_request);
	}

	return 0;
}
1644
Alex Eldercf81b602013-01-17 12:18:46 -06001645static int rbd_obj_notify_ack(struct rbd_device *rbd_dev,
Alex Elderb8d70032012-11-30 17:53:04 -06001646 u64 ver, u64 notify_id)
1647{
1648 struct rbd_obj_request *obj_request;
1649 struct ceph_osd_req_op *op;
1650 struct ceph_osd_client *osdc;
1651 int ret;
1652
1653 obj_request = rbd_obj_request_create(rbd_dev->header_name, 0, 0,
1654 OBJ_REQUEST_NODATA);
1655 if (!obj_request)
1656 return -ENOMEM;
1657
1658 ret = -ENOMEM;
1659 op = rbd_osd_req_op_create(CEPH_OSD_OP_NOTIFY_ACK, notify_id, ver);
1660 if (!op)
1661 goto out;
1662 obj_request->osd_req = rbd_osd_req_create(rbd_dev, false,
1663 obj_request, op);
1664 rbd_osd_req_op_destroy(op);
1665 if (!obj_request->osd_req)
1666 goto out;
1667
1668 osdc = &rbd_dev->rbd_client->client->osdc;
Alex Eldercf81b602013-01-17 12:18:46 -06001669 obj_request->callback = rbd_obj_request_put;
Alex Elderb8d70032012-11-30 17:53:04 -06001670 ret = rbd_obj_request_submit(osdc, obj_request);
Alex Elderb8d70032012-11-30 17:53:04 -06001671out:
Alex Eldercf81b602013-01-17 12:18:46 -06001672 if (ret)
1673 rbd_obj_request_put(obj_request);
Alex Elderb8d70032012-11-30 17:53:04 -06001674
1675 return ret;
1676}
1677
1678static void rbd_watch_cb(u64 ver, u64 notify_id, u8 opcode, void *data)
1679{
1680 struct rbd_device *rbd_dev = (struct rbd_device *)data;
1681 u64 hver;
1682 int rc;
1683
1684 if (!rbd_dev)
1685 return;
1686
1687 dout("rbd_watch_cb %s notify_id=%llu opcode=%u\n",
1688 rbd_dev->header_name, (unsigned long long) notify_id,
1689 (unsigned int) opcode);
1690 rc = rbd_dev_refresh(rbd_dev, &hver);
1691 if (rc)
1692 rbd_warn(rbd_dev, "got notification but failed to "
1693 " update snaps: %d\n", rc);
1694
Alex Eldercf81b602013-01-17 12:18:46 -06001695 rbd_obj_notify_ack(rbd_dev, hver, notify_id);
Alex Elderb8d70032012-11-30 17:53:04 -06001696}
1697
/*
 * Request sync osd watch/unwatch.  The value of "start" determines
 * whether a watch request is being initiated or torn down.
 *
 * Returns 0 on success or a negative errno.  On successful start,
 * rbd_dev->watch_event and rbd_dev->watch_request are left set; on
 * successful teardown (and on any error) both are cleared.
 */
static int rbd_dev_header_watch_sync(struct rbd_device *rbd_dev, int start)
{
	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
	struct rbd_obj_request *obj_request;
	struct ceph_osd_req_op *op;
	int ret;

	/* Starting requires no existing event/request; teardown
	 * requires both to exist already. */
	rbd_assert(start ^ !!rbd_dev->watch_event);
	rbd_assert(start ^ !!rbd_dev->watch_request);

	if (start) {
		ret = ceph_osdc_create_event(osdc, rbd_watch_cb, 0, rbd_dev,
						&rbd_dev->watch_event);
		if (ret < 0)
			return ret;
		rbd_assert(rbd_dev->watch_event != NULL);
	}

	/* Build a data-less watch/unwatch op on the header object */
	ret = -ENOMEM;
	obj_request = rbd_obj_request_create(rbd_dev->header_name, 0, 0,
							OBJ_REQUEST_NODATA);
	if (!obj_request)
		goto out_cancel;

	op = rbd_osd_req_op_create(CEPH_OSD_OP_WATCH,
				rbd_dev->watch_event->cookie,
				rbd_dev->header.obj_version, start);
	if (!op)
		goto out_cancel;
	obj_request->osd_req = rbd_osd_req_create(rbd_dev, true,
							obj_request, op);
	rbd_osd_req_op_destroy(op);
	if (!obj_request->osd_req)
		goto out_cancel;

	/* Linger state must be set up/removed before submitting */
	if (start)
		ceph_osdc_set_request_linger(osdc, obj_request->osd_req);
	else
		ceph_osdc_unregister_linger_request(osdc,
					rbd_dev->watch_request->osd_req);
	ret = rbd_obj_request_submit(osdc, obj_request);
	if (ret)
		goto out_cancel;
	ret = rbd_obj_request_wait(obj_request);
	if (ret)
		goto out_cancel;
	ret = obj_request->result;
	if (ret)
		goto out_cancel;

	/*
	 * A watch request is set to linger, so the underlying osd
	 * request won't go away until we unregister it.  We retain
	 * a pointer to the object request during that time (in
	 * rbd_dev->watch_request), so we'll keep a reference to
	 * it.  We'll drop that reference (below) after we've
	 * unregistered it.
	 */
	if (start) {
		rbd_dev->watch_request = obj_request;

		return 0;
	}

	/* We have successfully torn down the watch request */

	rbd_obj_request_put(rbd_dev->watch_request);
	rbd_dev->watch_request = NULL;
out_cancel:
	/* Cancel the event if we're tearing down, or on error */
	ceph_osdc_cancel_event(rbd_dev->watch_event);
	rbd_dev->watch_event = NULL;
	if (obj_request)
		rbd_obj_request_put(obj_request);

	return ret;
}
1779
/*
 * Synchronous osd object method call
 *
 * Execute class @class_name method @method_name on @object_name.
 * @outbound/@outbound_size supply the request payload (may be
 * NULL/0); up to @inbound_size bytes of reply data are copied into
 * @inbound.  If @version is non-null the object version is returned
 * through it.  Returns the number of reply bytes copied on success,
 * or a negative errno on failure.
 */
static int rbd_obj_method_sync(struct rbd_device *rbd_dev,
			     const char *object_name,
			     const char *class_name,
			     const char *method_name,
			     const char *outbound,
			     size_t outbound_size,
			     char *inbound,
			     size_t inbound_size,
			     u64 *version)
{
	struct rbd_obj_request *obj_request;
	struct ceph_osd_client *osdc;
	struct ceph_osd_req_op *op;
	struct page **pages;
	u32 page_count;
	int ret;

	/*
	 * Method calls are ultimately read operations but they
	 * don't involve object data (so no offset or length).
	 * The result should be placed into the inbound buffer
	 * provided.  They also supply outbound data--parameters for
	 * the object method.  Currently if this is present it will
	 * be a snapshot id.
	 */
	page_count = (u32) calc_pages_for(0, inbound_size);
	pages = ceph_alloc_page_vector(page_count, GFP_KERNEL);
	if (IS_ERR(pages))
		return PTR_ERR(pages);

	ret = -ENOMEM;
	obj_request = rbd_obj_request_create(object_name, 0, 0,
							OBJ_REQUEST_PAGES);
	if (!obj_request)
		goto out;

	/* From here on the object request owns the page vector */
	obj_request->pages = pages;
	obj_request->page_count = page_count;

	op = rbd_osd_req_op_create(CEPH_OSD_OP_CALL, class_name,
					method_name, outbound, outbound_size);
	if (!op)
		goto out;
	obj_request->osd_req = rbd_osd_req_create(rbd_dev, false,
						obj_request, op);
	/* The osd request made its own copy of the op */
	rbd_osd_req_op_destroy(op);
	if (!obj_request->osd_req)
		goto out;

	osdc = &rbd_dev->rbd_client->client->osdc;
	ret = rbd_obj_request_submit(osdc, obj_request);
	if (ret)
		goto out;
	ret = rbd_obj_request_wait(obj_request);
	if (ret)
		goto out;

	ret = obj_request->result;
	if (ret < 0)
		goto out;
	/* ret becomes the number of bytes copied out of the reply */
	ret = ceph_copy_from_page_vector(pages, inbound, 0,
					obj_request->xferred);
	if (version)
		*version = obj_request->version;
out:
	/* If no object request was created, the pages are still ours */
	if (obj_request)
		rbd_obj_request_put(obj_request);
	else
		ceph_release_page_vector(pages, page_count);

	return ret;
}
1855
/*
 * Block-layer request function for rbd's request queue.
 *
 * Entered with q->queue_lock held (standard request_fn contract).
 * For each fetched filesystem request the lock is dropped while an
 * image request covering the request's byte range is built and
 * submitted, then retaken before fetching the next one.  Requests
 * that fail before submission are ended immediately with an error.
 */
static void rbd_request_fn(struct request_queue *q)
{
	struct rbd_device *rbd_dev = q->queuedata;
	bool read_only = rbd_dev->mapping.read_only;
	struct request *rq;
	int result;

	while ((rq = blk_fetch_request(q))) {
		bool write_request = rq_data_dir(rq) == WRITE;
		struct rbd_img_request *img_request;
		u64 offset;
		u64 length;

		/* Ignore any non-FS requests that filter through. */

		if (rq->cmd_type != REQ_TYPE_FS) {
			__blk_end_request_all(rq, 0);
			continue;
		}

		/* Everything below runs without the queue lock held */
		spin_unlock_irq(q->queue_lock);

		/* Disallow writes to a read-only device */

		if (write_request) {
			result = -EROFS;
			if (read_only)
				goto end_request;
			/* Writes are only valid against the base image */
			rbd_assert(rbd_dev->spec->snap_id == CEPH_NOSNAP);
		}

		/*
		 * Quit early if the mapped snapshot no longer
		 * exists.  It's still possible the snapshot will
		 * have disappeared by the time our request arrives
		 * at the osd, but there's no sense in sending it if
		 * we already know.
		 */
		if (!test_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags)) {
			dout("request for non-existent snapshot");
			rbd_assert(rbd_dev->spec->snap_id != CEPH_NOSNAP);
			result = -ENXIO;
			goto end_request;
		}

		offset = (u64) blk_rq_pos(rq) << SECTOR_SHIFT;
		length = (u64) blk_rq_bytes(rq);

		/* Reject a range whose end would overflow 64 bits */
		result = -EINVAL;
		if (WARN_ON(offset && length > U64_MAX - offset + 1))
			goto end_request;	/* Shouldn't happen */

		result = -ENOMEM;
		img_request = rbd_img_request_create(rbd_dev, offset, length,
							write_request);
		if (!img_request)
			goto end_request;

		img_request->rq = rq;

		result = rbd_img_request_fill_bio(img_request, rq->bio);
		if (!result)
			result = rbd_img_request_submit(img_request);
		if (result)
			rbd_img_request_put(img_request);
end_request:
		spin_lock_irq(q->queue_lock);
		if (result < 0) {
			rbd_warn(rbd_dev, "obj_request %s result %d\n",
				write_request ? "write" : "read", result);
			__blk_end_request_all(rq, result);
		}
	}
}
1930
/*
 * a queue callback.  Makes sure that we don't create a bio that spans across
 * multiple osd objects.  One exception would be with a single page bios,
 * which we handle later at bio_chain_clone_range().
 *
 * Returns the number of bytes of @bvec that may be added to the bio
 * described by @bmd without crossing an rbd object boundary.
 */
static int rbd_merge_bvec(struct request_queue *q, struct bvec_merge_data *bmd,
			  struct bio_vec *bvec)
{
	struct rbd_device *rbd_dev = q->queuedata;
	sector_t sector_offset;
	sector_t sectors_per_obj;
	sector_t obj_sector_offset;
	int ret;

	/*
	 * Find how far into its rbd object the partition-relative
	 * bio start sector is to offset relative to the enclosing
	 * device.
	 */
	sector_offset = get_start_sect(bmd->bi_bdev) + bmd->bi_sector;
	sectors_per_obj = 1 << (rbd_dev->header.obj_order - SECTOR_SHIFT);
	obj_sector_offset = sector_offset & (sectors_per_obj - 1);

	/*
	 * Compute the number of bytes from that offset to the end
	 * of the object.  Account for what's already used by the bio.
	 */
	ret = (int) (sectors_per_obj - obj_sector_offset) << SECTOR_SHIFT;
	if (ret > bmd->bi_size)
		ret -= bmd->bi_size;
	else
		ret = 0;

	/*
	 * Don't send back more than was asked for.  And if the bio
	 * was empty, let the whole thing through because:  "Note
	 * that a block device *must* allow a single page to be
	 * added to an empty bio."
	 */
	rbd_assert(bvec->bv_len <= PAGE_SIZE);
	if (ret > (int) bvec->bv_len || !bmd->bi_size)
		ret = (int) bvec->bv_len;

	return ret;
}
1976
1977static void rbd_free_disk(struct rbd_device *rbd_dev)
1978{
1979 struct gendisk *disk = rbd_dev->disk;
1980
1981 if (!disk)
1982 return;
1983
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001984 if (disk->flags & GENHD_FL_UP)
1985 del_gendisk(disk);
1986 if (disk->queue)
1987 blk_cleanup_queue(disk->queue);
1988 put_disk(disk);
1989}
1990
Alex Elder788e2df2013-01-17 12:25:27 -06001991static int rbd_obj_read_sync(struct rbd_device *rbd_dev,
1992 const char *object_name,
1993 u64 offset, u64 length,
1994 char *buf, u64 *version)
1995
1996{
1997 struct ceph_osd_req_op *op;
1998 struct rbd_obj_request *obj_request;
1999 struct ceph_osd_client *osdc;
2000 struct page **pages = NULL;
2001 u32 page_count;
2002 int ret;
2003
2004 page_count = (u32) calc_pages_for(offset, length);
2005 pages = ceph_alloc_page_vector(page_count, GFP_KERNEL);
2006 if (IS_ERR(pages))
2007 ret = PTR_ERR(pages);
2008
2009 ret = -ENOMEM;
2010 obj_request = rbd_obj_request_create(object_name, offset, length,
Alex Elder36be9a72013-01-19 00:30:28 -06002011 OBJ_REQUEST_PAGES);
Alex Elder788e2df2013-01-17 12:25:27 -06002012 if (!obj_request)
2013 goto out;
2014
2015 obj_request->pages = pages;
2016 obj_request->page_count = page_count;
2017
2018 op = rbd_osd_req_op_create(CEPH_OSD_OP_READ, offset, length);
2019 if (!op)
2020 goto out;
2021 obj_request->osd_req = rbd_osd_req_create(rbd_dev, false,
2022 obj_request, op);
2023 rbd_osd_req_op_destroy(op);
2024 if (!obj_request->osd_req)
2025 goto out;
2026
2027 osdc = &rbd_dev->rbd_client->client->osdc;
2028 ret = rbd_obj_request_submit(osdc, obj_request);
2029 if (ret)
2030 goto out;
2031 ret = rbd_obj_request_wait(obj_request);
2032 if (ret)
2033 goto out;
2034
2035 ret = obj_request->result;
2036 if (ret < 0)
2037 goto out;
2038 ret = ceph_copy_from_page_vector(pages, buf, 0, obj_request->xferred);
2039 if (version)
2040 *version = obj_request->version;
2041out:
2042 if (obj_request)
2043 rbd_obj_request_put(obj_request);
2044 else
2045 ceph_release_page_vector(pages, page_count);
2046
2047 return ret;
2048}
2049
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002050/*
Alex Elder4156d992012-08-02 11:29:46 -05002051 * Read the complete header for the given rbd device.
2052 *
2053 * Returns a pointer to a dynamically-allocated buffer containing
2054 * the complete and validated header. Caller can pass the address
2055 * of a variable that will be filled in with the version of the
2056 * header object at the time it was read.
2057 *
2058 * Returns a pointer-coded errno if a failure occurs.
2059 */
2060static struct rbd_image_header_ondisk *
2061rbd_dev_v1_header_read(struct rbd_device *rbd_dev, u64 *version)
2062{
2063 struct rbd_image_header_ondisk *ondisk = NULL;
2064 u32 snap_count = 0;
2065 u64 names_size = 0;
2066 u32 want_count;
2067 int ret;
2068
2069 /*
2070 * The complete header will include an array of its 64-bit
2071 * snapshot ids, followed by the names of those snapshots as
2072 * a contiguous block of NUL-terminated strings. Note that
2073 * the number of snapshots could change by the time we read
2074 * it in, in which case we re-read it.
2075 */
2076 do {
2077 size_t size;
2078
2079 kfree(ondisk);
2080
2081 size = sizeof (*ondisk);
2082 size += snap_count * sizeof (struct rbd_image_snap_ondisk);
2083 size += names_size;
2084 ondisk = kmalloc(size, GFP_KERNEL);
2085 if (!ondisk)
2086 return ERR_PTR(-ENOMEM);
2087
Alex Elder788e2df2013-01-17 12:25:27 -06002088 ret = rbd_obj_read_sync(rbd_dev, rbd_dev->header_name,
Alex Elder4156d992012-08-02 11:29:46 -05002089 0, size,
2090 (char *) ondisk, version);
2091
2092 if (ret < 0)
2093 goto out_err;
2094 if (WARN_ON((size_t) ret < size)) {
2095 ret = -ENXIO;
Alex Elder06ecc6c2012-11-01 10:17:15 -05002096 rbd_warn(rbd_dev, "short header read (want %zd got %d)",
2097 size, ret);
Alex Elder4156d992012-08-02 11:29:46 -05002098 goto out_err;
2099 }
2100 if (!rbd_dev_ondisk_valid(ondisk)) {
2101 ret = -ENXIO;
Alex Elder06ecc6c2012-11-01 10:17:15 -05002102 rbd_warn(rbd_dev, "invalid header");
Alex Elder4156d992012-08-02 11:29:46 -05002103 goto out_err;
2104 }
2105
2106 names_size = le64_to_cpu(ondisk->snap_names_len);
2107 want_count = snap_count;
2108 snap_count = le32_to_cpu(ondisk->snap_count);
2109 } while (snap_count != want_count);
2110
2111 return ondisk;
2112
2113out_err:
2114 kfree(ondisk);
2115
2116 return ERR_PTR(ret);
2117}
2118
2119/*
2120 * reload the ondisk the header
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002121 */
2122static int rbd_read_header(struct rbd_device *rbd_dev,
2123 struct rbd_image_header *header)
2124{
Alex Elder4156d992012-08-02 11:29:46 -05002125 struct rbd_image_header_ondisk *ondisk;
2126 u64 ver = 0;
2127 int ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002128
Alex Elder4156d992012-08-02 11:29:46 -05002129 ondisk = rbd_dev_v1_header_read(rbd_dev, &ver);
2130 if (IS_ERR(ondisk))
2131 return PTR_ERR(ondisk);
2132 ret = rbd_header_from_disk(header, ondisk);
2133 if (ret >= 0)
2134 header->obj_version = ver;
2135 kfree(ondisk);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002136
Alex Elder4156d992012-08-02 11:29:46 -05002137 return ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002138}
2139
Alex Elder41f38c22012-10-25 23:34:40 -05002140static void rbd_remove_all_snaps(struct rbd_device *rbd_dev)
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002141{
2142 struct rbd_snap *snap;
Alex Eldera0593292012-07-19 09:09:27 -05002143 struct rbd_snap *next;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002144
Alex Eldera0593292012-07-19 09:09:27 -05002145 list_for_each_entry_safe(snap, next, &rbd_dev->snaps, node)
Alex Elder41f38c22012-10-25 23:34:40 -05002146 rbd_remove_snap_dev(snap);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002147}
2148
Alex Elder94785542012-10-09 13:50:17 -07002149static void rbd_update_mapping_size(struct rbd_device *rbd_dev)
2150{
2151 sector_t size;
2152
Alex Elder0d7dbfc2012-10-25 23:34:41 -05002153 if (rbd_dev->spec->snap_id != CEPH_NOSNAP)
Alex Elder94785542012-10-09 13:50:17 -07002154 return;
2155
2156 size = (sector_t) rbd_dev->header.image_size / SECTOR_SIZE;
2157 dout("setting size to %llu sectors", (unsigned long long) size);
2158 rbd_dev->mapping.size = (u64) size;
2159 set_capacity(rbd_dev->disk, size);
2160}
2161
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002162/*
2163 * only read the first part of the ondisk header, without the snaps info
2164 */
Alex Elder117973f2012-08-31 17:29:55 -05002165static int rbd_dev_v1_refresh(struct rbd_device *rbd_dev, u64 *hver)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002166{
2167 int ret;
2168 struct rbd_image_header h;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002169
2170 ret = rbd_read_header(rbd_dev, &h);
2171 if (ret < 0)
2172 return ret;
2173
Josh Durgina51aa0c2011-12-05 10:35:04 -08002174 down_write(&rbd_dev->header_rwsem);
2175
Alex Elder94785542012-10-09 13:50:17 -07002176 /* Update image size, and check for resize of mapped image */
2177 rbd_dev->header.image_size = h.image_size;
2178 rbd_update_mapping_size(rbd_dev);
Sage Weil9db4b3e2011-04-19 22:49:06 -07002179
Alex Elder849b4262012-07-09 21:04:24 -05002180 /* rbd_dev->header.object_prefix shouldn't change */
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002181 kfree(rbd_dev->header.snap_sizes);
Alex Elder849b4262012-07-09 21:04:24 -05002182 kfree(rbd_dev->header.snap_names);
Josh Durgind1d25642011-12-05 14:03:05 -08002183 /* osd requests may still refer to snapc */
2184 ceph_put_snap_context(rbd_dev->header.snapc);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002185
Alex Elderb8136232012-07-25 09:32:41 -05002186 if (hver)
2187 *hver = h.obj_version;
Josh Durgina71b8912011-12-05 18:10:44 -08002188 rbd_dev->header.obj_version = h.obj_version;
Josh Durgin93a24e02011-12-05 10:41:28 -08002189 rbd_dev->header.image_size = h.image_size;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002190 rbd_dev->header.snapc = h.snapc;
2191 rbd_dev->header.snap_names = h.snap_names;
2192 rbd_dev->header.snap_sizes = h.snap_sizes;
Alex Elder849b4262012-07-09 21:04:24 -05002193 /* Free the extra copy of the object prefix */
2194 WARN_ON(strcmp(rbd_dev->header.object_prefix, h.object_prefix));
2195 kfree(h.object_prefix);
2196
Alex Elder304f6802012-08-31 17:29:52 -05002197 ret = rbd_dev_snaps_update(rbd_dev);
2198 if (!ret)
2199 ret = rbd_dev_snaps_register(rbd_dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002200
Josh Durginc6666012011-11-21 17:11:12 -08002201 up_write(&rbd_dev->header_rwsem);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002202
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002203 return ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002204}
2205
Alex Elder117973f2012-08-31 17:29:55 -05002206static int rbd_dev_refresh(struct rbd_device *rbd_dev, u64 *hver)
Alex Elder1fe5e992012-07-25 09:32:41 -05002207{
2208 int ret;
2209
Alex Elder117973f2012-08-31 17:29:55 -05002210 rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
Alex Elder1fe5e992012-07-25 09:32:41 -05002211 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
Alex Elder117973f2012-08-31 17:29:55 -05002212 if (rbd_dev->image_format == 1)
2213 ret = rbd_dev_v1_refresh(rbd_dev, hver);
2214 else
2215 ret = rbd_dev_v2_refresh(rbd_dev, hver);
Alex Elder1fe5e992012-07-25 09:32:41 -05002216 mutex_unlock(&ctl_mutex);
2217
2218 return ret;
2219}
2220
/*
 * Allocate and configure the gendisk and request queue for an rbd
 * device.  The disk is not yet added to the system here (no
 * add_disk()).  Returns 0 on success or -ENOMEM on failure.
 */
static int rbd_init_disk(struct rbd_device *rbd_dev)
{
	struct gendisk *disk;
	struct request_queue *q;
	u64 segment_size;

	/* create gendisk info */
	disk = alloc_disk(RBD_MINORS_PER_MAJOR);
	if (!disk)
		return -ENOMEM;

	snprintf(disk->disk_name, sizeof(disk->disk_name), RBD_DRV_NAME "%d",
		 rbd_dev->dev_id);
	disk->major = rbd_dev->major;
	disk->first_minor = 0;
	disk->fops = &rbd_bd_ops;
	disk->private_data = rbd_dev;

	q = blk_init_queue(rbd_request_fn, &rbd_dev->lock);
	if (!q)
		goto out_disk;

	/* We use the default size, but let's be explicit about it. */
	blk_queue_physical_block_size(q, SECTOR_SIZE);

	/* set io sizes to object size */
	segment_size = rbd_obj_bytes(&rbd_dev->header);
	blk_queue_max_hw_sectors(q, segment_size / SECTOR_SIZE);
	blk_queue_max_segment_size(q, segment_size);
	blk_queue_io_min(q, segment_size);
	blk_queue_io_opt(q, segment_size);

	/* Keep bios from spanning rbd objects */
	blk_queue_merge_bvec(q, rbd_merge_bvec);
	disk->queue = q;

	q->queuedata = rbd_dev;

	rbd_dev->disk = disk;

	set_capacity(rbd_dev->disk, rbd_dev->mapping.size / SECTOR_SIZE);

	return 0;
out_disk:
	put_disk(disk);

	return -ENOMEM;
}
2268
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002269/*
2270 sysfs
2271*/
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002272
/* Map a sysfs struct device back to its enclosing rbd_device. */
static struct rbd_device *dev_to_rbd_dev(struct device *dev)
{
	return container_of(dev, struct rbd_device, dev);
}
2277
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002278static ssize_t rbd_size_show(struct device *dev,
2279 struct device_attribute *attr, char *buf)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002280{
Alex Elder593a9e72012-02-07 12:03:37 -06002281 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Josh Durgina51aa0c2011-12-05 10:35:04 -08002282 sector_t size;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002283
Josh Durgina51aa0c2011-12-05 10:35:04 -08002284 down_read(&rbd_dev->header_rwsem);
2285 size = get_capacity(rbd_dev->disk);
2286 up_read(&rbd_dev->header_rwsem);
2287
2288 return sprintf(buf, "%llu\n", (unsigned long long) size * SECTOR_SIZE);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002289}
2290
Alex Elder34b13182012-07-13 20:35:12 -05002291/*
2292 * Note this shows the features for whatever's mapped, which is not
2293 * necessarily the base image.
2294 */
2295static ssize_t rbd_features_show(struct device *dev,
2296 struct device_attribute *attr, char *buf)
2297{
2298 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
2299
2300 return sprintf(buf, "0x%016llx\n",
2301 (unsigned long long) rbd_dev->mapping.features);
2302}
2303
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002304static ssize_t rbd_major_show(struct device *dev,
2305 struct device_attribute *attr, char *buf)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002306{
Alex Elder593a9e72012-02-07 12:03:37 -06002307 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002308
2309 return sprintf(buf, "%d\n", rbd_dev->major);
2310}
2311
2312static ssize_t rbd_client_id_show(struct device *dev,
2313 struct device_attribute *attr, char *buf)
2314{
Alex Elder593a9e72012-02-07 12:03:37 -06002315 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002316
Alex Elder1dbb4392012-01-24 10:08:37 -06002317 return sprintf(buf, "client%lld\n",
2318 ceph_client_id(rbd_dev->rbd_client->client));
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002319}
2320
2321static ssize_t rbd_pool_show(struct device *dev,
2322 struct device_attribute *attr, char *buf)
2323{
Alex Elder593a9e72012-02-07 12:03:37 -06002324 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002325
Alex Elder0d7dbfc2012-10-25 23:34:41 -05002326 return sprintf(buf, "%s\n", rbd_dev->spec->pool_name);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002327}
2328
Alex Elder9bb2f332012-07-12 10:46:35 -05002329static ssize_t rbd_pool_id_show(struct device *dev,
2330 struct device_attribute *attr, char *buf)
2331{
2332 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
2333
Alex Elder0d7dbfc2012-10-25 23:34:41 -05002334 return sprintf(buf, "%llu\n",
2335 (unsigned long long) rbd_dev->spec->pool_id);
Alex Elder9bb2f332012-07-12 10:46:35 -05002336}
2337
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002338static ssize_t rbd_name_show(struct device *dev,
2339 struct device_attribute *attr, char *buf)
2340{
Alex Elder593a9e72012-02-07 12:03:37 -06002341 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002342
Alex Eldera92ffdf2012-10-30 19:40:33 -05002343 if (rbd_dev->spec->image_name)
2344 return sprintf(buf, "%s\n", rbd_dev->spec->image_name);
2345
2346 return sprintf(buf, "(unknown)\n");
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002347}
2348
Alex Elder589d30e2012-07-10 20:30:11 -05002349static ssize_t rbd_image_id_show(struct device *dev,
2350 struct device_attribute *attr, char *buf)
2351{
2352 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
2353
Alex Elder0d7dbfc2012-10-25 23:34:41 -05002354 return sprintf(buf, "%s\n", rbd_dev->spec->image_id);
Alex Elder589d30e2012-07-10 20:30:11 -05002355}
2356
Alex Elder34b13182012-07-13 20:35:12 -05002357/*
2358 * Shows the name of the currently-mapped snapshot (or
2359 * RBD_SNAP_HEAD_NAME for the base image).
2360 */
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002361static ssize_t rbd_snap_show(struct device *dev,
2362 struct device_attribute *attr,
2363 char *buf)
2364{
Alex Elder593a9e72012-02-07 12:03:37 -06002365 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002366
Alex Elder0d7dbfc2012-10-25 23:34:41 -05002367 return sprintf(buf, "%s\n", rbd_dev->spec->snap_name);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002368}
2369
Alex Elder86b00e02012-10-25 23:34:42 -05002370/*
2371 * For an rbd v2 image, shows the pool id, image id, and snapshot id
2372 * for the parent image. If there is no parent, simply shows
2373 * "(no parent image)".
2374 */
2375static ssize_t rbd_parent_show(struct device *dev,
2376 struct device_attribute *attr,
2377 char *buf)
2378{
2379 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
2380 struct rbd_spec *spec = rbd_dev->parent_spec;
2381 int count;
2382 char *bufp = buf;
2383
2384 if (!spec)
2385 return sprintf(buf, "(no parent image)\n");
2386
2387 count = sprintf(bufp, "pool_id %llu\npool_name %s\n",
2388 (unsigned long long) spec->pool_id, spec->pool_name);
2389 if (count < 0)
2390 return count;
2391 bufp += count;
2392
2393 count = sprintf(bufp, "image_id %s\nimage_name %s\n", spec->image_id,
2394 spec->image_name ? spec->image_name : "(unknown)");
2395 if (count < 0)
2396 return count;
2397 bufp += count;
2398
2399 count = sprintf(bufp, "snap_id %llu\nsnap_name %s\n",
2400 (unsigned long long) spec->snap_id, spec->snap_name);
2401 if (count < 0)
2402 return count;
2403 bufp += count;
2404
2405 count = sprintf(bufp, "overlap %llu\n", rbd_dev->parent_overlap);
2406 if (count < 0)
2407 return count;
2408 bufp += count;
2409
2410 return (ssize_t) (bufp - buf);
2411}
2412
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002413static ssize_t rbd_image_refresh(struct device *dev,
2414 struct device_attribute *attr,
2415 const char *buf,
2416 size_t size)
2417{
Alex Elder593a9e72012-02-07 12:03:37 -06002418 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Alex Elderb8136232012-07-25 09:32:41 -05002419 int ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002420
Alex Elder117973f2012-08-31 17:29:55 -05002421 ret = rbd_dev_refresh(rbd_dev, NULL);
Alex Elderb8136232012-07-25 09:32:41 -05002422
2423 return ret < 0 ? ret : size;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002424}
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002425
/* Attributes exported under /sys/bus/rbd/devices/<id>/ */
static DEVICE_ATTR(size, S_IRUGO, rbd_size_show, NULL);
static DEVICE_ATTR(features, S_IRUGO, rbd_features_show, NULL);
static DEVICE_ATTR(major, S_IRUGO, rbd_major_show, NULL);
static DEVICE_ATTR(client_id, S_IRUGO, rbd_client_id_show, NULL);
static DEVICE_ATTR(pool, S_IRUGO, rbd_pool_show, NULL);
static DEVICE_ATTR(pool_id, S_IRUGO, rbd_pool_id_show, NULL);
static DEVICE_ATTR(name, S_IRUGO, rbd_name_show, NULL);
static DEVICE_ATTR(image_id, S_IRUGO, rbd_image_id_show, NULL);
static DEVICE_ATTR(refresh, S_IWUSR, NULL, rbd_image_refresh);
static DEVICE_ATTR(current_snap, S_IRUGO, rbd_snap_show, NULL);
static DEVICE_ATTR(parent, S_IRUGO, rbd_parent_show, NULL);

static struct attribute *rbd_attrs[] = {
	&dev_attr_size.attr,
	&dev_attr_features.attr,
	&dev_attr_major.attr,
	&dev_attr_client_id.attr,
	&dev_attr_pool.attr,
	&dev_attr_pool_id.attr,
	&dev_attr_name.attr,
	&dev_attr_image_id.attr,
	&dev_attr_current_snap.attr,
	&dev_attr_parent.attr,
	&dev_attr_refresh.attr,
	NULL
};

static struct attribute_group rbd_attr_group = {
	.attrs = rbd_attrs,
};

static const struct attribute_group *rbd_attr_groups[] = {
	&rbd_attr_group,
	NULL
};

/* Nothing to free here; the rbd core removes the sysfs entries
 * explicitly before the device goes away. */
static void rbd_sysfs_dev_release(struct device *dev)
{
}

static struct device_type rbd_device_type = {
	.name = "rbd",
	.groups = rbd_attr_groups,
	.release = rbd_sysfs_dev_release,
};
2471
2472
2473/*
2474 sysfs - snapshots
2475*/
2476
2477static ssize_t rbd_snap_size_show(struct device *dev,
2478 struct device_attribute *attr,
2479 char *buf)
2480{
2481 struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
2482
Josh Durgin3591538f2011-12-05 18:25:13 -08002483 return sprintf(buf, "%llu\n", (unsigned long long)snap->size);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002484}
2485
2486static ssize_t rbd_snap_id_show(struct device *dev,
2487 struct device_attribute *attr,
2488 char *buf)
2489{
2490 struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
2491
Josh Durgin3591538f2011-12-05 18:25:13 -08002492 return sprintf(buf, "%llu\n", (unsigned long long)snap->id);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002493}
2494
Alex Elder34b13182012-07-13 20:35:12 -05002495static ssize_t rbd_snap_features_show(struct device *dev,
2496 struct device_attribute *attr,
2497 char *buf)
2498{
2499 struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
2500
2501 return sprintf(buf, "0x%016llx\n",
2502 (unsigned long long) snap->features);
2503}
2504
static DEVICE_ATTR(snap_size, S_IRUGO, rbd_snap_size_show, NULL);
static DEVICE_ATTR(snap_id, S_IRUGO, rbd_snap_id_show, NULL);
static DEVICE_ATTR(snap_features, S_IRUGO, rbd_snap_features_show, NULL);

/* Attributes exposed under each snapshot's sysfs directory */
static struct attribute *rbd_snap_attrs[] = {
	&dev_attr_snap_size.attr,
	&dev_attr_snap_id.attr,
	&dev_attr_snap_features.attr,
	NULL,
};

static struct attribute_group rbd_snap_attr_group = {
	.attrs = rbd_snap_attrs,
};

/*
 * Release callback for a snapshot device: frees the rbd_snap and the
 * name string it owns (allocated in __rbd_add_snap_dev()).
 */
static void rbd_snap_dev_release(struct device *dev)
{
	struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
	kfree(snap->name);
	kfree(snap);
}

static const struct attribute_group *rbd_snap_attr_groups[] = {
	&rbd_snap_attr_group,
	NULL
};

/* Set on snap->dev.type; also used as the "is registered" marker */
static struct device_type rbd_snap_device_type = {
	.groups = rbd_snap_attr_groups,
	.release = rbd_snap_dev_release,
};
2536
Alex Elder8b8fb992012-10-26 17:25:24 -05002537static struct rbd_spec *rbd_spec_get(struct rbd_spec *spec)
2538{
2539 kref_get(&spec->kref);
2540
2541 return spec;
2542}
2543
2544static void rbd_spec_free(struct kref *kref);
2545static void rbd_spec_put(struct rbd_spec *spec)
2546{
2547 if (spec)
2548 kref_put(&spec->kref, rbd_spec_free);
2549}
2550
2551static struct rbd_spec *rbd_spec_alloc(void)
2552{
2553 struct rbd_spec *spec;
2554
2555 spec = kzalloc(sizeof (*spec), GFP_KERNEL);
2556 if (!spec)
2557 return NULL;
2558 kref_init(&spec->kref);
2559
2560 rbd_spec_put(rbd_spec_get(spec)); /* TEMPORARY */
2561
2562 return spec;
2563}
2564
2565static void rbd_spec_free(struct kref *kref)
2566{
2567 struct rbd_spec *spec = container_of(kref, struct rbd_spec, kref);
2568
2569 kfree(spec->pool_name);
2570 kfree(spec->image_id);
2571 kfree(spec->image_name);
2572 kfree(spec->snap_name);
2573 kfree(spec);
2574}
2575
/*
 * Allocate and initialize an rbd_device.
 *
 * The new device records the given client and spec pointers; it
 * appears to take over the caller's references (rbd_dev_destroy()
 * drops them) -- NOTE(review): confirm against callers.
 *
 * Returns the new device, or NULL on allocation failure.
 */
struct rbd_device *rbd_dev_create(struct rbd_client *rbdc,
					struct rbd_spec *spec)
{
	struct rbd_device *rbd_dev;

	rbd_dev = kzalloc(sizeof (*rbd_dev), GFP_KERNEL);
	if (!rbd_dev)
		return NULL;

	spin_lock_init(&rbd_dev->lock);
	rbd_dev->flags = 0;
	INIT_LIST_HEAD(&rbd_dev->node);
	INIT_LIST_HEAD(&rbd_dev->snaps);
	init_rwsem(&rbd_dev->header_rwsem);

	rbd_dev->spec = spec;
	rbd_dev->rbd_client = rbdc;

	/* Initialize the layout used for all rbd requests */

	rbd_dev->layout.fl_stripe_unit = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
	rbd_dev->layout.fl_stripe_count = cpu_to_le32(1);
	rbd_dev->layout.fl_object_size = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
	rbd_dev->layout.fl_pg_pool = cpu_to_le32((u32) spec->pool_id);

	return rbd_dev;
}
2603
/*
 * Tear down an rbd_device created by rbd_dev_create(): drop the
 * parent and own spec references, release the client, and free the
 * header object name and the device itself.
 */
static void rbd_dev_destroy(struct rbd_device *rbd_dev)
{
	rbd_spec_put(rbd_dev->parent_spec);
	kfree(rbd_dev->header_name);
	rbd_put_client(rbd_dev->rbd_client);
	rbd_spec_put(rbd_dev->spec);
	kfree(rbd_dev);
}
2612
Alex Elder304f6802012-08-31 17:29:52 -05002613static bool rbd_snap_registered(struct rbd_snap *snap)
2614{
2615 bool ret = snap->dev.type == &rbd_snap_device_type;
2616 bool reg = device_is_registered(&snap->dev);
2617
2618 rbd_assert(!ret ^ reg);
2619
2620 return ret;
2621}
2622
Alex Elder41f38c22012-10-25 23:34:40 -05002623static void rbd_remove_snap_dev(struct rbd_snap *snap)
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002624{
2625 list_del(&snap->node);
Alex Elder304f6802012-08-31 17:29:52 -05002626 if (device_is_registered(&snap->dev))
2627 device_unregister(&snap->dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002628}
2629
Alex Elder14e70852012-07-19 09:09:27 -05002630static int rbd_register_snap_dev(struct rbd_snap *snap,
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002631 struct device *parent)
2632{
2633 struct device *dev = &snap->dev;
2634 int ret;
2635
2636 dev->type = &rbd_snap_device_type;
2637 dev->parent = parent;
2638 dev->release = rbd_snap_dev_release;
Alex Elderd4b125e2012-07-03 16:01:19 -05002639 dev_set_name(dev, "%s%s", RBD_SNAP_DEV_NAME_PREFIX, snap->name);
Alex Elder304f6802012-08-31 17:29:52 -05002640 dout("%s: registering device for snapshot %s\n", __func__, snap->name);
2641
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002642 ret = device_register(dev);
2643
2644 return ret;
2645}
2646
Alex Elder4e891e02012-07-10 20:30:10 -05002647static struct rbd_snap *__rbd_add_snap_dev(struct rbd_device *rbd_dev,
Alex Elderc8d18422012-07-10 20:30:11 -05002648 const char *snap_name,
Alex Elder34b13182012-07-13 20:35:12 -05002649 u64 snap_id, u64 snap_size,
2650 u64 snap_features)
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002651{
Alex Elder4e891e02012-07-10 20:30:10 -05002652 struct rbd_snap *snap;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002653 int ret;
Alex Elder4e891e02012-07-10 20:30:10 -05002654
2655 snap = kzalloc(sizeof (*snap), GFP_KERNEL);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002656 if (!snap)
Alex Elder4e891e02012-07-10 20:30:10 -05002657 return ERR_PTR(-ENOMEM);
2658
2659 ret = -ENOMEM;
Alex Elderc8d18422012-07-10 20:30:11 -05002660 snap->name = kstrdup(snap_name, GFP_KERNEL);
Alex Elder4e891e02012-07-10 20:30:10 -05002661 if (!snap->name)
2662 goto err;
2663
Alex Elderc8d18422012-07-10 20:30:11 -05002664 snap->id = snap_id;
2665 snap->size = snap_size;
Alex Elder34b13182012-07-13 20:35:12 -05002666 snap->features = snap_features;
Alex Elder4e891e02012-07-10 20:30:10 -05002667
2668 return snap;
2669
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002670err:
2671 kfree(snap->name);
2672 kfree(snap);
Alex Elder4e891e02012-07-10 20:30:10 -05002673
2674 return ERR_PTR(ret);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002675}
2676
Alex Eldercd892122012-07-03 16:01:19 -05002677static char *rbd_dev_v1_snap_info(struct rbd_device *rbd_dev, u32 which,
2678 u64 *snap_size, u64 *snap_features)
2679{
2680 char *snap_name;
2681
2682 rbd_assert(which < rbd_dev->header.snapc->num_snaps);
2683
2684 *snap_size = rbd_dev->header.snap_sizes[which];
2685 *snap_features = 0; /* No features for v1 */
2686
2687 /* Skip over names until we find the one we are looking for */
2688
2689 snap_name = rbd_dev->header.snap_names;
2690 while (which--)
2691 snap_name += strlen(snap_name) + 1;
2692
2693 return snap_name;
2694}
2695
/*
 * Get the size and object order for an image snapshot, or if
 * snap_id is CEPH_NOSNAP, gets this information for the base
 * image.
 *
 * Issues the "get_size" class method against the image header
 * object; the reply is a packed { u8 order; __le64 size; }.
 * Returns 0 on success (filling *order and *snap_size) or a
 * negative errno.
 */
static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id,
				u8 *order, u64 *snap_size)
{
	__le64 snapid = cpu_to_le64(snap_id);
	int ret;
	/* Matches the server's reply encoding -- must stay packed */
	struct {
		u8 order;
		__le64 size;
	} __attribute__ ((packed)) size_buf = { 0 };

	ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
				"rbd", "get_size",
				(char *) &snapid, sizeof (snapid),
				(char *) &size_buf, sizeof (size_buf), NULL);
	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
	if (ret < 0)
		return ret;

	*order = size_buf.order;
	*snap_size = le64_to_cpu(size_buf.size);

	dout(" snap_id 0x%016llx order = %u, snap_size = %llu\n",
		(unsigned long long) snap_id, (unsigned int) *order,
		(unsigned long long) *snap_size);

	return 0;
}
2728
/* Refresh the base image's object order and size in the header. */
static int rbd_dev_v2_image_size(struct rbd_device *rbd_dev)
{
	return _rbd_dev_v2_snap_size(rbd_dev, CEPH_NOSNAP,
					&rbd_dev->header.obj_order,
					&rbd_dev->header.image_size);
}
2735
/*
 * Fetch a format 2 image's object-name prefix via the
 * "get_object_prefix" class method and store the decoded string in
 * rbd_dev->header.object_prefix (allocated; owned by the header).
 * Returns 0 on success or a negative errno.
 */
static int rbd_dev_v2_object_prefix(struct rbd_device *rbd_dev)
{
	void *reply_buf;
	int ret;
	void *p;

	reply_buf = kzalloc(RBD_OBJ_PREFIX_LEN_MAX, GFP_KERNEL);
	if (!reply_buf)
		return -ENOMEM;

	ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
				"rbd", "get_object_prefix",
				NULL, 0,
				reply_buf, RBD_OBJ_PREFIX_LEN_MAX, NULL);
	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
	if (ret < 0)
		goto out;
	ret = 0;    /* rbd_obj_method_sync() can return positive */

	/* Decode the length-prefixed string from the reply buffer */
	p = reply_buf;
	rbd_dev->header.object_prefix = ceph_extract_encoded_string(&p,
						p + RBD_OBJ_PREFIX_LEN_MAX,
						NULL, GFP_NOIO);

	if (IS_ERR(rbd_dev->header.object_prefix)) {
		ret = PTR_ERR(rbd_dev->header.object_prefix);
		rbd_dev->header.object_prefix = NULL;
	} else {
		dout(" object_prefix = %s\n", rbd_dev->header.object_prefix);
	}

out:
	kfree(reply_buf);

	return ret;
}
2772
/*
 * Fetch the feature bits for the given snapshot (or the base image
 * when snap_id is CEPH_NOSNAP) of a format 2 image.
 *
 * Returns 0 and sets *snap_features on success.  Returns -ENXIO if
 * the image advertises incompatible features outside
 * RBD_FEATURES_ALL, or another negative errno on failure.
 */
static int _rbd_dev_v2_snap_features(struct rbd_device *rbd_dev, u64 snap_id,
		u64 *snap_features)
{
	__le64 snapid = cpu_to_le64(snap_id);
	/* Matches the "get_features" reply encoding */
	struct {
		__le64 features;
		__le64 incompat;
	} features_buf = { 0 };
	u64 incompat;
	int ret;

	ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
				"rbd", "get_features",
				(char *) &snapid, sizeof (snapid),
				(char *) &features_buf, sizeof (features_buf),
				NULL);
	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
	if (ret < 0)
		return ret;

	/* Refuse to map an image needing features we don't implement */
	incompat = le64_to_cpu(features_buf.incompat);
	if (incompat & ~RBD_FEATURES_ALL)
		return -ENXIO;

	*snap_features = le64_to_cpu(features_buf.features);

	dout(" snap_id 0x%016llx features = 0x%016llx incompat = 0x%016llx\n",
		(unsigned long long) snap_id,
		(unsigned long long) *snap_features,
		(unsigned long long) le64_to_cpu(features_buf.incompat));

	return 0;
}
2806
/* Refresh the base image's feature bits in the in-memory header. */
static int rbd_dev_v2_features(struct rbd_device *rbd_dev)
{
	return _rbd_dev_v2_snap_features(rbd_dev, CEPH_NOSNAP,
						&rbd_dev->header.features);
}
2812
Alex Elder86b00e02012-10-25 23:34:42 -05002813static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev)
2814{
2815 struct rbd_spec *parent_spec;
2816 size_t size;
2817 void *reply_buf = NULL;
2818 __le64 snapid;
2819 void *p;
2820 void *end;
2821 char *image_id;
2822 u64 overlap;
Alex Elder86b00e02012-10-25 23:34:42 -05002823 int ret;
2824
2825 parent_spec = rbd_spec_alloc();
2826 if (!parent_spec)
2827 return -ENOMEM;
2828
2829 size = sizeof (__le64) + /* pool_id */
2830 sizeof (__le32) + RBD_IMAGE_ID_LEN_MAX + /* image_id */
2831 sizeof (__le64) + /* snap_id */
2832 sizeof (__le64); /* overlap */
2833 reply_buf = kmalloc(size, GFP_KERNEL);
2834 if (!reply_buf) {
2835 ret = -ENOMEM;
2836 goto out_err;
2837 }
2838
2839 snapid = cpu_to_le64(CEPH_NOSNAP);
Alex Elder36be9a72013-01-19 00:30:28 -06002840 ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
Alex Elder86b00e02012-10-25 23:34:42 -05002841 "rbd", "get_parent",
2842 (char *) &snapid, sizeof (snapid),
Alex Elder07b23912012-11-09 08:43:16 -06002843 (char *) reply_buf, size, NULL);
Alex Elder36be9a72013-01-19 00:30:28 -06002844 dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
Alex Elder86b00e02012-10-25 23:34:42 -05002845 if (ret < 0)
2846 goto out_err;
2847
2848 ret = -ERANGE;
2849 p = reply_buf;
2850 end = (char *) reply_buf + size;
2851 ceph_decode_64_safe(&p, end, parent_spec->pool_id, out_err);
2852 if (parent_spec->pool_id == CEPH_NOPOOL)
2853 goto out; /* No parent? No problem. */
2854
Alex Elder0903e872012-11-14 12:25:19 -06002855 /* The ceph file layout needs to fit pool id in 32 bits */
2856
2857 ret = -EIO;
2858 if (WARN_ON(parent_spec->pool_id > (u64) U32_MAX))
2859 goto out;
2860
Alex Elder979ed482012-11-01 08:39:26 -05002861 image_id = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL);
Alex Elder86b00e02012-10-25 23:34:42 -05002862 if (IS_ERR(image_id)) {
2863 ret = PTR_ERR(image_id);
2864 goto out_err;
2865 }
2866 parent_spec->image_id = image_id;
2867 ceph_decode_64_safe(&p, end, parent_spec->snap_id, out_err);
2868 ceph_decode_64_safe(&p, end, overlap, out_err);
2869
2870 rbd_dev->parent_overlap = overlap;
2871 rbd_dev->parent_spec = parent_spec;
2872 parent_spec = NULL; /* rbd_dev now owns this */
2873out:
2874 ret = 0;
2875out_err:
2876 kfree(reply_buf);
2877 rbd_spec_put(parent_spec);
2878
2879 return ret;
2880}
2881
/*
 * Look up a format 2 image's user-visible name in the rbd directory
 * object ("dir_get_name"), given its image id.  Returns a newly
 * allocated name string (caller frees) or NULL on any failure --
 * callers treat the name as best-effort.
 */
static char *rbd_dev_image_name(struct rbd_device *rbd_dev)
{
	size_t image_id_size;
	char *image_id;
	void *p;
	void *end;
	size_t size;
	void *reply_buf = NULL;
	size_t len = 0;
	char *image_name = NULL;
	int ret;

	rbd_assert(!rbd_dev->spec->image_name);

	/* Build the request: a length-prefixed encoding of the image id */

	len = strlen(rbd_dev->spec->image_id);
	image_id_size = sizeof (__le32) + len;
	image_id = kmalloc(image_id_size, GFP_KERNEL);
	if (!image_id)
		return NULL;

	p = image_id;
	end = (char *) image_id + image_id_size;
	ceph_encode_string(&p, end, rbd_dev->spec->image_id, (u32) len);

	size = sizeof (__le32) + RBD_IMAGE_NAME_LEN_MAX;
	reply_buf = kmalloc(size, GFP_KERNEL);
	if (!reply_buf)
		goto out;

	ret = rbd_obj_method_sync(rbd_dev, RBD_DIRECTORY,
				"rbd", "dir_get_name",
				image_id, image_id_size,
				(char *) reply_buf, size, NULL);
	if (ret < 0)
		goto out;
	p = reply_buf;
	end = (char *) reply_buf + size;
	image_name = ceph_extract_encoded_string(&p, end, &len, GFP_KERNEL);
	if (IS_ERR(image_name))
		image_name = NULL;	/* failure here is tolerated */
	else
		dout("%s: name is %s len is %zd\n", __func__, image_name, len);
out:
	kfree(reply_buf);
	kfree(image_id);

	return image_name;
}
2930
2931/*
2932 * When a parent image gets probed, we only have the pool, image,
2933 * and snapshot ids but not the names of any of them. This call
2934 * is made later to fill in those names. It has to be done after
2935 * rbd_dev_snaps_update() has completed because some of the
2936 * information (in particular, snapshot name) is not available
2937 * until then.
2938 */
2939static int rbd_dev_probe_update_spec(struct rbd_device *rbd_dev)
2940{
2941 struct ceph_osd_client *osdc;
2942 const char *name;
2943 void *reply_buf = NULL;
2944 int ret;
2945
2946 if (rbd_dev->spec->pool_name)
2947 return 0; /* Already have the names */
2948
2949 /* Look up the pool name */
2950
2951 osdc = &rbd_dev->rbd_client->client->osdc;
2952 name = ceph_pg_pool_name_by_id(osdc->osdmap, rbd_dev->spec->pool_id);
Alex Elder935dc892012-11-01 10:17:15 -05002953 if (!name) {
2954 rbd_warn(rbd_dev, "there is no pool with id %llu",
2955 rbd_dev->spec->pool_id); /* Really a BUG() */
2956 return -EIO;
2957 }
Alex Elder9e15b772012-10-30 19:40:33 -05002958
2959 rbd_dev->spec->pool_name = kstrdup(name, GFP_KERNEL);
2960 if (!rbd_dev->spec->pool_name)
2961 return -ENOMEM;
2962
2963 /* Fetch the image name; tolerate failure here */
2964
2965 name = rbd_dev_image_name(rbd_dev);
Alex Elder69e7a022012-11-01 08:39:26 -05002966 if (name)
Alex Elder9e15b772012-10-30 19:40:33 -05002967 rbd_dev->spec->image_name = (char *) name;
Alex Elder69e7a022012-11-01 08:39:26 -05002968 else
Alex Elder06ecc6c2012-11-01 10:17:15 -05002969 rbd_warn(rbd_dev, "unable to get image name");
Alex Elder9e15b772012-10-30 19:40:33 -05002970
2971 /* Look up the snapshot name. */
2972
2973 name = rbd_snap_name(rbd_dev, rbd_dev->spec->snap_id);
2974 if (!name) {
Alex Elder935dc892012-11-01 10:17:15 -05002975 rbd_warn(rbd_dev, "no snapshot with id %llu",
2976 rbd_dev->spec->snap_id); /* Really a BUG() */
Alex Elder9e15b772012-10-30 19:40:33 -05002977 ret = -EIO;
2978 goto out_err;
2979 }
2980 rbd_dev->spec->snap_name = kstrdup(name, GFP_KERNEL);
2981 if(!rbd_dev->spec->snap_name)
2982 goto out_err;
2983
2984 return 0;
2985out_err:
2986 kfree(reply_buf);
2987 kfree(rbd_dev->spec->pool_name);
2988 rbd_dev->spec->pool_name = NULL;
2989
2990 return ret;
2991}
2992
Alex Elder6e14b1a2012-07-03 16:01:19 -05002993static int rbd_dev_v2_snap_context(struct rbd_device *rbd_dev, u64 *ver)
Alex Elder35d489f2012-07-03 16:01:19 -05002994{
2995 size_t size;
2996 int ret;
2997 void *reply_buf;
2998 void *p;
2999 void *end;
3000 u64 seq;
3001 u32 snap_count;
3002 struct ceph_snap_context *snapc;
3003 u32 i;
3004
3005 /*
3006 * We'll need room for the seq value (maximum snapshot id),
3007 * snapshot count, and array of that many snapshot ids.
3008 * For now we have a fixed upper limit on the number we're
3009 * prepared to receive.
3010 */
3011 size = sizeof (__le64) + sizeof (__le32) +
3012 RBD_MAX_SNAP_COUNT * sizeof (__le64);
3013 reply_buf = kzalloc(size, GFP_KERNEL);
3014 if (!reply_buf)
3015 return -ENOMEM;
3016
Alex Elder36be9a72013-01-19 00:30:28 -06003017 ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
Alex Elder35d489f2012-07-03 16:01:19 -05003018 "rbd", "get_snapcontext",
3019 NULL, 0,
Alex Elder07b23912012-11-09 08:43:16 -06003020 reply_buf, size, ver);
Alex Elder36be9a72013-01-19 00:30:28 -06003021 dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
Alex Elder35d489f2012-07-03 16:01:19 -05003022 if (ret < 0)
3023 goto out;
3024
3025 ret = -ERANGE;
3026 p = reply_buf;
3027 end = (char *) reply_buf + size;
3028 ceph_decode_64_safe(&p, end, seq, out);
3029 ceph_decode_32_safe(&p, end, snap_count, out);
3030
3031 /*
3032 * Make sure the reported number of snapshot ids wouldn't go
3033 * beyond the end of our buffer. But before checking that,
3034 * make sure the computed size of the snapshot context we
3035 * allocate is representable in a size_t.
3036 */
3037 if (snap_count > (SIZE_MAX - sizeof (struct ceph_snap_context))
3038 / sizeof (u64)) {
3039 ret = -EINVAL;
3040 goto out;
3041 }
3042 if (!ceph_has_room(&p, end, snap_count * sizeof (__le64)))
3043 goto out;
3044
3045 size = sizeof (struct ceph_snap_context) +
3046 snap_count * sizeof (snapc->snaps[0]);
3047 snapc = kmalloc(size, GFP_KERNEL);
3048 if (!snapc) {
3049 ret = -ENOMEM;
3050 goto out;
3051 }
3052
3053 atomic_set(&snapc->nref, 1);
3054 snapc->seq = seq;
3055 snapc->num_snaps = snap_count;
3056 for (i = 0; i < snap_count; i++)
3057 snapc->snaps[i] = ceph_decode_64(&p);
3058
3059 rbd_dev->header.snapc = snapc;
3060
3061 dout(" snap context seq = %llu, snap_count = %u\n",
3062 (unsigned long long) seq, (unsigned int) snap_count);
3063
3064out:
3065 kfree(reply_buf);
3066
3067 return 0;
3068}
3069
/*
 * Fetch the name of snapshot "which" (an index into the snapshot
 * context) via the "get_snapshot_name" class method.  Returns a
 * newly-allocated string (caller frees) or an ERR_PTR on failure.
 */
static char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev, u32 which)
{
	size_t size;
	void *reply_buf;
	__le64 snap_id;
	int ret;
	void *p;
	void *end;
	char *snap_name;

	size = sizeof (__le32) + RBD_MAX_SNAP_NAME_LEN;
	reply_buf = kmalloc(size, GFP_KERNEL);
	if (!reply_buf)
		return ERR_PTR(-ENOMEM);

	snap_id = cpu_to_le64(rbd_dev->header.snapc->snaps[which]);
	ret = rbd_obj_method_sync(rbd_dev, rbd_dev->header_name,
				"rbd", "get_snapshot_name",
				(char *) &snap_id, sizeof (snap_id),
				reply_buf, size, NULL);
	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
	if (ret < 0)
		goto out;

	p = reply_buf;
	end = (char *) reply_buf + size;
	snap_name = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL);
	if (IS_ERR(snap_name)) {
		ret = PTR_ERR(snap_name);
		goto out;
	} else {
		dout(" snap_id 0x%016llx snap_name = %s\n",
			(unsigned long long) le64_to_cpu(snap_id), snap_name);
	}
	kfree(reply_buf);

	return snap_name;
out:
	kfree(reply_buf);

	return ERR_PTR(ret);
}
3112
3113static char *rbd_dev_v2_snap_info(struct rbd_device *rbd_dev, u32 which,
3114 u64 *snap_size, u64 *snap_features)
3115{
Alex Eldere0b49862013-01-09 14:44:18 -06003116 u64 snap_id;
Alex Elderb8b1e2d2012-07-03 16:01:19 -05003117 u8 order;
3118 int ret;
3119
3120 snap_id = rbd_dev->header.snapc->snaps[which];
3121 ret = _rbd_dev_v2_snap_size(rbd_dev, snap_id, &order, snap_size);
3122 if (ret)
3123 return ERR_PTR(ret);
3124 ret = _rbd_dev_v2_snap_features(rbd_dev, snap_id, snap_features);
3125 if (ret)
3126 return ERR_PTR(ret);
3127
3128 return rbd_dev_v2_snap_name(rbd_dev, which);
3129}
3130
3131static char *rbd_dev_snap_info(struct rbd_device *rbd_dev, u32 which,
3132 u64 *snap_size, u64 *snap_features)
3133{
3134 if (rbd_dev->image_format == 1)
3135 return rbd_dev_v1_snap_info(rbd_dev, which,
3136 snap_size, snap_features);
3137 if (rbd_dev->image_format == 2)
3138 return rbd_dev_v2_snap_info(rbd_dev, which,
3139 snap_size, snap_features);
3140 return ERR_PTR(-EINVAL);
3141}
3142
Alex Elder117973f2012-08-31 17:29:55 -05003143static int rbd_dev_v2_refresh(struct rbd_device *rbd_dev, u64 *hver)
3144{
3145 int ret;
3146 __u8 obj_order;
3147
3148 down_write(&rbd_dev->header_rwsem);
3149
3150 /* Grab old order first, to see if it changes */
3151
3152 obj_order = rbd_dev->header.obj_order,
3153 ret = rbd_dev_v2_image_size(rbd_dev);
3154 if (ret)
3155 goto out;
3156 if (rbd_dev->header.obj_order != obj_order) {
3157 ret = -EIO;
3158 goto out;
3159 }
3160 rbd_update_mapping_size(rbd_dev);
3161
3162 ret = rbd_dev_v2_snap_context(rbd_dev, hver);
3163 dout("rbd_dev_v2_snap_context returned %d\n", ret);
3164 if (ret)
3165 goto out;
3166 ret = rbd_dev_snaps_update(rbd_dev);
3167 dout("rbd_dev_snaps_update returned %d\n", ret);
3168 if (ret)
3169 goto out;
3170 ret = rbd_dev_snaps_register(rbd_dev);
3171 dout("rbd_dev_snaps_register returned %d\n", ret);
3172out:
3173 up_write(&rbd_dev->header_rwsem);
3174
3175 return ret;
3176}
3177
/*
 * Scan the rbd device's current snapshot list and compare it to the
 * newly-received snapshot context. Remove any existing snapshots
 * not present in the new snapshot context. Add a new snapshot for
 * any snaphots in the snapshot context not in the current list.
 * And verify there are no changes to snapshots we already know
 * about.
 *
 * Assumes the snapshots in the snapshot context are sorted by
 * snapshot id, highest id first. (Snapshots in the rbd_dev's list
 * are also maintained in that order.)
 */
static int rbd_dev_snaps_update(struct rbd_device *rbd_dev)
{
	struct ceph_snap_context *snapc = rbd_dev->header.snapc;
	const u32 snap_count = snapc->num_snaps;
	struct list_head *head = &rbd_dev->snaps;
	struct list_head *links = head->next;
	u32 index = 0;

	dout("%s: snap count is %u\n", __func__, (unsigned int) snap_count);
	/* Merge-walk the two id-sorted sequences in lockstep */
	while (index < snap_count || links != head) {
		u64 snap_id;
		struct rbd_snap *snap;
		char *snap_name;
		u64 snap_size = 0;
		u64 snap_features = 0;

		/* CEPH_NOSNAP acts as a sentinel once either side runs out */
		snap_id = index < snap_count ? snapc->snaps[index]
					     : CEPH_NOSNAP;
		snap = links != head ? list_entry(links, struct rbd_snap, node)
				     : NULL;
		rbd_assert(!snap || snap->id != CEPH_NOSNAP);

		if (snap_id == CEPH_NOSNAP || (snap && snap->id > snap_id)) {
			struct list_head *next = links->next;

			/*
			 * A previously-existing snapshot is not in
			 * the new snap context.
			 *
			 * If the now missing snapshot is the one the
			 * image is mapped to, clear its exists flag
			 * so we can avoid sending any more requests
			 * to it.
			 */
			if (rbd_dev->spec->snap_id == snap->id)
				clear_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags);
			rbd_remove_snap_dev(snap);
			dout("%ssnap id %llu has been removed\n",
				rbd_dev->spec->snap_id == snap->id ?
								"mapped " : "",
				(unsigned long long) snap->id);

			/* Done with this list entry; advance */

			links = next;
			continue;
		}

		/*
		 * NOTE(review): for format 2 images snap_name is a fresh
		 * allocation; in the "already present" branch below it does
		 * not appear to be freed -- verify whether this leaks.
		 */
		snap_name = rbd_dev_snap_info(rbd_dev, index,
					&snap_size, &snap_features);
		if (IS_ERR(snap_name))
			return PTR_ERR(snap_name);

		dout("entry %u: snap_id = %llu\n", (unsigned int) snap_count,
			(unsigned long long) snap_id);
		if (!snap || (snap_id != CEPH_NOSNAP && snap->id < snap_id)) {
			struct rbd_snap *new_snap;

			/* We haven't seen this snapshot before */

			new_snap = __rbd_add_snap_dev(rbd_dev, snap_name,
					snap_id, snap_size, snap_features);
			if (IS_ERR(new_snap)) {
				int err = PTR_ERR(new_snap);

				dout(" failed to add dev, error %d\n", err);

				return err;
			}

			/* New goes before existing, or at end of list */

			dout(" added dev%s\n", snap ? "" : " at end\n");
			if (snap)
				list_add_tail(&new_snap->node, &snap->node);
			else
				list_add_tail(&new_snap->node, head);
		} else {
			/* Already have this one */

			dout(" already present\n");

			rbd_assert(snap->size == snap_size);
			rbd_assert(!strcmp(snap->name, snap_name));
			rbd_assert(snap->features == snap_features);

			/* Done with this list entry; advance */

			links = links->next;
		}

		/* Advance to the next entry in the snapshot context */

		index++;
	}
	dout("%s: done\n", __func__);

	return 0;
}
3289
Alex Elder304f6802012-08-31 17:29:52 -05003290/*
3291 * Scan the list of snapshots and register the devices for any that
3292 * have not already been registered.
3293 */
3294static int rbd_dev_snaps_register(struct rbd_device *rbd_dev)
3295{
3296 struct rbd_snap *snap;
3297 int ret = 0;
3298
3299 dout("%s called\n", __func__);
Alex Elder86ff77b2012-08-31 17:29:53 -05003300 if (WARN_ON(!device_is_registered(&rbd_dev->dev)))
3301 return -EIO;
Alex Elder304f6802012-08-31 17:29:52 -05003302
3303 list_for_each_entry(snap, &rbd_dev->snaps, node) {
3304 if (!rbd_snap_registered(snap)) {
3305 ret = rbd_register_snap_dev(snap, &rbd_dev->dev);
3306 if (ret < 0)
3307 break;
3308 }
3309 }
3310 dout("%s: returning %d\n", __func__, ret);
3311
3312 return ret;
3313}
3314
/*
 * Register the rbd device in sysfs under the rbd bus (it shows up
 * as /sys/bus/rbd/devices/<id>).  Taken with SINGLE_DEPTH_NESTING:
 * presumably a lockdep annotation because a caller already holds
 * ctl_mutex at the outer level -- NOTE(review): confirm callers.
 */
static int rbd_bus_add_dev(struct rbd_device *rbd_dev)
{
	struct device *dev;
	int ret;

	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);

	dev = &rbd_dev->dev;
	dev->bus = &rbd_bus_type;
	dev->type = &rbd_device_type;
	dev->parent = &rbd_root_dev;
	dev->release = rbd_dev_release;
	dev_set_name(dev, "%d", rbd_dev->dev_id);
	ret = device_register(dev);

	mutex_unlock(&ctl_mutex);

	return ret;
}
3334
/* Remove an rbd device's sysfs presence; counterpart of rbd_bus_add_dev(). */
static void rbd_bus_del_dev(struct rbd_device *rbd_dev)
{
	device_unregister(&rbd_dev->dev);
}
3339
/* Highest device id handed out so far; ids start at 1 (see below). */
static atomic64_t rbd_dev_id_max = ATOMIC64_INIT(0);

/*
 * Get a unique rbd identifier for the given new rbd_dev, and add
 * the rbd_dev to the global list. The minimum rbd id is 1.
 */
static void rbd_dev_id_get(struct rbd_device *rbd_dev)
{
	/* atomic64_inc_return() means the first id assigned is 1 */
	rbd_dev->dev_id = atomic64_inc_return(&rbd_dev_id_max);

	spin_lock(&rbd_dev_list_lock);
	list_add_tail(&rbd_dev->node, &rbd_dev_list);
	spin_unlock(&rbd_dev_list_lock);
	dout("rbd_dev %p given dev id %llu\n", rbd_dev,
		(unsigned long long) rbd_dev->dev_id);
}
Alex Elderb7f23c32012-01-29 13:57:43 -06003356
Alex Elder1ddbe942012-01-29 13:57:44 -06003357/*
Alex Elder499afd52012-02-02 08:13:29 -06003358 * Remove an rbd_dev from the global list, and record that its
3359 * identifier is no longer in use.
Alex Elder1ddbe942012-01-29 13:57:44 -06003360 */
static void rbd_dev_id_put(struct rbd_device *rbd_dev)
{
	struct list_head *tmp;
	int rbd_id = rbd_dev->dev_id;
	int max_id;

	/* Ids handed out by rbd_dev_id_get() start at 1 */
	rbd_assert(rbd_id > 0);

	dout("rbd_dev %p released dev id %llu\n", rbd_dev,
		(unsigned long long) rbd_dev->dev_id);
	spin_lock(&rbd_dev_list_lock);
	list_del_init(&rbd_dev->node);

	/*
	 * If the id being "put" is not the current maximum, there
	 * is nothing special we need to do.
	 */
	if (rbd_id != atomic64_read(&rbd_dev_id_max)) {
		spin_unlock(&rbd_dev_list_lock);
		return;
	}

	/*
	 * We need to update the current maximum id.  Search the
	 * list to find out what it is.  We're more likely to find
	 * the maximum at the end, so search the list backward.
	 * The list lock is still held, so the list is stable here.
	 */
	max_id = 0;
	list_for_each_prev(tmp, &rbd_dev_list) {
		/* Note: deliberately shadows the function parameter */
		struct rbd_device *rbd_dev;

		rbd_dev = list_entry(tmp, struct rbd_device, node);
		if (rbd_dev->dev_id > max_id)
			max_id = rbd_dev->dev_id;
	}
	spin_unlock(&rbd_dev_list_lock);

	/*
	 * The max id could have been updated by rbd_dev_id_get(), in
	 * which case it now accurately reflects the new maximum.
	 * Be careful not to overwrite the maximum value in that
	 * case.  cmpxchg only stores max_id if the counter still
	 * holds the id we just released.
	 */
	atomic64_cmpxchg(&rbd_dev_id_max, rbd_id, max_id);
	dout(" max dev id has been reset\n");
}
3407
Alex Eldera725f65e2012-02-02 08:13:30 -06003408/*
Alex Eldere28fff262012-02-02 08:13:30 -06003409 * Skips over white space at *buf, and updates *buf to point to the
3410 * first found non-space character (if any). Returns the length of
Alex Elder593a9e72012-02-07 12:03:37 -06003411 * the token (string of non-white space characters) found. Note
3412 * that *buf must be terminated with '\0'.
Alex Eldere28fff262012-02-02 08:13:30 -06003413 */
/*
 * Skip over white space at *buf, updating *buf to point at the first
 * non-space character (if any).  Returns the length of the token
 * (run of non-white-space characters) found, which may be zero.
 * *buf must be '\0'-terminated.
 */
static inline size_t next_token(const char **buf)
{
	/*
	 * The characters for which isspace() returns nonzero in the
	 * "C" and "POSIX" locales.
	 */
	static const char spaces[] = " \f\n\r\t\v";
	const char *p = *buf;

	p += strspn(p, spaces);		/* skip leading whitespace */
	*buf = p;			/* caller now points at token start */

	return strcspn(p, spaces);	/* token length */
}
3426
3427/*
3428 * Finds the next token in *buf, and if the provided token buffer is
3429 * big enough, copies the found token into it. The result, if
Alex Elder593a9e72012-02-07 12:03:37 -06003430 * copied, is guaranteed to be terminated with '\0'. Note that *buf
3431 * must be terminated with '\0' on entry.
Alex Eldere28fff262012-02-02 08:13:30 -06003432 *
3433 * Returns the length of the token found (not including the '\0').
3434 * Return value will be 0 if no token is found, and it will be >=
3435 * token_size if the token would not fit.
3436 *
Alex Elder593a9e72012-02-07 12:03:37 -06003437 * The *buf pointer will be updated to point beyond the end of the
Alex Eldere28fff262012-02-02 08:13:30 -06003438 * found token. Note that this occurs even if the token buffer is
3439 * too small to hold it.
3440 */
/*
 * Copy the next token from *buf into token[] if it fits (including
 * its terminating '\0').  *buf is always advanced past the token,
 * even when the token buffer is too small.  *buf must be
 * '\0'-terminated on entry.
 *
 * Returns the token length (excluding the '\0'): 0 when no token
 * was found, >= token_size when the token did not fit (in which
 * case token[] is left untouched).
 */
static inline size_t copy_token(const char **buf,
				char *token,
				size_t token_size)
{
	size_t len = next_token(buf);

	if (len < token_size) {
		memcpy(token, *buf, len);
		token[len] = '\0';
	}
	*buf += len;

	return len;
}
3456
3457/*
Alex Elderea3352f2012-07-09 21:04:23 -05003458 * Finds the next token in *buf, dynamically allocates a buffer big
3459 * enough to hold a copy of it, and copies the token into the new
3460 * buffer. The copy is guaranteed to be terminated with '\0'. Note
3461 * that a duplicate buffer is created even for a zero-length token.
3462 *
3463 * Returns a pointer to the newly-allocated duplicate, or a null
3464 * pointer if memory for the duplicate was not available. If
3465 * the lenp argument is a non-null pointer, the length of the token
3466 * (not including the '\0') is returned in *lenp.
3467 *
3468 * If successful, the *buf pointer will be updated to point beyond
3469 * the end of the found token.
3470 *
3471 * Note: uses GFP_KERNEL for allocation.
3472 */
3473static inline char *dup_token(const char **buf, size_t *lenp)
3474{
3475 char *dup;
3476 size_t len;
3477
3478 len = next_token(buf);
Alex Elder4caf35f2012-11-01 08:39:27 -05003479 dup = kmemdup(*buf, len + 1, GFP_KERNEL);
Alex Elderea3352f2012-07-09 21:04:23 -05003480 if (!dup)
3481 return NULL;
Alex Elderea3352f2012-07-09 21:04:23 -05003482 *(dup + len) = '\0';
3483 *buf += len;
3484
3485 if (lenp)
3486 *lenp = len;
3487
3488 return dup;
3489}
3490
3491/*
Alex Elder859c31d2012-10-25 23:34:42 -05003492 * Parse the options provided for an "rbd add" (i.e., rbd image
3493 * mapping) request. These arrive via a write to /sys/bus/rbd/add,
3494 * and the data written is passed here via a NUL-terminated buffer.
3495 * Returns 0 if successful or an error code otherwise.
Alex Elderd22f76e2012-07-12 10:46:35 -05003496 *
Alex Elder859c31d2012-10-25 23:34:42 -05003497 * The information extracted from these options is recorded in
3498 * the other parameters which return dynamically-allocated
3499 * structures:
3500 * ceph_opts
3501 * The address of a pointer that will refer to a ceph options
3502 * structure. Caller must release the returned pointer using
3503 * ceph_destroy_options() when it is no longer needed.
3504 * rbd_opts
3505 * Address of an rbd options pointer. Fully initialized by
3506 * this function; caller must release with kfree().
3507 * spec
3508 * Address of an rbd image specification pointer. Fully
3509 * initialized by this function based on parsed options.
3510 * Caller must release with rbd_spec_put().
3511 *
3512 * The options passed take this form:
3513 * <mon_addrs> <options> <pool_name> <image_name> [<snap_id>]
3514 * where:
3515 * <mon_addrs>
3516 * A comma-separated list of one or more monitor addresses.
3517 * A monitor address is an ip address, optionally followed
3518 * by a port number (separated by a colon).
3519 * I.e.: ip1[:port1][,ip2[:port2]...]
3520 * <options>
3521 * A comma-separated list of ceph and/or rbd options.
3522 * <pool_name>
3523 * The name of the rados pool containing the rbd image.
3524 * <image_name>
3525 * The name of the image in that pool to map.
3526 * <snap_id>
3527 * An optional snapshot id. If provided, the mapping will
3528 * present data from the image at the time that snapshot was
3529 * created. The image head is used if no snapshot id is
3530 * provided. Snapshot mappings are always read-only.
Alex Eldera725f65e2012-02-02 08:13:30 -06003531 */
static int rbd_add_parse_args(const char *buf,
				struct ceph_options **ceph_opts,
				struct rbd_options **opts,
				struct rbd_spec **rbd_spec)
{
	size_t len;
	char *options;
	const char *mon_addrs;
	size_t mon_addrs_size;
	struct rbd_spec *spec = NULL;
	struct rbd_options *rbd_opts = NULL;
	struct ceph_options *copts;
	int ret;

	/* The first four tokens are required */

	len = next_token(&buf);
	if (!len) {
		rbd_warn(NULL, "no monitor address(es) provided");
		return -EINVAL;
	}
	/* Monitor addresses are not duplicated; parsed in place below */
	mon_addrs = buf;
	mon_addrs_size = len + 1;
	buf += len;

	/* Default error for the "empty token" checks that goto out_err */
	ret = -EINVAL;
	options = dup_token(&buf, NULL);
	if (!options)
		return -ENOMEM;
	if (!*options) {
		rbd_warn(NULL, "no options provided");
		goto out_err;
	}

	spec = rbd_spec_alloc();
	if (!spec)
		goto out_mem;

	spec->pool_name = dup_token(&buf, NULL);
	if (!spec->pool_name)
		goto out_mem;
	if (!*spec->pool_name) {
		rbd_warn(NULL, "no pool name provided");
		goto out_err;
	}

	spec->image_name = dup_token(&buf, NULL);
	if (!spec->image_name)
		goto out_mem;
	if (!*spec->image_name) {
		rbd_warn(NULL, "no image name provided");
		goto out_err;
	}

	/*
	 * Snapshot name is optional; default is to use "-"
	 * (indicating the head/no snapshot).
	 */
	len = next_token(&buf);
	if (!len) {
		buf = RBD_SNAP_HEAD_NAME; /* No snapshot supplied */
		len = sizeof (RBD_SNAP_HEAD_NAME) - 1;
	} else if (len > RBD_MAX_SNAP_NAME_LEN) {
		ret = -ENAMETOOLONG;
		goto out_err;
	}
	/* Copies len + 1 bytes; the final byte is then forced to '\0' */
	spec->snap_name = kmemdup(buf, len + 1, GFP_KERNEL);
	if (!spec->snap_name)
		goto out_mem;
	*(spec->snap_name + len) = '\0';

	/* Initialize all rbd options to the defaults */

	rbd_opts = kzalloc(sizeof (*rbd_opts), GFP_KERNEL);
	if (!rbd_opts)
		goto out_mem;

	rbd_opts->read_only = RBD_READ_ONLY_DEFAULT;

	/* rbd-specific option tokens are routed to parse_rbd_opts_token() */
	copts = ceph_parse_options(options, mon_addrs,
					mon_addrs + mon_addrs_size - 1,
					parse_rbd_opts_token, rbd_opts);
	if (IS_ERR(copts)) {
		ret = PTR_ERR(copts);
		goto out_err;
	}
	kfree(options);

	/* Success: hand ownership of all three structures to the caller */
	*ceph_opts = copts;
	*opts = rbd_opts;
	*rbd_spec = spec;

	return 0;
out_mem:
	ret = -ENOMEM;
out_err:
	kfree(rbd_opts);
	rbd_spec_put(spec);
	kfree(options);

	return ret;
}
3634
Alex Elder589d30e2012-07-10 20:30:11 -05003635/*
3636 * An rbd format 2 image has a unique identifier, distinct from the
3637 * name given to it by the user. Internally, that identifier is
3638 * what's used to specify the names of objects related to the image.
3639 *
3640 * A special "rbd id" object is used to map an rbd image name to its
3641 * id. If that object doesn't exist, then there is no v2 rbd image
3642 * with the supplied name.
3643 *
3644 * This function will record the given rbd_dev's image_id field if
3645 * it can be determined, and in that case will return 0. If any
3646 * errors occur a negative errno will be returned and the rbd_dev's
3647 * image_id field will be unchanged (and should be NULL).
3648 */
static int rbd_dev_image_id(struct rbd_device *rbd_dev)
{
	int ret;
	size_t size;
	char *object_name;
	void *response;
	void *p;

	/*
	 * When probing a parent image, the image id is already
	 * known (and the image name likely is not).  There's no
	 * need to fetch the image id again in this case.
	 */
	if (rbd_dev->spec->image_id)
		return 0;

	/*
	 * First, see if the format 2 image id file exists, and if
	 * so, get the image's persistent id from it.
	 */
	/* sizeof (RBD_ID_PREFIX) accounts for the terminating '\0' */
	size = sizeof (RBD_ID_PREFIX) + strlen(rbd_dev->spec->image_name);
	object_name = kmalloc(size, GFP_NOIO);
	if (!object_name)
		return -ENOMEM;
	sprintf(object_name, "%s%s", RBD_ID_PREFIX, rbd_dev->spec->image_name);
	dout("rbd id object name is %s\n", object_name);

	/* Response will be an encoded string, which includes a length */

	size = sizeof (__le32) + RBD_IMAGE_ID_LEN_MAX;
	response = kzalloc(size, GFP_NOIO);
	if (!response) {
		ret = -ENOMEM;
		goto out;
	}

	/* Invoke the "get_id" method of the "rbd" class on the id object */
	ret = rbd_obj_method_sync(rbd_dev, object_name,
				"rbd", "get_id",
				NULL, 0,
				response, RBD_IMAGE_ID_LEN_MAX, NULL);
	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
	if (ret < 0)
		goto out;
	ret = 0;	/* rbd_obj_method_sync() can return positive */

	/*
	 * Decode the length-prefixed string; on success this hands
	 * back a newly-allocated copy of the id, otherwise an ERR_PTR.
	 */
	p = response;
	rbd_dev->spec->image_id = ceph_extract_encoded_string(&p,
				p + RBD_IMAGE_ID_LEN_MAX,
				NULL, GFP_NOIO);
	if (IS_ERR(rbd_dev->spec->image_id)) {
		ret = PTR_ERR(rbd_dev->spec->image_id);
		rbd_dev->spec->image_id = NULL;
	} else {
		dout("image_id is %s\n", rbd_dev->spec->image_id);
	}
out:
	kfree(response);
	kfree(object_name);

	return ret;
}
3710
/*
 * Probe for a format 1 ("v1") rbd image.  Records the image id
 * (v1 images have none, so the empty string is used), builds the
 * header object name from the image name, and reads the on-disk
 * header metadata.  Returns 0 on success or a negative errno; on
 * failure the fields set here are freed and cleared again.
 */
static int rbd_dev_v1_probe(struct rbd_device *rbd_dev)
{
	int ret;
	size_t size;

	/* Version 1 images have no id; empty string is used */

	rbd_dev->spec->image_id = kstrdup("", GFP_KERNEL);
	if (!rbd_dev->spec->image_id)
		return -ENOMEM;

	/* Record the header object name for this rbd image. */

	/* sizeof (RBD_SUFFIX) accounts for the terminating '\0' */
	size = strlen(rbd_dev->spec->image_name) + sizeof (RBD_SUFFIX);
	rbd_dev->header_name = kmalloc(size, GFP_KERNEL);
	if (!rbd_dev->header_name) {
		ret = -ENOMEM;
		goto out_err;
	}
	sprintf(rbd_dev->header_name, "%s%s",
		rbd_dev->spec->image_name, RBD_SUFFIX);

	/* Populate rbd image metadata */

	ret = rbd_read_header(rbd_dev, &rbd_dev->header);
	if (ret < 0)
		goto out_err;

	/* Version 1 images have no parent (no layering) */

	rbd_dev->parent_spec = NULL;
	rbd_dev->parent_overlap = 0;

	rbd_dev->image_format = 1;

	dout("discovered version 1 image, header name is %s\n",
		rbd_dev->header_name);

	return 0;

out_err:
	kfree(rbd_dev->header_name);
	rbd_dev->header_name = NULL;
	kfree(rbd_dev->spec->image_id);
	rbd_dev->spec->image_id = NULL;

	return ret;
}
3759
/*
 * Probe for a format 2 ("v2") rbd image, whose image id has already
 * been filled in by the caller.  Builds the header object name from
 * the image id, then fetches the image's size/order, object prefix,
 * features, optional parent info, and snapshot context.  Returns 0
 * on success or a negative errno; on error everything set up here
 * is freed and cleared again.
 */
static int rbd_dev_v2_probe(struct rbd_device *rbd_dev)
{
	size_t size;
	int ret;
	u64 ver = 0;

	/*
	 * Image id was filled in by the caller.  Record the header
	 * object name for this rbd image.
	 */
	/* sizeof (RBD_HEADER_PREFIX) accounts for the terminating '\0' */
	size = sizeof (RBD_HEADER_PREFIX) + strlen(rbd_dev->spec->image_id);
	rbd_dev->header_name = kmalloc(size, GFP_KERNEL);
	if (!rbd_dev->header_name)
		return -ENOMEM;
	sprintf(rbd_dev->header_name, "%s%s",
		RBD_HEADER_PREFIX, rbd_dev->spec->image_id);

	/* Get the size and object order for the image */

	ret = rbd_dev_v2_image_size(rbd_dev);
	if (ret < 0)
		goto out_err;

	/* Get the object prefix (a.k.a. block_name) for the image */

	ret = rbd_dev_v2_object_prefix(rbd_dev);
	if (ret < 0)
		goto out_err;

	/* Get and check the features for the image */

	ret = rbd_dev_v2_features(rbd_dev);
	if (ret < 0)
		goto out_err;

	/* If the image supports layering, get the parent info */

	if (rbd_dev->header.features & RBD_FEATURE_LAYERING) {
		ret = rbd_dev_v2_parent_info(rbd_dev);
		if (ret < 0)
			goto out_err;
	}

	/* crypto and compression type aren't (yet) supported for v2 images */

	rbd_dev->header.crypt_type = 0;
	rbd_dev->header.comp_type = 0;

	/* Get the snapshot context, plus the header version */

	ret = rbd_dev_v2_snap_context(rbd_dev, &ver);
	if (ret)
		goto out_err;
	rbd_dev->header.obj_version = ver;

	rbd_dev->image_format = 2;

	dout("discovered version 2 image, header name is %s\n",
		rbd_dev->header_name);

	return 0;
out_err:
	/* Undo (in reverse order) everything that may have been set up */
	rbd_dev->parent_overlap = 0;
	rbd_spec_put(rbd_dev->parent_spec);
	rbd_dev->parent_spec = NULL;
	kfree(rbd_dev->header_name);
	rbd_dev->header_name = NULL;
	kfree(rbd_dev->header.object_prefix);
	rbd_dev->header.object_prefix = NULL;

	return ret;
}
3832
/*
 * Complete the probe of an rbd device: refresh its snapshot list,
 * finish its spec and mapping, assign a device id, and set up the
 * block device, sysfs device, snapshot devices, and header watch.
 * On success the disk is announced via add_disk(); on failure,
 * everything set up here is torn down again.
 */
static int rbd_dev_probe_finish(struct rbd_device *rbd_dev)
{
	int ret;

	/* no need to lock here, as rbd_dev is not registered yet */
	ret = rbd_dev_snaps_update(rbd_dev);
	if (ret)
		return ret;

	ret = rbd_dev_probe_update_spec(rbd_dev);
	if (ret)
		goto err_out_snaps;

	ret = rbd_dev_set_mapping(rbd_dev);
	if (ret)
		goto err_out_snaps;

	/* generate unique id: find highest unique id, add one */
	rbd_dev_id_get(rbd_dev);

	/* Fill in the device name, now that we have its id. */
	BUILD_BUG_ON(DEV_NAME_LEN
			< sizeof (RBD_DRV_NAME) + MAX_INT_FORMAT_WIDTH);
	sprintf(rbd_dev->name, "%s%d", RBD_DRV_NAME, rbd_dev->dev_id);

	/* Get our block major device number. */

	/* A first argument of 0 requests a dynamically-allocated major */
	ret = register_blkdev(0, rbd_dev->name);
	if (ret < 0)
		goto err_out_id;
	rbd_dev->major = ret;

	/* Set up the blkdev mapping. */

	ret = rbd_init_disk(rbd_dev);
	if (ret)
		goto err_out_blkdev;

	ret = rbd_bus_add_dev(rbd_dev);
	if (ret)
		goto err_out_disk;

	/*
	 * At this point cleanup in the event of an error is the job
	 * of the sysfs code (initiated by rbd_bus_del_dev()).
	 */
	down_write(&rbd_dev->header_rwsem);
	ret = rbd_dev_snaps_register(rbd_dev);
	up_write(&rbd_dev->header_rwsem);
	if (ret)
		goto err_out_bus;

	ret = rbd_dev_header_watch_sync(rbd_dev, 1);
	if (ret)
		goto err_out_bus;

	/* Everything's ready.  Announce the disk to the world. */

	add_disk(rbd_dev->disk);

	pr_info("%s: added with size 0x%llx\n", rbd_dev->disk->disk_name,
		(unsigned long long) rbd_dev->mapping.size);

	return ret;
err_out_bus:
	/* this will also clean up rest of rbd_dev stuff */

	rbd_bus_del_dev(rbd_dev);

	return ret;
err_out_disk:
	rbd_free_disk(rbd_dev);
err_out_blkdev:
	unregister_blkdev(rbd_dev->major, rbd_dev->name);
err_out_id:
	rbd_dev_id_put(rbd_dev);
err_out_snaps:
	rbd_remove_all_snaps(rbd_dev);

	return ret;
}
3914
Alex Eldera30b71b2012-07-10 20:30:11 -05003915/*
3916 * Probe for the existence of the header object for the given rbd
3917 * device. For format 2 images this includes determining the image
3918 * id.
3919 */
3920static int rbd_dev_probe(struct rbd_device *rbd_dev)
3921{
3922 int ret;
3923
3924 /*
3925 * Get the id from the image id object. If it's not a
3926 * format 2 image, we'll get ENOENT back, and we'll assume
3927 * it's a format 1 image.
3928 */
3929 ret = rbd_dev_image_id(rbd_dev);
3930 if (ret)
3931 ret = rbd_dev_v1_probe(rbd_dev);
3932 else
3933 ret = rbd_dev_v2_probe(rbd_dev);
Alex Elder83a06262012-10-30 15:47:17 -05003934 if (ret) {
Alex Eldera30b71b2012-07-10 20:30:11 -05003935 dout("probe failed, returning %d\n", ret);
3936
Alex Elder83a06262012-10-30 15:47:17 -05003937 return ret;
3938 }
3939
3940 ret = rbd_dev_probe_finish(rbd_dev);
3941 if (ret)
3942 rbd_header_free(&rbd_dev->header);
3943
Alex Eldera30b71b2012-07-10 20:30:11 -05003944 return ret;
3945}
3946
/*
 * Handle a write to /sys/bus/rbd/add: parse the user-supplied
 * mapping description, connect to the cluster, pick the pool, and
 * probe and register the image as a block device.  Returns count
 * on success or a negative errno.  Note the chain of ownership
 * transfers below: each parsed structure is handed off exactly once
 * and NULLed so the error path frees only what is still ours.
 */
static ssize_t rbd_add(struct bus_type *bus,
		       const char *buf,
		       size_t count)
{
	struct rbd_device *rbd_dev = NULL;
	struct ceph_options *ceph_opts = NULL;
	struct rbd_options *rbd_opts = NULL;
	struct rbd_spec *spec = NULL;
	struct rbd_client *rbdc;
	struct ceph_osd_client *osdc;
	int rc = -ENOMEM;

	/* Hold a module reference while the mapping exists */
	if (!try_module_get(THIS_MODULE))
		return -ENODEV;

	/* parse add command */
	rc = rbd_add_parse_args(buf, &ceph_opts, &rbd_opts, &spec);
	if (rc < 0)
		goto err_out_module;

	rbdc = rbd_get_client(ceph_opts);
	if (IS_ERR(rbdc)) {
		rc = PTR_ERR(rbdc);
		goto err_out_args;
	}
	ceph_opts = NULL;	/* rbd_dev client now owns this */

	/* pick the pool */
	osdc = &rbdc->client->osdc;
	rc = ceph_pg_poolid_by_name(osdc->osdmap, spec->pool_name);
	if (rc < 0)
		goto err_out_client;
	spec->pool_id = (u64) rc;

	/* The ceph file layout needs to fit pool id in 32 bits */

	if (WARN_ON(spec->pool_id > (u64) U32_MAX)) {
		rc = -EIO;
		goto err_out_client;
	}

	rbd_dev = rbd_dev_create(rbdc, spec);
	if (!rbd_dev)
		goto err_out_client;
	rbdc = NULL;		/* rbd_dev now owns this */
	spec = NULL;		/* rbd_dev now owns this */

	rbd_dev->mapping.read_only = rbd_opts->read_only;
	kfree(rbd_opts);
	rbd_opts = NULL;	/* done with this */

	rc = rbd_dev_probe(rbd_dev);
	if (rc < 0)
		goto err_out_rbd_dev;

	return count;
err_out_rbd_dev:
	rbd_dev_destroy(rbd_dev);
err_out_client:
	rbd_put_client(rbdc);
err_out_args:
	if (ceph_opts)
		ceph_destroy_options(ceph_opts);
	kfree(rbd_opts);
	rbd_spec_put(spec);
err_out_module:
	module_put(THIS_MODULE);

	dout("Error adding device %s\n", buf);

	return (ssize_t) rc;
}
4019
Alex Elderde71a292012-07-03 16:01:19 -05004020static struct rbd_device *__rbd_get_dev(unsigned long dev_id)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004021{
4022 struct list_head *tmp;
4023 struct rbd_device *rbd_dev;
4024
Alex Eldere124a82f2012-01-29 13:57:44 -06004025 spin_lock(&rbd_dev_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004026 list_for_each(tmp, &rbd_dev_list) {
4027 rbd_dev = list_entry(tmp, struct rbd_device, node);
Alex Elderde71a292012-07-03 16:01:19 -05004028 if (rbd_dev->dev_id == dev_id) {
Alex Eldere124a82f2012-01-29 13:57:44 -06004029 spin_unlock(&rbd_dev_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004030 return rbd_dev;
Alex Eldere124a82f2012-01-29 13:57:44 -06004031 }
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004032 }
Alex Eldere124a82f2012-01-29 13:57:44 -06004033 spin_unlock(&rbd_dev_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004034 return NULL;
4035}
4036
/*
 * Release callback for the rbd device's embedded struct device,
 * set up in rbd_bus_add_dev().  Runs when the device's last
 * reference is dropped (removal is initiated by rbd_bus_del_dev()),
 * and undoes the setup performed at probe/add time.
 */
static void rbd_dev_release(struct device *dev)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);

	/* Tear down the header watch, if one was established */
	if (rbd_dev->watch_event)
		rbd_dev_header_watch_sync(rbd_dev, 0);

	/* clean up and free blkdev */
	rbd_free_disk(rbd_dev);
	unregister_blkdev(rbd_dev->major, rbd_dev->name);

	/* release allocated disk header fields */
	rbd_header_free(&rbd_dev->header);

	/* done with the id, and with the rbd_dev */
	rbd_dev_id_put(rbd_dev);
	rbd_assert(rbd_dev->rbd_client != NULL);
	rbd_dev_destroy(rbd_dev);

	/* release module ref (taken in rbd_add()) */
	module_put(THIS_MODULE);
}
4059
/*
 * Handle a write to /sys/bus/rbd/remove: parse the target device
 * id, look the device up, and (if it is not open) remove its
 * snapshots and unregister it.  Returns count on success or a
 * negative errno.
 */
static ssize_t rbd_remove(struct bus_type *bus,
			  const char *buf,
			  size_t count)
{
	struct rbd_device *rbd_dev = NULL;
	int target_id, rc;
	unsigned long ul;
	int ret = count;

	rc = strict_strtoul(buf, 10, &ul);
	if (rc)
		return rc;

	/* convert to int; abort if we lost anything in the conversion */
	target_id = (int) ul;
	if (target_id != ul)
		return -EINVAL;

	/* ctl_mutex is held across the lookup and the removal below */
	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);

	rbd_dev = __rbd_get_dev(target_id);
	if (!rbd_dev) {
		ret = -ENOENT;
		goto done;
	}

	/* Refuse to remove a device that is still open */
	if (rbd_dev->open_count) {
		ret = -EBUSY;
		goto done;
	}

	rbd_remove_all_snaps(rbd_dev);
	rbd_bus_del_dev(rbd_dev);

done:
	mutex_unlock(&ctl_mutex);

	return ret;
}
4099
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004100/*
4101 * create control files in sysfs
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004102 * /sys/bus/rbd/...
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004103 */
4104static int rbd_sysfs_init(void)
4105{
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004106 int ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004107
Alex Elderfed4c142012-02-07 12:03:36 -06004108 ret = device_register(&rbd_root_dev);
Alex Elder21079782012-01-24 10:08:36 -06004109 if (ret < 0)
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004110 return ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004111
Alex Elderfed4c142012-02-07 12:03:36 -06004112 ret = bus_register(&rbd_bus_type);
4113 if (ret < 0)
4114 device_unregister(&rbd_root_dev);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004115
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004116 return ret;
4117}
4118
/*
 * Tear down the sysfs entries created by rbd_sysfs_init(), in the
 * reverse order of their registration: bus first, root device last.
 */
static void rbd_sysfs_cleanup(void)
{
	bus_unregister(&rbd_bus_type);
	device_unregister(&rbd_root_dev);
}
4124
4125int __init rbd_init(void)
4126{
4127 int rc;
4128
4129 rc = rbd_sysfs_init();
4130 if (rc)
4131 return rc;
Alex Elderf0f8cef2012-01-29 13:57:44 -06004132 pr_info("loaded " RBD_DRV_NAME_LONG "\n");
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004133 return 0;
4134}
4135
/*
 * Module exit: remove the driver's sysfs hierarchy.  Unregistering
 * the bus triggers teardown of any devices still attached to it.
 */
void __exit rbd_exit(void)
{
	rbd_sysfs_cleanup();
}
4140
/* Register the module's entry and exit points */
module_init(rbd_init);
module_exit(rbd_exit);

MODULE_AUTHOR("Sage Weil <sage@newdream.net>");
MODULE_AUTHOR("Yehuda Sadeh <yehuda@hq.newdream.net>");
MODULE_DESCRIPTION("rados block device");

/* following authorship retained from original osdblk.c */
MODULE_AUTHOR("Jeff Garzik <jeff@garzik.org>");

MODULE_LICENSE("GPL");