blob: 3802a7857280cb530a751b2e25a70a586026e2a6 [file] [log] [blame]
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001/*
2 rbd.c -- Export ceph rados objects as a Linux block device
3
4
5 based on drivers/block/osdblk.c:
6
7 Copyright 2009 Red Hat, Inc.
8
9 This program is free software; you can redistribute it and/or modify
10 it under the terms of the GNU General Public License as published by
11 the Free Software Foundation.
12
13 This program is distributed in the hope that it will be useful,
14 but WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 GNU General Public License for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with this program; see the file COPYING. If not, write to
20 the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
21
22
23
Yehuda Sadehdfc56062010-11-19 14:51:04 -080024 For usage instructions, please refer to:
Yehuda Sadeh602adf42010-08-12 16:11:25 -070025
Yehuda Sadehdfc56062010-11-19 14:51:04 -080026 Documentation/ABI/testing/sysfs-bus-rbd
Yehuda Sadeh602adf42010-08-12 16:11:25 -070027
28 */
29
30#include <linux/ceph/libceph.h>
31#include <linux/ceph/osd_client.h>
32#include <linux/ceph/mon_client.h>
33#include <linux/ceph/decode.h>
Yehuda Sadeh59c2be12011-03-21 15:10:11 -070034#include <linux/parser.h>
Yehuda Sadeh602adf42010-08-12 16:11:25 -070035
36#include <linux/kernel.h>
37#include <linux/device.h>
38#include <linux/module.h>
39#include <linux/fs.h>
40#include <linux/blkdev.h>
41
42#include "rbd_types.h"
43
/* When RBD_DEBUG is defined, rbd_assert() performs a real runtime check */
#define RBD_DEBUG
45
/*
 * Block I/O is expressed in sectors.  Linux uses the notion of a
 * sector in several places (blk, bio, genhd), and by default it is
 * always 512 bytes.  Naming the shift and the size reads a little
 * better than sprinkling the raw numbers through the code.
 */
#define	SECTOR_SHIFT	9
#define	SECTOR_SIZE	(1ULL << SECTOR_SHIFT)
54
/*
 * Largest values of the fixed-width unsigned types.  These might be
 * better off defined somewhere more central than this file.
 */
#define	U8_MAX	((u8) (~0U))
#define	U16_MAX	((u16) (~0U))
#define	U32_MAX	((u32) (~0U))
#define	U64_MAX	((u64) (~0ULL))
Alex Elderdf111be2012-08-09 10:33:26 -070061
/* Driver names */
#define	RBD_DRV_NAME		"rbd"
#define	RBD_DRV_NAME_LONG	"rbd (rados block device)"

#define	RBD_MINORS_PER_MAJOR	256		/* max minors per blkdev */

/*
 * A snapshot's device name is its snapshot name with this prefix in
 * front, so the prefix length comes out of the NAME_MAX budget.
 */
#define	RBD_SNAP_DEV_NAME_PREFIX	"snap_"
#define	RBD_MAX_SNAP_NAME_LEN	\
			(NAME_MAX - (sizeof (RBD_SNAP_DEV_NAME_PREFIX) - 1))

#define	RBD_MAX_SNAP_COUNT	510	/* allows max snapc to fit in 4KB */
#define	RBD_MAX_OPT_LEN		1024

/* Snapshot name used to denote the image head (i.e., no snapshot) */
#define	RBD_SNAP_HEAD_NAME	"-"

/* This allows a single page to hold an image name sent by OSD */
#define	RBD_IMAGE_NAME_LEN_MAX	(PAGE_SIZE - sizeof (__le32) - 1)
#define	RBD_IMAGE_ID_LEN_MAX	64

#define	RBD_OBJ_PREFIX_LEN_MAX	64
Alex Elder589d30e2012-07-10 20:30:11 -050081
/* Feature bits */

#define	RBD_FEATURE_LAYERING	1

/*
 * The set of features this (client software) implementation
 * supports.  Currently that is none of them.
 */
#define	RBD_FEATURES_ALL	(0)
89
/*
 * RBD device names have the form "rbd#": the RBD_DRV_NAME string
 * followed by a unique integer identifier.  MAX_INT_FORMAT_WIDTH
 * bounds the number of characters needed to format an int, and is
 * used to make sure DEV_NAME_LEN can hold every possible name.
 */
#define	DEV_NAME_LEN		32
#define	MAX_INT_FORMAT_WIDTH	((5 * sizeof (int)) / 2 + 1)
Yehuda Sadeh602adf42010-08-12 16:11:25 -070098
/* Value for the read_only option when the user does not specify one */
#define	RBD_READ_ONLY_DEFAULT	false
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700100
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700101/*
102 * block device image metadata (in-memory version)
103 */
104struct rbd_image_header {
Alex Elderf84344f2012-08-31 17:29:51 -0500105 /* These four fields never change for a given rbd image */
Alex Elder849b4262012-07-09 21:04:24 -0500106 char *object_prefix;
Alex Elder34b13182012-07-13 20:35:12 -0500107 u64 features;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700108 __u8 obj_order;
109 __u8 crypt_type;
110 __u8 comp_type;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700111
Alex Elderf84344f2012-08-31 17:29:51 -0500112 /* The remaining fields need to be updated occasionally */
113 u64 image_size;
114 struct ceph_snap_context *snapc;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700115 char *snap_names;
116 u64 *snap_sizes;
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700117
118 u64 obj_version;
119};
120
Alex Elder0d7dbfc2012-10-25 23:34:41 -0500121/*
122 * An rbd image specification.
123 *
124 * The tuple (pool_id, image_id, snap_id) is sufficient to uniquely
Alex Elderc66c6e02012-11-01 08:39:26 -0500125 * identify an image. Each rbd_dev structure includes a pointer to
126 * an rbd_spec structure that encapsulates this identity.
127 *
128 * Each of the id's in an rbd_spec has an associated name. For a
129 * user-mapped image, the names are supplied and the id's associated
130 * with them are looked up. For a layered image, a parent image is
131 * defined by the tuple, and the names are looked up.
132 *
133 * An rbd_dev structure contains a parent_spec pointer which is
134 * non-null if the image it represents is a child in a layered
135 * image. This pointer will refer to the rbd_spec structure used
136 * by the parent rbd_dev for its own identity (i.e., the structure
137 * is shared between the parent and child).
138 *
139 * Since these structures are populated once, during the discovery
140 * phase of image construction, they are effectively immutable so
141 * we make no effort to synchronize access to them.
142 *
143 * Note that code herein does not assume the image name is known (it
144 * could be a null pointer).
Alex Elder0d7dbfc2012-10-25 23:34:41 -0500145 */
146struct rbd_spec {
147 u64 pool_id;
148 char *pool_name;
149
150 char *image_id;
Alex Elder0d7dbfc2012-10-25 23:34:41 -0500151 char *image_name;
Alex Elder0d7dbfc2012-10-25 23:34:41 -0500152
153 u64 snap_id;
154 char *snap_name;
155
156 struct kref kref;
157};
158
/* rbd-specific options, filled in by parse_rbd_opts_token() */
struct rbd_options {
	bool	read_only;	/* set by "read_only"/"ro", cleared by "read_write"/"rw" */
};
162
163/*
Alex Elderf0f8cef2012-01-29 13:57:44 -0600164 * an instance of the client. multiple devices may share an rbd client.
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700165 */
166struct rbd_client {
167 struct ceph_client *client;
168 struct kref kref;
169 struct list_head node;
170};
171
172/*
Alex Elderf0f8cef2012-01-29 13:57:44 -0600173 * a request completion status
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700174 */
Yehuda Sadeh1fec7092011-05-13 13:52:56 -0700175struct rbd_req_status {
176 int done;
Alex Elder8986cb32012-11-08 08:01:39 -0600177 s32 rc;
Yehuda Sadeh1fec7092011-05-13 13:52:56 -0700178 u64 bytes;
179};
180
181/*
182 * a collection of requests
183 */
184struct rbd_req_coll {
185 int total;
186 int num_done;
187 struct kref kref;
188 struct rbd_req_status status[0];
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700189};
190
Alex Elderf0f8cef2012-01-29 13:57:44 -0600191/*
192 * a single io request
193 */
194struct rbd_request {
195 struct request *rq; /* blk layer request */
196 struct bio *bio; /* cloned bio */
197 struct page **pages; /* list of used pages */
198 u64 len;
199 int coll_index;
200 struct rbd_req_coll *coll;
201};
202
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800203struct rbd_snap {
204 struct device dev;
205 const char *name;
Josh Durgin3591538f2011-12-05 18:25:13 -0800206 u64 size;
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800207 struct list_head node;
208 u64 id;
Alex Elder34b13182012-07-13 20:35:12 -0500209 u64 features;
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800210};
211
Alex Elderf84344f2012-08-31 17:29:51 -0500212struct rbd_mapping {
Alex Elder99c1f082012-08-30 14:42:15 -0500213 u64 size;
Alex Elder34b13182012-07-13 20:35:12 -0500214 u64 features;
Alex Elderf84344f2012-08-31 17:29:51 -0500215 bool read_only;
216};
217
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700218/*
219 * a single device
220 */
221struct rbd_device {
Alex Elderde71a292012-07-03 16:01:19 -0500222 int dev_id; /* blkdev unique id */
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700223
224 int major; /* blkdev assigned major */
225 struct gendisk *disk; /* blkdev's gendisk and rq */
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700226
Alex Eldera30b71b2012-07-10 20:30:11 -0500227 u32 image_format; /* Either 1 or 2 */
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700228 struct rbd_client *rbd_client;
229
230 char name[DEV_NAME_LEN]; /* blkdev name, e.g. rbd3 */
231
232 spinlock_t lock; /* queue lock */
233
234 struct rbd_image_header header;
Alex Elderd78b6502012-11-09 08:43:15 -0600235 atomic_t exists;
Alex Elder0d7dbfc2012-10-25 23:34:41 -0500236 struct rbd_spec *spec;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700237
Alex Elder0d7dbfc2012-10-25 23:34:41 -0500238 char *header_name;
Alex Elder971f8392012-10-25 23:34:41 -0500239
Alex Elder0903e872012-11-14 12:25:19 -0600240 struct ceph_file_layout layout;
241
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700242 struct ceph_osd_event *watch_event;
243 struct ceph_osd_request *watch_request;
244
Alex Elder86b00e02012-10-25 23:34:42 -0500245 struct rbd_spec *parent_spec;
246 u64 parent_overlap;
247
Josh Durginc6666012011-11-21 17:11:12 -0800248 /* protects updating the header */
249 struct rw_semaphore header_rwsem;
Alex Elderf84344f2012-08-31 17:29:51 -0500250
251 struct rbd_mapping mapping;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700252
253 struct list_head node;
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800254
255 /* list of snapshots */
256 struct list_head snaps;
257
258 /* sysfs related */
259 struct device dev;
Alex Elder42382b72012-11-16 09:29:16 -0600260 unsigned long open_count;
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800261};
262
/* Serialize open/close/setup/teardown */
static DEFINE_MUTEX(ctl_mutex);

/* devices */
static LIST_HEAD(rbd_dev_list);
static DEFINE_SPINLOCK(rbd_dev_list_lock);

/* clients; the list is modified under rbd_client_list_lock */
static LIST_HEAD(rbd_client_list);
static DEFINE_SPINLOCK(rbd_client_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700270
Alex Elder304f6802012-08-31 17:29:52 -0500271static int rbd_dev_snaps_update(struct rbd_device *rbd_dev);
272static int rbd_dev_snaps_register(struct rbd_device *rbd_dev);
273
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800274static void rbd_dev_release(struct device *dev);
Alex Elder41f38c22012-10-25 23:34:40 -0500275static void rbd_remove_snap_dev(struct rbd_snap *snap);
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800276
Alex Elderf0f8cef2012-01-29 13:57:44 -0600277static ssize_t rbd_add(struct bus_type *bus, const char *buf,
278 size_t count);
279static ssize_t rbd_remove(struct bus_type *bus, const char *buf,
280 size_t count);
281
282static struct bus_attribute rbd_bus_attrs[] = {
283 __ATTR(add, S_IWUSR, NULL, rbd_add),
284 __ATTR(remove, S_IWUSR, NULL, rbd_remove),
285 __ATTR_NULL
286};
287
288static struct bus_type rbd_bus_type = {
289 .name = "rbd",
290 .bus_attrs = rbd_bus_attrs,
291};
292
/* Release callback for rbd_root_dev; intentionally does nothing */
static void rbd_root_dev_release(struct device *dev)
{
}
296
297static struct device rbd_root_dev = {
298 .init_name = "rbd",
299 .release = rbd_root_dev_release,
300};
301
Alex Elder06ecc6c2012-11-01 10:17:15 -0500302static __printf(2, 3)
303void rbd_warn(struct rbd_device *rbd_dev, const char *fmt, ...)
304{
305 struct va_format vaf;
306 va_list args;
307
308 va_start(args, fmt);
309 vaf.fmt = fmt;
310 vaf.va = &args;
311
312 if (!rbd_dev)
313 printk(KERN_WARNING "%s: %pV\n", RBD_DRV_NAME, &vaf);
314 else if (rbd_dev->disk)
315 printk(KERN_WARNING "%s: %s: %pV\n",
316 RBD_DRV_NAME, rbd_dev->disk->disk_name, &vaf);
317 else if (rbd_dev->spec && rbd_dev->spec->image_name)
318 printk(KERN_WARNING "%s: image %s: %pV\n",
319 RBD_DRV_NAME, rbd_dev->spec->image_name, &vaf);
320 else if (rbd_dev->spec && rbd_dev->spec->image_id)
321 printk(KERN_WARNING "%s: id %s: %pV\n",
322 RBD_DRV_NAME, rbd_dev->spec->image_id, &vaf);
323 else /* punt */
324 printk(KERN_WARNING "%s: rbd_dev %p: %pV\n",
325 RBD_DRV_NAME, rbd_dev, &vaf);
326 va_end(args);
327}
328
/*
 * rbd_assert(expr) - with RBD_DEBUG defined, log the failed
 * expression together with the function name and line number and
 * then call BUG() if expr is false.  Without RBD_DEBUG it expands
 * to a no-op expression.
 *
 * The debug form is wrapped in do { } while (0) so that it behaves
 * as a single statement; a bare "if (...) { ... }" would silently
 * capture the "else" of an enclosing unbraced if/else.  Callers
 * must terminate the invocation with a semicolon.
 */
#ifdef RBD_DEBUG
#define rbd_assert(expr)						\
	do {								\
		if (unlikely(!(expr))) {				\
			printk(KERN_ERR "\nAssertion failure in %s() "	\
						"at line %d:\n\n"	\
					"\trbd_assert(%s);\n\n",	\
					__func__, __LINE__, #expr);	\
			BUG();						\
		}							\
	} while (0)
#else /* !RBD_DEBUG */
#  define rbd_assert(expr)	((void) 0)
#endif /* !RBD_DEBUG */
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800341
Alex Elder117973f2012-08-31 17:29:55 -0500342static int rbd_dev_refresh(struct rbd_device *rbd_dev, u64 *hver);
343static int rbd_dev_v2_refresh(struct rbd_device *rbd_dev, u64 *hver);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700344
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700345static int rbd_open(struct block_device *bdev, fmode_t mode)
346{
Alex Elderf0f8cef2012-01-29 13:57:44 -0600347 struct rbd_device *rbd_dev = bdev->bd_disk->private_data;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700348
Alex Elderf84344f2012-08-31 17:29:51 -0500349 if ((mode & FMODE_WRITE) && rbd_dev->mapping.read_only)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700350 return -EROFS;
351
Alex Elder42382b72012-11-16 09:29:16 -0600352 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
Alex Elderc3e946c2012-11-16 09:29:16 -0600353 (void) get_device(&rbd_dev->dev);
Alex Elderf84344f2012-08-31 17:29:51 -0500354 set_device_ro(bdev, rbd_dev->mapping.read_only);
Alex Elder42382b72012-11-16 09:29:16 -0600355 rbd_dev->open_count++;
356 mutex_unlock(&ctl_mutex);
Alex Elder340c7a22012-08-10 13:12:07 -0700357
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700358 return 0;
359}
360
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800361static int rbd_release(struct gendisk *disk, fmode_t mode)
362{
363 struct rbd_device *rbd_dev = disk->private_data;
364
Alex Elder42382b72012-11-16 09:29:16 -0600365 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
366 rbd_assert(rbd_dev->open_count > 0);
367 rbd_dev->open_count--;
Alex Elderc3e946c2012-11-16 09:29:16 -0600368 put_device(&rbd_dev->dev);
Alex Elder42382b72012-11-16 09:29:16 -0600369 mutex_unlock(&ctl_mutex);
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800370
371 return 0;
372}
373
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700374static const struct block_device_operations rbd_bd_ops = {
375 .owner = THIS_MODULE,
376 .open = rbd_open,
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800377 .release = rbd_release,
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700378};
379
380/*
381 * Initialize an rbd client instance.
Alex Elder43ae4702012-07-03 16:01:18 -0500382 * We own *ceph_opts.
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700383 */
Alex Elderf8c38922012-08-10 13:12:07 -0700384static struct rbd_client *rbd_client_create(struct ceph_options *ceph_opts)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700385{
386 struct rbd_client *rbdc;
387 int ret = -ENOMEM;
388
389 dout("rbd_client_create\n");
390 rbdc = kmalloc(sizeof(struct rbd_client), GFP_KERNEL);
391 if (!rbdc)
392 goto out_opt;
393
394 kref_init(&rbdc->kref);
395 INIT_LIST_HEAD(&rbdc->node);
396
Alex Elderbc534d82012-01-29 13:57:44 -0600397 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
398
Alex Elder43ae4702012-07-03 16:01:18 -0500399 rbdc->client = ceph_create_client(ceph_opts, rbdc, 0, 0);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700400 if (IS_ERR(rbdc->client))
Alex Elderbc534d82012-01-29 13:57:44 -0600401 goto out_mutex;
Alex Elder43ae4702012-07-03 16:01:18 -0500402 ceph_opts = NULL; /* Now rbdc->client is responsible for ceph_opts */
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700403
404 ret = ceph_open_session(rbdc->client);
405 if (ret < 0)
406 goto out_err;
407
Alex Elder432b8582012-01-29 13:57:44 -0600408 spin_lock(&rbd_client_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700409 list_add_tail(&rbdc->node, &rbd_client_list);
Alex Elder432b8582012-01-29 13:57:44 -0600410 spin_unlock(&rbd_client_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700411
Alex Elderbc534d82012-01-29 13:57:44 -0600412 mutex_unlock(&ctl_mutex);
413
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700414 dout("rbd_client_create created %p\n", rbdc);
415 return rbdc;
416
417out_err:
418 ceph_destroy_client(rbdc->client);
Alex Elderbc534d82012-01-29 13:57:44 -0600419out_mutex:
420 mutex_unlock(&ctl_mutex);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700421 kfree(rbdc);
422out_opt:
Alex Elder43ae4702012-07-03 16:01:18 -0500423 if (ceph_opts)
424 ceph_destroy_options(ceph_opts);
Vasiliy Kulikov28f259b2010-09-26 12:59:37 +0400425 return ERR_PTR(ret);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700426}
427
428/*
Alex Elder1f7ba332012-08-10 13:12:07 -0700429 * Find a ceph client with specific addr and configuration. If
430 * found, bump its reference count.
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700431 */
Alex Elder1f7ba332012-08-10 13:12:07 -0700432static struct rbd_client *rbd_client_find(struct ceph_options *ceph_opts)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700433{
434 struct rbd_client *client_node;
Alex Elder1f7ba332012-08-10 13:12:07 -0700435 bool found = false;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700436
Alex Elder43ae4702012-07-03 16:01:18 -0500437 if (ceph_opts->flags & CEPH_OPT_NOSHARE)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700438 return NULL;
439
Alex Elder1f7ba332012-08-10 13:12:07 -0700440 spin_lock(&rbd_client_list_lock);
441 list_for_each_entry(client_node, &rbd_client_list, node) {
442 if (!ceph_compare_options(ceph_opts, client_node->client)) {
443 kref_get(&client_node->kref);
444 found = true;
445 break;
446 }
447 }
448 spin_unlock(&rbd_client_list_lock);
449
450 return found ? client_node : NULL;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700451}
452
/*
 * mount options
 *
 * Tokens are grouped by argument type; the Opt_last_* markers
 * separate the groups and are what parse_rbd_opts_token() compares
 * against to classify a token.
 */
enum {
	/* (no int-valued options yet) */
	Opt_last_int,
	/* int args above */

	/* (no string-valued options yet) */
	Opt_last_string,
	/* string args above */

	Opt_read_only,
	Opt_read_write,
	/* Boolean args above */
	Opt_last_bool,
};
466
Alex Elder43ae4702012-07-03 16:01:18 -0500467static match_table_t rbd_opts_tokens = {
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700468 /* int args above */
469 /* string args above */
Alex Elderbe466c12012-10-22 11:31:26 -0500470 {Opt_read_only, "read_only"},
Alex Eldercc0538b2012-08-10 13:12:07 -0700471 {Opt_read_only, "ro"}, /* Alternate spelling */
472 {Opt_read_write, "read_write"},
473 {Opt_read_write, "rw"}, /* Alternate spelling */
474 /* Boolean args above */
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700475 {-1, NULL}
476};
477
478static int parse_rbd_opts_token(char *c, void *private)
479{
Alex Elder43ae4702012-07-03 16:01:18 -0500480 struct rbd_options *rbd_opts = private;
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700481 substring_t argstr[MAX_OPT_ARGS];
482 int token, intval, ret;
483
Alex Elder43ae4702012-07-03 16:01:18 -0500484 token = match_token(c, rbd_opts_tokens, argstr);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700485 if (token < 0)
486 return -EINVAL;
487
488 if (token < Opt_last_int) {
489 ret = match_int(&argstr[0], &intval);
490 if (ret < 0) {
491 pr_err("bad mount option arg (not int) "
492 "at '%s'\n", c);
493 return ret;
494 }
495 dout("got int token %d val %d\n", token, intval);
496 } else if (token > Opt_last_int && token < Opt_last_string) {
497 dout("got string token %d val %s\n", token,
498 argstr[0].from);
Alex Eldercc0538b2012-08-10 13:12:07 -0700499 } else if (token > Opt_last_string && token < Opt_last_bool) {
500 dout("got Boolean token %d\n", token);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700501 } else {
502 dout("got token %d\n", token);
503 }
504
505 switch (token) {
Alex Eldercc0538b2012-08-10 13:12:07 -0700506 case Opt_read_only:
507 rbd_opts->read_only = true;
508 break;
509 case Opt_read_write:
510 rbd_opts->read_only = false;
511 break;
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700512 default:
Alex Elderaafb2302012-09-06 16:00:54 -0500513 rbd_assert(false);
514 break;
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700515 }
516 return 0;
517}
518
/*
 * Get an rbd client matching the given options, creating one if no
 * existing client can be shared.
 *
 * The options are consumed in both cases: they are destroyed here
 * when an existing client is reused, and handed to
 * rbd_client_create() otherwise.  The return value is whatever
 * rbd_client_find() or rbd_client_create() produced, so it may be
 * a pointer-coded error.
 */
static struct rbd_client *rbd_get_client(struct ceph_options *ceph_opts)
{
	struct rbd_client *rbdc = rbd_client_find(ceph_opts);

	if (!rbdc)
		return rbd_client_create(ceph_opts);

	/* using an existing client */
	ceph_destroy_options(ceph_opts);

	return rbdc;
}
535
536/*
537 * Destroy ceph client
Alex Elderd23a4b32012-01-29 13:57:43 -0600538 *
Alex Elder432b8582012-01-29 13:57:44 -0600539 * Caller must hold rbd_client_list_lock.
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700540 */
541static void rbd_client_release(struct kref *kref)
542{
543 struct rbd_client *rbdc = container_of(kref, struct rbd_client, kref);
544
545 dout("rbd_release_client %p\n", rbdc);
Alex Eldercd9d9f52012-04-04 13:35:44 -0500546 spin_lock(&rbd_client_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700547 list_del(&rbdc->node);
Alex Eldercd9d9f52012-04-04 13:35:44 -0500548 spin_unlock(&rbd_client_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700549
550 ceph_destroy_client(rbdc->client);
551 kfree(rbdc);
552}
553
554/*
555 * Drop reference to ceph client node. If it's not referenced anymore, release
556 * it.
557 */
Alex Elder9d3997f2012-10-25 23:34:42 -0500558static void rbd_put_client(struct rbd_client *rbdc)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700559{
Alex Elderc53d5892012-10-25 23:34:42 -0500560 if (rbdc)
561 kref_put(&rbdc->kref, rbd_client_release);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700562}
563
Yehuda Sadeh1fec7092011-05-13 13:52:56 -0700564/*
565 * Destroy requests collection
566 */
567static void rbd_coll_release(struct kref *kref)
568{
569 struct rbd_req_coll *coll =
570 container_of(kref, struct rbd_req_coll, kref);
571
572 dout("rbd_coll_release %p\n", coll);
573 kfree(coll);
574}
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700575
Alex Eldera30b71b2012-07-10 20:30:11 -0500576static bool rbd_image_format_valid(u32 image_format)
577{
578 return image_format == 1 || image_format == 2;
579}
580
Alex Elder8e94af82012-07-25 09:32:40 -0500581static bool rbd_dev_ondisk_valid(struct rbd_image_header_ondisk *ondisk)
582{
Alex Elder103a1502012-08-02 11:29:45 -0500583 size_t size;
584 u32 snap_count;
585
586 /* The header has to start with the magic rbd header text */
587 if (memcmp(&ondisk->text, RBD_HEADER_TEXT, sizeof (RBD_HEADER_TEXT)))
588 return false;
589
Alex Elderdb2388b2012-10-20 22:17:27 -0500590 /* The bio layer requires at least sector-sized I/O */
591
592 if (ondisk->options.order < SECTOR_SHIFT)
593 return false;
594
595 /* If we use u64 in a few spots we may be able to loosen this */
596
597 if (ondisk->options.order > 8 * sizeof (int) - 1)
598 return false;
599
Alex Elder103a1502012-08-02 11:29:45 -0500600 /*
601 * The size of a snapshot header has to fit in a size_t, and
602 * that limits the number of snapshots.
603 */
604 snap_count = le32_to_cpu(ondisk->snap_count);
605 size = SIZE_MAX - sizeof (struct ceph_snap_context);
606 if (snap_count > size / sizeof (__le64))
607 return false;
608
609 /*
610 * Not only that, but the size of the entire the snapshot
611 * header must also be representable in a size_t.
612 */
613 size -= snap_count * sizeof (__le64);
614 if ((u64) size < le64_to_cpu(ondisk->snap_names_len))
615 return false;
616
617 return true;
Alex Elder8e94af82012-07-25 09:32:40 -0500618}
619
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700620/*
621 * Create a new header structure, translate header format from the on-disk
622 * header.
623 */
624static int rbd_header_from_disk(struct rbd_image_header *header,
Alex Elder4156d992012-08-02 11:29:46 -0500625 struct rbd_image_header_ondisk *ondisk)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700626{
Alex Elderccece232012-07-10 20:30:10 -0500627 u32 snap_count;
Alex Elder58c17b02012-08-23 23:22:06 -0500628 size_t len;
Alex Elderd2bb24e2012-07-26 23:37:14 -0500629 size_t size;
Alex Elder621901d2012-08-23 23:22:06 -0500630 u32 i;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700631
Alex Elder6a523252012-07-19 17:12:59 -0500632 memset(header, 0, sizeof (*header));
633
Alex Elder103a1502012-08-02 11:29:45 -0500634 snap_count = le32_to_cpu(ondisk->snap_count);
635
Alex Elder58c17b02012-08-23 23:22:06 -0500636 len = strnlen(ondisk->object_prefix, sizeof (ondisk->object_prefix));
637 header->object_prefix = kmalloc(len + 1, GFP_KERNEL);
Alex Elder6a523252012-07-19 17:12:59 -0500638 if (!header->object_prefix)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700639 return -ENOMEM;
Alex Elder58c17b02012-08-23 23:22:06 -0500640 memcpy(header->object_prefix, ondisk->object_prefix, len);
641 header->object_prefix[len] = '\0';
Alex Elder00f1f362012-02-07 12:03:36 -0600642
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700643 if (snap_count) {
Alex Elderf785cc12012-08-23 23:22:06 -0500644 u64 snap_names_len = le64_to_cpu(ondisk->snap_names_len);
645
Alex Elder621901d2012-08-23 23:22:06 -0500646 /* Save a copy of the snapshot names */
647
Alex Elderf785cc12012-08-23 23:22:06 -0500648 if (snap_names_len > (u64) SIZE_MAX)
649 return -EIO;
650 header->snap_names = kmalloc(snap_names_len, GFP_KERNEL);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700651 if (!header->snap_names)
Alex Elder6a523252012-07-19 17:12:59 -0500652 goto out_err;
Alex Elderf785cc12012-08-23 23:22:06 -0500653 /*
654 * Note that rbd_dev_v1_header_read() guarantees
655 * the ondisk buffer we're working with has
656 * snap_names_len bytes beyond the end of the
657 * snapshot id array, this memcpy() is safe.
658 */
659 memcpy(header->snap_names, &ondisk->snaps[snap_count],
660 snap_names_len);
Alex Elder6a523252012-07-19 17:12:59 -0500661
Alex Elder621901d2012-08-23 23:22:06 -0500662 /* Record each snapshot's size */
663
Alex Elderd2bb24e2012-07-26 23:37:14 -0500664 size = snap_count * sizeof (*header->snap_sizes);
665 header->snap_sizes = kmalloc(size, GFP_KERNEL);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700666 if (!header->snap_sizes)
Alex Elder6a523252012-07-19 17:12:59 -0500667 goto out_err;
Alex Elder621901d2012-08-23 23:22:06 -0500668 for (i = 0; i < snap_count; i++)
669 header->snap_sizes[i] =
670 le64_to_cpu(ondisk->snaps[i].image_size);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700671 } else {
Alex Elderccece232012-07-10 20:30:10 -0500672 WARN_ON(ondisk->snap_names_len);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700673 header->snap_names = NULL;
674 header->snap_sizes = NULL;
675 }
Alex Elder849b4262012-07-09 21:04:24 -0500676
Alex Elder34b13182012-07-13 20:35:12 -0500677 header->features = 0; /* No features support in v1 images */
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700678 header->obj_order = ondisk->options.order;
679 header->crypt_type = ondisk->options.crypt_type;
680 header->comp_type = ondisk->options.comp_type;
Alex Elder6a523252012-07-19 17:12:59 -0500681
Alex Elder621901d2012-08-23 23:22:06 -0500682 /* Allocate and fill in the snapshot context */
683
Alex Elderf84344f2012-08-31 17:29:51 -0500684 header->image_size = le64_to_cpu(ondisk->image_size);
Alex Elder6a523252012-07-19 17:12:59 -0500685 size = sizeof (struct ceph_snap_context);
686 size += snap_count * sizeof (header->snapc->snaps[0]);
687 header->snapc = kzalloc(size, GFP_KERNEL);
688 if (!header->snapc)
689 goto out_err;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700690
691 atomic_set(&header->snapc->nref, 1);
Alex Elder505cbb92012-07-19 08:49:18 -0500692 header->snapc->seq = le64_to_cpu(ondisk->snap_seq);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700693 header->snapc->num_snaps = snap_count;
Alex Elder621901d2012-08-23 23:22:06 -0500694 for (i = 0; i < snap_count; i++)
695 header->snapc->snaps[i] =
696 le64_to_cpu(ondisk->snaps[i].id);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700697
698 return 0;
699
Alex Elder6a523252012-07-19 17:12:59 -0500700out_err:
Alex Elder849b4262012-07-09 21:04:24 -0500701 kfree(header->snap_sizes);
Alex Elderccece232012-07-10 20:30:10 -0500702 header->snap_sizes = NULL;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700703 kfree(header->snap_names);
Alex Elderccece232012-07-10 20:30:10 -0500704 header->snap_names = NULL;
Alex Elder6a523252012-07-19 17:12:59 -0500705 kfree(header->object_prefix);
706 header->object_prefix = NULL;
Alex Elderccece232012-07-10 20:30:10 -0500707
Alex Elder00f1f362012-02-07 12:03:36 -0600708 return -ENOMEM;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700709}
710
Alex Elder9e15b772012-10-30 19:40:33 -0500711static const char *rbd_snap_name(struct rbd_device *rbd_dev, u64 snap_id)
712{
713 struct rbd_snap *snap;
714
715 if (snap_id == CEPH_NOSNAP)
716 return RBD_SNAP_HEAD_NAME;
717
718 list_for_each_entry(snap, &rbd_dev->snaps, node)
719 if (snap_id == snap->id)
720 return snap->name;
721
722 return NULL;
723}
724
Alex Elder8836b992012-08-30 14:42:15 -0500725static int snap_by_name(struct rbd_device *rbd_dev, const char *snap_name)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700726{
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700727
Alex Eldere86924a2012-07-10 20:30:11 -0500728 struct rbd_snap *snap;
Alex Elder00f1f362012-02-07 12:03:36 -0600729
Alex Eldere86924a2012-07-10 20:30:11 -0500730 list_for_each_entry(snap, &rbd_dev->snaps, node) {
731 if (!strcmp(snap_name, snap->name)) {
Alex Elder0d7dbfc2012-10-25 23:34:41 -0500732 rbd_dev->spec->snap_id = snap->id;
Alex Eldere86924a2012-07-10 20:30:11 -0500733 rbd_dev->mapping.size = snap->size;
Alex Elder34b13182012-07-13 20:35:12 -0500734 rbd_dev->mapping.features = snap->features;
Alex Elder00f1f362012-02-07 12:03:36 -0600735
Alex Eldere86924a2012-07-10 20:30:11 -0500736 return 0;
Alex Elder00f1f362012-02-07 12:03:36 -0600737 }
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700738 }
Alex Eldere86924a2012-07-10 20:30:11 -0500739
Alex Elder00f1f362012-02-07 12:03:36 -0600740 return -ENOENT;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700741}
742
Alex Elder819d52b2012-10-25 23:34:41 -0500743static int rbd_dev_set_mapping(struct rbd_device *rbd_dev)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700744{
Alex Elder78dc4472012-07-19 08:49:18 -0500745 int ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700746
Alex Elder0d7dbfc2012-10-25 23:34:41 -0500747 if (!memcmp(rbd_dev->spec->snap_name, RBD_SNAP_HEAD_NAME,
Josh Durgincc9d7342011-11-21 18:19:13 -0800748 sizeof (RBD_SNAP_HEAD_NAME))) {
Alex Elder0d7dbfc2012-10-25 23:34:41 -0500749 rbd_dev->spec->snap_id = CEPH_NOSNAP;
Alex Elder99c1f082012-08-30 14:42:15 -0500750 rbd_dev->mapping.size = rbd_dev->header.image_size;
Alex Elder34b13182012-07-13 20:35:12 -0500751 rbd_dev->mapping.features = rbd_dev->header.features;
Alex Eldere86924a2012-07-10 20:30:11 -0500752 ret = 0;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700753 } else {
Alex Elder0d7dbfc2012-10-25 23:34:41 -0500754 ret = snap_by_name(rbd_dev, rbd_dev->spec->snap_name);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700755 if (ret < 0)
756 goto done;
Alex Elderf84344f2012-08-31 17:29:51 -0500757 rbd_dev->mapping.read_only = true;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700758 }
Alex Elderd78b6502012-11-09 08:43:15 -0600759 atomic_set(&rbd_dev->exists, 1);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700760done:
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700761 return ret;
762}
763
764static void rbd_header_free(struct rbd_image_header *header)
765{
Alex Elder849b4262012-07-09 21:04:24 -0500766 kfree(header->object_prefix);
Alex Elderd78fd7a2012-07-26 23:37:14 -0500767 header->object_prefix = NULL;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700768 kfree(header->snap_sizes);
Alex Elderd78fd7a2012-07-26 23:37:14 -0500769 header->snap_sizes = NULL;
Alex Elder849b4262012-07-09 21:04:24 -0500770 kfree(header->snap_names);
Alex Elderd78fd7a2012-07-26 23:37:14 -0500771 header->snap_names = NULL;
Josh Durgind1d25642011-12-05 14:03:05 -0800772 ceph_put_snap_context(header->snapc);
Alex Elderd78fd7a2012-07-26 23:37:14 -0500773 header->snapc = NULL;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700774}
775
Alex Elder65ccfe22012-08-09 10:33:26 -0700776static char *rbd_segment_name(struct rbd_device *rbd_dev, u64 offset)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700777{
Alex Elder65ccfe22012-08-09 10:33:26 -0700778 char *name;
779 u64 segment;
780 int ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700781
Alex Elder2fd82b92012-11-09 15:05:54 -0600782 name = kmalloc(MAX_OBJ_NAME_SIZE + 1, GFP_NOIO);
Alex Elder65ccfe22012-08-09 10:33:26 -0700783 if (!name)
784 return NULL;
785 segment = offset >> rbd_dev->header.obj_order;
Alex Elder2fd82b92012-11-09 15:05:54 -0600786 ret = snprintf(name, MAX_OBJ_NAME_SIZE + 1, "%s.%012llx",
Alex Elder65ccfe22012-08-09 10:33:26 -0700787 rbd_dev->header.object_prefix, segment);
Alex Elder2fd82b92012-11-09 15:05:54 -0600788 if (ret < 0 || ret > MAX_OBJ_NAME_SIZE) {
Alex Elder65ccfe22012-08-09 10:33:26 -0700789 pr_err("error formatting segment name for #%llu (%d)\n",
790 segment, ret);
791 kfree(name);
792 name = NULL;
793 }
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700794
Alex Elder65ccfe22012-08-09 10:33:26 -0700795 return name;
796}
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700797
Alex Elder65ccfe22012-08-09 10:33:26 -0700798static u64 rbd_segment_offset(struct rbd_device *rbd_dev, u64 offset)
799{
800 u64 segment_size = (u64) 1 << rbd_dev->header.obj_order;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700801
Alex Elder65ccfe22012-08-09 10:33:26 -0700802 return offset & (segment_size - 1);
803}
804
805static u64 rbd_segment_length(struct rbd_device *rbd_dev,
806 u64 offset, u64 length)
807{
808 u64 segment_size = (u64) 1 << rbd_dev->header.obj_order;
809
810 offset &= segment_size - 1;
811
Alex Elderaafb2302012-09-06 16:00:54 -0500812 rbd_assert(length <= U64_MAX - offset);
Alex Elder65ccfe22012-08-09 10:33:26 -0700813 if (offset + length > segment_size)
814 length = segment_size - offset;
815
816 return length;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700817}
818
Yehuda Sadeh1fec7092011-05-13 13:52:56 -0700819static int rbd_get_num_segments(struct rbd_image_header *header,
820 u64 ofs, u64 len)
821{
Alex Elderdf111be2012-08-09 10:33:26 -0700822 u64 start_seg;
823 u64 end_seg;
824
825 if (!len)
826 return 0;
827 if (len - 1 > U64_MAX - ofs)
828 return -ERANGE;
829
830 start_seg = ofs >> header->obj_order;
831 end_seg = (ofs + len - 1) >> header->obj_order;
832
Yehuda Sadeh1fec7092011-05-13 13:52:56 -0700833 return end_seg - start_seg + 1;
834}
835
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700836/*
Josh Durgin029bcbd2011-07-22 11:35:23 -0700837 * returns the size of an object in the image
838 */
839static u64 rbd_obj_bytes(struct rbd_image_header *header)
840{
841 return 1 << header->obj_order;
842}
843
844/*
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700845 * bio helpers
846 */
847
848static void bio_chain_put(struct bio *chain)
849{
850 struct bio *tmp;
851
852 while (chain) {
853 tmp = chain;
854 chain = chain->bi_next;
855 bio_put(tmp);
856 }
857}
858
859/*
860 * zeros a bio chain, starting at specific offset
861 */
862static void zero_bio_chain(struct bio *chain, int start_ofs)
863{
864 struct bio_vec *bv;
865 unsigned long flags;
866 void *buf;
867 int i;
868 int pos = 0;
869
870 while (chain) {
871 bio_for_each_segment(bv, chain, i) {
872 if (pos + bv->bv_len > start_ofs) {
873 int remainder = max(start_ofs - pos, 0);
874 buf = bvec_kmap_irq(bv, &flags);
875 memset(buf + remainder, 0,
876 bv->bv_len - remainder);
Dan Carpenter85b5aaa2010-10-11 21:15:11 +0200877 bvec_kunmap_irq(buf, &flags);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700878 }
879 pos += bv->bv_len;
880 }
881
882 chain = chain->bi_next;
883 }
884}
885
886/*
Alex Elderf7760da2012-10-20 22:17:27 -0500887 * Clone a portion of a bio, starting at the given byte offset
888 * and continuing for the number of bytes indicated.
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700889 */
Alex Elderf7760da2012-10-20 22:17:27 -0500890static struct bio *bio_clone_range(struct bio *bio_src,
891 unsigned int offset,
892 unsigned int len,
893 gfp_t gfpmask)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700894{
Alex Elderf7760da2012-10-20 22:17:27 -0500895 struct bio_vec *bv;
896 unsigned int resid;
897 unsigned short idx;
898 unsigned int voff;
899 unsigned short end_idx;
900 unsigned short vcnt;
901 struct bio *bio;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700902
Alex Elderf7760da2012-10-20 22:17:27 -0500903 /* Handle the easy case for the caller */
904
905 if (!offset && len == bio_src->bi_size)
906 return bio_clone(bio_src, gfpmask);
907
908 if (WARN_ON_ONCE(!len))
909 return NULL;
910 if (WARN_ON_ONCE(len > bio_src->bi_size))
911 return NULL;
912 if (WARN_ON_ONCE(offset > bio_src->bi_size - len))
913 return NULL;
914
915 /* Find first affected segment... */
916
917 resid = offset;
918 __bio_for_each_segment(bv, bio_src, idx, 0) {
919 if (resid < bv->bv_len)
920 break;
921 resid -= bv->bv_len;
922 }
923 voff = resid;
924
925 /* ...and the last affected segment */
926
927 resid += len;
928 __bio_for_each_segment(bv, bio_src, end_idx, idx) {
929 if (resid <= bv->bv_len)
930 break;
931 resid -= bv->bv_len;
932 }
933 vcnt = end_idx - idx + 1;
934
935 /* Build the clone */
936
937 bio = bio_alloc(gfpmask, (unsigned int) vcnt);
938 if (!bio)
939 return NULL; /* ENOMEM */
940
941 bio->bi_bdev = bio_src->bi_bdev;
942 bio->bi_sector = bio_src->bi_sector + (offset >> SECTOR_SHIFT);
943 bio->bi_rw = bio_src->bi_rw;
944 bio->bi_flags |= 1 << BIO_CLONED;
945
946 /*
947 * Copy over our part of the bio_vec, then update the first
948 * and last (or only) entries.
949 */
950 memcpy(&bio->bi_io_vec[0], &bio_src->bi_io_vec[idx],
951 vcnt * sizeof (struct bio_vec));
952 bio->bi_io_vec[0].bv_offset += voff;
953 if (vcnt > 1) {
954 bio->bi_io_vec[0].bv_len -= voff;
955 bio->bi_io_vec[vcnt - 1].bv_len = resid;
956 } else {
957 bio->bi_io_vec[0].bv_len = len;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700958 }
959
Alex Elderf7760da2012-10-20 22:17:27 -0500960 bio->bi_vcnt = vcnt;
961 bio->bi_size = len;
962 bio->bi_idx = 0;
Alex Elder542582f2012-08-09 10:33:25 -0700963
Alex Elderf7760da2012-10-20 22:17:27 -0500964 return bio;
965}
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700966
Alex Elderf7760da2012-10-20 22:17:27 -0500967/*
968 * Clone a portion of a bio chain, starting at the given byte offset
969 * into the first bio in the source chain and continuing for the
970 * number of bytes indicated. The result is another bio chain of
971 * exactly the given length, or a null pointer on error.
972 *
973 * The bio_src and offset parameters are both in-out. On entry they
974 * refer to the first source bio and the offset into that bio where
975 * the start of data to be cloned is located.
976 *
977 * On return, bio_src is updated to refer to the bio in the source
978 * chain that contains first un-cloned byte, and *offset will
979 * contain the offset of that byte within that bio.
980 */
981static struct bio *bio_chain_clone_range(struct bio **bio_src,
982 unsigned int *offset,
983 unsigned int len,
984 gfp_t gfpmask)
985{
986 struct bio *bi = *bio_src;
987 unsigned int off = *offset;
988 struct bio *chain = NULL;
989 struct bio **end;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700990
Alex Elderf7760da2012-10-20 22:17:27 -0500991 /* Build up a chain of clone bios up to the limit */
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700992
Alex Elderf7760da2012-10-20 22:17:27 -0500993 if (!bi || off >= bi->bi_size || !len)
994 return NULL; /* Nothing to clone */
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700995
Alex Elderf7760da2012-10-20 22:17:27 -0500996 end = &chain;
997 while (len) {
998 unsigned int bi_size;
999 struct bio *bio;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001000
Alex Elderf5400b72012-11-01 10:17:15 -05001001 if (!bi) {
1002 rbd_warn(NULL, "bio_chain exhausted with %u left", len);
Alex Elderf7760da2012-10-20 22:17:27 -05001003 goto out_err; /* EINVAL; ran out of bio's */
Alex Elderf5400b72012-11-01 10:17:15 -05001004 }
Alex Elderf7760da2012-10-20 22:17:27 -05001005 bi_size = min_t(unsigned int, bi->bi_size - off, len);
1006 bio = bio_clone_range(bi, off, bi_size, gfpmask);
1007 if (!bio)
1008 goto out_err; /* ENOMEM */
1009
1010 *end = bio;
1011 end = &bio->bi_next;
1012
1013 off += bi_size;
1014 if (off == bi->bi_size) {
1015 bi = bi->bi_next;
1016 off = 0;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001017 }
Alex Elderf7760da2012-10-20 22:17:27 -05001018 len -= bi_size;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001019 }
Alex Elderf7760da2012-10-20 22:17:27 -05001020 *bio_src = bi;
1021 *offset = off;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001022
Alex Elderf7760da2012-10-20 22:17:27 -05001023 return chain;
1024out_err:
1025 bio_chain_put(chain);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001026
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001027 return NULL;
1028}
1029
Alex Elder8d23bf22012-11-19 22:55:21 -06001030struct ceph_osd_req_op *rbd_osd_req_op_create(u16 opcode, ...)
1031{
1032 struct ceph_osd_req_op *op;
1033 va_list args;
Alex Elder2647ba32012-11-19 22:55:21 -06001034 size_t size;
Alex Elder8d23bf22012-11-19 22:55:21 -06001035
1036 op = kzalloc(sizeof (*op), GFP_NOIO);
1037 if (!op)
1038 return NULL;
1039 op->op = opcode;
1040 va_start(args, opcode);
1041 switch (opcode) {
1042 case CEPH_OSD_OP_READ:
1043 case CEPH_OSD_OP_WRITE:
1044 /* rbd_osd_req_op_create(READ, offset, length) */
1045 /* rbd_osd_req_op_create(WRITE, offset, length) */
1046 op->extent.offset = va_arg(args, u64);
1047 op->extent.length = va_arg(args, u64);
1048 if (opcode == CEPH_OSD_OP_WRITE)
1049 op->payload_len = op->extent.length;
1050 break;
Alex Elder2647ba32012-11-19 22:55:21 -06001051 case CEPH_OSD_OP_CALL:
1052 /* rbd_osd_req_op_create(CALL, class, method, data, datalen) */
1053 op->cls.class_name = va_arg(args, char *);
1054 size = strlen(op->cls.class_name);
1055 rbd_assert(size <= (size_t) U8_MAX);
1056 op->cls.class_len = size;
1057 op->payload_len = size;
1058
1059 op->cls.method_name = va_arg(args, char *);
1060 size = strlen(op->cls.method_name);
1061 rbd_assert(size <= (size_t) U8_MAX);
1062 op->cls.method_len = size;
1063 op->payload_len += size;
1064
1065 op->cls.argc = 0;
1066 op->cls.indata = va_arg(args, void *);
1067 size = va_arg(args, size_t);
1068 rbd_assert(size <= (size_t) U32_MAX);
1069 op->cls.indata_len = (u32) size;
1070 op->payload_len += size;
1071 break;
Alex Elder5efea492012-11-19 22:55:21 -06001072 case CEPH_OSD_OP_NOTIFY_ACK:
1073 case CEPH_OSD_OP_WATCH:
1074 /* rbd_osd_req_op_create(NOTIFY_ACK, cookie, version) */
1075 /* rbd_osd_req_op_create(WATCH, cookie, version, flag) */
1076 op->watch.cookie = va_arg(args, u64);
1077 op->watch.ver = va_arg(args, u64);
1078 op->watch.ver = cpu_to_le64(op->watch.ver);
1079 if (opcode == CEPH_OSD_OP_WATCH && va_arg(args, int))
1080 op->watch.flag = (u8) 1;
1081 break;
Alex Elder8d23bf22012-11-19 22:55:21 -06001082 default:
1083 rbd_warn(NULL, "unsupported opcode %hu\n", opcode);
1084 kfree(op);
1085 op = NULL;
1086 break;
1087 }
1088 va_end(args);
1089
1090 return op;
1091}
1092
/*
 * Release an op obtained from rbd_osd_req_op_create().  The op is
 * simply handed to kfree(), so a null pointer is acceptable.
 */
static void rbd_osd_req_op_destroy(struct ceph_osd_req_op *op)
{
	kfree(op);
}
1097
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001098static void rbd_coll_end_req_index(struct request *rq,
1099 struct rbd_req_coll *coll,
1100 int index,
Alex Elder8986cb32012-11-08 08:01:39 -06001101 s32 ret, u64 len)
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001102{
1103 struct request_queue *q;
1104 int min, max, i;
1105
Alex Elderbd919d42012-07-13 20:35:11 -05001106 dout("rbd_coll_end_req_index %p index %d ret %d len %llu\n",
Alex Elder8986cb32012-11-08 08:01:39 -06001107 coll, index, (int)ret, (unsigned long long)len);
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001108
1109 if (!rq)
1110 return;
1111
1112 if (!coll) {
1113 blk_end_request(rq, ret, len);
1114 return;
1115 }
1116
1117 q = rq->q;
1118
1119 spin_lock_irq(q->queue_lock);
1120 coll->status[index].done = 1;
1121 coll->status[index].rc = ret;
1122 coll->status[index].bytes = len;
1123 max = min = coll->num_done;
1124 while (max < coll->total && coll->status[max].done)
1125 max++;
1126
1127 for (i = min; i<max; i++) {
Alex Elder8986cb32012-11-08 08:01:39 -06001128 __blk_end_request(rq, (int)coll->status[i].rc,
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001129 coll->status[i].bytes);
1130 coll->num_done++;
1131 kref_put(&coll->kref, rbd_coll_release);
1132 }
1133 spin_unlock_irq(q->queue_lock);
1134}
1135
Alex Elder725afc92012-11-08 08:01:39 -06001136static void rbd_coll_end_req(struct rbd_request *rbd_req,
Alex Elder8986cb32012-11-08 08:01:39 -06001137 s32 ret, u64 len)
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001138{
Alex Elder725afc92012-11-08 08:01:39 -06001139 rbd_coll_end_req_index(rbd_req->rq,
1140 rbd_req->coll, rbd_req->coll_index,
1141 ret, len);
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001142}
1143
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001144/*
1145 * Send ceph osd request
1146 */
1147static int rbd_do_request(struct request *rq,
Alex Elder0ce1a792012-07-03 16:01:18 -05001148 struct rbd_device *rbd_dev,
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001149 struct ceph_snap_context *snapc,
1150 u64 snapid,
Alex Elderaded07e2012-07-03 16:01:18 -05001151 const char *object_name, u64 ofs, u64 len,
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001152 struct bio *bio,
1153 struct page **pages,
1154 int num_pages,
1155 int flags,
Alex Elder30573d62012-11-13 21:11:15 -06001156 struct ceph_osd_req_op *op,
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001157 struct rbd_req_coll *coll,
1158 int coll_index,
Alex Elder5f29ddd2012-11-08 08:01:39 -06001159 void (*rbd_cb)(struct ceph_osd_request *,
1160 struct ceph_msg *),
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001161 struct ceph_osd_request **linger_req,
1162 u64 *ver)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001163{
Alex Elder1dbb4392012-01-24 10:08:37 -06001164 struct ceph_osd_client *osdc;
Alex Elder2e53c6c2012-11-30 09:59:47 -06001165 struct ceph_osd_request *osd_req;
1166 struct rbd_request *rbd_req = NULL;
1167 struct timespec mtime = CURRENT_TIME;
1168 int ret;
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001169
Alex Elderf7760da2012-10-20 22:17:27 -05001170 dout("rbd_do_request object_name=%s ofs=%llu len=%llu coll=%p[%d]\n",
1171 object_name, (unsigned long long) ofs,
1172 (unsigned long long) len, coll, coll_index);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001173
Alex Elder0ce1a792012-07-03 16:01:18 -05001174 osdc = &rbd_dev->rbd_client->client->osdc;
Alex Elder30573d62012-11-13 21:11:15 -06001175 osd_req = ceph_osdc_alloc_request(osdc, snapc, 1, false, GFP_NOIO);
Alex Elder2e53c6c2012-11-30 09:59:47 -06001176 if (!osd_req)
1177 return -ENOMEM;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001178
Alex Elderd178a9e2012-11-13 21:11:15 -06001179 osd_req->r_flags = flags;
Alex Elder54a54002012-11-13 21:11:15 -06001180 osd_req->r_pages = pages;
1181 if (bio) {
1182 osd_req->r_bio = bio;
1183 bio_get(osd_req->r_bio);
1184 }
Alex Elder2e53c6c2012-11-30 09:59:47 -06001185
Alex Elder18216652012-11-30 09:59:47 -06001186 if (coll) {
Alex Elder2e53c6c2012-11-30 09:59:47 -06001187 ret = -ENOMEM;
1188 rbd_req = kmalloc(sizeof(*rbd_req), GFP_NOIO);
1189 if (!rbd_req)
1190 goto done_osd_req;
1191
1192 rbd_req->rq = rq;
1193 rbd_req->bio = bio;
1194 rbd_req->pages = pages;
1195 rbd_req->len = len;
1196 rbd_req->coll = coll;
Alex Elder18216652012-11-30 09:59:47 -06001197 rbd_req->coll_index = coll_index;
Alex Elder2e53c6c2012-11-30 09:59:47 -06001198 }
1199
Alex Elder5f29ddd2012-11-08 08:01:39 -06001200 osd_req->r_callback = rbd_cb;
Alex Elder5f29ddd2012-11-08 08:01:39 -06001201 osd_req->r_priv = rbd_req;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001202
Alex Elder5f29ddd2012-11-08 08:01:39 -06001203 strncpy(osd_req->r_oid, object_name, sizeof(osd_req->r_oid));
1204 osd_req->r_oid_len = strlen(osd_req->r_oid);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001205
Alex Elder0903e872012-11-14 12:25:19 -06001206 osd_req->r_file_layout = rbd_dev->layout; /* struct */
Alex Eldere01e7922012-11-14 12:25:18 -06001207 osd_req->r_num_pages = calc_pages_for(ofs, len);
1208 osd_req->r_page_alignment = ofs & ~PAGE_MASK;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001209
Alex Elder30573d62012-11-13 21:11:15 -06001210 ceph_osdc_build_request(osd_req, ofs, len, 1, op,
Alex Elderae7ca4a32012-11-13 21:11:15 -06001211 snapc, snapid, &mtime);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001212
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001213 if (linger_req) {
Alex Elder5f29ddd2012-11-08 08:01:39 -06001214 ceph_osdc_set_request_linger(osdc, osd_req);
1215 *linger_req = osd_req;
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001216 }
1217
Alex Elder5f29ddd2012-11-08 08:01:39 -06001218 ret = ceph_osdc_start_request(osdc, osd_req, false);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001219 if (ret < 0)
1220 goto done_err;
1221
1222 if (!rbd_cb) {
Alex Elder5f29ddd2012-11-08 08:01:39 -06001223 u64 version;
1224
1225 ret = ceph_osdc_wait_request(osdc, osd_req);
1226 version = le64_to_cpu(osd_req->r_reassert_version.version);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001227 if (ver)
Alex Elder5f29ddd2012-11-08 08:01:39 -06001228 *ver = version;
1229 dout("reassert_ver=%llu\n", (unsigned long long) version);
1230 ceph_osdc_put_request(osd_req);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001231 }
1232 return ret;
1233
1234done_err:
Alex Elder2e53c6c2012-11-30 09:59:47 -06001235 if (bio)
1236 bio_chain_put(osd_req->r_bio);
Alex Elder725afc92012-11-08 08:01:39 -06001237 kfree(rbd_req);
Alex Elder2e53c6c2012-11-30 09:59:47 -06001238done_osd_req:
1239 ceph_osdc_put_request(osd_req);
1240
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001241 return ret;
1242}
1243
1244/*
1245 * Ceph osd op callback
1246 */
Alex Elder5f29ddd2012-11-08 08:01:39 -06001247static void rbd_req_cb(struct ceph_osd_request *osd_req, struct ceph_msg *msg)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001248{
Alex Elder5f29ddd2012-11-08 08:01:39 -06001249 struct rbd_request *rbd_req = osd_req->r_priv;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001250 struct ceph_osd_reply_head *replyhead;
1251 struct ceph_osd_op *op;
Alex Elder8986cb32012-11-08 08:01:39 -06001252 s32 rc;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001253 u64 bytes;
1254 int read_op;
1255
1256 /* parse reply */
1257 replyhead = msg->front.iov_base;
1258 WARN_ON(le32_to_cpu(replyhead->num_ops) == 0);
1259 op = (void *)(replyhead + 1);
Alex Elder8986cb32012-11-08 08:01:39 -06001260 rc = (s32)le32_to_cpu(replyhead->result);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001261 bytes = le64_to_cpu(op->extent.length);
Dan Carpenter895cfcc2012-06-06 09:15:33 -05001262 read_op = (le16_to_cpu(op->op) == CEPH_OSD_OP_READ);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001263
Alex Elderbd919d42012-07-13 20:35:11 -05001264 dout("rbd_req_cb bytes=%llu readop=%d rc=%d\n",
1265 (unsigned long long) bytes, read_op, (int) rc);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001266
Alex Elder8986cb32012-11-08 08:01:39 -06001267 if (rc == (s32)-ENOENT && read_op) {
Alex Elder725afc92012-11-08 08:01:39 -06001268 zero_bio_chain(rbd_req->bio, 0);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001269 rc = 0;
Alex Elder725afc92012-11-08 08:01:39 -06001270 } else if (rc == 0 && read_op && bytes < rbd_req->len) {
1271 zero_bio_chain(rbd_req->bio, bytes);
1272 bytes = rbd_req->len;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001273 }
1274
Alex Elder725afc92012-11-08 08:01:39 -06001275 rbd_coll_end_req(rbd_req, rc, bytes);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001276
Alex Elder725afc92012-11-08 08:01:39 -06001277 if (rbd_req->bio)
1278 bio_chain_put(rbd_req->bio);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001279
Alex Elder5f29ddd2012-11-08 08:01:39 -06001280 ceph_osdc_put_request(osd_req);
Alex Elder725afc92012-11-08 08:01:39 -06001281 kfree(rbd_req);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001282}
1283
/*
 * Minimal osd request callback: the reply message is ignored and the
 * only action is to drop a reference on the osd request.
 */
static void rbd_simple_req_cb(struct ceph_osd_request *osd_req,
				struct ceph_msg *msg)
{
	ceph_osdc_put_request(osd_req);
}
1289
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001290/*
1291 * Do a synchronous ceph osd operation
1292 */
Alex Elder0ce1a792012-07-03 16:01:18 -05001293static int rbd_req_sync_op(struct rbd_device *rbd_dev,
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001294 int flags,
Alex Elder30573d62012-11-13 21:11:15 -06001295 struct ceph_osd_req_op *op,
Alex Elderaded07e2012-07-03 16:01:18 -05001296 const char *object_name,
Alex Elderf8d4de62012-07-03 16:01:19 -05001297 u64 ofs, u64 inbound_size,
1298 char *inbound,
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001299 struct ceph_osd_request **linger_req,
1300 u64 *ver)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001301{
1302 int ret;
1303 struct page **pages;
1304 int num_pages;
Alex Elder913d2fd2012-06-26 12:57:03 -07001305
Alex Elder30573d62012-11-13 21:11:15 -06001306 rbd_assert(op != NULL);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001307
Alex Elderf8d4de62012-07-03 16:01:19 -05001308 num_pages = calc_pages_for(ofs, inbound_size);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001309 pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL);
Dan Carpenterb8d06382010-10-11 21:14:23 +02001310 if (IS_ERR(pages))
1311 return PTR_ERR(pages);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001312
Alex Elder25704ac2012-11-09 08:43:16 -06001313 ret = rbd_do_request(NULL, rbd_dev, NULL, CEPH_NOSNAP,
Alex Elderf8d4de62012-07-03 16:01:19 -05001314 object_name, ofs, inbound_size, NULL,
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001315 pages, num_pages,
1316 flags,
Alex Elder30573d62012-11-13 21:11:15 -06001317 op,
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001318 NULL, 0,
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001319 NULL,
1320 linger_req, ver);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001321 if (ret < 0)
Alex Elder913d2fd2012-06-26 12:57:03 -07001322 goto done;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001323
Alex Elderf8d4de62012-07-03 16:01:19 -05001324 if ((flags & CEPH_OSD_FLAG_READ) && inbound)
1325 ret = ceph_copy_from_page_vector(pages, inbound, ofs, ret);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001326
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001327done:
1328 ceph_release_page_vector(pages, num_pages);
1329 return ret;
1330}
1331
1332/*
1333 * Do an asynchronous ceph osd operation
1334 */
1335static int rbd_do_op(struct request *rq,
Alex Elder0ce1a792012-07-03 16:01:18 -05001336 struct rbd_device *rbd_dev,
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001337 struct ceph_snap_context *snapc,
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001338 u64 ofs, u64 len,
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001339 struct bio *bio,
1340 struct rbd_req_coll *coll,
1341 int coll_index)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001342{
1343 char *seg_name;
1344 u64 seg_ofs;
1345 u64 seg_len;
1346 int ret;
Alex Elder139b4312012-11-13 21:11:15 -06001347 struct ceph_osd_req_op *op;
Alex Elderff2e4bb2012-10-10 18:59:29 -07001348 int opcode;
1349 int flags;
Alex Elder46342462012-10-10 18:59:29 -07001350 u64 snapid;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001351
Alex Elder65ccfe22012-08-09 10:33:26 -07001352 seg_name = rbd_segment_name(rbd_dev, ofs);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001353 if (!seg_name)
1354 return -ENOMEM;
Alex Elder65ccfe22012-08-09 10:33:26 -07001355 seg_len = rbd_segment_length(rbd_dev, ofs, len);
1356 seg_ofs = rbd_segment_offset(rbd_dev, ofs);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001357
Alex Elderff2e4bb2012-10-10 18:59:29 -07001358 if (rq_data_dir(rq) == WRITE) {
1359 opcode = CEPH_OSD_OP_WRITE;
1360 flags = CEPH_OSD_FLAG_WRITE|CEPH_OSD_FLAG_ONDISK;
Alex Elder46342462012-10-10 18:59:29 -07001361 snapid = CEPH_NOSNAP;
Alex Elderff2e4bb2012-10-10 18:59:29 -07001362 } else {
1363 opcode = CEPH_OSD_OP_READ;
1364 flags = CEPH_OSD_FLAG_READ;
Alex Eldera7b4c652012-11-09 08:43:15 -06001365 rbd_assert(!snapc);
Alex Elder0d7dbfc2012-10-25 23:34:41 -05001366 snapid = rbd_dev->spec->snap_id;
Alex Elderff2e4bb2012-10-10 18:59:29 -07001367 }
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001368
Alex Elder57cfc102012-06-26 12:57:03 -07001369 ret = -ENOMEM;
Alex Elder8d23bf22012-11-19 22:55:21 -06001370 op = rbd_osd_req_op_create(opcode, seg_ofs, seg_len);
Alex Elder139b4312012-11-13 21:11:15 -06001371 if (!op)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001372 goto done;
1373
1374 /* we've taken care of segment sizes earlier when we
1375 cloned the bios. We should never have a segment
1376 truncated at this point */
Alex Elderaafb2302012-09-06 16:00:54 -05001377 rbd_assert(seg_len == len);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001378
1379 ret = rbd_do_request(rq, rbd_dev, snapc, snapid,
1380 seg_name, seg_ofs, seg_len,
1381 bio,
1382 NULL, 0,
1383 flags,
Alex Elder30573d62012-11-13 21:11:15 -06001384 op,
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001385 coll, coll_index,
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001386 rbd_req_cb, 0, NULL);
Alex Eldercd323ac2012-11-08 08:01:39 -06001387 if (ret < 0)
1388 rbd_coll_end_req_index(rq, coll, coll_index,
1389 (s32)ret, seg_len);
Alex Elder8d23bf22012-11-19 22:55:21 -06001390 rbd_osd_req_op_destroy(op);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001391done:
1392 kfree(seg_name);
1393 return ret;
1394}
1395
1396/*
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001397 * Request sync osd read
1398 */
Alex Elder0ce1a792012-07-03 16:01:18 -05001399static int rbd_req_sync_read(struct rbd_device *rbd_dev,
Alex Elderaded07e2012-07-03 16:01:18 -05001400 const char *object_name,
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001401 u64 ofs, u64 len,
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001402 char *buf,
1403 u64 *ver)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001404{
Alex Elder139b4312012-11-13 21:11:15 -06001405 struct ceph_osd_req_op *op;
Alex Elder913d2fd2012-06-26 12:57:03 -07001406 int ret;
1407
Alex Elder8d23bf22012-11-19 22:55:21 -06001408 op = rbd_osd_req_op_create(CEPH_OSD_OP_READ, ofs, len);
Alex Elder139b4312012-11-13 21:11:15 -06001409 if (!op)
Alex Elder913d2fd2012-06-26 12:57:03 -07001410 return -ENOMEM;
1411
Alex Elder25704ac2012-11-09 08:43:16 -06001412 ret = rbd_req_sync_op(rbd_dev, CEPH_OSD_FLAG_READ,
Alex Elder30573d62012-11-13 21:11:15 -06001413 op, object_name, ofs, len, buf, NULL, ver);
Alex Elder8d23bf22012-11-19 22:55:21 -06001414 rbd_osd_req_op_destroy(op);
Alex Elder913d2fd2012-06-26 12:57:03 -07001415
1416 return ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001417}
1418
1419/*
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001420 * Request sync osd watch
1421 */
Alex Elder0ce1a792012-07-03 16:01:18 -05001422static int rbd_req_sync_notify_ack(struct rbd_device *rbd_dev,
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001423 u64 ver,
Alex Elder7f0a24d2012-07-25 09:32:40 -05001424 u64 notify_id)
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001425{
Alex Elder139b4312012-11-13 21:11:15 -06001426 struct ceph_osd_req_op *op;
Sage Weil11f77002011-05-12 16:13:54 -07001427 int ret;
1428
Alex Elder5efea492012-11-19 22:55:21 -06001429 op = rbd_osd_req_op_create(CEPH_OSD_OP_NOTIFY_ACK, notify_id, ver);
Alex Elder139b4312012-11-13 21:11:15 -06001430 if (!op)
Alex Elder57cfc102012-06-26 12:57:03 -07001431 return -ENOMEM;
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001432
Alex Elder0ce1a792012-07-03 16:01:18 -05001433 ret = rbd_do_request(NULL, rbd_dev, NULL, CEPH_NOSNAP,
Alex Elder7f0a24d2012-07-25 09:32:40 -05001434 rbd_dev->header_name, 0, 0, NULL,
Alex Elderad4f2322012-07-03 16:01:19 -05001435 NULL, 0,
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001436 CEPH_OSD_FLAG_READ,
Alex Elder30573d62012-11-13 21:11:15 -06001437 op,
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001438 NULL, 0,
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001439 rbd_simple_req_cb, 0, NULL);
1440
Alex Elder5efea492012-11-19 22:55:21 -06001441 rbd_osd_req_op_destroy(op);
1442
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001443 return ret;
1444}
1445
1446static void rbd_watch_cb(u64 ver, u64 notify_id, u8 opcode, void *data)
1447{
Alex Elder0ce1a792012-07-03 16:01:18 -05001448 struct rbd_device *rbd_dev = (struct rbd_device *)data;
Josh Durgina71b8912011-12-05 18:10:44 -08001449 u64 hver;
Sage Weil13143d22011-05-12 16:08:30 -07001450 int rc;
1451
Alex Elder0ce1a792012-07-03 16:01:18 -05001452 if (!rbd_dev)
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001453 return;
1454
Alex Elderbd919d42012-07-13 20:35:11 -05001455 dout("rbd_watch_cb %s notify_id=%llu opcode=%u\n",
1456 rbd_dev->header_name, (unsigned long long) notify_id,
1457 (unsigned int) opcode);
Alex Elder117973f2012-08-31 17:29:55 -05001458 rc = rbd_dev_refresh(rbd_dev, &hver);
Sage Weil13143d22011-05-12 16:08:30 -07001459 if (rc)
Alex Elder06ecc6c2012-11-01 10:17:15 -05001460 rbd_warn(rbd_dev, "got notification but failed to "
1461 " update snaps: %d\n", rc);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001462
Alex Elder7f0a24d2012-07-25 09:32:40 -05001463 rbd_req_sync_notify_ack(rbd_dev, hver, notify_id);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001464}
1465
1466/*
Alex Elder907703d2012-11-13 21:11:15 -06001467 * Request sync osd watch/unwatch. The value of "start" determines
1468 * whether a watch request is being initiated or torn down.
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001469 */
Alex Elder907703d2012-11-13 21:11:15 -06001470static int rbd_req_sync_watch(struct rbd_device *rbd_dev, int start)
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001471{
Alex Elder907703d2012-11-13 21:11:15 -06001472 struct ceph_osd_request **linger_req = NULL;
Alex Elder5efea492012-11-19 22:55:21 -06001473 struct ceph_osd_req_op *op;
1474 int ret = 0;
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001475
Alex Elder907703d2012-11-13 21:11:15 -06001476 if (start) {
1477 struct ceph_osd_client *osdc;
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001478
Alex Elder907703d2012-11-13 21:11:15 -06001479 osdc = &rbd_dev->rbd_client->client->osdc;
1480 ret = ceph_osdc_create_event(osdc, rbd_watch_cb, 0, rbd_dev,
1481 &rbd_dev->watch_event);
1482 if (ret < 0)
Alex Elder5efea492012-11-19 22:55:21 -06001483 return ret;
Alex Elder907703d2012-11-13 21:11:15 -06001484 linger_req = &rbd_dev->watch_request;
Alex Elder5efea492012-11-19 22:55:21 -06001485 } else {
1486 rbd_assert(rbd_dev->watch_request != NULL);
Alex Elder907703d2012-11-13 21:11:15 -06001487 }
1488
Alex Elder5efea492012-11-19 22:55:21 -06001489 op = rbd_osd_req_op_create(CEPH_OSD_OP_WATCH,
1490 rbd_dev->watch_event->cookie,
1491 rbd_dev->header.obj_version, start);
1492 if (op)
1493 ret = rbd_req_sync_op(rbd_dev,
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001494 CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
Alex Elder907703d2012-11-13 21:11:15 -06001495 op, rbd_dev->header_name,
1496 0, 0, NULL, linger_req, NULL);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001497
Alex Elder5efea492012-11-19 22:55:21 -06001498 /* Cancel the event if we're tearing down, or on error */
1499
1500 if (!start || !op || ret < 0) {
Alex Elder907703d2012-11-13 21:11:15 -06001501 ceph_osdc_cancel_event(rbd_dev->watch_event);
1502 rbd_dev->watch_event = NULL;
1503 }
Alex Elder5efea492012-11-19 22:55:21 -06001504 rbd_osd_req_op_destroy(op);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001505
Yehuda Sadeh79e3057c2011-07-12 16:56:57 -07001506 return ret;
1507}
1508
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001509/*
Alex Elder3cb4a682012-06-26 12:57:03 -07001510 * Synchronous osd object method call
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001511 */
Alex Elder0ce1a792012-07-03 16:01:18 -05001512static int rbd_req_sync_exec(struct rbd_device *rbd_dev,
Alex Elderaded07e2012-07-03 16:01:18 -05001513 const char *object_name,
1514 const char *class_name,
1515 const char *method_name,
Alex Elder3cb4a682012-06-26 12:57:03 -07001516 const char *outbound,
1517 size_t outbound_size,
Alex Elderf8d4de62012-07-03 16:01:19 -05001518 char *inbound,
1519 size_t inbound_size,
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001520 u64 *ver)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001521{
Alex Elder139b4312012-11-13 21:11:15 -06001522 struct ceph_osd_req_op *op;
Alex Elder57cfc102012-06-26 12:57:03 -07001523 int ret;
1524
Alex Elder3cb4a682012-06-26 12:57:03 -07001525 /*
1526 * Any input parameters required by the method we're calling
1527 * will be sent along with the class and method names as
1528 * part of the message payload. That data and its size are
1529 * supplied via the indata and indata_len fields (named from
1530 * the perspective of the server side) in the OSD request
1531 * operation.
1532 */
Alex Elder2647ba32012-11-19 22:55:21 -06001533 op = rbd_osd_req_op_create(CEPH_OSD_OP_CALL, class_name,
1534 method_name, outbound, outbound_size);
Alex Elder139b4312012-11-13 21:11:15 -06001535 if (!op)
Alex Elder57cfc102012-06-26 12:57:03 -07001536 return -ENOMEM;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001537
Alex Elder30573d62012-11-13 21:11:15 -06001538 ret = rbd_req_sync_op(rbd_dev, CEPH_OSD_FLAG_READ, op,
Alex Elderf8d4de62012-07-03 16:01:19 -05001539 object_name, 0, inbound_size, inbound,
1540 NULL, ver);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001541
Alex Elder2647ba32012-11-19 22:55:21 -06001542 rbd_osd_req_op_destroy(op);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001543
1544 dout("cls_exec returned %d\n", ret);
1545 return ret;
1546}
1547
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001548static struct rbd_req_coll *rbd_alloc_coll(int num_reqs)
1549{
1550 struct rbd_req_coll *coll =
1551 kzalloc(sizeof(struct rbd_req_coll) +
1552 sizeof(struct rbd_req_status) * num_reqs,
1553 GFP_ATOMIC);
1554
1555 if (!coll)
1556 return NULL;
1557 coll->total = num_reqs;
1558 kref_init(&coll->kref);
1559 return coll;
1560}
1561
Alex Elder8295cda2012-11-08 08:01:39 -06001562static int rbd_dev_do_request(struct request *rq,
1563 struct rbd_device *rbd_dev,
1564 struct ceph_snap_context *snapc,
1565 u64 ofs, unsigned int size,
1566 struct bio *bio_chain)
1567{
1568 int num_segs;
1569 struct rbd_req_coll *coll;
1570 unsigned int bio_offset;
1571 int cur_seg = 0;
1572
1573 dout("%s 0x%x bytes at 0x%llx\n",
1574 rq_data_dir(rq) == WRITE ? "write" : "read",
1575 size, (unsigned long long) blk_rq_pos(rq) * SECTOR_SIZE);
1576
1577 num_segs = rbd_get_num_segments(&rbd_dev->header, ofs, size);
1578 if (num_segs <= 0)
1579 return num_segs;
1580
1581 coll = rbd_alloc_coll(num_segs);
1582 if (!coll)
1583 return -ENOMEM;
1584
1585 bio_offset = 0;
1586 do {
1587 u64 limit = rbd_segment_length(rbd_dev, ofs, size);
1588 unsigned int clone_size;
1589 struct bio *bio_clone;
1590
1591 BUG_ON(limit > (u64)UINT_MAX);
1592 clone_size = (unsigned int)limit;
1593 dout("bio_chain->bi_vcnt=%hu\n", bio_chain->bi_vcnt);
1594
1595 kref_get(&coll->kref);
1596
1597 /* Pass a cloned bio chain via an osd request */
1598
1599 bio_clone = bio_chain_clone_range(&bio_chain,
1600 &bio_offset, clone_size,
1601 GFP_ATOMIC);
1602 if (bio_clone)
1603 (void)rbd_do_op(rq, rbd_dev, snapc,
1604 ofs, clone_size,
1605 bio_clone, coll, cur_seg);
1606 else
1607 rbd_coll_end_req_index(rq, coll, cur_seg,
1608 (s32)-ENOMEM,
1609 clone_size);
1610 size -= clone_size;
1611 ofs += clone_size;
1612
1613 cur_seg++;
1614 } while (size > 0);
1615 kref_put(&coll->kref, rbd_coll_release);
1616
1617 return 0;
1618}
1619
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001620/*
1621 * block device queue callback
1622 */
1623static void rbd_rq_fn(struct request_queue *q)
1624{
1625 struct rbd_device *rbd_dev = q->queuedata;
Alex Elderb395e8b2012-11-08 08:01:39 -06001626 bool read_only = rbd_dev->mapping.read_only;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001627 struct request *rq;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001628
Alex Elder00f1f362012-02-07 12:03:36 -06001629 while ((rq = blk_fetch_request(q))) {
Alex Elderb395e8b2012-11-08 08:01:39 -06001630 struct ceph_snap_context *snapc = NULL;
1631 unsigned int size = 0;
Alex Elder8295cda2012-11-08 08:01:39 -06001632 int result;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001633
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001634 dout("fetched request\n");
1635
Alex Elderb395e8b2012-11-08 08:01:39 -06001636 /* Filter out block requests we don't understand */
1637
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001638 if ((rq->cmd_type != REQ_TYPE_FS)) {
1639 __blk_end_request_all(rq, 0);
Alex Elder00f1f362012-02-07 12:03:36 -06001640 continue;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001641 }
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001642 spin_unlock_irq(q->queue_lock);
1643
Alex Eldera7b4c652012-11-09 08:43:15 -06001644 /* Write requests need a reference to the snapshot context */
Alex Elderb395e8b2012-11-08 08:01:39 -06001645
Alex Eldera7b4c652012-11-09 08:43:15 -06001646 if (rq_data_dir(rq) == WRITE) {
1647 result = -EROFS;
1648 if (read_only) /* Can't write to a read-only device */
1649 goto out_end_request;
Alex Elderb395e8b2012-11-08 08:01:39 -06001650
Alex Eldera7b4c652012-11-09 08:43:15 -06001651 /*
1652 * Note that each osd request will take its
1653 * own reference to the snapshot context
1654 * supplied. The reference we take here
1655 * just guarantees the one we provide stays
1656 * valid.
1657 */
1658 down_read(&rbd_dev->header_rwsem);
Alex Elderb395e8b2012-11-08 08:01:39 -06001659 snapc = ceph_get_snap_context(rbd_dev->header.snapc);
Alex Eldera7b4c652012-11-09 08:43:15 -06001660 up_read(&rbd_dev->header_rwsem);
Alex Elderb395e8b2012-11-08 08:01:39 -06001661 rbd_assert(snapc != NULL);
Alex Eldera7b4c652012-11-09 08:43:15 -06001662 } else if (!atomic_read(&rbd_dev->exists)) {
Alex Elderb395e8b2012-11-08 08:01:39 -06001663 rbd_assert(rbd_dev->spec->snap_id != CEPH_NOSNAP);
1664 dout("request for non-existent snapshot");
1665 result = -ENXIO;
1666 goto out_end_request;
1667 }
Alex Elderf7760da2012-10-20 22:17:27 -05001668
Alex Elderb395e8b2012-11-08 08:01:39 -06001669 size = blk_rq_bytes(rq);
1670 result = rbd_dev_do_request(rq, rbd_dev, snapc,
1671 blk_rq_pos(rq) * SECTOR_SIZE,
1672 size, rq->bio);
1673out_end_request:
Alex Eldera7b4c652012-11-09 08:43:15 -06001674 if (snapc)
1675 ceph_put_snap_context(snapc);
Alex Elder8295cda2012-11-08 08:01:39 -06001676 spin_lock_irq(q->queue_lock);
1677 if (!size || result < 0)
1678 __blk_end_request_all(rq, result);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001679 }
1680}
1681
1682/*
1683 * a queue callback. Makes sure that we don't create a bio that spans across
1684 * multiple osd objects. One exception would be with a single page bios,
Alex Elderf7760da2012-10-20 22:17:27 -05001685 * which we handle later at bio_chain_clone_range()
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001686 */
1687static int rbd_merge_bvec(struct request_queue *q, struct bvec_merge_data *bmd,
1688 struct bio_vec *bvec)
1689{
1690 struct rbd_device *rbd_dev = q->queuedata;
Alex Eldere5cfeed22012-10-20 22:17:27 -05001691 sector_t sector_offset;
1692 sector_t sectors_per_obj;
1693 sector_t obj_sector_offset;
1694 int ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001695
Alex Eldere5cfeed22012-10-20 22:17:27 -05001696 /*
1697 * Find how far into its rbd object the partition-relative
1698 * bio start sector is to offset relative to the enclosing
1699 * device.
1700 */
1701 sector_offset = get_start_sect(bmd->bi_bdev) + bmd->bi_sector;
1702 sectors_per_obj = 1 << (rbd_dev->header.obj_order - SECTOR_SHIFT);
1703 obj_sector_offset = sector_offset & (sectors_per_obj - 1);
Alex Elder593a9e72012-02-07 12:03:37 -06001704
Alex Eldere5cfeed22012-10-20 22:17:27 -05001705 /*
1706 * Compute the number of bytes from that offset to the end
1707 * of the object. Account for what's already used by the bio.
1708 */
1709 ret = (int) (sectors_per_obj - obj_sector_offset) << SECTOR_SHIFT;
1710 if (ret > bmd->bi_size)
1711 ret -= bmd->bi_size;
1712 else
1713 ret = 0;
1714
1715 /*
1716 * Don't send back more than was asked for. And if the bio
1717 * was empty, let the whole thing through because: "Note
1718 * that a block device *must* allow a single page to be
1719 * added to an empty bio."
1720 */
1721 rbd_assert(bvec->bv_len <= PAGE_SIZE);
1722 if (ret > (int) bvec->bv_len || !bmd->bi_size)
1723 ret = (int) bvec->bv_len;
1724
1725 return ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001726}
1727
1728static void rbd_free_disk(struct rbd_device *rbd_dev)
1729{
1730 struct gendisk *disk = rbd_dev->disk;
1731
1732 if (!disk)
1733 return;
1734
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001735 if (disk->flags & GENHD_FL_UP)
1736 del_gendisk(disk);
1737 if (disk->queue)
1738 blk_cleanup_queue(disk->queue);
1739 put_disk(disk);
1740}
1741
1742/*
Alex Elder4156d992012-08-02 11:29:46 -05001743 * Read the complete header for the given rbd device.
1744 *
1745 * Returns a pointer to a dynamically-allocated buffer containing
1746 * the complete and validated header. Caller can pass the address
1747 * of a variable that will be filled in with the version of the
1748 * header object at the time it was read.
1749 *
1750 * Returns a pointer-coded errno if a failure occurs.
1751 */
1752static struct rbd_image_header_ondisk *
1753rbd_dev_v1_header_read(struct rbd_device *rbd_dev, u64 *version)
1754{
1755 struct rbd_image_header_ondisk *ondisk = NULL;
1756 u32 snap_count = 0;
1757 u64 names_size = 0;
1758 u32 want_count;
1759 int ret;
1760
1761 /*
1762 * The complete header will include an array of its 64-bit
1763 * snapshot ids, followed by the names of those snapshots as
1764 * a contiguous block of NUL-terminated strings. Note that
1765 * the number of snapshots could change by the time we read
1766 * it in, in which case we re-read it.
1767 */
1768 do {
1769 size_t size;
1770
1771 kfree(ondisk);
1772
1773 size = sizeof (*ondisk);
1774 size += snap_count * sizeof (struct rbd_image_snap_ondisk);
1775 size += names_size;
1776 ondisk = kmalloc(size, GFP_KERNEL);
1777 if (!ondisk)
1778 return ERR_PTR(-ENOMEM);
1779
Alex Elder47756182012-11-09 08:43:15 -06001780 ret = rbd_req_sync_read(rbd_dev, rbd_dev->header_name,
Alex Elder4156d992012-08-02 11:29:46 -05001781 0, size,
1782 (char *) ondisk, version);
1783
1784 if (ret < 0)
1785 goto out_err;
1786 if (WARN_ON((size_t) ret < size)) {
1787 ret = -ENXIO;
Alex Elder06ecc6c2012-11-01 10:17:15 -05001788 rbd_warn(rbd_dev, "short header read (want %zd got %d)",
1789 size, ret);
Alex Elder4156d992012-08-02 11:29:46 -05001790 goto out_err;
1791 }
1792 if (!rbd_dev_ondisk_valid(ondisk)) {
1793 ret = -ENXIO;
Alex Elder06ecc6c2012-11-01 10:17:15 -05001794 rbd_warn(rbd_dev, "invalid header");
Alex Elder4156d992012-08-02 11:29:46 -05001795 goto out_err;
1796 }
1797
1798 names_size = le64_to_cpu(ondisk->snap_names_len);
1799 want_count = snap_count;
1800 snap_count = le32_to_cpu(ondisk->snap_count);
1801 } while (snap_count != want_count);
1802
1803 return ondisk;
1804
1805out_err:
1806 kfree(ondisk);
1807
1808 return ERR_PTR(ret);
1809}
1810
1811/*
1812 * reload the ondisk the header
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001813 */
1814static int rbd_read_header(struct rbd_device *rbd_dev,
1815 struct rbd_image_header *header)
1816{
Alex Elder4156d992012-08-02 11:29:46 -05001817 struct rbd_image_header_ondisk *ondisk;
1818 u64 ver = 0;
1819 int ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001820
Alex Elder4156d992012-08-02 11:29:46 -05001821 ondisk = rbd_dev_v1_header_read(rbd_dev, &ver);
1822 if (IS_ERR(ondisk))
1823 return PTR_ERR(ondisk);
1824 ret = rbd_header_from_disk(header, ondisk);
1825 if (ret >= 0)
1826 header->obj_version = ver;
1827 kfree(ondisk);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001828
Alex Elder4156d992012-08-02 11:29:46 -05001829 return ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001830}
1831
Alex Elder41f38c22012-10-25 23:34:40 -05001832static void rbd_remove_all_snaps(struct rbd_device *rbd_dev)
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001833{
1834 struct rbd_snap *snap;
Alex Eldera0593292012-07-19 09:09:27 -05001835 struct rbd_snap *next;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001836
Alex Eldera0593292012-07-19 09:09:27 -05001837 list_for_each_entry_safe(snap, next, &rbd_dev->snaps, node)
Alex Elder41f38c22012-10-25 23:34:40 -05001838 rbd_remove_snap_dev(snap);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001839}
1840
Alex Elder94785542012-10-09 13:50:17 -07001841static void rbd_update_mapping_size(struct rbd_device *rbd_dev)
1842{
1843 sector_t size;
1844
Alex Elder0d7dbfc2012-10-25 23:34:41 -05001845 if (rbd_dev->spec->snap_id != CEPH_NOSNAP)
Alex Elder94785542012-10-09 13:50:17 -07001846 return;
1847
1848 size = (sector_t) rbd_dev->header.image_size / SECTOR_SIZE;
1849 dout("setting size to %llu sectors", (unsigned long long) size);
1850 rbd_dev->mapping.size = (u64) size;
1851 set_capacity(rbd_dev->disk, size);
1852}
1853
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001854/*
1855 * only read the first part of the ondisk header, without the snaps info
1856 */
Alex Elder117973f2012-08-31 17:29:55 -05001857static int rbd_dev_v1_refresh(struct rbd_device *rbd_dev, u64 *hver)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001858{
1859 int ret;
1860 struct rbd_image_header h;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001861
1862 ret = rbd_read_header(rbd_dev, &h);
1863 if (ret < 0)
1864 return ret;
1865
Josh Durgina51aa0c2011-12-05 10:35:04 -08001866 down_write(&rbd_dev->header_rwsem);
1867
Alex Elder94785542012-10-09 13:50:17 -07001868 /* Update image size, and check for resize of mapped image */
1869 rbd_dev->header.image_size = h.image_size;
1870 rbd_update_mapping_size(rbd_dev);
Sage Weil9db4b3e2011-04-19 22:49:06 -07001871
Alex Elder849b4262012-07-09 21:04:24 -05001872 /* rbd_dev->header.object_prefix shouldn't change */
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001873 kfree(rbd_dev->header.snap_sizes);
Alex Elder849b4262012-07-09 21:04:24 -05001874 kfree(rbd_dev->header.snap_names);
Josh Durgind1d25642011-12-05 14:03:05 -08001875 /* osd requests may still refer to snapc */
1876 ceph_put_snap_context(rbd_dev->header.snapc);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001877
Alex Elderb8136232012-07-25 09:32:41 -05001878 if (hver)
1879 *hver = h.obj_version;
Josh Durgina71b8912011-12-05 18:10:44 -08001880 rbd_dev->header.obj_version = h.obj_version;
Josh Durgin93a24e02011-12-05 10:41:28 -08001881 rbd_dev->header.image_size = h.image_size;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001882 rbd_dev->header.snapc = h.snapc;
1883 rbd_dev->header.snap_names = h.snap_names;
1884 rbd_dev->header.snap_sizes = h.snap_sizes;
Alex Elder849b4262012-07-09 21:04:24 -05001885 /* Free the extra copy of the object prefix */
1886 WARN_ON(strcmp(rbd_dev->header.object_prefix, h.object_prefix));
1887 kfree(h.object_prefix);
1888
Alex Elder304f6802012-08-31 17:29:52 -05001889 ret = rbd_dev_snaps_update(rbd_dev);
1890 if (!ret)
1891 ret = rbd_dev_snaps_register(rbd_dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001892
Josh Durginc6666012011-11-21 17:11:12 -08001893 up_write(&rbd_dev->header_rwsem);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001894
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001895 return ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001896}
1897
Alex Elder117973f2012-08-31 17:29:55 -05001898static int rbd_dev_refresh(struct rbd_device *rbd_dev, u64 *hver)
Alex Elder1fe5e992012-07-25 09:32:41 -05001899{
1900 int ret;
1901
Alex Elder117973f2012-08-31 17:29:55 -05001902 rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
Alex Elder1fe5e992012-07-25 09:32:41 -05001903 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
Alex Elder117973f2012-08-31 17:29:55 -05001904 if (rbd_dev->image_format == 1)
1905 ret = rbd_dev_v1_refresh(rbd_dev, hver);
1906 else
1907 ret = rbd_dev_v2_refresh(rbd_dev, hver);
Alex Elder1fe5e992012-07-25 09:32:41 -05001908 mutex_unlock(&ctl_mutex);
1909
1910 return ret;
1911}
1912
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001913static int rbd_init_disk(struct rbd_device *rbd_dev)
1914{
1915 struct gendisk *disk;
1916 struct request_queue *q;
Alex Elder593a9e72012-02-07 12:03:37 -06001917 u64 segment_size;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001918
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001919 /* create gendisk info */
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001920 disk = alloc_disk(RBD_MINORS_PER_MAJOR);
1921 if (!disk)
Alex Elder1fcdb8a2012-08-29 17:11:06 -05001922 return -ENOMEM;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001923
Alex Elderf0f8cef2012-01-29 13:57:44 -06001924 snprintf(disk->disk_name, sizeof(disk->disk_name), RBD_DRV_NAME "%d",
Alex Elderde71a292012-07-03 16:01:19 -05001925 rbd_dev->dev_id);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001926 disk->major = rbd_dev->major;
1927 disk->first_minor = 0;
1928 disk->fops = &rbd_bd_ops;
1929 disk->private_data = rbd_dev;
1930
1931 /* init rq */
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001932 q = blk_init_queue(rbd_rq_fn, &rbd_dev->lock);
1933 if (!q)
1934 goto out_disk;
Josh Durgin029bcbd2011-07-22 11:35:23 -07001935
Alex Elder593a9e72012-02-07 12:03:37 -06001936 /* We use the default size, but let's be explicit about it. */
1937 blk_queue_physical_block_size(q, SECTOR_SIZE);
1938
Josh Durgin029bcbd2011-07-22 11:35:23 -07001939 /* set io sizes to object size */
Alex Elder593a9e72012-02-07 12:03:37 -06001940 segment_size = rbd_obj_bytes(&rbd_dev->header);
1941 blk_queue_max_hw_sectors(q, segment_size / SECTOR_SIZE);
1942 blk_queue_max_segment_size(q, segment_size);
1943 blk_queue_io_min(q, segment_size);
1944 blk_queue_io_opt(q, segment_size);
Josh Durgin029bcbd2011-07-22 11:35:23 -07001945
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001946 blk_queue_merge_bvec(q, rbd_merge_bvec);
1947 disk->queue = q;
1948
1949 q->queuedata = rbd_dev;
1950
1951 rbd_dev->disk = disk;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001952
Alex Elder12f02942012-08-29 17:11:07 -05001953 set_capacity(rbd_dev->disk, rbd_dev->mapping.size / SECTOR_SIZE);
1954
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001955 return 0;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001956out_disk:
1957 put_disk(disk);
Alex Elder1fcdb8a2012-08-29 17:11:06 -05001958
1959 return -ENOMEM;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001960}
1961
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001962/*
1963 sysfs
1964*/
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001965
Alex Elder593a9e72012-02-07 12:03:37 -06001966static struct rbd_device *dev_to_rbd_dev(struct device *dev)
1967{
1968 return container_of(dev, struct rbd_device, dev);
1969}
1970
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001971static ssize_t rbd_size_show(struct device *dev,
1972 struct device_attribute *attr, char *buf)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001973{
Alex Elder593a9e72012-02-07 12:03:37 -06001974 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Josh Durgina51aa0c2011-12-05 10:35:04 -08001975 sector_t size;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001976
Josh Durgina51aa0c2011-12-05 10:35:04 -08001977 down_read(&rbd_dev->header_rwsem);
1978 size = get_capacity(rbd_dev->disk);
1979 up_read(&rbd_dev->header_rwsem);
1980
1981 return sprintf(buf, "%llu\n", (unsigned long long) size * SECTOR_SIZE);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001982}
1983
Alex Elder34b13182012-07-13 20:35:12 -05001984/*
1985 * Note this shows the features for whatever's mapped, which is not
1986 * necessarily the base image.
1987 */
1988static ssize_t rbd_features_show(struct device *dev,
1989 struct device_attribute *attr, char *buf)
1990{
1991 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
1992
1993 return sprintf(buf, "0x%016llx\n",
1994 (unsigned long long) rbd_dev->mapping.features);
1995}
1996
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001997static ssize_t rbd_major_show(struct device *dev,
1998 struct device_attribute *attr, char *buf)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001999{
Alex Elder593a9e72012-02-07 12:03:37 -06002000 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002001
2002 return sprintf(buf, "%d\n", rbd_dev->major);
2003}
2004
2005static ssize_t rbd_client_id_show(struct device *dev,
2006 struct device_attribute *attr, char *buf)
2007{
Alex Elder593a9e72012-02-07 12:03:37 -06002008 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002009
Alex Elder1dbb4392012-01-24 10:08:37 -06002010 return sprintf(buf, "client%lld\n",
2011 ceph_client_id(rbd_dev->rbd_client->client));
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002012}
2013
2014static ssize_t rbd_pool_show(struct device *dev,
2015 struct device_attribute *attr, char *buf)
2016{
Alex Elder593a9e72012-02-07 12:03:37 -06002017 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002018
Alex Elder0d7dbfc2012-10-25 23:34:41 -05002019 return sprintf(buf, "%s\n", rbd_dev->spec->pool_name);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002020}
2021
Alex Elder9bb2f332012-07-12 10:46:35 -05002022static ssize_t rbd_pool_id_show(struct device *dev,
2023 struct device_attribute *attr, char *buf)
2024{
2025 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
2026
Alex Elder0d7dbfc2012-10-25 23:34:41 -05002027 return sprintf(buf, "%llu\n",
2028 (unsigned long long) rbd_dev->spec->pool_id);
Alex Elder9bb2f332012-07-12 10:46:35 -05002029}
2030
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002031static ssize_t rbd_name_show(struct device *dev,
2032 struct device_attribute *attr, char *buf)
2033{
Alex Elder593a9e72012-02-07 12:03:37 -06002034 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002035
Alex Eldera92ffdf2012-10-30 19:40:33 -05002036 if (rbd_dev->spec->image_name)
2037 return sprintf(buf, "%s\n", rbd_dev->spec->image_name);
2038
2039 return sprintf(buf, "(unknown)\n");
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002040}
2041
Alex Elder589d30e2012-07-10 20:30:11 -05002042static ssize_t rbd_image_id_show(struct device *dev,
2043 struct device_attribute *attr, char *buf)
2044{
2045 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
2046
Alex Elder0d7dbfc2012-10-25 23:34:41 -05002047 return sprintf(buf, "%s\n", rbd_dev->spec->image_id);
Alex Elder589d30e2012-07-10 20:30:11 -05002048}
2049
Alex Elder34b13182012-07-13 20:35:12 -05002050/*
2051 * Shows the name of the currently-mapped snapshot (or
2052 * RBD_SNAP_HEAD_NAME for the base image).
2053 */
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002054static ssize_t rbd_snap_show(struct device *dev,
2055 struct device_attribute *attr,
2056 char *buf)
2057{
Alex Elder593a9e72012-02-07 12:03:37 -06002058 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002059
Alex Elder0d7dbfc2012-10-25 23:34:41 -05002060 return sprintf(buf, "%s\n", rbd_dev->spec->snap_name);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002061}
2062
Alex Elder86b00e02012-10-25 23:34:42 -05002063/*
2064 * For an rbd v2 image, shows the pool id, image id, and snapshot id
2065 * for the parent image. If there is no parent, simply shows
2066 * "(no parent image)".
2067 */
2068static ssize_t rbd_parent_show(struct device *dev,
2069 struct device_attribute *attr,
2070 char *buf)
2071{
2072 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
2073 struct rbd_spec *spec = rbd_dev->parent_spec;
2074 int count;
2075 char *bufp = buf;
2076
2077 if (!spec)
2078 return sprintf(buf, "(no parent image)\n");
2079
2080 count = sprintf(bufp, "pool_id %llu\npool_name %s\n",
2081 (unsigned long long) spec->pool_id, spec->pool_name);
2082 if (count < 0)
2083 return count;
2084 bufp += count;
2085
2086 count = sprintf(bufp, "image_id %s\nimage_name %s\n", spec->image_id,
2087 spec->image_name ? spec->image_name : "(unknown)");
2088 if (count < 0)
2089 return count;
2090 bufp += count;
2091
2092 count = sprintf(bufp, "snap_id %llu\nsnap_name %s\n",
2093 (unsigned long long) spec->snap_id, spec->snap_name);
2094 if (count < 0)
2095 return count;
2096 bufp += count;
2097
2098 count = sprintf(bufp, "overlap %llu\n", rbd_dev->parent_overlap);
2099 if (count < 0)
2100 return count;
2101 bufp += count;
2102
2103 return (ssize_t) (bufp - buf);
2104}
2105
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002106static ssize_t rbd_image_refresh(struct device *dev,
2107 struct device_attribute *attr,
2108 const char *buf,
2109 size_t size)
2110{
Alex Elder593a9e72012-02-07 12:03:37 -06002111 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Alex Elderb8136232012-07-25 09:32:41 -05002112 int ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002113
Alex Elder117973f2012-08-31 17:29:55 -05002114 ret = rbd_dev_refresh(rbd_dev, NULL);
Alex Elderb8136232012-07-25 09:32:41 -05002115
2116 return ret < 0 ? ret : size;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002117}
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002118
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002119static DEVICE_ATTR(size, S_IRUGO, rbd_size_show, NULL);
Alex Elder34b13182012-07-13 20:35:12 -05002120static DEVICE_ATTR(features, S_IRUGO, rbd_features_show, NULL);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002121static DEVICE_ATTR(major, S_IRUGO, rbd_major_show, NULL);
2122static DEVICE_ATTR(client_id, S_IRUGO, rbd_client_id_show, NULL);
2123static DEVICE_ATTR(pool, S_IRUGO, rbd_pool_show, NULL);
Alex Elder9bb2f332012-07-12 10:46:35 -05002124static DEVICE_ATTR(pool_id, S_IRUGO, rbd_pool_id_show, NULL);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002125static DEVICE_ATTR(name, S_IRUGO, rbd_name_show, NULL);
Alex Elder589d30e2012-07-10 20:30:11 -05002126static DEVICE_ATTR(image_id, S_IRUGO, rbd_image_id_show, NULL);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002127static DEVICE_ATTR(refresh, S_IWUSR, NULL, rbd_image_refresh);
2128static DEVICE_ATTR(current_snap, S_IRUGO, rbd_snap_show, NULL);
Alex Elder86b00e02012-10-25 23:34:42 -05002129static DEVICE_ATTR(parent, S_IRUGO, rbd_parent_show, NULL);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002130
2131static struct attribute *rbd_attrs[] = {
2132 &dev_attr_size.attr,
Alex Elder34b13182012-07-13 20:35:12 -05002133 &dev_attr_features.attr,
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002134 &dev_attr_major.attr,
2135 &dev_attr_client_id.attr,
2136 &dev_attr_pool.attr,
Alex Elder9bb2f332012-07-12 10:46:35 -05002137 &dev_attr_pool_id.attr,
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002138 &dev_attr_name.attr,
Alex Elder589d30e2012-07-10 20:30:11 -05002139 &dev_attr_image_id.attr,
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002140 &dev_attr_current_snap.attr,
Alex Elder86b00e02012-10-25 23:34:42 -05002141 &dev_attr_parent.attr,
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002142 &dev_attr_refresh.attr,
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002143 NULL
2144};
2145
2146static struct attribute_group rbd_attr_group = {
2147 .attrs = rbd_attrs,
2148};
2149
2150static const struct attribute_group *rbd_attr_groups[] = {
2151 &rbd_attr_group,
2152 NULL
2153};
2154
/*
 * Release callback for the rbd device type.  It intentionally does
 * nothing; no memory is freed here.
 */
static void rbd_sysfs_dev_release(struct device *dev)
{
	/* nothing to do */
}
2158
2159static struct device_type rbd_device_type = {
2160 .name = "rbd",
2161 .groups = rbd_attr_groups,
2162 .release = rbd_sysfs_dev_release,
2163};
2164
2165
2166/*
2167 sysfs - snapshots
2168*/
2169
2170static ssize_t rbd_snap_size_show(struct device *dev,
2171 struct device_attribute *attr,
2172 char *buf)
2173{
2174 struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
2175
Josh Durgin3591538f2011-12-05 18:25:13 -08002176 return sprintf(buf, "%llu\n", (unsigned long long)snap->size);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002177}
2178
2179static ssize_t rbd_snap_id_show(struct device *dev,
2180 struct device_attribute *attr,
2181 char *buf)
2182{
2183 struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
2184
Josh Durgin3591538f2011-12-05 18:25:13 -08002185 return sprintf(buf, "%llu\n", (unsigned long long)snap->id);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002186}
2187
Alex Elder34b13182012-07-13 20:35:12 -05002188static ssize_t rbd_snap_features_show(struct device *dev,
2189 struct device_attribute *attr,
2190 char *buf)
2191{
2192 struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
2193
2194 return sprintf(buf, "0x%016llx\n",
2195 (unsigned long long) snap->features);
2196}
2197
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002198static DEVICE_ATTR(snap_size, S_IRUGO, rbd_snap_size_show, NULL);
2199static DEVICE_ATTR(snap_id, S_IRUGO, rbd_snap_id_show, NULL);
Alex Elder34b13182012-07-13 20:35:12 -05002200static DEVICE_ATTR(snap_features, S_IRUGO, rbd_snap_features_show, NULL);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002201
2202static struct attribute *rbd_snap_attrs[] = {
2203 &dev_attr_snap_size.attr,
2204 &dev_attr_snap_id.attr,
Alex Elder34b13182012-07-13 20:35:12 -05002205 &dev_attr_snap_features.attr,
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002206 NULL,
2207};
2208
2209static struct attribute_group rbd_snap_attr_group = {
2210 .attrs = rbd_snap_attrs,
2211};
2212
2213static void rbd_snap_dev_release(struct device *dev)
2214{
2215 struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
2216 kfree(snap->name);
2217 kfree(snap);
2218}
2219
2220static const struct attribute_group *rbd_snap_attr_groups[] = {
2221 &rbd_snap_attr_group,
2222 NULL
2223};
2224
2225static struct device_type rbd_snap_device_type = {
2226 .groups = rbd_snap_attr_groups,
2227 .release = rbd_snap_dev_release,
2228};
2229
Alex Elder8b8fb992012-10-26 17:25:24 -05002230static struct rbd_spec *rbd_spec_get(struct rbd_spec *spec)
2231{
2232 kref_get(&spec->kref);
2233
2234 return spec;
2235}
2236
2237static void rbd_spec_free(struct kref *kref);
2238static void rbd_spec_put(struct rbd_spec *spec)
2239{
2240 if (spec)
2241 kref_put(&spec->kref, rbd_spec_free);
2242}
2243
2244static struct rbd_spec *rbd_spec_alloc(void)
2245{
2246 struct rbd_spec *spec;
2247
2248 spec = kzalloc(sizeof (*spec), GFP_KERNEL);
2249 if (!spec)
2250 return NULL;
2251 kref_init(&spec->kref);
2252
2253 rbd_spec_put(rbd_spec_get(spec)); /* TEMPORARY */
2254
2255 return spec;
2256}
2257
2258static void rbd_spec_free(struct kref *kref)
2259{
2260 struct rbd_spec *spec = container_of(kref, struct rbd_spec, kref);
2261
2262 kfree(spec->pool_name);
2263 kfree(spec->image_id);
2264 kfree(spec->image_name);
2265 kfree(spec->snap_name);
2266 kfree(spec);
2267}
2268
Alex Elderc53d5892012-10-25 23:34:42 -05002269struct rbd_device *rbd_dev_create(struct rbd_client *rbdc,
2270 struct rbd_spec *spec)
2271{
2272 struct rbd_device *rbd_dev;
2273
2274 rbd_dev = kzalloc(sizeof (*rbd_dev), GFP_KERNEL);
2275 if (!rbd_dev)
2276 return NULL;
2277
2278 spin_lock_init(&rbd_dev->lock);
Alex Elderd78b6502012-11-09 08:43:15 -06002279 atomic_set(&rbd_dev->exists, 0);
Alex Elderc53d5892012-10-25 23:34:42 -05002280 INIT_LIST_HEAD(&rbd_dev->node);
2281 INIT_LIST_HEAD(&rbd_dev->snaps);
2282 init_rwsem(&rbd_dev->header_rwsem);
2283
2284 rbd_dev->spec = spec;
2285 rbd_dev->rbd_client = rbdc;
2286
Alex Elder0903e872012-11-14 12:25:19 -06002287 /* Initialize the layout used for all rbd requests */
2288
2289 rbd_dev->layout.fl_stripe_unit = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
2290 rbd_dev->layout.fl_stripe_count = cpu_to_le32(1);
2291 rbd_dev->layout.fl_object_size = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
2292 rbd_dev->layout.fl_pg_pool = cpu_to_le32((u32) spec->pool_id);
2293
Alex Elderc53d5892012-10-25 23:34:42 -05002294 return rbd_dev;
2295}
2296
2297static void rbd_dev_destroy(struct rbd_device *rbd_dev)
2298{
Alex Elder86b00e02012-10-25 23:34:42 -05002299 rbd_spec_put(rbd_dev->parent_spec);
Alex Elderc53d5892012-10-25 23:34:42 -05002300 kfree(rbd_dev->header_name);
2301 rbd_put_client(rbd_dev->rbd_client);
2302 rbd_spec_put(rbd_dev->spec);
2303 kfree(rbd_dev);
2304}
2305
Alex Elder304f6802012-08-31 17:29:52 -05002306static bool rbd_snap_registered(struct rbd_snap *snap)
2307{
2308 bool ret = snap->dev.type == &rbd_snap_device_type;
2309 bool reg = device_is_registered(&snap->dev);
2310
2311 rbd_assert(!ret ^ reg);
2312
2313 return ret;
2314}
2315
Alex Elder41f38c22012-10-25 23:34:40 -05002316static void rbd_remove_snap_dev(struct rbd_snap *snap)
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002317{
2318 list_del(&snap->node);
Alex Elder304f6802012-08-31 17:29:52 -05002319 if (device_is_registered(&snap->dev))
2320 device_unregister(&snap->dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002321}
2322
Alex Elder14e70852012-07-19 09:09:27 -05002323static int rbd_register_snap_dev(struct rbd_snap *snap,
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002324 struct device *parent)
2325{
2326 struct device *dev = &snap->dev;
2327 int ret;
2328
2329 dev->type = &rbd_snap_device_type;
2330 dev->parent = parent;
2331 dev->release = rbd_snap_dev_release;
Alex Elderd4b125e2012-07-03 16:01:19 -05002332 dev_set_name(dev, "%s%s", RBD_SNAP_DEV_NAME_PREFIX, snap->name);
Alex Elder304f6802012-08-31 17:29:52 -05002333 dout("%s: registering device for snapshot %s\n", __func__, snap->name);
2334
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002335 ret = device_register(dev);
2336
2337 return ret;
2338}
2339
Alex Elder4e891e02012-07-10 20:30:10 -05002340static struct rbd_snap *__rbd_add_snap_dev(struct rbd_device *rbd_dev,
Alex Elderc8d18422012-07-10 20:30:11 -05002341 const char *snap_name,
Alex Elder34b13182012-07-13 20:35:12 -05002342 u64 snap_id, u64 snap_size,
2343 u64 snap_features)
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002344{
Alex Elder4e891e02012-07-10 20:30:10 -05002345 struct rbd_snap *snap;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002346 int ret;
Alex Elder4e891e02012-07-10 20:30:10 -05002347
2348 snap = kzalloc(sizeof (*snap), GFP_KERNEL);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002349 if (!snap)
Alex Elder4e891e02012-07-10 20:30:10 -05002350 return ERR_PTR(-ENOMEM);
2351
2352 ret = -ENOMEM;
Alex Elderc8d18422012-07-10 20:30:11 -05002353 snap->name = kstrdup(snap_name, GFP_KERNEL);
Alex Elder4e891e02012-07-10 20:30:10 -05002354 if (!snap->name)
2355 goto err;
2356
Alex Elderc8d18422012-07-10 20:30:11 -05002357 snap->id = snap_id;
2358 snap->size = snap_size;
Alex Elder34b13182012-07-13 20:35:12 -05002359 snap->features = snap_features;
Alex Elder4e891e02012-07-10 20:30:10 -05002360
2361 return snap;
2362
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002363err:
2364 kfree(snap->name);
2365 kfree(snap);
Alex Elder4e891e02012-07-10 20:30:10 -05002366
2367 return ERR_PTR(ret);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002368}
2369
Alex Eldercd892122012-07-03 16:01:19 -05002370static char *rbd_dev_v1_snap_info(struct rbd_device *rbd_dev, u32 which,
2371 u64 *snap_size, u64 *snap_features)
2372{
2373 char *snap_name;
2374
2375 rbd_assert(which < rbd_dev->header.snapc->num_snaps);
2376
2377 *snap_size = rbd_dev->header.snap_sizes[which];
2378 *snap_features = 0; /* No features for v1 */
2379
2380 /* Skip over names until we find the one we are looking for */
2381
2382 snap_name = rbd_dev->header.snap_names;
2383 while (which--)
2384 snap_name += strlen(snap_name) + 1;
2385
2386 return snap_name;
2387}
2388
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002389/*
Alex Elder9d475de2012-07-03 16:01:19 -05002390 * Get the size and object order for an image snapshot, or if
2391 * snap_id is CEPH_NOSNAP, gets this information for the base
2392 * image.
2393 */
2394static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id,
2395 u8 *order, u64 *snap_size)
2396{
2397 __le64 snapid = cpu_to_le64(snap_id);
2398 int ret;
2399 struct {
2400 u8 order;
2401 __le64 size;
2402 } __attribute__ ((packed)) size_buf = { 0 };
2403
2404 ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name,
2405 "rbd", "get_size",
2406 (char *) &snapid, sizeof (snapid),
Alex Elder07b23912012-11-09 08:43:16 -06002407 (char *) &size_buf, sizeof (size_buf), NULL);
Alex Elder9d475de2012-07-03 16:01:19 -05002408 dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
2409 if (ret < 0)
2410 return ret;
2411
2412 *order = size_buf.order;
2413 *snap_size = le64_to_cpu(size_buf.size);
2414
2415 dout(" snap_id 0x%016llx order = %u, snap_size = %llu\n",
2416 (unsigned long long) snap_id, (unsigned int) *order,
2417 (unsigned long long) *snap_size);
2418
2419 return 0;
2420}
2421
2422static int rbd_dev_v2_image_size(struct rbd_device *rbd_dev)
2423{
2424 return _rbd_dev_v2_snap_size(rbd_dev, CEPH_NOSNAP,
2425 &rbd_dev->header.obj_order,
2426 &rbd_dev->header.image_size);
2427}
2428
Alex Elder1e130192012-07-03 16:01:19 -05002429static int rbd_dev_v2_object_prefix(struct rbd_device *rbd_dev)
2430{
2431 void *reply_buf;
2432 int ret;
2433 void *p;
2434
2435 reply_buf = kzalloc(RBD_OBJ_PREFIX_LEN_MAX, GFP_KERNEL);
2436 if (!reply_buf)
2437 return -ENOMEM;
2438
2439 ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name,
2440 "rbd", "get_object_prefix",
2441 NULL, 0,
Alex Elder07b23912012-11-09 08:43:16 -06002442 reply_buf, RBD_OBJ_PREFIX_LEN_MAX, NULL);
Alex Elder1e130192012-07-03 16:01:19 -05002443 dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
2444 if (ret < 0)
2445 goto out;
Alex Eldera0ea3a42012-10-10 21:19:13 -07002446 ret = 0; /* rbd_req_sync_exec() can return positive */
Alex Elder1e130192012-07-03 16:01:19 -05002447
2448 p = reply_buf;
2449 rbd_dev->header.object_prefix = ceph_extract_encoded_string(&p,
2450 p + RBD_OBJ_PREFIX_LEN_MAX,
2451 NULL, GFP_NOIO);
2452
2453 if (IS_ERR(rbd_dev->header.object_prefix)) {
2454 ret = PTR_ERR(rbd_dev->header.object_prefix);
2455 rbd_dev->header.object_prefix = NULL;
2456 } else {
2457 dout(" object_prefix = %s\n", rbd_dev->header.object_prefix);
2458 }
2459
2460out:
2461 kfree(reply_buf);
2462
2463 return ret;
2464}
2465
Alex Elderb1b54022012-07-03 16:01:19 -05002466static int _rbd_dev_v2_snap_features(struct rbd_device *rbd_dev, u64 snap_id,
2467 u64 *snap_features)
2468{
2469 __le64 snapid = cpu_to_le64(snap_id);
2470 struct {
2471 __le64 features;
2472 __le64 incompat;
2473 } features_buf = { 0 };
Alex Elderd8891402012-10-09 13:50:17 -07002474 u64 incompat;
Alex Elderb1b54022012-07-03 16:01:19 -05002475 int ret;
2476
2477 ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name,
2478 "rbd", "get_features",
2479 (char *) &snapid, sizeof (snapid),
2480 (char *) &features_buf, sizeof (features_buf),
Alex Elder07b23912012-11-09 08:43:16 -06002481 NULL);
Alex Elderb1b54022012-07-03 16:01:19 -05002482 dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
2483 if (ret < 0)
2484 return ret;
Alex Elderd8891402012-10-09 13:50:17 -07002485
2486 incompat = le64_to_cpu(features_buf.incompat);
2487 if (incompat & ~RBD_FEATURES_ALL)
Alex Elderb8f5c6e2012-11-01 08:39:26 -05002488 return -ENXIO;
Alex Elderd8891402012-10-09 13:50:17 -07002489
Alex Elderb1b54022012-07-03 16:01:19 -05002490 *snap_features = le64_to_cpu(features_buf.features);
2491
2492 dout(" snap_id 0x%016llx features = 0x%016llx incompat = 0x%016llx\n",
2493 (unsigned long long) snap_id,
2494 (unsigned long long) *snap_features,
2495 (unsigned long long) le64_to_cpu(features_buf.incompat));
2496
2497 return 0;
2498}
2499
2500static int rbd_dev_v2_features(struct rbd_device *rbd_dev)
2501{
2502 return _rbd_dev_v2_snap_features(rbd_dev, CEPH_NOSNAP,
2503 &rbd_dev->header.features);
2504}
2505
Alex Elder86b00e02012-10-25 23:34:42 -05002506static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev)
2507{
2508 struct rbd_spec *parent_spec;
2509 size_t size;
2510 void *reply_buf = NULL;
2511 __le64 snapid;
2512 void *p;
2513 void *end;
2514 char *image_id;
2515 u64 overlap;
Alex Elder86b00e02012-10-25 23:34:42 -05002516 int ret;
2517
2518 parent_spec = rbd_spec_alloc();
2519 if (!parent_spec)
2520 return -ENOMEM;
2521
2522 size = sizeof (__le64) + /* pool_id */
2523 sizeof (__le32) + RBD_IMAGE_ID_LEN_MAX + /* image_id */
2524 sizeof (__le64) + /* snap_id */
2525 sizeof (__le64); /* overlap */
2526 reply_buf = kmalloc(size, GFP_KERNEL);
2527 if (!reply_buf) {
2528 ret = -ENOMEM;
2529 goto out_err;
2530 }
2531
2532 snapid = cpu_to_le64(CEPH_NOSNAP);
2533 ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name,
2534 "rbd", "get_parent",
2535 (char *) &snapid, sizeof (snapid),
Alex Elder07b23912012-11-09 08:43:16 -06002536 (char *) reply_buf, size, NULL);
Alex Elder86b00e02012-10-25 23:34:42 -05002537 dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
2538 if (ret < 0)
2539 goto out_err;
2540
2541 ret = -ERANGE;
2542 p = reply_buf;
2543 end = (char *) reply_buf + size;
2544 ceph_decode_64_safe(&p, end, parent_spec->pool_id, out_err);
2545 if (parent_spec->pool_id == CEPH_NOPOOL)
2546 goto out; /* No parent? No problem. */
2547
Alex Elder0903e872012-11-14 12:25:19 -06002548 /* The ceph file layout needs to fit pool id in 32 bits */
2549
2550 ret = -EIO;
2551 if (WARN_ON(parent_spec->pool_id > (u64) U32_MAX))
2552 goto out;
2553
Alex Elder979ed482012-11-01 08:39:26 -05002554 image_id = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL);
Alex Elder86b00e02012-10-25 23:34:42 -05002555 if (IS_ERR(image_id)) {
2556 ret = PTR_ERR(image_id);
2557 goto out_err;
2558 }
2559 parent_spec->image_id = image_id;
2560 ceph_decode_64_safe(&p, end, parent_spec->snap_id, out_err);
2561 ceph_decode_64_safe(&p, end, overlap, out_err);
2562
2563 rbd_dev->parent_overlap = overlap;
2564 rbd_dev->parent_spec = parent_spec;
2565 parent_spec = NULL; /* rbd_dev now owns this */
2566out:
2567 ret = 0;
2568out_err:
2569 kfree(reply_buf);
2570 rbd_spec_put(parent_spec);
2571
2572 return ret;
2573}
2574
Alex Elder9e15b772012-10-30 19:40:33 -05002575static char *rbd_dev_image_name(struct rbd_device *rbd_dev)
2576{
2577 size_t image_id_size;
2578 char *image_id;
2579 void *p;
2580 void *end;
2581 size_t size;
2582 void *reply_buf = NULL;
2583 size_t len = 0;
2584 char *image_name = NULL;
2585 int ret;
2586
2587 rbd_assert(!rbd_dev->spec->image_name);
2588
Alex Elder69e7a022012-11-01 08:39:26 -05002589 len = strlen(rbd_dev->spec->image_id);
2590 image_id_size = sizeof (__le32) + len;
Alex Elder9e15b772012-10-30 19:40:33 -05002591 image_id = kmalloc(image_id_size, GFP_KERNEL);
2592 if (!image_id)
2593 return NULL;
2594
2595 p = image_id;
2596 end = (char *) image_id + image_id_size;
Alex Elder69e7a022012-11-01 08:39:26 -05002597 ceph_encode_string(&p, end, rbd_dev->spec->image_id, (u32) len);
Alex Elder9e15b772012-10-30 19:40:33 -05002598
2599 size = sizeof (__le32) + RBD_IMAGE_NAME_LEN_MAX;
2600 reply_buf = kmalloc(size, GFP_KERNEL);
2601 if (!reply_buf)
2602 goto out;
2603
2604 ret = rbd_req_sync_exec(rbd_dev, RBD_DIRECTORY,
2605 "rbd", "dir_get_name",
2606 image_id, image_id_size,
Alex Elder07b23912012-11-09 08:43:16 -06002607 (char *) reply_buf, size, NULL);
Alex Elder9e15b772012-10-30 19:40:33 -05002608 if (ret < 0)
2609 goto out;
2610 p = reply_buf;
2611 end = (char *) reply_buf + size;
2612 image_name = ceph_extract_encoded_string(&p, end, &len, GFP_KERNEL);
2613 if (IS_ERR(image_name))
2614 image_name = NULL;
2615 else
2616 dout("%s: name is %s len is %zd\n", __func__, image_name, len);
2617out:
2618 kfree(reply_buf);
2619 kfree(image_id);
2620
2621 return image_name;
2622}
2623
2624/*
2625 * When a parent image gets probed, we only have the pool, image,
2626 * and snapshot ids but not the names of any of them. This call
2627 * is made later to fill in those names. It has to be done after
2628 * rbd_dev_snaps_update() has completed because some of the
2629 * information (in particular, snapshot name) is not available
2630 * until then.
2631 */
2632static int rbd_dev_probe_update_spec(struct rbd_device *rbd_dev)
2633{
2634 struct ceph_osd_client *osdc;
2635 const char *name;
2636 void *reply_buf = NULL;
2637 int ret;
2638
2639 if (rbd_dev->spec->pool_name)
2640 return 0; /* Already have the names */
2641
2642 /* Look up the pool name */
2643
2644 osdc = &rbd_dev->rbd_client->client->osdc;
2645 name = ceph_pg_pool_name_by_id(osdc->osdmap, rbd_dev->spec->pool_id);
Alex Elder935dc892012-11-01 10:17:15 -05002646 if (!name) {
2647 rbd_warn(rbd_dev, "there is no pool with id %llu",
2648 rbd_dev->spec->pool_id); /* Really a BUG() */
2649 return -EIO;
2650 }
Alex Elder9e15b772012-10-30 19:40:33 -05002651
2652 rbd_dev->spec->pool_name = kstrdup(name, GFP_KERNEL);
2653 if (!rbd_dev->spec->pool_name)
2654 return -ENOMEM;
2655
2656 /* Fetch the image name; tolerate failure here */
2657
2658 name = rbd_dev_image_name(rbd_dev);
Alex Elder69e7a022012-11-01 08:39:26 -05002659 if (name)
Alex Elder9e15b772012-10-30 19:40:33 -05002660 rbd_dev->spec->image_name = (char *) name;
Alex Elder69e7a022012-11-01 08:39:26 -05002661 else
Alex Elder06ecc6c2012-11-01 10:17:15 -05002662 rbd_warn(rbd_dev, "unable to get image name");
Alex Elder9e15b772012-10-30 19:40:33 -05002663
2664 /* Look up the snapshot name. */
2665
2666 name = rbd_snap_name(rbd_dev, rbd_dev->spec->snap_id);
2667 if (!name) {
Alex Elder935dc892012-11-01 10:17:15 -05002668 rbd_warn(rbd_dev, "no snapshot with id %llu",
2669 rbd_dev->spec->snap_id); /* Really a BUG() */
Alex Elder9e15b772012-10-30 19:40:33 -05002670 ret = -EIO;
2671 goto out_err;
2672 }
2673 rbd_dev->spec->snap_name = kstrdup(name, GFP_KERNEL);
2674 if(!rbd_dev->spec->snap_name)
2675 goto out_err;
2676
2677 return 0;
2678out_err:
2679 kfree(reply_buf);
2680 kfree(rbd_dev->spec->pool_name);
2681 rbd_dev->spec->pool_name = NULL;
2682
2683 return ret;
2684}
2685
Alex Elder6e14b1a2012-07-03 16:01:19 -05002686static int rbd_dev_v2_snap_context(struct rbd_device *rbd_dev, u64 *ver)
Alex Elder35d489f2012-07-03 16:01:19 -05002687{
2688 size_t size;
2689 int ret;
2690 void *reply_buf;
2691 void *p;
2692 void *end;
2693 u64 seq;
2694 u32 snap_count;
2695 struct ceph_snap_context *snapc;
2696 u32 i;
2697
2698 /*
2699 * We'll need room for the seq value (maximum snapshot id),
2700 * snapshot count, and array of that many snapshot ids.
2701 * For now we have a fixed upper limit on the number we're
2702 * prepared to receive.
2703 */
2704 size = sizeof (__le64) + sizeof (__le32) +
2705 RBD_MAX_SNAP_COUNT * sizeof (__le64);
2706 reply_buf = kzalloc(size, GFP_KERNEL);
2707 if (!reply_buf)
2708 return -ENOMEM;
2709
2710 ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name,
2711 "rbd", "get_snapcontext",
2712 NULL, 0,
Alex Elder07b23912012-11-09 08:43:16 -06002713 reply_buf, size, ver);
Alex Elder35d489f2012-07-03 16:01:19 -05002714 dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
2715 if (ret < 0)
2716 goto out;
2717
2718 ret = -ERANGE;
2719 p = reply_buf;
2720 end = (char *) reply_buf + size;
2721 ceph_decode_64_safe(&p, end, seq, out);
2722 ceph_decode_32_safe(&p, end, snap_count, out);
2723
2724 /*
2725 * Make sure the reported number of snapshot ids wouldn't go
2726 * beyond the end of our buffer. But before checking that,
2727 * make sure the computed size of the snapshot context we
2728 * allocate is representable in a size_t.
2729 */
2730 if (snap_count > (SIZE_MAX - sizeof (struct ceph_snap_context))
2731 / sizeof (u64)) {
2732 ret = -EINVAL;
2733 goto out;
2734 }
2735 if (!ceph_has_room(&p, end, snap_count * sizeof (__le64)))
2736 goto out;
2737
2738 size = sizeof (struct ceph_snap_context) +
2739 snap_count * sizeof (snapc->snaps[0]);
2740 snapc = kmalloc(size, GFP_KERNEL);
2741 if (!snapc) {
2742 ret = -ENOMEM;
2743 goto out;
2744 }
2745
2746 atomic_set(&snapc->nref, 1);
2747 snapc->seq = seq;
2748 snapc->num_snaps = snap_count;
2749 for (i = 0; i < snap_count; i++)
2750 snapc->snaps[i] = ceph_decode_64(&p);
2751
2752 rbd_dev->header.snapc = snapc;
2753
2754 dout(" snap context seq = %llu, snap_count = %u\n",
2755 (unsigned long long) seq, (unsigned int) snap_count);
2756
2757out:
2758 kfree(reply_buf);
2759
2760 return 0;
2761}
2762
Alex Elderb8b1e2d2012-07-03 16:01:19 -05002763static char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev, u32 which)
2764{
2765 size_t size;
2766 void *reply_buf;
2767 __le64 snap_id;
2768 int ret;
2769 void *p;
2770 void *end;
Alex Elderb8b1e2d2012-07-03 16:01:19 -05002771 char *snap_name;
2772
2773 size = sizeof (__le32) + RBD_MAX_SNAP_NAME_LEN;
2774 reply_buf = kmalloc(size, GFP_KERNEL);
2775 if (!reply_buf)
2776 return ERR_PTR(-ENOMEM);
2777
2778 snap_id = cpu_to_le64(rbd_dev->header.snapc->snaps[which]);
2779 ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name,
2780 "rbd", "get_snapshot_name",
2781 (char *) &snap_id, sizeof (snap_id),
Alex Elder07b23912012-11-09 08:43:16 -06002782 reply_buf, size, NULL);
Alex Elderb8b1e2d2012-07-03 16:01:19 -05002783 dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
2784 if (ret < 0)
2785 goto out;
2786
2787 p = reply_buf;
2788 end = (char *) reply_buf + size;
Alex Eldere5c35532012-10-25 23:34:41 -05002789 snap_name = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL);
Alex Elderb8b1e2d2012-07-03 16:01:19 -05002790 if (IS_ERR(snap_name)) {
2791 ret = PTR_ERR(snap_name);
2792 goto out;
2793 } else {
2794 dout(" snap_id 0x%016llx snap_name = %s\n",
2795 (unsigned long long) le64_to_cpu(snap_id), snap_name);
2796 }
2797 kfree(reply_buf);
2798
2799 return snap_name;
2800out:
2801 kfree(reply_buf);
2802
2803 return ERR_PTR(ret);
2804}
2805
2806static char *rbd_dev_v2_snap_info(struct rbd_device *rbd_dev, u32 which,
2807 u64 *snap_size, u64 *snap_features)
2808{
2809 __le64 snap_id;
2810 u8 order;
2811 int ret;
2812
2813 snap_id = rbd_dev->header.snapc->snaps[which];
2814 ret = _rbd_dev_v2_snap_size(rbd_dev, snap_id, &order, snap_size);
2815 if (ret)
2816 return ERR_PTR(ret);
2817 ret = _rbd_dev_v2_snap_features(rbd_dev, snap_id, snap_features);
2818 if (ret)
2819 return ERR_PTR(ret);
2820
2821 return rbd_dev_v2_snap_name(rbd_dev, which);
2822}
2823
2824static char *rbd_dev_snap_info(struct rbd_device *rbd_dev, u32 which,
2825 u64 *snap_size, u64 *snap_features)
2826{
2827 if (rbd_dev->image_format == 1)
2828 return rbd_dev_v1_snap_info(rbd_dev, which,
2829 snap_size, snap_features);
2830 if (rbd_dev->image_format == 2)
2831 return rbd_dev_v2_snap_info(rbd_dev, which,
2832 snap_size, snap_features);
2833 return ERR_PTR(-EINVAL);
2834}
2835
Alex Elder117973f2012-08-31 17:29:55 -05002836static int rbd_dev_v2_refresh(struct rbd_device *rbd_dev, u64 *hver)
2837{
2838 int ret;
2839 __u8 obj_order;
2840
2841 down_write(&rbd_dev->header_rwsem);
2842
2843 /* Grab old order first, to see if it changes */
2844
2845 obj_order = rbd_dev->header.obj_order,
2846 ret = rbd_dev_v2_image_size(rbd_dev);
2847 if (ret)
2848 goto out;
2849 if (rbd_dev->header.obj_order != obj_order) {
2850 ret = -EIO;
2851 goto out;
2852 }
2853 rbd_update_mapping_size(rbd_dev);
2854
2855 ret = rbd_dev_v2_snap_context(rbd_dev, hver);
2856 dout("rbd_dev_v2_snap_context returned %d\n", ret);
2857 if (ret)
2858 goto out;
2859 ret = rbd_dev_snaps_update(rbd_dev);
2860 dout("rbd_dev_snaps_update returned %d\n", ret);
2861 if (ret)
2862 goto out;
2863 ret = rbd_dev_snaps_register(rbd_dev);
2864 dout("rbd_dev_snaps_register returned %d\n", ret);
2865out:
2866 up_write(&rbd_dev->header_rwsem);
2867
2868 return ret;
2869}
2870
Alex Elder9d475de2012-07-03 16:01:19 -05002871/*
Alex Elder35938152012-08-02 11:29:46 -05002872 * Scan the rbd device's current snapshot list and compare it to the
2873 * newly-received snapshot context. Remove any existing snapshots
2874 * not present in the new snapshot context. Add a new snapshot for
2875 * any snaphots in the snapshot context not in the current list.
2876 * And verify there are no changes to snapshots we already know
2877 * about.
2878 *
2879 * Assumes the snapshots in the snapshot context are sorted by
2880 * snapshot id, highest id first. (Snapshots in the rbd_dev's list
2881 * are also maintained in that order.)
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002882 */
Alex Elder304f6802012-08-31 17:29:52 -05002883static int rbd_dev_snaps_update(struct rbd_device *rbd_dev)
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002884{
Alex Elder35938152012-08-02 11:29:46 -05002885 struct ceph_snap_context *snapc = rbd_dev->header.snapc;
2886 const u32 snap_count = snapc->num_snaps;
Alex Elder35938152012-08-02 11:29:46 -05002887 struct list_head *head = &rbd_dev->snaps;
2888 struct list_head *links = head->next;
2889 u32 index = 0;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002890
Alex Elder9fcbb802012-08-23 23:48:49 -05002891 dout("%s: snap count is %u\n", __func__, (unsigned int) snap_count);
Alex Elder35938152012-08-02 11:29:46 -05002892 while (index < snap_count || links != head) {
2893 u64 snap_id;
2894 struct rbd_snap *snap;
Alex Eldercd892122012-07-03 16:01:19 -05002895 char *snap_name;
2896 u64 snap_size = 0;
2897 u64 snap_features = 0;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002898
Alex Elder35938152012-08-02 11:29:46 -05002899 snap_id = index < snap_count ? snapc->snaps[index]
2900 : CEPH_NOSNAP;
2901 snap = links != head ? list_entry(links, struct rbd_snap, node)
2902 : NULL;
Alex Elderaafb2302012-09-06 16:00:54 -05002903 rbd_assert(!snap || snap->id != CEPH_NOSNAP);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002904
Alex Elder35938152012-08-02 11:29:46 -05002905 if (snap_id == CEPH_NOSNAP || (snap && snap->id > snap_id)) {
2906 struct list_head *next = links->next;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002907
Alex Elder35938152012-08-02 11:29:46 -05002908 /* Existing snapshot not in the new snap context */
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002909
Alex Elder0d7dbfc2012-10-25 23:34:41 -05002910 if (rbd_dev->spec->snap_id == snap->id)
Alex Elderd78b6502012-11-09 08:43:15 -06002911 atomic_set(&rbd_dev->exists, 0);
Alex Elder41f38c22012-10-25 23:34:40 -05002912 rbd_remove_snap_dev(snap);
Alex Elder9fcbb802012-08-23 23:48:49 -05002913 dout("%ssnap id %llu has been removed\n",
Alex Elder0d7dbfc2012-10-25 23:34:41 -05002914 rbd_dev->spec->snap_id == snap->id ?
2915 "mapped " : "",
Alex Elder9fcbb802012-08-23 23:48:49 -05002916 (unsigned long long) snap->id);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002917
Alex Elder35938152012-08-02 11:29:46 -05002918 /* Done with this list entry; advance */
2919
2920 links = next;
2921 continue;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002922 }
Alex Elder35938152012-08-02 11:29:46 -05002923
Alex Elderb8b1e2d2012-07-03 16:01:19 -05002924 snap_name = rbd_dev_snap_info(rbd_dev, index,
2925 &snap_size, &snap_features);
Alex Eldercd892122012-07-03 16:01:19 -05002926 if (IS_ERR(snap_name))
2927 return PTR_ERR(snap_name);
2928
Alex Elder9fcbb802012-08-23 23:48:49 -05002929 dout("entry %u: snap_id = %llu\n", (unsigned int) snap_count,
2930 (unsigned long long) snap_id);
Alex Elder35938152012-08-02 11:29:46 -05002931 if (!snap || (snap_id != CEPH_NOSNAP && snap->id < snap_id)) {
2932 struct rbd_snap *new_snap;
2933
2934 /* We haven't seen this snapshot before */
2935
Alex Elderc8d18422012-07-10 20:30:11 -05002936 new_snap = __rbd_add_snap_dev(rbd_dev, snap_name,
Alex Eldercd892122012-07-03 16:01:19 -05002937 snap_id, snap_size, snap_features);
Alex Elder9fcbb802012-08-23 23:48:49 -05002938 if (IS_ERR(new_snap)) {
2939 int err = PTR_ERR(new_snap);
2940
2941 dout(" failed to add dev, error %d\n", err);
2942
2943 return err;
2944 }
Alex Elder35938152012-08-02 11:29:46 -05002945
2946 /* New goes before existing, or at end of list */
2947
Alex Elder9fcbb802012-08-23 23:48:49 -05002948 dout(" added dev%s\n", snap ? "" : " at end\n");
Alex Elder35938152012-08-02 11:29:46 -05002949 if (snap)
2950 list_add_tail(&new_snap->node, &snap->node);
2951 else
Alex Elder523f3252012-08-30 00:16:37 -05002952 list_add_tail(&new_snap->node, head);
Alex Elder35938152012-08-02 11:29:46 -05002953 } else {
2954 /* Already have this one */
2955
Alex Elder9fcbb802012-08-23 23:48:49 -05002956 dout(" already present\n");
2957
Alex Eldercd892122012-07-03 16:01:19 -05002958 rbd_assert(snap->size == snap_size);
Alex Elderaafb2302012-09-06 16:00:54 -05002959 rbd_assert(!strcmp(snap->name, snap_name));
Alex Eldercd892122012-07-03 16:01:19 -05002960 rbd_assert(snap->features == snap_features);
Alex Elder35938152012-08-02 11:29:46 -05002961
2962 /* Done with this list entry; advance */
2963
2964 links = links->next;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002965 }
Alex Elder35938152012-08-02 11:29:46 -05002966
2967 /* Advance to the next entry in the snapshot context */
2968
2969 index++;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002970 }
Alex Elder9fcbb802012-08-23 23:48:49 -05002971 dout("%s: done\n", __func__);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002972
2973 return 0;
2974}
2975
Alex Elder304f6802012-08-31 17:29:52 -05002976/*
2977 * Scan the list of snapshots and register the devices for any that
2978 * have not already been registered.
2979 */
2980static int rbd_dev_snaps_register(struct rbd_device *rbd_dev)
2981{
2982 struct rbd_snap *snap;
2983 int ret = 0;
2984
2985 dout("%s called\n", __func__);
Alex Elder86ff77b2012-08-31 17:29:53 -05002986 if (WARN_ON(!device_is_registered(&rbd_dev->dev)))
2987 return -EIO;
Alex Elder304f6802012-08-31 17:29:52 -05002988
2989 list_for_each_entry(snap, &rbd_dev->snaps, node) {
2990 if (!rbd_snap_registered(snap)) {
2991 ret = rbd_register_snap_dev(snap, &rbd_dev->dev);
2992 if (ret < 0)
2993 break;
2994 }
2995 }
2996 dout("%s: returning %d\n", __func__, ret);
2997
2998 return ret;
2999}
3000
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003001static int rbd_bus_add_dev(struct rbd_device *rbd_dev)
3002{
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003003 struct device *dev;
Alex Eldercd789ab2012-08-30 00:16:38 -05003004 int ret;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003005
3006 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003007
Alex Eldercd789ab2012-08-30 00:16:38 -05003008 dev = &rbd_dev->dev;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003009 dev->bus = &rbd_bus_type;
3010 dev->type = &rbd_device_type;
3011 dev->parent = &rbd_root_dev;
3012 dev->release = rbd_dev_release;
Alex Elderde71a292012-07-03 16:01:19 -05003013 dev_set_name(dev, "%d", rbd_dev->dev_id);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003014 ret = device_register(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003015
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003016 mutex_unlock(&ctl_mutex);
Alex Eldercd789ab2012-08-30 00:16:38 -05003017
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003018 return ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003019}
3020
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003021static void rbd_bus_del_dev(struct rbd_device *rbd_dev)
3022{
3023 device_unregister(&rbd_dev->dev);
3024}
3025
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07003026static int rbd_init_watch_dev(struct rbd_device *rbd_dev)
3027{
3028 int ret, rc;
3029
3030 do {
Alex Elder907703d2012-11-13 21:11:15 -06003031 ret = rbd_req_sync_watch(rbd_dev, 1);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07003032 if (ret == -ERANGE) {
Alex Elder117973f2012-08-31 17:29:55 -05003033 rc = rbd_dev_refresh(rbd_dev, NULL);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07003034 if (rc < 0)
3035 return rc;
3036 }
3037 } while (ret == -ERANGE);
3038
3039 return ret;
3040}
3041
Alex Eldere2839302012-08-29 17:11:06 -05003042static atomic64_t rbd_dev_id_max = ATOMIC64_INIT(0);
Alex Elder1ddbe942012-01-29 13:57:44 -06003043
3044/*
Alex Elder499afd52012-02-02 08:13:29 -06003045 * Get a unique rbd identifier for the given new rbd_dev, and add
3046 * the rbd_dev to the global list. The minimum rbd id is 1.
Alex Elder1ddbe942012-01-29 13:57:44 -06003047 */
Alex Eldere2839302012-08-29 17:11:06 -05003048static void rbd_dev_id_get(struct rbd_device *rbd_dev)
Alex Elderb7f23c32012-01-29 13:57:43 -06003049{
Alex Eldere2839302012-08-29 17:11:06 -05003050 rbd_dev->dev_id = atomic64_inc_return(&rbd_dev_id_max);
Alex Elder499afd52012-02-02 08:13:29 -06003051
3052 spin_lock(&rbd_dev_list_lock);
3053 list_add_tail(&rbd_dev->node, &rbd_dev_list);
3054 spin_unlock(&rbd_dev_list_lock);
Alex Eldere2839302012-08-29 17:11:06 -05003055 dout("rbd_dev %p given dev id %llu\n", rbd_dev,
3056 (unsigned long long) rbd_dev->dev_id);
Alex Elder1ddbe942012-01-29 13:57:44 -06003057}
Alex Elderb7f23c32012-01-29 13:57:43 -06003058
Alex Elder1ddbe942012-01-29 13:57:44 -06003059/*
Alex Elder499afd52012-02-02 08:13:29 -06003060 * Remove an rbd_dev from the global list, and record that its
3061 * identifier is no longer in use.
Alex Elder1ddbe942012-01-29 13:57:44 -06003062 */
Alex Eldere2839302012-08-29 17:11:06 -05003063static void rbd_dev_id_put(struct rbd_device *rbd_dev)
Alex Elder1ddbe942012-01-29 13:57:44 -06003064{
Alex Elderd184f6b2012-01-29 13:57:44 -06003065 struct list_head *tmp;
Alex Elderde71a292012-07-03 16:01:19 -05003066 int rbd_id = rbd_dev->dev_id;
Alex Elderd184f6b2012-01-29 13:57:44 -06003067 int max_id;
3068
Alex Elderaafb2302012-09-06 16:00:54 -05003069 rbd_assert(rbd_id > 0);
Alex Elder499afd52012-02-02 08:13:29 -06003070
Alex Eldere2839302012-08-29 17:11:06 -05003071 dout("rbd_dev %p released dev id %llu\n", rbd_dev,
3072 (unsigned long long) rbd_dev->dev_id);
Alex Elder499afd52012-02-02 08:13:29 -06003073 spin_lock(&rbd_dev_list_lock);
3074 list_del_init(&rbd_dev->node);
Alex Elderd184f6b2012-01-29 13:57:44 -06003075
3076 /*
3077 * If the id being "put" is not the current maximum, there
3078 * is nothing special we need to do.
3079 */
Alex Eldere2839302012-08-29 17:11:06 -05003080 if (rbd_id != atomic64_read(&rbd_dev_id_max)) {
Alex Elderd184f6b2012-01-29 13:57:44 -06003081 spin_unlock(&rbd_dev_list_lock);
3082 return;
3083 }
3084
3085 /*
3086 * We need to update the current maximum id. Search the
3087 * list to find out what it is. We're more likely to find
3088 * the maximum at the end, so search the list backward.
3089 */
3090 max_id = 0;
3091 list_for_each_prev(tmp, &rbd_dev_list) {
3092 struct rbd_device *rbd_dev;
3093
3094 rbd_dev = list_entry(tmp, struct rbd_device, node);
Alex Elderb213e0b2012-10-10 21:19:13 -07003095 if (rbd_dev->dev_id > max_id)
3096 max_id = rbd_dev->dev_id;
Alex Elderd184f6b2012-01-29 13:57:44 -06003097 }
Alex Elder499afd52012-02-02 08:13:29 -06003098 spin_unlock(&rbd_dev_list_lock);
Alex Elderb7f23c32012-01-29 13:57:43 -06003099
Alex Elder1ddbe942012-01-29 13:57:44 -06003100 /*
Alex Eldere2839302012-08-29 17:11:06 -05003101 * The max id could have been updated by rbd_dev_id_get(), in
Alex Elderd184f6b2012-01-29 13:57:44 -06003102 * which case it now accurately reflects the new maximum.
3103 * Be careful not to overwrite the maximum value in that
3104 * case.
Alex Elder1ddbe942012-01-29 13:57:44 -06003105 */
Alex Eldere2839302012-08-29 17:11:06 -05003106 atomic64_cmpxchg(&rbd_dev_id_max, rbd_id, max_id);
3107 dout(" max dev id has been reset\n");
Alex Elderb7f23c32012-01-29 13:57:43 -06003108}
3109
/*
 * Advances *buf past any leading white space so that it points at the
 * first non-space character (or at the terminating '\0' if there is
 * none), and returns the length of the token -- the run of non-white
 * space characters -- that starts there.  The string at *buf must be
 * terminated with '\0'.
 */
static inline size_t next_token(const char **buf)
{
	/*
	 * The characters for which isspace() is nonzero in the "C"
	 * and "POSIX" locales.
	 */
	static const char whitespace[] = " \f\n\r\t\v";
	const char *start = *buf;

	start += strspn(start, whitespace);	/* Skip to the token */
	*buf = start;

	return strcspn(start, whitespace);	/* Length of the token */
}
3128
3129/*
3130 * Finds the next token in *buf, and if the provided token buffer is
3131 * big enough, copies the found token into it. The result, if
Alex Elder593a9e72012-02-07 12:03:37 -06003132 * copied, is guaranteed to be terminated with '\0'. Note that *buf
3133 * must be terminated with '\0' on entry.
Alex Eldere28fff262012-02-02 08:13:30 -06003134 *
3135 * Returns the length of the token found (not including the '\0').
3136 * Return value will be 0 if no token is found, and it will be >=
3137 * token_size if the token would not fit.
3138 *
Alex Elder593a9e72012-02-07 12:03:37 -06003139 * The *buf pointer will be updated to point beyond the end of the
Alex Eldere28fff262012-02-02 08:13:30 -06003140 * found token. Note that this occurs even if the token buffer is
3141 * too small to hold it.
3142 */
3143static inline size_t copy_token(const char **buf,
3144 char *token,
3145 size_t token_size)
3146{
3147 size_t len;
3148
3149 len = next_token(buf);
3150 if (len < token_size) {
3151 memcpy(token, *buf, len);
3152 *(token + len) = '\0';
3153 }
3154 *buf += len;
3155
3156 return len;
3157}
3158
3159/*
Alex Elderea3352f2012-07-09 21:04:23 -05003160 * Finds the next token in *buf, dynamically allocates a buffer big
3161 * enough to hold a copy of it, and copies the token into the new
3162 * buffer. The copy is guaranteed to be terminated with '\0'. Note
3163 * that a duplicate buffer is created even for a zero-length token.
3164 *
3165 * Returns a pointer to the newly-allocated duplicate, or a null
3166 * pointer if memory for the duplicate was not available. If
3167 * the lenp argument is a non-null pointer, the length of the token
3168 * (not including the '\0') is returned in *lenp.
3169 *
3170 * If successful, the *buf pointer will be updated to point beyond
3171 * the end of the found token.
3172 *
3173 * Note: uses GFP_KERNEL for allocation.
3174 */
3175static inline char *dup_token(const char **buf, size_t *lenp)
3176{
3177 char *dup;
3178 size_t len;
3179
3180 len = next_token(buf);
Alex Elder4caf35f2012-11-01 08:39:27 -05003181 dup = kmemdup(*buf, len + 1, GFP_KERNEL);
Alex Elderea3352f2012-07-09 21:04:23 -05003182 if (!dup)
3183 return NULL;
Alex Elderea3352f2012-07-09 21:04:23 -05003184 *(dup + len) = '\0';
3185 *buf += len;
3186
3187 if (lenp)
3188 *lenp = len;
3189
3190 return dup;
3191}
3192
3193/*
Alex Elder859c31d2012-10-25 23:34:42 -05003194 * Parse the options provided for an "rbd add" (i.e., rbd image
3195 * mapping) request. These arrive via a write to /sys/bus/rbd/add,
3196 * and the data written is passed here via a NUL-terminated buffer.
3197 * Returns 0 if successful or an error code otherwise.
Alex Elderd22f76e2012-07-12 10:46:35 -05003198 *
Alex Elder859c31d2012-10-25 23:34:42 -05003199 * The information extracted from these options is recorded in
3200 * the other parameters which return dynamically-allocated
3201 * structures:
3202 * ceph_opts
3203 * The address of a pointer that will refer to a ceph options
3204 * structure. Caller must release the returned pointer using
3205 * ceph_destroy_options() when it is no longer needed.
3206 * rbd_opts
3207 * Address of an rbd options pointer. Fully initialized by
3208 * this function; caller must release with kfree().
3209 * spec
3210 * Address of an rbd image specification pointer. Fully
3211 * initialized by this function based on parsed options.
3212 * Caller must release with rbd_spec_put().
3213 *
3214 * The options passed take this form:
3215 * <mon_addrs> <options> <pool_name> <image_name> [<snap_id>]
3216 * where:
3217 * <mon_addrs>
3218 * A comma-separated list of one or more monitor addresses.
3219 * A monitor address is an ip address, optionally followed
3220 * by a port number (separated by a colon).
3221 * I.e.: ip1[:port1][,ip2[:port2]...]
3222 * <options>
3223 * A comma-separated list of ceph and/or rbd options.
3224 * <pool_name>
3225 * The name of the rados pool containing the rbd image.
3226 * <image_name>
3227 * The name of the image in that pool to map.
3228 * <snap_id>
3229 * An optional snapshot id. If provided, the mapping will
3230 * present data from the image at the time that snapshot was
3231 * created. The image head is used if no snapshot id is
3232 * provided. Snapshot mappings are always read-only.
Alex Eldera725f65e2012-02-02 08:13:30 -06003233 */
Alex Elder859c31d2012-10-25 23:34:42 -05003234static int rbd_add_parse_args(const char *buf,
Alex Elderdc79b112012-10-25 23:34:41 -05003235 struct ceph_options **ceph_opts,
Alex Elder859c31d2012-10-25 23:34:42 -05003236 struct rbd_options **opts,
3237 struct rbd_spec **rbd_spec)
Alex Eldera725f65e2012-02-02 08:13:30 -06003238{
Alex Elderd22f76e2012-07-12 10:46:35 -05003239 size_t len;
Alex Elder859c31d2012-10-25 23:34:42 -05003240 char *options;
Alex Elder0ddebc02012-10-25 23:34:41 -05003241 const char *mon_addrs;
3242 size_t mon_addrs_size;
Alex Elder859c31d2012-10-25 23:34:42 -05003243 struct rbd_spec *spec = NULL;
Alex Elder4e9afeb2012-10-25 23:34:41 -05003244 struct rbd_options *rbd_opts = NULL;
Alex Elder859c31d2012-10-25 23:34:42 -05003245 struct ceph_options *copts;
Alex Elderdc79b112012-10-25 23:34:41 -05003246 int ret;
Alex Eldere28fff262012-02-02 08:13:30 -06003247
3248 /* The first four tokens are required */
3249
Alex Elder7ef32142012-02-02 08:13:30 -06003250 len = next_token(&buf);
Alex Elder4fb5d6712012-11-01 10:17:15 -05003251 if (!len) {
3252 rbd_warn(NULL, "no monitor address(es) provided");
3253 return -EINVAL;
3254 }
Alex Elder0ddebc02012-10-25 23:34:41 -05003255 mon_addrs = buf;
Alex Elderf28e5652012-10-25 23:34:41 -05003256 mon_addrs_size = len + 1;
Alex Elder7ef32142012-02-02 08:13:30 -06003257 buf += len;
Alex Eldera725f65e2012-02-02 08:13:30 -06003258
Alex Elderdc79b112012-10-25 23:34:41 -05003259 ret = -EINVAL;
Alex Elderf28e5652012-10-25 23:34:41 -05003260 options = dup_token(&buf, NULL);
3261 if (!options)
Alex Elderdc79b112012-10-25 23:34:41 -05003262 return -ENOMEM;
Alex Elder4fb5d6712012-11-01 10:17:15 -05003263 if (!*options) {
3264 rbd_warn(NULL, "no options provided");
3265 goto out_err;
3266 }
Alex Eldera725f65e2012-02-02 08:13:30 -06003267
Alex Elder859c31d2012-10-25 23:34:42 -05003268 spec = rbd_spec_alloc();
3269 if (!spec)
Alex Elderf28e5652012-10-25 23:34:41 -05003270 goto out_mem;
Alex Elder859c31d2012-10-25 23:34:42 -05003271
3272 spec->pool_name = dup_token(&buf, NULL);
3273 if (!spec->pool_name)
3274 goto out_mem;
Alex Elder4fb5d6712012-11-01 10:17:15 -05003275 if (!*spec->pool_name) {
3276 rbd_warn(NULL, "no pool name provided");
3277 goto out_err;
3278 }
Alex Eldere28fff262012-02-02 08:13:30 -06003279
Alex Elder69e7a022012-11-01 08:39:26 -05003280 spec->image_name = dup_token(&buf, NULL);
Alex Elder859c31d2012-10-25 23:34:42 -05003281 if (!spec->image_name)
Alex Elderf28e5652012-10-25 23:34:41 -05003282 goto out_mem;
Alex Elder4fb5d6712012-11-01 10:17:15 -05003283 if (!*spec->image_name) {
3284 rbd_warn(NULL, "no image name provided");
3285 goto out_err;
3286 }
Alex Eldere28fff262012-02-02 08:13:30 -06003287
Alex Elderf28e5652012-10-25 23:34:41 -05003288 /*
3289 * Snapshot name is optional; default is to use "-"
3290 * (indicating the head/no snapshot).
3291 */
Alex Elder3feeb8942012-08-31 17:29:52 -05003292 len = next_token(&buf);
Alex Elder820a5f32012-07-09 21:04:24 -05003293 if (!len) {
Alex Elder3feeb8942012-08-31 17:29:52 -05003294 buf = RBD_SNAP_HEAD_NAME; /* No snapshot supplied */
3295 len = sizeof (RBD_SNAP_HEAD_NAME) - 1;
Alex Elderf28e5652012-10-25 23:34:41 -05003296 } else if (len > RBD_MAX_SNAP_NAME_LEN) {
Alex Elderdc79b112012-10-25 23:34:41 -05003297 ret = -ENAMETOOLONG;
Alex Elderf28e5652012-10-25 23:34:41 -05003298 goto out_err;
Alex Elder849b4262012-07-09 21:04:24 -05003299 }
Alex Elder4caf35f2012-11-01 08:39:27 -05003300 spec->snap_name = kmemdup(buf, len + 1, GFP_KERNEL);
Alex Elder859c31d2012-10-25 23:34:42 -05003301 if (!spec->snap_name)
Alex Elderf28e5652012-10-25 23:34:41 -05003302 goto out_mem;
Alex Elder859c31d2012-10-25 23:34:42 -05003303 *(spec->snap_name + len) = '\0';
Alex Eldere5c35532012-10-25 23:34:41 -05003304
Alex Elder0ddebc02012-10-25 23:34:41 -05003305 /* Initialize all rbd options to the defaults */
Alex Eldere28fff262012-02-02 08:13:30 -06003306
Alex Elder4e9afeb2012-10-25 23:34:41 -05003307 rbd_opts = kzalloc(sizeof (*rbd_opts), GFP_KERNEL);
3308 if (!rbd_opts)
3309 goto out_mem;
3310
3311 rbd_opts->read_only = RBD_READ_ONLY_DEFAULT;
Alex Elderd22f76e2012-07-12 10:46:35 -05003312
Alex Elder859c31d2012-10-25 23:34:42 -05003313 copts = ceph_parse_options(options, mon_addrs,
Alex Elder0ddebc02012-10-25 23:34:41 -05003314 mon_addrs + mon_addrs_size - 1,
Alex Elder4e9afeb2012-10-25 23:34:41 -05003315 parse_rbd_opts_token, rbd_opts);
Alex Elder859c31d2012-10-25 23:34:42 -05003316 if (IS_ERR(copts)) {
3317 ret = PTR_ERR(copts);
Alex Elderdc79b112012-10-25 23:34:41 -05003318 goto out_err;
3319 }
Alex Elder859c31d2012-10-25 23:34:42 -05003320 kfree(options);
3321
3322 *ceph_opts = copts;
Alex Elder4e9afeb2012-10-25 23:34:41 -05003323 *opts = rbd_opts;
Alex Elder859c31d2012-10-25 23:34:42 -05003324 *rbd_spec = spec;
Alex Elder0ddebc02012-10-25 23:34:41 -05003325
Alex Elderdc79b112012-10-25 23:34:41 -05003326 return 0;
Alex Elderf28e5652012-10-25 23:34:41 -05003327out_mem:
Alex Elderdc79b112012-10-25 23:34:41 -05003328 ret = -ENOMEM;
Alex Elderd22f76e2012-07-12 10:46:35 -05003329out_err:
Alex Elder859c31d2012-10-25 23:34:42 -05003330 kfree(rbd_opts);
3331 rbd_spec_put(spec);
Alex Elderf28e5652012-10-25 23:34:41 -05003332 kfree(options);
Alex Elderd22f76e2012-07-12 10:46:35 -05003333
Alex Elderdc79b112012-10-25 23:34:41 -05003334 return ret;
Alex Eldera725f65e2012-02-02 08:13:30 -06003335}
3336
Alex Elder589d30e2012-07-10 20:30:11 -05003337/*
3338 * An rbd format 2 image has a unique identifier, distinct from the
3339 * name given to it by the user. Internally, that identifier is
3340 * what's used to specify the names of objects related to the image.
3341 *
3342 * A special "rbd id" object is used to map an rbd image name to its
3343 * id. If that object doesn't exist, then there is no v2 rbd image
3344 * with the supplied name.
3345 *
3346 * This function will record the given rbd_dev's image_id field if
3347 * it can be determined, and in that case will return 0. If any
3348 * errors occur a negative errno will be returned and the rbd_dev's
3349 * image_id field will be unchanged (and should be NULL).
3350 */
3351static int rbd_dev_image_id(struct rbd_device *rbd_dev)
3352{
3353 int ret;
3354 size_t size;
3355 char *object_name;
3356 void *response;
3357 void *p;
3358
3359 /*
Alex Elder2c0d0a12012-10-30 19:40:33 -05003360 * When probing a parent image, the image id is already
3361 * known (and the image name likely is not). There's no
3362 * need to fetch the image id again in this case.
3363 */
3364 if (rbd_dev->spec->image_id)
3365 return 0;
3366
3367 /*
Alex Elder589d30e2012-07-10 20:30:11 -05003368 * First, see if the format 2 image id file exists, and if
3369 * so, get the image's persistent id from it.
3370 */
Alex Elder69e7a022012-11-01 08:39:26 -05003371 size = sizeof (RBD_ID_PREFIX) + strlen(rbd_dev->spec->image_name);
Alex Elder589d30e2012-07-10 20:30:11 -05003372 object_name = kmalloc(size, GFP_NOIO);
3373 if (!object_name)
3374 return -ENOMEM;
Alex Elder0d7dbfc2012-10-25 23:34:41 -05003375 sprintf(object_name, "%s%s", RBD_ID_PREFIX, rbd_dev->spec->image_name);
Alex Elder589d30e2012-07-10 20:30:11 -05003376 dout("rbd id object name is %s\n", object_name);
3377
3378 /* Response will be an encoded string, which includes a length */
3379
3380 size = sizeof (__le32) + RBD_IMAGE_ID_LEN_MAX;
3381 response = kzalloc(size, GFP_NOIO);
3382 if (!response) {
3383 ret = -ENOMEM;
3384 goto out;
3385 }
3386
3387 ret = rbd_req_sync_exec(rbd_dev, object_name,
3388 "rbd", "get_id",
3389 NULL, 0,
Alex Elder07b23912012-11-09 08:43:16 -06003390 response, RBD_IMAGE_ID_LEN_MAX, NULL);
Alex Elder589d30e2012-07-10 20:30:11 -05003391 dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
3392 if (ret < 0)
3393 goto out;
Alex Eldera0ea3a42012-10-10 21:19:13 -07003394 ret = 0; /* rbd_req_sync_exec() can return positive */
Alex Elder589d30e2012-07-10 20:30:11 -05003395
3396 p = response;
Alex Elder0d7dbfc2012-10-25 23:34:41 -05003397 rbd_dev->spec->image_id = ceph_extract_encoded_string(&p,
Alex Elder589d30e2012-07-10 20:30:11 -05003398 p + RBD_IMAGE_ID_LEN_MAX,
Alex Elder979ed482012-11-01 08:39:26 -05003399 NULL, GFP_NOIO);
Alex Elder0d7dbfc2012-10-25 23:34:41 -05003400 if (IS_ERR(rbd_dev->spec->image_id)) {
3401 ret = PTR_ERR(rbd_dev->spec->image_id);
3402 rbd_dev->spec->image_id = NULL;
Alex Elder589d30e2012-07-10 20:30:11 -05003403 } else {
Alex Elder0d7dbfc2012-10-25 23:34:41 -05003404 dout("image_id is %s\n", rbd_dev->spec->image_id);
Alex Elder589d30e2012-07-10 20:30:11 -05003405 }
3406out:
3407 kfree(response);
3408 kfree(object_name);
3409
3410 return ret;
3411}
3412
Alex Eldera30b71b2012-07-10 20:30:11 -05003413static int rbd_dev_v1_probe(struct rbd_device *rbd_dev)
3414{
3415 int ret;
3416 size_t size;
3417
3418 /* Version 1 images have no id; empty string is used */
3419
Alex Elder0d7dbfc2012-10-25 23:34:41 -05003420 rbd_dev->spec->image_id = kstrdup("", GFP_KERNEL);
3421 if (!rbd_dev->spec->image_id)
Alex Eldera30b71b2012-07-10 20:30:11 -05003422 return -ENOMEM;
Alex Eldera30b71b2012-07-10 20:30:11 -05003423
3424 /* Record the header object name for this rbd image. */
3425
Alex Elder69e7a022012-11-01 08:39:26 -05003426 size = strlen(rbd_dev->spec->image_name) + sizeof (RBD_SUFFIX);
Alex Eldera30b71b2012-07-10 20:30:11 -05003427 rbd_dev->header_name = kmalloc(size, GFP_KERNEL);
3428 if (!rbd_dev->header_name) {
3429 ret = -ENOMEM;
3430 goto out_err;
3431 }
Alex Elder0d7dbfc2012-10-25 23:34:41 -05003432 sprintf(rbd_dev->header_name, "%s%s",
3433 rbd_dev->spec->image_name, RBD_SUFFIX);
Alex Eldera30b71b2012-07-10 20:30:11 -05003434
3435 /* Populate rbd image metadata */
3436
3437 ret = rbd_read_header(rbd_dev, &rbd_dev->header);
3438 if (ret < 0)
3439 goto out_err;
Alex Elder86b00e02012-10-25 23:34:42 -05003440
3441 /* Version 1 images have no parent (no layering) */
3442
3443 rbd_dev->parent_spec = NULL;
3444 rbd_dev->parent_overlap = 0;
3445
Alex Eldera30b71b2012-07-10 20:30:11 -05003446 rbd_dev->image_format = 1;
3447
3448 dout("discovered version 1 image, header name is %s\n",
3449 rbd_dev->header_name);
3450
3451 return 0;
3452
3453out_err:
3454 kfree(rbd_dev->header_name);
3455 rbd_dev->header_name = NULL;
Alex Elder0d7dbfc2012-10-25 23:34:41 -05003456 kfree(rbd_dev->spec->image_id);
3457 rbd_dev->spec->image_id = NULL;
Alex Eldera30b71b2012-07-10 20:30:11 -05003458
3459 return ret;
3460}
3461
3462static int rbd_dev_v2_probe(struct rbd_device *rbd_dev)
3463{
3464 size_t size;
Alex Elder9d475de2012-07-03 16:01:19 -05003465 int ret;
Alex Elder6e14b1a2012-07-03 16:01:19 -05003466 u64 ver = 0;
Alex Eldera30b71b2012-07-10 20:30:11 -05003467
3468 /*
3469 * Image id was filled in by the caller. Record the header
3470 * object name for this rbd image.
3471 */
Alex Elder979ed482012-11-01 08:39:26 -05003472 size = sizeof (RBD_HEADER_PREFIX) + strlen(rbd_dev->spec->image_id);
Alex Eldera30b71b2012-07-10 20:30:11 -05003473 rbd_dev->header_name = kmalloc(size, GFP_KERNEL);
3474 if (!rbd_dev->header_name)
3475 return -ENOMEM;
3476 sprintf(rbd_dev->header_name, "%s%s",
Alex Elder0d7dbfc2012-10-25 23:34:41 -05003477 RBD_HEADER_PREFIX, rbd_dev->spec->image_id);
Alex Elder9d475de2012-07-03 16:01:19 -05003478
3479 /* Get the size and object order for the image */
3480
3481 ret = rbd_dev_v2_image_size(rbd_dev);
3482 if (ret < 0)
3483 goto out_err;
Alex Elder1e130192012-07-03 16:01:19 -05003484
3485 /* Get the object prefix (a.k.a. block_name) for the image */
3486
3487 ret = rbd_dev_v2_object_prefix(rbd_dev);
3488 if (ret < 0)
3489 goto out_err;
Alex Elderb1b54022012-07-03 16:01:19 -05003490
Alex Elderd8891402012-10-09 13:50:17 -07003491 /* Get the and check features for the image */
Alex Elderb1b54022012-07-03 16:01:19 -05003492
3493 ret = rbd_dev_v2_features(rbd_dev);
3494 if (ret < 0)
3495 goto out_err;
Alex Elder35d489f2012-07-03 16:01:19 -05003496
Alex Elder86b00e02012-10-25 23:34:42 -05003497 /* If the image supports layering, get the parent info */
3498
3499 if (rbd_dev->header.features & RBD_FEATURE_LAYERING) {
3500 ret = rbd_dev_v2_parent_info(rbd_dev);
3501 if (ret < 0)
3502 goto out_err;
3503 }
3504
Alex Elder6e14b1a2012-07-03 16:01:19 -05003505 /* crypto and compression type aren't (yet) supported for v2 images */
Alex Elder35d489f2012-07-03 16:01:19 -05003506
Alex Elder6e14b1a2012-07-03 16:01:19 -05003507 rbd_dev->header.crypt_type = 0;
3508 rbd_dev->header.comp_type = 0;
3509
3510 /* Get the snapshot context, plus the header version */
3511
3512 ret = rbd_dev_v2_snap_context(rbd_dev, &ver);
Alex Elder35d489f2012-07-03 16:01:19 -05003513 if (ret)
3514 goto out_err;
Alex Elder6e14b1a2012-07-03 16:01:19 -05003515 rbd_dev->header.obj_version = ver;
3516
Alex Eldera30b71b2012-07-10 20:30:11 -05003517 rbd_dev->image_format = 2;
3518
3519 dout("discovered version 2 image, header name is %s\n",
3520 rbd_dev->header_name);
3521
Alex Elder35152972012-08-31 17:29:55 -05003522 return 0;
Alex Elder9d475de2012-07-03 16:01:19 -05003523out_err:
Alex Elder86b00e02012-10-25 23:34:42 -05003524 rbd_dev->parent_overlap = 0;
3525 rbd_spec_put(rbd_dev->parent_spec);
3526 rbd_dev->parent_spec = NULL;
Alex Elder9d475de2012-07-03 16:01:19 -05003527 kfree(rbd_dev->header_name);
3528 rbd_dev->header_name = NULL;
Alex Elder1e130192012-07-03 16:01:19 -05003529 kfree(rbd_dev->header.object_prefix);
3530 rbd_dev->header.object_prefix = NULL;
Alex Elder9d475de2012-07-03 16:01:19 -05003531
3532 return ret;
Alex Eldera30b71b2012-07-10 20:30:11 -05003533}
3534
Alex Elder83a06262012-10-30 15:47:17 -05003535static int rbd_dev_probe_finish(struct rbd_device *rbd_dev)
3536{
3537 int ret;
3538
3539 /* no need to lock here, as rbd_dev is not registered yet */
3540 ret = rbd_dev_snaps_update(rbd_dev);
3541 if (ret)
3542 return ret;
3543
Alex Elder9e15b772012-10-30 19:40:33 -05003544 ret = rbd_dev_probe_update_spec(rbd_dev);
3545 if (ret)
3546 goto err_out_snaps;
3547
Alex Elder83a06262012-10-30 15:47:17 -05003548 ret = rbd_dev_set_mapping(rbd_dev);
3549 if (ret)
3550 goto err_out_snaps;
3551
3552 /* generate unique id: find highest unique id, add one */
3553 rbd_dev_id_get(rbd_dev);
3554
3555 /* Fill in the device name, now that we have its id. */
3556 BUILD_BUG_ON(DEV_NAME_LEN
3557 < sizeof (RBD_DRV_NAME) + MAX_INT_FORMAT_WIDTH);
3558 sprintf(rbd_dev->name, "%s%d", RBD_DRV_NAME, rbd_dev->dev_id);
3559
3560 /* Get our block major device number. */
3561
3562 ret = register_blkdev(0, rbd_dev->name);
3563 if (ret < 0)
3564 goto err_out_id;
3565 rbd_dev->major = ret;
3566
3567 /* Set up the blkdev mapping. */
3568
3569 ret = rbd_init_disk(rbd_dev);
3570 if (ret)
3571 goto err_out_blkdev;
3572
3573 ret = rbd_bus_add_dev(rbd_dev);
3574 if (ret)
3575 goto err_out_disk;
3576
3577 /*
3578 * At this point cleanup in the event of an error is the job
3579 * of the sysfs code (initiated by rbd_bus_del_dev()).
3580 */
3581 down_write(&rbd_dev->header_rwsem);
3582 ret = rbd_dev_snaps_register(rbd_dev);
3583 up_write(&rbd_dev->header_rwsem);
3584 if (ret)
3585 goto err_out_bus;
3586
3587 ret = rbd_init_watch_dev(rbd_dev);
3588 if (ret)
3589 goto err_out_bus;
3590
3591 /* Everything's ready. Announce the disk to the world. */
3592
3593 add_disk(rbd_dev->disk);
3594
3595 pr_info("%s: added with size 0x%llx\n", rbd_dev->disk->disk_name,
3596 (unsigned long long) rbd_dev->mapping.size);
3597
3598 return ret;
3599err_out_bus:
3600 /* this will also clean up rest of rbd_dev stuff */
3601
3602 rbd_bus_del_dev(rbd_dev);
3603
3604 return ret;
3605err_out_disk:
3606 rbd_free_disk(rbd_dev);
3607err_out_blkdev:
3608 unregister_blkdev(rbd_dev->major, rbd_dev->name);
3609err_out_id:
3610 rbd_dev_id_put(rbd_dev);
3611err_out_snaps:
3612 rbd_remove_all_snaps(rbd_dev);
3613
3614 return ret;
3615}
3616
Alex Eldera30b71b2012-07-10 20:30:11 -05003617/*
3618 * Probe for the existence of the header object for the given rbd
3619 * device. For format 2 images this includes determining the image
3620 * id.
3621 */
3622static int rbd_dev_probe(struct rbd_device *rbd_dev)
3623{
3624 int ret;
3625
3626 /*
3627 * Get the id from the image id object. If it's not a
3628 * format 2 image, we'll get ENOENT back, and we'll assume
3629 * it's a format 1 image.
3630 */
3631 ret = rbd_dev_image_id(rbd_dev);
3632 if (ret)
3633 ret = rbd_dev_v1_probe(rbd_dev);
3634 else
3635 ret = rbd_dev_v2_probe(rbd_dev);
Alex Elder83a06262012-10-30 15:47:17 -05003636 if (ret) {
Alex Eldera30b71b2012-07-10 20:30:11 -05003637 dout("probe failed, returning %d\n", ret);
3638
Alex Elder83a06262012-10-30 15:47:17 -05003639 return ret;
3640 }
3641
3642 ret = rbd_dev_probe_finish(rbd_dev);
3643 if (ret)
3644 rbd_header_free(&rbd_dev->header);
3645
Alex Eldera30b71b2012-07-10 20:30:11 -05003646 return ret;
3647}
3648
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07003649static ssize_t rbd_add(struct bus_type *bus,
3650 const char *buf,
3651 size_t count)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003652{
Alex Eldercb8627c2012-07-09 21:04:23 -05003653 struct rbd_device *rbd_dev = NULL;
Alex Elderdc79b112012-10-25 23:34:41 -05003654 struct ceph_options *ceph_opts = NULL;
Alex Elder4e9afeb2012-10-25 23:34:41 -05003655 struct rbd_options *rbd_opts = NULL;
Alex Elder859c31d2012-10-25 23:34:42 -05003656 struct rbd_spec *spec = NULL;
Alex Elder9d3997f2012-10-25 23:34:42 -05003657 struct rbd_client *rbdc;
Alex Elder27cc2592012-02-02 08:13:30 -06003658 struct ceph_osd_client *osdc;
3659 int rc = -ENOMEM;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003660
3661 if (!try_module_get(THIS_MODULE))
3662 return -ENODEV;
3663
Alex Eldera725f65e2012-02-02 08:13:30 -06003664 /* parse add command */
Alex Elder859c31d2012-10-25 23:34:42 -05003665 rc = rbd_add_parse_args(buf, &ceph_opts, &rbd_opts, &spec);
Alex Elderdc79b112012-10-25 23:34:41 -05003666 if (rc < 0)
Alex Elderbd4ba652012-10-25 23:34:42 -05003667 goto err_out_module;
Alex Eldera725f65e2012-02-02 08:13:30 -06003668
Alex Elder9d3997f2012-10-25 23:34:42 -05003669 rbdc = rbd_get_client(ceph_opts);
3670 if (IS_ERR(rbdc)) {
3671 rc = PTR_ERR(rbdc);
Alex Elder0ddebc02012-10-25 23:34:41 -05003672 goto err_out_args;
Alex Elder9d3997f2012-10-25 23:34:42 -05003673 }
Alex Elderc53d5892012-10-25 23:34:42 -05003674 ceph_opts = NULL; /* rbd_dev client now owns this */
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003675
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003676 /* pick the pool */
Alex Elder9d3997f2012-10-25 23:34:42 -05003677 osdc = &rbdc->client->osdc;
Alex Elder859c31d2012-10-25 23:34:42 -05003678 rc = ceph_pg_poolid_by_name(osdc->osdmap, spec->pool_name);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003679 if (rc < 0)
3680 goto err_out_client;
Alex Elder859c31d2012-10-25 23:34:42 -05003681 spec->pool_id = (u64) rc;
3682
Alex Elder0903e872012-11-14 12:25:19 -06003683 /* The ceph file layout needs to fit pool id in 32 bits */
3684
3685 if (WARN_ON(spec->pool_id > (u64) U32_MAX)) {
3686 rc = -EIO;
3687 goto err_out_client;
3688 }
3689
Alex Elderc53d5892012-10-25 23:34:42 -05003690 rbd_dev = rbd_dev_create(rbdc, spec);
Alex Elderbd4ba652012-10-25 23:34:42 -05003691 if (!rbd_dev)
3692 goto err_out_client;
Alex Elderc53d5892012-10-25 23:34:42 -05003693 rbdc = NULL; /* rbd_dev now owns this */
3694 spec = NULL; /* rbd_dev now owns this */
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003695
Alex Elderbd4ba652012-10-25 23:34:42 -05003696 rbd_dev->mapping.read_only = rbd_opts->read_only;
Alex Elderc53d5892012-10-25 23:34:42 -05003697 kfree(rbd_opts);
3698 rbd_opts = NULL; /* done with this */
Alex Elderbd4ba652012-10-25 23:34:42 -05003699
Alex Eldera30b71b2012-07-10 20:30:11 -05003700 rc = rbd_dev_probe(rbd_dev);
3701 if (rc < 0)
Alex Elderc53d5892012-10-25 23:34:42 -05003702 goto err_out_rbd_dev;
Alex Elder05fd6f62012-08-29 17:11:07 -05003703
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003704 return count;
Alex Elderc53d5892012-10-25 23:34:42 -05003705err_out_rbd_dev:
3706 rbd_dev_destroy(rbd_dev);
Alex Elderbd4ba652012-10-25 23:34:42 -05003707err_out_client:
Alex Elder9d3997f2012-10-25 23:34:42 -05003708 rbd_put_client(rbdc);
Alex Elder0ddebc02012-10-25 23:34:41 -05003709err_out_args:
Alex Elder78cea762012-10-25 23:34:41 -05003710 if (ceph_opts)
3711 ceph_destroy_options(ceph_opts);
Alex Elder4e9afeb2012-10-25 23:34:41 -05003712 kfree(rbd_opts);
Alex Elder859c31d2012-10-25 23:34:42 -05003713 rbd_spec_put(spec);
Alex Elderbd4ba652012-10-25 23:34:42 -05003714err_out_module:
3715 module_put(THIS_MODULE);
Alex Elder27cc2592012-02-02 08:13:30 -06003716
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003717 dout("Error adding device %s\n", buf);
Alex Elder27cc2592012-02-02 08:13:30 -06003718
3719 return (ssize_t) rc;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003720}
3721
Alex Elderde71a292012-07-03 16:01:19 -05003722static struct rbd_device *__rbd_get_dev(unsigned long dev_id)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003723{
3724 struct list_head *tmp;
3725 struct rbd_device *rbd_dev;
3726
Alex Eldere124a82f2012-01-29 13:57:44 -06003727 spin_lock(&rbd_dev_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003728 list_for_each(tmp, &rbd_dev_list) {
3729 rbd_dev = list_entry(tmp, struct rbd_device, node);
Alex Elderde71a292012-07-03 16:01:19 -05003730 if (rbd_dev->dev_id == dev_id) {
Alex Eldere124a82f2012-01-29 13:57:44 -06003731 spin_unlock(&rbd_dev_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003732 return rbd_dev;
Alex Eldere124a82f2012-01-29 13:57:44 -06003733 }
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003734 }
Alex Eldere124a82f2012-01-29 13:57:44 -06003735 spin_unlock(&rbd_dev_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003736 return NULL;
3737}
3738
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003739static void rbd_dev_release(struct device *dev)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003740{
Alex Elder593a9e72012-02-07 12:03:37 -06003741 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003742
Alex Elder1dbb4392012-01-24 10:08:37 -06003743 if (rbd_dev->watch_request) {
3744 struct ceph_client *client = rbd_dev->rbd_client->client;
3745
3746 ceph_osdc_unregister_linger_request(&client->osdc,
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07003747 rbd_dev->watch_request);
Alex Elder1dbb4392012-01-24 10:08:37 -06003748 }
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07003749 if (rbd_dev->watch_event)
Alex Elder907703d2012-11-13 21:11:15 -06003750 rbd_req_sync_watch(rbd_dev, 0);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003751
3752 /* clean up and free blkdev */
3753 rbd_free_disk(rbd_dev);
3754 unregister_blkdev(rbd_dev->major, rbd_dev->name);
Alex Elder32eec682012-02-08 16:11:14 -06003755
Alex Elder2ac4e752012-07-10 20:30:10 -05003756 /* release allocated disk header fields */
3757 rbd_header_free(&rbd_dev->header);
3758
Alex Elder32eec682012-02-08 16:11:14 -06003759 /* done with the id, and with the rbd_dev */
Alex Eldere2839302012-08-29 17:11:06 -05003760 rbd_dev_id_put(rbd_dev);
Alex Elderc53d5892012-10-25 23:34:42 -05003761 rbd_assert(rbd_dev->rbd_client != NULL);
3762 rbd_dev_destroy(rbd_dev);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003763
3764 /* release module ref */
3765 module_put(THIS_MODULE);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003766}
3767
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003768static ssize_t rbd_remove(struct bus_type *bus,
3769 const char *buf,
3770 size_t count)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003771{
3772 struct rbd_device *rbd_dev = NULL;
3773 int target_id, rc;
3774 unsigned long ul;
3775 int ret = count;
3776
3777 rc = strict_strtoul(buf, 10, &ul);
3778 if (rc)
3779 return rc;
3780
3781 /* convert to int; abort if we lost anything in the conversion */
3782 target_id = (int) ul;
3783 if (target_id != ul)
3784 return -EINVAL;
3785
3786 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
3787
3788 rbd_dev = __rbd_get_dev(target_id);
3789 if (!rbd_dev) {
3790 ret = -ENOENT;
3791 goto done;
3792 }
3793
Alex Elder42382b72012-11-16 09:29:16 -06003794 if (rbd_dev->open_count) {
3795 ret = -EBUSY;
3796 goto done;
3797 }
3798
Alex Elder41f38c22012-10-25 23:34:40 -05003799 rbd_remove_all_snaps(rbd_dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003800 rbd_bus_del_dev(rbd_dev);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003801
3802done:
3803 mutex_unlock(&ctl_mutex);
Alex Elderaafb2302012-09-06 16:00:54 -05003804
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003805 return ret;
3806}
3807
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003808/*
3809 * create control files in sysfs
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003810 * /sys/bus/rbd/...
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003811 */
3812static int rbd_sysfs_init(void)
3813{
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003814 int ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003815
Alex Elderfed4c142012-02-07 12:03:36 -06003816 ret = device_register(&rbd_root_dev);
Alex Elder21079782012-01-24 10:08:36 -06003817 if (ret < 0)
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003818 return ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003819
Alex Elderfed4c142012-02-07 12:03:36 -06003820 ret = bus_register(&rbd_bus_type);
3821 if (ret < 0)
3822 device_unregister(&rbd_root_dev);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003823
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003824 return ret;
3825}
3826
3827static void rbd_sysfs_cleanup(void)
3828{
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003829 bus_unregister(&rbd_bus_type);
Alex Elderfed4c142012-02-07 12:03:36 -06003830 device_unregister(&rbd_root_dev);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003831}
3832
3833int __init rbd_init(void)
3834{
3835 int rc;
3836
3837 rc = rbd_sysfs_init();
3838 if (rc)
3839 return rc;
Alex Elderf0f8cef2012-01-29 13:57:44 -06003840 pr_info("loaded " RBD_DRV_NAME_LONG "\n");
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003841 return 0;
3842}
3843
3844void __exit rbd_exit(void)
3845{
3846 rbd_sysfs_cleanup();
3847}
3848
/* Register rbd_init() and rbd_exit() as the module's entry and exit points. */
3849module_init(rbd_init);
3850module_exit(rbd_exit);
3851
/* Module metadata: authors and description. */
3852MODULE_AUTHOR("Sage Weil <sage@newdream.net>");
3853MODULE_AUTHOR("Yehuda Sadeh <yehuda@hq.newdream.net>");
3854MODULE_DESCRIPTION("rados block device");
3855
3856/* following authorship retained from original osdblk.c */
3857MODULE_AUTHOR("Jeff Garzik <jeff@garzik.org>");
3858
3859MODULE_LICENSE("GPL");