blob: 64a4dd5f6f2b5d2712dc8c82553032717d2e7ea4 [file] [log] [blame]
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001/*
2 rbd.c -- Export ceph rados objects as a Linux block device
3
4
5 based on drivers/block/osdblk.c:
6
7 Copyright 2009 Red Hat, Inc.
8
9 This program is free software; you can redistribute it and/or modify
10 it under the terms of the GNU General Public License as published by
11 the Free Software Foundation.
12
13 This program is distributed in the hope that it will be useful,
14 but WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 GNU General Public License for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with this program; see the file COPYING. If not, write to
20 the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
21
22
23
Yehuda Sadehdfc56062010-11-19 14:51:04 -080024 For usage instructions, please refer to:
Yehuda Sadeh602adf42010-08-12 16:11:25 -070025
Yehuda Sadehdfc56062010-11-19 14:51:04 -080026 Documentation/ABI/testing/sysfs-bus-rbd
Yehuda Sadeh602adf42010-08-12 16:11:25 -070027
28 */
29
30#include <linux/ceph/libceph.h>
31#include <linux/ceph/osd_client.h>
32#include <linux/ceph/mon_client.h>
33#include <linux/ceph/decode.h>
Yehuda Sadeh59c2be12011-03-21 15:10:11 -070034#include <linux/parser.h>
Yehuda Sadeh602adf42010-08-12 16:11:25 -070035
36#include <linux/kernel.h>
37#include <linux/device.h>
38#include <linux/module.h>
39#include <linux/fs.h>
40#include <linux/blkdev.h>
41
42#include "rbd_types.h"
43
#define RBD_DEBUG	/* Activate rbd_assert() calls */

/*
 * Block I/O is expressed in sectors.  The term shows up in several
 * Linux layers (blk, bio, genhd), but by default a sector is 512
 * bytes everywhere.  These names are just a little more readable
 * than the bare numbers.
 */
#define SECTOR_SHIFT		9
#define SECTOR_SIZE		(1ULL << SECTOR_SHIFT)

/* Largest u64 value; it might be useful to have this defined elsewhere */
#define U64_MAX			((u64) (~0ULL))

#define RBD_DRV_NAME		"rbd"
#define RBD_DRV_NAME_LONG	"rbd (rados block device)"

#define RBD_MINORS_PER_MAJOR	256	/* max minors per blkdev */

#define RBD_MAX_SNAP_NAME_LEN	32
#define RBD_MAX_OPT_LEN		1024

/* Name used to select the image head rather than a snapshot */
#define RBD_SNAP_HEAD_NAME	"-"

#define RBD_IMAGE_ID_LEN_MAX	64
#define RBD_OBJ_PREFIX_LEN_MAX	64

/*
 * An RBD device is named "rbd#": RBD_DRV_NAME followed by a unique
 * integer id.  MAX_INT_FORMAT_WIDTH bounds the number of characters
 * needed to print an int, and is meant for checking that
 * DEV_NAME_LEN can hold every possible device name.
 */
#define DEV_NAME_LEN		32
#define MAX_INT_FORMAT_WIDTH	((5 * sizeof (int)) / 2 + 1)

#define RBD_READ_ONLY_DEFAULT	false
Yehuda Sadeh59c2be12011-03-21 15:10:11 -070082
Yehuda Sadeh602adf42010-08-12 16:11:25 -070083/*
84 * block device image metadata (in-memory version)
85 */
86struct rbd_image_header {
Alex Elderf84344f2012-08-31 17:29:51 -050087 /* These four fields never change for a given rbd image */
Alex Elder849b4262012-07-09 21:04:24 -050088 char *object_prefix;
Alex Elder34b13182012-07-13 20:35:12 -050089 u64 features;
Yehuda Sadeh602adf42010-08-12 16:11:25 -070090 __u8 obj_order;
91 __u8 crypt_type;
92 __u8 comp_type;
Yehuda Sadeh602adf42010-08-12 16:11:25 -070093
Alex Elderf84344f2012-08-31 17:29:51 -050094 /* The remaining fields need to be updated occasionally */
95 u64 image_size;
96 struct ceph_snap_context *snapc;
Yehuda Sadeh602adf42010-08-12 16:11:25 -070097 char *snap_names;
98 u64 *snap_sizes;
Yehuda Sadeh59c2be12011-03-21 15:10:11 -070099
100 u64 obj_version;
101};
102
103struct rbd_options {
Alex Eldercc0538b2012-08-10 13:12:07 -0700104 bool read_only;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700105};
106
107/*
Alex Elderf0f8cef2012-01-29 13:57:44 -0600108 * an instance of the client. multiple devices may share an rbd client.
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700109 */
110struct rbd_client {
111 struct ceph_client *client;
112 struct kref kref;
113 struct list_head node;
114};
115
116/*
Alex Elderf0f8cef2012-01-29 13:57:44 -0600117 * a request completion status
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700118 */
Yehuda Sadeh1fec7092011-05-13 13:52:56 -0700119struct rbd_req_status {
120 int done;
121 int rc;
122 u64 bytes;
123};
124
125/*
126 * a collection of requests
127 */
128struct rbd_req_coll {
129 int total;
130 int num_done;
131 struct kref kref;
132 struct rbd_req_status status[0];
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700133};
134
Alex Elderf0f8cef2012-01-29 13:57:44 -0600135/*
136 * a single io request
137 */
138struct rbd_request {
139 struct request *rq; /* blk layer request */
140 struct bio *bio; /* cloned bio */
141 struct page **pages; /* list of used pages */
142 u64 len;
143 int coll_index;
144 struct rbd_req_coll *coll;
145};
146
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800147struct rbd_snap {
148 struct device dev;
149 const char *name;
Josh Durgin3591538f2011-12-05 18:25:13 -0800150 u64 size;
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800151 struct list_head node;
152 u64 id;
Alex Elder34b13182012-07-13 20:35:12 -0500153 u64 features;
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800154};
155
Alex Elderf84344f2012-08-31 17:29:51 -0500156struct rbd_mapping {
157 char *snap_name;
158 u64 snap_id;
Alex Elder99c1f082012-08-30 14:42:15 -0500159 u64 size;
Alex Elder34b13182012-07-13 20:35:12 -0500160 u64 features;
Alex Elderf84344f2012-08-31 17:29:51 -0500161 bool snap_exists;
162 bool read_only;
163};
164
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700165/*
166 * a single device
167 */
168struct rbd_device {
Alex Elderde71a292012-07-03 16:01:19 -0500169 int dev_id; /* blkdev unique id */
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700170
171 int major; /* blkdev assigned major */
172 struct gendisk *disk; /* blkdev's gendisk and rq */
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700173
Alex Eldera30b71b2012-07-10 20:30:11 -0500174 u32 image_format; /* Either 1 or 2 */
Alex Elderf8c38922012-08-10 13:12:07 -0700175 struct rbd_options rbd_opts;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700176 struct rbd_client *rbd_client;
177
178 char name[DEV_NAME_LEN]; /* blkdev name, e.g. rbd3 */
179
180 spinlock_t lock; /* queue lock */
181
182 struct rbd_image_header header;
Alex Elder589d30e2012-07-10 20:30:11 -0500183 char *image_id;
184 size_t image_id_len;
Alex Elder0bed54d2012-07-03 16:01:18 -0500185 char *image_name;
186 size_t image_name_len;
187 char *header_name;
Alex Elderd22f76e2012-07-12 10:46:35 -0500188 char *pool_name;
Alex Elder9bb2f332012-07-12 10:46:35 -0500189 int pool_id;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700190
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700191 struct ceph_osd_event *watch_event;
192 struct ceph_osd_request *watch_request;
193
Josh Durginc6666012011-11-21 17:11:12 -0800194 /* protects updating the header */
195 struct rw_semaphore header_rwsem;
Alex Elderf84344f2012-08-31 17:29:51 -0500196
197 struct rbd_mapping mapping;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700198
199 struct list_head node;
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800200
201 /* list of snapshots */
202 struct list_head snaps;
203
204 /* sysfs related */
205 struct device dev;
206};
207
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700208static DEFINE_MUTEX(ctl_mutex); /* Serialize open/close/setup/teardown */
Alex Eldere124a82f2012-01-29 13:57:44 -0600209
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700210static LIST_HEAD(rbd_dev_list); /* devices */
Alex Eldere124a82f2012-01-29 13:57:44 -0600211static DEFINE_SPINLOCK(rbd_dev_list_lock);
212
Alex Elder432b8582012-01-29 13:57:44 -0600213static LIST_HEAD(rbd_client_list); /* clients */
214static DEFINE_SPINLOCK(rbd_client_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700215
Alex Elder304f6802012-08-31 17:29:52 -0500216static int rbd_dev_snaps_update(struct rbd_device *rbd_dev);
217static int rbd_dev_snaps_register(struct rbd_device *rbd_dev);
218
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800219static void rbd_dev_release(struct device *dev);
Alex Elder14e70852012-07-19 09:09:27 -0500220static void __rbd_remove_snap_dev(struct rbd_snap *snap);
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800221
Alex Elderf0f8cef2012-01-29 13:57:44 -0600222static ssize_t rbd_add(struct bus_type *bus, const char *buf,
223 size_t count);
224static ssize_t rbd_remove(struct bus_type *bus, const char *buf,
225 size_t count);
226
227static struct bus_attribute rbd_bus_attrs[] = {
228 __ATTR(add, S_IWUSR, NULL, rbd_add),
229 __ATTR(remove, S_IWUSR, NULL, rbd_remove),
230 __ATTR_NULL
231};
232
233static struct bus_type rbd_bus_type = {
234 .name = "rbd",
235 .bus_attrs = rbd_bus_attrs,
236};
237
/*
 * Release callback for rbd_root_dev.  That device is statically
 * allocated, so there is nothing to free here.
 */
static void rbd_root_dev_release(struct device *dev)
{
	/* Intentionally empty */
}
241
242static struct device rbd_root_dev = {
243 .init_name = "rbd",
244 .release = rbd_root_dev_release,
245};
246
/*
 * rbd_assert(expr) - when RBD_DEBUG is defined, log the failing
 * expression together with its function and line, then BUG().  When
 * RBD_DEBUG is not defined the expression is not evaluated at all.
 *
 * The debug variant is wrapped in do { } while (0) so that it behaves
 * as a single statement: "if (x) rbd_assert(y); else ..." then parses
 * the way it reads instead of failing to compile or binding the else
 * to the macro's internal if.
 */
#ifdef RBD_DEBUG
#define rbd_assert(expr)						\
	do {								\
		if (unlikely(!(expr))) {				\
			printk(KERN_ERR "\nAssertion failure in %s() "	\
						"at line %d:\n\n"	\
					"\trbd_assert(%s);\n\n",	\
					__func__, __LINE__, #expr);	\
			BUG();						\
		}							\
	} while (0)
#else /* !RBD_DEBUG */
#  define rbd_assert(expr)	((void) 0)
#endif /* !RBD_DEBUG */
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800259
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800260static struct device *rbd_get_dev(struct rbd_device *rbd_dev)
261{
262 return get_device(&rbd_dev->dev);
263}
264
265static void rbd_put_dev(struct rbd_device *rbd_dev)
266{
267 put_device(&rbd_dev->dev);
268}
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700269
Alex Elder1fe5e992012-07-25 09:32:41 -0500270static int rbd_refresh_header(struct rbd_device *rbd_dev, u64 *hver);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700271
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700272static int rbd_open(struct block_device *bdev, fmode_t mode)
273{
Alex Elderf0f8cef2012-01-29 13:57:44 -0600274 struct rbd_device *rbd_dev = bdev->bd_disk->private_data;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700275
Alex Elderf84344f2012-08-31 17:29:51 -0500276 if ((mode & FMODE_WRITE) && rbd_dev->mapping.read_only)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700277 return -EROFS;
278
Alex Elder340c7a22012-08-10 13:12:07 -0700279 rbd_get_dev(rbd_dev);
Alex Elderf84344f2012-08-31 17:29:51 -0500280 set_device_ro(bdev, rbd_dev->mapping.read_only);
Alex Elder340c7a22012-08-10 13:12:07 -0700281
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700282 return 0;
283}
284
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800285static int rbd_release(struct gendisk *disk, fmode_t mode)
286{
287 struct rbd_device *rbd_dev = disk->private_data;
288
289 rbd_put_dev(rbd_dev);
290
291 return 0;
292}
293
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700294static const struct block_device_operations rbd_bd_ops = {
295 .owner = THIS_MODULE,
296 .open = rbd_open,
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800297 .release = rbd_release,
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700298};
299
300/*
301 * Initialize an rbd client instance.
Alex Elder43ae4702012-07-03 16:01:18 -0500302 * We own *ceph_opts.
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700303 */
Alex Elderf8c38922012-08-10 13:12:07 -0700304static struct rbd_client *rbd_client_create(struct ceph_options *ceph_opts)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700305{
306 struct rbd_client *rbdc;
307 int ret = -ENOMEM;
308
309 dout("rbd_client_create\n");
310 rbdc = kmalloc(sizeof(struct rbd_client), GFP_KERNEL);
311 if (!rbdc)
312 goto out_opt;
313
314 kref_init(&rbdc->kref);
315 INIT_LIST_HEAD(&rbdc->node);
316
Alex Elderbc534d82012-01-29 13:57:44 -0600317 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
318
Alex Elder43ae4702012-07-03 16:01:18 -0500319 rbdc->client = ceph_create_client(ceph_opts, rbdc, 0, 0);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700320 if (IS_ERR(rbdc->client))
Alex Elderbc534d82012-01-29 13:57:44 -0600321 goto out_mutex;
Alex Elder43ae4702012-07-03 16:01:18 -0500322 ceph_opts = NULL; /* Now rbdc->client is responsible for ceph_opts */
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700323
324 ret = ceph_open_session(rbdc->client);
325 if (ret < 0)
326 goto out_err;
327
Alex Elder432b8582012-01-29 13:57:44 -0600328 spin_lock(&rbd_client_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700329 list_add_tail(&rbdc->node, &rbd_client_list);
Alex Elder432b8582012-01-29 13:57:44 -0600330 spin_unlock(&rbd_client_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700331
Alex Elderbc534d82012-01-29 13:57:44 -0600332 mutex_unlock(&ctl_mutex);
333
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700334 dout("rbd_client_create created %p\n", rbdc);
335 return rbdc;
336
337out_err:
338 ceph_destroy_client(rbdc->client);
Alex Elderbc534d82012-01-29 13:57:44 -0600339out_mutex:
340 mutex_unlock(&ctl_mutex);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700341 kfree(rbdc);
342out_opt:
Alex Elder43ae4702012-07-03 16:01:18 -0500343 if (ceph_opts)
344 ceph_destroy_options(ceph_opts);
Vasiliy Kulikov28f259b2010-09-26 12:59:37 +0400345 return ERR_PTR(ret);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700346}
347
348/*
Alex Elder1f7ba332012-08-10 13:12:07 -0700349 * Find a ceph client with specific addr and configuration. If
350 * found, bump its reference count.
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700351 */
Alex Elder1f7ba332012-08-10 13:12:07 -0700352static struct rbd_client *rbd_client_find(struct ceph_options *ceph_opts)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700353{
354 struct rbd_client *client_node;
Alex Elder1f7ba332012-08-10 13:12:07 -0700355 bool found = false;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700356
Alex Elder43ae4702012-07-03 16:01:18 -0500357 if (ceph_opts->flags & CEPH_OPT_NOSHARE)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700358 return NULL;
359
Alex Elder1f7ba332012-08-10 13:12:07 -0700360 spin_lock(&rbd_client_list_lock);
361 list_for_each_entry(client_node, &rbd_client_list, node) {
362 if (!ceph_compare_options(ceph_opts, client_node->client)) {
363 kref_get(&client_node->kref);
364 found = true;
365 break;
366 }
367 }
368 spin_unlock(&rbd_client_list_lock);
369
370 return found ? client_node : NULL;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700371}
372
/*
 * Mount option tokens.
 *
 * The Opt_last_* markers split the token space by argument type;
 * parse_rbd_opts_token() classifies a token by comparing it against
 * them, so new tokens must be added in the matching section.
 */
enum {
	/* tokens taking an int argument go here */
	Opt_last_int,
	/* tokens taking a string argument go here */
	Opt_last_string,
	/* Boolean tokens (no argument) */
	Opt_read_only,
	Opt_read_write,
	Opt_last_bool,
};
386
Alex Elder43ae4702012-07-03 16:01:18 -0500387static match_table_t rbd_opts_tokens = {
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700388 /* int args above */
389 /* string args above */
Alex Elderf84344f2012-08-31 17:29:51 -0500390 {Opt_read_only, "mapping.read_only"},
Alex Eldercc0538b2012-08-10 13:12:07 -0700391 {Opt_read_only, "ro"}, /* Alternate spelling */
392 {Opt_read_write, "read_write"},
393 {Opt_read_write, "rw"}, /* Alternate spelling */
394 /* Boolean args above */
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700395 {-1, NULL}
396};
397
398static int parse_rbd_opts_token(char *c, void *private)
399{
Alex Elder43ae4702012-07-03 16:01:18 -0500400 struct rbd_options *rbd_opts = private;
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700401 substring_t argstr[MAX_OPT_ARGS];
402 int token, intval, ret;
403
Alex Elder43ae4702012-07-03 16:01:18 -0500404 token = match_token(c, rbd_opts_tokens, argstr);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700405 if (token < 0)
406 return -EINVAL;
407
408 if (token < Opt_last_int) {
409 ret = match_int(&argstr[0], &intval);
410 if (ret < 0) {
411 pr_err("bad mount option arg (not int) "
412 "at '%s'\n", c);
413 return ret;
414 }
415 dout("got int token %d val %d\n", token, intval);
416 } else if (token > Opt_last_int && token < Opt_last_string) {
417 dout("got string token %d val %s\n", token,
418 argstr[0].from);
Alex Eldercc0538b2012-08-10 13:12:07 -0700419 } else if (token > Opt_last_string && token < Opt_last_bool) {
420 dout("got Boolean token %d\n", token);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700421 } else {
422 dout("got token %d\n", token);
423 }
424
425 switch (token) {
Alex Eldercc0538b2012-08-10 13:12:07 -0700426 case Opt_read_only:
427 rbd_opts->read_only = true;
428 break;
429 case Opt_read_write:
430 rbd_opts->read_only = false;
431 break;
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700432 default:
Alex Elderaafb2302012-09-06 16:00:54 -0500433 rbd_assert(false);
434 break;
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700435 }
436 return 0;
437}
438
439/*
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700440 * Get a ceph client with specific addr and configuration, if one does
441 * not exist create it.
442 */
Alex Elderf8c38922012-08-10 13:12:07 -0700443static int rbd_get_client(struct rbd_device *rbd_dev, const char *mon_addr,
444 size_t mon_addr_len, char *options)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700445{
Alex Elderf8c38922012-08-10 13:12:07 -0700446 struct rbd_options *rbd_opts = &rbd_dev->rbd_opts;
Alex Elder43ae4702012-07-03 16:01:18 -0500447 struct ceph_options *ceph_opts;
Alex Elderf8c38922012-08-10 13:12:07 -0700448 struct rbd_client *rbdc;
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700449
Alex Eldercc0538b2012-08-10 13:12:07 -0700450 rbd_opts->read_only = RBD_READ_ONLY_DEFAULT;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700451
Alex Elder43ae4702012-07-03 16:01:18 -0500452 ceph_opts = ceph_parse_options(options, mon_addr,
453 mon_addr + mon_addr_len,
454 parse_rbd_opts_token, rbd_opts);
Alex Elderf8c38922012-08-10 13:12:07 -0700455 if (IS_ERR(ceph_opts))
456 return PTR_ERR(ceph_opts);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700457
Alex Elder1f7ba332012-08-10 13:12:07 -0700458 rbdc = rbd_client_find(ceph_opts);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700459 if (rbdc) {
Alex Eldere6994d3d2012-01-29 13:57:44 -0600460 /* using an existing client */
Alex Elder43ae4702012-07-03 16:01:18 -0500461 ceph_destroy_options(ceph_opts);
Alex Elderf8c38922012-08-10 13:12:07 -0700462 } else {
463 rbdc = rbd_client_create(ceph_opts);
464 if (IS_ERR(rbdc))
465 return PTR_ERR(rbdc);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700466 }
Alex Elderf8c38922012-08-10 13:12:07 -0700467 rbd_dev->rbd_client = rbdc;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700468
Alex Elderf8c38922012-08-10 13:12:07 -0700469 return 0;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700470}
471
472/*
473 * Destroy ceph client
Alex Elderd23a4b32012-01-29 13:57:43 -0600474 *
Alex Elder432b8582012-01-29 13:57:44 -0600475 * Caller must hold rbd_client_list_lock.
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700476 */
477static void rbd_client_release(struct kref *kref)
478{
479 struct rbd_client *rbdc = container_of(kref, struct rbd_client, kref);
480
481 dout("rbd_release_client %p\n", rbdc);
Alex Eldercd9d9f52012-04-04 13:35:44 -0500482 spin_lock(&rbd_client_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700483 list_del(&rbdc->node);
Alex Eldercd9d9f52012-04-04 13:35:44 -0500484 spin_unlock(&rbd_client_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700485
486 ceph_destroy_client(rbdc->client);
487 kfree(rbdc);
488}
489
490/*
491 * Drop reference to ceph client node. If it's not referenced anymore, release
492 * it.
493 */
494static void rbd_put_client(struct rbd_device *rbd_dev)
495{
496 kref_put(&rbd_dev->rbd_client->kref, rbd_client_release);
497 rbd_dev->rbd_client = NULL;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700498}
499
Yehuda Sadeh1fec7092011-05-13 13:52:56 -0700500/*
501 * Destroy requests collection
502 */
503static void rbd_coll_release(struct kref *kref)
504{
505 struct rbd_req_coll *coll =
506 container_of(kref, struct rbd_req_coll, kref);
507
508 dout("rbd_coll_release %p\n", coll);
509 kfree(coll);
510}
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700511
Alex Eldera30b71b2012-07-10 20:30:11 -0500512static bool rbd_image_format_valid(u32 image_format)
513{
514 return image_format == 1 || image_format == 2;
515}
516
Alex Elder8e94af82012-07-25 09:32:40 -0500517static bool rbd_dev_ondisk_valid(struct rbd_image_header_ondisk *ondisk)
518{
Alex Elder103a1502012-08-02 11:29:45 -0500519 size_t size;
520 u32 snap_count;
521
522 /* The header has to start with the magic rbd header text */
523 if (memcmp(&ondisk->text, RBD_HEADER_TEXT, sizeof (RBD_HEADER_TEXT)))
524 return false;
525
526 /*
527 * The size of a snapshot header has to fit in a size_t, and
528 * that limits the number of snapshots.
529 */
530 snap_count = le32_to_cpu(ondisk->snap_count);
531 size = SIZE_MAX - sizeof (struct ceph_snap_context);
532 if (snap_count > size / sizeof (__le64))
533 return false;
534
535 /*
536 * Not only that, but the size of the entire the snapshot
537 * header must also be representable in a size_t.
538 */
539 size -= snap_count * sizeof (__le64);
540 if ((u64) size < le64_to_cpu(ondisk->snap_names_len))
541 return false;
542
543 return true;
Alex Elder8e94af82012-07-25 09:32:40 -0500544}
545
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700546/*
547 * Create a new header structure, translate header format from the on-disk
548 * header.
549 */
550static int rbd_header_from_disk(struct rbd_image_header *header,
Alex Elder4156d992012-08-02 11:29:46 -0500551 struct rbd_image_header_ondisk *ondisk)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700552{
Alex Elderccece232012-07-10 20:30:10 -0500553 u32 snap_count;
Alex Elder58c17b02012-08-23 23:22:06 -0500554 size_t len;
Alex Elderd2bb24e2012-07-26 23:37:14 -0500555 size_t size;
Alex Elder621901d2012-08-23 23:22:06 -0500556 u32 i;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700557
Alex Elder6a523252012-07-19 17:12:59 -0500558 memset(header, 0, sizeof (*header));
559
Alex Elder103a1502012-08-02 11:29:45 -0500560 snap_count = le32_to_cpu(ondisk->snap_count);
561
Alex Elder58c17b02012-08-23 23:22:06 -0500562 len = strnlen(ondisk->object_prefix, sizeof (ondisk->object_prefix));
563 header->object_prefix = kmalloc(len + 1, GFP_KERNEL);
Alex Elder6a523252012-07-19 17:12:59 -0500564 if (!header->object_prefix)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700565 return -ENOMEM;
Alex Elder58c17b02012-08-23 23:22:06 -0500566 memcpy(header->object_prefix, ondisk->object_prefix, len);
567 header->object_prefix[len] = '\0';
Alex Elder00f1f362012-02-07 12:03:36 -0600568
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700569 if (snap_count) {
Alex Elderf785cc12012-08-23 23:22:06 -0500570 u64 snap_names_len = le64_to_cpu(ondisk->snap_names_len);
571
Alex Elder621901d2012-08-23 23:22:06 -0500572 /* Save a copy of the snapshot names */
573
Alex Elderf785cc12012-08-23 23:22:06 -0500574 if (snap_names_len > (u64) SIZE_MAX)
575 return -EIO;
576 header->snap_names = kmalloc(snap_names_len, GFP_KERNEL);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700577 if (!header->snap_names)
Alex Elder6a523252012-07-19 17:12:59 -0500578 goto out_err;
Alex Elderf785cc12012-08-23 23:22:06 -0500579 /*
580 * Note that rbd_dev_v1_header_read() guarantees
581 * the ondisk buffer we're working with has
582 * snap_names_len bytes beyond the end of the
583 * snapshot id array, this memcpy() is safe.
584 */
585 memcpy(header->snap_names, &ondisk->snaps[snap_count],
586 snap_names_len);
Alex Elder6a523252012-07-19 17:12:59 -0500587
Alex Elder621901d2012-08-23 23:22:06 -0500588 /* Record each snapshot's size */
589
Alex Elderd2bb24e2012-07-26 23:37:14 -0500590 size = snap_count * sizeof (*header->snap_sizes);
591 header->snap_sizes = kmalloc(size, GFP_KERNEL);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700592 if (!header->snap_sizes)
Alex Elder6a523252012-07-19 17:12:59 -0500593 goto out_err;
Alex Elder621901d2012-08-23 23:22:06 -0500594 for (i = 0; i < snap_count; i++)
595 header->snap_sizes[i] =
596 le64_to_cpu(ondisk->snaps[i].image_size);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700597 } else {
Alex Elderccece232012-07-10 20:30:10 -0500598 WARN_ON(ondisk->snap_names_len);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700599 header->snap_names = NULL;
600 header->snap_sizes = NULL;
601 }
Alex Elder849b4262012-07-09 21:04:24 -0500602
Alex Elder34b13182012-07-13 20:35:12 -0500603 header->features = 0; /* No features support in v1 images */
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700604 header->obj_order = ondisk->options.order;
605 header->crypt_type = ondisk->options.crypt_type;
606 header->comp_type = ondisk->options.comp_type;
Alex Elder6a523252012-07-19 17:12:59 -0500607
Alex Elder621901d2012-08-23 23:22:06 -0500608 /* Allocate and fill in the snapshot context */
609
Alex Elderf84344f2012-08-31 17:29:51 -0500610 header->image_size = le64_to_cpu(ondisk->image_size);
Alex Elder6a523252012-07-19 17:12:59 -0500611 size = sizeof (struct ceph_snap_context);
612 size += snap_count * sizeof (header->snapc->snaps[0]);
613 header->snapc = kzalloc(size, GFP_KERNEL);
614 if (!header->snapc)
615 goto out_err;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700616
617 atomic_set(&header->snapc->nref, 1);
Alex Elder505cbb92012-07-19 08:49:18 -0500618 header->snapc->seq = le64_to_cpu(ondisk->snap_seq);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700619 header->snapc->num_snaps = snap_count;
Alex Elder621901d2012-08-23 23:22:06 -0500620 for (i = 0; i < snap_count; i++)
621 header->snapc->snaps[i] =
622 le64_to_cpu(ondisk->snaps[i].id);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700623
624 return 0;
625
Alex Elder6a523252012-07-19 17:12:59 -0500626out_err:
Alex Elder849b4262012-07-09 21:04:24 -0500627 kfree(header->snap_sizes);
Alex Elderccece232012-07-10 20:30:10 -0500628 header->snap_sizes = NULL;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700629 kfree(header->snap_names);
Alex Elderccece232012-07-10 20:30:10 -0500630 header->snap_names = NULL;
Alex Elder6a523252012-07-19 17:12:59 -0500631 kfree(header->object_prefix);
632 header->object_prefix = NULL;
Alex Elderccece232012-07-10 20:30:10 -0500633
Alex Elder00f1f362012-02-07 12:03:36 -0600634 return -ENOMEM;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700635}
636
Alex Elder8836b992012-08-30 14:42:15 -0500637static int snap_by_name(struct rbd_device *rbd_dev, const char *snap_name)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700638{
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700639
Alex Eldere86924a2012-07-10 20:30:11 -0500640 struct rbd_snap *snap;
Alex Elder00f1f362012-02-07 12:03:36 -0600641
Alex Eldere86924a2012-07-10 20:30:11 -0500642 list_for_each_entry(snap, &rbd_dev->snaps, node) {
643 if (!strcmp(snap_name, snap->name)) {
644 rbd_dev->mapping.snap_id = snap->id;
645 rbd_dev->mapping.size = snap->size;
Alex Elder34b13182012-07-13 20:35:12 -0500646 rbd_dev->mapping.features = snap->features;
Alex Elder00f1f362012-02-07 12:03:36 -0600647
Alex Eldere86924a2012-07-10 20:30:11 -0500648 return 0;
Alex Elder00f1f362012-02-07 12:03:36 -0600649 }
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700650 }
Alex Eldere86924a2012-07-10 20:30:11 -0500651
Alex Elder00f1f362012-02-07 12:03:36 -0600652 return -ENOENT;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700653}
654
Alex Elder5ed16172012-08-29 17:11:07 -0500655static int rbd_dev_set_mapping(struct rbd_device *rbd_dev, char *snap_name)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700656{
Alex Elder78dc4472012-07-19 08:49:18 -0500657 int ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700658
Alex Elder4e1105a2012-08-31 17:29:52 -0500659 if (!memcmp(snap_name, RBD_SNAP_HEAD_NAME,
Josh Durgincc9d7342011-11-21 18:19:13 -0800660 sizeof (RBD_SNAP_HEAD_NAME))) {
Alex Elderf84344f2012-08-31 17:29:51 -0500661 rbd_dev->mapping.snap_id = CEPH_NOSNAP;
Alex Elder99c1f082012-08-30 14:42:15 -0500662 rbd_dev->mapping.size = rbd_dev->header.image_size;
Alex Elder34b13182012-07-13 20:35:12 -0500663 rbd_dev->mapping.features = rbd_dev->header.features;
Alex Elderf84344f2012-08-31 17:29:51 -0500664 rbd_dev->mapping.snap_exists = false;
665 rbd_dev->mapping.read_only = rbd_dev->rbd_opts.read_only;
Alex Eldere86924a2012-07-10 20:30:11 -0500666 ret = 0;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700667 } else {
Alex Elder8836b992012-08-30 14:42:15 -0500668 ret = snap_by_name(rbd_dev, snap_name);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700669 if (ret < 0)
670 goto done;
Alex Elderf84344f2012-08-31 17:29:51 -0500671 rbd_dev->mapping.snap_exists = true;
672 rbd_dev->mapping.read_only = true;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700673 }
Alex Elder4e1105a2012-08-31 17:29:52 -0500674 rbd_dev->mapping.snap_name = snap_name;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700675done:
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700676 return ret;
677}
678
679static void rbd_header_free(struct rbd_image_header *header)
680{
Alex Elder849b4262012-07-09 21:04:24 -0500681 kfree(header->object_prefix);
Alex Elderd78fd7a2012-07-26 23:37:14 -0500682 header->object_prefix = NULL;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700683 kfree(header->snap_sizes);
Alex Elderd78fd7a2012-07-26 23:37:14 -0500684 header->snap_sizes = NULL;
Alex Elder849b4262012-07-09 21:04:24 -0500685 kfree(header->snap_names);
Alex Elderd78fd7a2012-07-26 23:37:14 -0500686 header->snap_names = NULL;
Josh Durgind1d25642011-12-05 14:03:05 -0800687 ceph_put_snap_context(header->snapc);
Alex Elderd78fd7a2012-07-26 23:37:14 -0500688 header->snapc = NULL;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700689}
690
Alex Elder65ccfe22012-08-09 10:33:26 -0700691static char *rbd_segment_name(struct rbd_device *rbd_dev, u64 offset)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700692{
Alex Elder65ccfe22012-08-09 10:33:26 -0700693 char *name;
694 u64 segment;
695 int ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700696
Alex Elder65ccfe22012-08-09 10:33:26 -0700697 name = kmalloc(RBD_MAX_SEG_NAME_LEN + 1, GFP_NOIO);
698 if (!name)
699 return NULL;
700 segment = offset >> rbd_dev->header.obj_order;
701 ret = snprintf(name, RBD_MAX_SEG_NAME_LEN, "%s.%012llx",
702 rbd_dev->header.object_prefix, segment);
703 if (ret < 0 || ret >= RBD_MAX_SEG_NAME_LEN) {
704 pr_err("error formatting segment name for #%llu (%d)\n",
705 segment, ret);
706 kfree(name);
707 name = NULL;
708 }
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700709
Alex Elder65ccfe22012-08-09 10:33:26 -0700710 return name;
711}
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700712
Alex Elder65ccfe22012-08-09 10:33:26 -0700713static u64 rbd_segment_offset(struct rbd_device *rbd_dev, u64 offset)
714{
715 u64 segment_size = (u64) 1 << rbd_dev->header.obj_order;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700716
Alex Elder65ccfe22012-08-09 10:33:26 -0700717 return offset & (segment_size - 1);
718}
719
720static u64 rbd_segment_length(struct rbd_device *rbd_dev,
721 u64 offset, u64 length)
722{
723 u64 segment_size = (u64) 1 << rbd_dev->header.obj_order;
724
725 offset &= segment_size - 1;
726
Alex Elderaafb2302012-09-06 16:00:54 -0500727 rbd_assert(length <= U64_MAX - offset);
Alex Elder65ccfe22012-08-09 10:33:26 -0700728 if (offset + length > segment_size)
729 length = segment_size - offset;
730
731 return length;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700732}
733
Yehuda Sadeh1fec7092011-05-13 13:52:56 -0700734static int rbd_get_num_segments(struct rbd_image_header *header,
735 u64 ofs, u64 len)
736{
Alex Elderdf111be2012-08-09 10:33:26 -0700737 u64 start_seg;
738 u64 end_seg;
739
740 if (!len)
741 return 0;
742 if (len - 1 > U64_MAX - ofs)
743 return -ERANGE;
744
745 start_seg = ofs >> header->obj_order;
746 end_seg = (ofs + len - 1) >> header->obj_order;
747
Yehuda Sadeh1fec7092011-05-13 13:52:56 -0700748 return end_seg - start_seg + 1;
749}
750
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700751/*
Josh Durgin029bcbd2011-07-22 11:35:23 -0700752 * returns the size of an object in the image
753 */
754static u64 rbd_obj_bytes(struct rbd_image_header *header)
755{
756 return 1 << header->obj_order;
757}
758
759/*
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700760 * bio helpers
761 */
762
763static void bio_chain_put(struct bio *chain)
764{
765 struct bio *tmp;
766
767 while (chain) {
768 tmp = chain;
769 chain = chain->bi_next;
770 bio_put(tmp);
771 }
772}
773
774/*
775 * zeros a bio chain, starting at specific offset
776 */
777static void zero_bio_chain(struct bio *chain, int start_ofs)
778{
779 struct bio_vec *bv;
780 unsigned long flags;
781 void *buf;
782 int i;
783 int pos = 0;
784
785 while (chain) {
786 bio_for_each_segment(bv, chain, i) {
787 if (pos + bv->bv_len > start_ofs) {
788 int remainder = max(start_ofs - pos, 0);
789 buf = bvec_kmap_irq(bv, &flags);
790 memset(buf + remainder, 0,
791 bv->bv_len - remainder);
Dan Carpenter85b5aaa2010-10-11 21:15:11 +0200792 bvec_kunmap_irq(buf, &flags);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700793 }
794 pos += bv->bv_len;
795 }
796
797 chain = chain->bi_next;
798 }
799}
800
801/*
802 * bio_chain_clone - clone a chain of bios up to a certain length.
803 * might return a bio_pair that will need to be released.
804 */
805static struct bio *bio_chain_clone(struct bio **old, struct bio **next,
806 struct bio_pair **bp,
807 int len, gfp_t gfpmask)
808{
Alex Elder542582f2012-08-09 10:33:25 -0700809 struct bio *old_chain = *old;
810 struct bio *new_chain = NULL;
811 struct bio *tail;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700812 int total = 0;
813
814 if (*bp) {
815 bio_pair_release(*bp);
816 *bp = NULL;
817 }
818
819 while (old_chain && (total < len)) {
Alex Elder542582f2012-08-09 10:33:25 -0700820 struct bio *tmp;
821
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700822 tmp = bio_kmalloc(gfpmask, old_chain->bi_max_vecs);
823 if (!tmp)
824 goto err_out;
Alex Elder542582f2012-08-09 10:33:25 -0700825 gfpmask &= ~__GFP_WAIT; /* can't wait after the first */
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700826
827 if (total + old_chain->bi_size > len) {
828 struct bio_pair *bp;
829
830 /*
831 * this split can only happen with a single paged bio,
832 * split_bio will BUG_ON if this is not the case
833 */
834 dout("bio_chain_clone split! total=%d remaining=%d"
Alex Elderbd919d42012-07-13 20:35:11 -0500835 "bi_size=%u\n",
836 total, len - total, old_chain->bi_size);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700837
838 /* split the bio. We'll release it either in the next
839 call, or it will have to be released outside */
Alex Elder593a9e72012-02-07 12:03:37 -0600840 bp = bio_split(old_chain, (len - total) / SECTOR_SIZE);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700841 if (!bp)
842 goto err_out;
843
844 __bio_clone(tmp, &bp->bio1);
845
846 *next = &bp->bio2;
847 } else {
848 __bio_clone(tmp, old_chain);
849 *next = old_chain->bi_next;
850 }
851
852 tmp->bi_bdev = NULL;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700853 tmp->bi_next = NULL;
Alex Elder542582f2012-08-09 10:33:25 -0700854 if (new_chain)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700855 tail->bi_next = tmp;
Alex Elder542582f2012-08-09 10:33:25 -0700856 else
857 new_chain = tmp;
858 tail = tmp;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700859 old_chain = old_chain->bi_next;
860
861 total += tmp->bi_size;
862 }
863
Alex Elderaafb2302012-09-06 16:00:54 -0500864 rbd_assert(total == len);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700865
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700866 *old = old_chain;
867
868 return new_chain;
869
870err_out:
871 dout("bio_chain_clone with err\n");
872 bio_chain_put(new_chain);
873 return NULL;
874}
875
876/*
877 * helpers for osd request op vectors.
878 */
Alex Elder57cfc102012-06-26 12:57:03 -0700879static struct ceph_osd_req_op *rbd_create_rw_ops(int num_ops,
880 int opcode, u32 payload_len)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700881{
Alex Elder57cfc102012-06-26 12:57:03 -0700882 struct ceph_osd_req_op *ops;
883
884 ops = kzalloc(sizeof (*ops) * (num_ops + 1), GFP_NOIO);
885 if (!ops)
886 return NULL;
887
888 ops[0].op = opcode;
889
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700890 /*
891 * op extent offset and length will be set later on
892 * in calc_raw_layout()
893 */
Alex Elder57cfc102012-06-26 12:57:03 -0700894 ops[0].payload_len = payload_len;
895
896 return ops;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700897}
898
/*
 * Release an op vector obtained from rbd_create_rw_ops().
 */
static void rbd_destroy_ops(struct ceph_osd_req_op *ops)
{
	kfree(ops);
}
903
Yehuda Sadeh1fec7092011-05-13 13:52:56 -0700904static void rbd_coll_end_req_index(struct request *rq,
905 struct rbd_req_coll *coll,
906 int index,
907 int ret, u64 len)
908{
909 struct request_queue *q;
910 int min, max, i;
911
Alex Elderbd919d42012-07-13 20:35:11 -0500912 dout("rbd_coll_end_req_index %p index %d ret %d len %llu\n",
913 coll, index, ret, (unsigned long long) len);
Yehuda Sadeh1fec7092011-05-13 13:52:56 -0700914
915 if (!rq)
916 return;
917
918 if (!coll) {
919 blk_end_request(rq, ret, len);
920 return;
921 }
922
923 q = rq->q;
924
925 spin_lock_irq(q->queue_lock);
926 coll->status[index].done = 1;
927 coll->status[index].rc = ret;
928 coll->status[index].bytes = len;
929 max = min = coll->num_done;
930 while (max < coll->total && coll->status[max].done)
931 max++;
932
933 for (i = min; i<max; i++) {
934 __blk_end_request(rq, coll->status[i].rc,
935 coll->status[i].bytes);
936 coll->num_done++;
937 kref_put(&coll->kref, rbd_coll_release);
938 }
939 spin_unlock_irq(q->queue_lock);
940}
941
942static void rbd_coll_end_req(struct rbd_request *req,
943 int ret, u64 len)
944{
945 rbd_coll_end_req_index(req->rq, req->coll, req->coll_index, ret, len);
946}
947
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700948/*
949 * Send ceph osd request
950 */
951static int rbd_do_request(struct request *rq,
Alex Elder0ce1a792012-07-03 16:01:18 -0500952 struct rbd_device *rbd_dev,
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700953 struct ceph_snap_context *snapc,
954 u64 snapid,
Alex Elderaded07e2012-07-03 16:01:18 -0500955 const char *object_name, u64 ofs, u64 len,
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700956 struct bio *bio,
957 struct page **pages,
958 int num_pages,
959 int flags,
960 struct ceph_osd_req_op *ops,
Yehuda Sadeh1fec7092011-05-13 13:52:56 -0700961 struct rbd_req_coll *coll,
962 int coll_index,
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700963 void (*rbd_cb)(struct ceph_osd_request *req,
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700964 struct ceph_msg *msg),
965 struct ceph_osd_request **linger_req,
966 u64 *ver)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700967{
968 struct ceph_osd_request *req;
969 struct ceph_file_layout *layout;
970 int ret;
971 u64 bno;
972 struct timespec mtime = CURRENT_TIME;
973 struct rbd_request *req_data;
974 struct ceph_osd_request_head *reqhead;
Alex Elder1dbb4392012-01-24 10:08:37 -0600975 struct ceph_osd_client *osdc;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700976
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700977 req_data = kzalloc(sizeof(*req_data), GFP_NOIO);
Yehuda Sadeh1fec7092011-05-13 13:52:56 -0700978 if (!req_data) {
979 if (coll)
980 rbd_coll_end_req_index(rq, coll, coll_index,
981 -ENOMEM, len);
982 return -ENOMEM;
983 }
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700984
Yehuda Sadeh1fec7092011-05-13 13:52:56 -0700985 if (coll) {
986 req_data->coll = coll;
987 req_data->coll_index = coll_index;
988 }
989
Alex Elderbd919d42012-07-13 20:35:11 -0500990 dout("rbd_do_request object_name=%s ofs=%llu len=%llu\n", object_name,
991 (unsigned long long) ofs, (unsigned long long) len);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700992
Alex Elder0ce1a792012-07-03 16:01:18 -0500993 osdc = &rbd_dev->rbd_client->client->osdc;
Alex Elder1dbb4392012-01-24 10:08:37 -0600994 req = ceph_osdc_alloc_request(osdc, flags, snapc, ops,
995 false, GFP_NOIO, pages, bio);
Sage Weil4ad12622011-05-03 09:23:36 -0700996 if (!req) {
Sage Weil4ad12622011-05-03 09:23:36 -0700997 ret = -ENOMEM;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700998 goto done_pages;
999 }
1000
1001 req->r_callback = rbd_cb;
1002
1003 req_data->rq = rq;
1004 req_data->bio = bio;
1005 req_data->pages = pages;
1006 req_data->len = len;
1007
1008 req->r_priv = req_data;
1009
1010 reqhead = req->r_request->front.iov_base;
1011 reqhead->snapid = cpu_to_le64(CEPH_NOSNAP);
1012
Alex Elderaded07e2012-07-03 16:01:18 -05001013 strncpy(req->r_oid, object_name, sizeof(req->r_oid));
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001014 req->r_oid_len = strlen(req->r_oid);
1015
1016 layout = &req->r_file_layout;
1017 memset(layout, 0, sizeof(*layout));
1018 layout->fl_stripe_unit = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
1019 layout->fl_stripe_count = cpu_to_le32(1);
1020 layout->fl_object_size = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
Alex Elder0ce1a792012-07-03 16:01:18 -05001021 layout->fl_pg_pool = cpu_to_le32(rbd_dev->pool_id);
Alex Elder1dbb4392012-01-24 10:08:37 -06001022 ceph_calc_raw_layout(osdc, layout, snapid, ofs, &len, &bno,
1023 req, ops);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001024
1025 ceph_osdc_build_request(req, ofs, &len,
1026 ops,
1027 snapc,
1028 &mtime,
1029 req->r_oid, req->r_oid_len);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001030
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001031 if (linger_req) {
Alex Elder1dbb4392012-01-24 10:08:37 -06001032 ceph_osdc_set_request_linger(osdc, req);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001033 *linger_req = req;
1034 }
1035
Alex Elder1dbb4392012-01-24 10:08:37 -06001036 ret = ceph_osdc_start_request(osdc, req, false);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001037 if (ret < 0)
1038 goto done_err;
1039
1040 if (!rbd_cb) {
Alex Elder1dbb4392012-01-24 10:08:37 -06001041 ret = ceph_osdc_wait_request(osdc, req);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001042 if (ver)
1043 *ver = le64_to_cpu(req->r_reassert_version.version);
Alex Elderbd919d42012-07-13 20:35:11 -05001044 dout("reassert_ver=%llu\n",
1045 (unsigned long long)
1046 le64_to_cpu(req->r_reassert_version.version));
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001047 ceph_osdc_put_request(req);
1048 }
1049 return ret;
1050
1051done_err:
1052 bio_chain_put(req_data->bio);
1053 ceph_osdc_put_request(req);
1054done_pages:
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001055 rbd_coll_end_req(req_data, ret, len);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001056 kfree(req_data);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001057 return ret;
1058}
1059
1060/*
1061 * Ceph osd op callback
1062 */
1063static void rbd_req_cb(struct ceph_osd_request *req, struct ceph_msg *msg)
1064{
1065 struct rbd_request *req_data = req->r_priv;
1066 struct ceph_osd_reply_head *replyhead;
1067 struct ceph_osd_op *op;
1068 __s32 rc;
1069 u64 bytes;
1070 int read_op;
1071
1072 /* parse reply */
1073 replyhead = msg->front.iov_base;
1074 WARN_ON(le32_to_cpu(replyhead->num_ops) == 0);
1075 op = (void *)(replyhead + 1);
1076 rc = le32_to_cpu(replyhead->result);
1077 bytes = le64_to_cpu(op->extent.length);
Dan Carpenter895cfcc2012-06-06 09:15:33 -05001078 read_op = (le16_to_cpu(op->op) == CEPH_OSD_OP_READ);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001079
Alex Elderbd919d42012-07-13 20:35:11 -05001080 dout("rbd_req_cb bytes=%llu readop=%d rc=%d\n",
1081 (unsigned long long) bytes, read_op, (int) rc);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001082
1083 if (rc == -ENOENT && read_op) {
1084 zero_bio_chain(req_data->bio, 0);
1085 rc = 0;
1086 } else if (rc == 0 && read_op && bytes < req_data->len) {
1087 zero_bio_chain(req_data->bio, bytes);
1088 bytes = req_data->len;
1089 }
1090
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001091 rbd_coll_end_req(req_data, rc, bytes);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001092
1093 if (req_data->bio)
1094 bio_chain_put(req_data->bio);
1095
1096 ceph_osdc_put_request(req);
1097 kfree(req_data);
1098}
1099
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001100static void rbd_simple_req_cb(struct ceph_osd_request *req, struct ceph_msg *msg)
1101{
1102 ceph_osdc_put_request(req);
1103}
1104
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001105/*
1106 * Do a synchronous ceph osd operation
1107 */
Alex Elder0ce1a792012-07-03 16:01:18 -05001108static int rbd_req_sync_op(struct rbd_device *rbd_dev,
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001109 struct ceph_snap_context *snapc,
1110 u64 snapid,
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001111 int flags,
Alex Elder913d2fd2012-06-26 12:57:03 -07001112 struct ceph_osd_req_op *ops,
Alex Elderaded07e2012-07-03 16:01:18 -05001113 const char *object_name,
Alex Elderf8d4de62012-07-03 16:01:19 -05001114 u64 ofs, u64 inbound_size,
1115 char *inbound,
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001116 struct ceph_osd_request **linger_req,
1117 u64 *ver)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001118{
1119 int ret;
1120 struct page **pages;
1121 int num_pages;
Alex Elder913d2fd2012-06-26 12:57:03 -07001122
Alex Elderaafb2302012-09-06 16:00:54 -05001123 rbd_assert(ops != NULL);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001124
Alex Elderf8d4de62012-07-03 16:01:19 -05001125 num_pages = calc_pages_for(ofs, inbound_size);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001126 pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL);
Dan Carpenterb8d06382010-10-11 21:14:23 +02001127 if (IS_ERR(pages))
1128 return PTR_ERR(pages);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001129
Alex Elder0ce1a792012-07-03 16:01:18 -05001130 ret = rbd_do_request(NULL, rbd_dev, snapc, snapid,
Alex Elderf8d4de62012-07-03 16:01:19 -05001131 object_name, ofs, inbound_size, NULL,
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001132 pages, num_pages,
1133 flags,
1134 ops,
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001135 NULL, 0,
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001136 NULL,
1137 linger_req, ver);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001138 if (ret < 0)
Alex Elder913d2fd2012-06-26 12:57:03 -07001139 goto done;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001140
Alex Elderf8d4de62012-07-03 16:01:19 -05001141 if ((flags & CEPH_OSD_FLAG_READ) && inbound)
1142 ret = ceph_copy_from_page_vector(pages, inbound, ofs, ret);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001143
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001144done:
1145 ceph_release_page_vector(pages, num_pages);
1146 return ret;
1147}
1148
1149/*
1150 * Do an asynchronous ceph osd operation
1151 */
1152static int rbd_do_op(struct request *rq,
Alex Elder0ce1a792012-07-03 16:01:18 -05001153 struct rbd_device *rbd_dev,
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001154 struct ceph_snap_context *snapc,
1155 u64 snapid,
Alex Elderd1f57ea2012-06-26 12:57:03 -07001156 int opcode, int flags,
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001157 u64 ofs, u64 len,
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001158 struct bio *bio,
1159 struct rbd_req_coll *coll,
1160 int coll_index)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001161{
1162 char *seg_name;
1163 u64 seg_ofs;
1164 u64 seg_len;
1165 int ret;
1166 struct ceph_osd_req_op *ops;
1167 u32 payload_len;
1168
Alex Elder65ccfe22012-08-09 10:33:26 -07001169 seg_name = rbd_segment_name(rbd_dev, ofs);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001170 if (!seg_name)
1171 return -ENOMEM;
Alex Elder65ccfe22012-08-09 10:33:26 -07001172 seg_len = rbd_segment_length(rbd_dev, ofs, len);
1173 seg_ofs = rbd_segment_offset(rbd_dev, ofs);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001174
1175 payload_len = (flags & CEPH_OSD_FLAG_WRITE ? seg_len : 0);
1176
Alex Elder57cfc102012-06-26 12:57:03 -07001177 ret = -ENOMEM;
1178 ops = rbd_create_rw_ops(1, opcode, payload_len);
1179 if (!ops)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001180 goto done;
1181
1182 /* we've taken care of segment sizes earlier when we
1183 cloned the bios. We should never have a segment
1184 truncated at this point */
Alex Elderaafb2302012-09-06 16:00:54 -05001185 rbd_assert(seg_len == len);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001186
1187 ret = rbd_do_request(rq, rbd_dev, snapc, snapid,
1188 seg_name, seg_ofs, seg_len,
1189 bio,
1190 NULL, 0,
1191 flags,
1192 ops,
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001193 coll, coll_index,
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001194 rbd_req_cb, 0, NULL);
Sage Weil11f77002011-05-12 16:13:54 -07001195
1196 rbd_destroy_ops(ops);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001197done:
1198 kfree(seg_name);
1199 return ret;
1200}
1201
1202/*
1203 * Request async osd write
1204 */
1205static int rbd_req_write(struct request *rq,
1206 struct rbd_device *rbd_dev,
1207 struct ceph_snap_context *snapc,
1208 u64 ofs, u64 len,
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001209 struct bio *bio,
1210 struct rbd_req_coll *coll,
1211 int coll_index)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001212{
1213 return rbd_do_op(rq, rbd_dev, snapc, CEPH_NOSNAP,
1214 CEPH_OSD_OP_WRITE,
1215 CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001216 ofs, len, bio, coll, coll_index);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001217}
1218
1219/*
1220 * Request async osd read
1221 */
1222static int rbd_req_read(struct request *rq,
1223 struct rbd_device *rbd_dev,
1224 u64 snapid,
1225 u64 ofs, u64 len,
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001226 struct bio *bio,
1227 struct rbd_req_coll *coll,
1228 int coll_index)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001229{
1230 return rbd_do_op(rq, rbd_dev, NULL,
Josh Durginb06e6a62011-11-21 18:16:52 -08001231 snapid,
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001232 CEPH_OSD_OP_READ,
1233 CEPH_OSD_FLAG_READ,
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001234 ofs, len, bio, coll, coll_index);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001235}
1236
1237/*
1238 * Request sync osd read
1239 */
Alex Elder0ce1a792012-07-03 16:01:18 -05001240static int rbd_req_sync_read(struct rbd_device *rbd_dev,
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001241 u64 snapid,
Alex Elderaded07e2012-07-03 16:01:18 -05001242 const char *object_name,
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001243 u64 ofs, u64 len,
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001244 char *buf,
1245 u64 *ver)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001246{
Alex Elder913d2fd2012-06-26 12:57:03 -07001247 struct ceph_osd_req_op *ops;
1248 int ret;
1249
1250 ops = rbd_create_rw_ops(1, CEPH_OSD_OP_READ, 0);
1251 if (!ops)
1252 return -ENOMEM;
1253
1254 ret = rbd_req_sync_op(rbd_dev, NULL,
Josh Durginb06e6a62011-11-21 18:16:52 -08001255 snapid,
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001256 CEPH_OSD_FLAG_READ,
Alex Elder913d2fd2012-06-26 12:57:03 -07001257 ops, object_name, ofs, len, buf, NULL, ver);
1258 rbd_destroy_ops(ops);
1259
1260 return ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001261}
1262
1263/*
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001264 * Request sync osd watch
1265 */
Alex Elder0ce1a792012-07-03 16:01:18 -05001266static int rbd_req_sync_notify_ack(struct rbd_device *rbd_dev,
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001267 u64 ver,
Alex Elder7f0a24d2012-07-25 09:32:40 -05001268 u64 notify_id)
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001269{
1270 struct ceph_osd_req_op *ops;
Sage Weil11f77002011-05-12 16:13:54 -07001271 int ret;
1272
Alex Elder57cfc102012-06-26 12:57:03 -07001273 ops = rbd_create_rw_ops(1, CEPH_OSD_OP_NOTIFY_ACK, 0);
1274 if (!ops)
1275 return -ENOMEM;
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001276
Josh Durgina71b8912011-12-05 18:10:44 -08001277 ops[0].watch.ver = cpu_to_le64(ver);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001278 ops[0].watch.cookie = notify_id;
1279 ops[0].watch.flag = 0;
1280
Alex Elder0ce1a792012-07-03 16:01:18 -05001281 ret = rbd_do_request(NULL, rbd_dev, NULL, CEPH_NOSNAP,
Alex Elder7f0a24d2012-07-25 09:32:40 -05001282 rbd_dev->header_name, 0, 0, NULL,
Alex Elderad4f2322012-07-03 16:01:19 -05001283 NULL, 0,
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001284 CEPH_OSD_FLAG_READ,
1285 ops,
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001286 NULL, 0,
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001287 rbd_simple_req_cb, 0, NULL);
1288
1289 rbd_destroy_ops(ops);
1290 return ret;
1291}
1292
1293static void rbd_watch_cb(u64 ver, u64 notify_id, u8 opcode, void *data)
1294{
Alex Elder0ce1a792012-07-03 16:01:18 -05001295 struct rbd_device *rbd_dev = (struct rbd_device *)data;
Josh Durgina71b8912011-12-05 18:10:44 -08001296 u64 hver;
Sage Weil13143d22011-05-12 16:08:30 -07001297 int rc;
1298
Alex Elder0ce1a792012-07-03 16:01:18 -05001299 if (!rbd_dev)
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001300 return;
1301
Alex Elderbd919d42012-07-13 20:35:11 -05001302 dout("rbd_watch_cb %s notify_id=%llu opcode=%u\n",
1303 rbd_dev->header_name, (unsigned long long) notify_id,
1304 (unsigned int) opcode);
Alex Elder1fe5e992012-07-25 09:32:41 -05001305 rc = rbd_refresh_header(rbd_dev, &hver);
Sage Weil13143d22011-05-12 16:08:30 -07001306 if (rc)
Alex Elderf0f8cef2012-01-29 13:57:44 -06001307 pr_warning(RBD_DRV_NAME "%d got notification but failed to "
Alex Elder0ce1a792012-07-03 16:01:18 -05001308 " update snaps: %d\n", rbd_dev->major, rc);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001309
Alex Elder7f0a24d2012-07-25 09:32:40 -05001310 rbd_req_sync_notify_ack(rbd_dev, hver, notify_id);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001311}
1312
1313/*
1314 * Request sync osd watch
1315 */
Alex Elder0e6f3222012-07-25 09:32:40 -05001316static int rbd_req_sync_watch(struct rbd_device *rbd_dev)
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001317{
1318 struct ceph_osd_req_op *ops;
Alex Elder0ce1a792012-07-03 16:01:18 -05001319 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
Alex Elder57cfc102012-06-26 12:57:03 -07001320 int ret;
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001321
Alex Elder57cfc102012-06-26 12:57:03 -07001322 ops = rbd_create_rw_ops(1, CEPH_OSD_OP_WATCH, 0);
1323 if (!ops)
1324 return -ENOMEM;
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001325
1326 ret = ceph_osdc_create_event(osdc, rbd_watch_cb, 0,
Alex Elder0ce1a792012-07-03 16:01:18 -05001327 (void *)rbd_dev, &rbd_dev->watch_event);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001328 if (ret < 0)
1329 goto fail;
1330
Alex Elder0e6f3222012-07-25 09:32:40 -05001331 ops[0].watch.ver = cpu_to_le64(rbd_dev->header.obj_version);
Alex Elder0ce1a792012-07-03 16:01:18 -05001332 ops[0].watch.cookie = cpu_to_le64(rbd_dev->watch_event->cookie);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001333 ops[0].watch.flag = 1;
1334
Alex Elder0ce1a792012-07-03 16:01:18 -05001335 ret = rbd_req_sync_op(rbd_dev, NULL,
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001336 CEPH_NOSNAP,
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001337 CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
1338 ops,
Alex Elder0e6f3222012-07-25 09:32:40 -05001339 rbd_dev->header_name,
1340 0, 0, NULL,
Alex Elder0ce1a792012-07-03 16:01:18 -05001341 &rbd_dev->watch_request, NULL);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001342
1343 if (ret < 0)
1344 goto fail_event;
1345
1346 rbd_destroy_ops(ops);
1347 return 0;
1348
1349fail_event:
Alex Elder0ce1a792012-07-03 16:01:18 -05001350 ceph_osdc_cancel_event(rbd_dev->watch_event);
1351 rbd_dev->watch_event = NULL;
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001352fail:
1353 rbd_destroy_ops(ops);
1354 return ret;
1355}
1356
Yehuda Sadeh79e3057c2011-07-12 16:56:57 -07001357/*
1358 * Request sync osd unwatch
1359 */
Alex Elder070c6332012-07-25 09:32:41 -05001360static int rbd_req_sync_unwatch(struct rbd_device *rbd_dev)
Yehuda Sadeh79e3057c2011-07-12 16:56:57 -07001361{
1362 struct ceph_osd_req_op *ops;
Alex Elder57cfc102012-06-26 12:57:03 -07001363 int ret;
Yehuda Sadeh79e3057c2011-07-12 16:56:57 -07001364
Alex Elder57cfc102012-06-26 12:57:03 -07001365 ops = rbd_create_rw_ops(1, CEPH_OSD_OP_WATCH, 0);
1366 if (!ops)
1367 return -ENOMEM;
Yehuda Sadeh79e3057c2011-07-12 16:56:57 -07001368
1369 ops[0].watch.ver = 0;
Alex Elder0ce1a792012-07-03 16:01:18 -05001370 ops[0].watch.cookie = cpu_to_le64(rbd_dev->watch_event->cookie);
Yehuda Sadeh79e3057c2011-07-12 16:56:57 -07001371 ops[0].watch.flag = 0;
1372
Alex Elder0ce1a792012-07-03 16:01:18 -05001373 ret = rbd_req_sync_op(rbd_dev, NULL,
Yehuda Sadeh79e3057c2011-07-12 16:56:57 -07001374 CEPH_NOSNAP,
Yehuda Sadeh79e3057c2011-07-12 16:56:57 -07001375 CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
1376 ops,
Alex Elder070c6332012-07-25 09:32:41 -05001377 rbd_dev->header_name,
1378 0, 0, NULL, NULL, NULL);
1379
Yehuda Sadeh79e3057c2011-07-12 16:56:57 -07001380
1381 rbd_destroy_ops(ops);
Alex Elder0ce1a792012-07-03 16:01:18 -05001382 ceph_osdc_cancel_event(rbd_dev->watch_event);
1383 rbd_dev->watch_event = NULL;
Yehuda Sadeh79e3057c2011-07-12 16:56:57 -07001384 return ret;
1385}
1386
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001387/*
Alex Elder3cb4a682012-06-26 12:57:03 -07001388 * Synchronous osd object method call
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001389 */
Alex Elder0ce1a792012-07-03 16:01:18 -05001390static int rbd_req_sync_exec(struct rbd_device *rbd_dev,
Alex Elderaded07e2012-07-03 16:01:18 -05001391 const char *object_name,
1392 const char *class_name,
1393 const char *method_name,
Alex Elder3cb4a682012-06-26 12:57:03 -07001394 const char *outbound,
1395 size_t outbound_size,
Alex Elderf8d4de62012-07-03 16:01:19 -05001396 char *inbound,
1397 size_t inbound_size,
Alex Elder3cb4a682012-06-26 12:57:03 -07001398 int flags,
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001399 u64 *ver)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001400{
1401 struct ceph_osd_req_op *ops;
Alex Elderaded07e2012-07-03 16:01:18 -05001402 int class_name_len = strlen(class_name);
1403 int method_name_len = strlen(method_name);
Alex Elder3cb4a682012-06-26 12:57:03 -07001404 int payload_size;
Alex Elder57cfc102012-06-26 12:57:03 -07001405 int ret;
1406
Alex Elder3cb4a682012-06-26 12:57:03 -07001407 /*
1408 * Any input parameters required by the method we're calling
1409 * will be sent along with the class and method names as
1410 * part of the message payload. That data and its size are
1411 * supplied via the indata and indata_len fields (named from
1412 * the perspective of the server side) in the OSD request
1413 * operation.
1414 */
1415 payload_size = class_name_len + method_name_len + outbound_size;
1416 ops = rbd_create_rw_ops(1, CEPH_OSD_OP_CALL, payload_size);
Alex Elder57cfc102012-06-26 12:57:03 -07001417 if (!ops)
1418 return -ENOMEM;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001419
Alex Elderaded07e2012-07-03 16:01:18 -05001420 ops[0].cls.class_name = class_name;
1421 ops[0].cls.class_len = (__u8) class_name_len;
1422 ops[0].cls.method_name = method_name;
1423 ops[0].cls.method_len = (__u8) method_name_len;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001424 ops[0].cls.argc = 0;
Alex Elder3cb4a682012-06-26 12:57:03 -07001425 ops[0].cls.indata = outbound;
1426 ops[0].cls.indata_len = outbound_size;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001427
Alex Elder0ce1a792012-07-03 16:01:18 -05001428 ret = rbd_req_sync_op(rbd_dev, NULL,
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001429 CEPH_NOSNAP,
Alex Elder3cb4a682012-06-26 12:57:03 -07001430 flags, ops,
Alex Elderf8d4de62012-07-03 16:01:19 -05001431 object_name, 0, inbound_size, inbound,
1432 NULL, ver);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001433
1434 rbd_destroy_ops(ops);
1435
1436 dout("cls_exec returned %d\n", ret);
1437 return ret;
1438}
1439
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001440static struct rbd_req_coll *rbd_alloc_coll(int num_reqs)
1441{
1442 struct rbd_req_coll *coll =
1443 kzalloc(sizeof(struct rbd_req_coll) +
1444 sizeof(struct rbd_req_status) * num_reqs,
1445 GFP_ATOMIC);
1446
1447 if (!coll)
1448 return NULL;
1449 coll->total = num_reqs;
1450 kref_init(&coll->kref);
1451 return coll;
1452}
1453
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001454/*
1455 * block device queue callback
1456 */
1457static void rbd_rq_fn(struct request_queue *q)
1458{
1459 struct rbd_device *rbd_dev = q->queuedata;
1460 struct request *rq;
1461 struct bio_pair *bp = NULL;
1462
Alex Elder00f1f362012-02-07 12:03:36 -06001463 while ((rq = blk_fetch_request(q))) {
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001464 struct bio *bio;
1465 struct bio *rq_bio, *next_bio = NULL;
1466 bool do_write;
Alex Elderbd919d42012-07-13 20:35:11 -05001467 unsigned int size;
1468 u64 op_size = 0;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001469 u64 ofs;
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001470 int num_segs, cur_seg = 0;
1471 struct rbd_req_coll *coll;
Josh Durgind1d25642011-12-05 14:03:05 -08001472 struct ceph_snap_context *snapc;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001473
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001474 dout("fetched request\n");
1475
1476 /* filter out block requests we don't understand */
1477 if ((rq->cmd_type != REQ_TYPE_FS)) {
1478 __blk_end_request_all(rq, 0);
Alex Elder00f1f362012-02-07 12:03:36 -06001479 continue;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001480 }
1481
1482 /* deduce our operation (read, write) */
1483 do_write = (rq_data_dir(rq) == WRITE);
1484
1485 size = blk_rq_bytes(rq);
Alex Elder593a9e72012-02-07 12:03:37 -06001486 ofs = blk_rq_pos(rq) * SECTOR_SIZE;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001487 rq_bio = rq->bio;
Alex Elderf84344f2012-08-31 17:29:51 -05001488 if (do_write && rbd_dev->mapping.read_only) {
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001489 __blk_end_request_all(rq, -EROFS);
Alex Elder00f1f362012-02-07 12:03:36 -06001490 continue;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001491 }
1492
1493 spin_unlock_irq(q->queue_lock);
1494
Josh Durgind1d25642011-12-05 14:03:05 -08001495 down_read(&rbd_dev->header_rwsem);
Josh Durgine88a36e2011-11-21 18:14:25 -08001496
Alex Elderf84344f2012-08-31 17:29:51 -05001497 if (rbd_dev->mapping.snap_id != CEPH_NOSNAP &&
1498 !rbd_dev->mapping.snap_exists) {
Josh Durgine88a36e2011-11-21 18:14:25 -08001499 up_read(&rbd_dev->header_rwsem);
Josh Durgind1d25642011-12-05 14:03:05 -08001500 dout("request for non-existent snapshot");
1501 spin_lock_irq(q->queue_lock);
1502 __blk_end_request_all(rq, -ENXIO);
1503 continue;
Josh Durgine88a36e2011-11-21 18:14:25 -08001504 }
1505
Josh Durgind1d25642011-12-05 14:03:05 -08001506 snapc = ceph_get_snap_context(rbd_dev->header.snapc);
1507
1508 up_read(&rbd_dev->header_rwsem);
1509
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001510 dout("%s 0x%x bytes at 0x%llx\n",
1511 do_write ? "write" : "read",
Alex Elderbd919d42012-07-13 20:35:11 -05001512 size, (unsigned long long) blk_rq_pos(rq) * SECTOR_SIZE);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001513
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001514 num_segs = rbd_get_num_segments(&rbd_dev->header, ofs, size);
Alex Elderdf111be2012-08-09 10:33:26 -07001515 if (num_segs <= 0) {
1516 spin_lock_irq(q->queue_lock);
1517 __blk_end_request_all(rq, num_segs);
1518 ceph_put_snap_context(snapc);
1519 continue;
1520 }
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001521 coll = rbd_alloc_coll(num_segs);
1522 if (!coll) {
1523 spin_lock_irq(q->queue_lock);
1524 __blk_end_request_all(rq, -ENOMEM);
Josh Durgind1d25642011-12-05 14:03:05 -08001525 ceph_put_snap_context(snapc);
Alex Elder00f1f362012-02-07 12:03:36 -06001526 continue;
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001527 }
1528
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001529 do {
1530 /* a bio clone to be passed down to OSD req */
Alex Elderbd919d42012-07-13 20:35:11 -05001531 dout("rq->bio->bi_vcnt=%hu\n", rq->bio->bi_vcnt);
Alex Elder65ccfe22012-08-09 10:33:26 -07001532 op_size = rbd_segment_length(rbd_dev, ofs, size);
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001533 kref_get(&coll->kref);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001534 bio = bio_chain_clone(&rq_bio, &next_bio, &bp,
1535 op_size, GFP_ATOMIC);
1536 if (!bio) {
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001537 rbd_coll_end_req_index(rq, coll, cur_seg,
1538 -ENOMEM, op_size);
1539 goto next_seg;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001540 }
1541
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001542
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001543 /* init OSD command: write or read */
1544 if (do_write)
1545 rbd_req_write(rq, rbd_dev,
Josh Durgind1d25642011-12-05 14:03:05 -08001546 snapc,
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001547 ofs,
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001548 op_size, bio,
1549 coll, cur_seg);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001550 else
1551 rbd_req_read(rq, rbd_dev,
Alex Elderf84344f2012-08-31 17:29:51 -05001552 rbd_dev->mapping.snap_id,
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001553 ofs,
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001554 op_size, bio,
1555 coll, cur_seg);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001556
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001557next_seg:
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001558 size -= op_size;
1559 ofs += op_size;
1560
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001561 cur_seg++;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001562 rq_bio = next_bio;
1563 } while (size > 0);
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001564 kref_put(&coll->kref, rbd_coll_release);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001565
1566 if (bp)
1567 bio_pair_release(bp);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001568 spin_lock_irq(q->queue_lock);
Josh Durgind1d25642011-12-05 14:03:05 -08001569
1570 ceph_put_snap_context(snapc);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001571 }
1572}
1573
1574/*
1575 * a queue callback. Makes sure that we don't create a bio that spans across
1576 * multiple osd objects. One exception would be with a single page bios,
1577 * which we handle later at bio_chain_clone
1578 */
1579static int rbd_merge_bvec(struct request_queue *q, struct bvec_merge_data *bmd,
1580 struct bio_vec *bvec)
1581{
1582 struct rbd_device *rbd_dev = q->queuedata;
Alex Elder593a9e72012-02-07 12:03:37 -06001583 unsigned int chunk_sectors;
1584 sector_t sector;
1585 unsigned int bio_sectors;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001586 int max;
1587
Alex Elder593a9e72012-02-07 12:03:37 -06001588 chunk_sectors = 1 << (rbd_dev->header.obj_order - SECTOR_SHIFT);
1589 sector = bmd->bi_sector + get_start_sect(bmd->bi_bdev);
1590 bio_sectors = bmd->bi_size >> SECTOR_SHIFT;
1591
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001592 max = (chunk_sectors - ((sector & (chunk_sectors - 1))
Alex Elder593a9e72012-02-07 12:03:37 -06001593 + bio_sectors)) << SECTOR_SHIFT;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001594 if (max < 0)
1595 max = 0; /* bio_add cannot handle a negative return */
1596 if (max <= bvec->bv_len && bio_sectors == 0)
1597 return bvec->bv_len;
1598 return max;
1599}
1600
1601static void rbd_free_disk(struct rbd_device *rbd_dev)
1602{
1603 struct gendisk *disk = rbd_dev->disk;
1604
1605 if (!disk)
1606 return;
1607
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001608 if (disk->flags & GENHD_FL_UP)
1609 del_gendisk(disk);
1610 if (disk->queue)
1611 blk_cleanup_queue(disk->queue);
1612 put_disk(disk);
1613}
1614
1615/*
Alex Elder4156d992012-08-02 11:29:46 -05001616 * Read the complete header for the given rbd device.
1617 *
1618 * Returns a pointer to a dynamically-allocated buffer containing
1619 * the complete and validated header. Caller can pass the address
1620 * of a variable that will be filled in with the version of the
1621 * header object at the time it was read.
1622 *
1623 * Returns a pointer-coded errno if a failure occurs.
1624 */
1625static struct rbd_image_header_ondisk *
1626rbd_dev_v1_header_read(struct rbd_device *rbd_dev, u64 *version)
1627{
1628 struct rbd_image_header_ondisk *ondisk = NULL;
1629 u32 snap_count = 0;
1630 u64 names_size = 0;
1631 u32 want_count;
1632 int ret;
1633
1634 /*
1635 * The complete header will include an array of its 64-bit
1636 * snapshot ids, followed by the names of those snapshots as
1637 * a contiguous block of NUL-terminated strings. Note that
1638 * the number of snapshots could change by the time we read
1639 * it in, in which case we re-read it.
1640 */
1641 do {
1642 size_t size;
1643
1644 kfree(ondisk);
1645
1646 size = sizeof (*ondisk);
1647 size += snap_count * sizeof (struct rbd_image_snap_ondisk);
1648 size += names_size;
1649 ondisk = kmalloc(size, GFP_KERNEL);
1650 if (!ondisk)
1651 return ERR_PTR(-ENOMEM);
1652
1653 ret = rbd_req_sync_read(rbd_dev, CEPH_NOSNAP,
1654 rbd_dev->header_name,
1655 0, size,
1656 (char *) ondisk, version);
1657
1658 if (ret < 0)
1659 goto out_err;
1660 if (WARN_ON((size_t) ret < size)) {
1661 ret = -ENXIO;
1662 pr_warning("short header read for image %s"
1663 " (want %zd got %d)\n",
1664 rbd_dev->image_name, size, ret);
1665 goto out_err;
1666 }
1667 if (!rbd_dev_ondisk_valid(ondisk)) {
1668 ret = -ENXIO;
1669 pr_warning("invalid header for image %s\n",
1670 rbd_dev->image_name);
1671 goto out_err;
1672 }
1673
1674 names_size = le64_to_cpu(ondisk->snap_names_len);
1675 want_count = snap_count;
1676 snap_count = le32_to_cpu(ondisk->snap_count);
1677 } while (snap_count != want_count);
1678
1679 return ondisk;
1680
1681out_err:
1682 kfree(ondisk);
1683
1684 return ERR_PTR(ret);
1685}
1686
1687/*
1688 * reload the ondisk the header
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001689 */
1690static int rbd_read_header(struct rbd_device *rbd_dev,
1691 struct rbd_image_header *header)
1692{
Alex Elder4156d992012-08-02 11:29:46 -05001693 struct rbd_image_header_ondisk *ondisk;
1694 u64 ver = 0;
1695 int ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001696
Alex Elder4156d992012-08-02 11:29:46 -05001697 ondisk = rbd_dev_v1_header_read(rbd_dev, &ver);
1698 if (IS_ERR(ondisk))
1699 return PTR_ERR(ondisk);
1700 ret = rbd_header_from_disk(header, ondisk);
1701 if (ret >= 0)
1702 header->obj_version = ver;
1703 kfree(ondisk);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001704
Alex Elder4156d992012-08-02 11:29:46 -05001705 return ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001706}
1707
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001708static void __rbd_remove_all_snaps(struct rbd_device *rbd_dev)
1709{
1710 struct rbd_snap *snap;
Alex Eldera0593292012-07-19 09:09:27 -05001711 struct rbd_snap *next;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001712
Alex Eldera0593292012-07-19 09:09:27 -05001713 list_for_each_entry_safe(snap, next, &rbd_dev->snaps, node)
Alex Elder14e70852012-07-19 09:09:27 -05001714 __rbd_remove_snap_dev(snap);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001715}
1716
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001717/*
1718 * only read the first part of the ondisk header, without the snaps info
1719 */
Alex Elderb8136232012-07-25 09:32:41 -05001720static int __rbd_refresh_header(struct rbd_device *rbd_dev, u64 *hver)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001721{
1722 int ret;
1723 struct rbd_image_header h;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001724
1725 ret = rbd_read_header(rbd_dev, &h);
1726 if (ret < 0)
1727 return ret;
1728
Josh Durgina51aa0c2011-12-05 10:35:04 -08001729 down_write(&rbd_dev->header_rwsem);
1730
Sage Weil9db4b3e2011-04-19 22:49:06 -07001731 /* resized? */
Alex Elderf84344f2012-08-31 17:29:51 -05001732 if (rbd_dev->mapping.snap_id == CEPH_NOSNAP) {
Josh Durgin474ef7c2011-11-21 17:13:54 -08001733 sector_t size = (sector_t) h.image_size / SECTOR_SIZE;
1734
Alex Elder99c1f082012-08-30 14:42:15 -05001735 if (size != (sector_t) rbd_dev->mapping.size) {
1736 dout("setting size to %llu sectors",
1737 (unsigned long long) size);
1738 rbd_dev->mapping.size = (u64) size;
1739 set_capacity(rbd_dev->disk, size);
1740 }
Josh Durgin474ef7c2011-11-21 17:13:54 -08001741 }
Sage Weil9db4b3e2011-04-19 22:49:06 -07001742
Alex Elder849b4262012-07-09 21:04:24 -05001743 /* rbd_dev->header.object_prefix shouldn't change */
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001744 kfree(rbd_dev->header.snap_sizes);
Alex Elder849b4262012-07-09 21:04:24 -05001745 kfree(rbd_dev->header.snap_names);
Josh Durgind1d25642011-12-05 14:03:05 -08001746 /* osd requests may still refer to snapc */
1747 ceph_put_snap_context(rbd_dev->header.snapc);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001748
Alex Elderb8136232012-07-25 09:32:41 -05001749 if (hver)
1750 *hver = h.obj_version;
Josh Durgina71b8912011-12-05 18:10:44 -08001751 rbd_dev->header.obj_version = h.obj_version;
Josh Durgin93a24e02011-12-05 10:41:28 -08001752 rbd_dev->header.image_size = h.image_size;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001753 rbd_dev->header.snapc = h.snapc;
1754 rbd_dev->header.snap_names = h.snap_names;
1755 rbd_dev->header.snap_sizes = h.snap_sizes;
Alex Elder849b4262012-07-09 21:04:24 -05001756 /* Free the extra copy of the object prefix */
1757 WARN_ON(strcmp(rbd_dev->header.object_prefix, h.object_prefix));
1758 kfree(h.object_prefix);
1759
Alex Elder304f6802012-08-31 17:29:52 -05001760 ret = rbd_dev_snaps_update(rbd_dev);
1761 if (!ret)
1762 ret = rbd_dev_snaps_register(rbd_dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001763
Josh Durginc6666012011-11-21 17:11:12 -08001764 up_write(&rbd_dev->header_rwsem);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001765
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001766 return ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001767}
1768
Alex Elder1fe5e992012-07-25 09:32:41 -05001769static int rbd_refresh_header(struct rbd_device *rbd_dev, u64 *hver)
1770{
1771 int ret;
1772
1773 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
1774 ret = __rbd_refresh_header(rbd_dev, hver);
1775 mutex_unlock(&ctl_mutex);
1776
1777 return ret;
1778}
1779
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001780static int rbd_init_disk(struct rbd_device *rbd_dev)
1781{
1782 struct gendisk *disk;
1783 struct request_queue *q;
Alex Elder593a9e72012-02-07 12:03:37 -06001784 u64 segment_size;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001785
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001786 /* create gendisk info */
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001787 disk = alloc_disk(RBD_MINORS_PER_MAJOR);
1788 if (!disk)
Alex Elder1fcdb8a2012-08-29 17:11:06 -05001789 return -ENOMEM;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001790
Alex Elderf0f8cef2012-01-29 13:57:44 -06001791 snprintf(disk->disk_name, sizeof(disk->disk_name), RBD_DRV_NAME "%d",
Alex Elderde71a292012-07-03 16:01:19 -05001792 rbd_dev->dev_id);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001793 disk->major = rbd_dev->major;
1794 disk->first_minor = 0;
1795 disk->fops = &rbd_bd_ops;
1796 disk->private_data = rbd_dev;
1797
1798 /* init rq */
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001799 q = blk_init_queue(rbd_rq_fn, &rbd_dev->lock);
1800 if (!q)
1801 goto out_disk;
Josh Durgin029bcbd2011-07-22 11:35:23 -07001802
Alex Elder593a9e72012-02-07 12:03:37 -06001803 /* We use the default size, but let's be explicit about it. */
1804 blk_queue_physical_block_size(q, SECTOR_SIZE);
1805
Josh Durgin029bcbd2011-07-22 11:35:23 -07001806 /* set io sizes to object size */
Alex Elder593a9e72012-02-07 12:03:37 -06001807 segment_size = rbd_obj_bytes(&rbd_dev->header);
1808 blk_queue_max_hw_sectors(q, segment_size / SECTOR_SIZE);
1809 blk_queue_max_segment_size(q, segment_size);
1810 blk_queue_io_min(q, segment_size);
1811 blk_queue_io_opt(q, segment_size);
Josh Durgin029bcbd2011-07-22 11:35:23 -07001812
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001813 blk_queue_merge_bvec(q, rbd_merge_bvec);
1814 disk->queue = q;
1815
1816 q->queuedata = rbd_dev;
1817
1818 rbd_dev->disk = disk;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001819
Alex Elder12f02942012-08-29 17:11:07 -05001820 set_capacity(rbd_dev->disk, rbd_dev->mapping.size / SECTOR_SIZE);
1821
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001822 return 0;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001823out_disk:
1824 put_disk(disk);
Alex Elder1fcdb8a2012-08-29 17:11:06 -05001825
1826 return -ENOMEM;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001827}
1828
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001829/*
1830 sysfs
1831*/
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001832
Alex Elder593a9e72012-02-07 12:03:37 -06001833static struct rbd_device *dev_to_rbd_dev(struct device *dev)
1834{
1835 return container_of(dev, struct rbd_device, dev);
1836}
1837
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001838static ssize_t rbd_size_show(struct device *dev,
1839 struct device_attribute *attr, char *buf)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001840{
Alex Elder593a9e72012-02-07 12:03:37 -06001841 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Josh Durgina51aa0c2011-12-05 10:35:04 -08001842 sector_t size;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001843
Josh Durgina51aa0c2011-12-05 10:35:04 -08001844 down_read(&rbd_dev->header_rwsem);
1845 size = get_capacity(rbd_dev->disk);
1846 up_read(&rbd_dev->header_rwsem);
1847
1848 return sprintf(buf, "%llu\n", (unsigned long long) size * SECTOR_SIZE);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001849}
1850
Alex Elder34b13182012-07-13 20:35:12 -05001851/*
1852 * Note this shows the features for whatever's mapped, which is not
1853 * necessarily the base image.
1854 */
1855static ssize_t rbd_features_show(struct device *dev,
1856 struct device_attribute *attr, char *buf)
1857{
1858 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
1859
1860 return sprintf(buf, "0x%016llx\n",
1861 (unsigned long long) rbd_dev->mapping.features);
1862}
1863
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001864static ssize_t rbd_major_show(struct device *dev,
1865 struct device_attribute *attr, char *buf)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001866{
Alex Elder593a9e72012-02-07 12:03:37 -06001867 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001868
1869 return sprintf(buf, "%d\n", rbd_dev->major);
1870}
1871
1872static ssize_t rbd_client_id_show(struct device *dev,
1873 struct device_attribute *attr, char *buf)
1874{
Alex Elder593a9e72012-02-07 12:03:37 -06001875 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001876
Alex Elder1dbb4392012-01-24 10:08:37 -06001877 return sprintf(buf, "client%lld\n",
1878 ceph_client_id(rbd_dev->rbd_client->client));
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001879}
1880
1881static ssize_t rbd_pool_show(struct device *dev,
1882 struct device_attribute *attr, char *buf)
1883{
Alex Elder593a9e72012-02-07 12:03:37 -06001884 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001885
1886 return sprintf(buf, "%s\n", rbd_dev->pool_name);
1887}
1888
Alex Elder9bb2f332012-07-12 10:46:35 -05001889static ssize_t rbd_pool_id_show(struct device *dev,
1890 struct device_attribute *attr, char *buf)
1891{
1892 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
1893
1894 return sprintf(buf, "%d\n", rbd_dev->pool_id);
1895}
1896
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001897static ssize_t rbd_name_show(struct device *dev,
1898 struct device_attribute *attr, char *buf)
1899{
Alex Elder593a9e72012-02-07 12:03:37 -06001900 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001901
Alex Elder0bed54d2012-07-03 16:01:18 -05001902 return sprintf(buf, "%s\n", rbd_dev->image_name);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001903}
1904
Alex Elder589d30e2012-07-10 20:30:11 -05001905static ssize_t rbd_image_id_show(struct device *dev,
1906 struct device_attribute *attr, char *buf)
1907{
1908 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
1909
1910 return sprintf(buf, "%s\n", rbd_dev->image_id);
1911}
1912
Alex Elder34b13182012-07-13 20:35:12 -05001913/*
1914 * Shows the name of the currently-mapped snapshot (or
1915 * RBD_SNAP_HEAD_NAME for the base image).
1916 */
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001917static ssize_t rbd_snap_show(struct device *dev,
1918 struct device_attribute *attr,
1919 char *buf)
1920{
Alex Elder593a9e72012-02-07 12:03:37 -06001921 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001922
Alex Elderf84344f2012-08-31 17:29:51 -05001923 return sprintf(buf, "%s\n", rbd_dev->mapping.snap_name);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001924}
1925
1926static ssize_t rbd_image_refresh(struct device *dev,
1927 struct device_attribute *attr,
1928 const char *buf,
1929 size_t size)
1930{
Alex Elder593a9e72012-02-07 12:03:37 -06001931 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Alex Elderb8136232012-07-25 09:32:41 -05001932 int ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001933
Alex Elder1fe5e992012-07-25 09:32:41 -05001934 ret = rbd_refresh_header(rbd_dev, NULL);
Alex Elderb8136232012-07-25 09:32:41 -05001935
1936 return ret < 0 ? ret : size;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001937}
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001938
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001939static DEVICE_ATTR(size, S_IRUGO, rbd_size_show, NULL);
Alex Elder34b13182012-07-13 20:35:12 -05001940static DEVICE_ATTR(features, S_IRUGO, rbd_features_show, NULL);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001941static DEVICE_ATTR(major, S_IRUGO, rbd_major_show, NULL);
1942static DEVICE_ATTR(client_id, S_IRUGO, rbd_client_id_show, NULL);
1943static DEVICE_ATTR(pool, S_IRUGO, rbd_pool_show, NULL);
Alex Elder9bb2f332012-07-12 10:46:35 -05001944static DEVICE_ATTR(pool_id, S_IRUGO, rbd_pool_id_show, NULL);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001945static DEVICE_ATTR(name, S_IRUGO, rbd_name_show, NULL);
Alex Elder589d30e2012-07-10 20:30:11 -05001946static DEVICE_ATTR(image_id, S_IRUGO, rbd_image_id_show, NULL);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001947static DEVICE_ATTR(refresh, S_IWUSR, NULL, rbd_image_refresh);
1948static DEVICE_ATTR(current_snap, S_IRUGO, rbd_snap_show, NULL);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001949
1950static struct attribute *rbd_attrs[] = {
1951 &dev_attr_size.attr,
Alex Elder34b13182012-07-13 20:35:12 -05001952 &dev_attr_features.attr,
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001953 &dev_attr_major.attr,
1954 &dev_attr_client_id.attr,
1955 &dev_attr_pool.attr,
Alex Elder9bb2f332012-07-12 10:46:35 -05001956 &dev_attr_pool_id.attr,
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001957 &dev_attr_name.attr,
Alex Elder589d30e2012-07-10 20:30:11 -05001958 &dev_attr_image_id.attr,
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001959 &dev_attr_current_snap.attr,
1960 &dev_attr_refresh.attr,
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001961 NULL
1962};
1963
1964static struct attribute_group rbd_attr_group = {
1965 .attrs = rbd_attrs,
1966};
1967
1968static const struct attribute_group *rbd_attr_groups[] = {
1969 &rbd_attr_group,
1970 NULL
1971};
1972
/* Release callback for rbd_device_type; there is nothing to do here */
static void rbd_sysfs_dev_release(struct device *dev)
{
	/* intentionally empty */
}
1976
1977static struct device_type rbd_device_type = {
1978 .name = "rbd",
1979 .groups = rbd_attr_groups,
1980 .release = rbd_sysfs_dev_release,
1981};
1982
1983
1984/*
1985 sysfs - snapshots
1986*/
1987
1988static ssize_t rbd_snap_size_show(struct device *dev,
1989 struct device_attribute *attr,
1990 char *buf)
1991{
1992 struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
1993
Josh Durgin3591538f2011-12-05 18:25:13 -08001994 return sprintf(buf, "%llu\n", (unsigned long long)snap->size);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001995}
1996
1997static ssize_t rbd_snap_id_show(struct device *dev,
1998 struct device_attribute *attr,
1999 char *buf)
2000{
2001 struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
2002
Josh Durgin3591538f2011-12-05 18:25:13 -08002003 return sprintf(buf, "%llu\n", (unsigned long long)snap->id);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002004}
2005
Alex Elder34b13182012-07-13 20:35:12 -05002006static ssize_t rbd_snap_features_show(struct device *dev,
2007 struct device_attribute *attr,
2008 char *buf)
2009{
2010 struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
2011
2012 return sprintf(buf, "0x%016llx\n",
2013 (unsigned long long) snap->features);
2014}
2015
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002016static DEVICE_ATTR(snap_size, S_IRUGO, rbd_snap_size_show, NULL);
2017static DEVICE_ATTR(snap_id, S_IRUGO, rbd_snap_id_show, NULL);
Alex Elder34b13182012-07-13 20:35:12 -05002018static DEVICE_ATTR(snap_features, S_IRUGO, rbd_snap_features_show, NULL);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002019
2020static struct attribute *rbd_snap_attrs[] = {
2021 &dev_attr_snap_size.attr,
2022 &dev_attr_snap_id.attr,
Alex Elder34b13182012-07-13 20:35:12 -05002023 &dev_attr_snap_features.attr,
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002024 NULL,
2025};
2026
2027static struct attribute_group rbd_snap_attr_group = {
2028 .attrs = rbd_snap_attrs,
2029};
2030
2031static void rbd_snap_dev_release(struct device *dev)
2032{
2033 struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
2034 kfree(snap->name);
2035 kfree(snap);
2036}
2037
2038static const struct attribute_group *rbd_snap_attr_groups[] = {
2039 &rbd_snap_attr_group,
2040 NULL
2041};
2042
2043static struct device_type rbd_snap_device_type = {
2044 .groups = rbd_snap_attr_groups,
2045 .release = rbd_snap_dev_release,
2046};
2047
Alex Elder304f6802012-08-31 17:29:52 -05002048static bool rbd_snap_registered(struct rbd_snap *snap)
2049{
2050 bool ret = snap->dev.type == &rbd_snap_device_type;
2051 bool reg = device_is_registered(&snap->dev);
2052
2053 rbd_assert(!ret ^ reg);
2054
2055 return ret;
2056}
2057
Alex Elder14e70852012-07-19 09:09:27 -05002058static void __rbd_remove_snap_dev(struct rbd_snap *snap)
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002059{
2060 list_del(&snap->node);
Alex Elder304f6802012-08-31 17:29:52 -05002061 if (device_is_registered(&snap->dev))
2062 device_unregister(&snap->dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002063}
2064
Alex Elder14e70852012-07-19 09:09:27 -05002065static int rbd_register_snap_dev(struct rbd_snap *snap,
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002066 struct device *parent)
2067{
2068 struct device *dev = &snap->dev;
2069 int ret;
2070
2071 dev->type = &rbd_snap_device_type;
2072 dev->parent = parent;
2073 dev->release = rbd_snap_dev_release;
2074 dev_set_name(dev, "snap_%s", snap->name);
Alex Elder304f6802012-08-31 17:29:52 -05002075 dout("%s: registering device for snapshot %s\n", __func__, snap->name);
2076
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002077 ret = device_register(dev);
2078
2079 return ret;
2080}
2081
Alex Elder4e891e02012-07-10 20:30:10 -05002082static struct rbd_snap *__rbd_add_snap_dev(struct rbd_device *rbd_dev,
Alex Elderc8d18422012-07-10 20:30:11 -05002083 const char *snap_name,
Alex Elder34b13182012-07-13 20:35:12 -05002084 u64 snap_id, u64 snap_size,
2085 u64 snap_features)
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002086{
Alex Elder4e891e02012-07-10 20:30:10 -05002087 struct rbd_snap *snap;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002088 int ret;
Alex Elder4e891e02012-07-10 20:30:10 -05002089
2090 snap = kzalloc(sizeof (*snap), GFP_KERNEL);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002091 if (!snap)
Alex Elder4e891e02012-07-10 20:30:10 -05002092 return ERR_PTR(-ENOMEM);
2093
2094 ret = -ENOMEM;
Alex Elderc8d18422012-07-10 20:30:11 -05002095 snap->name = kstrdup(snap_name, GFP_KERNEL);
Alex Elder4e891e02012-07-10 20:30:10 -05002096 if (!snap->name)
2097 goto err;
2098
Alex Elderc8d18422012-07-10 20:30:11 -05002099 snap->id = snap_id;
2100 snap->size = snap_size;
Alex Elder34b13182012-07-13 20:35:12 -05002101 snap->features = snap_features;
Alex Elder4e891e02012-07-10 20:30:10 -05002102
2103 return snap;
2104
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002105err:
2106 kfree(snap->name);
2107 kfree(snap);
Alex Elder4e891e02012-07-10 20:30:10 -05002108
2109 return ERR_PTR(ret);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002110}
2111
Alex Eldercd892122012-07-03 16:01:19 -05002112static char *rbd_dev_v1_snap_info(struct rbd_device *rbd_dev, u32 which,
2113 u64 *snap_size, u64 *snap_features)
2114{
2115 char *snap_name;
2116
2117 rbd_assert(which < rbd_dev->header.snapc->num_snaps);
2118
2119 *snap_size = rbd_dev->header.snap_sizes[which];
2120 *snap_features = 0; /* No features for v1 */
2121
2122 /* Skip over names until we find the one we are looking for */
2123
2124 snap_name = rbd_dev->header.snap_names;
2125 while (which--)
2126 snap_name += strlen(snap_name) + 1;
2127
2128 return snap_name;
2129}
2130
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002131/*
Alex Elder9d475de2012-07-03 16:01:19 -05002132 * Get the size and object order for an image snapshot, or if
2133 * snap_id is CEPH_NOSNAP, gets this information for the base
2134 * image.
2135 */
2136static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id,
2137 u8 *order, u64 *snap_size)
2138{
2139 __le64 snapid = cpu_to_le64(snap_id);
2140 int ret;
2141 struct {
2142 u8 order;
2143 __le64 size;
2144 } __attribute__ ((packed)) size_buf = { 0 };
2145
2146 ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name,
2147 "rbd", "get_size",
2148 (char *) &snapid, sizeof (snapid),
2149 (char *) &size_buf, sizeof (size_buf),
2150 CEPH_OSD_FLAG_READ, NULL);
2151 dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
2152 if (ret < 0)
2153 return ret;
2154
2155 *order = size_buf.order;
2156 *snap_size = le64_to_cpu(size_buf.size);
2157
2158 dout(" snap_id 0x%016llx order = %u, snap_size = %llu\n",
2159 (unsigned long long) snap_id, (unsigned int) *order,
2160 (unsigned long long) *snap_size);
2161
2162 return 0;
2163}
2164
2165static int rbd_dev_v2_image_size(struct rbd_device *rbd_dev)
2166{
2167 return _rbd_dev_v2_snap_size(rbd_dev, CEPH_NOSNAP,
2168 &rbd_dev->header.obj_order,
2169 &rbd_dev->header.image_size);
2170}
2171
Alex Elder1e130192012-07-03 16:01:19 -05002172static int rbd_dev_v2_object_prefix(struct rbd_device *rbd_dev)
2173{
2174 void *reply_buf;
2175 int ret;
2176 void *p;
2177
2178 reply_buf = kzalloc(RBD_OBJ_PREFIX_LEN_MAX, GFP_KERNEL);
2179 if (!reply_buf)
2180 return -ENOMEM;
2181
2182 ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name,
2183 "rbd", "get_object_prefix",
2184 NULL, 0,
2185 reply_buf, RBD_OBJ_PREFIX_LEN_MAX,
2186 CEPH_OSD_FLAG_READ, NULL);
2187 dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
2188 if (ret < 0)
2189 goto out;
2190
2191 p = reply_buf;
2192 rbd_dev->header.object_prefix = ceph_extract_encoded_string(&p,
2193 p + RBD_OBJ_PREFIX_LEN_MAX,
2194 NULL, GFP_NOIO);
2195
2196 if (IS_ERR(rbd_dev->header.object_prefix)) {
2197 ret = PTR_ERR(rbd_dev->header.object_prefix);
2198 rbd_dev->header.object_prefix = NULL;
2199 } else {
2200 dout(" object_prefix = %s\n", rbd_dev->header.object_prefix);
2201 }
2202
2203out:
2204 kfree(reply_buf);
2205
2206 return ret;
2207}
2208
Alex Elder9d475de2012-07-03 16:01:19 -05002209/*
Alex Elder35938152012-08-02 11:29:46 -05002210 * Scan the rbd device's current snapshot list and compare it to the
2211 * newly-received snapshot context. Remove any existing snapshots
2212 * not present in the new snapshot context. Add a new snapshot for
2213 * any snaphots in the snapshot context not in the current list.
2214 * And verify there are no changes to snapshots we already know
2215 * about.
2216 *
2217 * Assumes the snapshots in the snapshot context are sorted by
2218 * snapshot id, highest id first. (Snapshots in the rbd_dev's list
2219 * are also maintained in that order.)
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002220 */
Alex Elder304f6802012-08-31 17:29:52 -05002221static int rbd_dev_snaps_update(struct rbd_device *rbd_dev)
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002222{
Alex Elder35938152012-08-02 11:29:46 -05002223 struct ceph_snap_context *snapc = rbd_dev->header.snapc;
2224 const u32 snap_count = snapc->num_snaps;
Alex Elder35938152012-08-02 11:29:46 -05002225 struct list_head *head = &rbd_dev->snaps;
2226 struct list_head *links = head->next;
2227 u32 index = 0;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002228
Alex Elder9fcbb802012-08-23 23:48:49 -05002229 dout("%s: snap count is %u\n", __func__, (unsigned int) snap_count);
Alex Elder35938152012-08-02 11:29:46 -05002230 while (index < snap_count || links != head) {
2231 u64 snap_id;
2232 struct rbd_snap *snap;
Alex Eldercd892122012-07-03 16:01:19 -05002233 char *snap_name;
2234 u64 snap_size = 0;
2235 u64 snap_features = 0;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002236
Alex Elder35938152012-08-02 11:29:46 -05002237 snap_id = index < snap_count ? snapc->snaps[index]
2238 : CEPH_NOSNAP;
2239 snap = links != head ? list_entry(links, struct rbd_snap, node)
2240 : NULL;
Alex Elderaafb2302012-09-06 16:00:54 -05002241 rbd_assert(!snap || snap->id != CEPH_NOSNAP);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002242
Alex Elder35938152012-08-02 11:29:46 -05002243 if (snap_id == CEPH_NOSNAP || (snap && snap->id > snap_id)) {
2244 struct list_head *next = links->next;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002245
Alex Elder35938152012-08-02 11:29:46 -05002246 /* Existing snapshot not in the new snap context */
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002247
Alex Elderf84344f2012-08-31 17:29:51 -05002248 if (rbd_dev->mapping.snap_id == snap->id)
2249 rbd_dev->mapping.snap_exists = false;
Alex Elder35938152012-08-02 11:29:46 -05002250 __rbd_remove_snap_dev(snap);
Alex Elder9fcbb802012-08-23 23:48:49 -05002251 dout("%ssnap id %llu has been removed\n",
Alex Elderf84344f2012-08-31 17:29:51 -05002252 rbd_dev->mapping.snap_id == snap->id ?
2253 "mapped " : "",
Alex Elder9fcbb802012-08-23 23:48:49 -05002254 (unsigned long long) snap->id);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002255
Alex Elder35938152012-08-02 11:29:46 -05002256 /* Done with this list entry; advance */
2257
2258 links = next;
2259 continue;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002260 }
Alex Elder35938152012-08-02 11:29:46 -05002261
Alex Eldercd892122012-07-03 16:01:19 -05002262 snap_name = rbd_dev_v1_snap_info(rbd_dev, index,
2263 &snap_size, &snap_features);
2264 if (IS_ERR(snap_name))
2265 return PTR_ERR(snap_name);
2266
Alex Elder9fcbb802012-08-23 23:48:49 -05002267 dout("entry %u: snap_id = %llu\n", (unsigned int) snap_count,
2268 (unsigned long long) snap_id);
Alex Elder35938152012-08-02 11:29:46 -05002269 if (!snap || (snap_id != CEPH_NOSNAP && snap->id < snap_id)) {
2270 struct rbd_snap *new_snap;
2271
2272 /* We haven't seen this snapshot before */
2273
Alex Elderc8d18422012-07-10 20:30:11 -05002274 new_snap = __rbd_add_snap_dev(rbd_dev, snap_name,
Alex Eldercd892122012-07-03 16:01:19 -05002275 snap_id, snap_size, snap_features);
Alex Elder9fcbb802012-08-23 23:48:49 -05002276 if (IS_ERR(new_snap)) {
2277 int err = PTR_ERR(new_snap);
2278
2279 dout(" failed to add dev, error %d\n", err);
2280
2281 return err;
2282 }
Alex Elder35938152012-08-02 11:29:46 -05002283
2284 /* New goes before existing, or at end of list */
2285
Alex Elder9fcbb802012-08-23 23:48:49 -05002286 dout(" added dev%s\n", snap ? "" : " at end\n");
Alex Elder35938152012-08-02 11:29:46 -05002287 if (snap)
2288 list_add_tail(&new_snap->node, &snap->node);
2289 else
Alex Elder523f3252012-08-30 00:16:37 -05002290 list_add_tail(&new_snap->node, head);
Alex Elder35938152012-08-02 11:29:46 -05002291 } else {
2292 /* Already have this one */
2293
Alex Elder9fcbb802012-08-23 23:48:49 -05002294 dout(" already present\n");
2295
Alex Eldercd892122012-07-03 16:01:19 -05002296 rbd_assert(snap->size == snap_size);
Alex Elderaafb2302012-09-06 16:00:54 -05002297 rbd_assert(!strcmp(snap->name, snap_name));
Alex Eldercd892122012-07-03 16:01:19 -05002298 rbd_assert(snap->features == snap_features);
Alex Elder35938152012-08-02 11:29:46 -05002299
2300 /* Done with this list entry; advance */
2301
2302 links = links->next;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002303 }
Alex Elder35938152012-08-02 11:29:46 -05002304
2305 /* Advance to the next entry in the snapshot context */
2306
2307 index++;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002308 }
Alex Elder9fcbb802012-08-23 23:48:49 -05002309 dout("%s: done\n", __func__);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002310
2311 return 0;
2312}
2313
Alex Elder304f6802012-08-31 17:29:52 -05002314/*
2315 * Scan the list of snapshots and register the devices for any that
2316 * have not already been registered.
2317 */
2318static int rbd_dev_snaps_register(struct rbd_device *rbd_dev)
2319{
2320 struct rbd_snap *snap;
2321 int ret = 0;
2322
2323 dout("%s called\n", __func__);
Alex Elder86ff77b2012-08-31 17:29:53 -05002324 if (WARN_ON(!device_is_registered(&rbd_dev->dev)))
2325 return -EIO;
Alex Elder304f6802012-08-31 17:29:52 -05002326
2327 list_for_each_entry(snap, &rbd_dev->snaps, node) {
2328 if (!rbd_snap_registered(snap)) {
2329 ret = rbd_register_snap_dev(snap, &rbd_dev->dev);
2330 if (ret < 0)
2331 break;
2332 }
2333 }
2334 dout("%s: returning %d\n", __func__, ret);
2335
2336 return ret;
2337}
2338
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002339static int rbd_bus_add_dev(struct rbd_device *rbd_dev)
2340{
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002341 struct device *dev;
Alex Eldercd789ab2012-08-30 00:16:38 -05002342 int ret;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002343
2344 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002345
Alex Eldercd789ab2012-08-30 00:16:38 -05002346 dev = &rbd_dev->dev;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002347 dev->bus = &rbd_bus_type;
2348 dev->type = &rbd_device_type;
2349 dev->parent = &rbd_root_dev;
2350 dev->release = rbd_dev_release;
Alex Elderde71a292012-07-03 16:01:19 -05002351 dev_set_name(dev, "%d", rbd_dev->dev_id);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002352 ret = device_register(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002353
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002354 mutex_unlock(&ctl_mutex);
Alex Eldercd789ab2012-08-30 00:16:38 -05002355
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002356 return ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002357}
2358
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002359static void rbd_bus_del_dev(struct rbd_device *rbd_dev)
2360{
2361 device_unregister(&rbd_dev->dev);
2362}
2363
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07002364static int rbd_init_watch_dev(struct rbd_device *rbd_dev)
2365{
2366 int ret, rc;
2367
2368 do {
Alex Elder0e6f3222012-07-25 09:32:40 -05002369 ret = rbd_req_sync_watch(rbd_dev);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07002370 if (ret == -ERANGE) {
Alex Elder1fe5e992012-07-25 09:32:41 -05002371 rc = rbd_refresh_header(rbd_dev, NULL);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07002372 if (rc < 0)
2373 return rc;
2374 }
2375 } while (ret == -ERANGE);
2376
2377 return ret;
2378}
2379
Alex Eldere2839302012-08-29 17:11:06 -05002380static atomic64_t rbd_dev_id_max = ATOMIC64_INIT(0);
Alex Elder1ddbe942012-01-29 13:57:44 -06002381
2382/*
Alex Elder499afd52012-02-02 08:13:29 -06002383 * Get a unique rbd identifier for the given new rbd_dev, and add
2384 * the rbd_dev to the global list. The minimum rbd id is 1.
Alex Elder1ddbe942012-01-29 13:57:44 -06002385 */
Alex Eldere2839302012-08-29 17:11:06 -05002386static void rbd_dev_id_get(struct rbd_device *rbd_dev)
Alex Elderb7f23c32012-01-29 13:57:43 -06002387{
Alex Eldere2839302012-08-29 17:11:06 -05002388 rbd_dev->dev_id = atomic64_inc_return(&rbd_dev_id_max);
Alex Elder499afd52012-02-02 08:13:29 -06002389
2390 spin_lock(&rbd_dev_list_lock);
2391 list_add_tail(&rbd_dev->node, &rbd_dev_list);
2392 spin_unlock(&rbd_dev_list_lock);
Alex Eldere2839302012-08-29 17:11:06 -05002393 dout("rbd_dev %p given dev id %llu\n", rbd_dev,
2394 (unsigned long long) rbd_dev->dev_id);
Alex Elder1ddbe942012-01-29 13:57:44 -06002395}
Alex Elderb7f23c32012-01-29 13:57:43 -06002396
Alex Elder1ddbe942012-01-29 13:57:44 -06002397/*
Alex Elder499afd52012-02-02 08:13:29 -06002398 * Remove an rbd_dev from the global list, and record that its
2399 * identifier is no longer in use.
Alex Elder1ddbe942012-01-29 13:57:44 -06002400 */
Alex Eldere2839302012-08-29 17:11:06 -05002401static void rbd_dev_id_put(struct rbd_device *rbd_dev)
Alex Elder1ddbe942012-01-29 13:57:44 -06002402{
Alex Elderd184f6b2012-01-29 13:57:44 -06002403 struct list_head *tmp;
Alex Elderde71a292012-07-03 16:01:19 -05002404 int rbd_id = rbd_dev->dev_id;
Alex Elderd184f6b2012-01-29 13:57:44 -06002405 int max_id;
2406
Alex Elderaafb2302012-09-06 16:00:54 -05002407 rbd_assert(rbd_id > 0);
Alex Elder499afd52012-02-02 08:13:29 -06002408
Alex Eldere2839302012-08-29 17:11:06 -05002409 dout("rbd_dev %p released dev id %llu\n", rbd_dev,
2410 (unsigned long long) rbd_dev->dev_id);
Alex Elder499afd52012-02-02 08:13:29 -06002411 spin_lock(&rbd_dev_list_lock);
2412 list_del_init(&rbd_dev->node);
Alex Elderd184f6b2012-01-29 13:57:44 -06002413
2414 /*
2415 * If the id being "put" is not the current maximum, there
2416 * is nothing special we need to do.
2417 */
Alex Eldere2839302012-08-29 17:11:06 -05002418 if (rbd_id != atomic64_read(&rbd_dev_id_max)) {
Alex Elderd184f6b2012-01-29 13:57:44 -06002419 spin_unlock(&rbd_dev_list_lock);
2420 return;
2421 }
2422
2423 /*
2424 * We need to update the current maximum id. Search the
2425 * list to find out what it is. We're more likely to find
2426 * the maximum at the end, so search the list backward.
2427 */
2428 max_id = 0;
2429 list_for_each_prev(tmp, &rbd_dev_list) {
2430 struct rbd_device *rbd_dev;
2431
2432 rbd_dev = list_entry(tmp, struct rbd_device, node);
2433 if (rbd_id > max_id)
2434 max_id = rbd_id;
2435 }
Alex Elder499afd52012-02-02 08:13:29 -06002436 spin_unlock(&rbd_dev_list_lock);
Alex Elderb7f23c32012-01-29 13:57:43 -06002437
Alex Elder1ddbe942012-01-29 13:57:44 -06002438 /*
Alex Eldere2839302012-08-29 17:11:06 -05002439 * The max id could have been updated by rbd_dev_id_get(), in
Alex Elderd184f6b2012-01-29 13:57:44 -06002440 * which case it now accurately reflects the new maximum.
2441 * Be careful not to overwrite the maximum value in that
2442 * case.
Alex Elder1ddbe942012-01-29 13:57:44 -06002443 */
Alex Eldere2839302012-08-29 17:11:06 -05002444 atomic64_cmpxchg(&rbd_dev_id_max, rbd_id, max_id);
2445 dout(" max dev id has been reset\n");
Alex Elderb7f23c32012-01-29 13:57:43 -06002446}
2447
/*
 * Advance *buf past any leading white space so that it points at the
 * first non-space character (or at the terminating '\0' if there is
 * none), and return the length of the token that starts there, i.e.
 * the number of consecutive non-white-space characters.  The string
 * at *buf must be '\0'-terminated.
 */
static inline size_t next_token(const char **buf)
{
	/*
	 * The characters for which isspace() is nonzero in the "C"
	 * and "POSIX" locales.
	 */
	static const char whitespace[] = " \f\n\r\t\v";
	const char *start = *buf + strspn(*buf, whitespace);

	*buf = start;

	return strcspn(start, whitespace);
}
2466
/*
 * Locate the next token in *buf and, provided it fits, copy it into
 * the caller's token buffer with a terminating '\0'.  A token fits
 * when its length is strictly less than token_size; otherwise the
 * token buffer is left untouched.  The string at *buf must be
 * '\0'-terminated on entry.
 *
 * Returns the length of the token (excluding the '\0').  A result of
 * 0 means no token was found; a result >= token_size means the token
 * was too long to be copied.
 *
 * In every case *buf is moved to just past the end of the token.
 */
static inline size_t copy_token(const char **buf,
				char *token,
				size_t token_size)
{
	size_t token_len = next_token(buf);
	const char *start = *buf;

	/* Consume the token whether or not we can store it */
	*buf = start + token_len;

	if (token_len >= token_size)
		return token_len;

	memcpy(token, start, token_len);
	token[token_len] = '\0';

	return token_len;
}
2496
2497/*
Alex Elderea3352f2012-07-09 21:04:23 -05002498 * Finds the next token in *buf, dynamically allocates a buffer big
2499 * enough to hold a copy of it, and copies the token into the new
2500 * buffer. The copy is guaranteed to be terminated with '\0'. Note
2501 * that a duplicate buffer is created even for a zero-length token.
2502 *
2503 * Returns a pointer to the newly-allocated duplicate, or a null
2504 * pointer if memory for the duplicate was not available. If
2505 * the lenp argument is a non-null pointer, the length of the token
2506 * (not including the '\0') is returned in *lenp.
2507 *
2508 * If successful, the *buf pointer will be updated to point beyond
2509 * the end of the found token.
2510 *
2511 * Note: uses GFP_KERNEL for allocation.
2512 */
2513static inline char *dup_token(const char **buf, size_t *lenp)
2514{
2515 char *dup;
2516 size_t len;
2517
2518 len = next_token(buf);
2519 dup = kmalloc(len + 1, GFP_KERNEL);
2520 if (!dup)
2521 return NULL;
2522
2523 memcpy(dup, *buf, len);
2524 *(dup + len) = '\0';
2525 *buf += len;
2526
2527 if (lenp)
2528 *lenp = len;
2529
2530 return dup;
2531}
2532
2533/*
Alex Elder3feeb8942012-08-31 17:29:52 -05002534 * This fills in the pool_name, image_name, image_name_len, rbd_dev,
2535 * rbd_md_name, and name fields of the given rbd_dev, based on the
2536 * list of monitor addresses and other options provided via
2537 * /sys/bus/rbd/add. Returns a pointer to a dynamically-allocated
2538 * copy of the snapshot name to map if successful, or a
2539 * pointer-coded error otherwise.
Alex Elderd22f76e2012-07-12 10:46:35 -05002540 *
2541 * Note: rbd_dev is assumed to have been initially zero-filled.
Alex Eldera725f65e2012-02-02 08:13:30 -06002542 */
Alex Elder3feeb8942012-08-31 17:29:52 -05002543static char *rbd_add_parse_args(struct rbd_device *rbd_dev,
2544 const char *buf,
2545 const char **mon_addrs,
2546 size_t *mon_addrs_size,
2547 char *options,
2548 size_t options_size)
Alex Eldera725f65e2012-02-02 08:13:30 -06002549{
Alex Elderd22f76e2012-07-12 10:46:35 -05002550 size_t len;
Alex Elder3feeb8942012-08-31 17:29:52 -05002551 char *err_ptr = ERR_PTR(-EINVAL);
2552 char *snap_name;
Alex Eldere28fff262012-02-02 08:13:30 -06002553
2554 /* The first four tokens are required */
2555
Alex Elder7ef32142012-02-02 08:13:30 -06002556 len = next_token(&buf);
2557 if (!len)
Alex Elder3feeb8942012-08-31 17:29:52 -05002558 return err_ptr;
Alex Elder5214ecc2012-02-02 08:13:30 -06002559 *mon_addrs_size = len + 1;
Alex Elder7ef32142012-02-02 08:13:30 -06002560 *mon_addrs = buf;
2561
2562 buf += len;
Alex Eldera725f65e2012-02-02 08:13:30 -06002563
Alex Eldere28fff262012-02-02 08:13:30 -06002564 len = copy_token(&buf, options, options_size);
2565 if (!len || len >= options_size)
Alex Elder3feeb8942012-08-31 17:29:52 -05002566 return err_ptr;
Alex Eldera725f65e2012-02-02 08:13:30 -06002567
Alex Elder3feeb8942012-08-31 17:29:52 -05002568 err_ptr = ERR_PTR(-ENOMEM);
Alex Elderd22f76e2012-07-12 10:46:35 -05002569 rbd_dev->pool_name = dup_token(&buf, NULL);
2570 if (!rbd_dev->pool_name)
Alex Elderd22f76e2012-07-12 10:46:35 -05002571 goto out_err;
Alex Eldere28fff262012-02-02 08:13:30 -06002572
Alex Elder0bed54d2012-07-03 16:01:18 -05002573 rbd_dev->image_name = dup_token(&buf, &rbd_dev->image_name_len);
2574 if (!rbd_dev->image_name)
Alex Elderbf3e5ae2012-07-09 21:04:23 -05002575 goto out_err;
Alex Eldere28fff262012-02-02 08:13:30 -06002576
Alex Elder3feeb8942012-08-31 17:29:52 -05002577 /* Snapshot name is optional */
2578 len = next_token(&buf);
Alex Elder820a5f32012-07-09 21:04:24 -05002579 if (!len) {
Alex Elder3feeb8942012-08-31 17:29:52 -05002580 buf = RBD_SNAP_HEAD_NAME; /* No snapshot supplied */
2581 len = sizeof (RBD_SNAP_HEAD_NAME) - 1;
Alex Elder849b4262012-07-09 21:04:24 -05002582 }
Alex Elder3feeb8942012-08-31 17:29:52 -05002583 snap_name = kmalloc(len + 1, GFP_KERNEL);
2584 if (!snap_name)
2585 goto out_err;
2586 memcpy(snap_name, buf, len);
2587 *(snap_name + len) = '\0';
Alex Eldere28fff262012-02-02 08:13:30 -06002588
Alex Elder3feeb8942012-08-31 17:29:52 -05002589dout(" SNAP_NAME is <%s>, len is %zd\n", snap_name, len);
2590
2591 return snap_name;
Alex Elderd22f76e2012-07-12 10:46:35 -05002592
2593out_err:
Alex Elder0bed54d2012-07-03 16:01:18 -05002594 kfree(rbd_dev->image_name);
Alex Elderd78fd7a2012-07-26 23:37:14 -05002595 rbd_dev->image_name = NULL;
2596 rbd_dev->image_name_len = 0;
Alex Elderd22f76e2012-07-12 10:46:35 -05002597 kfree(rbd_dev->pool_name);
2598 rbd_dev->pool_name = NULL;
2599
Alex Elder3feeb8942012-08-31 17:29:52 -05002600 return err_ptr;
Alex Eldera725f65e2012-02-02 08:13:30 -06002601}
2602
Alex Elder589d30e2012-07-10 20:30:11 -05002603/*
2604 * An rbd format 2 image has a unique identifier, distinct from the
2605 * name given to it by the user. Internally, that identifier is
2606 * what's used to specify the names of objects related to the image.
2607 *
2608 * A special "rbd id" object is used to map an rbd image name to its
2609 * id. If that object doesn't exist, then there is no v2 rbd image
2610 * with the supplied name.
2611 *
2612 * This function will record the given rbd_dev's image_id field if
2613 * it can be determined, and in that case will return 0. If any
2614 * errors occur a negative errno will be returned and the rbd_dev's
2615 * image_id field will be unchanged (and should be NULL).
2616 */
2617static int rbd_dev_image_id(struct rbd_device *rbd_dev)
2618{
2619 int ret;
2620 size_t size;
2621 char *object_name;
2622 void *response;
2623 void *p;
2624
2625 /*
2626 * First, see if the format 2 image id file exists, and if
2627 * so, get the image's persistent id from it.
2628 */
2629 size = sizeof (RBD_ID_PREFIX) + rbd_dev->image_name_len;
2630 object_name = kmalloc(size, GFP_NOIO);
2631 if (!object_name)
2632 return -ENOMEM;
2633 sprintf(object_name, "%s%s", RBD_ID_PREFIX, rbd_dev->image_name);
2634 dout("rbd id object name is %s\n", object_name);
2635
2636 /* Response will be an encoded string, which includes a length */
2637
2638 size = sizeof (__le32) + RBD_IMAGE_ID_LEN_MAX;
2639 response = kzalloc(size, GFP_NOIO);
2640 if (!response) {
2641 ret = -ENOMEM;
2642 goto out;
2643 }
2644
2645 ret = rbd_req_sync_exec(rbd_dev, object_name,
2646 "rbd", "get_id",
2647 NULL, 0,
2648 response, RBD_IMAGE_ID_LEN_MAX,
2649 CEPH_OSD_FLAG_READ, NULL);
2650 dout("%s: rbd_req_sync_exec returned %d\n", __func__, ret);
2651 if (ret < 0)
2652 goto out;
2653
2654 p = response;
2655 rbd_dev->image_id = ceph_extract_encoded_string(&p,
2656 p + RBD_IMAGE_ID_LEN_MAX,
2657 &rbd_dev->image_id_len,
2658 GFP_NOIO);
2659 if (IS_ERR(rbd_dev->image_id)) {
2660 ret = PTR_ERR(rbd_dev->image_id);
2661 rbd_dev->image_id = NULL;
2662 } else {
2663 dout("image_id is %s\n", rbd_dev->image_id);
2664 }
2665out:
2666 kfree(response);
2667 kfree(object_name);
2668
2669 return ret;
2670}
2671
Alex Eldera30b71b2012-07-10 20:30:11 -05002672static int rbd_dev_v1_probe(struct rbd_device *rbd_dev)
2673{
2674 int ret;
2675 size_t size;
2676
2677 /* Version 1 images have no id; empty string is used */
2678
2679 rbd_dev->image_id = kstrdup("", GFP_KERNEL);
2680 if (!rbd_dev->image_id)
2681 return -ENOMEM;
2682 rbd_dev->image_id_len = 0;
2683
2684 /* Record the header object name for this rbd image. */
2685
2686 size = rbd_dev->image_name_len + sizeof (RBD_SUFFIX);
2687 rbd_dev->header_name = kmalloc(size, GFP_KERNEL);
2688 if (!rbd_dev->header_name) {
2689 ret = -ENOMEM;
2690 goto out_err;
2691 }
2692 sprintf(rbd_dev->header_name, "%s%s", rbd_dev->image_name, RBD_SUFFIX);
2693
2694 /* Populate rbd image metadata */
2695
2696 ret = rbd_read_header(rbd_dev, &rbd_dev->header);
2697 if (ret < 0)
2698 goto out_err;
2699 rbd_dev->image_format = 1;
2700
2701 dout("discovered version 1 image, header name is %s\n",
2702 rbd_dev->header_name);
2703
2704 return 0;
2705
2706out_err:
2707 kfree(rbd_dev->header_name);
2708 rbd_dev->header_name = NULL;
2709 kfree(rbd_dev->image_id);
2710 rbd_dev->image_id = NULL;
2711
2712 return ret;
2713}
2714
2715static int rbd_dev_v2_probe(struct rbd_device *rbd_dev)
2716{
2717 size_t size;
Alex Elder9d475de2012-07-03 16:01:19 -05002718 int ret;
Alex Eldera30b71b2012-07-10 20:30:11 -05002719
2720 /*
2721 * Image id was filled in by the caller. Record the header
2722 * object name for this rbd image.
2723 */
2724 size = sizeof (RBD_HEADER_PREFIX) + rbd_dev->image_id_len;
2725 rbd_dev->header_name = kmalloc(size, GFP_KERNEL);
2726 if (!rbd_dev->header_name)
2727 return -ENOMEM;
2728 sprintf(rbd_dev->header_name, "%s%s",
2729 RBD_HEADER_PREFIX, rbd_dev->image_id);
Alex Elder9d475de2012-07-03 16:01:19 -05002730
2731 /* Get the size and object order for the image */
2732
2733 ret = rbd_dev_v2_image_size(rbd_dev);
2734 if (ret < 0)
2735 goto out_err;
Alex Elder1e130192012-07-03 16:01:19 -05002736
2737 /* Get the object prefix (a.k.a. block_name) for the image */
2738
2739 ret = rbd_dev_v2_object_prefix(rbd_dev);
2740 if (ret < 0)
2741 goto out_err;
Alex Eldera30b71b2012-07-10 20:30:11 -05002742 rbd_dev->image_format = 2;
2743
2744 dout("discovered version 2 image, header name is %s\n",
2745 rbd_dev->header_name);
2746
2747 return -ENOTSUPP;
Alex Elder9d475de2012-07-03 16:01:19 -05002748out_err:
2749 kfree(rbd_dev->header_name);
2750 rbd_dev->header_name = NULL;
Alex Elder1e130192012-07-03 16:01:19 -05002751 kfree(rbd_dev->header.object_prefix);
2752 rbd_dev->header.object_prefix = NULL;
Alex Elder9d475de2012-07-03 16:01:19 -05002753
2754 return ret;
Alex Eldera30b71b2012-07-10 20:30:11 -05002755}
2756
/*
 * Probe for the existence of the header object for the given rbd
 * device.  For format 2 images this includes determining the image
 * id.
 *
 * Returns the result of whichever format-specific probe was run.
 */
static int rbd_dev_probe(struct rbd_device *rbd_dev)
{
	int ret;

	/*
	 * Try to get the id from the image id object.  Success means
	 * a format 2 image.  Any nonzero result (the expected one is
	 * ENOENT for an image that isn't format 2) makes us assume
	 * it's a format 1 image instead.
	 */
	if (!rbd_dev_image_id(rbd_dev))
		ret = rbd_dev_v2_probe(rbd_dev);
	else
		ret = rbd_dev_v1_probe(rbd_dev);

	if (ret)
		dout("probe failed, returning %d\n", ret);

	return ret;
}
2781
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07002782static ssize_t rbd_add(struct bus_type *bus,
2783 const char *buf,
2784 size_t count)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002785{
Alex Eldercb8627c2012-07-09 21:04:23 -05002786 char *options;
2787 struct rbd_device *rbd_dev = NULL;
Alex Elder7ef32142012-02-02 08:13:30 -06002788 const char *mon_addrs = NULL;
2789 size_t mon_addrs_size = 0;
Alex Elder27cc2592012-02-02 08:13:30 -06002790 struct ceph_osd_client *osdc;
2791 int rc = -ENOMEM;
Alex Elder3feeb8942012-08-31 17:29:52 -05002792 char *snap_name;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002793
2794 if (!try_module_get(THIS_MODULE))
2795 return -ENODEV;
2796
Alex Elder27cc2592012-02-02 08:13:30 -06002797 options = kmalloc(count, GFP_KERNEL);
2798 if (!options)
Alex Elder85ae8922012-07-26 23:37:14 -05002799 goto err_out_mem;
Alex Eldercb8627c2012-07-09 21:04:23 -05002800 rbd_dev = kzalloc(sizeof(*rbd_dev), GFP_KERNEL);
2801 if (!rbd_dev)
Alex Elder85ae8922012-07-26 23:37:14 -05002802 goto err_out_mem;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002803
2804 /* static rbd_device initialization */
2805 spin_lock_init(&rbd_dev->lock);
2806 INIT_LIST_HEAD(&rbd_dev->node);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002807 INIT_LIST_HEAD(&rbd_dev->snaps);
Josh Durginc6666012011-11-21 17:11:12 -08002808 init_rwsem(&rbd_dev->header_rwsem);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002809
Alex Eldera725f65e2012-02-02 08:13:30 -06002810 /* parse add command */
Alex Elder3feeb8942012-08-31 17:29:52 -05002811 snap_name = rbd_add_parse_args(rbd_dev, buf,
2812 &mon_addrs, &mon_addrs_size, options, count);
2813 if (IS_ERR(snap_name)) {
2814 rc = PTR_ERR(snap_name);
Alex Elder85ae8922012-07-26 23:37:14 -05002815 goto err_out_mem;
Alex Elder3feeb8942012-08-31 17:29:52 -05002816 }
Alex Eldera725f65e2012-02-02 08:13:30 -06002817
Alex Elderf8c38922012-08-10 13:12:07 -07002818 rc = rbd_get_client(rbd_dev, mon_addrs, mon_addrs_size - 1, options);
2819 if (rc < 0)
Alex Elder85ae8922012-07-26 23:37:14 -05002820 goto err_out_args;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002821
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002822 /* pick the pool */
Alex Elder1dbb4392012-01-24 10:08:37 -06002823 osdc = &rbd_dev->rbd_client->client->osdc;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002824 rc = ceph_pg_poolid_by_name(osdc->osdmap, rbd_dev->pool_name);
2825 if (rc < 0)
2826 goto err_out_client;
Alex Elder9bb2f332012-07-12 10:46:35 -05002827 rbd_dev->pool_id = rc;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002828
Alex Eldera30b71b2012-07-10 20:30:11 -05002829 rc = rbd_dev_probe(rbd_dev);
2830 if (rc < 0)
Alex Elder589d30e2012-07-10 20:30:11 -05002831 goto err_out_client;
Alex Eldera30b71b2012-07-10 20:30:11 -05002832 rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
Alex Elder05fd6f62012-08-29 17:11:07 -05002833
2834 /* no need to lock here, as rbd_dev is not registered yet */
2835 rc = rbd_dev_snaps_update(rbd_dev);
2836 if (rc)
2837 goto err_out_header;
2838
2839 rc = rbd_dev_set_mapping(rbd_dev, snap_name);
2840 if (rc)
2841 goto err_out_header;
2842
Alex Elder85ae8922012-07-26 23:37:14 -05002843 /* generate unique id: find highest unique id, add one */
2844 rbd_dev_id_get(rbd_dev);
2845
2846 /* Fill in the device name, now that we have its id. */
2847 BUILD_BUG_ON(DEV_NAME_LEN
2848 < sizeof (RBD_DRV_NAME) + MAX_INT_FORMAT_WIDTH);
2849 sprintf(rbd_dev->name, "%s%d", RBD_DRV_NAME, rbd_dev->dev_id);
2850
2851 /* Get our block major device number. */
2852
Alex Elder27cc2592012-02-02 08:13:30 -06002853 rc = register_blkdev(0, rbd_dev->name);
2854 if (rc < 0)
Alex Elder85ae8922012-07-26 23:37:14 -05002855 goto err_out_id;
Alex Elder27cc2592012-02-02 08:13:30 -06002856 rbd_dev->major = rc;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002857
Alex Elder0f308a32012-08-29 17:11:07 -05002858 /* Set up the blkdev mapping. */
2859
2860 rc = rbd_init_disk(rbd_dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002861 if (rc)
Yehuda Sadeh766fc432011-01-07 14:58:42 -08002862 goto err_out_blkdev;
2863
Alex Elder0f308a32012-08-29 17:11:07 -05002864 rc = rbd_bus_add_dev(rbd_dev);
2865 if (rc)
2866 goto err_out_disk;
2867
Alex Elder32eec682012-02-08 16:11:14 -06002868 /*
2869 * At this point cleanup in the event of an error is the job
2870 * of the sysfs code (initiated by rbd_bus_del_dev()).
Alex Elder32eec682012-02-08 16:11:14 -06002871 */
Alex Elder2ac4e752012-07-10 20:30:10 -05002872
Alex Elder4bb1f1e2012-08-23 23:48:49 -05002873 down_write(&rbd_dev->header_rwsem);
Alex Elder5ed16172012-08-29 17:11:07 -05002874 rc = rbd_dev_snaps_register(rbd_dev);
Alex Elder4bb1f1e2012-08-23 23:48:49 -05002875 up_write(&rbd_dev->header_rwsem);
Alex Elder2ac4e752012-07-10 20:30:10 -05002876 if (rc)
2877 goto err_out_bus;
2878
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07002879 rc = rbd_init_watch_dev(rbd_dev);
2880 if (rc)
2881 goto err_out_bus;
2882
Alex Elder3ee40012012-08-29 17:11:07 -05002883 /* Everything's ready. Announce the disk to the world. */
2884
2885 add_disk(rbd_dev->disk);
2886
2887 pr_info("%s: added with size 0x%llx\n", rbd_dev->disk->disk_name,
2888 (unsigned long long) rbd_dev->mapping.size);
2889
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002890 return count;
2891
Yehuda Sadeh766fc432011-01-07 14:58:42 -08002892err_out_bus:
Yehuda Sadeh766fc432011-01-07 14:58:42 -08002893 /* this will also clean up rest of rbd_dev stuff */
2894
2895 rbd_bus_del_dev(rbd_dev);
2896 kfree(options);
Yehuda Sadeh766fc432011-01-07 14:58:42 -08002897 return rc;
2898
Alex Elder0f308a32012-08-29 17:11:07 -05002899err_out_disk:
2900 rbd_free_disk(rbd_dev);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002901err_out_blkdev:
2902 unregister_blkdev(rbd_dev->major, rbd_dev->name);
Alex Elder85ae8922012-07-26 23:37:14 -05002903err_out_id:
2904 rbd_dev_id_put(rbd_dev);
Alex Elder05fd6f62012-08-29 17:11:07 -05002905err_out_header:
2906 rbd_header_free(&rbd_dev->header);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002907err_out_client:
Alex Elder3fcf2582012-07-03 16:01:19 -05002908 kfree(rbd_dev->header_name);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002909 rbd_put_client(rbd_dev);
Alex Elder589d30e2012-07-10 20:30:11 -05002910 kfree(rbd_dev->image_id);
Alex Elder85ae8922012-07-26 23:37:14 -05002911err_out_args:
2912 kfree(rbd_dev->mapping.snap_name);
2913 kfree(rbd_dev->image_name);
2914 kfree(rbd_dev->pool_name);
2915err_out_mem:
Alex Elder27cc2592012-02-02 08:13:30 -06002916 kfree(rbd_dev);
Alex Eldercb8627c2012-07-09 21:04:23 -05002917 kfree(options);
Alex Elder27cc2592012-02-02 08:13:30 -06002918
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002919 dout("Error adding device %s\n", buf);
2920 module_put(THIS_MODULE);
Alex Elder27cc2592012-02-02 08:13:30 -06002921
2922 return (ssize_t) rc;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002923}
2924
Alex Elderde71a292012-07-03 16:01:19 -05002925static struct rbd_device *__rbd_get_dev(unsigned long dev_id)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002926{
2927 struct list_head *tmp;
2928 struct rbd_device *rbd_dev;
2929
Alex Eldere124a82f2012-01-29 13:57:44 -06002930 spin_lock(&rbd_dev_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002931 list_for_each(tmp, &rbd_dev_list) {
2932 rbd_dev = list_entry(tmp, struct rbd_device, node);
Alex Elderde71a292012-07-03 16:01:19 -05002933 if (rbd_dev->dev_id == dev_id) {
Alex Eldere124a82f2012-01-29 13:57:44 -06002934 spin_unlock(&rbd_dev_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002935 return rbd_dev;
Alex Eldere124a82f2012-01-29 13:57:44 -06002936 }
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002937 }
Alex Eldere124a82f2012-01-29 13:57:44 -06002938 spin_unlock(&rbd_dev_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002939 return NULL;
2940}
2941
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002942static void rbd_dev_release(struct device *dev)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002943{
Alex Elder593a9e72012-02-07 12:03:37 -06002944 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002945
Alex Elder1dbb4392012-01-24 10:08:37 -06002946 if (rbd_dev->watch_request) {
2947 struct ceph_client *client = rbd_dev->rbd_client->client;
2948
2949 ceph_osdc_unregister_linger_request(&client->osdc,
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07002950 rbd_dev->watch_request);
Alex Elder1dbb4392012-01-24 10:08:37 -06002951 }
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07002952 if (rbd_dev->watch_event)
Alex Elder070c6332012-07-25 09:32:41 -05002953 rbd_req_sync_unwatch(rbd_dev);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07002954
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002955 rbd_put_client(rbd_dev);
2956
2957 /* clean up and free blkdev */
2958 rbd_free_disk(rbd_dev);
2959 unregister_blkdev(rbd_dev->major, rbd_dev->name);
Alex Elder32eec682012-02-08 16:11:14 -06002960
Alex Elder2ac4e752012-07-10 20:30:10 -05002961 /* release allocated disk header fields */
2962 rbd_header_free(&rbd_dev->header);
2963
Alex Elder32eec682012-02-08 16:11:14 -06002964 /* done with the id, and with the rbd_dev */
Alex Elderf84344f2012-08-31 17:29:51 -05002965 kfree(rbd_dev->mapping.snap_name);
Alex Elder589d30e2012-07-10 20:30:11 -05002966 kfree(rbd_dev->image_id);
Alex Elder0bed54d2012-07-03 16:01:18 -05002967 kfree(rbd_dev->header_name);
Alex Elderd22f76e2012-07-12 10:46:35 -05002968 kfree(rbd_dev->pool_name);
Alex Elder0bed54d2012-07-03 16:01:18 -05002969 kfree(rbd_dev->image_name);
Alex Eldere2839302012-08-29 17:11:06 -05002970 rbd_dev_id_put(rbd_dev);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002971 kfree(rbd_dev);
2972
2973 /* release module ref */
2974 module_put(THIS_MODULE);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002975}
2976
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002977static ssize_t rbd_remove(struct bus_type *bus,
2978 const char *buf,
2979 size_t count)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002980{
2981 struct rbd_device *rbd_dev = NULL;
2982 int target_id, rc;
2983 unsigned long ul;
2984 int ret = count;
2985
2986 rc = strict_strtoul(buf, 10, &ul);
2987 if (rc)
2988 return rc;
2989
2990 /* convert to int; abort if we lost anything in the conversion */
2991 target_id = (int) ul;
2992 if (target_id != ul)
2993 return -EINVAL;
2994
2995 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
2996
2997 rbd_dev = __rbd_get_dev(target_id);
2998 if (!rbd_dev) {
2999 ret = -ENOENT;
3000 goto done;
3001 }
3002
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003003 __rbd_remove_all_snaps(rbd_dev);
3004 rbd_bus_del_dev(rbd_dev);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003005
3006done:
3007 mutex_unlock(&ctl_mutex);
Alex Elderaafb2302012-09-06 16:00:54 -05003008
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003009 return ret;
3010}
3011
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003012/*
3013 * create control files in sysfs
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003014 * /sys/bus/rbd/...
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003015 */
3016static int rbd_sysfs_init(void)
3017{
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003018 int ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003019
Alex Elderfed4c142012-02-07 12:03:36 -06003020 ret = device_register(&rbd_root_dev);
Alex Elder21079782012-01-24 10:08:36 -06003021 if (ret < 0)
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003022 return ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003023
Alex Elderfed4c142012-02-07 12:03:36 -06003024 ret = bus_register(&rbd_bus_type);
3025 if (ret < 0)
3026 device_unregister(&rbd_root_dev);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003027
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003028 return ret;
3029}
3030
3031static void rbd_sysfs_cleanup(void)
3032{
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003033 bus_unregister(&rbd_bus_type);
Alex Elderfed4c142012-02-07 12:03:36 -06003034 device_unregister(&rbd_root_dev);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003035}
3036
3037int __init rbd_init(void)
3038{
3039 int rc;
3040
3041 rc = rbd_sysfs_init();
3042 if (rc)
3043 return rc;
Alex Elderf0f8cef2012-01-29 13:57:44 -06003044 pr_info("loaded " RBD_DRV_NAME_LONG "\n");
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003045 return 0;
3046}
3047
3048void __exit rbd_exit(void)
3049{
3050 rbd_sysfs_cleanup();
3051}
3052
3053module_init(rbd_init);
3054module_exit(rbd_exit);
3055
3056MODULE_AUTHOR("Sage Weil <sage@newdream.net>");
3057MODULE_AUTHOR("Yehuda Sadeh <yehuda@hq.newdream.net>");
3058MODULE_DESCRIPTION("rados block device");
3059
3060/* following authorship retained from original osdblk.c */
3061MODULE_AUTHOR("Jeff Garzik <jeff@garzik.org>");
3062
3063MODULE_LICENSE("GPL");