blob: 2db51cef9560ea99f1c80c09a1532128be85f4fc [file] [log] [blame]
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001/*
2 rbd.c -- Export ceph rados objects as a Linux block device
3
4
5 based on drivers/block/osdblk.c:
6
7 Copyright 2009 Red Hat, Inc.
8
9 This program is free software; you can redistribute it and/or modify
10 it under the terms of the GNU General Public License as published by
11 the Free Software Foundation.
12
13 This program is distributed in the hope that it will be useful,
14 but WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 GNU General Public License for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with this program; see the file COPYING. If not, write to
20 the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
21
22
23
Yehuda Sadehdfc56062010-11-19 14:51:04 -080024 For usage instructions, please refer to:
Yehuda Sadeh602adf42010-08-12 16:11:25 -070025
Yehuda Sadehdfc56062010-11-19 14:51:04 -080026 Documentation/ABI/testing/sysfs-bus-rbd
Yehuda Sadeh602adf42010-08-12 16:11:25 -070027
28 */
29
30#include <linux/ceph/libceph.h>
31#include <linux/ceph/osd_client.h>
32#include <linux/ceph/mon_client.h>
33#include <linux/ceph/decode.h>
Yehuda Sadeh59c2be12011-03-21 15:10:11 -070034#include <linux/parser.h>
Yehuda Sadeh602adf42010-08-12 16:11:25 -070035
36#include <linux/kernel.h>
37#include <linux/device.h>
38#include <linux/module.h>
39#include <linux/fs.h>
40#include <linux/blkdev.h>
41
42#include "rbd_types.h"
43
/*
 * The basic unit of block I/O is a sector.  It is interpreted in a
 * number of contexts in Linux (blk, bio, genhd), but the default is
 * universally 512 bytes.  These symbols are just slightly more
 * meaningful than the bare numbers they represent.
 */
#define	SECTOR_SHIFT	9
#define	SECTOR_SIZE	(1ULL << SECTOR_SHIFT)

/* Driver name, as used in log messages and sysfs registration */
#define RBD_DRV_NAME "rbd"
#define RBD_DRV_NAME_LONG "rbd (rados block device)"

#define RBD_MINORS_PER_MAJOR	256		/* max minors per blkdev */

#define RBD_MAX_SNAP_NAME_LEN	32
#define RBD_MAX_OPT_LEN		1024

/* Reserved "snapshot" name meaning the base (head) image is mapped */
#define RBD_SNAP_HEAD_NAME	"-"

/*
 * An RBD device name will be "rbd#", where the "rbd" comes from
 * RBD_DRV_NAME above, and # is a unique integer identifier.
 * MAX_INT_FORMAT_WIDTH is used in ensuring DEV_NAME_LEN is big
 * enough to hold all possible device names.
 */
#define DEV_NAME_LEN		32
#define MAX_INT_FORMAT_WIDTH	((5 * sizeof (int)) / 2 + 1)

/* Defaults for the options parsed by parse_rbd_opts_token() */
#define RBD_NOTIFY_TIMEOUT_DEFAULT 10
#define RBD_READ_ONLY_DEFAULT	false
/*
 * block device image metadata (in-memory version)
 */
struct rbd_image_header {
	u64 image_size;		/* image size, in bytes */
	char *object_prefix;	/* name prefix of the image's data objects */
	__u8 obj_order;		/* log2 of the per-object size */
	__u8 crypt_type;
	__u8 comp_type;
	struct ceph_snap_context *snapc;	/* snapshot ids for this image */
	u32 total_snaps;	/* number of snapshots */

	/* Snapshot names, NUL-separated, in the same order as snapc->snaps */
	char *snap_names;
	/* Image size at the time each snapshot was taken */
	u64 *snap_sizes;

	/* Version of the header object this was read from;
	 * see rbd_refresh_header() */
	u64 obj_version;
};

/*
 * Per-device options parsed from the "add" control write;
 * see parse_rbd_opts_token().
 */
struct rbd_options {
	int	notify_timeout;
	bool	read_only;
};
97
/*
 * an instance of the client.  multiple devices may share an rbd client.
 */
struct rbd_client {
	struct ceph_client	*client;
	struct kref		kref;	/* sharing refcount; see rbd_client_release() */
	struct list_head	node;	/* entry in the global rbd_client_list */
};

/*
 * a request completion status
 */
struct rbd_req_status {
	int done;	/* nonzero once this sub-request completed */
	int rc;		/* completion result */
	u64 bytes;	/* bytes completed */
};

/*
 * a collection of requests
 */
struct rbd_req_coll {
	int			total;		/* entries in status[] */
	int			num_done;	/* how many completed so far */
	struct kref		kref;
	/* Trailing variable-length array (pre-C99 zero-length idiom) */
	struct rbd_req_status	status[0];
};

/*
 * a single io request
 */
struct rbd_request {
	struct request		*rq;		/* blk layer request */
	struct bio		*bio;		/* cloned bio */
	struct page		**pages;	/* list of used pages */
	u64			len;
	int			coll_index;	/* this request's slot in *coll */
	struct rbd_req_coll	*coll;
};

/* In-memory record of one image snapshot, exposed via sysfs (dev) */
struct rbd_snap {
	struct device		dev;
	const char		*name;
	u64			size;
	struct list_head	node;	/* entry in rbd_device->snaps */
	u64			id;
};
145
/*
 * a single device
 */
struct rbd_device {
	int			dev_id;		/* blkdev unique id */

	int			major;		/* blkdev assigned major */
	struct gendisk		*disk;		/* blkdev's gendisk and rq */
	struct request_queue	*q;

	struct rbd_options	rbd_opts;	/* options given at map time */
	struct rbd_client	*rbd_client;	/* shared ceph client handle */

	char			name[DEV_NAME_LEN]; /* blkdev name, e.g. rbd3 */

	spinlock_t		lock;		/* queue lock */

	struct rbd_image_header	header;
	char			*image_name;
	size_t			image_name_len;
	char			*header_name;	/* name of the header object */
	char			*pool_name;
	int			pool_id;

	struct ceph_osd_event   *watch_event;
	struct ceph_osd_request *watch_request;

	/* protects updating the header */
	struct rw_semaphore     header_rwsem;
	/* name of the snapshot this device reads from */
	char                    *snap_name;
	/* id of the snapshot this device reads from */
	u64                     snap_id;	/* current snapshot id */
	/* whether the snap_id this device reads from still exists */
	bool                    snap_exists;
	bool			read_only;

	struct list_head	node;		/* entry in rbd_dev_list */

	/* list of snapshots */
	struct list_head	snaps;

	/* sysfs related */
	struct device		dev;
};
191
static DEFINE_MUTEX(ctl_mutex);	  /* Serialize open/close/setup/teardown */

static LIST_HEAD(rbd_dev_list);    /* devices */
static DEFINE_SPINLOCK(rbd_dev_list_lock);

static LIST_HEAD(rbd_client_list);		/* clients */
static DEFINE_SPINLOCK(rbd_client_list_lock);

/* Forward declarations for routines defined later in this file */
static int __rbd_init_snaps_header(struct rbd_device *rbd_dev);
static void rbd_dev_release(struct device *dev);
static ssize_t rbd_snap_add(struct device *dev,
			    struct device_attribute *attr,
			    const char *buf,
			    size_t count);
static void __rbd_remove_snap_dev(struct rbd_snap *snap);

static ssize_t rbd_add(struct bus_type *bus, const char *buf,
		       size_t count);
static ssize_t rbd_remove(struct bus_type *bus, const char *buf,
			  size_t count);

/* The rbd bus exposes "add" and "remove" (write-only) control files */
static struct bus_attribute rbd_bus_attrs[] = {
	__ATTR(add, S_IWUSR, NULL, rbd_add),
	__ATTR(remove, S_IWUSR, NULL, rbd_remove),
	__ATTR_NULL
};

static struct bus_type rbd_bus_type = {
	.name		= "rbd",
	.bus_attrs	= rbd_bus_attrs,
};

/* rbd_root_dev is static, so there is nothing to free on release */
static void rbd_root_dev_release(struct device *dev)
{
}

/* Parent device for all rbd devices registered with the driver core */
static struct device rbd_root_dev = {
	.init_name =    "rbd",
	.release =      rbd_root_dev_release,
};


/* Take a reference on the device embedded in rbd_dev */
static struct device *rbd_get_dev(struct rbd_device *rbd_dev)
{
	return get_device(&rbd_dev->dev);
}

/* Drop a reference taken with rbd_get_dev() */
static void rbd_put_dev(struct rbd_device *rbd_dev)
{
	put_device(&rbd_dev->dev);
}

static int rbd_refresh_header(struct rbd_device *rbd_dev, u64 *hver);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700245
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700246static int rbd_open(struct block_device *bdev, fmode_t mode)
247{
Alex Elderf0f8cef2012-01-29 13:57:44 -0600248 struct rbd_device *rbd_dev = bdev->bd_disk->private_data;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700249
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700250 if ((mode & FMODE_WRITE) && rbd_dev->read_only)
251 return -EROFS;
252
Alex Elder340c7a22012-08-10 13:12:07 -0700253 rbd_get_dev(rbd_dev);
254 set_device_ro(bdev, rbd_dev->read_only);
255
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700256 return 0;
257}
258
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800259static int rbd_release(struct gendisk *disk, fmode_t mode)
260{
261 struct rbd_device *rbd_dev = disk->private_data;
262
263 rbd_put_dev(rbd_dev);
264
265 return 0;
266}
267
/* Block device operations: only open/release are implemented here */
static const struct block_device_operations rbd_bd_ops = {
	.owner			= THIS_MODULE,
	.open			= rbd_open,
	.release		= rbd_release,
};
273
274/*
275 * Initialize an rbd client instance.
Alex Elder43ae4702012-07-03 16:01:18 -0500276 * We own *ceph_opts.
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700277 */
Alex Elderf8c38922012-08-10 13:12:07 -0700278static struct rbd_client *rbd_client_create(struct ceph_options *ceph_opts)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700279{
280 struct rbd_client *rbdc;
281 int ret = -ENOMEM;
282
283 dout("rbd_client_create\n");
284 rbdc = kmalloc(sizeof(struct rbd_client), GFP_KERNEL);
285 if (!rbdc)
286 goto out_opt;
287
288 kref_init(&rbdc->kref);
289 INIT_LIST_HEAD(&rbdc->node);
290
Alex Elderbc534d82012-01-29 13:57:44 -0600291 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
292
Alex Elder43ae4702012-07-03 16:01:18 -0500293 rbdc->client = ceph_create_client(ceph_opts, rbdc, 0, 0);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700294 if (IS_ERR(rbdc->client))
Alex Elderbc534d82012-01-29 13:57:44 -0600295 goto out_mutex;
Alex Elder43ae4702012-07-03 16:01:18 -0500296 ceph_opts = NULL; /* Now rbdc->client is responsible for ceph_opts */
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700297
298 ret = ceph_open_session(rbdc->client);
299 if (ret < 0)
300 goto out_err;
301
Alex Elder432b8582012-01-29 13:57:44 -0600302 spin_lock(&rbd_client_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700303 list_add_tail(&rbdc->node, &rbd_client_list);
Alex Elder432b8582012-01-29 13:57:44 -0600304 spin_unlock(&rbd_client_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700305
Alex Elderbc534d82012-01-29 13:57:44 -0600306 mutex_unlock(&ctl_mutex);
307
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700308 dout("rbd_client_create created %p\n", rbdc);
309 return rbdc;
310
311out_err:
312 ceph_destroy_client(rbdc->client);
Alex Elderbc534d82012-01-29 13:57:44 -0600313out_mutex:
314 mutex_unlock(&ctl_mutex);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700315 kfree(rbdc);
316out_opt:
Alex Elder43ae4702012-07-03 16:01:18 -0500317 if (ceph_opts)
318 ceph_destroy_options(ceph_opts);
Vasiliy Kulikov28f259b2010-09-26 12:59:37 +0400319 return ERR_PTR(ret);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700320}
321
322/*
Alex Elder1f7ba332012-08-10 13:12:07 -0700323 * Find a ceph client with specific addr and configuration. If
324 * found, bump its reference count.
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700325 */
Alex Elder1f7ba332012-08-10 13:12:07 -0700326static struct rbd_client *rbd_client_find(struct ceph_options *ceph_opts)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700327{
328 struct rbd_client *client_node;
Alex Elder1f7ba332012-08-10 13:12:07 -0700329 bool found = false;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700330
Alex Elder43ae4702012-07-03 16:01:18 -0500331 if (ceph_opts->flags & CEPH_OPT_NOSHARE)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700332 return NULL;
333
Alex Elder1f7ba332012-08-10 13:12:07 -0700334 spin_lock(&rbd_client_list_lock);
335 list_for_each_entry(client_node, &rbd_client_list, node) {
336 if (!ceph_compare_options(ceph_opts, client_node->client)) {
337 kref_get(&client_node->kref);
338 found = true;
339 break;
340 }
341 }
342 spin_unlock(&rbd_client_list_lock);
343
344 return found ? client_node : NULL;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700345}
346
/*
 * mount options
 *
 * The enum is laid out in argument-type groups; parse_rbd_opts_token()
 * relies on the Opt_last_* sentinels to classify a matched token.
 */
enum {
	Opt_notify_timeout,
	Opt_last_int,
	/* int args above */
	Opt_last_string,
	/* string args above */
	Opt_read_only,
	Opt_read_write,
	/* Boolean args above */
	Opt_last_bool,
};

static match_table_t rbd_opts_tokens = {
	{Opt_notify_timeout, "notify_timeout=%d"},
	/* int args above */
	/* string args above */
	{Opt_read_only, "read_only"},
	{Opt_read_only, "ro"},		/* Alternate spelling */
	{Opt_read_write, "read_write"},
	{Opt_read_write, "rw"},		/* Alternate spelling */
	/* Boolean args above */
	{-1, NULL}
};
373
/*
 * Parse a single option token (callback for ceph_parse_options()).
 * @c is the option text; @private is the struct rbd_options to fill.
 * Returns 0 on success, or a negative errno for an unrecognized
 * option or a malformed integer argument.
 */
static int parse_rbd_opts_token(char *c, void *private)
{
	struct rbd_options *rbd_opts = private;
	substring_t argstr[MAX_OPT_ARGS];
	int token, intval, ret;

	token = match_token(c, rbd_opts_tokens, argstr);
	if (token < 0)
		return -EINVAL;

	/* Classify by argument type using the Opt_last_* sentinels */
	if (token < Opt_last_int) {
		ret = match_int(&argstr[0], &intval);
		if (ret < 0) {
			pr_err("bad mount option arg (not int) "
			       "at '%s'\n", c);
			return ret;
		}
		dout("got int token %d val %d\n", token, intval);
	} else if (token > Opt_last_int && token < Opt_last_string) {
		dout("got string token %d val %s\n", token,
		     argstr[0].from);
	} else if (token > Opt_last_string && token < Opt_last_bool) {
		dout("got Boolean token %d\n", token);
	} else {
		dout("got token %d\n", token);
	}

	switch (token) {
	case Opt_notify_timeout:
		rbd_opts->notify_timeout = intval;
		break;
	case Opt_read_only:
		rbd_opts->read_only = true;
		break;
	case Opt_read_write:
		rbd_opts->read_only = false;
		break;
	default:
		/* Any other (nonzero) recognized token is a coding bug */
		BUG_ON(token);
	}
	return 0;
}
416
/*
 * Get a ceph client with specific addr and configuration, if one does
 * not exist create it.
 *
 * Parses @options into rbd_dev->rbd_opts (applying defaults first) and
 * into a ceph_options; an existing matching client is reused (reference
 * taken by rbd_client_find()), otherwise a new one is created.  Either
 * way ownership of the ceph_options is given away or they are
 * destroyed here.  On success rbd_dev->rbd_client is set and 0 is
 * returned; otherwise a negative errno.
 */
static int rbd_get_client(struct rbd_device *rbd_dev, const char *mon_addr,
			  size_t mon_addr_len, char *options)
{
	struct rbd_options *rbd_opts = &rbd_dev->rbd_opts;
	struct ceph_options *ceph_opts;
	struct rbd_client *rbdc;

	/* Defaults; parse_rbd_opts_token() may override them below */
	rbd_opts->notify_timeout = RBD_NOTIFY_TIMEOUT_DEFAULT;
	rbd_opts->read_only = RBD_READ_ONLY_DEFAULT;

	ceph_opts = ceph_parse_options(options, mon_addr,
					mon_addr + mon_addr_len,
					parse_rbd_opts_token, rbd_opts);
	if (IS_ERR(ceph_opts))
		return PTR_ERR(ceph_opts);

	rbdc = rbd_client_find(ceph_opts);
	if (rbdc) {
		/* using an existing client; drop our copy of the options */
		ceph_destroy_options(ceph_opts);
	} else {
		/* rbd_client_create() consumes ceph_opts on all paths */
		rbdc = rbd_client_create(ceph_opts);
		if (IS_ERR(rbdc))
			return PTR_ERR(rbdc);
	}
	rbd_dev->rbd_client = rbdc;

	return 0;
}
450
/*
 * Destroy ceph client
 *
 * Called by kref_put() when the last reference is dropped.  Takes
 * rbd_client_list_lock itself to unlink the client, so the caller
 * must NOT already hold that lock.
 */
static void rbd_client_release(struct kref *kref)
{
	struct rbd_client *rbdc = container_of(kref, struct rbd_client, kref);

	dout("rbd_release_client %p\n", rbdc);
	spin_lock(&rbd_client_list_lock);
	list_del(&rbdc->node);
	spin_unlock(&rbd_client_list_lock);

	ceph_destroy_client(rbdc->client);
	kfree(rbdc);
}
468
469/*
470 * Drop reference to ceph client node. If it's not referenced anymore, release
471 * it.
472 */
473static void rbd_put_client(struct rbd_device *rbd_dev)
474{
475 kref_put(&rbd_dev->rbd_client->kref, rbd_client_release);
476 rbd_dev->rbd_client = NULL;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700477}
478
Yehuda Sadeh1fec7092011-05-13 13:52:56 -0700479/*
480 * Destroy requests collection
481 */
482static void rbd_coll_release(struct kref *kref)
483{
484 struct rbd_req_coll *coll =
485 container_of(kref, struct rbd_req_coll, kref);
486
487 dout("rbd_coll_release %p\n", coll);
488 kfree(coll);
489}
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700490
/*
 * Sanity-check an on-disk image header before converting it with
 * rbd_header_from_disk(): verify the magic text, and make sure the
 * snapshot count and snapshot-name length cannot overflow the size_t
 * arithmetic used for the in-memory allocations.
 */
static bool rbd_dev_ondisk_valid(struct rbd_image_header_ondisk *ondisk)
{
	size_t size;
	u32 snap_count;

	/* The header has to start with the magic rbd header text */
	if (memcmp(&ondisk->text, RBD_HEADER_TEXT, sizeof (RBD_HEADER_TEXT)))
		return false;

	/*
	 * The size of a snapshot header has to fit in a size_t, and
	 * that limits the number of snapshots.
	 */
	snap_count = le32_to_cpu(ondisk->snap_count);
	size = SIZE_MAX - sizeof (struct ceph_snap_context);
	if (snap_count > size / sizeof (__le64))
		return false;

	/*
	 * Not only that, but the size of the entire snapshot
	 * header must also be representable in a size_t.
	 */
	size -= snap_count * sizeof (__le64);
	if ((u64) size < le64_to_cpu(ondisk->snap_names_len))
		return false;

	return true;
}
519
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700520/*
521 * Create a new header structure, translate header format from the on-disk
522 * header.
523 */
524static int rbd_header_from_disk(struct rbd_image_header *header,
Alex Elder4156d992012-08-02 11:29:46 -0500525 struct rbd_image_header_ondisk *ondisk)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700526{
Alex Elderccece232012-07-10 20:30:10 -0500527 u32 snap_count;
Alex Elder58c17b02012-08-23 23:22:06 -0500528 size_t len;
Alex Elderd2bb24e2012-07-26 23:37:14 -0500529 size_t size;
Alex Elder621901d2012-08-23 23:22:06 -0500530 u32 i;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700531
Alex Elder6a523252012-07-19 17:12:59 -0500532 memset(header, 0, sizeof (*header));
533
Alex Elder103a1502012-08-02 11:29:45 -0500534 snap_count = le32_to_cpu(ondisk->snap_count);
535
Alex Elder58c17b02012-08-23 23:22:06 -0500536 len = strnlen(ondisk->object_prefix, sizeof (ondisk->object_prefix));
537 header->object_prefix = kmalloc(len + 1, GFP_KERNEL);
Alex Elder6a523252012-07-19 17:12:59 -0500538 if (!header->object_prefix)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700539 return -ENOMEM;
Alex Elder58c17b02012-08-23 23:22:06 -0500540 memcpy(header->object_prefix, ondisk->object_prefix, len);
541 header->object_prefix[len] = '\0';
Alex Elder00f1f362012-02-07 12:03:36 -0600542
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700543 if (snap_count) {
Alex Elderf785cc12012-08-23 23:22:06 -0500544 u64 snap_names_len = le64_to_cpu(ondisk->snap_names_len);
545
Alex Elder621901d2012-08-23 23:22:06 -0500546 /* Save a copy of the snapshot names */
547
Alex Elderf785cc12012-08-23 23:22:06 -0500548 if (snap_names_len > (u64) SIZE_MAX)
549 return -EIO;
550 header->snap_names = kmalloc(snap_names_len, GFP_KERNEL);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700551 if (!header->snap_names)
Alex Elder6a523252012-07-19 17:12:59 -0500552 goto out_err;
Alex Elderf785cc12012-08-23 23:22:06 -0500553 /*
554 * Note that rbd_dev_v1_header_read() guarantees
555 * the ondisk buffer we're working with has
556 * snap_names_len bytes beyond the end of the
557 * snapshot id array, this memcpy() is safe.
558 */
559 memcpy(header->snap_names, &ondisk->snaps[snap_count],
560 snap_names_len);
Alex Elder6a523252012-07-19 17:12:59 -0500561
Alex Elder621901d2012-08-23 23:22:06 -0500562 /* Record each snapshot's size */
563
Alex Elderd2bb24e2012-07-26 23:37:14 -0500564 size = snap_count * sizeof (*header->snap_sizes);
565 header->snap_sizes = kmalloc(size, GFP_KERNEL);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700566 if (!header->snap_sizes)
Alex Elder6a523252012-07-19 17:12:59 -0500567 goto out_err;
Alex Elder621901d2012-08-23 23:22:06 -0500568 for (i = 0; i < snap_count; i++)
569 header->snap_sizes[i] =
570 le64_to_cpu(ondisk->snaps[i].image_size);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700571 } else {
Alex Elderccece232012-07-10 20:30:10 -0500572 WARN_ON(ondisk->snap_names_len);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700573 header->snap_names = NULL;
574 header->snap_sizes = NULL;
575 }
Alex Elder849b4262012-07-09 21:04:24 -0500576
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700577 header->image_size = le64_to_cpu(ondisk->image_size);
578 header->obj_order = ondisk->options.order;
579 header->crypt_type = ondisk->options.crypt_type;
580 header->comp_type = ondisk->options.comp_type;
Alex Elder6a523252012-07-19 17:12:59 -0500581 header->total_snaps = snap_count;
582
Alex Elder621901d2012-08-23 23:22:06 -0500583 /* Allocate and fill in the snapshot context */
584
Alex Elder6a523252012-07-19 17:12:59 -0500585 size = sizeof (struct ceph_snap_context);
586 size += snap_count * sizeof (header->snapc->snaps[0]);
587 header->snapc = kzalloc(size, GFP_KERNEL);
588 if (!header->snapc)
589 goto out_err;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700590
591 atomic_set(&header->snapc->nref, 1);
Alex Elder505cbb92012-07-19 08:49:18 -0500592 header->snapc->seq = le64_to_cpu(ondisk->snap_seq);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700593 header->snapc->num_snaps = snap_count;
Alex Elder621901d2012-08-23 23:22:06 -0500594 for (i = 0; i < snap_count; i++)
595 header->snapc->snaps[i] =
596 le64_to_cpu(ondisk->snaps[i].id);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700597
598 return 0;
599
Alex Elder6a523252012-07-19 17:12:59 -0500600out_err:
Alex Elder849b4262012-07-09 21:04:24 -0500601 kfree(header->snap_sizes);
Alex Elderccece232012-07-10 20:30:10 -0500602 header->snap_sizes = NULL;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700603 kfree(header->snap_names);
Alex Elderccece232012-07-10 20:30:10 -0500604 header->snap_names = NULL;
Alex Elder6a523252012-07-19 17:12:59 -0500605 kfree(header->object_prefix);
606 header->object_prefix = NULL;
Alex Elderccece232012-07-10 20:30:10 -0500607
Alex Elder00f1f362012-02-07 12:03:36 -0600608 return -ENOMEM;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700609}
610
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700611static int snap_by_name(struct rbd_image_header *header, const char *snap_name,
612 u64 *seq, u64 *size)
613{
614 int i;
615 char *p = header->snap_names;
616
Alex Elder00f1f362012-02-07 12:03:36 -0600617 for (i = 0; i < header->total_snaps; i++) {
618 if (!strcmp(snap_name, p)) {
619
620 /* Found it. Pass back its id and/or size */
621
622 if (seq)
623 *seq = header->snapc->snaps[i];
624 if (size)
625 *size = header->snap_sizes[i];
626 return i;
627 }
628 p += strlen(p) + 1; /* Skip ahead to the next name */
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700629 }
Alex Elder00f1f362012-02-07 12:03:36 -0600630 return -ENOENT;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700631}
632
/*
 * Set the device's mapped snapshot from rbd_dev->snap_name.
 *
 * The reserved name RBD_SNAP_HEAD_NAME maps the live image (writable
 * unless the read_only option was given); any other name selects an
 * existing snapshot, which is always mapped read-only.  If @size is
 * non-NULL it receives the size of the selected image or snapshot.
 * Returns 0 on success or the (negative) snap_by_name() error.
 */
static int rbd_header_set_snap(struct rbd_device *rbd_dev, u64 *size)
{
	int ret;

	down_write(&rbd_dev->header_rwsem);

	if (!memcmp(rbd_dev->snap_name, RBD_SNAP_HEAD_NAME,
		    sizeof (RBD_SNAP_HEAD_NAME))) {
		/* Mapping the base image, not a snapshot */
		rbd_dev->snap_id = CEPH_NOSNAP;
		rbd_dev->snap_exists = false;
		rbd_dev->read_only = rbd_dev->rbd_opts.read_only;
		if (size)
			*size = rbd_dev->header.image_size;
	} else {
		u64 snap_id = 0;

		ret = snap_by_name(&rbd_dev->header, rbd_dev->snap_name,
					&snap_id, size);
		if (ret < 0)
			goto done;
		rbd_dev->snap_id = snap_id;
		rbd_dev->snap_exists = true;
		rbd_dev->read_only = true;	/* No choice for snapshots */
	}

	ret = 0;
done:
	up_write(&rbd_dev->header_rwsem);
	return ret;
}
663
664static void rbd_header_free(struct rbd_image_header *header)
665{
Alex Elder849b4262012-07-09 21:04:24 -0500666 kfree(header->object_prefix);
Alex Elderd78fd7a2012-07-26 23:37:14 -0500667 header->object_prefix = NULL;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700668 kfree(header->snap_sizes);
Alex Elderd78fd7a2012-07-26 23:37:14 -0500669 header->snap_sizes = NULL;
Alex Elder849b4262012-07-09 21:04:24 -0500670 kfree(header->snap_names);
Alex Elderd78fd7a2012-07-26 23:37:14 -0500671 header->snap_names = NULL;
Josh Durgind1d25642011-12-05 14:03:05 -0800672 ceph_put_snap_context(header->snapc);
Alex Elderd78fd7a2012-07-26 23:37:14 -0500673 header->snapc = NULL;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700674}
675
676/*
677 * get the actual striped segment name, offset and length
678 */
679static u64 rbd_get_segment(struct rbd_image_header *header,
Alex Elderca1e49a2012-07-10 20:30:09 -0500680 const char *object_prefix,
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700681 u64 ofs, u64 len,
682 char *seg_name, u64 *segofs)
683{
684 u64 seg = ofs >> header->obj_order;
685
686 if (seg_name)
687 snprintf(seg_name, RBD_MAX_SEG_NAME_LEN,
Alex Elderca1e49a2012-07-10 20:30:09 -0500688 "%s.%012llx", object_prefix, seg);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700689
690 ofs = ofs & ((1 << header->obj_order) - 1);
691 len = min_t(u64, len, (1 << header->obj_order) - ofs);
692
693 if (segofs)
694 *segofs = ofs;
695
696 return len;
697}
698
Yehuda Sadeh1fec7092011-05-13 13:52:56 -0700699static int rbd_get_num_segments(struct rbd_image_header *header,
700 u64 ofs, u64 len)
701{
702 u64 start_seg = ofs >> header->obj_order;
703 u64 end_seg = (ofs + len - 1) >> header->obj_order;
704 return end_seg - start_seg + 1;
705}
706
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700707/*
Josh Durgin029bcbd2011-07-22 11:35:23 -0700708 * returns the size of an object in the image
709 */
710static u64 rbd_obj_bytes(struct rbd_image_header *header)
711{
712 return 1 << header->obj_order;
713}
714
715/*
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700716 * bio helpers
717 */
718
719static void bio_chain_put(struct bio *chain)
720{
721 struct bio *tmp;
722
723 while (chain) {
724 tmp = chain;
725 chain = chain->bi_next;
726 bio_put(tmp);
727 }
728}
729
/*
 * zeros a bio chain, starting at specific offset
 *
 * Walks every segment of every bio in the chain and clears all bytes
 * at or beyond @start_ofs, where @start_ofs is a byte offset from the
 * beginning of the chain's data.
 */
static void zero_bio_chain(struct bio *chain, int start_ofs)
{
	struct bio_vec *bv;
	unsigned long flags;
	void *buf;
	int i;
	int pos = 0;	/* byte offset of the current segment in the chain */

	while (chain) {
		bio_for_each_segment(bv, chain, i) {
			if (pos + bv->bv_len > start_ofs) {
				/* Zero from start_ofs (or segment start) */
				int remainder = max(start_ofs - pos, 0);
				buf = bvec_kmap_irq(bv, &flags);
				memset(buf + remainder, 0,
				       bv->bv_len - remainder);
				bvec_kunmap_irq(buf, &flags);
			}
			pos += bv->bv_len;
		}

		chain = chain->bi_next;
	}
}
756
/*
 * bio_chain_clone - clone a chain of bios up to a certain length.
 * might return a bio_pair that will need to be released.
 *
 * Clones bios from *old into a new chain until @len bytes are covered,
 * splitting the final bio if it straddles the boundary.  On return
 * *old points at the first unconsumed bio and *next at the
 * continuation point (possibly the second half of a split).
 * Returns the new chain, or NULL on allocation failure.
 */
static struct bio *bio_chain_clone(struct bio **old, struct bio **next,
				   struct bio_pair **bp,
				   int len, gfp_t gfpmask)
{
	struct bio *tmp, *old_chain = *old, *new_chain = NULL, *tail = NULL;
	int total = 0;

	/* Release any leftover split from a previous call */
	if (*bp) {
		bio_pair_release(*bp);
		*bp = NULL;
	}

	while (old_chain && (total < len)) {
		tmp = bio_kmalloc(gfpmask, old_chain->bi_max_vecs);
		if (!tmp)
			goto err_out;

		if (total + old_chain->bi_size > len) {
			/*
			 * NOTE(review): this local 'bp' shadows the 'bp'
			 * out-parameter, so the new pair is never stored
			 * in *bp for the caller to release — verify
			 * against callers that this is intended.
			 */
			struct bio_pair *bp;

			/*
			 * this split can only happen with a single paged bio,
			 * split_bio will BUG_ON if this is not the case
			 */
			dout("bio_chain_clone split! total=%d remaining=%d"
			     "bi_size=%u\n",
			     total, len - total, old_chain->bi_size);

			/* split the bio. We'll release it either in the next
			   call, or it will have to be released outside */
			bp = bio_split(old_chain, (len - total) / SECTOR_SIZE);
			if (!bp)
				goto err_out;

			__bio_clone(tmp, &bp->bio1);

			*next = &bp->bio2;
		} else {
			__bio_clone(tmp, old_chain);
			*next = old_chain->bi_next;
		}

		tmp->bi_bdev = NULL;
		/* Only the first allocation in the loop may block */
		gfpmask &= ~__GFP_WAIT;
		tmp->bi_next = NULL;

		if (!new_chain) {
			new_chain = tail = tmp;
		} else {
			tail->bi_next = tmp;
			tail = tmp;
		}
		old_chain = old_chain->bi_next;

		total += tmp->bi_size;
	}

	BUG_ON(total < len);

	if (tail)
		tail->bi_next = NULL;

	*old = old_chain;

	return new_chain;

err_out:
	dout("bio_chain_clone with err\n");
	bio_chain_put(new_chain);
	return NULL;
}
832
833/*
834 * helpers for osd request op vectors.
835 */
Alex Elder57cfc102012-06-26 12:57:03 -0700836static struct ceph_osd_req_op *rbd_create_rw_ops(int num_ops,
837 int opcode, u32 payload_len)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700838{
Alex Elder57cfc102012-06-26 12:57:03 -0700839 struct ceph_osd_req_op *ops;
840
841 ops = kzalloc(sizeof (*ops) * (num_ops + 1), GFP_NOIO);
842 if (!ops)
843 return NULL;
844
845 ops[0].op = opcode;
846
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700847 /*
848 * op extent offset and length will be set later on
849 * in calc_raw_layout()
850 */
Alex Elder57cfc102012-06-26 12:57:03 -0700851 ops[0].payload_len = payload_len;
852
853 return ops;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700854}
855
/* Free an op vector allocated by rbd_create_rw_ops() (NULL is a no-op). */
static void rbd_destroy_ops(struct ceph_osd_req_op *ops)
{
	kfree(ops);
}
860
Yehuda Sadeh1fec7092011-05-13 13:52:56 -0700861static void rbd_coll_end_req_index(struct request *rq,
862 struct rbd_req_coll *coll,
863 int index,
864 int ret, u64 len)
865{
866 struct request_queue *q;
867 int min, max, i;
868
Alex Elderbd919d42012-07-13 20:35:11 -0500869 dout("rbd_coll_end_req_index %p index %d ret %d len %llu\n",
870 coll, index, ret, (unsigned long long) len);
Yehuda Sadeh1fec7092011-05-13 13:52:56 -0700871
872 if (!rq)
873 return;
874
875 if (!coll) {
876 blk_end_request(rq, ret, len);
877 return;
878 }
879
880 q = rq->q;
881
882 spin_lock_irq(q->queue_lock);
883 coll->status[index].done = 1;
884 coll->status[index].rc = ret;
885 coll->status[index].bytes = len;
886 max = min = coll->num_done;
887 while (max < coll->total && coll->status[max].done)
888 max++;
889
890 for (i = min; i<max; i++) {
891 __blk_end_request(rq, coll->status[i].rc,
892 coll->status[i].bytes);
893 coll->num_done++;
894 kref_put(&coll->kref, rbd_coll_release);
895 }
896 spin_unlock_irq(q->queue_lock);
897}
898
/* Complete the collection slot owned by @req with the given status. */
static void rbd_coll_end_req(struct rbd_request *req,
			     int ret, u64 len)
{
	rbd_coll_end_req_index(req->rq, req->coll, req->coll_index, ret, len);
}
904
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700905/*
906 * Send ceph osd request
907 */
static int rbd_do_request(struct request *rq,
			  struct rbd_device *rbd_dev,
			  struct ceph_snap_context *snapc,
			  u64 snapid,
			  const char *object_name, u64 ofs, u64 len,
			  struct bio *bio,
			  struct page **pages,
			  int num_pages,
			  int flags,
			  struct ceph_osd_req_op *ops,
			  struct rbd_req_coll *coll,
			  int coll_index,
			  void (*rbd_cb)(struct ceph_osd_request *req,
					 struct ceph_msg *msg),
			  struct ceph_osd_request **linger_req,
			  u64 *ver)
{
	struct ceph_osd_request *req;
	struct ceph_file_layout *layout;
	int ret;
	u64 bno;
	struct timespec mtime = CURRENT_TIME;
	struct rbd_request *req_data;	/* per-request bookkeeping, freed on completion */
	struct ceph_osd_request_head *reqhead;
	struct ceph_osd_client *osdc;

	req_data = kzalloc(sizeof(*req_data), GFP_NOIO);
	if (!req_data) {
		/* still must complete our slot in the collection */
		if (coll)
			rbd_coll_end_req_index(rq, coll, coll_index,
					       -ENOMEM, len);
		return -ENOMEM;
	}

	if (coll) {
		req_data->coll = coll;
		req_data->coll_index = coll_index;
	}

	dout("rbd_do_request object_name=%s ofs=%llu len=%llu\n", object_name,
		(unsigned long long) ofs, (unsigned long long) len);

	osdc = &rbd_dev->rbd_client->client->osdc;
	req = ceph_osdc_alloc_request(osdc, flags, snapc, ops,
					false, GFP_NOIO, pages, bio);
	if (!req) {
		ret = -ENOMEM;
		goto done_pages;
	}

	/* a NULL rbd_cb makes this a synchronous request (waited on below) */
	req->r_callback = rbd_cb;

	req_data->rq = rq;
	req_data->bio = bio;
	req_data->pages = pages;
	req_data->len = len;

	req->r_priv = req_data;

	reqhead = req->r_request->front.iov_base;
	reqhead->snapid = cpu_to_le64(CEPH_NOSNAP);

	/* NOTE(review): strncpy() does not NUL-terminate if object_name
	 * fills r_oid completely, and strlen() below would then overrun —
	 * confirm callers guarantee object_name < sizeof(req->r_oid) */
	strncpy(req->r_oid, object_name, sizeof(req->r_oid));
	req->r_oid_len = strlen(req->r_oid);

	/* one object per stripe unit: stripe unit == object size */
	layout = &req->r_file_layout;
	memset(layout, 0, sizeof(*layout));
	layout->fl_stripe_unit = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
	layout->fl_stripe_count = cpu_to_le32(1);
	layout->fl_object_size = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
	layout->fl_pg_pool = cpu_to_le32(rbd_dev->pool_id);
	ceph_calc_raw_layout(osdc, layout, snapid, ofs, &len, &bno,
				req, ops);

	ceph_osdc_build_request(req, ofs, &len,
				ops,
				snapc,
				&mtime,
				req->r_oid, req->r_oid_len);

	if (linger_req) {
		/* keep the request registered so it is resent across
		 * OSD map changes; caller owns the returned reference */
		ceph_osdc_set_request_linger(osdc, req);
		*linger_req = req;
	}

	ret = ceph_osdc_start_request(osdc, req, false);
	if (ret < 0)
		goto done_err;

	if (!rbd_cb) {
		/* synchronous: wait here and drop the request ourselves */
		ret = ceph_osdc_wait_request(osdc, req);
		if (ver)
			*ver = le64_to_cpu(req->r_reassert_version.version);
		dout("reassert_ver=%llu\n",
			(unsigned long long)
				le64_to_cpu(req->r_reassert_version.version));
		ceph_osdc_put_request(req);
	}
	return ret;

done_err:
	bio_chain_put(req_data->bio);
	ceph_osdc_put_request(req);
done_pages:
	/* error path: complete the collection slot and free bookkeeping */
	rbd_coll_end_req(req_data, ret, len);
	kfree(req_data);
	return ret;
}
1016
1017/*
1018 * Ceph osd op callback
1019 */
1020static void rbd_req_cb(struct ceph_osd_request *req, struct ceph_msg *msg)
1021{
1022 struct rbd_request *req_data = req->r_priv;
1023 struct ceph_osd_reply_head *replyhead;
1024 struct ceph_osd_op *op;
1025 __s32 rc;
1026 u64 bytes;
1027 int read_op;
1028
1029 /* parse reply */
1030 replyhead = msg->front.iov_base;
1031 WARN_ON(le32_to_cpu(replyhead->num_ops) == 0);
1032 op = (void *)(replyhead + 1);
1033 rc = le32_to_cpu(replyhead->result);
1034 bytes = le64_to_cpu(op->extent.length);
Dan Carpenter895cfcc2012-06-06 09:15:33 -05001035 read_op = (le16_to_cpu(op->op) == CEPH_OSD_OP_READ);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001036
Alex Elderbd919d42012-07-13 20:35:11 -05001037 dout("rbd_req_cb bytes=%llu readop=%d rc=%d\n",
1038 (unsigned long long) bytes, read_op, (int) rc);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001039
1040 if (rc == -ENOENT && read_op) {
1041 zero_bio_chain(req_data->bio, 0);
1042 rc = 0;
1043 } else if (rc == 0 && read_op && bytes < req_data->len) {
1044 zero_bio_chain(req_data->bio, bytes);
1045 bytes = req_data->len;
1046 }
1047
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001048 rbd_coll_end_req(req_data, rc, bytes);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001049
1050 if (req_data->bio)
1051 bio_chain_put(req_data->bio);
1052
1053 ceph_osdc_put_request(req);
1054 kfree(req_data);
1055}
1056
/* Minimal completion callback: just drop the osd request reference. */
static void rbd_simple_req_cb(struct ceph_osd_request *req, struct ceph_msg *msg)
{
	ceph_osdc_put_request(req);
}
1061
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001062/*
1063 * Do a synchronous ceph osd operation
1064 */
Alex Elder0ce1a792012-07-03 16:01:18 -05001065static int rbd_req_sync_op(struct rbd_device *rbd_dev,
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001066 struct ceph_snap_context *snapc,
1067 u64 snapid,
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001068 int flags,
Alex Elder913d2fd2012-06-26 12:57:03 -07001069 struct ceph_osd_req_op *ops,
Alex Elderaded07e2012-07-03 16:01:18 -05001070 const char *object_name,
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001071 u64 ofs, u64 len,
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001072 char *buf,
1073 struct ceph_osd_request **linger_req,
1074 u64 *ver)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001075{
1076 int ret;
1077 struct page **pages;
1078 int num_pages;
Alex Elder913d2fd2012-06-26 12:57:03 -07001079
1080 BUG_ON(ops == NULL);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001081
1082 num_pages = calc_pages_for(ofs , len);
1083 pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL);
Dan Carpenterb8d06382010-10-11 21:14:23 +02001084 if (IS_ERR(pages))
1085 return PTR_ERR(pages);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001086
Alex Elder0ce1a792012-07-03 16:01:18 -05001087 ret = rbd_do_request(NULL, rbd_dev, snapc, snapid,
Alex Elderaded07e2012-07-03 16:01:18 -05001088 object_name, ofs, len, NULL,
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001089 pages, num_pages,
1090 flags,
1091 ops,
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001092 NULL, 0,
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001093 NULL,
1094 linger_req, ver);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001095 if (ret < 0)
Alex Elder913d2fd2012-06-26 12:57:03 -07001096 goto done;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001097
1098 if ((flags & CEPH_OSD_FLAG_READ) && buf)
1099 ret = ceph_copy_from_page_vector(pages, buf, ofs, ret);
1100
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001101done:
1102 ceph_release_page_vector(pages, num_pages);
1103 return ret;
1104}
1105
1106/*
1107 * Do an asynchronous ceph osd operation
1108 */
1109static int rbd_do_op(struct request *rq,
Alex Elder0ce1a792012-07-03 16:01:18 -05001110 struct rbd_device *rbd_dev,
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001111 struct ceph_snap_context *snapc,
1112 u64 snapid,
Alex Elderd1f57ea2012-06-26 12:57:03 -07001113 int opcode, int flags,
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001114 u64 ofs, u64 len,
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001115 struct bio *bio,
1116 struct rbd_req_coll *coll,
1117 int coll_index)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001118{
1119 char *seg_name;
1120 u64 seg_ofs;
1121 u64 seg_len;
1122 int ret;
1123 struct ceph_osd_req_op *ops;
1124 u32 payload_len;
1125
1126 seg_name = kmalloc(RBD_MAX_SEG_NAME_LEN + 1, GFP_NOIO);
1127 if (!seg_name)
1128 return -ENOMEM;
1129
1130 seg_len = rbd_get_segment(&rbd_dev->header,
Alex Elderca1e49a2012-07-10 20:30:09 -05001131 rbd_dev->header.object_prefix,
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001132 ofs, len,
1133 seg_name, &seg_ofs);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001134
1135 payload_len = (flags & CEPH_OSD_FLAG_WRITE ? seg_len : 0);
1136
Alex Elder57cfc102012-06-26 12:57:03 -07001137 ret = -ENOMEM;
1138 ops = rbd_create_rw_ops(1, opcode, payload_len);
1139 if (!ops)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001140 goto done;
1141
1142 /* we've taken care of segment sizes earlier when we
1143 cloned the bios. We should never have a segment
1144 truncated at this point */
1145 BUG_ON(seg_len < len);
1146
1147 ret = rbd_do_request(rq, rbd_dev, snapc, snapid,
1148 seg_name, seg_ofs, seg_len,
1149 bio,
1150 NULL, 0,
1151 flags,
1152 ops,
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001153 coll, coll_index,
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001154 rbd_req_cb, 0, NULL);
Sage Weil11f77002011-05-12 16:13:54 -07001155
1156 rbd_destroy_ops(ops);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001157done:
1158 kfree(seg_name);
1159 return ret;
1160}
1161
1162/*
1163 * Request async osd write
1164 */
static int rbd_req_write(struct request *rq,
			 struct rbd_device *rbd_dev,
			 struct ceph_snap_context *snapc,
			 u64 ofs, u64 len,
			 struct bio *bio,
			 struct rbd_req_coll *coll,
			 int coll_index)
{
	/* writes go to the head (CEPH_NOSNAP) with the supplied snap
	 * context, and must be acknowledged only once on disk (ONDISK) */
	return rbd_do_op(rq, rbd_dev, snapc, CEPH_NOSNAP,
			 CEPH_OSD_OP_WRITE,
			 CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
			 ofs, len, bio, coll, coll_index);
}
1178
1179/*
1180 * Request async osd read
1181 */
static int rbd_req_read(struct request *rq,
			struct rbd_device *rbd_dev,
			u64 snapid,
			u64 ofs, u64 len,
			struct bio *bio,
			struct rbd_req_coll *coll,
			int coll_index)
{
	/* reads need no snap context; they address @snapid directly */
	return rbd_do_op(rq, rbd_dev, NULL,
			 snapid,
			 CEPH_OSD_OP_READ,
			 CEPH_OSD_FLAG_READ,
			 ofs, len, bio, coll, coll_index);
}
1196
1197/*
1198 * Request sync osd read
1199 */
Alex Elder0ce1a792012-07-03 16:01:18 -05001200static int rbd_req_sync_read(struct rbd_device *rbd_dev,
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001201 u64 snapid,
Alex Elderaded07e2012-07-03 16:01:18 -05001202 const char *object_name,
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001203 u64 ofs, u64 len,
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001204 char *buf,
1205 u64 *ver)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001206{
Alex Elder913d2fd2012-06-26 12:57:03 -07001207 struct ceph_osd_req_op *ops;
1208 int ret;
1209
1210 ops = rbd_create_rw_ops(1, CEPH_OSD_OP_READ, 0);
1211 if (!ops)
1212 return -ENOMEM;
1213
1214 ret = rbd_req_sync_op(rbd_dev, NULL,
Josh Durginb06e6a62011-11-21 18:16:52 -08001215 snapid,
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001216 CEPH_OSD_FLAG_READ,
Alex Elder913d2fd2012-06-26 12:57:03 -07001217 ops, object_name, ofs, len, buf, NULL, ver);
1218 rbd_destroy_ops(ops);
1219
1220 return ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001221}
1222
/*
 * Acknowledge a watch notification on the header object (notify_ack)
 */
static int rbd_req_sync_notify_ack(struct rbd_device *rbd_dev,
				   u64 ver,
				   u64 notify_id)
{
	struct ceph_osd_req_op *ops;
	int ret;

	ops = rbd_create_rw_ops(1, CEPH_OSD_OP_NOTIFY_ACK, 0);
	if (!ops)
		return -ENOMEM;

	/* NOTE(review): ver is byte-swapped here but cookie is stored raw,
	 * while rbd_req_sync_watch() swaps the cookie too — confirm which
	 * fields the osd client converts before this inconsistency bites */
	ops[0].watch.ver = cpu_to_le64(ver);
	ops[0].watch.cookie = notify_id;
	ops[0].watch.flag = 0;

	/* fire-and-forget: rbd_simple_req_cb just drops the request ref */
	ret = rbd_do_request(NULL, rbd_dev, NULL, CEPH_NOSNAP,
			  rbd_dev->header_name, 0, 0, NULL,
			  NULL, 0,
			  CEPH_OSD_FLAG_READ,
			  ops,
			  NULL, 0,
			  rbd_simple_req_cb, 0, NULL);

	rbd_destroy_ops(ops);
	return ret;
}
1252
1253static void rbd_watch_cb(u64 ver, u64 notify_id, u8 opcode, void *data)
1254{
Alex Elder0ce1a792012-07-03 16:01:18 -05001255 struct rbd_device *rbd_dev = (struct rbd_device *)data;
Josh Durgina71b8912011-12-05 18:10:44 -08001256 u64 hver;
Sage Weil13143d22011-05-12 16:08:30 -07001257 int rc;
1258
Alex Elder0ce1a792012-07-03 16:01:18 -05001259 if (!rbd_dev)
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001260 return;
1261
Alex Elderbd919d42012-07-13 20:35:11 -05001262 dout("rbd_watch_cb %s notify_id=%llu opcode=%u\n",
1263 rbd_dev->header_name, (unsigned long long) notify_id,
1264 (unsigned int) opcode);
Alex Elder1fe5e992012-07-25 09:32:41 -05001265 rc = rbd_refresh_header(rbd_dev, &hver);
Sage Weil13143d22011-05-12 16:08:30 -07001266 if (rc)
Alex Elderf0f8cef2012-01-29 13:57:44 -06001267 pr_warning(RBD_DRV_NAME "%d got notification but failed to "
Alex Elder0ce1a792012-07-03 16:01:18 -05001268 " update snaps: %d\n", rbd_dev->major, rc);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001269
Alex Elder7f0a24d2012-07-25 09:32:40 -05001270 rbd_req_sync_notify_ack(rbd_dev, hver, notify_id);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001271}
1272
1273/*
1274 * Request sync osd watch
1275 */
static int rbd_req_sync_watch(struct rbd_device *rbd_dev)
{
	struct ceph_osd_req_op *ops;
	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
	int ret;

	ops = rbd_create_rw_ops(1, CEPH_OSD_OP_WATCH, 0);
	if (!ops)
		return -ENOMEM;

	/* register the local event that rbd_watch_cb will be driven by */
	ret = ceph_osdc_create_event(osdc, rbd_watch_cb, 0,
				     (void *)rbd_dev, &rbd_dev->watch_event);
	if (ret < 0)
		goto fail;

	ops[0].watch.ver = cpu_to_le64(rbd_dev->header.obj_version);
	ops[0].watch.cookie = cpu_to_le64(rbd_dev->watch_event->cookie);
	ops[0].watch.flag = 1;	/* 1 == establish the watch */

	/* lingering request: resent across osdmap changes, reference kept
	 * in rbd_dev->watch_request until rbd_req_sync_unwatch() */
	ret = rbd_req_sync_op(rbd_dev, NULL,
			      CEPH_NOSNAP,
			      CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
			      ops,
			      rbd_dev->header_name,
			      0, 0, NULL,
			      &rbd_dev->watch_request, NULL);

	if (ret < 0)
		goto fail_event;

	rbd_destroy_ops(ops);
	return 0;

fail_event:
	/* undo the event registration if the watch was never established */
	ceph_osdc_cancel_event(rbd_dev->watch_event);
	rbd_dev->watch_event = NULL;
fail:
	rbd_destroy_ops(ops);
	return ret;
}
1316
Yehuda Sadeh79e3057c2011-07-12 16:56:57 -07001317/*
1318 * Request sync osd unwatch
1319 */
Alex Elder070c6332012-07-25 09:32:41 -05001320static int rbd_req_sync_unwatch(struct rbd_device *rbd_dev)
Yehuda Sadeh79e3057c2011-07-12 16:56:57 -07001321{
1322 struct ceph_osd_req_op *ops;
Alex Elder57cfc102012-06-26 12:57:03 -07001323 int ret;
Yehuda Sadeh79e3057c2011-07-12 16:56:57 -07001324
Alex Elder57cfc102012-06-26 12:57:03 -07001325 ops = rbd_create_rw_ops(1, CEPH_OSD_OP_WATCH, 0);
1326 if (!ops)
1327 return -ENOMEM;
Yehuda Sadeh79e3057c2011-07-12 16:56:57 -07001328
1329 ops[0].watch.ver = 0;
Alex Elder0ce1a792012-07-03 16:01:18 -05001330 ops[0].watch.cookie = cpu_to_le64(rbd_dev->watch_event->cookie);
Yehuda Sadeh79e3057c2011-07-12 16:56:57 -07001331 ops[0].watch.flag = 0;
1332
Alex Elder0ce1a792012-07-03 16:01:18 -05001333 ret = rbd_req_sync_op(rbd_dev, NULL,
Yehuda Sadeh79e3057c2011-07-12 16:56:57 -07001334 CEPH_NOSNAP,
Yehuda Sadeh79e3057c2011-07-12 16:56:57 -07001335 CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
1336 ops,
Alex Elder070c6332012-07-25 09:32:41 -05001337 rbd_dev->header_name,
1338 0, 0, NULL, NULL, NULL);
1339
Yehuda Sadeh79e3057c2011-07-12 16:56:57 -07001340
1341 rbd_destroy_ops(ops);
Alex Elder0ce1a792012-07-03 16:01:18 -05001342 ceph_osdc_cancel_event(rbd_dev->watch_event);
1343 rbd_dev->watch_event = NULL;
Yehuda Sadeh79e3057c2011-07-12 16:56:57 -07001344 return ret;
1345}
1346
/* Context passed to rbd_notify_cb() via ceph_osdc_create_event(). */
struct rbd_notify_info {
	struct rbd_device *rbd_dev;	/* device whose header was notified */
};
1350
1351static void rbd_notify_cb(u64 ver, u64 notify_id, u8 opcode, void *data)
1352{
Alex Elder0ce1a792012-07-03 16:01:18 -05001353 struct rbd_device *rbd_dev = (struct rbd_device *)data;
1354 if (!rbd_dev)
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001355 return;
1356
Alex Elderbd919d42012-07-13 20:35:11 -05001357 dout("rbd_notify_cb %s notify_id=%llu opcode=%u\n",
1358 rbd_dev->header_name, (unsigned long long) notify_id,
1359 (unsigned int) opcode);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001360}
1361
1362/*
1363 * Request sync osd notify
1364 */
static int rbd_req_sync_notify(struct rbd_device *rbd_dev)
{
	struct ceph_osd_req_op *ops;
	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
	struct ceph_osd_event *event;
	struct rbd_notify_info info;	/* stack-local; lives only for this call */
	int payload_len = sizeof(u32) + sizeof(u32);	/* ver + timeout */
	int ret;

	ops = rbd_create_rw_ops(1, CEPH_OSD_OP_NOTIFY, payload_len);
	if (!ops)
		return -ENOMEM;

	info.rbd_dev = rbd_dev;

	/* one-shot event: the callback receives a pointer to the on-stack
	 * info struct, which is only valid until this function returns */
	ret = ceph_osdc_create_event(osdc, rbd_notify_cb, 1,
				     (void *)&info, &event);
	if (ret < 0)
		goto fail;

	ops[0].watch.ver = 1;
	ops[0].watch.flag = 1;
	ops[0].watch.cookie = event->cookie;
	ops[0].watch.prot_ver = RADOS_NOTIFY_VER;
	ops[0].watch.timeout = 12;	/* seconds */

	ret = rbd_req_sync_op(rbd_dev, NULL,
			      CEPH_NOSNAP,
			      CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
			      ops,
			      rbd_dev->header_name,
			      0, 0, NULL, NULL, NULL);
	if (ret < 0)
		goto fail_event;

	/* wait for all watchers to ack; a wait failure is only logged,
	 * the function still reports success */
	ret = ceph_osdc_wait_event(event, CEPH_OSD_TIMEOUT_DEFAULT);
	dout("ceph_osdc_wait_event returned %d\n", ret);
	rbd_destroy_ops(ops);
	return 0;

fail_event:
	ceph_osdc_cancel_event(event);
fail:
	rbd_destroy_ops(ops);
	return ret;
}
1411
/*
 * Synchronously invoke a method of an OSD object class (exec)
 */
Alex Elder0ce1a792012-07-03 16:01:18 -05001415static int rbd_req_sync_exec(struct rbd_device *rbd_dev,
Alex Elderaded07e2012-07-03 16:01:18 -05001416 const char *object_name,
1417 const char *class_name,
1418 const char *method_name,
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001419 const char *data,
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001420 int len,
1421 u64 *ver)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001422{
1423 struct ceph_osd_req_op *ops;
Alex Elderaded07e2012-07-03 16:01:18 -05001424 int class_name_len = strlen(class_name);
1425 int method_name_len = strlen(method_name);
Alex Elder57cfc102012-06-26 12:57:03 -07001426 int ret;
1427
1428 ops = rbd_create_rw_ops(1, CEPH_OSD_OP_CALL,
Alex Elderaded07e2012-07-03 16:01:18 -05001429 class_name_len + method_name_len + len);
Alex Elder57cfc102012-06-26 12:57:03 -07001430 if (!ops)
1431 return -ENOMEM;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001432
Alex Elderaded07e2012-07-03 16:01:18 -05001433 ops[0].cls.class_name = class_name;
1434 ops[0].cls.class_len = (__u8) class_name_len;
1435 ops[0].cls.method_name = method_name;
1436 ops[0].cls.method_len = (__u8) method_name_len;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001437 ops[0].cls.argc = 0;
1438 ops[0].cls.indata = data;
1439 ops[0].cls.indata_len = len;
1440
Alex Elder0ce1a792012-07-03 16:01:18 -05001441 ret = rbd_req_sync_op(rbd_dev, NULL,
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001442 CEPH_NOSNAP,
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001443 CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
1444 ops,
Alex Elderd1f57ea2012-06-26 12:57:03 -07001445 object_name, 0, 0, NULL, NULL, ver);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001446
1447 rbd_destroy_ops(ops);
1448
1449 dout("cls_exec returned %d\n", ret);
1450 return ret;
1451}
1452
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001453static struct rbd_req_coll *rbd_alloc_coll(int num_reqs)
1454{
1455 struct rbd_req_coll *coll =
1456 kzalloc(sizeof(struct rbd_req_coll) +
1457 sizeof(struct rbd_req_status) * num_reqs,
1458 GFP_ATOMIC);
1459
1460 if (!coll)
1461 return NULL;
1462 coll->total = num_reqs;
1463 kref_init(&coll->kref);
1464 return coll;
1465}
1466
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001467/*
1468 * block device queue callback
1469 */
static void rbd_rq_fn(struct request_queue *q)
{
	struct rbd_device *rbd_dev = q->queuedata;
	struct request *rq;
	struct bio_pair *bp = NULL;

	/* called with q->queue_lock held; the lock is dropped while
	 * submitting osd requests and reacquired before fetching the
	 * next request */
	while ((rq = blk_fetch_request(q))) {
		struct bio *bio;
		struct bio *rq_bio, *next_bio = NULL;
		bool do_write;
		unsigned int size;
		u64 op_size = 0;
		u64 ofs;
		int num_segs, cur_seg = 0;
		struct rbd_req_coll *coll;
		struct ceph_snap_context *snapc;

		/* peek at request from block layer */
		if (!rq)
			break;

		dout("fetched request\n");

		/* filter out block requests we don't understand */
		if ((rq->cmd_type != REQ_TYPE_FS)) {
			__blk_end_request_all(rq, 0);
			continue;
		}

		/* deduce our operation (read, write) */
		do_write = (rq_data_dir(rq) == WRITE);

		size = blk_rq_bytes(rq);
		ofs = blk_rq_pos(rq) * SECTOR_SIZE;
		rq_bio = rq->bio;
		if (do_write && rbd_dev->read_only) {
			__blk_end_request_all(rq, -EROFS);
			continue;
		}

		spin_unlock_irq(q->queue_lock);

		/* header_rwsem guards snap_id/snap_exists/header.snapc */
		down_read(&rbd_dev->header_rwsem);

		if (rbd_dev->snap_id != CEPH_NOSNAP && !rbd_dev->snap_exists) {
			up_read(&rbd_dev->header_rwsem);
			dout("request for non-existent snapshot");
			spin_lock_irq(q->queue_lock);
			__blk_end_request_all(rq, -ENXIO);
			continue;
		}

		/* pin the snap context for the duration of this request */
		snapc = ceph_get_snap_context(rbd_dev->header.snapc);

		up_read(&rbd_dev->header_rwsem);

		dout("%s 0x%x bytes at 0x%llx\n",
		     do_write ? "write" : "read",
		     size, (unsigned long long) blk_rq_pos(rq) * SECTOR_SIZE);

		num_segs = rbd_get_num_segments(&rbd_dev->header, ofs, size);
		coll = rbd_alloc_coll(num_segs);
		if (!coll) {
			spin_lock_irq(q->queue_lock);
			__blk_end_request_all(rq, -ENOMEM);
			ceph_put_snap_context(snapc);
			continue;
		}

		/* split the request at object (segment) boundaries and
		 * issue one osd op per segment */
		do {
			/* a bio clone to be passed down to OSD req */
			dout("rq->bio->bi_vcnt=%hu\n", rq->bio->bi_vcnt);
			op_size = rbd_get_segment(&rbd_dev->header,
						  rbd_dev->header.object_prefix,
						  ofs, size,
						  NULL, NULL);
			/* one collection ref per in-flight segment */
			kref_get(&coll->kref);
			bio = bio_chain_clone(&rq_bio, &next_bio, &bp,
					      op_size, GFP_ATOMIC);
			if (!bio) {
				rbd_coll_end_req_index(rq, coll, cur_seg,
						       -ENOMEM, op_size);
				goto next_seg;
			}


			/* init OSD command: write or read */
			if (do_write)
				rbd_req_write(rq, rbd_dev,
					      snapc,
					      ofs,
					      op_size, bio,
					      coll, cur_seg);
			else
				rbd_req_read(rq, rbd_dev,
					     rbd_dev->snap_id,
					     ofs,
					     op_size, bio,
					     coll, cur_seg);

next_seg:
			size -= op_size;
			ofs += op_size;

			cur_seg++;
			rq_bio = next_bio;
		} while (size > 0);
		/* drop the initial reference from rbd_alloc_coll() */
		kref_put(&coll->kref, rbd_coll_release);

		if (bp)
			bio_pair_release(bp);
		spin_lock_irq(q->queue_lock);

		ceph_put_snap_context(snapc);
	}
}
1586
1587/*
1588 * a queue callback. Makes sure that we don't create a bio that spans across
1589 * multiple osd objects. One exception would be with a single page bios,
1590 * which we handle later at bio_chain_clone
1591 */
static int rbd_merge_bvec(struct request_queue *q, struct bvec_merge_data *bmd,
			  struct bio_vec *bvec)
{
	struct rbd_device *rbd_dev = q->queuedata;
	unsigned int chunk_sectors;	/* sectors per rbd object */
	sector_t sector;
	unsigned int bio_sectors;	/* sectors already in the bio */
	int max;

	chunk_sectors = 1 << (rbd_dev->header.obj_order - SECTOR_SHIFT);
	sector = bmd->bi_sector + get_start_sect(bmd->bi_bdev);
	bio_sectors = bmd->bi_size >> SECTOR_SHIFT;

	/* bytes remaining in the object after the bio's current extent;
	 * chunk_sectors is a power of two, so the mask gives the offset
	 * of the bio's start within its object */
	max = (chunk_sectors - ((sector & (chunk_sectors - 1))
				+ bio_sectors)) << SECTOR_SHIFT;
	if (max < 0)
		max = 0; /* bio_add cannot handle a negative return */
	/* an empty bio may always take one bvec, even one that spans an
	 * object boundary: single-page bios are split later in
	 * bio_chain_clone */
	if (max <= bvec->bv_len && bio_sectors == 0)
		return bvec->bv_len;
	return max;
}
1613
/*
 * Release the gendisk, its request queue, and the in-core image
 * header of an rbd device.  Safe to call when no disk was created.
 */
static void rbd_free_disk(struct rbd_device *rbd_dev)
{
	struct gendisk *disk = rbd_dev->disk;

	if (!disk)
		return;

	/* Drop snapshot names/sizes and the snap context in the header */
	rbd_header_free(&rbd_dev->header);

	/* Only unregister the disk if it was actually added */
	if (disk->flags & GENHD_FL_UP)
		del_gendisk(disk);
	if (disk->queue)
		blk_cleanup_queue(disk->queue);
	put_disk(disk);
}
1629
/*
 * Read the complete header for the given rbd device.
 *
 * Returns a pointer to a dynamically-allocated buffer containing
 * the complete and validated header. Caller can pass the address
 * of a variable that will be filled in with the version of the
 * header object at the time it was read.
 *
 * Returns a pointer-coded errno if a failure occurs.
 */
static struct rbd_image_header_ondisk *
rbd_dev_v1_header_read(struct rbd_device *rbd_dev, u64 *version)
{
	struct rbd_image_header_ondisk *ondisk = NULL;
	u32 snap_count = 0;	/* snapshot count assumed when sizing the read */
	u64 names_size = 0;	/* snapshot-name bytes assumed likewise */
	u32 want_count;
	int ret;

	/*
	 * The complete header will include an array of its 64-bit
	 * snapshot ids, followed by the names of those snapshots as
	 * a contiguous block of NUL-terminated strings. Note that
	 * the number of snapshots could change by the time we read
	 * it in, in which case we re-read it.
	 */
	do {
		size_t size;

		kfree(ondisk);	/* no-op on the first pass (NULL) */

		/* Base header + snapshot id array + name block */
		size = sizeof (*ondisk);
		size += snap_count * sizeof (struct rbd_image_snap_ondisk);
		size += names_size;
		ondisk = kmalloc(size, GFP_KERNEL);
		if (!ondisk)
			return ERR_PTR(-ENOMEM);

		ret = rbd_req_sync_read(rbd_dev, CEPH_NOSNAP,
				       rbd_dev->header_name,
				       0, size,
				       (char *) ondisk, version);

		if (ret < 0)
			goto out_err;
		/* A short read means the object shrank under us; give up */
		if (WARN_ON((size_t) ret < size)) {
			ret = -ENXIO;
			pr_warning("short header read for image %s"
					" (want %zd got %d)\n",
				rbd_dev->image_name, size, ret);
			goto out_err;
		}
		if (!rbd_dev_ondisk_valid(ondisk)) {
			ret = -ENXIO;
			pr_warning("invalid header for image %s\n",
				rbd_dev->image_name);
			goto out_err;
		}

		/* Re-read if the snapshot count changed since our guess */
		names_size = le64_to_cpu(ondisk->snap_names_len);
		want_count = snap_count;
		snap_count = le32_to_cpu(ondisk->snap_count);
	} while (snap_count != want_count);

	return ondisk;

out_err:
	kfree(ondisk);

	return ERR_PTR(ret);
}
1701
/*
 * (Re)read the on-disk header and translate it into the in-core
 * image header representation.
 */
static int rbd_read_header(struct rbd_device *rbd_dev,
			   struct rbd_image_header *header)
{
	struct rbd_image_header_ondisk *ondisk;
	u64 ver = 0;	/* header object version at the time of the read */
	int ret;

	ondisk = rbd_dev_v1_header_read(rbd_dev, &ver);
	if (IS_ERR(ondisk))
		return PTR_ERR(ondisk);
	ret = rbd_header_from_disk(header, ondisk);
	if (ret >= 0)
		header->obj_version = ver;	/* record version on success only */
	kfree(ondisk);

	return ret;
}
1722
1723/*
1724 * create a snapshot
1725 */
Alex Elder0ce1a792012-07-03 16:01:18 -05001726static int rbd_header_add_snap(struct rbd_device *rbd_dev,
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001727 const char *snap_name,
1728 gfp_t gfp_flags)
1729{
1730 int name_len = strlen(snap_name);
1731 u64 new_snapid;
1732 int ret;
Sage Weil916d4d62011-05-12 16:10:50 -07001733 void *data, *p, *e;
Alex Elder1dbb4392012-01-24 10:08:37 -06001734 struct ceph_mon_client *monc;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001735
1736 /* we should create a snapshot only if we're pointing at the head */
Alex Elder0ce1a792012-07-03 16:01:18 -05001737 if (rbd_dev->snap_id != CEPH_NOSNAP)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001738 return -EINVAL;
1739
Alex Elder0ce1a792012-07-03 16:01:18 -05001740 monc = &rbd_dev->rbd_client->client->monc;
1741 ret = ceph_monc_create_snapid(monc, rbd_dev->pool_id, &new_snapid);
Alex Elderbd919d42012-07-13 20:35:11 -05001742 dout("created snapid=%llu\n", (unsigned long long) new_snapid);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001743 if (ret < 0)
1744 return ret;
1745
1746 data = kmalloc(name_len + 16, gfp_flags);
1747 if (!data)
1748 return -ENOMEM;
1749
Sage Weil916d4d62011-05-12 16:10:50 -07001750 p = data;
1751 e = data + name_len + 16;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001752
Sage Weil916d4d62011-05-12 16:10:50 -07001753 ceph_encode_string_safe(&p, e, snap_name, name_len, bad);
1754 ceph_encode_64_safe(&p, e, new_snapid, bad);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001755
Alex Elder0bed54d2012-07-03 16:01:18 -05001756 ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name,
Alex Elder0ce1a792012-07-03 16:01:18 -05001757 "rbd", "snap_add",
Alex Elderd67d4be2012-07-13 20:35:11 -05001758 data, p - data, NULL);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001759
Sage Weil916d4d62011-05-12 16:10:50 -07001760 kfree(data);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001761
Alex Elder505cbb92012-07-19 08:49:18 -05001762 return ret < 0 ? ret : 0;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001763bad:
1764 return -ERANGE;
1765}
1766
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001767static void __rbd_remove_all_snaps(struct rbd_device *rbd_dev)
1768{
1769 struct rbd_snap *snap;
Alex Eldera0593292012-07-19 09:09:27 -05001770 struct rbd_snap *next;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001771
Alex Eldera0593292012-07-19 09:09:27 -05001772 list_for_each_entry_safe(snap, next, &rbd_dev->snaps, node)
Alex Elder14e70852012-07-19 09:09:27 -05001773 __rbd_remove_snap_dev(snap);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001774}
1775
/*
 * Re-read the image header and install the fresh copy in rbd_dev.
 * Caller must hold ctl_mutex; the device's header_rwsem is taken
 * here for the in-core update.
 */
static int __rbd_refresh_header(struct rbd_device *rbd_dev, u64 *hver)
{
	int ret;
	struct rbd_image_header h;

	ret = rbd_read_header(rbd_dev, &h);
	if (ret < 0)
		return ret;

	down_write(&rbd_dev->header_rwsem);

	/* resized? (only the head can change size, not a snapshot) */
	if (rbd_dev->snap_id == CEPH_NOSNAP) {
		sector_t size = (sector_t) h.image_size / SECTOR_SIZE;

		dout("setting size to %llu sectors", (unsigned long long) size);
		set_capacity(rbd_dev->disk, size);
	}

	/* rbd_dev->header.object_prefix shouldn't change */
	kfree(rbd_dev->header.snap_sizes);
	kfree(rbd_dev->header.snap_names);
	/* osd requests may still refer to snapc */
	ceph_put_snap_context(rbd_dev->header.snapc);

	if (hver)
		*hver = h.obj_version;
	rbd_dev->header.obj_version = h.obj_version;
	rbd_dev->header.image_size = h.image_size;
	rbd_dev->header.total_snaps = h.total_snaps;
	rbd_dev->header.snapc = h.snapc;
	rbd_dev->header.snap_names = h.snap_names;
	rbd_dev->header.snap_sizes = h.snap_sizes;
	/* Free the extra copy of the object prefix */
	WARN_ON(strcmp(rbd_dev->header.object_prefix, h.object_prefix));
	kfree(h.object_prefix);

	/* Reconcile the snapshot device list with the new snap context */
	ret = __rbd_init_snaps_header(rbd_dev);

	up_write(&rbd_dev->header_rwsem);

	return ret;
}
1822
/*
 * Refresh the in-core header from the on-disk one, serialized by
 * ctl_mutex.  (Acquired with SINGLE_DEPTH_NESTING — presumably some
 * callers already hold a lock of the same lockdep class; confirm.)
 */
static int rbd_refresh_header(struct rbd_device *rbd_dev, u64 *hver)
{
	int ret;

	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
	ret = __rbd_refresh_header(rbd_dev, hver);
	mutex_unlock(&ctl_mutex);

	return ret;
}
1833
/*
 * Read the image header, build the snapshot list, and create and
 * register the gendisk and request queue for the mapped image.
 */
static int rbd_init_disk(struct rbd_device *rbd_dev)
{
	struct gendisk *disk;
	struct request_queue *q;
	int rc;
	u64 segment_size;	/* rbd object size; used for queue limits */
	u64 total_size = 0;	/* bytes in the mapped snapshot or head */

	/* contact OSD, request size info about the object being mapped */
	rc = rbd_read_header(rbd_dev, &rbd_dev->header);
	if (rc)
		return rc;

	/* no need to lock here, as rbd_dev is not registered yet */
	rc = __rbd_init_snaps_header(rbd_dev);
	if (rc)
		return rc;

	/* Select the mapped snapshot (or head) and learn its size */
	rc = rbd_header_set_snap(rbd_dev, &total_size);
	if (rc)
		return rc;

	/* create gendisk info */
	rc = -ENOMEM;
	disk = alloc_disk(RBD_MINORS_PER_MAJOR);
	if (!disk)
		goto out;

	snprintf(disk->disk_name, sizeof(disk->disk_name), RBD_DRV_NAME "%d",
		 rbd_dev->dev_id);
	disk->major = rbd_dev->major;
	disk->first_minor = 0;
	disk->fops = &rbd_bd_ops;
	disk->private_data = rbd_dev;

	/* init rq */
	rc = -ENOMEM;
	q = blk_init_queue(rbd_rq_fn, &rbd_dev->lock);
	if (!q)
		goto out_disk;

	/* We use the default size, but let's be explicit about it. */
	blk_queue_physical_block_size(q, SECTOR_SIZE);

	/* set io sizes to object size */
	segment_size = rbd_obj_bytes(&rbd_dev->header);
	blk_queue_max_hw_sectors(q, segment_size / SECTOR_SIZE);
	blk_queue_max_segment_size(q, segment_size);
	blk_queue_io_min(q, segment_size);
	blk_queue_io_opt(q, segment_size);

	blk_queue_merge_bvec(q, rbd_merge_bvec);
	disk->queue = q;

	q->queuedata = rbd_dev;

	rbd_dev->disk = disk;
	rbd_dev->q = q;

	/* finally, announce the disk to the world */
	set_capacity(disk, total_size / SECTOR_SIZE);
	add_disk(disk);

	pr_info("%s: added with size 0x%llx\n",
		disk->disk_name, (unsigned long long)total_size);
	return 0;

out_disk:
	put_disk(disk);
out:
	/* NOTE(review): the header read above is not freed on these error
	 * paths — presumably the caller cleans up via rbd_free_disk or
	 * similar; confirm against the add/remove paths. */
	return rc;
}
1906
/*
   sysfs
*/

/* Map a sysfs struct device back to its containing rbd_device */
static struct rbd_device *dev_to_rbd_dev(struct device *dev)
{
	return container_of(dev, struct rbd_device, dev);
}
1915
/* sysfs: device size in bytes (capacity is stored in 512-byte sectors) */
static ssize_t rbd_size_show(struct device *dev,
			     struct device_attribute *attr, char *buf)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
	sector_t size;

	/* header_rwsem guards against a concurrent refresh/resize */
	down_read(&rbd_dev->header_rwsem);
	size = get_capacity(rbd_dev->disk);
	up_read(&rbd_dev->header_rwsem);

	return sprintf(buf, "%llu\n", (unsigned long long) size * SECTOR_SIZE);
}
1928
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001929static ssize_t rbd_major_show(struct device *dev,
1930 struct device_attribute *attr, char *buf)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001931{
Alex Elder593a9e72012-02-07 12:03:37 -06001932 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001933
1934 return sprintf(buf, "%d\n", rbd_dev->major);
1935}
1936
1937static ssize_t rbd_client_id_show(struct device *dev,
1938 struct device_attribute *attr, char *buf)
1939{
Alex Elder593a9e72012-02-07 12:03:37 -06001940 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001941
Alex Elder1dbb4392012-01-24 10:08:37 -06001942 return sprintf(buf, "client%lld\n",
1943 ceph_client_id(rbd_dev->rbd_client->client));
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001944}
1945
1946static ssize_t rbd_pool_show(struct device *dev,
1947 struct device_attribute *attr, char *buf)
1948{
Alex Elder593a9e72012-02-07 12:03:37 -06001949 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001950
1951 return sprintf(buf, "%s\n", rbd_dev->pool_name);
1952}
1953
Alex Elder9bb2f332012-07-12 10:46:35 -05001954static ssize_t rbd_pool_id_show(struct device *dev,
1955 struct device_attribute *attr, char *buf)
1956{
1957 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
1958
1959 return sprintf(buf, "%d\n", rbd_dev->pool_id);
1960}
1961
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001962static ssize_t rbd_name_show(struct device *dev,
1963 struct device_attribute *attr, char *buf)
1964{
Alex Elder593a9e72012-02-07 12:03:37 -06001965 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001966
Alex Elder0bed54d2012-07-03 16:01:18 -05001967 return sprintf(buf, "%s\n", rbd_dev->image_name);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001968}
1969
1970static ssize_t rbd_snap_show(struct device *dev,
1971 struct device_attribute *attr,
1972 char *buf)
1973{
Alex Elder593a9e72012-02-07 12:03:37 -06001974 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001975
1976 return sprintf(buf, "%s\n", rbd_dev->snap_name);
1977}
1978
1979static ssize_t rbd_image_refresh(struct device *dev,
1980 struct device_attribute *attr,
1981 const char *buf,
1982 size_t size)
1983{
Alex Elder593a9e72012-02-07 12:03:37 -06001984 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Alex Elderb8136232012-07-25 09:32:41 -05001985 int ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001986
Alex Elder1fe5e992012-07-25 09:32:41 -05001987 ret = rbd_refresh_header(rbd_dev, NULL);
Alex Elderb8136232012-07-25 09:32:41 -05001988
1989 return ret < 0 ? ret : size;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001990}
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001991
/* Per-device sysfs attributes (read-only unless noted otherwise) */
static DEVICE_ATTR(size, S_IRUGO, rbd_size_show, NULL);
static DEVICE_ATTR(major, S_IRUGO, rbd_major_show, NULL);
static DEVICE_ATTR(client_id, S_IRUGO, rbd_client_id_show, NULL);
static DEVICE_ATTR(pool, S_IRUGO, rbd_pool_show, NULL);
static DEVICE_ATTR(pool_id, S_IRUGO, rbd_pool_id_show, NULL);
static DEVICE_ATTR(name, S_IRUGO, rbd_name_show, NULL);
static DEVICE_ATTR(refresh, S_IWUSR, NULL, rbd_image_refresh);	/* write-only trigger */
static DEVICE_ATTR(current_snap, S_IRUGO, rbd_snap_show, NULL);
static DEVICE_ATTR(create_snap, S_IWUSR, NULL, rbd_snap_add);	/* write-only trigger */

static struct attribute *rbd_attrs[] = {
	&dev_attr_size.attr,
	&dev_attr_major.attr,
	&dev_attr_client_id.attr,
	&dev_attr_pool.attr,
	&dev_attr_pool_id.attr,
	&dev_attr_name.attr,
	&dev_attr_current_snap.attr,
	&dev_attr_refresh.attr,
	&dev_attr_create_snap.attr,
	NULL
};

static struct attribute_group rbd_attr_group = {
	.attrs = rbd_attrs,
};

static const struct attribute_group *rbd_attr_groups[] = {
	&rbd_attr_group,
	NULL
};

/* Nothing device-type-specific to free; teardown is handled elsewhere */
static void rbd_sysfs_dev_release(struct device *dev)
{
}

static struct device_type rbd_device_type = {
	.name		= "rbd",
	.groups		= rbd_attr_groups,
	.release	= rbd_sysfs_dev_release,
};
2033
2034
2035/*
2036 sysfs - snapshots
2037*/
2038
2039static ssize_t rbd_snap_size_show(struct device *dev,
2040 struct device_attribute *attr,
2041 char *buf)
2042{
2043 struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
2044
Josh Durgin3591538f2011-12-05 18:25:13 -08002045 return sprintf(buf, "%llu\n", (unsigned long long)snap->size);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002046}
2047
2048static ssize_t rbd_snap_id_show(struct device *dev,
2049 struct device_attribute *attr,
2050 char *buf)
2051{
2052 struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
2053
Josh Durgin3591538f2011-12-05 18:25:13 -08002054 return sprintf(buf, "%llu\n", (unsigned long long)snap->id);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002055}
2056
/* Per-snapshot sysfs attributes (all read-only) */
static DEVICE_ATTR(snap_size, S_IRUGO, rbd_snap_size_show, NULL);
static DEVICE_ATTR(snap_id, S_IRUGO, rbd_snap_id_show, NULL);

static struct attribute *rbd_snap_attrs[] = {
	&dev_attr_snap_size.attr,
	&dev_attr_snap_id.attr,
	NULL,
};

static struct attribute_group rbd_snap_attr_group = {
	.attrs = rbd_snap_attrs,
};

/* Final reference drop on a snapshot device frees the rbd_snap itself */
static void rbd_snap_dev_release(struct device *dev)
{
	struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
	kfree(snap->name);
	kfree(snap);
}

static const struct attribute_group *rbd_snap_attr_groups[] = {
	&rbd_snap_attr_group,
	NULL
};

static struct device_type rbd_snap_device_type = {
	.groups		= rbd_snap_attr_groups,
	.release	= rbd_snap_dev_release,
};
2086
/*
 * Unlink a snapshot from the device's list and unregister its sysfs
 * device; the final put frees it via rbd_snap_dev_release().
 */
static void __rbd_remove_snap_dev(struct rbd_snap *snap)
{
	list_del(&snap->node);
	device_unregister(&snap->dev);
}
2092
Alex Elder14e70852012-07-19 09:09:27 -05002093static int rbd_register_snap_dev(struct rbd_snap *snap,
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002094 struct device *parent)
2095{
2096 struct device *dev = &snap->dev;
2097 int ret;
2098
2099 dev->type = &rbd_snap_device_type;
2100 dev->parent = parent;
2101 dev->release = rbd_snap_dev_release;
2102 dev_set_name(dev, "snap_%s", snap->name);
2103 ret = device_register(dev);
2104
2105 return ret;
2106}
2107
/*
 * Allocate and initialize an rbd_snap for snapshot slot i of the
 * device's header, registering its sysfs device if the parent rbd
 * device is itself already registered.
 *
 * Returns the new snapshot, or a pointer-coded errno on failure.
 * The caller is responsible for linking it into rbd_dev->snaps.
 */
static struct rbd_snap *__rbd_add_snap_dev(struct rbd_device *rbd_dev,
					   int i, const char *name)
{
	struct rbd_snap *snap;
	int ret;

	snap = kzalloc(sizeof (*snap), GFP_KERNEL);
	if (!snap)
		return ERR_PTR(-ENOMEM);

	ret = -ENOMEM;
	snap->name = kstrdup(name, GFP_KERNEL);
	if (!snap->name)
		goto err;

	/* Size and id come from the parallel arrays in the header */
	snap->size = rbd_dev->header.snap_sizes[i];
	snap->id = rbd_dev->header.snapc->snaps[i];
	if (device_is_registered(&rbd_dev->dev)) {
		ret = rbd_register_snap_dev(snap, &rbd_dev->dev);
		if (ret < 0)
			goto err;
	}

	return snap;

err:
	/* NOTE(review): after a failed device_register() the driver core
	 * normally requires put_device() rather than kfree() — confirm
	 * against the driver-model rules for this kernel version. */
	kfree(snap->name);
	kfree(snap);

	return ERR_PTR(ret);
}
2139
/*
 * Scan the rbd device's current snapshot list and compare it to the
 * newly-received snapshot context.  Remove any existing snapshots
 * not present in the new snapshot context.  Add a new snapshot for
 * any snapshots in the snapshot context not in the current list.
 * And verify there are no changes to snapshots we already know
 * about.
 *
 * Assumes the snapshots in the snapshot context are sorted by
 * snapshot id, highest id first.  (Snapshots in the rbd_dev's list
 * are also maintained in that order.)
 */
static int __rbd_init_snaps_header(struct rbd_device *rbd_dev)
{
	struct ceph_snap_context *snapc = rbd_dev->header.snapc;
	const u32 snap_count = snapc->num_snaps;
	char *snap_name = rbd_dev->header.snap_names;	/* packed NUL-terminated names */
	struct list_head *head = &rbd_dev->snaps;
	struct list_head *links = head->next;
	u32 index = 0;

	/* Merge-walk both sorted sequences in lock step */
	while (index < snap_count || links != head) {
		u64 snap_id;
		struct rbd_snap *snap;

		/* CEPH_NOSNAP marks that a sequence is exhausted */
		snap_id = index < snap_count ? snapc->snaps[index]
					     : CEPH_NOSNAP;
		snap = links != head ? list_entry(links, struct rbd_snap, node)
				     : NULL;
		BUG_ON(snap && snap->id == CEPH_NOSNAP);

		if (snap_id == CEPH_NOSNAP || (snap && snap->id > snap_id)) {
			struct list_head *next = links->next;

			/* Existing snapshot not in the new snap context */

			/* The mapped snapshot just disappeared out from
			 * under us */
			if (rbd_dev->snap_id == snap->id)
				rbd_dev->snap_exists = false;
			__rbd_remove_snap_dev(snap);

			/* Done with this list entry; advance */

			links = next;
			continue;
		}

		if (!snap || (snap_id != CEPH_NOSNAP && snap->id < snap_id)) {
			struct rbd_snap *new_snap;

			/* We haven't seen this snapshot before */

			new_snap = __rbd_add_snap_dev(rbd_dev, index,
							snap_name);
			if (IS_ERR(new_snap))
				return PTR_ERR(new_snap);

			/* New goes before existing, or at end of list */

			if (snap)
				list_add_tail(&new_snap->node, &snap->node);
			else
				list_add_tail(&new_snap->node, head);
		} else {
			/* Already have this one */

			BUG_ON(snap->size != rbd_dev->header.snap_sizes[index]);
			BUG_ON(strcmp(snap->name, snap_name));

			/* Done with this list entry; advance */

			links = links->next;
		}

		/* Advance to the next entry in the snapshot context */

		index++;
		snap_name += strlen(snap_name) + 1;
	}

	return 0;
}
2221
/*
 * Register the rbd device (and every snapshot it currently knows
 * about) on the rbd bus, making them visible under /sys/bus/rbd.
 * Serialized by ctl_mutex.
 */
static int rbd_bus_add_dev(struct rbd_device *rbd_dev)
{
	int ret;
	struct device *dev;
	struct rbd_snap *snap;

	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
	dev = &rbd_dev->dev;

	dev->bus = &rbd_bus_type;
	dev->type = &rbd_device_type;
	dev->parent = &rbd_root_dev;
	dev->release = rbd_dev_release;
	dev_set_name(dev, "%d", rbd_dev->dev_id);	/* /sys name is the dev id */
	ret = device_register(dev);
	if (ret < 0)
		goto out;

	/* Expose each known snapshot beneath the device */
	list_for_each_entry(snap, &rbd_dev->snaps, node) {
		ret = rbd_register_snap_dev(snap, &rbd_dev->dev);
		if (ret < 0)
			break;	/* NOTE(review): earlier snaps stay registered */
	}
out:
	mutex_unlock(&ctl_mutex);
	return ret;
}
2249
/* Remove the rbd device from sysfs; release happens on the final put */
static void rbd_bus_del_dev(struct rbd_device *rbd_dev)
{
	device_unregister(&rbd_dev->dev);
}
2254
/*
 * Establish a watch on the header object.  An -ERANGE result makes
 * us refresh the header and retry (presumably it indicates that our
 * cached header version is stale — confirm against the osd client).
 */
static int rbd_init_watch_dev(struct rbd_device *rbd_dev)
{
	int ret, rc;

	do {
		ret = rbd_req_sync_watch(rbd_dev);
		if (ret == -ERANGE) {
			rc = rbd_refresh_header(rbd_dev, NULL);
			if (rc < 0)
				return rc;	/* refresh itself failed */
		}
	} while (ret == -ERANGE);

	return ret;
}
2270
/* Highest rbd device id handed out so far (valid ids start at 1) */
static atomic64_t rbd_id_max = ATOMIC64_INIT(0);

/*
 * Get a unique rbd identifier for the given new rbd_dev, and add
 * the rbd_dev to the global list. The minimum rbd id is 1.
 */
static void rbd_id_get(struct rbd_device *rbd_dev)
{
	rbd_dev->dev_id = atomic64_inc_return(&rbd_id_max);

	spin_lock(&rbd_dev_list_lock);
	list_add_tail(&rbd_dev->node, &rbd_dev_list);
	spin_unlock(&rbd_dev_list_lock);
}
Alex Elderb7f23c32012-01-29 13:57:43 -06002285
Alex Elder1ddbe942012-01-29 13:57:44 -06002286/*
Alex Elder499afd52012-02-02 08:13:29 -06002287 * Remove an rbd_dev from the global list, and record that its
2288 * identifier is no longer in use.
Alex Elder1ddbe942012-01-29 13:57:44 -06002289 */
Alex Elder499afd52012-02-02 08:13:29 -06002290static void rbd_id_put(struct rbd_device *rbd_dev)
Alex Elder1ddbe942012-01-29 13:57:44 -06002291{
Alex Elderd184f6b2012-01-29 13:57:44 -06002292 struct list_head *tmp;
Alex Elderde71a292012-07-03 16:01:19 -05002293 int rbd_id = rbd_dev->dev_id;
Alex Elderd184f6b2012-01-29 13:57:44 -06002294 int max_id;
2295
2296 BUG_ON(rbd_id < 1);
Alex Elder499afd52012-02-02 08:13:29 -06002297
2298 spin_lock(&rbd_dev_list_lock);
2299 list_del_init(&rbd_dev->node);
Alex Elderd184f6b2012-01-29 13:57:44 -06002300
2301 /*
2302 * If the id being "put" is not the current maximum, there
2303 * is nothing special we need to do.
2304 */
2305 if (rbd_id != atomic64_read(&rbd_id_max)) {
2306 spin_unlock(&rbd_dev_list_lock);
2307 return;
2308 }
2309
2310 /*
2311 * We need to update the current maximum id. Search the
2312 * list to find out what it is. We're more likely to find
2313 * the maximum at the end, so search the list backward.
2314 */
2315 max_id = 0;
2316 list_for_each_prev(tmp, &rbd_dev_list) {
2317 struct rbd_device *rbd_dev;
2318
2319 rbd_dev = list_entry(tmp, struct rbd_device, node);
2320 if (rbd_id > max_id)
2321 max_id = rbd_id;
2322 }
Alex Elder499afd52012-02-02 08:13:29 -06002323 spin_unlock(&rbd_dev_list_lock);
Alex Elderb7f23c32012-01-29 13:57:43 -06002324
Alex Elder1ddbe942012-01-29 13:57:44 -06002325 /*
Alex Elderd184f6b2012-01-29 13:57:44 -06002326 * The max id could have been updated by rbd_id_get(), in
2327 * which case it now accurately reflects the new maximum.
2328 * Be careful not to overwrite the maximum value in that
2329 * case.
Alex Elder1ddbe942012-01-29 13:57:44 -06002330 */
Alex Elderd184f6b2012-01-29 13:57:44 -06002331 atomic64_cmpxchg(&rbd_id_max, rbd_id, max_id);
Alex Elderb7f23c32012-01-29 13:57:43 -06002332}
2333
/*
 * Advance *buf past any leading white space and return the length of
 * the token (maximal run of non-white-space characters) that follows.
 * *buf must be NUL-terminated; after the call it points at the first
 * non-space character found (or the trailing NUL).
 */
static inline size_t next_token(const char **buf)
{
	/* Characters for which isspace() is nonzero in the "C" and
	 * "POSIX" locales. */
	static const char spaces[] = " \f\n\r\t\v";
	size_t skipped = strspn(*buf, spaces);

	*buf += skipped;		/* Start of token (or end of string) */

	return strcspn(*buf, spaces);	/* Token length */
}
2352
/*
 * Find the next token in *buf and, if the supplied buffer is big
 * enough (token plus terminating '\0'), copy it there.  The copy,
 * when made, is always NUL-terminated.
 *
 * Returns the token length, not counting the '\0': 0 if no token
 * was found, >= token_size if it would not fit (in which case the
 * buffer is left untouched).  *buf is advanced past the token
 * either way.
 */
static inline size_t copy_token(const char **buf,
				char *token,
				size_t token_size)
{
	size_t len = next_token(buf);

	if (len < token_size) {
		memcpy(token, *buf, len);
		token[len] = '\0';
	}
	*buf += len;

	return len;
}
2382
2383/*
Alex Elderea3352f2012-07-09 21:04:23 -05002384 * Finds the next token in *buf, dynamically allocates a buffer big
2385 * enough to hold a copy of it, and copies the token into the new
2386 * buffer. The copy is guaranteed to be terminated with '\0'. Note
2387 * that a duplicate buffer is created even for a zero-length token.
2388 *
2389 * Returns a pointer to the newly-allocated duplicate, or a null
2390 * pointer if memory for the duplicate was not available. If
2391 * the lenp argument is a non-null pointer, the length of the token
2392 * (not including the '\0') is returned in *lenp.
2393 *
2394 * If successful, the *buf pointer will be updated to point beyond
2395 * the end of the found token.
2396 *
2397 * Note: uses GFP_KERNEL for allocation.
2398 */
2399static inline char *dup_token(const char **buf, size_t *lenp)
2400{
2401 char *dup;
2402 size_t len;
2403
2404 len = next_token(buf);
2405 dup = kmalloc(len + 1, GFP_KERNEL);
2406 if (!dup)
2407 return NULL;
2408
2409 memcpy(dup, *buf, len);
2410 *(dup + len) = '\0';
2411 *buf += len;
2412
2413 if (lenp)
2414 *lenp = len;
2415
2416 return dup;
2417}
2418
2419/*
Alex Elder0bed54d2012-07-03 16:01:18 -05002420 * This fills in the pool_name, image_name, image_name_len, snap_name,
Alex Eldera725f65e2012-02-02 08:13:30 -06002421 * rbd_dev, rbd_md_name, and name fields of the given rbd_dev, based
2422 * on the list of monitor addresses and other options provided via
2423 * /sys/bus/rbd/add.
Alex Elderd22f76e2012-07-12 10:46:35 -05002424 *
2425 * Note: rbd_dev is assumed to have been initially zero-filled.
Alex Eldera725f65e2012-02-02 08:13:30 -06002426 */
/*
 * Parse the argument string written to /sys/bus/rbd/add.
 *
 * Expected token order: monitor address list, options string,
 * pool name, image name, and an optional snapshot name.
 *
 * On success, *mon_addrs/*mon_addrs_size point into buf (not
 * copied), options is filled in place, and the pool_name,
 * image_name, image_name_len, header_name, and snap_name fields of
 * rbd_dev are populated with dynamically-allocated copies.
 *
 * Returns 0 on success; -EINVAL on a missing/oversized required
 * token; -ENOMEM on allocation failure.  On error, every field this
 * function allocated in rbd_dev is freed and reset, so the caller
 * need not clean them up.
 */
static int rbd_add_parse_args(struct rbd_device *rbd_dev,
			      const char *buf,
			      const char **mon_addrs,
			      size_t *mon_addrs_size,
			      char *options,
			      size_t options_size)
{
	size_t len;
	int ret;

	/* The first four tokens are required */

	/* Monitor addresses: returned by reference into buf, not copied. */
	len = next_token(&buf);
	if (!len)
		return -EINVAL;
	*mon_addrs_size = len + 1;	/* includes room for the '\0' */
	*mon_addrs = buf;

	buf += len;

	/* Options token must be non-empty and fit in the caller's buffer. */
	len = copy_token(&buf, options, options_size);
	if (!len || len >= options_size)
		return -EINVAL;

	/* dup_token() returns NULL on either empty token or OOM;
	 * an empty pool/image name is treated as -ENOMEM here. */
	ret = -ENOMEM;
	rbd_dev->pool_name = dup_token(&buf, NULL);
	if (!rbd_dev->pool_name)
		goto out_err;

	rbd_dev->image_name = dup_token(&buf, &rbd_dev->image_name_len);
	if (!rbd_dev->image_name)
		goto out_err;

	/* Create the name of the header object */

	rbd_dev->header_name = kmalloc(rbd_dev->image_name_len
						+ sizeof (RBD_SUFFIX),
					GFP_KERNEL);
	if (!rbd_dev->header_name)
		goto out_err;
	sprintf(rbd_dev->header_name, "%s%s", rbd_dev->image_name, RBD_SUFFIX);

	/*
	 * The snapshot name is optional.  If none is supplied,
	 * we use the default value.
	 */
	rbd_dev->snap_name = dup_token(&buf, &len);
	if (!rbd_dev->snap_name)
		goto out_err;
	if (!len) {
		/* Replace the empty name with the default */
		kfree(rbd_dev->snap_name);
		rbd_dev->snap_name
			= kmalloc(sizeof (RBD_SNAP_HEAD_NAME), GFP_KERNEL);
		if (!rbd_dev->snap_name)
			goto out_err;

		memcpy(rbd_dev->snap_name, RBD_SNAP_HEAD_NAME,
			sizeof (RBD_SNAP_HEAD_NAME));
	}

	return 0;

out_err:
	/* Undo all allocations made above; rbd_dev was zero-filled on
	 * entry, so unset fields are already NULL and kfree(NULL) is safe. */
	kfree(rbd_dev->header_name);
	rbd_dev->header_name = NULL;
	kfree(rbd_dev->image_name);
	rbd_dev->image_name = NULL;
	rbd_dev->image_name_len = 0;
	kfree(rbd_dev->pool_name);
	rbd_dev->pool_name = NULL;

	return ret;
}
2501
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07002502static ssize_t rbd_add(struct bus_type *bus,
2503 const char *buf,
2504 size_t count)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002505{
Alex Eldercb8627c2012-07-09 21:04:23 -05002506 char *options;
2507 struct rbd_device *rbd_dev = NULL;
Alex Elder7ef32142012-02-02 08:13:30 -06002508 const char *mon_addrs = NULL;
2509 size_t mon_addrs_size = 0;
Alex Elder27cc2592012-02-02 08:13:30 -06002510 struct ceph_osd_client *osdc;
2511 int rc = -ENOMEM;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002512
2513 if (!try_module_get(THIS_MODULE))
2514 return -ENODEV;
2515
Alex Elder27cc2592012-02-02 08:13:30 -06002516 options = kmalloc(count, GFP_KERNEL);
2517 if (!options)
2518 goto err_nomem;
Alex Eldercb8627c2012-07-09 21:04:23 -05002519 rbd_dev = kzalloc(sizeof(*rbd_dev), GFP_KERNEL);
2520 if (!rbd_dev)
2521 goto err_nomem;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002522
2523 /* static rbd_device initialization */
2524 spin_lock_init(&rbd_dev->lock);
2525 INIT_LIST_HEAD(&rbd_dev->node);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002526 INIT_LIST_HEAD(&rbd_dev->snaps);
Josh Durginc6666012011-11-21 17:11:12 -08002527 init_rwsem(&rbd_dev->header_rwsem);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002528
Alex Elderd184f6b2012-01-29 13:57:44 -06002529 /* generate unique id: find highest unique id, add one */
Alex Elder499afd52012-02-02 08:13:29 -06002530 rbd_id_get(rbd_dev);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002531
Alex Eldera725f65e2012-02-02 08:13:30 -06002532 /* Fill in the device name, now that we have its id. */
Alex Elder81a89792012-02-02 08:13:30 -06002533 BUILD_BUG_ON(DEV_NAME_LEN
2534 < sizeof (RBD_DRV_NAME) + MAX_INT_FORMAT_WIDTH);
Alex Elderde71a292012-07-03 16:01:19 -05002535 sprintf(rbd_dev->name, "%s%d", RBD_DRV_NAME, rbd_dev->dev_id);
Alex Eldere124a82f2012-01-29 13:57:44 -06002536
Alex Eldera725f65e2012-02-02 08:13:30 -06002537 /* parse add command */
Alex Elder7ef32142012-02-02 08:13:30 -06002538 rc = rbd_add_parse_args(rbd_dev, buf, &mon_addrs, &mon_addrs_size,
Alex Eldere28fff262012-02-02 08:13:30 -06002539 options, count);
Alex Eldera725f65e2012-02-02 08:13:30 -06002540 if (rc)
2541 goto err_put_id;
2542
Alex Elderf8c38922012-08-10 13:12:07 -07002543 rc = rbd_get_client(rbd_dev, mon_addrs, mon_addrs_size - 1, options);
2544 if (rc < 0)
Alex Elderf0f8cef2012-01-29 13:57:44 -06002545 goto err_put_id;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002546
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002547 /* pick the pool */
Alex Elder1dbb4392012-01-24 10:08:37 -06002548 osdc = &rbd_dev->rbd_client->client->osdc;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002549 rc = ceph_pg_poolid_by_name(osdc->osdmap, rbd_dev->pool_name);
2550 if (rc < 0)
2551 goto err_out_client;
Alex Elder9bb2f332012-07-12 10:46:35 -05002552 rbd_dev->pool_id = rc;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002553
2554 /* register our block device */
Alex Elder27cc2592012-02-02 08:13:30 -06002555 rc = register_blkdev(0, rbd_dev->name);
2556 if (rc < 0)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002557 goto err_out_client;
Alex Elder27cc2592012-02-02 08:13:30 -06002558 rbd_dev->major = rc;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002559
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002560 rc = rbd_bus_add_dev(rbd_dev);
2561 if (rc)
Yehuda Sadeh766fc432011-01-07 14:58:42 -08002562 goto err_out_blkdev;
2563
Alex Elder32eec682012-02-08 16:11:14 -06002564 /*
2565 * At this point cleanup in the event of an error is the job
2566 * of the sysfs code (initiated by rbd_bus_del_dev()).
2567 *
2568 * Set up and announce blkdev mapping.
2569 */
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002570 rc = rbd_init_disk(rbd_dev);
2571 if (rc)
Yehuda Sadeh766fc432011-01-07 14:58:42 -08002572 goto err_out_bus;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002573
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07002574 rc = rbd_init_watch_dev(rbd_dev);
2575 if (rc)
2576 goto err_out_bus;
2577
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002578 return count;
2579
Yehuda Sadeh766fc432011-01-07 14:58:42 -08002580err_out_bus:
Yehuda Sadeh766fc432011-01-07 14:58:42 -08002581 /* this will also clean up rest of rbd_dev stuff */
2582
2583 rbd_bus_del_dev(rbd_dev);
2584 kfree(options);
Yehuda Sadeh766fc432011-01-07 14:58:42 -08002585 return rc;
2586
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002587err_out_blkdev:
2588 unregister_blkdev(rbd_dev->major, rbd_dev->name);
2589err_out_client:
2590 rbd_put_client(rbd_dev);
Alex Elderf0f8cef2012-01-29 13:57:44 -06002591err_put_id:
Alex Eldercb8627c2012-07-09 21:04:23 -05002592 if (rbd_dev->pool_name) {
Alex Elder820a5f32012-07-09 21:04:24 -05002593 kfree(rbd_dev->snap_name);
Alex Elder0bed54d2012-07-03 16:01:18 -05002594 kfree(rbd_dev->header_name);
2595 kfree(rbd_dev->image_name);
Alex Eldercb8627c2012-07-09 21:04:23 -05002596 kfree(rbd_dev->pool_name);
2597 }
Alex Elder499afd52012-02-02 08:13:29 -06002598 rbd_id_put(rbd_dev);
Alex Elder27cc2592012-02-02 08:13:30 -06002599err_nomem:
Alex Elder27cc2592012-02-02 08:13:30 -06002600 kfree(rbd_dev);
Alex Eldercb8627c2012-07-09 21:04:23 -05002601 kfree(options);
Alex Elder27cc2592012-02-02 08:13:30 -06002602
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002603 dout("Error adding device %s\n", buf);
2604 module_put(THIS_MODULE);
Alex Elder27cc2592012-02-02 08:13:30 -06002605
2606 return (ssize_t) rc;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002607}
2608
Alex Elderde71a292012-07-03 16:01:19 -05002609static struct rbd_device *__rbd_get_dev(unsigned long dev_id)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002610{
2611 struct list_head *tmp;
2612 struct rbd_device *rbd_dev;
2613
Alex Eldere124a82f2012-01-29 13:57:44 -06002614 spin_lock(&rbd_dev_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002615 list_for_each(tmp, &rbd_dev_list) {
2616 rbd_dev = list_entry(tmp, struct rbd_device, node);
Alex Elderde71a292012-07-03 16:01:19 -05002617 if (rbd_dev->dev_id == dev_id) {
Alex Eldere124a82f2012-01-29 13:57:44 -06002618 spin_unlock(&rbd_dev_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002619 return rbd_dev;
Alex Eldere124a82f2012-01-29 13:57:44 -06002620 }
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002621 }
Alex Eldere124a82f2012-01-29 13:57:44 -06002622 spin_unlock(&rbd_dev_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002623 return NULL;
2624}
2625
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002626static void rbd_dev_release(struct device *dev)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002627{
Alex Elder593a9e72012-02-07 12:03:37 -06002628 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002629
Alex Elder1dbb4392012-01-24 10:08:37 -06002630 if (rbd_dev->watch_request) {
2631 struct ceph_client *client = rbd_dev->rbd_client->client;
2632
2633 ceph_osdc_unregister_linger_request(&client->osdc,
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07002634 rbd_dev->watch_request);
Alex Elder1dbb4392012-01-24 10:08:37 -06002635 }
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07002636 if (rbd_dev->watch_event)
Alex Elder070c6332012-07-25 09:32:41 -05002637 rbd_req_sync_unwatch(rbd_dev);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07002638
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002639 rbd_put_client(rbd_dev);
2640
2641 /* clean up and free blkdev */
2642 rbd_free_disk(rbd_dev);
2643 unregister_blkdev(rbd_dev->major, rbd_dev->name);
Alex Elder32eec682012-02-08 16:11:14 -06002644
2645 /* done with the id, and with the rbd_dev */
Alex Elder820a5f32012-07-09 21:04:24 -05002646 kfree(rbd_dev->snap_name);
Alex Elder0bed54d2012-07-03 16:01:18 -05002647 kfree(rbd_dev->header_name);
Alex Elderd22f76e2012-07-12 10:46:35 -05002648 kfree(rbd_dev->pool_name);
Alex Elder0bed54d2012-07-03 16:01:18 -05002649 kfree(rbd_dev->image_name);
Alex Elder32eec682012-02-08 16:11:14 -06002650 rbd_id_put(rbd_dev);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002651 kfree(rbd_dev);
2652
2653 /* release module ref */
2654 module_put(THIS_MODULE);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002655}
2656
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002657static ssize_t rbd_remove(struct bus_type *bus,
2658 const char *buf,
2659 size_t count)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002660{
2661 struct rbd_device *rbd_dev = NULL;
2662 int target_id, rc;
2663 unsigned long ul;
2664 int ret = count;
2665
2666 rc = strict_strtoul(buf, 10, &ul);
2667 if (rc)
2668 return rc;
2669
2670 /* convert to int; abort if we lost anything in the conversion */
2671 target_id = (int) ul;
2672 if (target_id != ul)
2673 return -EINVAL;
2674
2675 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
2676
2677 rbd_dev = __rbd_get_dev(target_id);
2678 if (!rbd_dev) {
2679 ret = -ENOENT;
2680 goto done;
2681 }
2682
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002683 __rbd_remove_all_snaps(rbd_dev);
2684 rbd_bus_del_dev(rbd_dev);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002685
2686done:
2687 mutex_unlock(&ctl_mutex);
2688 return ret;
2689}
2690
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002691static ssize_t rbd_snap_add(struct device *dev,
2692 struct device_attribute *attr,
2693 const char *buf,
2694 size_t count)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002695{
Alex Elder593a9e72012-02-07 12:03:37 -06002696 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002697 int ret;
2698 char *name = kmalloc(count + 1, GFP_KERNEL);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002699 if (!name)
2700 return -ENOMEM;
2701
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002702 snprintf(name, count, "%s", buf);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002703
2704 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
2705
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002706 ret = rbd_header_add_snap(rbd_dev,
2707 name, GFP_KERNEL);
2708 if (ret < 0)
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07002709 goto err_unlock;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002710
Alex Elderb8136232012-07-25 09:32:41 -05002711 ret = __rbd_refresh_header(rbd_dev, NULL);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002712 if (ret < 0)
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07002713 goto err_unlock;
2714
2715 /* shouldn't hold ctl_mutex when notifying.. notify might
2716 trigger a watch callback that would need to get that mutex */
2717 mutex_unlock(&ctl_mutex);
2718
2719 /* make a best effort, don't error if failed */
Alex Elder4cb16252012-07-25 09:32:40 -05002720 rbd_req_sync_notify(rbd_dev);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002721
2722 ret = count;
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07002723 kfree(name);
2724 return ret;
2725
2726err_unlock:
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002727 mutex_unlock(&ctl_mutex);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002728 kfree(name);
2729 return ret;
2730}
2731
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002732/*
2733 * create control files in sysfs
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002734 * /sys/bus/rbd/...
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002735 */
2736static int rbd_sysfs_init(void)
2737{
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002738 int ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002739
Alex Elderfed4c142012-02-07 12:03:36 -06002740 ret = device_register(&rbd_root_dev);
Alex Elder21079782012-01-24 10:08:36 -06002741 if (ret < 0)
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002742 return ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002743
Alex Elderfed4c142012-02-07 12:03:36 -06002744 ret = bus_register(&rbd_bus_type);
2745 if (ret < 0)
2746 device_unregister(&rbd_root_dev);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002747
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002748 return ret;
2749}
2750
/* Tear down the sysfs pieces in reverse order of rbd_sysfs_init(). */
static void rbd_sysfs_cleanup(void)
{
	bus_unregister(&rbd_bus_type);
	device_unregister(&rbd_root_dev);
}
2756
2757int __init rbd_init(void)
2758{
2759 int rc;
2760
2761 rc = rbd_sysfs_init();
2762 if (rc)
2763 return rc;
Alex Elderf0f8cef2012-01-29 13:57:44 -06002764 pr_info("loaded " RBD_DRV_NAME_LONG "\n");
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002765 return 0;
2766}
2767
/* Module exit: remove the sysfs interface. */
void __exit rbd_exit(void)
{
	rbd_sysfs_cleanup();
}
2772
/* Module entry/exit points and metadata. */
module_init(rbd_init);
module_exit(rbd_exit);

MODULE_AUTHOR("Sage Weil <sage@newdream.net>");
MODULE_AUTHOR("Yehuda Sadeh <yehuda@hq.newdream.net>");
MODULE_DESCRIPTION("rados block device");

/* following authorship retained from original osdblk.c */
MODULE_AUTHOR("Jeff Garzik <jeff@garzik.org>");

MODULE_LICENSE("GPL");