blob: 15bd3ecbcf34d6c3bc55758e580871b1080f4b9e [file] [log] [blame]
/*
   rbd.c -- Export ceph rados objects as a Linux block device


   based on drivers/block/osdblk.c:

   Copyright 2009 Red Hat, Inc.

   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; see the file COPYING.  If not, write to
   the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.



   For usage instructions, please refer to:

		 Documentation/ABI/testing/sysfs-bus-rbd

 */
29
30#include <linux/ceph/libceph.h>
31#include <linux/ceph/osd_client.h>
32#include <linux/ceph/mon_client.h>
33#include <linux/ceph/decode.h>
Yehuda Sadeh59c2be12011-03-21 15:10:11 -070034#include <linux/parser.h>
Yehuda Sadeh602adf42010-08-12 16:11:25 -070035
36#include <linux/kernel.h>
37#include <linux/device.h>
38#include <linux/module.h>
39#include <linux/fs.h>
40#include <linux/blkdev.h>
41
42#include "rbd_types.h"
43
/*
 * The basic unit of block I/O is a sector.  It is interpreted in a
 * number of contexts in Linux (blk, bio, genhd), but the default is
 * universally 512 bytes.  These symbols are just slightly more
 * meaningful than the bare numbers they represent.
 */
#define	SECTOR_SHIFT	9
#define	SECTOR_SIZE	(1ULL << SECTOR_SHIFT)

/* Short and long driver names, used for logging and device naming */
#define RBD_DRV_NAME "rbd"
#define RBD_DRV_NAME_LONG "rbd (rados block device)"

#define RBD_MINORS_PER_MAJOR	256		/* max minors per blkdev */

#define RBD_MAX_SNAP_NAME_LEN	32
#define RBD_MAX_OPT_LEN		1024

/* Reserved snapshot name meaning "no snapshot: map the live image head" */
#define RBD_SNAP_HEAD_NAME	"-"

/*
 * An RBD device name will be "rbd#", where the "rbd" comes from
 * RBD_DRV_NAME above, and # is a unique integer identifier.
 * MAX_INT_FORMAT_WIDTH is used in ensuring DEV_NAME_LEN is big
 * enough to hold all possible device names.
 */
#define DEV_NAME_LEN		32
#define MAX_INT_FORMAT_WIDTH	((5 * sizeof (int)) / 2 + 1)

/* Default for the "notify_timeout=%d" mount option (see rbd_get_client) */
#define RBD_NOTIFY_TIMEOUT_DEFAULT 10
73
/*
 * block device image metadata (in-memory version)
 */
struct rbd_image_header {
	u64 image_size;		/* image size in bytes (head, not a snap) */
	char *object_prefix;	/* prefix for data object names (kmalloc'd) */
	__u8 obj_order;		/* log2 of the object (segment) size */
	__u8 crypt_type;
	__u8 comp_type;
	struct ceph_snap_context *snapc;	/* refcounted snap context */
	u64 snap_names_len;	/* total bytes in the snap_names buffer */
	u32 total_snaps;	/* number of snapshots in the arrays below */

	char *snap_names;	/* consecutive NUL-terminated names */
	u64 *snap_sizes;	/* image size at the time of each snapshot */

	u64 obj_version;	/* header object version (watch/refresh) */
};
92
/* rbd-specific mount options (parsed by parse_rbd_opts_token()) */
struct rbd_options {
	int notify_timeout;	/* "notify_timeout=%d"; defaults to
				 * RBD_NOTIFY_TIMEOUT_DEFAULT */
};
96
/*
 * an instance of the client.  multiple devices may share an rbd client.
 */
struct rbd_client {
	struct ceph_client *client;
	struct rbd_options *rbd_opts;	/* owned; freed in rbd_client_release() */
	struct kref kref;		/* shared via rbd_client_find() */
	struct list_head node;		/* entry in rbd_client_list */
};
106
/*
 * a request completion status
 */
struct rbd_req_status {
	int done;	/* nonzero once this sub-request has completed */
	int rc;		/* completion result for the sub-request */
	u64 bytes;	/* bytes completed by the sub-request */
};
115
/*
 * a collection of requests
 */
struct rbd_req_coll {
	int total;			/* number of sub-request slots */
	int num_done;			/* slots completed so far, in order */
	struct kref kref;		/* released via rbd_coll_release() */
	struct rbd_req_status status[0];	/* one slot per sub-request */
};
125
/*
 * a single io request
 */
struct rbd_request {
	struct request *rq;		/* blk layer request */
	struct bio *bio;		/* cloned bio */
	struct page **pages;		/* list of used pages */
	u64 len;
	int coll_index;			/* index into coll->status[] */
	struct rbd_req_coll *coll;	/* collection this request belongs to */
};
137
/* in-core record of a single snapshot of an image */
struct rbd_snap {
	struct device dev;	/* sysfs device representing the snapshot */
	const char *name;
	u64 size;		/* image size at the time of the snapshot */
	struct list_head node;	/* entry in rbd_device->snaps */
	u64 id;			/* snapshot id */
};
145
/*
 * a single device
 */
struct rbd_device {
	int dev_id;		/* blkdev unique id */

	int major;		/* blkdev assigned major */
	struct gendisk *disk;		/* blkdev's gendisk and rq */
	struct request_queue *q;

	struct rbd_client *rbd_client;	/* possibly shared ceph client */

	char name[DEV_NAME_LEN]; /* blkdev name, e.g. rbd3 */

	spinlock_t lock;	/* queue lock */

	struct rbd_image_header header;
	char *image_name;	/* rbd image this device is mapped to */
	size_t image_name_len;
	char *header_name;	/* name of the image's header object */
	char *pool_name;
	int pool_id;

	struct ceph_osd_event *watch_event;	/* header watch registration */
	struct ceph_osd_request *watch_request;

	/* protects updating the header */
	struct rw_semaphore header_rwsem;
	/* name of the snapshot this device reads from */
	char *snap_name;
	/* id of the snapshot this device reads from */
	u64 snap_id;		/* current snapshot id */
	/* whether the snap_id this device reads from still exists */
	bool snap_exists;
	int read_only;

	struct list_head node;	/* entry in rbd_dev_list */

	/* list of snapshots */
	struct list_head snaps;

	/* sysfs related */
	struct device dev;
};
190
static DEFINE_MUTEX(ctl_mutex);	  /* Serialize open/close/setup/teardown */

static LIST_HEAD(rbd_dev_list);    /* devices */
static DEFINE_SPINLOCK(rbd_dev_list_lock);	/* protects rbd_dev_list */

static LIST_HEAD(rbd_client_list);		/* clients */
static DEFINE_SPINLOCK(rbd_client_list_lock);	/* protects rbd_client_list */
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700198
/* Forward declarations for routines defined later in this file */
static int __rbd_init_snaps_header(struct rbd_device *rbd_dev);
static void rbd_dev_release(struct device *dev);
static ssize_t rbd_snap_add(struct device *dev,
			    struct device_attribute *attr,
			    const char *buf,
			    size_t count);
static void __rbd_remove_snap_dev(struct rbd_snap *snap);

static ssize_t rbd_add(struct bus_type *bus, const char *buf,
		       size_t count);
static ssize_t rbd_remove(struct bus_type *bus, const char *buf,
			  size_t count);
211
/* sysfs bus attributes: /sys/bus/rbd/{add,remove}, write-only for root */
static struct bus_attribute rbd_bus_attrs[] = {
	__ATTR(add, S_IWUSR, NULL, rbd_add),
	__ATTR(remove, S_IWUSR, NULL, rbd_remove),
	__ATTR_NULL
};

static struct bus_type rbd_bus_type = {
	.name		= "rbd",
	.bus_attrs	= rbd_bus_attrs,
};

/* Nothing to free: rbd_root_dev is statically allocated below. */
static void rbd_root_dev_release(struct device *dev)
{
}

/* parent device of all rbd devices in sysfs */
static struct device rbd_root_dev = {
	.init_name =	"rbd",
	.release =	rbd_root_dev_release,
};
231
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800232
/* Take a reference on the rbd device's embedded sysfs device. */
static struct device *rbd_get_dev(struct rbd_device *rbd_dev)
{
	return get_device(&rbd_dev->dev);
}
237
/* Drop a reference taken by rbd_get_dev(). */
static void rbd_put_dev(struct rbd_device *rbd_dev)
{
	put_device(&rbd_dev->dev);
}

static int rbd_refresh_header(struct rbd_device *rbd_dev, u64 *hver);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700244
/*
 * Block device open: refuse writable opens of a read-only mapping,
 * then pin the device and propagate its read-only flag to the bdev.
 */
static int rbd_open(struct block_device *bdev, fmode_t mode)
{
	struct rbd_device *rbd_dev = bdev->bd_disk->private_data;

	if ((mode & FMODE_WRITE) && rbd_dev->read_only)
		return -EROFS;

	rbd_get_dev(rbd_dev);
	set_device_ro(bdev, rbd_dev->read_only);

	return 0;
}
257
/* Block device release: drop the reference taken in rbd_open(). */
static int rbd_release(struct gendisk *disk, fmode_t mode)
{
	struct rbd_device *rbd_dev = disk->private_data;

	rbd_put_dev(rbd_dev);

	return 0;
}
266
/* block device operations for /dev/rbd<N> */
static const struct block_device_operations rbd_bd_ops = {
	.owner			= THIS_MODULE,
	.open			= rbd_open,
	.release		= rbd_release,
};
272
273/*
274 * Initialize an rbd client instance.
Alex Elder43ae4702012-07-03 16:01:18 -0500275 * We own *ceph_opts.
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700276 */
Alex Elder43ae4702012-07-03 16:01:18 -0500277static struct rbd_client *rbd_client_create(struct ceph_options *ceph_opts,
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700278 struct rbd_options *rbd_opts)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700279{
280 struct rbd_client *rbdc;
281 int ret = -ENOMEM;
282
283 dout("rbd_client_create\n");
284 rbdc = kmalloc(sizeof(struct rbd_client), GFP_KERNEL);
285 if (!rbdc)
286 goto out_opt;
287
288 kref_init(&rbdc->kref);
289 INIT_LIST_HEAD(&rbdc->node);
290
Alex Elderbc534d82012-01-29 13:57:44 -0600291 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
292
Alex Elder43ae4702012-07-03 16:01:18 -0500293 rbdc->client = ceph_create_client(ceph_opts, rbdc, 0, 0);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700294 if (IS_ERR(rbdc->client))
Alex Elderbc534d82012-01-29 13:57:44 -0600295 goto out_mutex;
Alex Elder43ae4702012-07-03 16:01:18 -0500296 ceph_opts = NULL; /* Now rbdc->client is responsible for ceph_opts */
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700297
298 ret = ceph_open_session(rbdc->client);
299 if (ret < 0)
300 goto out_err;
301
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700302 rbdc->rbd_opts = rbd_opts;
303
Alex Elder432b8582012-01-29 13:57:44 -0600304 spin_lock(&rbd_client_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700305 list_add_tail(&rbdc->node, &rbd_client_list);
Alex Elder432b8582012-01-29 13:57:44 -0600306 spin_unlock(&rbd_client_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700307
Alex Elderbc534d82012-01-29 13:57:44 -0600308 mutex_unlock(&ctl_mutex);
309
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700310 dout("rbd_client_create created %p\n", rbdc);
311 return rbdc;
312
313out_err:
314 ceph_destroy_client(rbdc->client);
Alex Elderbc534d82012-01-29 13:57:44 -0600315out_mutex:
316 mutex_unlock(&ctl_mutex);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700317 kfree(rbdc);
318out_opt:
Alex Elder43ae4702012-07-03 16:01:18 -0500319 if (ceph_opts)
320 ceph_destroy_options(ceph_opts);
Vasiliy Kulikov28f259b2010-09-26 12:59:37 +0400321 return ERR_PTR(ret);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700322}
323
324/*
Alex Elder1f7ba332012-08-10 13:12:07 -0700325 * Find a ceph client with specific addr and configuration. If
326 * found, bump its reference count.
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700327 */
Alex Elder1f7ba332012-08-10 13:12:07 -0700328static struct rbd_client *rbd_client_find(struct ceph_options *ceph_opts)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700329{
330 struct rbd_client *client_node;
Alex Elder1f7ba332012-08-10 13:12:07 -0700331 bool found = false;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700332
Alex Elder43ae4702012-07-03 16:01:18 -0500333 if (ceph_opts->flags & CEPH_OPT_NOSHARE)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700334 return NULL;
335
Alex Elder1f7ba332012-08-10 13:12:07 -0700336 spin_lock(&rbd_client_list_lock);
337 list_for_each_entry(client_node, &rbd_client_list, node) {
338 if (!ceph_compare_options(ceph_opts, client_node->client)) {
339 kref_get(&client_node->kref);
340 found = true;
341 break;
342 }
343 }
344 spin_unlock(&rbd_client_list_lock);
345
346 return found ? client_node : NULL;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700347}
348
/*
 * mount options
 */
enum {
	Opt_notify_timeout,
	Opt_last_int,
	/* int args above */
	Opt_last_string,
	/* string args above */
};

/* token table for match_token(); order must mirror the enum above */
static match_table_t rbd_opts_tokens = {
	{Opt_notify_timeout, "notify_timeout=%d"},
	/* int args above */
	/* string args above */
	{-1, NULL}
};
366
367static int parse_rbd_opts_token(char *c, void *private)
368{
Alex Elder43ae4702012-07-03 16:01:18 -0500369 struct rbd_options *rbd_opts = private;
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700370 substring_t argstr[MAX_OPT_ARGS];
371 int token, intval, ret;
372
Alex Elder43ae4702012-07-03 16:01:18 -0500373 token = match_token(c, rbd_opts_tokens, argstr);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700374 if (token < 0)
375 return -EINVAL;
376
377 if (token < Opt_last_int) {
378 ret = match_int(&argstr[0], &intval);
379 if (ret < 0) {
380 pr_err("bad mount option arg (not int) "
381 "at '%s'\n", c);
382 return ret;
383 }
384 dout("got int token %d val %d\n", token, intval);
385 } else if (token > Opt_last_int && token < Opt_last_string) {
386 dout("got string token %d val %s\n", token,
387 argstr[0].from);
388 } else {
389 dout("got token %d\n", token);
390 }
391
392 switch (token) {
393 case Opt_notify_timeout:
Alex Elder43ae4702012-07-03 16:01:18 -0500394 rbd_opts->notify_timeout = intval;
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700395 break;
396 default:
397 BUG_ON(token);
398 }
399 return 0;
400}
401
/*
 * Get a ceph client with specific addr and configuration, if one does
 * not exist create it.
 *
 * Returns a referenced rbd_client (possibly shared with other mapped
 * devices) or an ERR_PTR.  The options string is parsed twice: ceph
 * options by ceph_parse_options(), rbd-specific ones via the
 * parse_rbd_opts_token() callback into rbd_opts.
 */
static struct rbd_client *rbd_get_client(const char *mon_addr,
					 size_t mon_addr_len,
					 char *options)
{
	struct rbd_client *rbdc;
	struct ceph_options *ceph_opts;
	struct rbd_options *rbd_opts;

	rbd_opts = kzalloc(sizeof(*rbd_opts), GFP_KERNEL);
	if (!rbd_opts)
		return ERR_PTR(-ENOMEM);

	rbd_opts->notify_timeout = RBD_NOTIFY_TIMEOUT_DEFAULT;

	ceph_opts = ceph_parse_options(options, mon_addr,
					mon_addr + mon_addr_len,
					parse_rbd_opts_token, rbd_opts);
	if (IS_ERR(ceph_opts)) {
		kfree(rbd_opts);
		return ERR_CAST(ceph_opts);
	}

	rbdc = rbd_client_find(ceph_opts);
	if (rbdc) {
		/* using an existing client */
		ceph_opts = NULL;	/* not consumed; free our copies */
		ceph_destroy_options(ceph_opts);
		kfree(rbd_opts);

		return rbdc;
	}

	/* rbd_client_create() takes ownership of ceph_opts either way,
	 * and of rbd_opts only on success */
	rbdc = rbd_client_create(ceph_opts, rbd_opts);
	if (IS_ERR(rbdc))
		kfree(rbd_opts);

	return rbdc;
}
443
/*
 * Destroy ceph client.  Runs when the last kref is dropped; unlinks
 * the client from rbd_client_list and frees everything it owns.
 *
 * NOTE: this takes rbd_client_list_lock itself, so the caller must
 * NOT hold it when dropping the final reference.
 */
static void rbd_client_release(struct kref *kref)
{
	struct rbd_client *rbdc = container_of(kref, struct rbd_client, kref);

	dout("rbd_release_client %p\n", rbdc);
	spin_lock(&rbd_client_list_lock);
	list_del(&rbdc->node);
	spin_unlock(&rbd_client_list_lock);

	ceph_destroy_client(rbdc->client);
	kfree(rbdc->rbd_opts);
	kfree(rbdc);
}
462
/*
 * Drop reference to ceph client node. If it's not referenced anymore, release
 * it.
 */
static void rbd_put_client(struct rbd_device *rbd_dev)
{
	kref_put(&rbd_dev->rbd_client->kref, rbd_client_release);
	rbd_dev->rbd_client = NULL;	/* guard against use after the put */
}
472
/*
 * Destroy requests collection
 */
static void rbd_coll_release(struct kref *kref)
{
	struct rbd_req_coll *coll =
		container_of(kref, struct rbd_req_coll, kref);

	dout("rbd_coll_release %p\n", coll);
	kfree(coll);
}
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700484
Alex Elder8e94af82012-07-25 09:32:40 -0500485static bool rbd_dev_ondisk_valid(struct rbd_image_header_ondisk *ondisk)
486{
Alex Elder103a1502012-08-02 11:29:45 -0500487 size_t size;
488 u32 snap_count;
489
490 /* The header has to start with the magic rbd header text */
491 if (memcmp(&ondisk->text, RBD_HEADER_TEXT, sizeof (RBD_HEADER_TEXT)))
492 return false;
493
494 /*
495 * The size of a snapshot header has to fit in a size_t, and
496 * that limits the number of snapshots.
497 */
498 snap_count = le32_to_cpu(ondisk->snap_count);
499 size = SIZE_MAX - sizeof (struct ceph_snap_context);
500 if (snap_count > size / sizeof (__le64))
501 return false;
502
503 /*
504 * Not only that, but the size of the entire the snapshot
505 * header must also be representable in a size_t.
506 */
507 size -= snap_count * sizeof (__le64);
508 if ((u64) size < le64_to_cpu(ondisk->snap_names_len))
509 return false;
510
511 return true;
Alex Elder8e94af82012-07-25 09:32:40 -0500512}
513
/*
 * Create a new header structure, translate header format from the on-disk
 * header.
 *
 * On success the header owns four allocations (object_prefix,
 * snap_names, snap_sizes, snapc), all released by rbd_header_free().
 * On failure everything allocated so far is freed and the pointers
 * are reset, so the header is safe to free again.  The caller is
 * expected to have validated ondisk with rbd_dev_ondisk_valid().
 */
static int rbd_header_from_disk(struct rbd_image_header *header,
				 struct rbd_image_header_ondisk *ondisk)
{
	u32 snap_count;
	size_t size;

	memset(header, 0, sizeof (*header));

	snap_count = le32_to_cpu(ondisk->snap_count);

	/* Copy the object prefix, adding a terminating NUL */
	size = sizeof (ondisk->object_prefix) + 1;
	header->object_prefix = kmalloc(size, GFP_KERNEL);
	if (!header->object_prefix)
		return -ENOMEM;
	memcpy(header->object_prefix, ondisk->object_prefix, size - 1);
	header->object_prefix[size - 1] = '\0';

	if (snap_count) {
		header->snap_names_len = le64_to_cpu(ondisk->snap_names_len);
		BUG_ON(header->snap_names_len > (u64) SIZE_MAX);
		header->snap_names = kmalloc(header->snap_names_len,
					     GFP_KERNEL);
		if (!header->snap_names)
			goto out_err;

		size = snap_count * sizeof (*header->snap_sizes);
		header->snap_sizes = kmalloc(size, GFP_KERNEL);
		if (!header->snap_sizes)
			goto out_err;
	} else {
		/* A snapshot-free header should carry no names either */
		WARN_ON(ondisk->snap_names_len);
		header->snap_names_len = 0;
		header->snap_names = NULL;
		header->snap_sizes = NULL;
	}

	header->image_size = le64_to_cpu(ondisk->image_size);
	header->obj_order = ondisk->options.order;
	header->crypt_type = ondisk->options.crypt_type;
	header->comp_type = ondisk->options.comp_type;
	header->total_snaps = snap_count;

	/* Snap context is allocated with its variable-length snaps[] */
	size = sizeof (struct ceph_snap_context);
	size += snap_count * sizeof (header->snapc->snaps[0]);
	header->snapc = kzalloc(size, GFP_KERNEL);
	if (!header->snapc)
		goto out_err;

	atomic_set(&header->snapc->nref, 1);
	header->snapc->seq = le64_to_cpu(ondisk->snap_seq);
	header->snapc->num_snaps = snap_count;

	/* Fill in the snapshot information */

	if (snap_count) {
		u32 i;

		for (i = 0; i < snap_count; i++) {
			header->snapc->snaps[i] =
				le64_to_cpu(ondisk->snaps[i].id);
			header->snap_sizes[i] =
				le64_to_cpu(ondisk->snaps[i].image_size);
		}

		/* copy snapshot names */
		memcpy(header->snap_names, &ondisk->snaps[snap_count],
			header->snap_names_len);
	}

	return 0;

out_err:
	/* Free in reverse and NULL everything so a retry can't double-free */
	kfree(header->snap_sizes);
	header->snap_sizes = NULL;
	kfree(header->snap_names);
	header->snap_names = NULL;
	header->snap_names_len = 0;
	kfree(header->object_prefix);
	header->object_prefix = NULL;

	return -ENOMEM;
}
600
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700601static int snap_by_name(struct rbd_image_header *header, const char *snap_name,
602 u64 *seq, u64 *size)
603{
604 int i;
605 char *p = header->snap_names;
606
Alex Elder00f1f362012-02-07 12:03:36 -0600607 for (i = 0; i < header->total_snaps; i++) {
608 if (!strcmp(snap_name, p)) {
609
610 /* Found it. Pass back its id and/or size */
611
612 if (seq)
613 *seq = header->snapc->snaps[i];
614 if (size)
615 *size = header->snap_sizes[i];
616 return i;
617 }
618 p += strlen(p) + 1; /* Skip ahead to the next name */
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700619 }
Alex Elder00f1f362012-02-07 12:03:36 -0600620 return -ENOENT;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700621}
622
/*
 * Point the device at the snapshot named in rbd_dev->snap_name.
 * The head (RBD_SNAP_HEAD_NAME) maps read-write with CEPH_NOSNAP;
 * a real snapshot maps read-only.  Optionally returns the mapped
 * size via @size.  Returns 0 or the error from snap_by_name().
 */
static int rbd_header_set_snap(struct rbd_device *rbd_dev, u64 *size)
{
	int ret;

	down_write(&rbd_dev->header_rwsem);

	if (!memcmp(rbd_dev->snap_name, RBD_SNAP_HEAD_NAME,
		    sizeof (RBD_SNAP_HEAD_NAME))) {
		/* Mapping the live image head: writable, no snap id */
		rbd_dev->snap_id = CEPH_NOSNAP;
		rbd_dev->snap_exists = false;
		rbd_dev->read_only = 0;
		if (size)
			*size = rbd_dev->header.image_size;
	} else {
		u64 snap_id = 0;

		ret = snap_by_name(&rbd_dev->header, rbd_dev->snap_name,
					&snap_id, size);
		if (ret < 0)
			goto done;
		rbd_dev->snap_id = snap_id;
		rbd_dev->snap_exists = true;
		rbd_dev->read_only = 1;	/* snapshots are immutable */
	}

	ret = 0;
done:
	up_write(&rbd_dev->header_rwsem);
	return ret;
}
653
654static void rbd_header_free(struct rbd_image_header *header)
655{
Alex Elder849b4262012-07-09 21:04:24 -0500656 kfree(header->object_prefix);
Alex Elderd78fd7a2012-07-26 23:37:14 -0500657 header->object_prefix = NULL;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700658 kfree(header->snap_sizes);
Alex Elderd78fd7a2012-07-26 23:37:14 -0500659 header->snap_sizes = NULL;
Alex Elder849b4262012-07-09 21:04:24 -0500660 kfree(header->snap_names);
Alex Elderd78fd7a2012-07-26 23:37:14 -0500661 header->snap_names = NULL;
662 header->snap_names_len = 0;
Josh Durgind1d25642011-12-05 14:03:05 -0800663 ceph_put_snap_context(header->snapc);
Alex Elderd78fd7a2012-07-26 23:37:14 -0500664 header->snapc = NULL;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700665}
666
667/*
668 * get the actual striped segment name, offset and length
669 */
670static u64 rbd_get_segment(struct rbd_image_header *header,
Alex Elderca1e49a2012-07-10 20:30:09 -0500671 const char *object_prefix,
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700672 u64 ofs, u64 len,
673 char *seg_name, u64 *segofs)
674{
675 u64 seg = ofs >> header->obj_order;
676
677 if (seg_name)
678 snprintf(seg_name, RBD_MAX_SEG_NAME_LEN,
Alex Elderca1e49a2012-07-10 20:30:09 -0500679 "%s.%012llx", object_prefix, seg);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700680
681 ofs = ofs & ((1 << header->obj_order) - 1);
682 len = min_t(u64, len, (1 << header->obj_order) - ofs);
683
684 if (segofs)
685 *segofs = ofs;
686
687 return len;
688}
689
Yehuda Sadeh1fec7092011-05-13 13:52:56 -0700690static int rbd_get_num_segments(struct rbd_image_header *header,
691 u64 ofs, u64 len)
692{
693 u64 start_seg = ofs >> header->obj_order;
694 u64 end_seg = (ofs + len - 1) >> header->obj_order;
695 return end_seg - start_seg + 1;
696}
697
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700698/*
Josh Durgin029bcbd2011-07-22 11:35:23 -0700699 * returns the size of an object in the image
700 */
701static u64 rbd_obj_bytes(struct rbd_image_header *header)
702{
703 return 1 << header->obj_order;
704}
705
706/*
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700707 * bio helpers
708 */
709
710static void bio_chain_put(struct bio *chain)
711{
712 struct bio *tmp;
713
714 while (chain) {
715 tmp = chain;
716 chain = chain->bi_next;
717 bio_put(tmp);
718 }
719}
720
/*
 * zeros a bio chain, starting at specific offset
 */
static void zero_bio_chain(struct bio *chain, int start_ofs)
{
	struct bio_vec *bv;
	unsigned long flags;
	void *buf;
	int i;
	int pos = 0;	/* byte position within the whole chain */

	while (chain) {
		bio_for_each_segment(bv, chain, i) {
			if (pos + bv->bv_len > start_ofs) {
				/*
				 * Zero from start_ofs (or this vector's
				 * start, whichever is later) to the end
				 * of the vector.
				 */
				int remainder = max(start_ofs - pos, 0);
				buf = bvec_kmap_irq(bv, &flags);
				memset(buf + remainder, 0,
				       bv->bv_len - remainder);
				bvec_kunmap_irq(buf, &flags);
			}
			pos += bv->bv_len;
		}

		chain = chain->bi_next;
	}
}
747
/*
 * bio_chain_clone - clone a chain of bios up to a certain length.
 * might return a bio_pair that will need to be released.
 *
 * Clones bios from *@old until @len bytes are covered.  On return,
 * *@old points at the first unconsumed bio of the original chain and
 * *@next at where the remainder begins (either the next original bio,
 * or the second half of a split).  If a bio straddles the boundary it
 * is split and the resulting bio_pair is stored in *@bp; any pair left
 * over from a previous call is released first.  Returns the new chain,
 * or NULL on allocation/split failure (partial clones are freed).
 */
static struct bio *bio_chain_clone(struct bio **old, struct bio **next,
				   struct bio_pair **bp,
				   int len, gfp_t gfpmask)
{
	struct bio *tmp, *old_chain = *old, *new_chain = NULL, *tail = NULL;
	int total = 0;

	if (*bp) {
		bio_pair_release(*bp);
		*bp = NULL;
	}

	while (old_chain && (total < len)) {
		tmp = bio_kmalloc(gfpmask, old_chain->bi_max_vecs);
		if (!tmp)
			goto err_out;

		if (total + old_chain->bi_size > len) {
			struct bio_pair *bp;

			/*
			 * this split can only happen with a single paged bio,
			 * split_bio will BUG_ON if this is not the case
			 */
			dout("bio_chain_clone split! total=%d remaining=%d"
			     "bi_size=%u\n",
			     total, len - total, old_chain->bi_size);

			/* split the bio. We'll release it either in the next
			   call, or it will have to be released outside */
			bp = bio_split(old_chain, (len - total) / SECTOR_SIZE);
			if (!bp)
				goto err_out;

			__bio_clone(tmp, &bp->bio1);

			*next = &bp->bio2;
		} else {
			__bio_clone(tmp, old_chain);
			*next = old_chain->bi_next;
		}

		tmp->bi_bdev = NULL;
		/* only the first allocation may block */
		gfpmask &= ~__GFP_WAIT;
		tmp->bi_next = NULL;

		/* append the clone to the new chain */
		if (!new_chain) {
			new_chain = tail = tmp;
		} else {
			tail->bi_next = tmp;
			tail = tmp;
		}
		old_chain = old_chain->bi_next;

		total += tmp->bi_size;
	}

	BUG_ON(total < len);

	if (tail)
		tail->bi_next = NULL;

	*old = old_chain;

	return new_chain;

err_out:
	dout("bio_chain_clone with err\n");
	bio_chain_put(new_chain);
	return NULL;
}
823
824/*
825 * helpers for osd request op vectors.
826 */
Alex Elder57cfc102012-06-26 12:57:03 -0700827static struct ceph_osd_req_op *rbd_create_rw_ops(int num_ops,
828 int opcode, u32 payload_len)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700829{
Alex Elder57cfc102012-06-26 12:57:03 -0700830 struct ceph_osd_req_op *ops;
831
832 ops = kzalloc(sizeof (*ops) * (num_ops + 1), GFP_NOIO);
833 if (!ops)
834 return NULL;
835
836 ops[0].op = opcode;
837
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700838 /*
839 * op extent offset and length will be set later on
840 * in calc_raw_layout()
841 */
Alex Elder57cfc102012-06-26 12:57:03 -0700842 ops[0].payload_len = payload_len;
843
844 return ops;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700845}
846
/* Free an op vector allocated by rbd_create_rw_ops(). */
static void rbd_destroy_ops(struct ceph_osd_req_op *ops)
{
	kfree(ops);
}
851
Yehuda Sadeh1fec7092011-05-13 13:52:56 -0700852static void rbd_coll_end_req_index(struct request *rq,
853 struct rbd_req_coll *coll,
854 int index,
855 int ret, u64 len)
856{
857 struct request_queue *q;
858 int min, max, i;
859
Alex Elderbd919d42012-07-13 20:35:11 -0500860 dout("rbd_coll_end_req_index %p index %d ret %d len %llu\n",
861 coll, index, ret, (unsigned long long) len);
Yehuda Sadeh1fec7092011-05-13 13:52:56 -0700862
863 if (!rq)
864 return;
865
866 if (!coll) {
867 blk_end_request(rq, ret, len);
868 return;
869 }
870
871 q = rq->q;
872
873 spin_lock_irq(q->queue_lock);
874 coll->status[index].done = 1;
875 coll->status[index].rc = ret;
876 coll->status[index].bytes = len;
877 max = min = coll->num_done;
878 while (max < coll->total && coll->status[max].done)
879 max++;
880
881 for (i = min; i<max; i++) {
882 __blk_end_request(rq, coll->status[i].rc,
883 coll->status[i].bytes);
884 coll->num_done++;
885 kref_put(&coll->kref, rbd_coll_release);
886 }
887 spin_unlock_irq(q->queue_lock);
888}
889
/*
 * Complete the collection slot recorded in an rbd_request, using the
 * request, collection and index stored when it was submitted.
 */
static void rbd_coll_end_req(struct rbd_request *req,
			     int ret, u64 len)
{
	rbd_coll_end_req_index(req->rq, req->coll, req->coll_index, ret, len);
}
895
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700896/*
897 * Send ceph osd request
898 */
899static int rbd_do_request(struct request *rq,
Alex Elder0ce1a792012-07-03 16:01:18 -0500900 struct rbd_device *rbd_dev,
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700901 struct ceph_snap_context *snapc,
902 u64 snapid,
Alex Elderaded07e2012-07-03 16:01:18 -0500903 const char *object_name, u64 ofs, u64 len,
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700904 struct bio *bio,
905 struct page **pages,
906 int num_pages,
907 int flags,
908 struct ceph_osd_req_op *ops,
Yehuda Sadeh1fec7092011-05-13 13:52:56 -0700909 struct rbd_req_coll *coll,
910 int coll_index,
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700911 void (*rbd_cb)(struct ceph_osd_request *req,
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700912 struct ceph_msg *msg),
913 struct ceph_osd_request **linger_req,
914 u64 *ver)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700915{
916 struct ceph_osd_request *req;
917 struct ceph_file_layout *layout;
918 int ret;
919 u64 bno;
920 struct timespec mtime = CURRENT_TIME;
921 struct rbd_request *req_data;
922 struct ceph_osd_request_head *reqhead;
Alex Elder1dbb4392012-01-24 10:08:37 -0600923 struct ceph_osd_client *osdc;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700924
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700925 req_data = kzalloc(sizeof(*req_data), GFP_NOIO);
Yehuda Sadeh1fec7092011-05-13 13:52:56 -0700926 if (!req_data) {
927 if (coll)
928 rbd_coll_end_req_index(rq, coll, coll_index,
929 -ENOMEM, len);
930 return -ENOMEM;
931 }
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700932
Yehuda Sadeh1fec7092011-05-13 13:52:56 -0700933 if (coll) {
934 req_data->coll = coll;
935 req_data->coll_index = coll_index;
936 }
937
Alex Elderbd919d42012-07-13 20:35:11 -0500938 dout("rbd_do_request object_name=%s ofs=%llu len=%llu\n", object_name,
939 (unsigned long long) ofs, (unsigned long long) len);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700940
Alex Elder0ce1a792012-07-03 16:01:18 -0500941 osdc = &rbd_dev->rbd_client->client->osdc;
Alex Elder1dbb4392012-01-24 10:08:37 -0600942 req = ceph_osdc_alloc_request(osdc, flags, snapc, ops,
943 false, GFP_NOIO, pages, bio);
Sage Weil4ad12622011-05-03 09:23:36 -0700944 if (!req) {
Sage Weil4ad12622011-05-03 09:23:36 -0700945 ret = -ENOMEM;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700946 goto done_pages;
947 }
948
949 req->r_callback = rbd_cb;
950
951 req_data->rq = rq;
952 req_data->bio = bio;
953 req_data->pages = pages;
954 req_data->len = len;
955
956 req->r_priv = req_data;
957
958 reqhead = req->r_request->front.iov_base;
959 reqhead->snapid = cpu_to_le64(CEPH_NOSNAP);
960
Alex Elderaded07e2012-07-03 16:01:18 -0500961 strncpy(req->r_oid, object_name, sizeof(req->r_oid));
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700962 req->r_oid_len = strlen(req->r_oid);
963
964 layout = &req->r_file_layout;
965 memset(layout, 0, sizeof(*layout));
966 layout->fl_stripe_unit = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
967 layout->fl_stripe_count = cpu_to_le32(1);
968 layout->fl_object_size = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
Alex Elder0ce1a792012-07-03 16:01:18 -0500969 layout->fl_pg_pool = cpu_to_le32(rbd_dev->pool_id);
Alex Elder1dbb4392012-01-24 10:08:37 -0600970 ceph_calc_raw_layout(osdc, layout, snapid, ofs, &len, &bno,
971 req, ops);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700972
973 ceph_osdc_build_request(req, ofs, &len,
974 ops,
975 snapc,
976 &mtime,
977 req->r_oid, req->r_oid_len);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700978
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700979 if (linger_req) {
Alex Elder1dbb4392012-01-24 10:08:37 -0600980 ceph_osdc_set_request_linger(osdc, req);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700981 *linger_req = req;
982 }
983
Alex Elder1dbb4392012-01-24 10:08:37 -0600984 ret = ceph_osdc_start_request(osdc, req, false);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700985 if (ret < 0)
986 goto done_err;
987
988 if (!rbd_cb) {
Alex Elder1dbb4392012-01-24 10:08:37 -0600989 ret = ceph_osdc_wait_request(osdc, req);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700990 if (ver)
991 *ver = le64_to_cpu(req->r_reassert_version.version);
Alex Elderbd919d42012-07-13 20:35:11 -0500992 dout("reassert_ver=%llu\n",
993 (unsigned long long)
994 le64_to_cpu(req->r_reassert_version.version));
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700995 ceph_osdc_put_request(req);
996 }
997 return ret;
998
999done_err:
1000 bio_chain_put(req_data->bio);
1001 ceph_osdc_put_request(req);
1002done_pages:
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001003 rbd_coll_end_req(req_data, ret, len);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001004 kfree(req_data);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001005 return ret;
1006}
1007
1008/*
1009 * Ceph osd op callback
1010 */
1011static void rbd_req_cb(struct ceph_osd_request *req, struct ceph_msg *msg)
1012{
1013 struct rbd_request *req_data = req->r_priv;
1014 struct ceph_osd_reply_head *replyhead;
1015 struct ceph_osd_op *op;
1016 __s32 rc;
1017 u64 bytes;
1018 int read_op;
1019
1020 /* parse reply */
1021 replyhead = msg->front.iov_base;
1022 WARN_ON(le32_to_cpu(replyhead->num_ops) == 0);
1023 op = (void *)(replyhead + 1);
1024 rc = le32_to_cpu(replyhead->result);
1025 bytes = le64_to_cpu(op->extent.length);
Dan Carpenter895cfcc2012-06-06 09:15:33 -05001026 read_op = (le16_to_cpu(op->op) == CEPH_OSD_OP_READ);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001027
Alex Elderbd919d42012-07-13 20:35:11 -05001028 dout("rbd_req_cb bytes=%llu readop=%d rc=%d\n",
1029 (unsigned long long) bytes, read_op, (int) rc);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001030
1031 if (rc == -ENOENT && read_op) {
1032 zero_bio_chain(req_data->bio, 0);
1033 rc = 0;
1034 } else if (rc == 0 && read_op && bytes < req_data->len) {
1035 zero_bio_chain(req_data->bio, bytes);
1036 bytes = req_data->len;
1037 }
1038
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001039 rbd_coll_end_req(req_data, rc, bytes);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001040
1041 if (req_data->bio)
1042 bio_chain_put(req_data->bio);
1043
1044 ceph_osdc_put_request(req);
1045 kfree(req_data);
1046}
1047
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001048static void rbd_simple_req_cb(struct ceph_osd_request *req, struct ceph_msg *msg)
1049{
1050 ceph_osdc_put_request(req);
1051}
1052
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001053/*
1054 * Do a synchronous ceph osd operation
1055 */
Alex Elder0ce1a792012-07-03 16:01:18 -05001056static int rbd_req_sync_op(struct rbd_device *rbd_dev,
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001057 struct ceph_snap_context *snapc,
1058 u64 snapid,
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001059 int flags,
Alex Elder913d2fd2012-06-26 12:57:03 -07001060 struct ceph_osd_req_op *ops,
Alex Elderaded07e2012-07-03 16:01:18 -05001061 const char *object_name,
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001062 u64 ofs, u64 len,
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001063 char *buf,
1064 struct ceph_osd_request **linger_req,
1065 u64 *ver)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001066{
1067 int ret;
1068 struct page **pages;
1069 int num_pages;
Alex Elder913d2fd2012-06-26 12:57:03 -07001070
1071 BUG_ON(ops == NULL);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001072
1073 num_pages = calc_pages_for(ofs , len);
1074 pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL);
Dan Carpenterb8d06382010-10-11 21:14:23 +02001075 if (IS_ERR(pages))
1076 return PTR_ERR(pages);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001077
Alex Elder0ce1a792012-07-03 16:01:18 -05001078 ret = rbd_do_request(NULL, rbd_dev, snapc, snapid,
Alex Elderaded07e2012-07-03 16:01:18 -05001079 object_name, ofs, len, NULL,
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001080 pages, num_pages,
1081 flags,
1082 ops,
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001083 NULL, 0,
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001084 NULL,
1085 linger_req, ver);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001086 if (ret < 0)
Alex Elder913d2fd2012-06-26 12:57:03 -07001087 goto done;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001088
1089 if ((flags & CEPH_OSD_FLAG_READ) && buf)
1090 ret = ceph_copy_from_page_vector(pages, buf, ofs, ret);
1091
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001092done:
1093 ceph_release_page_vector(pages, num_pages);
1094 return ret;
1095}
1096
1097/*
1098 * Do an asynchronous ceph osd operation
1099 */
1100static int rbd_do_op(struct request *rq,
Alex Elder0ce1a792012-07-03 16:01:18 -05001101 struct rbd_device *rbd_dev,
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001102 struct ceph_snap_context *snapc,
1103 u64 snapid,
Alex Elderd1f57ea2012-06-26 12:57:03 -07001104 int opcode, int flags,
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001105 u64 ofs, u64 len,
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001106 struct bio *bio,
1107 struct rbd_req_coll *coll,
1108 int coll_index)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001109{
1110 char *seg_name;
1111 u64 seg_ofs;
1112 u64 seg_len;
1113 int ret;
1114 struct ceph_osd_req_op *ops;
1115 u32 payload_len;
1116
1117 seg_name = kmalloc(RBD_MAX_SEG_NAME_LEN + 1, GFP_NOIO);
1118 if (!seg_name)
1119 return -ENOMEM;
1120
1121 seg_len = rbd_get_segment(&rbd_dev->header,
Alex Elderca1e49a2012-07-10 20:30:09 -05001122 rbd_dev->header.object_prefix,
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001123 ofs, len,
1124 seg_name, &seg_ofs);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001125
1126 payload_len = (flags & CEPH_OSD_FLAG_WRITE ? seg_len : 0);
1127
Alex Elder57cfc102012-06-26 12:57:03 -07001128 ret = -ENOMEM;
1129 ops = rbd_create_rw_ops(1, opcode, payload_len);
1130 if (!ops)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001131 goto done;
1132
1133 /* we've taken care of segment sizes earlier when we
1134 cloned the bios. We should never have a segment
1135 truncated at this point */
1136 BUG_ON(seg_len < len);
1137
1138 ret = rbd_do_request(rq, rbd_dev, snapc, snapid,
1139 seg_name, seg_ofs, seg_len,
1140 bio,
1141 NULL, 0,
1142 flags,
1143 ops,
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001144 coll, coll_index,
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001145 rbd_req_cb, 0, NULL);
Sage Weil11f77002011-05-12 16:13:54 -07001146
1147 rbd_destroy_ops(ops);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001148done:
1149 kfree(seg_name);
1150 return ret;
1151}
1152
1153/*
1154 * Request async osd write
1155 */
1156static int rbd_req_write(struct request *rq,
1157 struct rbd_device *rbd_dev,
1158 struct ceph_snap_context *snapc,
1159 u64 ofs, u64 len,
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001160 struct bio *bio,
1161 struct rbd_req_coll *coll,
1162 int coll_index)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001163{
1164 return rbd_do_op(rq, rbd_dev, snapc, CEPH_NOSNAP,
1165 CEPH_OSD_OP_WRITE,
1166 CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001167 ofs, len, bio, coll, coll_index);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001168}
1169
1170/*
1171 * Request async osd read
1172 */
1173static int rbd_req_read(struct request *rq,
1174 struct rbd_device *rbd_dev,
1175 u64 snapid,
1176 u64 ofs, u64 len,
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001177 struct bio *bio,
1178 struct rbd_req_coll *coll,
1179 int coll_index)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001180{
1181 return rbd_do_op(rq, rbd_dev, NULL,
Josh Durginb06e6a62011-11-21 18:16:52 -08001182 snapid,
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001183 CEPH_OSD_OP_READ,
1184 CEPH_OSD_FLAG_READ,
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001185 ofs, len, bio, coll, coll_index);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001186}
1187
1188/*
1189 * Request sync osd read
1190 */
Alex Elder0ce1a792012-07-03 16:01:18 -05001191static int rbd_req_sync_read(struct rbd_device *rbd_dev,
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001192 u64 snapid,
Alex Elderaded07e2012-07-03 16:01:18 -05001193 const char *object_name,
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001194 u64 ofs, u64 len,
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001195 char *buf,
1196 u64 *ver)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001197{
Alex Elder913d2fd2012-06-26 12:57:03 -07001198 struct ceph_osd_req_op *ops;
1199 int ret;
1200
1201 ops = rbd_create_rw_ops(1, CEPH_OSD_OP_READ, 0);
1202 if (!ops)
1203 return -ENOMEM;
1204
1205 ret = rbd_req_sync_op(rbd_dev, NULL,
Josh Durginb06e6a62011-11-21 18:16:52 -08001206 snapid,
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001207 CEPH_OSD_FLAG_READ,
Alex Elder913d2fd2012-06-26 12:57:03 -07001208 ops, object_name, ofs, len, buf, NULL, ver);
1209 rbd_destroy_ops(ops);
1210
1211 return ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001212}
1213
1214/*
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001215 * Request sync osd watch
1216 */
Alex Elder0ce1a792012-07-03 16:01:18 -05001217static int rbd_req_sync_notify_ack(struct rbd_device *rbd_dev,
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001218 u64 ver,
Alex Elder7f0a24d2012-07-25 09:32:40 -05001219 u64 notify_id)
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001220{
1221 struct ceph_osd_req_op *ops;
Sage Weil11f77002011-05-12 16:13:54 -07001222 int ret;
1223
Alex Elder57cfc102012-06-26 12:57:03 -07001224 ops = rbd_create_rw_ops(1, CEPH_OSD_OP_NOTIFY_ACK, 0);
1225 if (!ops)
1226 return -ENOMEM;
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001227
Josh Durgina71b8912011-12-05 18:10:44 -08001228 ops[0].watch.ver = cpu_to_le64(ver);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001229 ops[0].watch.cookie = notify_id;
1230 ops[0].watch.flag = 0;
1231
Alex Elder0ce1a792012-07-03 16:01:18 -05001232 ret = rbd_do_request(NULL, rbd_dev, NULL, CEPH_NOSNAP,
Alex Elder7f0a24d2012-07-25 09:32:40 -05001233 rbd_dev->header_name, 0, 0, NULL,
Alex Elderad4f2322012-07-03 16:01:19 -05001234 NULL, 0,
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001235 CEPH_OSD_FLAG_READ,
1236 ops,
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001237 NULL, 0,
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001238 rbd_simple_req_cb, 0, NULL);
1239
1240 rbd_destroy_ops(ops);
1241 return ret;
1242}
1243
1244static void rbd_watch_cb(u64 ver, u64 notify_id, u8 opcode, void *data)
1245{
Alex Elder0ce1a792012-07-03 16:01:18 -05001246 struct rbd_device *rbd_dev = (struct rbd_device *)data;
Josh Durgina71b8912011-12-05 18:10:44 -08001247 u64 hver;
Sage Weil13143d22011-05-12 16:08:30 -07001248 int rc;
1249
Alex Elder0ce1a792012-07-03 16:01:18 -05001250 if (!rbd_dev)
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001251 return;
1252
Alex Elderbd919d42012-07-13 20:35:11 -05001253 dout("rbd_watch_cb %s notify_id=%llu opcode=%u\n",
1254 rbd_dev->header_name, (unsigned long long) notify_id,
1255 (unsigned int) opcode);
Alex Elder1fe5e992012-07-25 09:32:41 -05001256 rc = rbd_refresh_header(rbd_dev, &hver);
Sage Weil13143d22011-05-12 16:08:30 -07001257 if (rc)
Alex Elderf0f8cef2012-01-29 13:57:44 -06001258 pr_warning(RBD_DRV_NAME "%d got notification but failed to "
Alex Elder0ce1a792012-07-03 16:01:18 -05001259 " update snaps: %d\n", rbd_dev->major, rc);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001260
Alex Elder7f0a24d2012-07-25 09:32:40 -05001261 rbd_req_sync_notify_ack(rbd_dev, hver, notify_id);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001262}
1263
1264/*
1265 * Request sync osd watch
1266 */
Alex Elder0e6f3222012-07-25 09:32:40 -05001267static int rbd_req_sync_watch(struct rbd_device *rbd_dev)
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001268{
1269 struct ceph_osd_req_op *ops;
Alex Elder0ce1a792012-07-03 16:01:18 -05001270 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
Alex Elder57cfc102012-06-26 12:57:03 -07001271 int ret;
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001272
Alex Elder57cfc102012-06-26 12:57:03 -07001273 ops = rbd_create_rw_ops(1, CEPH_OSD_OP_WATCH, 0);
1274 if (!ops)
1275 return -ENOMEM;
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001276
1277 ret = ceph_osdc_create_event(osdc, rbd_watch_cb, 0,
Alex Elder0ce1a792012-07-03 16:01:18 -05001278 (void *)rbd_dev, &rbd_dev->watch_event);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001279 if (ret < 0)
1280 goto fail;
1281
Alex Elder0e6f3222012-07-25 09:32:40 -05001282 ops[0].watch.ver = cpu_to_le64(rbd_dev->header.obj_version);
Alex Elder0ce1a792012-07-03 16:01:18 -05001283 ops[0].watch.cookie = cpu_to_le64(rbd_dev->watch_event->cookie);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001284 ops[0].watch.flag = 1;
1285
Alex Elder0ce1a792012-07-03 16:01:18 -05001286 ret = rbd_req_sync_op(rbd_dev, NULL,
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001287 CEPH_NOSNAP,
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001288 CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
1289 ops,
Alex Elder0e6f3222012-07-25 09:32:40 -05001290 rbd_dev->header_name,
1291 0, 0, NULL,
Alex Elder0ce1a792012-07-03 16:01:18 -05001292 &rbd_dev->watch_request, NULL);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001293
1294 if (ret < 0)
1295 goto fail_event;
1296
1297 rbd_destroy_ops(ops);
1298 return 0;
1299
1300fail_event:
Alex Elder0ce1a792012-07-03 16:01:18 -05001301 ceph_osdc_cancel_event(rbd_dev->watch_event);
1302 rbd_dev->watch_event = NULL;
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001303fail:
1304 rbd_destroy_ops(ops);
1305 return ret;
1306}
1307
Yehuda Sadeh79e3057c2011-07-12 16:56:57 -07001308/*
1309 * Request sync osd unwatch
1310 */
Alex Elder070c6332012-07-25 09:32:41 -05001311static int rbd_req_sync_unwatch(struct rbd_device *rbd_dev)
Yehuda Sadeh79e3057c2011-07-12 16:56:57 -07001312{
1313 struct ceph_osd_req_op *ops;
Alex Elder57cfc102012-06-26 12:57:03 -07001314 int ret;
Yehuda Sadeh79e3057c2011-07-12 16:56:57 -07001315
Alex Elder57cfc102012-06-26 12:57:03 -07001316 ops = rbd_create_rw_ops(1, CEPH_OSD_OP_WATCH, 0);
1317 if (!ops)
1318 return -ENOMEM;
Yehuda Sadeh79e3057c2011-07-12 16:56:57 -07001319
1320 ops[0].watch.ver = 0;
Alex Elder0ce1a792012-07-03 16:01:18 -05001321 ops[0].watch.cookie = cpu_to_le64(rbd_dev->watch_event->cookie);
Yehuda Sadeh79e3057c2011-07-12 16:56:57 -07001322 ops[0].watch.flag = 0;
1323
Alex Elder0ce1a792012-07-03 16:01:18 -05001324 ret = rbd_req_sync_op(rbd_dev, NULL,
Yehuda Sadeh79e3057c2011-07-12 16:56:57 -07001325 CEPH_NOSNAP,
Yehuda Sadeh79e3057c2011-07-12 16:56:57 -07001326 CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
1327 ops,
Alex Elder070c6332012-07-25 09:32:41 -05001328 rbd_dev->header_name,
1329 0, 0, NULL, NULL, NULL);
1330
Yehuda Sadeh79e3057c2011-07-12 16:56:57 -07001331
1332 rbd_destroy_ops(ops);
Alex Elder0ce1a792012-07-03 16:01:18 -05001333 ceph_osdc_cancel_event(rbd_dev->watch_event);
1334 rbd_dev->watch_event = NULL;
Yehuda Sadeh79e3057c2011-07-12 16:56:57 -07001335 return ret;
1336}
1337
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001338struct rbd_notify_info {
Alex Elder0ce1a792012-07-03 16:01:18 -05001339 struct rbd_device *rbd_dev;
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001340};
1341
1342static void rbd_notify_cb(u64 ver, u64 notify_id, u8 opcode, void *data)
1343{
Alex Elder0ce1a792012-07-03 16:01:18 -05001344 struct rbd_device *rbd_dev = (struct rbd_device *)data;
1345 if (!rbd_dev)
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001346 return;
1347
Alex Elderbd919d42012-07-13 20:35:11 -05001348 dout("rbd_notify_cb %s notify_id=%llu opcode=%u\n",
1349 rbd_dev->header_name, (unsigned long long) notify_id,
1350 (unsigned int) opcode);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001351}
1352
1353/*
1354 * Request sync osd notify
1355 */
Alex Elder4cb16252012-07-25 09:32:40 -05001356static int rbd_req_sync_notify(struct rbd_device *rbd_dev)
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001357{
1358 struct ceph_osd_req_op *ops;
Alex Elder0ce1a792012-07-03 16:01:18 -05001359 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001360 struct ceph_osd_event *event;
1361 struct rbd_notify_info info;
1362 int payload_len = sizeof(u32) + sizeof(u32);
1363 int ret;
1364
Alex Elder57cfc102012-06-26 12:57:03 -07001365 ops = rbd_create_rw_ops(1, CEPH_OSD_OP_NOTIFY, payload_len);
1366 if (!ops)
1367 return -ENOMEM;
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001368
Alex Elder0ce1a792012-07-03 16:01:18 -05001369 info.rbd_dev = rbd_dev;
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001370
1371 ret = ceph_osdc_create_event(osdc, rbd_notify_cb, 1,
1372 (void *)&info, &event);
1373 if (ret < 0)
1374 goto fail;
1375
1376 ops[0].watch.ver = 1;
1377 ops[0].watch.flag = 1;
1378 ops[0].watch.cookie = event->cookie;
1379 ops[0].watch.prot_ver = RADOS_NOTIFY_VER;
1380 ops[0].watch.timeout = 12;
1381
Alex Elder0ce1a792012-07-03 16:01:18 -05001382 ret = rbd_req_sync_op(rbd_dev, NULL,
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001383 CEPH_NOSNAP,
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001384 CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
1385 ops,
Alex Elder4cb16252012-07-25 09:32:40 -05001386 rbd_dev->header_name,
1387 0, 0, NULL, NULL, NULL);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001388 if (ret < 0)
1389 goto fail_event;
1390
1391 ret = ceph_osdc_wait_event(event, CEPH_OSD_TIMEOUT_DEFAULT);
1392 dout("ceph_osdc_wait_event returned %d\n", ret);
1393 rbd_destroy_ops(ops);
1394 return 0;
1395
1396fail_event:
1397 ceph_osdc_cancel_event(event);
1398fail:
1399 rbd_destroy_ops(ops);
1400 return ret;
1401}
1402
1403/*
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001404 * Request sync osd read
1405 */
Alex Elder0ce1a792012-07-03 16:01:18 -05001406static int rbd_req_sync_exec(struct rbd_device *rbd_dev,
Alex Elderaded07e2012-07-03 16:01:18 -05001407 const char *object_name,
1408 const char *class_name,
1409 const char *method_name,
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001410 const char *data,
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001411 int len,
1412 u64 *ver)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001413{
1414 struct ceph_osd_req_op *ops;
Alex Elderaded07e2012-07-03 16:01:18 -05001415 int class_name_len = strlen(class_name);
1416 int method_name_len = strlen(method_name);
Alex Elder57cfc102012-06-26 12:57:03 -07001417 int ret;
1418
1419 ops = rbd_create_rw_ops(1, CEPH_OSD_OP_CALL,
Alex Elderaded07e2012-07-03 16:01:18 -05001420 class_name_len + method_name_len + len);
Alex Elder57cfc102012-06-26 12:57:03 -07001421 if (!ops)
1422 return -ENOMEM;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001423
Alex Elderaded07e2012-07-03 16:01:18 -05001424 ops[0].cls.class_name = class_name;
1425 ops[0].cls.class_len = (__u8) class_name_len;
1426 ops[0].cls.method_name = method_name;
1427 ops[0].cls.method_len = (__u8) method_name_len;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001428 ops[0].cls.argc = 0;
1429 ops[0].cls.indata = data;
1430 ops[0].cls.indata_len = len;
1431
Alex Elder0ce1a792012-07-03 16:01:18 -05001432 ret = rbd_req_sync_op(rbd_dev, NULL,
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001433 CEPH_NOSNAP,
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001434 CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
1435 ops,
Alex Elderd1f57ea2012-06-26 12:57:03 -07001436 object_name, 0, 0, NULL, NULL, ver);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001437
1438 rbd_destroy_ops(ops);
1439
1440 dout("cls_exec returned %d\n", ret);
1441 return ret;
1442}
1443
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001444static struct rbd_req_coll *rbd_alloc_coll(int num_reqs)
1445{
1446 struct rbd_req_coll *coll =
1447 kzalloc(sizeof(struct rbd_req_coll) +
1448 sizeof(struct rbd_req_status) * num_reqs,
1449 GFP_ATOMIC);
1450
1451 if (!coll)
1452 return NULL;
1453 coll->total = num_reqs;
1454 kref_init(&coll->kref);
1455 return coll;
1456}
1457
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001458/*
1459 * block device queue callback
1460 */
1461static void rbd_rq_fn(struct request_queue *q)
1462{
1463 struct rbd_device *rbd_dev = q->queuedata;
1464 struct request *rq;
1465 struct bio_pair *bp = NULL;
1466
Alex Elder00f1f362012-02-07 12:03:36 -06001467 while ((rq = blk_fetch_request(q))) {
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001468 struct bio *bio;
1469 struct bio *rq_bio, *next_bio = NULL;
1470 bool do_write;
Alex Elderbd919d42012-07-13 20:35:11 -05001471 unsigned int size;
1472 u64 op_size = 0;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001473 u64 ofs;
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001474 int num_segs, cur_seg = 0;
1475 struct rbd_req_coll *coll;
Josh Durgind1d25642011-12-05 14:03:05 -08001476 struct ceph_snap_context *snapc;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001477
1478 /* peek at request from block layer */
1479 if (!rq)
1480 break;
1481
1482 dout("fetched request\n");
1483
1484 /* filter out block requests we don't understand */
1485 if ((rq->cmd_type != REQ_TYPE_FS)) {
1486 __blk_end_request_all(rq, 0);
Alex Elder00f1f362012-02-07 12:03:36 -06001487 continue;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001488 }
1489
1490 /* deduce our operation (read, write) */
1491 do_write = (rq_data_dir(rq) == WRITE);
1492
1493 size = blk_rq_bytes(rq);
Alex Elder593a9e72012-02-07 12:03:37 -06001494 ofs = blk_rq_pos(rq) * SECTOR_SIZE;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001495 rq_bio = rq->bio;
1496 if (do_write && rbd_dev->read_only) {
1497 __blk_end_request_all(rq, -EROFS);
Alex Elder00f1f362012-02-07 12:03:36 -06001498 continue;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001499 }
1500
1501 spin_unlock_irq(q->queue_lock);
1502
Josh Durgind1d25642011-12-05 14:03:05 -08001503 down_read(&rbd_dev->header_rwsem);
Josh Durgine88a36e2011-11-21 18:14:25 -08001504
Josh Durgind1d25642011-12-05 14:03:05 -08001505 if (rbd_dev->snap_id != CEPH_NOSNAP && !rbd_dev->snap_exists) {
Josh Durgine88a36e2011-11-21 18:14:25 -08001506 up_read(&rbd_dev->header_rwsem);
Josh Durgind1d25642011-12-05 14:03:05 -08001507 dout("request for non-existent snapshot");
1508 spin_lock_irq(q->queue_lock);
1509 __blk_end_request_all(rq, -ENXIO);
1510 continue;
Josh Durgine88a36e2011-11-21 18:14:25 -08001511 }
1512
Josh Durgind1d25642011-12-05 14:03:05 -08001513 snapc = ceph_get_snap_context(rbd_dev->header.snapc);
1514
1515 up_read(&rbd_dev->header_rwsem);
1516
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001517 dout("%s 0x%x bytes at 0x%llx\n",
1518 do_write ? "write" : "read",
Alex Elderbd919d42012-07-13 20:35:11 -05001519 size, (unsigned long long) blk_rq_pos(rq) * SECTOR_SIZE);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001520
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001521 num_segs = rbd_get_num_segments(&rbd_dev->header, ofs, size);
1522 coll = rbd_alloc_coll(num_segs);
1523 if (!coll) {
1524 spin_lock_irq(q->queue_lock);
1525 __blk_end_request_all(rq, -ENOMEM);
Josh Durgind1d25642011-12-05 14:03:05 -08001526 ceph_put_snap_context(snapc);
Alex Elder00f1f362012-02-07 12:03:36 -06001527 continue;
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001528 }
1529
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001530 do {
1531 /* a bio clone to be passed down to OSD req */
Alex Elderbd919d42012-07-13 20:35:11 -05001532 dout("rq->bio->bi_vcnt=%hu\n", rq->bio->bi_vcnt);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001533 op_size = rbd_get_segment(&rbd_dev->header,
Alex Elderca1e49a2012-07-10 20:30:09 -05001534 rbd_dev->header.object_prefix,
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001535 ofs, size,
1536 NULL, NULL);
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001537 kref_get(&coll->kref);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001538 bio = bio_chain_clone(&rq_bio, &next_bio, &bp,
1539 op_size, GFP_ATOMIC);
1540 if (!bio) {
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001541 rbd_coll_end_req_index(rq, coll, cur_seg,
1542 -ENOMEM, op_size);
1543 goto next_seg;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001544 }
1545
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001546
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001547 /* init OSD command: write or read */
1548 if (do_write)
1549 rbd_req_write(rq, rbd_dev,
Josh Durgind1d25642011-12-05 14:03:05 -08001550 snapc,
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001551 ofs,
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001552 op_size, bio,
1553 coll, cur_seg);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001554 else
1555 rbd_req_read(rq, rbd_dev,
Josh Durgin77dfe992011-11-21 13:04:42 -08001556 rbd_dev->snap_id,
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001557 ofs,
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001558 op_size, bio,
1559 coll, cur_seg);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001560
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001561next_seg:
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001562 size -= op_size;
1563 ofs += op_size;
1564
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001565 cur_seg++;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001566 rq_bio = next_bio;
1567 } while (size > 0);
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001568 kref_put(&coll->kref, rbd_coll_release);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001569
1570 if (bp)
1571 bio_pair_release(bp);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001572 spin_lock_irq(q->queue_lock);
Josh Durgind1d25642011-12-05 14:03:05 -08001573
1574 ceph_put_snap_context(snapc);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001575 }
1576}
1577
1578/*
1579 * a queue callback. Makes sure that we don't create a bio that spans across
1580 * multiple osd objects. One exception would be with a single page bios,
1581 * which we handle later at bio_chain_clone
1582 */
1583static int rbd_merge_bvec(struct request_queue *q, struct bvec_merge_data *bmd,
1584 struct bio_vec *bvec)
1585{
1586 struct rbd_device *rbd_dev = q->queuedata;
Alex Elder593a9e72012-02-07 12:03:37 -06001587 unsigned int chunk_sectors;
1588 sector_t sector;
1589 unsigned int bio_sectors;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001590 int max;
1591
Alex Elder593a9e72012-02-07 12:03:37 -06001592 chunk_sectors = 1 << (rbd_dev->header.obj_order - SECTOR_SHIFT);
1593 sector = bmd->bi_sector + get_start_sect(bmd->bi_bdev);
1594 bio_sectors = bmd->bi_size >> SECTOR_SHIFT;
1595
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001596 max = (chunk_sectors - ((sector & (chunk_sectors - 1))
Alex Elder593a9e72012-02-07 12:03:37 -06001597 + bio_sectors)) << SECTOR_SHIFT;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001598 if (max < 0)
1599 max = 0; /* bio_add cannot handle a negative return */
1600 if (max <= bvec->bv_len && bio_sectors == 0)
1601 return bvec->bv_len;
1602 return max;
1603}
1604
/*
 * Release the gendisk and request queue for this device and free the
 * in-memory image header.  Safe to call when no disk was allocated.
 */
static void rbd_free_disk(struct rbd_device *rbd_dev)
{
	struct gendisk *disk = rbd_dev->disk;

	if (!disk)
		return;

	rbd_header_free(&rbd_dev->header);

	/* unregister before tearing down the queue */
	if (disk->flags & GENHD_FL_UP)
		del_gendisk(disk);
	if (disk->queue)
		blk_cleanup_queue(disk->queue);
	put_disk(disk);
}
1620
1621/*
Alex Elder4156d992012-08-02 11:29:46 -05001622 * Read the complete header for the given rbd device.
1623 *
1624 * Returns a pointer to a dynamically-allocated buffer containing
1625 * the complete and validated header. Caller can pass the address
1626 * of a variable that will be filled in with the version of the
1627 * header object at the time it was read.
1628 *
1629 * Returns a pointer-coded errno if a failure occurs.
1630 */
1631static struct rbd_image_header_ondisk *
1632rbd_dev_v1_header_read(struct rbd_device *rbd_dev, u64 *version)
1633{
1634 struct rbd_image_header_ondisk *ondisk = NULL;
1635 u32 snap_count = 0;
1636 u64 names_size = 0;
1637 u32 want_count;
1638 int ret;
1639
1640 /*
1641 * The complete header will include an array of its 64-bit
1642 * snapshot ids, followed by the names of those snapshots as
1643 * a contiguous block of NUL-terminated strings. Note that
1644 * the number of snapshots could change by the time we read
1645 * it in, in which case we re-read it.
1646 */
1647 do {
1648 size_t size;
1649
1650 kfree(ondisk);
1651
1652 size = sizeof (*ondisk);
1653 size += snap_count * sizeof (struct rbd_image_snap_ondisk);
1654 size += names_size;
1655 ondisk = kmalloc(size, GFP_KERNEL);
1656 if (!ondisk)
1657 return ERR_PTR(-ENOMEM);
1658
1659 ret = rbd_req_sync_read(rbd_dev, CEPH_NOSNAP,
1660 rbd_dev->header_name,
1661 0, size,
1662 (char *) ondisk, version);
1663
1664 if (ret < 0)
1665 goto out_err;
1666 if (WARN_ON((size_t) ret < size)) {
1667 ret = -ENXIO;
1668 pr_warning("short header read for image %s"
1669 " (want %zd got %d)\n",
1670 rbd_dev->image_name, size, ret);
1671 goto out_err;
1672 }
1673 if (!rbd_dev_ondisk_valid(ondisk)) {
1674 ret = -ENXIO;
1675 pr_warning("invalid header for image %s\n",
1676 rbd_dev->image_name);
1677 goto out_err;
1678 }
1679
1680 names_size = le64_to_cpu(ondisk->snap_names_len);
1681 want_count = snap_count;
1682 snap_count = le32_to_cpu(ondisk->snap_count);
1683 } while (snap_count != want_count);
1684
1685 return ondisk;
1686
1687out_err:
1688 kfree(ondisk);
1689
1690 return ERR_PTR(ret);
1691}
1692
1693/*
1694 * reload the ondisk the header
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001695 */
1696static int rbd_read_header(struct rbd_device *rbd_dev,
1697 struct rbd_image_header *header)
1698{
Alex Elder4156d992012-08-02 11:29:46 -05001699 struct rbd_image_header_ondisk *ondisk;
1700 u64 ver = 0;
1701 int ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001702
Alex Elder4156d992012-08-02 11:29:46 -05001703 ondisk = rbd_dev_v1_header_read(rbd_dev, &ver);
1704 if (IS_ERR(ondisk))
1705 return PTR_ERR(ondisk);
1706 ret = rbd_header_from_disk(header, ondisk);
1707 if (ret >= 0)
1708 header->obj_version = ver;
1709 kfree(ondisk);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001710
Alex Elder4156d992012-08-02 11:29:46 -05001711 return ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001712}
1713
1714/*
1715 * create a snapshot
1716 */
Alex Elder0ce1a792012-07-03 16:01:18 -05001717static int rbd_header_add_snap(struct rbd_device *rbd_dev,
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001718 const char *snap_name,
1719 gfp_t gfp_flags)
1720{
1721 int name_len = strlen(snap_name);
1722 u64 new_snapid;
1723 int ret;
Sage Weil916d4d62011-05-12 16:10:50 -07001724 void *data, *p, *e;
Alex Elder1dbb4392012-01-24 10:08:37 -06001725 struct ceph_mon_client *monc;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001726
1727 /* we should create a snapshot only if we're pointing at the head */
Alex Elder0ce1a792012-07-03 16:01:18 -05001728 if (rbd_dev->snap_id != CEPH_NOSNAP)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001729 return -EINVAL;
1730
Alex Elder0ce1a792012-07-03 16:01:18 -05001731 monc = &rbd_dev->rbd_client->client->monc;
1732 ret = ceph_monc_create_snapid(monc, rbd_dev->pool_id, &new_snapid);
Alex Elderbd919d42012-07-13 20:35:11 -05001733 dout("created snapid=%llu\n", (unsigned long long) new_snapid);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001734 if (ret < 0)
1735 return ret;
1736
1737 data = kmalloc(name_len + 16, gfp_flags);
1738 if (!data)
1739 return -ENOMEM;
1740
Sage Weil916d4d62011-05-12 16:10:50 -07001741 p = data;
1742 e = data + name_len + 16;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001743
Sage Weil916d4d62011-05-12 16:10:50 -07001744 ceph_encode_string_safe(&p, e, snap_name, name_len, bad);
1745 ceph_encode_64_safe(&p, e, new_snapid, bad);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001746
Alex Elder0bed54d2012-07-03 16:01:18 -05001747 ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name,
Alex Elder0ce1a792012-07-03 16:01:18 -05001748 "rbd", "snap_add",
Alex Elderd67d4be2012-07-13 20:35:11 -05001749 data, p - data, NULL);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001750
Sage Weil916d4d62011-05-12 16:10:50 -07001751 kfree(data);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001752
Alex Elder505cbb92012-07-19 08:49:18 -05001753 return ret < 0 ? ret : 0;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001754bad:
1755 return -ERANGE;
1756}
1757
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001758static void __rbd_remove_all_snaps(struct rbd_device *rbd_dev)
1759{
1760 struct rbd_snap *snap;
Alex Eldera0593292012-07-19 09:09:27 -05001761 struct rbd_snap *next;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001762
Alex Eldera0593292012-07-19 09:09:27 -05001763 list_for_each_entry_safe(snap, next, &rbd_dev->snaps, node)
Alex Elder14e70852012-07-19 09:09:27 -05001764 __rbd_remove_snap_dev(snap);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001765}
1766
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001767/*
1768 * only read the first part of the ondisk header, without the snaps info
1769 */
Alex Elderb8136232012-07-25 09:32:41 -05001770static int __rbd_refresh_header(struct rbd_device *rbd_dev, u64 *hver)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001771{
1772 int ret;
1773 struct rbd_image_header h;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001774
1775 ret = rbd_read_header(rbd_dev, &h);
1776 if (ret < 0)
1777 return ret;
1778
Josh Durgina51aa0c2011-12-05 10:35:04 -08001779 down_write(&rbd_dev->header_rwsem);
1780
Sage Weil9db4b3e2011-04-19 22:49:06 -07001781 /* resized? */
Josh Durgin474ef7c2011-11-21 17:13:54 -08001782 if (rbd_dev->snap_id == CEPH_NOSNAP) {
1783 sector_t size = (sector_t) h.image_size / SECTOR_SIZE;
1784
1785 dout("setting size to %llu sectors", (unsigned long long) size);
1786 set_capacity(rbd_dev->disk, size);
1787 }
Sage Weil9db4b3e2011-04-19 22:49:06 -07001788
Alex Elder849b4262012-07-09 21:04:24 -05001789 /* rbd_dev->header.object_prefix shouldn't change */
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001790 kfree(rbd_dev->header.snap_sizes);
Alex Elder849b4262012-07-09 21:04:24 -05001791 kfree(rbd_dev->header.snap_names);
Josh Durgind1d25642011-12-05 14:03:05 -08001792 /* osd requests may still refer to snapc */
1793 ceph_put_snap_context(rbd_dev->header.snapc);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001794
Alex Elderb8136232012-07-25 09:32:41 -05001795 if (hver)
1796 *hver = h.obj_version;
Josh Durgina71b8912011-12-05 18:10:44 -08001797 rbd_dev->header.obj_version = h.obj_version;
Josh Durgin93a24e02011-12-05 10:41:28 -08001798 rbd_dev->header.image_size = h.image_size;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001799 rbd_dev->header.total_snaps = h.total_snaps;
1800 rbd_dev->header.snapc = h.snapc;
1801 rbd_dev->header.snap_names = h.snap_names;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001802 rbd_dev->header.snap_names_len = h.snap_names_len;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001803 rbd_dev->header.snap_sizes = h.snap_sizes;
Alex Elder849b4262012-07-09 21:04:24 -05001804 /* Free the extra copy of the object prefix */
1805 WARN_ON(strcmp(rbd_dev->header.object_prefix, h.object_prefix));
1806 kfree(h.object_prefix);
1807
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001808 ret = __rbd_init_snaps_header(rbd_dev);
1809
Josh Durginc6666012011-11-21 17:11:12 -08001810 up_write(&rbd_dev->header_rwsem);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001811
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001812 return ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001813}
1814
Alex Elder1fe5e992012-07-25 09:32:41 -05001815static int rbd_refresh_header(struct rbd_device *rbd_dev, u64 *hver)
1816{
1817 int ret;
1818
1819 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
1820 ret = __rbd_refresh_header(rbd_dev, hver);
1821 mutex_unlock(&ctl_mutex);
1822
1823 return ret;
1824}
1825
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001826static int rbd_init_disk(struct rbd_device *rbd_dev)
1827{
1828 struct gendisk *disk;
1829 struct request_queue *q;
1830 int rc;
Alex Elder593a9e72012-02-07 12:03:37 -06001831 u64 segment_size;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001832 u64 total_size = 0;
1833
1834 /* contact OSD, request size info about the object being mapped */
1835 rc = rbd_read_header(rbd_dev, &rbd_dev->header);
1836 if (rc)
1837 return rc;
1838
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001839 /* no need to lock here, as rbd_dev is not registered yet */
1840 rc = __rbd_init_snaps_header(rbd_dev);
1841 if (rc)
1842 return rc;
1843
Josh Durgincc9d7342011-11-21 18:19:13 -08001844 rc = rbd_header_set_snap(rbd_dev, &total_size);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001845 if (rc)
1846 return rc;
1847
1848 /* create gendisk info */
1849 rc = -ENOMEM;
1850 disk = alloc_disk(RBD_MINORS_PER_MAJOR);
1851 if (!disk)
1852 goto out;
1853
Alex Elderf0f8cef2012-01-29 13:57:44 -06001854 snprintf(disk->disk_name, sizeof(disk->disk_name), RBD_DRV_NAME "%d",
Alex Elderde71a292012-07-03 16:01:19 -05001855 rbd_dev->dev_id);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001856 disk->major = rbd_dev->major;
1857 disk->first_minor = 0;
1858 disk->fops = &rbd_bd_ops;
1859 disk->private_data = rbd_dev;
1860
1861 /* init rq */
1862 rc = -ENOMEM;
1863 q = blk_init_queue(rbd_rq_fn, &rbd_dev->lock);
1864 if (!q)
1865 goto out_disk;
Josh Durgin029bcbd2011-07-22 11:35:23 -07001866
Alex Elder593a9e72012-02-07 12:03:37 -06001867 /* We use the default size, but let's be explicit about it. */
1868 blk_queue_physical_block_size(q, SECTOR_SIZE);
1869
Josh Durgin029bcbd2011-07-22 11:35:23 -07001870 /* set io sizes to object size */
Alex Elder593a9e72012-02-07 12:03:37 -06001871 segment_size = rbd_obj_bytes(&rbd_dev->header);
1872 blk_queue_max_hw_sectors(q, segment_size / SECTOR_SIZE);
1873 blk_queue_max_segment_size(q, segment_size);
1874 blk_queue_io_min(q, segment_size);
1875 blk_queue_io_opt(q, segment_size);
Josh Durgin029bcbd2011-07-22 11:35:23 -07001876
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001877 blk_queue_merge_bvec(q, rbd_merge_bvec);
1878 disk->queue = q;
1879
1880 q->queuedata = rbd_dev;
1881
1882 rbd_dev->disk = disk;
1883 rbd_dev->q = q;
1884
1885 /* finally, announce the disk to the world */
Alex Elder593a9e72012-02-07 12:03:37 -06001886 set_capacity(disk, total_size / SECTOR_SIZE);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001887 add_disk(disk);
1888
1889 pr_info("%s: added with size 0x%llx\n",
1890 disk->disk_name, (unsigned long long)total_size);
1891 return 0;
1892
1893out_disk:
1894 put_disk(disk);
1895out:
1896 return rc;
1897}
1898
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001899/*
1900 sysfs
1901*/
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001902
Alex Elder593a9e72012-02-07 12:03:37 -06001903static struct rbd_device *dev_to_rbd_dev(struct device *dev)
1904{
1905 return container_of(dev, struct rbd_device, dev);
1906}
1907
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001908static ssize_t rbd_size_show(struct device *dev,
1909 struct device_attribute *attr, char *buf)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001910{
Alex Elder593a9e72012-02-07 12:03:37 -06001911 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Josh Durgina51aa0c2011-12-05 10:35:04 -08001912 sector_t size;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001913
Josh Durgina51aa0c2011-12-05 10:35:04 -08001914 down_read(&rbd_dev->header_rwsem);
1915 size = get_capacity(rbd_dev->disk);
1916 up_read(&rbd_dev->header_rwsem);
1917
1918 return sprintf(buf, "%llu\n", (unsigned long long) size * SECTOR_SIZE);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001919}
1920
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001921static ssize_t rbd_major_show(struct device *dev,
1922 struct device_attribute *attr, char *buf)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001923{
Alex Elder593a9e72012-02-07 12:03:37 -06001924 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001925
1926 return sprintf(buf, "%d\n", rbd_dev->major);
1927}
1928
1929static ssize_t rbd_client_id_show(struct device *dev,
1930 struct device_attribute *attr, char *buf)
1931{
Alex Elder593a9e72012-02-07 12:03:37 -06001932 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001933
Alex Elder1dbb4392012-01-24 10:08:37 -06001934 return sprintf(buf, "client%lld\n",
1935 ceph_client_id(rbd_dev->rbd_client->client));
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001936}
1937
1938static ssize_t rbd_pool_show(struct device *dev,
1939 struct device_attribute *attr, char *buf)
1940{
Alex Elder593a9e72012-02-07 12:03:37 -06001941 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001942
1943 return sprintf(buf, "%s\n", rbd_dev->pool_name);
1944}
1945
Alex Elder9bb2f332012-07-12 10:46:35 -05001946static ssize_t rbd_pool_id_show(struct device *dev,
1947 struct device_attribute *attr, char *buf)
1948{
1949 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
1950
1951 return sprintf(buf, "%d\n", rbd_dev->pool_id);
1952}
1953
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001954static ssize_t rbd_name_show(struct device *dev,
1955 struct device_attribute *attr, char *buf)
1956{
Alex Elder593a9e72012-02-07 12:03:37 -06001957 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001958
Alex Elder0bed54d2012-07-03 16:01:18 -05001959 return sprintf(buf, "%s\n", rbd_dev->image_name);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001960}
1961
1962static ssize_t rbd_snap_show(struct device *dev,
1963 struct device_attribute *attr,
1964 char *buf)
1965{
Alex Elder593a9e72012-02-07 12:03:37 -06001966 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001967
1968 return sprintf(buf, "%s\n", rbd_dev->snap_name);
1969}
1970
1971static ssize_t rbd_image_refresh(struct device *dev,
1972 struct device_attribute *attr,
1973 const char *buf,
1974 size_t size)
1975{
Alex Elder593a9e72012-02-07 12:03:37 -06001976 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Alex Elderb8136232012-07-25 09:32:41 -05001977 int ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001978
Alex Elder1fe5e992012-07-25 09:32:41 -05001979 ret = rbd_refresh_header(rbd_dev, NULL);
Alex Elderb8136232012-07-25 09:32:41 -05001980
1981 return ret < 0 ? ret : size;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001982}
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001983
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001984static DEVICE_ATTR(size, S_IRUGO, rbd_size_show, NULL);
1985static DEVICE_ATTR(major, S_IRUGO, rbd_major_show, NULL);
1986static DEVICE_ATTR(client_id, S_IRUGO, rbd_client_id_show, NULL);
1987static DEVICE_ATTR(pool, S_IRUGO, rbd_pool_show, NULL);
Alex Elder9bb2f332012-07-12 10:46:35 -05001988static DEVICE_ATTR(pool_id, S_IRUGO, rbd_pool_id_show, NULL);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001989static DEVICE_ATTR(name, S_IRUGO, rbd_name_show, NULL);
1990static DEVICE_ATTR(refresh, S_IWUSR, NULL, rbd_image_refresh);
1991static DEVICE_ATTR(current_snap, S_IRUGO, rbd_snap_show, NULL);
1992static DEVICE_ATTR(create_snap, S_IWUSR, NULL, rbd_snap_add);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001993
1994static struct attribute *rbd_attrs[] = {
1995 &dev_attr_size.attr,
1996 &dev_attr_major.attr,
1997 &dev_attr_client_id.attr,
1998 &dev_attr_pool.attr,
Alex Elder9bb2f332012-07-12 10:46:35 -05001999 &dev_attr_pool_id.attr,
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002000 &dev_attr_name.attr,
2001 &dev_attr_current_snap.attr,
2002 &dev_attr_refresh.attr,
2003 &dev_attr_create_snap.attr,
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002004 NULL
2005};
2006
2007static struct attribute_group rbd_attr_group = {
2008 .attrs = rbd_attrs,
2009};
2010
2011static const struct attribute_group *rbd_attr_groups[] = {
2012 &rbd_attr_group,
2013 NULL
2014};
2015
2016static void rbd_sysfs_dev_release(struct device *dev)
2017{
2018}
2019
2020static struct device_type rbd_device_type = {
2021 .name = "rbd",
2022 .groups = rbd_attr_groups,
2023 .release = rbd_sysfs_dev_release,
2024};
2025
2026
2027/*
2028 sysfs - snapshots
2029*/
2030
2031static ssize_t rbd_snap_size_show(struct device *dev,
2032 struct device_attribute *attr,
2033 char *buf)
2034{
2035 struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
2036
Josh Durgin35915382011-12-05 18:25:13 -08002037 return sprintf(buf, "%llu\n", (unsigned long long)snap->size);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002038}
2039
2040static ssize_t rbd_snap_id_show(struct device *dev,
2041 struct device_attribute *attr,
2042 char *buf)
2043{
2044 struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
2045
Josh Durgin35915382011-12-05 18:25:13 -08002046 return sprintf(buf, "%llu\n", (unsigned long long)snap->id);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002047}
2048
2049static DEVICE_ATTR(snap_size, S_IRUGO, rbd_snap_size_show, NULL);
2050static DEVICE_ATTR(snap_id, S_IRUGO, rbd_snap_id_show, NULL);
2051
2052static struct attribute *rbd_snap_attrs[] = {
2053 &dev_attr_snap_size.attr,
2054 &dev_attr_snap_id.attr,
2055 NULL,
2056};
2057
2058static struct attribute_group rbd_snap_attr_group = {
2059 .attrs = rbd_snap_attrs,
2060};
2061
2062static void rbd_snap_dev_release(struct device *dev)
2063{
2064 struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
2065 kfree(snap->name);
2066 kfree(snap);
2067}
2068
2069static const struct attribute_group *rbd_snap_attr_groups[] = {
2070 &rbd_snap_attr_group,
2071 NULL
2072};
2073
2074static struct device_type rbd_snap_device_type = {
2075 .groups = rbd_snap_attr_groups,
2076 .release = rbd_snap_dev_release,
2077};
2078
Alex Elder14e70852012-07-19 09:09:27 -05002079static void __rbd_remove_snap_dev(struct rbd_snap *snap)
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002080{
2081 list_del(&snap->node);
2082 device_unregister(&snap->dev);
2083}
2084
Alex Elder14e70852012-07-19 09:09:27 -05002085static int rbd_register_snap_dev(struct rbd_snap *snap,
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002086 struct device *parent)
2087{
2088 struct device *dev = &snap->dev;
2089 int ret;
2090
2091 dev->type = &rbd_snap_device_type;
2092 dev->parent = parent;
2093 dev->release = rbd_snap_dev_release;
2094 dev_set_name(dev, "snap_%s", snap->name);
2095 ret = device_register(dev);
2096
2097 return ret;
2098}
2099
Alex Elder4e891e02012-07-10 20:30:10 -05002100static struct rbd_snap *__rbd_add_snap_dev(struct rbd_device *rbd_dev,
2101 int i, const char *name)
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002102{
Alex Elder4e891e02012-07-10 20:30:10 -05002103 struct rbd_snap *snap;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002104 int ret;
Alex Elder4e891e02012-07-10 20:30:10 -05002105
2106 snap = kzalloc(sizeof (*snap), GFP_KERNEL);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002107 if (!snap)
Alex Elder4e891e02012-07-10 20:30:10 -05002108 return ERR_PTR(-ENOMEM);
2109
2110 ret = -ENOMEM;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002111 snap->name = kstrdup(name, GFP_KERNEL);
Alex Elder4e891e02012-07-10 20:30:10 -05002112 if (!snap->name)
2113 goto err;
2114
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002115 snap->size = rbd_dev->header.snap_sizes[i];
2116 snap->id = rbd_dev->header.snapc->snaps[i];
2117 if (device_is_registered(&rbd_dev->dev)) {
Alex Elder14e70852012-07-19 09:09:27 -05002118 ret = rbd_register_snap_dev(snap, &rbd_dev->dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002119 if (ret < 0)
2120 goto err;
2121 }
Alex Elder4e891e02012-07-10 20:30:10 -05002122
2123 return snap;
2124
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002125err:
2126 kfree(snap->name);
2127 kfree(snap);
Alex Elder4e891e02012-07-10 20:30:10 -05002128
2129 return ERR_PTR(ret);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002130}
2131
2132/*
Alex Elder35938152012-08-02 11:29:46 -05002133 * Scan the rbd device's current snapshot list and compare it to the
2134 * newly-received snapshot context. Remove any existing snapshots
2135 * not present in the new snapshot context. Add a new snapshot for
2136 * any snaphots in the snapshot context not in the current list.
2137 * And verify there are no changes to snapshots we already know
2138 * about.
2139 *
2140 * Assumes the snapshots in the snapshot context are sorted by
2141 * snapshot id, highest id first. (Snapshots in the rbd_dev's list
2142 * are also maintained in that order.)
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002143 */
2144static int __rbd_init_snaps_header(struct rbd_device *rbd_dev)
2145{
Alex Elder35938152012-08-02 11:29:46 -05002146 struct ceph_snap_context *snapc = rbd_dev->header.snapc;
2147 const u32 snap_count = snapc->num_snaps;
2148 char *snap_name = rbd_dev->header.snap_names;
2149 struct list_head *head = &rbd_dev->snaps;
2150 struct list_head *links = head->next;
2151 u32 index = 0;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002152
Alex Elder35938152012-08-02 11:29:46 -05002153 while (index < snap_count || links != head) {
2154 u64 snap_id;
2155 struct rbd_snap *snap;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002156
Alex Elder35938152012-08-02 11:29:46 -05002157 snap_id = index < snap_count ? snapc->snaps[index]
2158 : CEPH_NOSNAP;
2159 snap = links != head ? list_entry(links, struct rbd_snap, node)
2160 : NULL;
2161 BUG_ON(snap && snap->id == CEPH_NOSNAP);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002162
Alex Elder35938152012-08-02 11:29:46 -05002163 if (snap_id == CEPH_NOSNAP || (snap && snap->id > snap_id)) {
2164 struct list_head *next = links->next;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002165
Alex Elder35938152012-08-02 11:29:46 -05002166 /* Existing snapshot not in the new snap context */
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002167
Alex Elder35938152012-08-02 11:29:46 -05002168 if (rbd_dev->snap_id == snap->id)
Josh Durgine88a36e2011-11-21 18:14:25 -08002169 rbd_dev->snap_exists = false;
Alex Elder35938152012-08-02 11:29:46 -05002170 __rbd_remove_snap_dev(snap);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002171
Alex Elder35938152012-08-02 11:29:46 -05002172 /* Done with this list entry; advance */
2173
2174 links = next;
2175 continue;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002176 }
Alex Elder35938152012-08-02 11:29:46 -05002177
2178 if (!snap || (snap_id != CEPH_NOSNAP && snap->id < snap_id)) {
2179 struct rbd_snap *new_snap;
2180
2181 /* We haven't seen this snapshot before */
2182
2183 new_snap = __rbd_add_snap_dev(rbd_dev, index,
2184 snap_name);
2185 if (IS_ERR(new_snap))
2186 return PTR_ERR(new_snap);
2187
2188 /* New goes before existing, or at end of list */
2189
2190 if (snap)
2191 list_add_tail(&new_snap->node, &snap->node);
2192 else
Alex Elder523f3252012-08-30 00:16:37 -05002193 list_add_tail(&new_snap->node, head);
Alex Elder35938152012-08-02 11:29:46 -05002194 } else {
2195 /* Already have this one */
2196
2197 BUG_ON(snap->size != rbd_dev->header.snap_sizes[index]);
2198 BUG_ON(strcmp(snap->name, snap_name));
2199
2200 /* Done with this list entry; advance */
2201
2202 links = links->next;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002203 }
Alex Elder35938152012-08-02 11:29:46 -05002204
2205 /* Advance to the next entry in the snapshot context */
2206
2207 index++;
2208 snap_name += strlen(snap_name) + 1;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002209 }
2210
2211 return 0;
2212}
2213
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002214static int rbd_bus_add_dev(struct rbd_device *rbd_dev)
2215{
Alex Elderf0f8cef2012-01-29 13:57:44 -06002216 int ret;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002217 struct device *dev;
2218 struct rbd_snap *snap;
2219
2220 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
2221 dev = &rbd_dev->dev;
2222
2223 dev->bus = &rbd_bus_type;
2224 dev->type = &rbd_device_type;
2225 dev->parent = &rbd_root_dev;
2226 dev->release = rbd_dev_release;
Alex Elderde71a292012-07-03 16:01:19 -05002227 dev_set_name(dev, "%d", rbd_dev->dev_id);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002228 ret = device_register(dev);
2229 if (ret < 0)
Alex Elderf0f8cef2012-01-29 13:57:44 -06002230 goto out;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002231
2232 list_for_each_entry(snap, &rbd_dev->snaps, node) {
Alex Elder14e70852012-07-19 09:09:27 -05002233 ret = rbd_register_snap_dev(snap, &rbd_dev->dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002234 if (ret < 0)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002235 break;
2236 }
Alex Elderf0f8cef2012-01-29 13:57:44 -06002237out:
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002238 mutex_unlock(&ctl_mutex);
2239 return ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002240}
2241
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002242static void rbd_bus_del_dev(struct rbd_device *rbd_dev)
2243{
2244 device_unregister(&rbd_dev->dev);
2245}
2246
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07002247static int rbd_init_watch_dev(struct rbd_device *rbd_dev)
2248{
2249 int ret, rc;
2250
2251 do {
Alex Elder0e6f3222012-07-25 09:32:40 -05002252 ret = rbd_req_sync_watch(rbd_dev);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07002253 if (ret == -ERANGE) {
Alex Elder1fe5e992012-07-25 09:32:41 -05002254 rc = rbd_refresh_header(rbd_dev, NULL);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07002255 if (rc < 0)
2256 return rc;
2257 }
2258 } while (ret == -ERANGE);
2259
2260 return ret;
2261}
2262
Alex Elder1ddbe942012-01-29 13:57:44 -06002263static atomic64_t rbd_id_max = ATOMIC64_INIT(0);
2264
2265/*
Alex Elder499afd52012-02-02 08:13:29 -06002266 * Get a unique rbd identifier for the given new rbd_dev, and add
2267 * the rbd_dev to the global list. The minimum rbd id is 1.
Alex Elder1ddbe942012-01-29 13:57:44 -06002268 */
Alex Elder499afd52012-02-02 08:13:29 -06002269static void rbd_id_get(struct rbd_device *rbd_dev)
Alex Elderb7f23c32012-01-29 13:57:43 -06002270{
Alex Elderde71a292012-07-03 16:01:19 -05002271 rbd_dev->dev_id = atomic64_inc_return(&rbd_id_max);
Alex Elder499afd52012-02-02 08:13:29 -06002272
2273 spin_lock(&rbd_dev_list_lock);
2274 list_add_tail(&rbd_dev->node, &rbd_dev_list);
2275 spin_unlock(&rbd_dev_list_lock);
Alex Elder1ddbe942012-01-29 13:57:44 -06002276}
Alex Elderb7f23c32012-01-29 13:57:43 -06002277
Alex Elder1ddbe942012-01-29 13:57:44 -06002278/*
Alex Elder499afd52012-02-02 08:13:29 -06002279 * Remove an rbd_dev from the global list, and record that its
2280 * identifier is no longer in use.
Alex Elder1ddbe942012-01-29 13:57:44 -06002281 */
Alex Elder499afd52012-02-02 08:13:29 -06002282static void rbd_id_put(struct rbd_device *rbd_dev)
Alex Elder1ddbe942012-01-29 13:57:44 -06002283{
Alex Elderd184f6b2012-01-29 13:57:44 -06002284 struct list_head *tmp;
Alex Elderde71a292012-07-03 16:01:19 -05002285 int rbd_id = rbd_dev->dev_id;
Alex Elderd184f6b2012-01-29 13:57:44 -06002286 int max_id;
2287
2288 BUG_ON(rbd_id < 1);
Alex Elder499afd52012-02-02 08:13:29 -06002289
2290 spin_lock(&rbd_dev_list_lock);
2291 list_del_init(&rbd_dev->node);
Alex Elderd184f6b2012-01-29 13:57:44 -06002292
2293 /*
2294 * If the id being "put" is not the current maximum, there
2295 * is nothing special we need to do.
2296 */
2297 if (rbd_id != atomic64_read(&rbd_id_max)) {
2298 spin_unlock(&rbd_dev_list_lock);
2299 return;
2300 }
2301
2302 /*
2303 * We need to update the current maximum id. Search the
2304 * list to find out what it is. We're more likely to find
2305 * the maximum at the end, so search the list backward.
2306 */
2307 max_id = 0;
2308 list_for_each_prev(tmp, &rbd_dev_list) {
2309 struct rbd_device *rbd_dev;
2310
2311 rbd_dev = list_entry(tmp, struct rbd_device, node);
2312 if (rbd_id > max_id)
2313 max_id = rbd_id;
2314 }
Alex Elder499afd52012-02-02 08:13:29 -06002315 spin_unlock(&rbd_dev_list_lock);
Alex Elderb7f23c32012-01-29 13:57:43 -06002316
Alex Elder1ddbe942012-01-29 13:57:44 -06002317 /*
Alex Elderd184f6b2012-01-29 13:57:44 -06002318 * The max id could have been updated by rbd_id_get(), in
2319 * which case it now accurately reflects the new maximum.
2320 * Be careful not to overwrite the maximum value in that
2321 * case.
Alex Elder1ddbe942012-01-29 13:57:44 -06002322 */
Alex Elderd184f6b2012-01-29 13:57:44 -06002323 atomic64_cmpxchg(&rbd_id_max, rbd_id, max_id);
Alex Elderb7f23c32012-01-29 13:57:43 -06002324}
2325
/*
 * Skips over white space at *buf, and updates *buf to point to the
 * first found non-space character (if any).  Returns the length of
 * the token (string of non-white space characters) found.  Note
 * that *buf must be terminated with '\0'.
 */
static inline size_t next_token(const char **buf)
{
	/*
	 * These are the characters that produce nonzero for
	 * isspace() in the "C" and "POSIX" locales.
	 */
	static const char spaces[] = " \f\n\r\t\v";

	*buf += strspn(*buf, spaces);	/* skip leading whitespace */

	return strcspn(*buf, spaces);	/* length of the token found */
}
2344
/*
 * Finds the next token in *buf and, if the provided token buffer is
 * big enough, copies it there, guaranteed '\0'-terminated.  *buf
 * must be '\0'-terminated on entry.
 *
 * Returns the length of the token found (not including the '\0'):
 * 0 if no token was found, >= token_size if it would not fit.
 *
 * *buf is advanced past the token even when the token buffer was
 * too small to receive a copy.
 */
static inline size_t copy_token(const char **buf,
				char *token,
				size_t token_size)
{
	size_t len = next_token(buf);

	if (len < token_size) {
		memcpy(token, *buf, len);
		token[len] = '\0';
	}
	*buf += len;

	return len;
}
2374
2375/*
Alex Elderea3352f2012-07-09 21:04:23 -05002376 * Finds the next token in *buf, dynamically allocates a buffer big
2377 * enough to hold a copy of it, and copies the token into the new
2378 * buffer. The copy is guaranteed to be terminated with '\0'. Note
2379 * that a duplicate buffer is created even for a zero-length token.
2380 *
2381 * Returns a pointer to the newly-allocated duplicate, or a null
2382 * pointer if memory for the duplicate was not available. If
2383 * the lenp argument is a non-null pointer, the length of the token
2384 * (not including the '\0') is returned in *lenp.
2385 *
2386 * If successful, the *buf pointer will be updated to point beyond
2387 * the end of the found token.
2388 *
2389 * Note: uses GFP_KERNEL for allocation.
2390 */
2391static inline char *dup_token(const char **buf, size_t *lenp)
2392{
2393 char *dup;
2394 size_t len;
2395
2396 len = next_token(buf);
2397 dup = kmalloc(len + 1, GFP_KERNEL);
2398 if (!dup)
2399 return NULL;
2400
2401 memcpy(dup, *buf, len);
2402 *(dup + len) = '\0';
2403 *buf += len;
2404
2405 if (lenp)
2406 *lenp = len;
2407
2408 return dup;
2409}
2410
2411/*
Alex Elder0bed54d2012-07-03 16:01:18 -05002412 * This fills in the pool_name, image_name, image_name_len, snap_name,
Alex Eldera725f65e2012-02-02 08:13:30 -06002413 * rbd_dev, rbd_md_name, and name fields of the given rbd_dev, based
2414 * on the list of monitor addresses and other options provided via
2415 * /sys/bus/rbd/add.
Alex Elderd22f76e2012-07-12 10:46:35 -05002416 *
2417 * Note: rbd_dev is assumed to have been initially zero-filled.
Alex Eldera725f65e2012-02-02 08:13:30 -06002418 */
2419static int rbd_add_parse_args(struct rbd_device *rbd_dev,
2420 const char *buf,
Alex Elder7ef32142012-02-02 08:13:30 -06002421 const char **mon_addrs,
Alex Elder5214ecc2012-02-02 08:13:30 -06002422 size_t *mon_addrs_size,
Alex Eldere28fff262012-02-02 08:13:30 -06002423 char *options,
Alex Elder0bed54d2012-07-03 16:01:18 -05002424 size_t options_size)
Alex Eldera725f65e2012-02-02 08:13:30 -06002425{
Alex Elderd22f76e2012-07-12 10:46:35 -05002426 size_t len;
2427 int ret;
Alex Eldere28fff262012-02-02 08:13:30 -06002428
2429 /* The first four tokens are required */
2430
Alex Elder7ef32142012-02-02 08:13:30 -06002431 len = next_token(&buf);
2432 if (!len)
Alex Eldera725f65e2012-02-02 08:13:30 -06002433 return -EINVAL;
Alex Elder5214ecc2012-02-02 08:13:30 -06002434 *mon_addrs_size = len + 1;
Alex Elder7ef32142012-02-02 08:13:30 -06002435 *mon_addrs = buf;
2436
2437 buf += len;
Alex Eldera725f65e2012-02-02 08:13:30 -06002438
Alex Eldere28fff262012-02-02 08:13:30 -06002439 len = copy_token(&buf, options, options_size);
2440 if (!len || len >= options_size)
2441 return -EINVAL;
Alex Eldera725f65e2012-02-02 08:13:30 -06002442
Alex Elderbf3e5ae2012-07-09 21:04:23 -05002443 ret = -ENOMEM;
Alex Elderd22f76e2012-07-12 10:46:35 -05002444 rbd_dev->pool_name = dup_token(&buf, NULL);
2445 if (!rbd_dev->pool_name)
Alex Elderd22f76e2012-07-12 10:46:35 -05002446 goto out_err;
Alex Eldere28fff262012-02-02 08:13:30 -06002447
Alex Elder0bed54d2012-07-03 16:01:18 -05002448 rbd_dev->image_name = dup_token(&buf, &rbd_dev->image_name_len);
2449 if (!rbd_dev->image_name)
Alex Elderbf3e5ae2012-07-09 21:04:23 -05002450 goto out_err;
Alex Eldere28fff262012-02-02 08:13:30 -06002451
Alex Eldercb8627c2012-07-09 21:04:23 -05002452 /* Create the name of the header object */
2453
Alex Elder0bed54d2012-07-03 16:01:18 -05002454 rbd_dev->header_name = kmalloc(rbd_dev->image_name_len
Alex Elderbf3e5ae2012-07-09 21:04:23 -05002455 + sizeof (RBD_SUFFIX),
2456 GFP_KERNEL);
Alex Elder0bed54d2012-07-03 16:01:18 -05002457 if (!rbd_dev->header_name)
Alex Eldercb8627c2012-07-09 21:04:23 -05002458 goto out_err;
Alex Elder0bed54d2012-07-03 16:01:18 -05002459 sprintf(rbd_dev->header_name, "%s%s", rbd_dev->image_name, RBD_SUFFIX);
Alex Eldera725f65e2012-02-02 08:13:30 -06002460
Alex Eldere28fff262012-02-02 08:13:30 -06002461 /*
Alex Elder820a5f32012-07-09 21:04:24 -05002462 * The snapshot name is optional. If none is is supplied,
2463 * we use the default value.
Alex Eldere28fff262012-02-02 08:13:30 -06002464 */
Alex Elder820a5f32012-07-09 21:04:24 -05002465 rbd_dev->snap_name = dup_token(&buf, &len);
2466 if (!rbd_dev->snap_name)
2467 goto out_err;
2468 if (!len) {
2469 /* Replace the empty name with the default */
2470 kfree(rbd_dev->snap_name);
2471 rbd_dev->snap_name
2472 = kmalloc(sizeof (RBD_SNAP_HEAD_NAME), GFP_KERNEL);
2473 if (!rbd_dev->snap_name)
2474 goto out_err;
2475
Alex Eldere28fff262012-02-02 08:13:30 -06002476 memcpy(rbd_dev->snap_name, RBD_SNAP_HEAD_NAME,
2477 sizeof (RBD_SNAP_HEAD_NAME));
Alex Elder849b4262012-07-09 21:04:24 -05002478 }
Alex Eldere28fff262012-02-02 08:13:30 -06002479
Alex Eldera725f65e2012-02-02 08:13:30 -06002480 return 0;
Alex Elderd22f76e2012-07-12 10:46:35 -05002481
2482out_err:
Alex Elder0bed54d2012-07-03 16:01:18 -05002483 kfree(rbd_dev->header_name);
Alex Elderd78fd7a2012-07-26 23:37:14 -05002484 rbd_dev->header_name = NULL;
Alex Elder0bed54d2012-07-03 16:01:18 -05002485 kfree(rbd_dev->image_name);
Alex Elderd78fd7a2012-07-26 23:37:14 -05002486 rbd_dev->image_name = NULL;
2487 rbd_dev->image_name_len = 0;
Alex Elderd22f76e2012-07-12 10:46:35 -05002488 kfree(rbd_dev->pool_name);
2489 rbd_dev->pool_name = NULL;
2490
2491 return ret;
Alex Eldera725f65e2012-02-02 08:13:30 -06002492}
2493
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07002494static ssize_t rbd_add(struct bus_type *bus,
2495 const char *buf,
2496 size_t count)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002497{
Alex Eldercb8627c2012-07-09 21:04:23 -05002498 char *options;
2499 struct rbd_device *rbd_dev = NULL;
Alex Elder7ef32142012-02-02 08:13:30 -06002500 const char *mon_addrs = NULL;
2501 size_t mon_addrs_size = 0;
Alex Elder27cc2592012-02-02 08:13:30 -06002502 struct ceph_osd_client *osdc;
2503 int rc = -ENOMEM;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002504
2505 if (!try_module_get(THIS_MODULE))
2506 return -ENODEV;
2507
Alex Elder27cc2592012-02-02 08:13:30 -06002508 options = kmalloc(count, GFP_KERNEL);
2509 if (!options)
2510 goto err_nomem;
Alex Eldercb8627c2012-07-09 21:04:23 -05002511 rbd_dev = kzalloc(sizeof(*rbd_dev), GFP_KERNEL);
2512 if (!rbd_dev)
2513 goto err_nomem;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002514
2515 /* static rbd_device initialization */
2516 spin_lock_init(&rbd_dev->lock);
2517 INIT_LIST_HEAD(&rbd_dev->node);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002518 INIT_LIST_HEAD(&rbd_dev->snaps);
Josh Durginc6666012011-11-21 17:11:12 -08002519 init_rwsem(&rbd_dev->header_rwsem);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002520
Alex Elderd184f6b2012-01-29 13:57:44 -06002521 /* generate unique id: find highest unique id, add one */
Alex Elder499afd52012-02-02 08:13:29 -06002522 rbd_id_get(rbd_dev);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002523
Alex Eldera725f65e2012-02-02 08:13:30 -06002524 /* Fill in the device name, now that we have its id. */
Alex Elder81a89792012-02-02 08:13:30 -06002525 BUILD_BUG_ON(DEV_NAME_LEN
2526 < sizeof (RBD_DRV_NAME) + MAX_INT_FORMAT_WIDTH);
Alex Elderde71a292012-07-03 16:01:19 -05002527 sprintf(rbd_dev->name, "%s%d", RBD_DRV_NAME, rbd_dev->dev_id);
Alex Eldere124a822012-01-29 13:57:44 -06002528
Alex Eldera725f65e2012-02-02 08:13:30 -06002529 /* parse add command */
Alex Elder7ef32142012-02-02 08:13:30 -06002530 rc = rbd_add_parse_args(rbd_dev, buf, &mon_addrs, &mon_addrs_size,
Alex Eldere28fff262012-02-02 08:13:30 -06002531 options, count);
Alex Eldera725f65e2012-02-02 08:13:30 -06002532 if (rc)
2533 goto err_put_id;
2534
Alex Elder5214ecc2012-02-02 08:13:30 -06002535 rbd_dev->rbd_client = rbd_get_client(mon_addrs, mon_addrs_size - 1,
2536 options);
Alex Elderd720bcb2012-02-02 08:13:30 -06002537 if (IS_ERR(rbd_dev->rbd_client)) {
2538 rc = PTR_ERR(rbd_dev->rbd_client);
Alex Elderd78fd7a2012-07-26 23:37:14 -05002539 rbd_dev->rbd_client = NULL;
Alex Elderf0f8cef2012-01-29 13:57:44 -06002540 goto err_put_id;
Alex Elderd720bcb2012-02-02 08:13:30 -06002541 }
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002542
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002543 /* pick the pool */
Alex Elder1dbb4392012-01-24 10:08:37 -06002544 osdc = &rbd_dev->rbd_client->client->osdc;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002545 rc = ceph_pg_poolid_by_name(osdc->osdmap, rbd_dev->pool_name);
2546 if (rc < 0)
2547 goto err_out_client;
Alex Elder9bb2f332012-07-12 10:46:35 -05002548 rbd_dev->pool_id = rc;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002549
2550 /* register our block device */
Alex Elder27cc2592012-02-02 08:13:30 -06002551 rc = register_blkdev(0, rbd_dev->name);
2552 if (rc < 0)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002553 goto err_out_client;
Alex Elder27cc2592012-02-02 08:13:30 -06002554 rbd_dev->major = rc;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002555
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002556 rc = rbd_bus_add_dev(rbd_dev);
2557 if (rc)
Yehuda Sadeh766fc432011-01-07 14:58:42 -08002558 goto err_out_blkdev;
2559
Alex Elder32eec682012-02-08 16:11:14 -06002560 /*
2561 * At this point cleanup in the event of an error is the job
2562 * of the sysfs code (initiated by rbd_bus_del_dev()).
2563 *
2564 * Set up and announce blkdev mapping.
2565 */
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002566 rc = rbd_init_disk(rbd_dev);
2567 if (rc)
Yehuda Sadeh766fc432011-01-07 14:58:42 -08002568 goto err_out_bus;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002569
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07002570 rc = rbd_init_watch_dev(rbd_dev);
2571 if (rc)
2572 goto err_out_bus;
2573
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002574 return count;
2575
Yehuda Sadeh766fc432011-01-07 14:58:42 -08002576err_out_bus:
Yehuda Sadeh766fc432011-01-07 14:58:42 -08002577 /* this will also clean up rest of rbd_dev stuff */
2578
2579 rbd_bus_del_dev(rbd_dev);
2580 kfree(options);
Yehuda Sadeh766fc432011-01-07 14:58:42 -08002581 return rc;
2582
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002583err_out_blkdev:
2584 unregister_blkdev(rbd_dev->major, rbd_dev->name);
2585err_out_client:
2586 rbd_put_client(rbd_dev);
Alex Elderf0f8cef2012-01-29 13:57:44 -06002587err_put_id:
Alex Eldercb8627c2012-07-09 21:04:23 -05002588 if (rbd_dev->pool_name) {
Alex Elder820a5f32012-07-09 21:04:24 -05002589 kfree(rbd_dev->snap_name);
Alex Elder0bed54d2012-07-03 16:01:18 -05002590 kfree(rbd_dev->header_name);
2591 kfree(rbd_dev->image_name);
Alex Eldercb8627c2012-07-09 21:04:23 -05002592 kfree(rbd_dev->pool_name);
2593 }
Alex Elder499afd52012-02-02 08:13:29 -06002594 rbd_id_put(rbd_dev);
Alex Elder27cc2592012-02-02 08:13:30 -06002595err_nomem:
Alex Elder27cc2592012-02-02 08:13:30 -06002596 kfree(rbd_dev);
Alex Eldercb8627c2012-07-09 21:04:23 -05002597 kfree(options);
Alex Elder27cc2592012-02-02 08:13:30 -06002598
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002599 dout("Error adding device %s\n", buf);
2600 module_put(THIS_MODULE);
Alex Elder27cc2592012-02-02 08:13:30 -06002601
2602 return (ssize_t) rc;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002603}
2604
Alex Elderde71a292012-07-03 16:01:19 -05002605static struct rbd_device *__rbd_get_dev(unsigned long dev_id)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002606{
2607 struct list_head *tmp;
2608 struct rbd_device *rbd_dev;
2609
Alex Eldere124a822012-01-29 13:57:44 -06002610 spin_lock(&rbd_dev_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002611 list_for_each(tmp, &rbd_dev_list) {
2612 rbd_dev = list_entry(tmp, struct rbd_device, node);
Alex Elderde71a292012-07-03 16:01:19 -05002613 if (rbd_dev->dev_id == dev_id) {
Alex Eldere124a822012-01-29 13:57:44 -06002614 spin_unlock(&rbd_dev_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002615 return rbd_dev;
Alex Eldere124a822012-01-29 13:57:44 -06002616 }
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002617 }
Alex Eldere124a822012-01-29 13:57:44 -06002618 spin_unlock(&rbd_dev_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002619 return NULL;
2620}
2621
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002622static void rbd_dev_release(struct device *dev)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002623{
Alex Elder593a9e72012-02-07 12:03:37 -06002624 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002625
Alex Elder1dbb4392012-01-24 10:08:37 -06002626 if (rbd_dev->watch_request) {
2627 struct ceph_client *client = rbd_dev->rbd_client->client;
2628
2629 ceph_osdc_unregister_linger_request(&client->osdc,
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07002630 rbd_dev->watch_request);
Alex Elder1dbb4392012-01-24 10:08:37 -06002631 }
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07002632 if (rbd_dev->watch_event)
Alex Elder070c6332012-07-25 09:32:41 -05002633 rbd_req_sync_unwatch(rbd_dev);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07002634
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002635 rbd_put_client(rbd_dev);
2636
2637 /* clean up and free blkdev */
2638 rbd_free_disk(rbd_dev);
2639 unregister_blkdev(rbd_dev->major, rbd_dev->name);
Alex Elder32eec682012-02-08 16:11:14 -06002640
2641 /* done with the id, and with the rbd_dev */
Alex Elder820a5f32012-07-09 21:04:24 -05002642 kfree(rbd_dev->snap_name);
Alex Elder0bed54d2012-07-03 16:01:18 -05002643 kfree(rbd_dev->header_name);
Alex Elderd22f76e2012-07-12 10:46:35 -05002644 kfree(rbd_dev->pool_name);
Alex Elder0bed54d2012-07-03 16:01:18 -05002645 kfree(rbd_dev->image_name);
Alex Elder32eec682012-02-08 16:11:14 -06002646 rbd_id_put(rbd_dev);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002647 kfree(rbd_dev);
2648
2649 /* release module ref */
2650 module_put(THIS_MODULE);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002651}
2652
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002653static ssize_t rbd_remove(struct bus_type *bus,
2654 const char *buf,
2655 size_t count)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002656{
2657 struct rbd_device *rbd_dev = NULL;
2658 int target_id, rc;
2659 unsigned long ul;
2660 int ret = count;
2661
2662 rc = strict_strtoul(buf, 10, &ul);
2663 if (rc)
2664 return rc;
2665
2666 /* convert to int; abort if we lost anything in the conversion */
2667 target_id = (int) ul;
2668 if (target_id != ul)
2669 return -EINVAL;
2670
2671 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
2672
2673 rbd_dev = __rbd_get_dev(target_id);
2674 if (!rbd_dev) {
2675 ret = -ENOENT;
2676 goto done;
2677 }
2678
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002679 __rbd_remove_all_snaps(rbd_dev);
2680 rbd_bus_del_dev(rbd_dev);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002681
2682done:
2683 mutex_unlock(&ctl_mutex);
2684 return ret;
2685}
2686
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002687static ssize_t rbd_snap_add(struct device *dev,
2688 struct device_attribute *attr,
2689 const char *buf,
2690 size_t count)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002691{
Alex Elder593a9e72012-02-07 12:03:37 -06002692 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002693 int ret;
2694 char *name = kmalloc(count + 1, GFP_KERNEL);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002695 if (!name)
2696 return -ENOMEM;
2697
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002698 snprintf(name, count, "%s", buf);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002699
2700 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
2701
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002702 ret = rbd_header_add_snap(rbd_dev,
2703 name, GFP_KERNEL);
2704 if (ret < 0)
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07002705 goto err_unlock;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002706
Alex Elderb8136232012-07-25 09:32:41 -05002707 ret = __rbd_refresh_header(rbd_dev, NULL);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002708 if (ret < 0)
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07002709 goto err_unlock;
2710
2711 /* shouldn't hold ctl_mutex when notifying.. notify might
2712 trigger a watch callback that would need to get that mutex */
2713 mutex_unlock(&ctl_mutex);
2714
2715 /* make a best effort, don't error if failed */
Alex Elder4cb16252012-07-25 09:32:40 -05002716 rbd_req_sync_notify(rbd_dev);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002717
2718 ret = count;
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07002719 kfree(name);
2720 return ret;
2721
2722err_unlock:
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002723 mutex_unlock(&ctl_mutex);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002724 kfree(name);
2725 return ret;
2726}
2727
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002728/*
2729 * create control files in sysfs
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002730 * /sys/bus/rbd/...
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002731 */
2732static int rbd_sysfs_init(void)
2733{
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002734 int ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002735
Alex Elderfed4c142012-02-07 12:03:36 -06002736 ret = device_register(&rbd_root_dev);
Alex Elder21079782012-01-24 10:08:36 -06002737 if (ret < 0)
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002738 return ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002739
Alex Elderfed4c142012-02-07 12:03:36 -06002740 ret = bus_register(&rbd_bus_type);
2741 if (ret < 0)
2742 device_unregister(&rbd_root_dev);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002743
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002744 return ret;
2745}
2746
/* Remove the sysfs control files in reverse order of rbd_sysfs_init():
 * the bus type first, then the root device. */
static void rbd_sysfs_cleanup(void)
{
	bus_unregister(&rbd_bus_type);
	device_unregister(&rbd_root_dev);
}
2752
2753int __init rbd_init(void)
2754{
2755 int rc;
2756
2757 rc = rbd_sysfs_init();
2758 if (rc)
2759 return rc;
Alex Elderf0f8cef2012-01-29 13:57:44 -06002760 pr_info("loaded " RBD_DRV_NAME_LONG "\n");
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002761 return 0;
2762}
2763
/* Module exit point: tear down the sysfs interface created in
 * rbd_init(). */
void __exit rbd_exit(void)
{
	rbd_sysfs_cleanup();
}
2768
2769module_init(rbd_init);
2770module_exit(rbd_exit);
2771
2772MODULE_AUTHOR("Sage Weil <sage@newdream.net>");
2773MODULE_AUTHOR("Yehuda Sadeh <yehuda@hq.newdream.net>");
2774MODULE_DESCRIPTION("rados block device");
2775
2776/* following authorship retained from original osdblk.c */
2777MODULE_AUTHOR("Jeff Garzik <jeff@garzik.org>");
2778
2779MODULE_LICENSE("GPL");