blob: 14d0a3c9f96a30c2916e49db57fec056b03a9359 [file] [log] [blame]
/*
   rbd.c -- Export ceph rados objects as a Linux block device


   based on drivers/block/osdblk.c:

   Copyright 2009 Red Hat, Inc.

   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; see the file COPYING.  If not, write to
   the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.



   For usage instructions, please refer to:

		 Documentation/ABI/testing/sysfs-bus-rbd

 */
29
30#include <linux/ceph/libceph.h>
31#include <linux/ceph/osd_client.h>
32#include <linux/ceph/mon_client.h>
33#include <linux/ceph/decode.h>
Yehuda Sadeh59c2be12011-03-21 15:10:11 -070034#include <linux/parser.h>
Yehuda Sadeh602adf42010-08-12 16:11:25 -070035
36#include <linux/kernel.h>
37#include <linux/device.h>
38#include <linux/module.h>
39#include <linux/fs.h>
40#include <linux/blkdev.h>
41
42#include "rbd_types.h"
43
#define RBD_DRV_NAME "rbd"
#define RBD_DRV_NAME_LONG "rbd (rados block device)"

#define RBD_MINORS_PER_MAJOR	256		/* max minors per blkdev */

/* header object name is the image name plus the RBD_SUFFIX */
#define RBD_MAX_MD_NAME_LEN	(RBD_MAX_OBJ_NAME_LEN + sizeof(RBD_SUFFIX))
#define RBD_MAX_POOL_NAME_LEN	64
#define RBD_MAX_SNAP_NAME_LEN	32
#define RBD_MAX_OPT_LEN		1024

#define RBD_SNAP_HEAD_NAME	"-"

/*
 * An RBD device name will be "rbd#", where the "rbd" comes from
 * RBD_DRV_NAME above, and # is a unique integer identifier.
 * MAX_INT_FORMAT_WIDTH is used in ensuring DEV_NAME_LEN is big
 * enough to hold all possible device names.
 */
#define DEV_NAME_LEN		32
#define MAX_INT_FORMAT_WIDTH	((5 * sizeof (int)) / 2 + 1)

#define RBD_NOTIFY_TIMEOUT_DEFAULT 10
66
Yehuda Sadeh602adf42010-08-12 16:11:25 -070067/*
68 * block device image metadata (in-memory version)
69 */
70struct rbd_image_header {
71 u64 image_size;
72 char block_name[32];
73 __u8 obj_order;
74 __u8 crypt_type;
75 __u8 comp_type;
76 struct rw_semaphore snap_rwsem;
77 struct ceph_snap_context *snapc;
78 size_t snap_names_len;
79 u64 snap_seq;
80 u32 total_snaps;
81
82 char *snap_names;
83 u64 *snap_sizes;
Yehuda Sadeh59c2be12011-03-21 15:10:11 -070084
85 u64 obj_version;
86};
87
/* mount options, filled in by parse_rbd_opts_token() */
struct rbd_options {
	int	notify_timeout;
};
91
92/*
Alex Elderf0f8cef2012-01-29 13:57:44 -060093 * an instance of the client. multiple devices may share an rbd client.
Yehuda Sadeh602adf42010-08-12 16:11:25 -070094 */
95struct rbd_client {
96 struct ceph_client *client;
Yehuda Sadeh59c2be12011-03-21 15:10:11 -070097 struct rbd_options *rbd_opts;
Yehuda Sadeh602adf42010-08-12 16:11:25 -070098 struct kref kref;
99 struct list_head node;
100};
101
102/*
Alex Elderf0f8cef2012-01-29 13:57:44 -0600103 * a request completion status
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700104 */
Yehuda Sadeh1fec7092011-05-13 13:52:56 -0700105struct rbd_req_status {
106 int done;
107 int rc;
108 u64 bytes;
109};
110
111/*
112 * a collection of requests
113 */
114struct rbd_req_coll {
115 int total;
116 int num_done;
117 struct kref kref;
118 struct rbd_req_status status[0];
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700119};
120
Alex Elderf0f8cef2012-01-29 13:57:44 -0600121/*
122 * a single io request
123 */
124struct rbd_request {
125 struct request *rq; /* blk layer request */
126 struct bio *bio; /* cloned bio */
127 struct page **pages; /* list of used pages */
128 u64 len;
129 int coll_index;
130 struct rbd_req_coll *coll;
131};
132
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800133struct rbd_snap {
134 struct device dev;
135 const char *name;
136 size_t size;
137 struct list_head node;
138 u64 id;
139};
140
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700141/*
142 * a single device
143 */
144struct rbd_device {
145 int id; /* blkdev unique id */
146
147 int major; /* blkdev assigned major */
148 struct gendisk *disk; /* blkdev's gendisk and rq */
149 struct request_queue *q;
150
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700151 struct rbd_client *rbd_client;
152
153 char name[DEV_NAME_LEN]; /* blkdev name, e.g. rbd3 */
154
155 spinlock_t lock; /* queue lock */
156
157 struct rbd_image_header header;
158 char obj[RBD_MAX_OBJ_NAME_LEN]; /* rbd image name */
159 int obj_len;
160 char obj_md_name[RBD_MAX_MD_NAME_LEN]; /* hdr nm. */
161 char pool_name[RBD_MAX_POOL_NAME_LEN];
162 int poolid;
163
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700164 struct ceph_osd_event *watch_event;
165 struct ceph_osd_request *watch_request;
166
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700167 char snap_name[RBD_MAX_SNAP_NAME_LEN];
168 u32 cur_snap; /* index+1 of current snapshot within snap context
169 0 - for the head */
170 int read_only;
171
172 struct list_head node;
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800173
174 /* list of snapshots */
175 struct list_head snaps;
176
177 /* sysfs related */
178 struct device dev;
179};
180
static DEFINE_MUTEX(ctl_mutex);	  /* Serialize open/close/setup/teardown */

static LIST_HEAD(rbd_dev_list);    /* devices */
static DEFINE_SPINLOCK(rbd_dev_list_lock);

static LIST_HEAD(rbd_client_list);		/* clients */
static DEFINE_SPINLOCK(rbd_client_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700188
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800189static int __rbd_init_snaps_header(struct rbd_device *rbd_dev);
190static void rbd_dev_release(struct device *dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800191static ssize_t rbd_snap_add(struct device *dev,
192 struct device_attribute *attr,
193 const char *buf,
194 size_t count);
195static void __rbd_remove_snap_dev(struct rbd_device *rbd_dev,
Justin P. Mattock69932482011-07-26 23:06:29 -0700196 struct rbd_snap *snap);
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800197
Alex Elderf0f8cef2012-01-29 13:57:44 -0600198static ssize_t rbd_add(struct bus_type *bus, const char *buf,
199 size_t count);
200static ssize_t rbd_remove(struct bus_type *bus, const char *buf,
201 size_t count);
202
203static struct bus_attribute rbd_bus_attrs[] = {
204 __ATTR(add, S_IWUSR, NULL, rbd_add),
205 __ATTR(remove, S_IWUSR, NULL, rbd_remove),
206 __ATTR_NULL
207};
208
209static struct bus_type rbd_bus_type = {
210 .name = "rbd",
211 .bus_attrs = rbd_bus_attrs,
212};
213
214static void rbd_root_dev_release(struct device *dev)
215{
216}
217
218static struct device rbd_root_dev = {
219 .init_name = "rbd",
220 .release = rbd_root_dev_release,
221};
222
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800223
224static struct rbd_device *dev_to_rbd(struct device *dev)
225{
226 return container_of(dev, struct rbd_device, dev);
227}
228
229static struct device *rbd_get_dev(struct rbd_device *rbd_dev)
230{
231 return get_device(&rbd_dev->dev);
232}
233
234static void rbd_put_dev(struct rbd_device *rbd_dev)
235{
236 put_device(&rbd_dev->dev);
237}
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700238
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700239static int __rbd_update_snaps(struct rbd_device *rbd_dev);
240
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700241static int rbd_open(struct block_device *bdev, fmode_t mode)
242{
Alex Elderf0f8cef2012-01-29 13:57:44 -0600243 struct rbd_device *rbd_dev = bdev->bd_disk->private_data;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700244
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800245 rbd_get_dev(rbd_dev);
246
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700247 set_device_ro(bdev, rbd_dev->read_only);
248
249 if ((mode & FMODE_WRITE) && rbd_dev->read_only)
250 return -EROFS;
251
252 return 0;
253}
254
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800255static int rbd_release(struct gendisk *disk, fmode_t mode)
256{
257 struct rbd_device *rbd_dev = disk->private_data;
258
259 rbd_put_dev(rbd_dev);
260
261 return 0;
262}
263
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700264static const struct block_device_operations rbd_bd_ops = {
265 .owner = THIS_MODULE,
266 .open = rbd_open,
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800267 .release = rbd_release,
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700268};
269
270/*
271 * Initialize an rbd client instance.
272 * We own *opt.
273 */
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700274static struct rbd_client *rbd_client_create(struct ceph_options *opt,
275 struct rbd_options *rbd_opts)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700276{
277 struct rbd_client *rbdc;
278 int ret = -ENOMEM;
279
280 dout("rbd_client_create\n");
281 rbdc = kmalloc(sizeof(struct rbd_client), GFP_KERNEL);
282 if (!rbdc)
283 goto out_opt;
284
285 kref_init(&rbdc->kref);
286 INIT_LIST_HEAD(&rbdc->node);
287
Alex Elderbc534d82012-01-29 13:57:44 -0600288 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
289
Sage Weil6ab00d42011-08-09 09:41:59 -0700290 rbdc->client = ceph_create_client(opt, rbdc, 0, 0);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700291 if (IS_ERR(rbdc->client))
Alex Elderbc534d82012-01-29 13:57:44 -0600292 goto out_mutex;
Vasiliy Kulikov28f259b2010-09-26 12:59:37 +0400293 opt = NULL; /* Now rbdc->client is responsible for opt */
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700294
295 ret = ceph_open_session(rbdc->client);
296 if (ret < 0)
297 goto out_err;
298
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700299 rbdc->rbd_opts = rbd_opts;
300
Alex Elder432b8582012-01-29 13:57:44 -0600301 spin_lock(&rbd_client_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700302 list_add_tail(&rbdc->node, &rbd_client_list);
Alex Elder432b8582012-01-29 13:57:44 -0600303 spin_unlock(&rbd_client_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700304
Alex Elderbc534d82012-01-29 13:57:44 -0600305 mutex_unlock(&ctl_mutex);
306
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700307 dout("rbd_client_create created %p\n", rbdc);
308 return rbdc;
309
310out_err:
311 ceph_destroy_client(rbdc->client);
Alex Elderbc534d82012-01-29 13:57:44 -0600312out_mutex:
313 mutex_unlock(&ctl_mutex);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700314 kfree(rbdc);
315out_opt:
Vasiliy Kulikov28f259b2010-09-26 12:59:37 +0400316 if (opt)
317 ceph_destroy_options(opt);
318 return ERR_PTR(ret);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700319}
320
321/*
322 * Find a ceph client with specific addr and configuration.
323 */
324static struct rbd_client *__rbd_client_find(struct ceph_options *opt)
325{
326 struct rbd_client *client_node;
327
328 if (opt->flags & CEPH_OPT_NOSHARE)
329 return NULL;
330
331 list_for_each_entry(client_node, &rbd_client_list, node)
332 if (ceph_compare_options(opt, client_node->client) == 0)
333 return client_node;
334 return NULL;
335}
336
337/*
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700338 * mount options
339 */
340enum {
341 Opt_notify_timeout,
342 Opt_last_int,
343 /* int args above */
344 Opt_last_string,
345 /* string args above */
346};
347
348static match_table_t rbdopt_tokens = {
349 {Opt_notify_timeout, "notify_timeout=%d"},
350 /* int args above */
351 /* string args above */
352 {-1, NULL}
353};
354
355static int parse_rbd_opts_token(char *c, void *private)
356{
357 struct rbd_options *rbdopt = private;
358 substring_t argstr[MAX_OPT_ARGS];
359 int token, intval, ret;
360
Alex Elder21079782012-01-24 10:08:36 -0600361 token = match_token(c, rbdopt_tokens, argstr);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700362 if (token < 0)
363 return -EINVAL;
364
365 if (token < Opt_last_int) {
366 ret = match_int(&argstr[0], &intval);
367 if (ret < 0) {
368 pr_err("bad mount option arg (not int) "
369 "at '%s'\n", c);
370 return ret;
371 }
372 dout("got int token %d val %d\n", token, intval);
373 } else if (token > Opt_last_int && token < Opt_last_string) {
374 dout("got string token %d val %s\n", token,
375 argstr[0].from);
376 } else {
377 dout("got token %d\n", token);
378 }
379
380 switch (token) {
381 case Opt_notify_timeout:
382 rbdopt->notify_timeout = intval;
383 break;
384 default:
385 BUG_ON(token);
386 }
387 return 0;
388}
389
390/*
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700391 * Get a ceph client with specific addr and configuration, if one does
392 * not exist create it.
393 */
Alex Elderd720bcb2012-02-02 08:13:30 -0600394static struct rbd_client *rbd_get_client(const char *mon_addr, char *options)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700395{
396 struct rbd_client *rbdc;
397 struct ceph_options *opt;
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700398 struct rbd_options *rbd_opts;
399
400 rbd_opts = kzalloc(sizeof(*rbd_opts), GFP_KERNEL);
401 if (!rbd_opts)
Alex Elderd720bcb2012-02-02 08:13:30 -0600402 return ERR_PTR(-ENOMEM);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700403
404 rbd_opts->notify_timeout = RBD_NOTIFY_TIMEOUT_DEFAULT;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700405
Alex Elderee577412012-01-24 10:08:36 -0600406 opt = ceph_parse_options(options, mon_addr,
Alex Elder21079782012-01-24 10:08:36 -0600407 mon_addr + strlen(mon_addr),
408 parse_rbd_opts_token, rbd_opts);
Alex Elderee577412012-01-24 10:08:36 -0600409 if (IS_ERR(opt)) {
Alex Elderd720bcb2012-02-02 08:13:30 -0600410 kfree(rbd_opts);
411 return ERR_CAST(opt);
Alex Elderee577412012-01-24 10:08:36 -0600412 }
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700413
Alex Elder432b8582012-01-29 13:57:44 -0600414 spin_lock(&rbd_client_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700415 rbdc = __rbd_client_find(opt);
416 if (rbdc) {
Alex Eldere6994d3d2012-01-29 13:57:44 -0600417 /* using an existing client */
418 kref_get(&rbdc->kref);
Alex Elder432b8582012-01-29 13:57:44 -0600419 spin_unlock(&rbd_client_list_lock);
Alex Eldere6994d3d2012-01-29 13:57:44 -0600420
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700421 ceph_destroy_options(opt);
Alex Elder97bb59a2012-01-24 10:08:36 -0600422 kfree(rbd_opts);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700423
Alex Elderd720bcb2012-02-02 08:13:30 -0600424 return rbdc;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700425 }
Alex Elder432b8582012-01-29 13:57:44 -0600426 spin_unlock(&rbd_client_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700427
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700428 rbdc = rbd_client_create(opt, rbd_opts);
Alex Elderd97081b2012-01-29 13:57:44 -0600429
Alex Elderd720bcb2012-02-02 08:13:30 -0600430 if (IS_ERR(rbdc))
431 kfree(rbd_opts);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700432
Alex Elderd720bcb2012-02-02 08:13:30 -0600433 return rbdc;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700434}
435
436/*
437 * Destroy ceph client
Alex Elderd23a4b32012-01-29 13:57:43 -0600438 *
Alex Elder432b8582012-01-29 13:57:44 -0600439 * Caller must hold rbd_client_list_lock.
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700440 */
441static void rbd_client_release(struct kref *kref)
442{
443 struct rbd_client *rbdc = container_of(kref, struct rbd_client, kref);
444
445 dout("rbd_release_client %p\n", rbdc);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700446 list_del(&rbdc->node);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700447
448 ceph_destroy_client(rbdc->client);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700449 kfree(rbdc->rbd_opts);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700450 kfree(rbdc);
451}
452
453/*
454 * Drop reference to ceph client node. If it's not referenced anymore, release
455 * it.
456 */
457static void rbd_put_client(struct rbd_device *rbd_dev)
458{
Alex Elder432b8582012-01-29 13:57:44 -0600459 spin_lock(&rbd_client_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700460 kref_put(&rbd_dev->rbd_client->kref, rbd_client_release);
Alex Elder432b8582012-01-29 13:57:44 -0600461 spin_unlock(&rbd_client_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700462 rbd_dev->rbd_client = NULL;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700463}
464
Yehuda Sadeh1fec7092011-05-13 13:52:56 -0700465/*
466 * Destroy requests collection
467 */
468static void rbd_coll_release(struct kref *kref)
469{
470 struct rbd_req_coll *coll =
471 container_of(kref, struct rbd_req_coll, kref);
472
473 dout("rbd_coll_release %p\n", coll);
474 kfree(coll);
475}
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700476
477/*
478 * Create a new header structure, translate header format from the on-disk
479 * header.
480 */
481static int rbd_header_from_disk(struct rbd_image_header *header,
482 struct rbd_image_header_ondisk *ondisk,
483 int allocated_snaps,
484 gfp_t gfp_flags)
485{
486 int i;
487 u32 snap_count = le32_to_cpu(ondisk->snap_count);
488 int ret = -ENOMEM;
489
Alex Elder21079782012-01-24 10:08:36 -0600490 if (memcmp(ondisk, RBD_HEADER_TEXT, sizeof(RBD_HEADER_TEXT)))
Josh Durgin81e759f2011-11-15 14:49:53 -0800491 return -ENXIO;
Josh Durgin81e759f2011-11-15 14:49:53 -0800492
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700493 init_rwsem(&header->snap_rwsem);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700494 header->snap_names_len = le64_to_cpu(ondisk->snap_names_len);
495 header->snapc = kmalloc(sizeof(struct ceph_snap_context) +
Alex Elder21079782012-01-24 10:08:36 -0600496 snap_count * sizeof (*ondisk),
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700497 gfp_flags);
498 if (!header->snapc)
499 return -ENOMEM;
500 if (snap_count) {
501 header->snap_names = kmalloc(header->snap_names_len,
502 GFP_KERNEL);
503 if (!header->snap_names)
504 goto err_snapc;
505 header->snap_sizes = kmalloc(snap_count * sizeof(u64),
506 GFP_KERNEL);
507 if (!header->snap_sizes)
508 goto err_names;
509 } else {
510 header->snap_names = NULL;
511 header->snap_sizes = NULL;
512 }
513 memcpy(header->block_name, ondisk->block_name,
514 sizeof(ondisk->block_name));
515
516 header->image_size = le64_to_cpu(ondisk->image_size);
517 header->obj_order = ondisk->options.order;
518 header->crypt_type = ondisk->options.crypt_type;
519 header->comp_type = ondisk->options.comp_type;
520
521 atomic_set(&header->snapc->nref, 1);
522 header->snap_seq = le64_to_cpu(ondisk->snap_seq);
523 header->snapc->num_snaps = snap_count;
524 header->total_snaps = snap_count;
525
Alex Elder21079782012-01-24 10:08:36 -0600526 if (snap_count && allocated_snaps == snap_count) {
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700527 for (i = 0; i < snap_count; i++) {
528 header->snapc->snaps[i] =
529 le64_to_cpu(ondisk->snaps[i].id);
530 header->snap_sizes[i] =
531 le64_to_cpu(ondisk->snaps[i].image_size);
532 }
533
534 /* copy snapshot names */
535 memcpy(header->snap_names, &ondisk->snaps[i],
536 header->snap_names_len);
537 }
538
539 return 0;
540
541err_names:
542 kfree(header->snap_names);
543err_snapc:
544 kfree(header->snapc);
545 return ret;
546}
547
548static int snap_index(struct rbd_image_header *header, int snap_num)
549{
550 return header->total_snaps - snap_num;
551}
552
553static u64 cur_snap_id(struct rbd_device *rbd_dev)
554{
555 struct rbd_image_header *header = &rbd_dev->header;
556
557 if (!rbd_dev->cur_snap)
558 return 0;
559
560 return header->snapc->snaps[snap_index(header, rbd_dev->cur_snap)];
561}
562
563static int snap_by_name(struct rbd_image_header *header, const char *snap_name,
564 u64 *seq, u64 *size)
565{
566 int i;
567 char *p = header->snap_names;
568
569 for (i = 0; i < header->total_snaps; i++, p += strlen(p) + 1) {
570 if (strcmp(snap_name, p) == 0)
571 break;
572 }
573 if (i == header->total_snaps)
574 return -ENOENT;
575 if (seq)
576 *seq = header->snapc->snaps[i];
577
578 if (size)
579 *size = header->snap_sizes[i];
580
581 return i;
582}
583
Josh Durgincc9d7342011-11-21 18:19:13 -0800584static int rbd_header_set_snap(struct rbd_device *dev, u64 *size)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700585{
586 struct rbd_image_header *header = &dev->header;
587 struct ceph_snap_context *snapc = header->snapc;
588 int ret = -ENOENT;
589
Josh Durgincc9d7342011-11-21 18:19:13 -0800590 BUILD_BUG_ON(sizeof (dev->snap_name) < sizeof (RBD_SNAP_HEAD_NAME));
591
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700592 down_write(&header->snap_rwsem);
593
Josh Durgincc9d7342011-11-21 18:19:13 -0800594 if (!memcmp(dev->snap_name, RBD_SNAP_HEAD_NAME,
595 sizeof (RBD_SNAP_HEAD_NAME))) {
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700596 if (header->total_snaps)
597 snapc->seq = header->snap_seq;
598 else
599 snapc->seq = 0;
600 dev->cur_snap = 0;
601 dev->read_only = 0;
602 if (size)
603 *size = header->image_size;
604 } else {
Josh Durgincc9d7342011-11-21 18:19:13 -0800605 ret = snap_by_name(header, dev->snap_name, &snapc->seq, size);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700606 if (ret < 0)
607 goto done;
608
609 dev->cur_snap = header->total_snaps - ret;
610 dev->read_only = 1;
611 }
612
613 ret = 0;
614done:
615 up_write(&header->snap_rwsem);
616 return ret;
617}
618
619static void rbd_header_free(struct rbd_image_header *header)
620{
621 kfree(header->snapc);
622 kfree(header->snap_names);
623 kfree(header->snap_sizes);
624}
625
626/*
627 * get the actual striped segment name, offset and length
628 */
629static u64 rbd_get_segment(struct rbd_image_header *header,
630 const char *block_name,
631 u64 ofs, u64 len,
632 char *seg_name, u64 *segofs)
633{
634 u64 seg = ofs >> header->obj_order;
635
636 if (seg_name)
637 snprintf(seg_name, RBD_MAX_SEG_NAME_LEN,
638 "%s.%012llx", block_name, seg);
639
640 ofs = ofs & ((1 << header->obj_order) - 1);
641 len = min_t(u64, len, (1 << header->obj_order) - ofs);
642
643 if (segofs)
644 *segofs = ofs;
645
646 return len;
647}
648
Yehuda Sadeh1fec7092011-05-13 13:52:56 -0700649static int rbd_get_num_segments(struct rbd_image_header *header,
650 u64 ofs, u64 len)
651{
652 u64 start_seg = ofs >> header->obj_order;
653 u64 end_seg = (ofs + len - 1) >> header->obj_order;
654 return end_seg - start_seg + 1;
655}
656
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700657/*
Josh Durgin029bcbd2011-07-22 11:35:23 -0700658 * returns the size of an object in the image
659 */
660static u64 rbd_obj_bytes(struct rbd_image_header *header)
661{
662 return 1 << header->obj_order;
663}
664
665/*
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700666 * bio helpers
667 */
668
669static void bio_chain_put(struct bio *chain)
670{
671 struct bio *tmp;
672
673 while (chain) {
674 tmp = chain;
675 chain = chain->bi_next;
676 bio_put(tmp);
677 }
678}
679
680/*
681 * zeros a bio chain, starting at specific offset
682 */
683static void zero_bio_chain(struct bio *chain, int start_ofs)
684{
685 struct bio_vec *bv;
686 unsigned long flags;
687 void *buf;
688 int i;
689 int pos = 0;
690
691 while (chain) {
692 bio_for_each_segment(bv, chain, i) {
693 if (pos + bv->bv_len > start_ofs) {
694 int remainder = max(start_ofs - pos, 0);
695 buf = bvec_kmap_irq(bv, &flags);
696 memset(buf + remainder, 0,
697 bv->bv_len - remainder);
Dan Carpenter85b5aaa2010-10-11 21:15:11 +0200698 bvec_kunmap_irq(buf, &flags);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700699 }
700 pos += bv->bv_len;
701 }
702
703 chain = chain->bi_next;
704 }
705}
706
707/*
708 * bio_chain_clone - clone a chain of bios up to a certain length.
709 * might return a bio_pair that will need to be released.
710 */
711static struct bio *bio_chain_clone(struct bio **old, struct bio **next,
712 struct bio_pair **bp,
713 int len, gfp_t gfpmask)
714{
715 struct bio *tmp, *old_chain = *old, *new_chain = NULL, *tail = NULL;
716 int total = 0;
717
718 if (*bp) {
719 bio_pair_release(*bp);
720 *bp = NULL;
721 }
722
723 while (old_chain && (total < len)) {
724 tmp = bio_kmalloc(gfpmask, old_chain->bi_max_vecs);
725 if (!tmp)
726 goto err_out;
727
728 if (total + old_chain->bi_size > len) {
729 struct bio_pair *bp;
730
731 /*
732 * this split can only happen with a single paged bio,
733 * split_bio will BUG_ON if this is not the case
734 */
735 dout("bio_chain_clone split! total=%d remaining=%d"
736 "bi_size=%d\n",
737 (int)total, (int)len-total,
738 (int)old_chain->bi_size);
739
740 /* split the bio. We'll release it either in the next
741 call, or it will have to be released outside */
742 bp = bio_split(old_chain, (len - total) / 512ULL);
743 if (!bp)
744 goto err_out;
745
746 __bio_clone(tmp, &bp->bio1);
747
748 *next = &bp->bio2;
749 } else {
750 __bio_clone(tmp, old_chain);
751 *next = old_chain->bi_next;
752 }
753
754 tmp->bi_bdev = NULL;
755 gfpmask &= ~__GFP_WAIT;
756 tmp->bi_next = NULL;
757
758 if (!new_chain) {
759 new_chain = tail = tmp;
760 } else {
761 tail->bi_next = tmp;
762 tail = tmp;
763 }
764 old_chain = old_chain->bi_next;
765
766 total += tmp->bi_size;
767 }
768
769 BUG_ON(total < len);
770
771 if (tail)
772 tail->bi_next = NULL;
773
774 *old = old_chain;
775
776 return new_chain;
777
778err_out:
779 dout("bio_chain_clone with err\n");
780 bio_chain_put(new_chain);
781 return NULL;
782}
783
784/*
785 * helpers for osd request op vectors.
786 */
787static int rbd_create_rw_ops(struct ceph_osd_req_op **ops,
788 int num_ops,
789 int opcode,
790 u32 payload_len)
791{
792 *ops = kzalloc(sizeof(struct ceph_osd_req_op) * (num_ops + 1),
793 GFP_NOIO);
794 if (!*ops)
795 return -ENOMEM;
796 (*ops)[0].op = opcode;
797 /*
798 * op extent offset and length will be set later on
799 * in calc_raw_layout()
800 */
801 (*ops)[0].payload_len = payload_len;
802 return 0;
803}
804
/* free an op vector allocated by rbd_create_rw_ops() */
static void rbd_destroy_ops(struct ceph_osd_req_op *ops)
{
	kfree(ops);
}
809
Yehuda Sadeh1fec7092011-05-13 13:52:56 -0700810static void rbd_coll_end_req_index(struct request *rq,
811 struct rbd_req_coll *coll,
812 int index,
813 int ret, u64 len)
814{
815 struct request_queue *q;
816 int min, max, i;
817
818 dout("rbd_coll_end_req_index %p index %d ret %d len %lld\n",
819 coll, index, ret, len);
820
821 if (!rq)
822 return;
823
824 if (!coll) {
825 blk_end_request(rq, ret, len);
826 return;
827 }
828
829 q = rq->q;
830
831 spin_lock_irq(q->queue_lock);
832 coll->status[index].done = 1;
833 coll->status[index].rc = ret;
834 coll->status[index].bytes = len;
835 max = min = coll->num_done;
836 while (max < coll->total && coll->status[max].done)
837 max++;
838
839 for (i = min; i<max; i++) {
840 __blk_end_request(rq, coll->status[i].rc,
841 coll->status[i].bytes);
842 coll->num_done++;
843 kref_put(&coll->kref, rbd_coll_release);
844 }
845 spin_unlock_irq(q->queue_lock);
846}
847
848static void rbd_coll_end_req(struct rbd_request *req,
849 int ret, u64 len)
850{
851 rbd_coll_end_req_index(req->rq, req->coll, req->coll_index, ret, len);
852}
853
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700854/*
855 * Send ceph osd request
856 */
857static int rbd_do_request(struct request *rq,
858 struct rbd_device *dev,
859 struct ceph_snap_context *snapc,
860 u64 snapid,
861 const char *obj, u64 ofs, u64 len,
862 struct bio *bio,
863 struct page **pages,
864 int num_pages,
865 int flags,
866 struct ceph_osd_req_op *ops,
867 int num_reply,
Yehuda Sadeh1fec7092011-05-13 13:52:56 -0700868 struct rbd_req_coll *coll,
869 int coll_index,
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700870 void (*rbd_cb)(struct ceph_osd_request *req,
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700871 struct ceph_msg *msg),
872 struct ceph_osd_request **linger_req,
873 u64 *ver)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700874{
875 struct ceph_osd_request *req;
876 struct ceph_file_layout *layout;
877 int ret;
878 u64 bno;
879 struct timespec mtime = CURRENT_TIME;
880 struct rbd_request *req_data;
881 struct ceph_osd_request_head *reqhead;
882 struct rbd_image_header *header = &dev->header;
Alex Elder1dbb4392012-01-24 10:08:37 -0600883 struct ceph_osd_client *osdc;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700884
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700885 req_data = kzalloc(sizeof(*req_data), GFP_NOIO);
Yehuda Sadeh1fec7092011-05-13 13:52:56 -0700886 if (!req_data) {
887 if (coll)
888 rbd_coll_end_req_index(rq, coll, coll_index,
889 -ENOMEM, len);
890 return -ENOMEM;
891 }
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700892
Yehuda Sadeh1fec7092011-05-13 13:52:56 -0700893 if (coll) {
894 req_data->coll = coll;
895 req_data->coll_index = coll_index;
896 }
897
898 dout("rbd_do_request obj=%s ofs=%lld len=%lld\n", obj, len, ofs);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700899
900 down_read(&header->snap_rwsem);
901
Alex Elder1dbb4392012-01-24 10:08:37 -0600902 osdc = &dev->rbd_client->client->osdc;
903 req = ceph_osdc_alloc_request(osdc, flags, snapc, ops,
904 false, GFP_NOIO, pages, bio);
Sage Weil4ad12622011-05-03 09:23:36 -0700905 if (!req) {
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700906 up_read(&header->snap_rwsem);
Sage Weil4ad12622011-05-03 09:23:36 -0700907 ret = -ENOMEM;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700908 goto done_pages;
909 }
910
911 req->r_callback = rbd_cb;
912
913 req_data->rq = rq;
914 req_data->bio = bio;
915 req_data->pages = pages;
916 req_data->len = len;
917
918 req->r_priv = req_data;
919
920 reqhead = req->r_request->front.iov_base;
921 reqhead->snapid = cpu_to_le64(CEPH_NOSNAP);
922
923 strncpy(req->r_oid, obj, sizeof(req->r_oid));
924 req->r_oid_len = strlen(req->r_oid);
925
926 layout = &req->r_file_layout;
927 memset(layout, 0, sizeof(*layout));
928 layout->fl_stripe_unit = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
929 layout->fl_stripe_count = cpu_to_le32(1);
930 layout->fl_object_size = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
931 layout->fl_pg_preferred = cpu_to_le32(-1);
932 layout->fl_pg_pool = cpu_to_le32(dev->poolid);
Alex Elder1dbb4392012-01-24 10:08:37 -0600933 ceph_calc_raw_layout(osdc, layout, snapid, ofs, &len, &bno,
934 req, ops);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700935
936 ceph_osdc_build_request(req, ofs, &len,
937 ops,
938 snapc,
939 &mtime,
940 req->r_oid, req->r_oid_len);
941 up_read(&header->snap_rwsem);
942
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700943 if (linger_req) {
Alex Elder1dbb4392012-01-24 10:08:37 -0600944 ceph_osdc_set_request_linger(osdc, req);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700945 *linger_req = req;
946 }
947
Alex Elder1dbb4392012-01-24 10:08:37 -0600948 ret = ceph_osdc_start_request(osdc, req, false);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700949 if (ret < 0)
950 goto done_err;
951
952 if (!rbd_cb) {
Alex Elder1dbb4392012-01-24 10:08:37 -0600953 ret = ceph_osdc_wait_request(osdc, req);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700954 if (ver)
955 *ver = le64_to_cpu(req->r_reassert_version.version);
Yehuda Sadeh1fec7092011-05-13 13:52:56 -0700956 dout("reassert_ver=%lld\n",
957 le64_to_cpu(req->r_reassert_version.version));
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700958 ceph_osdc_put_request(req);
959 }
960 return ret;
961
962done_err:
963 bio_chain_put(req_data->bio);
964 ceph_osdc_put_request(req);
965done_pages:
Yehuda Sadeh1fec7092011-05-13 13:52:56 -0700966 rbd_coll_end_req(req_data, ret, len);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700967 kfree(req_data);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700968 return ret;
969}
970
/*
 * Ceph osd op callback
 *
 * Runs when the OSD replies to an async request issued by rbd_do_op().
 * Decodes the reply, normalizes short/missing reads by zero-filling the
 * bio chain, completes the request's slot in its collection, and drops
 * all references held for the in-flight request.
 */
static void rbd_req_cb(struct ceph_osd_request *req, struct ceph_msg *msg)
{
	struct rbd_request *req_data = req->r_priv;
	struct ceph_osd_reply_head *replyhead;
	struct ceph_osd_op *op;
	__s32 rc;
	u64 bytes;
	int read_op;

	/* parse reply */
	replyhead = msg->front.iov_base;
	WARN_ON(le32_to_cpu(replyhead->num_ops) == 0);
	/* ops array immediately follows the reply head in the message */
	op = (void *)(replyhead + 1);
	rc = le32_to_cpu(replyhead->result);
	bytes = le64_to_cpu(op->extent.length);
	read_op = (le32_to_cpu(op->op) == CEPH_OSD_OP_READ);

	dout("rbd_req_cb bytes=%lld readop=%d rc=%d\n", bytes, read_op, rc);

	if (rc == -ENOENT && read_op) {
		/* reading a hole (object does not exist): treat as zeros */
		zero_bio_chain(req_data->bio, 0);
		rc = 0;
	} else if (rc == 0 && read_op && bytes < req_data->len) {
		/* short read: zero the tail and report the full length */
		zero_bio_chain(req_data->bio, bytes);
		bytes = req_data->len;
	}

	/* record completion status for this segment in the collection */
	rbd_coll_end_req(req_data, rc, bytes);

	if (req_data->bio)
		bio_chain_put(req_data->bio);

	ceph_osdc_put_request(req);
	kfree(req_data);
}
1009
/*
 * Minimal completion callback: just drop the request reference.
 * Used for fire-and-forget operations (e.g. notify ack) that carry no
 * per-request state to complete.
 */
static void rbd_simple_req_cb(struct ceph_osd_request *req, struct ceph_msg *msg)
{
	ceph_osdc_put_request(req);
}
1014
/*
 * Do a synchronous ceph osd operation
 *
 * Data is carried through a temporary page vector: for writes @buf is
 * copied into the pages before the request is sent; for reads the reply
 * is copied back out into @buf afterwards.
 *
 * If @orig_ops is NULL a single op of @opcode is built here (and
 * destroyed before returning); otherwise the caller-supplied ops are
 * used as-is and the caller retains ownership of them.
 *
 * Returns the byte count transferred on success, negative errno on error.
 */
static int rbd_req_sync_op(struct rbd_device *dev,
			   struct ceph_snap_context *snapc,
			   u64 snapid,
			   int opcode,
			   int flags,
			   struct ceph_osd_req_op *orig_ops,
			   int num_reply,
			   const char *obj,
			   u64 ofs, u64 len,
			   char *buf,
			   struct ceph_osd_request **linger_req,
			   u64 *ver)
{
	int ret;
	struct page **pages;
	int num_pages;
	struct ceph_osd_req_op *ops = orig_ops;
	u32 payload_len;

	num_pages = calc_pages_for(ofs , len);
	pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL);
	if (IS_ERR(pages))
		return PTR_ERR(pages);

	if (!orig_ops) {
		/* build a single default op; payload only meaningful for writes */
		payload_len = (flags & CEPH_OSD_FLAG_WRITE ? len : 0);
		ret = rbd_create_rw_ops(&ops, 1, opcode, payload_len);
		if (ret < 0)
			goto done;

		if ((flags & CEPH_OSD_FLAG_WRITE) && buf) {
			/* stage outgoing data into the page vector */
			ret = ceph_copy_to_page_vector(pages, buf, ofs, len);
			if (ret < 0)
				goto done_ops;
		}
	}

	/* synchronous: rbd_do_request() waits when no callback is given */
	ret = rbd_do_request(NULL, dev, snapc, snapid,
			  obj, ofs, len, NULL,
			  pages, num_pages,
			  flags,
			  ops,
			  2,
			  NULL, 0,
			  NULL,
			  linger_req, ver);
	if (ret < 0)
		goto done_ops;

	/* on a read, ret is the number of bytes returned by the OSD */
	if ((flags & CEPH_OSD_FLAG_READ) && buf)
		ret = ceph_copy_from_page_vector(pages, buf, ofs, ret);

done_ops:
	/* only destroy ops we built ourselves */
	if (!orig_ops)
		rbd_destroy_ops(ops);
done:
	ceph_release_page_vector(pages, num_pages);
	return ret;
}
1077
/*
 * Do an asynchronous ceph osd operation
 *
 * Maps the image-relative range [ofs, ofs+len) onto a single backing
 * object (segment), builds one op for it and fires the request with
 * rbd_req_cb() as completion. The caller guarantees (via bio cloning in
 * rbd_rq_fn) that the range never crosses a segment boundary.
 */
static int rbd_do_op(struct request *rq,
		     struct rbd_device *rbd_dev ,
		     struct ceph_snap_context *snapc,
		     u64 snapid,
		     int opcode, int flags, int num_reply,
		     u64 ofs, u64 len,
		     struct bio *bio,
		     struct rbd_req_coll *coll,
		     int coll_index)
{
	char *seg_name;
	u64 seg_ofs;
	u64 seg_len;
	int ret;
	struct ceph_osd_req_op *ops;
	u32 payload_len;

	/* GFP_NOIO: we are on the block I/O path, must not recurse into I/O */
	seg_name = kmalloc(RBD_MAX_SEG_NAME_LEN + 1, GFP_NOIO);
	if (!seg_name)
		return -ENOMEM;

	/* translate image offset into (object name, object offset, length) */
	seg_len = rbd_get_segment(&rbd_dev->header,
				  rbd_dev->header.block_name,
				  ofs, len,
				  seg_name, &seg_ofs);

	payload_len = (flags & CEPH_OSD_FLAG_WRITE ? seg_len : 0);

	ret = rbd_create_rw_ops(&ops, 1, opcode, payload_len);
	if (ret < 0)
		goto done;

	/* we've taken care of segment sizes earlier when we
	   cloned the bios. We should never have a segment
	   truncated at this point */
	BUG_ON(seg_len < len);

	/* async: rbd_req_cb() completes the coll slot and frees resources */
	ret = rbd_do_request(rq, rbd_dev, snapc, snapid,
			     seg_name, seg_ofs, seg_len,
			     bio,
			     NULL, 0,
			     flags,
			     ops,
			     num_reply,
			     coll, coll_index,
			     rbd_req_cb, 0, NULL);

	rbd_destroy_ops(ops);
done:
	kfree(seg_name);
	return ret;
}
1133
1134/*
1135 * Request async osd write
1136 */
1137static int rbd_req_write(struct request *rq,
1138 struct rbd_device *rbd_dev,
1139 struct ceph_snap_context *snapc,
1140 u64 ofs, u64 len,
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001141 struct bio *bio,
1142 struct rbd_req_coll *coll,
1143 int coll_index)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001144{
1145 return rbd_do_op(rq, rbd_dev, snapc, CEPH_NOSNAP,
1146 CEPH_OSD_OP_WRITE,
1147 CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
1148 2,
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001149 ofs, len, bio, coll, coll_index);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001150}
1151
1152/*
1153 * Request async osd read
1154 */
1155static int rbd_req_read(struct request *rq,
1156 struct rbd_device *rbd_dev,
1157 u64 snapid,
1158 u64 ofs, u64 len,
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001159 struct bio *bio,
1160 struct rbd_req_coll *coll,
1161 int coll_index)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001162{
1163 return rbd_do_op(rq, rbd_dev, NULL,
1164 (snapid ? snapid : CEPH_NOSNAP),
1165 CEPH_OSD_OP_READ,
1166 CEPH_OSD_FLAG_READ,
1167 2,
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001168 ofs, len, bio, coll, coll_index);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001169}
1170
1171/*
1172 * Request sync osd read
1173 */
1174static int rbd_req_sync_read(struct rbd_device *dev,
1175 struct ceph_snap_context *snapc,
1176 u64 snapid,
1177 const char *obj,
1178 u64 ofs, u64 len,
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001179 char *buf,
1180 u64 *ver)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001181{
1182 return rbd_req_sync_op(dev, NULL,
1183 (snapid ? snapid : CEPH_NOSNAP),
1184 CEPH_OSD_OP_READ,
1185 CEPH_OSD_FLAG_READ,
1186 NULL,
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001187 1, obj, ofs, len, buf, NULL, ver);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001188}
1189
/*
 * Acknowledge a watch notification back to the OSD.
 * (Fired asynchronously; rbd_simple_req_cb just drops the request.)
 * NOTE(review): the @ver parameter is currently unused — the ack carries
 * dev->header.obj_version instead; confirm whether that is intentional.
 */
static int rbd_req_sync_notify_ack(struct rbd_device *dev,
				   u64 ver,
				   u64 notify_id,
				   const char *obj)
{
	struct ceph_osd_req_op *ops;
	struct page **pages = NULL;
	int ret;

	ret = rbd_create_rw_ops(&ops, 1, CEPH_OSD_OP_NOTIFY_ACK, 0);
	if (ret < 0)
		return ret;

	ops[0].watch.ver = cpu_to_le64(dev->header.obj_version);
	/* cookie echoes the notify id so the OSD can match the ack */
	ops[0].watch.cookie = notify_id;
	ops[0].watch.flag = 0;

	ret = rbd_do_request(NULL, dev, NULL, CEPH_NOSNAP,
			  obj, 0, 0, NULL,
			  pages, 0,
			  CEPH_OSD_FLAG_READ,
			  ops,
			  1,
			  NULL, 0,
			  rbd_simple_req_cb, 0, NULL);

	rbd_destroy_ops(ops);
	return ret;
}
1222
1223static void rbd_watch_cb(u64 ver, u64 notify_id, u8 opcode, void *data)
1224{
1225 struct rbd_device *dev = (struct rbd_device *)data;
Sage Weil13143d22011-05-12 16:08:30 -07001226 int rc;
1227
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001228 if (!dev)
1229 return;
1230
1231 dout("rbd_watch_cb %s notify_id=%lld opcode=%d\n", dev->obj_md_name,
1232 notify_id, (int)opcode);
1233 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
Sage Weil13143d22011-05-12 16:08:30 -07001234 rc = __rbd_update_snaps(dev);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001235 mutex_unlock(&ctl_mutex);
Sage Weil13143d22011-05-12 16:08:30 -07001236 if (rc)
Alex Elderf0f8cef2012-01-29 13:57:44 -06001237 pr_warning(RBD_DRV_NAME "%d got notification but failed to "
1238 " update snaps: %d\n", dev->major, rc);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001239
1240 rbd_req_sync_notify_ack(dev, ver, notify_id, dev->obj_md_name);
1241}
1242
/*
 * Request sync osd watch
 *
 * Registers a watch on the header object so rbd_watch_cb() is invoked
 * when it changes. The event handle and the lingering request are
 * stored on the device (watch_event / watch_request) for later
 * teardown in rbd_req_sync_unwatch().
 */
static int rbd_req_sync_watch(struct rbd_device *dev,
			      const char *obj,
			      u64 ver)
{
	struct ceph_osd_req_op *ops;
	struct ceph_osd_client *osdc = &dev->rbd_client->client->osdc;

	int ret = rbd_create_rw_ops(&ops, 1, CEPH_OSD_OP_WATCH, 0);
	if (ret < 0)
		return ret;

	ret = ceph_osdc_create_event(osdc, rbd_watch_cb, 0,
				     (void *)dev, &dev->watch_event);
	if (ret < 0)
		goto fail;

	ops[0].watch.ver = cpu_to_le64(ver);
	ops[0].watch.cookie = cpu_to_le64(dev->watch_event->cookie);
	ops[0].watch.flag = 1;	/* 1 = register the watch */

	/* lingering request: resent automatically across OSD map changes */
	ret = rbd_req_sync_op(dev, NULL,
			      CEPH_NOSNAP,
			      0,
			      CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
			      ops,
			      1, obj, 0, 0, NULL,
			      &dev->watch_request, NULL);

	if (ret < 0)
		goto fail_event;

	rbd_destroy_ops(ops);
	return 0;

fail_event:
	ceph_osdc_cancel_event(dev->watch_event);
	dev->watch_event = NULL;
fail:
	rbd_destroy_ops(ops);
	return ret;
}
1287
/*
 * Request sync osd unwatch
 *
 * Tears down the watch registered by rbd_req_sync_watch(): sends a
 * WATCH op with flag=0 (unregister) using the same cookie, then
 * cancels the local event. Assumes dev->watch_event is non-NULL.
 */
static int rbd_req_sync_unwatch(struct rbd_device *dev,
				const char *obj)
{
	struct ceph_osd_req_op *ops;

	int ret = rbd_create_rw_ops(&ops, 1, CEPH_OSD_OP_WATCH, 0);
	if (ret < 0)
		return ret;

	ops[0].watch.ver = 0;
	ops[0].watch.cookie = cpu_to_le64(dev->watch_event->cookie);
	ops[0].watch.flag = 0;	/* 0 = unregister the watch */

	ret = rbd_req_sync_op(dev, NULL,
			      CEPH_NOSNAP,
			      0,
			      CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
			      ops,
			      1, obj, 0, 0, NULL, NULL, NULL);

	/* cancel the event even if the unwatch op failed */
	rbd_destroy_ops(ops);
	ceph_osdc_cancel_event(dev->watch_event);
	dev->watch_event = NULL;
	return ret;
}
1316
/* context handed to rbd_notify_cb() while a sync notify is in flight */
struct rbd_notify_info {
	struct rbd_device *dev;
};
1320
1321static void rbd_notify_cb(u64 ver, u64 notify_id, u8 opcode, void *data)
1322{
1323 struct rbd_device *dev = (struct rbd_device *)data;
1324 if (!dev)
1325 return;
1326
1327 dout("rbd_notify_cb %s notify_id=%lld opcode=%d\n", dev->obj_md_name,
1328 notify_id, (int)opcode);
1329}
1330
/*
 * Request sync osd notify
 *
 * Sends a NOTIFY op on @obj and waits (bounded by the default timeout)
 * for watchers to acknowledge. Used to tell other clients the header
 * changed (e.g. after adding a snapshot).
 * NOTE(review): on the success path the event is waited on but never
 * passed to ceph_osdc_cancel_event() — verify against the osd_client
 * event API whether this leaks the event reference.
 */
static int rbd_req_sync_notify(struct rbd_device *dev,
			       const char *obj)
{
	struct ceph_osd_req_op *ops;
	struct ceph_osd_client *osdc = &dev->rbd_client->client->osdc;
	struct ceph_osd_event *event;
	struct rbd_notify_info info;
	/* payload: two u32s (protocol version + timeout) */
	int payload_len = sizeof(u32) + sizeof(u32);
	int ret;

	ret = rbd_create_rw_ops(&ops, 1, CEPH_OSD_OP_NOTIFY, payload_len);
	if (ret < 0)
		return ret;

	info.dev = dev;

	ret = ceph_osdc_create_event(osdc, rbd_notify_cb, 1,
				     (void *)&info, &event);
	if (ret < 0)
		goto fail;

	ops[0].watch.ver = 1;
	ops[0].watch.flag = 1;
	ops[0].watch.cookie = event->cookie;
	ops[0].watch.prot_ver = RADOS_NOTIFY_VER;
	ops[0].watch.timeout = 12;

	ret = rbd_req_sync_op(dev, NULL,
			       CEPH_NOSNAP,
			       0,
			       CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
			       ops,
			       1, obj, 0, 0, NULL, NULL, NULL);
	if (ret < 0)
		goto fail_event;

	/* block until watchers ack or the timeout expires */
	ret = ceph_osdc_wait_event(event, CEPH_OSD_TIMEOUT_DEFAULT);
	dout("ceph_osdc_wait_event returned %d\n", ret);
	rbd_destroy_ops(ops);
	return 0;

fail_event:
	ceph_osdc_cancel_event(event);
fail:
	rbd_destroy_ops(ops);
	return ret;
}
1381
/*
 * Synchronously invoke a server-side object class method
 * ("rbd"/"snap_add" etc.) on @obj with @len bytes of input @data.
 * (Previous comment said "Request sync osd read" — copy/paste error.)
 */
static int rbd_req_sync_exec(struct rbd_device *dev,
			     const char *obj,
			     const char *cls,
			     const char *method,
			     const char *data,
			     int len,
			     u64 *ver)
{
	struct ceph_osd_req_op *ops;
	int cls_len = strlen(cls);
	int method_len = strlen(method);
	/* payload carries class name + method name + input data */
	int ret = rbd_create_rw_ops(&ops, 1, CEPH_OSD_OP_CALL,
				    cls_len + method_len + len);
	if (ret < 0)
		return ret;

	ops[0].cls.class_name = cls;
	ops[0].cls.class_len = (__u8)cls_len;
	ops[0].cls.method_name = method;
	ops[0].cls.method_len = (__u8)method_len;
	ops[0].cls.argc = 0;
	ops[0].cls.indata = data;
	ops[0].cls.indata_len = len;

	ret = rbd_req_sync_op(dev, NULL,
			       CEPH_NOSNAP,
			       0,
			       CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
			       ops,
			       1, obj, 0, 0, NULL, NULL, ver);

	rbd_destroy_ops(ops);

	dout("cls_exec returned %d\n", ret);
	return ret;
}
1421
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001422static struct rbd_req_coll *rbd_alloc_coll(int num_reqs)
1423{
1424 struct rbd_req_coll *coll =
1425 kzalloc(sizeof(struct rbd_req_coll) +
1426 sizeof(struct rbd_req_status) * num_reqs,
1427 GFP_ATOMIC);
1428
1429 if (!coll)
1430 return NULL;
1431 coll->total = num_reqs;
1432 kref_init(&coll->kref);
1433 return coll;
1434}
1435
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001436/*
1437 * block device queue callback
1438 */
1439static void rbd_rq_fn(struct request_queue *q)
1440{
1441 struct rbd_device *rbd_dev = q->queuedata;
1442 struct request *rq;
1443 struct bio_pair *bp = NULL;
1444
1445 rq = blk_fetch_request(q);
1446
1447 while (1) {
1448 struct bio *bio;
1449 struct bio *rq_bio, *next_bio = NULL;
1450 bool do_write;
1451 int size, op_size = 0;
1452 u64 ofs;
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001453 int num_segs, cur_seg = 0;
1454 struct rbd_req_coll *coll;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001455
1456 /* peek at request from block layer */
1457 if (!rq)
1458 break;
1459
1460 dout("fetched request\n");
1461
1462 /* filter out block requests we don't understand */
1463 if ((rq->cmd_type != REQ_TYPE_FS)) {
1464 __blk_end_request_all(rq, 0);
1465 goto next;
1466 }
1467
1468 /* deduce our operation (read, write) */
1469 do_write = (rq_data_dir(rq) == WRITE);
1470
1471 size = blk_rq_bytes(rq);
1472 ofs = blk_rq_pos(rq) * 512ULL;
1473 rq_bio = rq->bio;
1474 if (do_write && rbd_dev->read_only) {
1475 __blk_end_request_all(rq, -EROFS);
1476 goto next;
1477 }
1478
1479 spin_unlock_irq(q->queue_lock);
1480
1481 dout("%s 0x%x bytes at 0x%llx\n",
1482 do_write ? "write" : "read",
1483 size, blk_rq_pos(rq) * 512ULL);
1484
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001485 num_segs = rbd_get_num_segments(&rbd_dev->header, ofs, size);
1486 coll = rbd_alloc_coll(num_segs);
1487 if (!coll) {
1488 spin_lock_irq(q->queue_lock);
1489 __blk_end_request_all(rq, -ENOMEM);
1490 goto next;
1491 }
1492
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001493 do {
1494 /* a bio clone to be passed down to OSD req */
1495 dout("rq->bio->bi_vcnt=%d\n", rq->bio->bi_vcnt);
1496 op_size = rbd_get_segment(&rbd_dev->header,
1497 rbd_dev->header.block_name,
1498 ofs, size,
1499 NULL, NULL);
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001500 kref_get(&coll->kref);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001501 bio = bio_chain_clone(&rq_bio, &next_bio, &bp,
1502 op_size, GFP_ATOMIC);
1503 if (!bio) {
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001504 rbd_coll_end_req_index(rq, coll, cur_seg,
1505 -ENOMEM, op_size);
1506 goto next_seg;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001507 }
1508
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001509
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001510 /* init OSD command: write or read */
1511 if (do_write)
1512 rbd_req_write(rq, rbd_dev,
1513 rbd_dev->header.snapc,
1514 ofs,
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001515 op_size, bio,
1516 coll, cur_seg);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001517 else
1518 rbd_req_read(rq, rbd_dev,
1519 cur_snap_id(rbd_dev),
1520 ofs,
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001521 op_size, bio,
1522 coll, cur_seg);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001523
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001524next_seg:
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001525 size -= op_size;
1526 ofs += op_size;
1527
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001528 cur_seg++;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001529 rq_bio = next_bio;
1530 } while (size > 0);
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001531 kref_put(&coll->kref, rbd_coll_release);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001532
1533 if (bp)
1534 bio_pair_release(bp);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001535 spin_lock_irq(q->queue_lock);
1536next:
1537 rq = blk_fetch_request(q);
1538 }
1539}
1540
/*
 * a queue callback. Makes sure that we don't create a bio that spans across
 * multiple osd objects. One exception would be with a single page bios,
 * which we handle later at bio_chain_clone
 *
 * Returns how many bytes of @bvec may be merged into the bio described
 * by @bmd without crossing an object (chunk) boundary.
 */
static int rbd_merge_bvec(struct request_queue *q, struct bvec_merge_data *bmd,
			  struct bio_vec *bvec)
{
	struct rbd_device *rbd_dev = q->queuedata;
	/* object size in sectors; obj_order is log2 of the object size */
	unsigned int chunk_sectors = 1 << (rbd_dev->header.obj_order - 9);
	sector_t sector = bmd->bi_sector + get_start_sect(bmd->bi_bdev);
	unsigned int bio_sectors = bmd->bi_size >> 9;
	int max;

	/* bytes remaining in the current chunk after this bio's end
	 * (unsigned arithmetic truncated into int; negative means the
	 * bio already reaches/overruns the boundary) */
	max = (chunk_sectors - ((sector & (chunk_sectors - 1))
				+ bio_sectors)) << 9;
	if (max < 0)
		max = 0; /* bio_add cannot handle a negative return */
	/* an empty bio may still take one full bvec (single-page case) */
	if (max <= bvec->bv_len && bio_sectors == 0)
		return bvec->bv_len;
	return max;
}
1563
/*
 * Tear down the gendisk/queue for a device. Safe to call when no disk
 * was ever allocated. Order matters: free the cached header first,
 * unregister the disk if it was added, then destroy the queue before
 * dropping the final disk reference.
 */
static void rbd_free_disk(struct rbd_device *rbd_dev)
{
	struct gendisk *disk = rbd_dev->disk;

	if (!disk)
		return;

	rbd_header_free(&rbd_dev->header);

	if (disk->flags & GENHD_FL_UP)
		del_gendisk(disk);
	if (disk->queue)
		blk_cleanup_queue(disk->queue);
	put_disk(disk);
}
1579
1580/*
1581 * reload the ondisk the header
1582 */
1583static int rbd_read_header(struct rbd_device *rbd_dev,
1584 struct rbd_image_header *header)
1585{
1586 ssize_t rc;
1587 struct rbd_image_header_ondisk *dh;
1588 int snap_count = 0;
1589 u64 snap_names_len = 0;
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001590 u64 ver;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001591
1592 while (1) {
1593 int len = sizeof(*dh) +
1594 snap_count * sizeof(struct rbd_image_snap_ondisk) +
1595 snap_names_len;
1596
1597 rc = -ENOMEM;
1598 dh = kmalloc(len, GFP_KERNEL);
1599 if (!dh)
1600 return -ENOMEM;
1601
1602 rc = rbd_req_sync_read(rbd_dev,
1603 NULL, CEPH_NOSNAP,
1604 rbd_dev->obj_md_name,
1605 0, len,
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001606 (char *)dh, &ver);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001607 if (rc < 0)
1608 goto out_dh;
1609
1610 rc = rbd_header_from_disk(header, dh, snap_count, GFP_KERNEL);
Josh Durgin81e759f2011-11-15 14:49:53 -08001611 if (rc < 0) {
1612 if (rc == -ENXIO) {
1613 pr_warning("unrecognized header format"
1614 " for image %s", rbd_dev->obj);
1615 }
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001616 goto out_dh;
Josh Durgin81e759f2011-11-15 14:49:53 -08001617 }
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001618
1619 if (snap_count != header->total_snaps) {
1620 snap_count = header->total_snaps;
1621 snap_names_len = header->snap_names_len;
1622 rbd_header_free(header);
1623 kfree(dh);
1624 continue;
1625 }
1626 break;
1627 }
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001628 header->obj_version = ver;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001629
1630out_dh:
1631 kfree(dh);
1632 return rc;
1633}
1634
1635/*
1636 * create a snapshot
1637 */
1638static int rbd_header_add_snap(struct rbd_device *dev,
1639 const char *snap_name,
1640 gfp_t gfp_flags)
1641{
1642 int name_len = strlen(snap_name);
1643 u64 new_snapid;
1644 int ret;
Sage Weil916d4d62011-05-12 16:10:50 -07001645 void *data, *p, *e;
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001646 u64 ver;
Alex Elder1dbb4392012-01-24 10:08:37 -06001647 struct ceph_mon_client *monc;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001648
1649 /* we should create a snapshot only if we're pointing at the head */
1650 if (dev->cur_snap)
1651 return -EINVAL;
1652
Alex Elder1dbb4392012-01-24 10:08:37 -06001653 monc = &dev->rbd_client->client->monc;
1654 ret = ceph_monc_create_snapid(monc, dev->poolid, &new_snapid);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001655 dout("created snapid=%lld\n", new_snapid);
1656 if (ret < 0)
1657 return ret;
1658
1659 data = kmalloc(name_len + 16, gfp_flags);
1660 if (!data)
1661 return -ENOMEM;
1662
Sage Weil916d4d62011-05-12 16:10:50 -07001663 p = data;
1664 e = data + name_len + 16;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001665
Sage Weil916d4d62011-05-12 16:10:50 -07001666 ceph_encode_string_safe(&p, e, snap_name, name_len, bad);
1667 ceph_encode_64_safe(&p, e, new_snapid, bad);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001668
1669 ret = rbd_req_sync_exec(dev, dev->obj_md_name, "rbd", "snap_add",
Sage Weil916d4d62011-05-12 16:10:50 -07001670 data, p - data, &ver);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001671
Sage Weil916d4d62011-05-12 16:10:50 -07001672 kfree(data);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001673
1674 if (ret < 0)
1675 return ret;
1676
1677 dev->header.snapc->seq = new_snapid;
1678
1679 return 0;
1680bad:
1681 return -ERANGE;
1682}
1683
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001684static void __rbd_remove_all_snaps(struct rbd_device *rbd_dev)
1685{
1686 struct rbd_snap *snap;
1687
1688 while (!list_empty(&rbd_dev->snaps)) {
1689 snap = list_first_entry(&rbd_dev->snaps, struct rbd_snap, node);
1690 __rbd_remove_snap_dev(rbd_dev, snap);
1691 }
1692}
1693
/*
 * only read the first part of the ondisk header, without the snaps info
 *
 * Re-reads the on-disk header and swaps the freshly parsed snapshot
 * state into rbd_dev->header under the snap rwsem. If we were pointing
 * at the head snapshot before, keep following the (possibly new) head;
 * otherwise preserve the previously selected snap sequence.
 * Caller must hold ctl_mutex.
 */
static int __rbd_update_snaps(struct rbd_device *rbd_dev)
{
	int ret;
	struct rbd_image_header h;
	u64 snap_seq;
	int follow_seq = 0;

	ret = rbd_read_header(rbd_dev, &h);
	if (ret < 0)
		return ret;

	/* resized? */
	set_capacity(rbd_dev->disk, h.image_size / 512ULL);

	down_write(&rbd_dev->header.snap_rwsem);

	snap_seq = rbd_dev->header.snapc->seq;
	if (rbd_dev->header.total_snaps &&
	    rbd_dev->header.snapc->snaps[0] == snap_seq)
		/* pointing at the head, will need to follow that
		   if head moves */
		follow_seq = 1;

	/* free the old snapshot state before adopting the new one */
	kfree(rbd_dev->header.snapc);
	kfree(rbd_dev->header.snap_names);
	kfree(rbd_dev->header.snap_sizes);

	rbd_dev->header.total_snaps = h.total_snaps;
	rbd_dev->header.snapc = h.snapc;
	rbd_dev->header.snap_names = h.snap_names;
	rbd_dev->header.snap_names_len = h.snap_names_len;
	rbd_dev->header.snap_sizes = h.snap_sizes;
	if (follow_seq)
		rbd_dev->header.snapc->seq = rbd_dev->header.snapc->snaps[0];
	else
		rbd_dev->header.snapc->seq = snap_seq;

	/* re-sync the sysfs snapshot devices with the new list */
	ret = __rbd_init_snaps_header(rbd_dev);

	up_write(&rbd_dev->header.snap_rwsem);

	return ret;
}
1740
/*
 * Fetch the image header, set up the gendisk and request queue for the
 * mapped image, and announce the disk. Returns 0 on success, negative
 * errno otherwise.
 */
static int rbd_init_disk(struct rbd_device *rbd_dev)
{
	struct gendisk *disk;
	struct request_queue *q;
	int rc;
	u64 total_size = 0;

	/* contact OSD, request size info about the object being mapped */
	rc = rbd_read_header(rbd_dev, &rbd_dev->header);
	if (rc)
		return rc;

	/* no need to lock here, as rbd_dev is not registered yet */
	rc = __rbd_init_snaps_header(rbd_dev);
	if (rc)
		return rc;

	/* select the mapped snapshot; total_size gets its byte size */
	rc = rbd_header_set_snap(rbd_dev, &total_size);
	if (rc)
		return rc;

	/* create gendisk info */
	rc = -ENOMEM;
	disk = alloc_disk(RBD_MINORS_PER_MAJOR);
	if (!disk)
		goto out;

	snprintf(disk->disk_name, sizeof(disk->disk_name), RBD_DRV_NAME "%d",
		 rbd_dev->id);
	disk->major = rbd_dev->major;
	disk->first_minor = 0;
	disk->fops = &rbd_bd_ops;
	disk->private_data = rbd_dev;

	/* init rq */
	rc = -ENOMEM;
	q = blk_init_queue(rbd_rq_fn, &rbd_dev->lock);
	if (!q)
		goto out_disk;

	/* set io sizes to object size */
	blk_queue_max_hw_sectors(q, rbd_obj_bytes(&rbd_dev->header) / 512ULL);
	blk_queue_max_segment_size(q, rbd_obj_bytes(&rbd_dev->header));
	blk_queue_io_min(q, rbd_obj_bytes(&rbd_dev->header));
	blk_queue_io_opt(q, rbd_obj_bytes(&rbd_dev->header));

	blk_queue_merge_bvec(q, rbd_merge_bvec);
	disk->queue = q;

	q->queuedata = rbd_dev;

	rbd_dev->disk = disk;
	rbd_dev->q = q;

	/* finally, announce the disk to the world */
	set_capacity(disk, total_size / 512ULL);
	add_disk(disk);

	pr_info("%s: added with size 0x%llx\n",
		disk->disk_name, (unsigned long long)total_size);
	return 0;

out_disk:
	put_disk(disk);
out:
	return rc;
}
1808
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001809/*
1810 sysfs
1811*/
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001812
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001813static ssize_t rbd_size_show(struct device *dev,
1814 struct device_attribute *attr, char *buf)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001815{
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001816 struct rbd_device *rbd_dev = dev_to_rbd(dev);
1817
1818 return sprintf(buf, "%llu\n", (unsigned long long)rbd_dev->header.image_size);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001819}
1820
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001821static ssize_t rbd_major_show(struct device *dev,
1822 struct device_attribute *attr, char *buf)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001823{
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001824 struct rbd_device *rbd_dev = dev_to_rbd(dev);
1825
1826 return sprintf(buf, "%d\n", rbd_dev->major);
1827}
1828
1829static ssize_t rbd_client_id_show(struct device *dev,
1830 struct device_attribute *attr, char *buf)
1831{
1832 struct rbd_device *rbd_dev = dev_to_rbd(dev);
1833
Alex Elder1dbb4392012-01-24 10:08:37 -06001834 return sprintf(buf, "client%lld\n",
1835 ceph_client_id(rbd_dev->rbd_client->client));
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001836}
1837
1838static ssize_t rbd_pool_show(struct device *dev,
1839 struct device_attribute *attr, char *buf)
1840{
1841 struct rbd_device *rbd_dev = dev_to_rbd(dev);
1842
1843 return sprintf(buf, "%s\n", rbd_dev->pool_name);
1844}
1845
1846static ssize_t rbd_name_show(struct device *dev,
1847 struct device_attribute *attr, char *buf)
1848{
1849 struct rbd_device *rbd_dev = dev_to_rbd(dev);
1850
1851 return sprintf(buf, "%s\n", rbd_dev->obj);
1852}
1853
1854static ssize_t rbd_snap_show(struct device *dev,
1855 struct device_attribute *attr,
1856 char *buf)
1857{
1858 struct rbd_device *rbd_dev = dev_to_rbd(dev);
1859
1860 return sprintf(buf, "%s\n", rbd_dev->snap_name);
1861}
1862
/*
 * sysfs store hook: writing anything to "refresh" re-reads the header
 * and snapshot list from the OSDs. Returns the write size on success
 * or the negative error from the refresh.
 */
static ssize_t rbd_image_refresh(struct device *dev,
				 struct device_attribute *attr,
				 const char *buf,
				 size_t size)
{
	struct rbd_device *rbd_dev = dev_to_rbd(dev);
	int rc;
	int ret = size;

	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);

	rc = __rbd_update_snaps(rbd_dev);
	if (rc < 0)
		ret = rc;

	mutex_unlock(&ctl_mutex);
	return ret;
}
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001881
/* per-device sysfs attributes; see Documentation/ABI/testing/sysfs-bus-rbd */
static DEVICE_ATTR(size, S_IRUGO, rbd_size_show, NULL);
static DEVICE_ATTR(major, S_IRUGO, rbd_major_show, NULL);
static DEVICE_ATTR(client_id, S_IRUGO, rbd_client_id_show, NULL);
static DEVICE_ATTR(pool, S_IRUGO, rbd_pool_show, NULL);
static DEVICE_ATTR(name, S_IRUGO, rbd_name_show, NULL);
static DEVICE_ATTR(refresh, S_IWUSR, NULL, rbd_image_refresh);
static DEVICE_ATTR(current_snap, S_IRUGO, rbd_snap_show, NULL);
static DEVICE_ATTR(create_snap, S_IWUSR, NULL, rbd_snap_add);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001890
/* attribute table + group wiring for the rbd device sysfs directory */
static struct attribute *rbd_attrs[] = {
	&dev_attr_size.attr,
	&dev_attr_major.attr,
	&dev_attr_client_id.attr,
	&dev_attr_pool.attr,
	&dev_attr_name.attr,
	&dev_attr_current_snap.attr,
	&dev_attr_refresh.attr,
	&dev_attr_create_snap.attr,
	NULL
};

static struct attribute_group rbd_attr_group = {
	.attrs = rbd_attrs,
};
1906
static const struct attribute_group *rbd_attr_groups[] = {
	&rbd_attr_group,
	NULL
};

/* no-op release: rbd_device lifetime is managed elsewhere */
static void rbd_sysfs_dev_release(struct device *dev)
{
}

/* device type for rbd devices on the rbd bus */
static struct device_type rbd_device_type = {
	.name		= "rbd",
	.groups		= rbd_attr_groups,
	.release	= rbd_sysfs_dev_release,
};
1921
1922
1923/*
1924 sysfs - snapshots
1925*/
1926
1927static ssize_t rbd_snap_size_show(struct device *dev,
1928 struct device_attribute *attr,
1929 char *buf)
1930{
1931 struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
1932
1933 return sprintf(buf, "%lld\n", (long long)snap->size);
1934}
1935
1936static ssize_t rbd_snap_id_show(struct device *dev,
1937 struct device_attribute *attr,
1938 char *buf)
1939{
1940 struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
1941
1942 return sprintf(buf, "%lld\n", (long long)snap->id);
1943}
1944
/* per-snapshot sysfs attributes and their group */
static DEVICE_ATTR(snap_size, S_IRUGO, rbd_snap_size_show, NULL);
static DEVICE_ATTR(snap_id, S_IRUGO, rbd_snap_id_show, NULL);

static struct attribute *rbd_snap_attrs[] = {
	&dev_attr_snap_size.attr,
	&dev_attr_snap_id.attr,
	NULL,
};

static struct attribute_group rbd_snap_attr_group = {
	.attrs = rbd_snap_attrs,
};
1957
/*
 * Device release hook for a snapshot: frees the duplicated name first,
 * then the rbd_snap itself (order matters — name lives inside snap).
 */
static void rbd_snap_dev_release(struct device *dev)
{
	struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
	kfree(snap->name);
	kfree(snap);
}
1964
static const struct attribute_group *rbd_snap_attr_groups[] = {
	&rbd_snap_attr_group,
	NULL
};

/* device type for the snap_* child devices of an rbd device */
static struct device_type rbd_snap_device_type = {
	.groups		= rbd_snap_attr_groups,
	.release	= rbd_snap_dev_release,
};
1974
/*
 * Unlink a snapshot from the device list and unregister its sysfs
 * device; the final kfree happens in rbd_snap_dev_release().
 */
static void __rbd_remove_snap_dev(struct rbd_device *rbd_dev,
				  struct rbd_snap *snap)
{
	list_del(&snap->node);
	device_unregister(&snap->dev);
}
1981
1982static int rbd_register_snap_dev(struct rbd_device *rbd_dev,
1983 struct rbd_snap *snap,
1984 struct device *parent)
1985{
1986 struct device *dev = &snap->dev;
1987 int ret;
1988
1989 dev->type = &rbd_snap_device_type;
1990 dev->parent = parent;
1991 dev->release = rbd_snap_dev_release;
1992 dev_set_name(dev, "snap_%s", snap->name);
1993 ret = device_register(dev);
1994
1995 return ret;
1996}
1997
1998static int __rbd_add_snap_dev(struct rbd_device *rbd_dev,
1999 int i, const char *name,
2000 struct rbd_snap **snapp)
2001{
2002 int ret;
2003 struct rbd_snap *snap = kzalloc(sizeof(*snap), GFP_KERNEL);
2004 if (!snap)
2005 return -ENOMEM;
2006 snap->name = kstrdup(name, GFP_KERNEL);
2007 snap->size = rbd_dev->header.snap_sizes[i];
2008 snap->id = rbd_dev->header.snapc->snaps[i];
2009 if (device_is_registered(&rbd_dev->dev)) {
2010 ret = rbd_register_snap_dev(rbd_dev, snap,
2011 &rbd_dev->dev);
2012 if (ret < 0)
2013 goto err;
2014 }
2015 *snapp = snap;
2016 return 0;
2017err:
2018 kfree(snap->name);
2019 kfree(snap);
2020 return ret;
2021}
2022
2023/*
2024 * search for the previous snap in a null delimited string list
2025 */
2026const char *rbd_prev_snap_name(const char *name, const char *start)
2027{
2028 if (name < start + 2)
2029 return NULL;
2030
2031 name -= 2;
2032 while (*name) {
2033 if (name == start)
2034 return start;
2035 name--;
2036 }
2037 return name + 1;
2038}
2039
/*
 * Reconcile the device's in-memory snapshot list with the snapshots
 * recorded in the header.  The header stores snapshots from newest to
 * oldest; we walk both the list and the header from oldest to newest
 * so that a snapshot that was removed and recreated under the same
 * name is not mistaken for the original.
 *
 * Returns 0 on success or a negative errno.
 */
static int __rbd_init_snaps_header(struct rbd_device *rbd_dev)
{
	const char *name, *first_name;
	int i = rbd_dev->header.total_snaps;
	struct rbd_snap *snap, *old_snap = NULL;
	int ret;
	struct list_head *p, *n;

	/* snap_names is a packed sequence of NUL-terminated names;
	 * start just past its end and walk backward (oldest first). */
	first_name = rbd_dev->header.snap_names;
	name = first_name + rbd_dev->header.snap_names_len;

	list_for_each_prev_safe(p, n, &rbd_dev->snaps) {
		u64 cur_id;

		old_snap = list_entry(p, struct rbd_snap, node);

		/* cur_id is only meaningful while header entries remain;
		 * the !i test below short-circuits before reading it. */
		if (i)
			cur_id = rbd_dev->header.snapc->snaps[i - 1];

		if (!i || old_snap->id < cur_id) {
			/* old_snap->id was skipped, thus was removed */
			__rbd_remove_snap_dev(rbd_dev, old_snap);
			continue;
		}
		if (old_snap->id == cur_id) {
			/* we have this snapshot already */
			i--;
			name = rbd_prev_snap_name(name, first_name);
			continue;
		}
		for (; i > 0;
		     i--, name = rbd_prev_snap_name(name, first_name)) {
			if (!name) {
				WARN_ON(1);
				return -EINVAL;
			}
			/* NOTE(review): every other access in this function
			 * indexes snaps[i - 1]; verify snaps[i] here is not
			 * an off-by-one (it reads snaps[total_snaps] when no
			 * entries have been consumed yet). */
			cur_id = rbd_dev->header.snapc->snaps[i];
			/* snapshot removal? handle it above */
			if (cur_id >= old_snap->id)
				break;
			/* a new snapshot */
			ret = __rbd_add_snap_dev(rbd_dev, i - 1, name, &snap);
			if (ret < 0)
				return ret;

			/* note that we add it backward so using n and not p */
			list_add(&snap->node, n);
			p = &snap->node;
		}
	}
	/* we're done going over the old snap list, just add what's left */
	for (; i > 0; i--) {
		name = rbd_prev_snap_name(name, first_name);
		if (!name) {
			WARN_ON(1);
			return -EINVAL;
		}
		ret = __rbd_add_snap_dev(rbd_dev, i - 1, name, &snap);
		if (ret < 0)
			return ret;
		list_add(&snap->node, &rbd_dev->snaps);
	}

	return 0;
}
2113
/*
 * Register rbd_dev's device on the rbd bus (named by its numeric id),
 * then register a sysfs device for each snapshot already on its list.
 * Returns 0 on success or a negative errno.
 *
 * NOTE(review): if a snapshot registration fails partway through, the
 * snapshots registered before it (and the device itself) are not
 * unregistered here — confirm the caller's error path handles that.
 */
static int rbd_bus_add_dev(struct rbd_device *rbd_dev)
{
	int ret;
	struct device *dev;
	struct rbd_snap *snap;

	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
	dev = &rbd_dev->dev;

	dev->bus = &rbd_bus_type;
	dev->type = &rbd_device_type;
	dev->parent = &rbd_root_dev;
	dev->release = rbd_dev_release;
	dev_set_name(dev, "%d", rbd_dev->id);
	ret = device_register(dev);
	if (ret < 0)
		goto out;

	list_for_each_entry(snap, &rbd_dev->snaps, node) {
		ret = rbd_register_snap_dev(rbd_dev, snap,
					     &rbd_dev->dev);
		if (ret < 0)
			break;
	}
out:
	mutex_unlock(&ctl_mutex);
	return ret;
}
2142
/*
 * Unregister rbd_dev's device from the bus; once the last reference
 * drops, rbd_dev_release() performs the actual teardown and free.
 */
static void rbd_bus_del_dev(struct rbd_device *rbd_dev)
{
	device_unregister(&rbd_dev->dev);
}
2147
/*
 * Register a watch on the device's header object, retrying for as
 * long as the request fails with -ERANGE.  NOTE(review): -ERANGE
 * appears to indicate the cached header (obj_version) is stale, so
 * the snapshot state is refreshed under ctl_mutex before retrying —
 * confirm against rbd_req_sync_watch().  Returns 0 or a negative
 * errno.
 */
static int rbd_init_watch_dev(struct rbd_device *rbd_dev)
{
	int ret, rc;

	do {
		ret = rbd_req_sync_watch(rbd_dev, rbd_dev->obj_md_name,
					 rbd_dev->header.obj_version);
		if (ret == -ERANGE) {
			mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
			rc = __rbd_update_snaps(rbd_dev);
			mutex_unlock(&ctl_mutex);
			if (rc < 0)
				return rc;
		}
	} while (ret == -ERANGE);

	return ret;
}
2166
/* Highest rbd id currently in use (0 when none); ids start at 1.
 * rbd_id_put() lowers it again when the top id is released. */
static atomic64_t rbd_id_max = ATOMIC64_INIT(0);

/*
 * Get a unique rbd identifier for the given new rbd_dev, and add
 * the rbd_dev to the global list.  The minimum rbd id is 1.
 * NOTE(review): the 64-bit counter is stored into an int id field;
 * truncation is only theoretical but worth confirming.
 */
static void rbd_id_get(struct rbd_device *rbd_dev)
{
	rbd_dev->id = atomic64_inc_return(&rbd_id_max);

	spin_lock(&rbd_dev_list_lock);
	list_add_tail(&rbd_dev->node, &rbd_dev_list);
	spin_unlock(&rbd_dev_list_lock);
}
Alex Elderb7f23c32012-01-29 13:57:43 -06002181
Alex Elder1ddbe942012-01-29 13:57:44 -06002182/*
Alex Elder499afd52012-02-02 08:13:29 -06002183 * Remove an rbd_dev from the global list, and record that its
2184 * identifier is no longer in use.
Alex Elder1ddbe942012-01-29 13:57:44 -06002185 */
Alex Elder499afd52012-02-02 08:13:29 -06002186static void rbd_id_put(struct rbd_device *rbd_dev)
Alex Elder1ddbe942012-01-29 13:57:44 -06002187{
Alex Elderd184f6b2012-01-29 13:57:44 -06002188 struct list_head *tmp;
2189 int rbd_id = rbd_dev->id;
2190 int max_id;
2191
2192 BUG_ON(rbd_id < 1);
Alex Elder499afd52012-02-02 08:13:29 -06002193
2194 spin_lock(&rbd_dev_list_lock);
2195 list_del_init(&rbd_dev->node);
Alex Elderd184f6b2012-01-29 13:57:44 -06002196
2197 /*
2198 * If the id being "put" is not the current maximum, there
2199 * is nothing special we need to do.
2200 */
2201 if (rbd_id != atomic64_read(&rbd_id_max)) {
2202 spin_unlock(&rbd_dev_list_lock);
2203 return;
2204 }
2205
2206 /*
2207 * We need to update the current maximum id. Search the
2208 * list to find out what it is. We're more likely to find
2209 * the maximum at the end, so search the list backward.
2210 */
2211 max_id = 0;
2212 list_for_each_prev(tmp, &rbd_dev_list) {
2213 struct rbd_device *rbd_dev;
2214
2215 rbd_dev = list_entry(tmp, struct rbd_device, node);
2216 if (rbd_id > max_id)
2217 max_id = rbd_id;
2218 }
Alex Elder499afd52012-02-02 08:13:29 -06002219 spin_unlock(&rbd_dev_list_lock);
Alex Elderb7f23c32012-01-29 13:57:43 -06002220
Alex Elder1ddbe942012-01-29 13:57:44 -06002221 /*
Alex Elderd184f6b2012-01-29 13:57:44 -06002222 * The max id could have been updated by rbd_id_get(), in
2223 * which case it now accurately reflects the new maximum.
2224 * Be careful not to overwrite the maximum value in that
2225 * case.
Alex Elder1ddbe942012-01-29 13:57:44 -06002226 */
Alex Elderd184f6b2012-01-29 13:57:44 -06002227 atomic64_cmpxchg(&rbd_id_max, rbd_id, max_id);
Alex Elderb7f23c32012-01-29 13:57:43 -06002228}
2229
/*
 * Advance *buf past any leading white space and return the length of
 * the token (maximal run of non-white-space characters) now at *buf.
 * The token itself is not consumed; a return of 0 means no token.
 */
static inline size_t next_token(const char **buf)
{
	/* The characters isspace() flags in the "C"/"POSIX" locales */
	static const char delims[] = " \f\n\r\t\v";

	*buf += strspn(*buf, delims);

	return strcspn(*buf, delims);
}
2247
/*
 * Find the next white-space-delimited token in *buf and, when
 * @token_size is big enough to hold it, copy it NUL-terminated into
 * @token.
 *
 * Returns the length of the token found (not counting the '\0').
 * A return of 0 means no token was found; a return >= @token_size
 * means the token would not fit (and was not copied).
 *
 * *buf is always advanced past the found token, even when the token
 * is too large for the supplied buffer.
 */
static inline size_t copy_token(const char **buf,
		       char *token,
		       size_t token_size)
{
	/* Same white-space set next_token() uses ("C"/"POSIX" isspace()) */
	const char *delims = " \f\n\r\t\v";
	size_t len;

	*buf += strspn(*buf, delims);	/* skip leading white space */
	len = strcspn(*buf, delims);	/* length of token at *buf */
	if (len < token_size) {
		memcpy(token, *buf, len);
		token[len] = '\0';
	}
	*buf += len;

	return len;
}
2276
/*
 * Parse the /sys/bus/rbd/add command in @buf: "<mon_addrs> <options>
 * <pool_name> <obj> [<snap_name>]".  Fills in the pool_name, obj,
 * obj_len, obj_md_name and snap_name fields of @rbd_dev, and copies
 * the first two tokens into the caller-supplied @mon_addrs and
 * @options buffers.  Returns 0 on success or -EINVAL when a required
 * token is missing or any token overflows its destination buffer.
 */
static int rbd_add_parse_args(struct rbd_device *rbd_dev,
			      const char *buf,
			      char *mon_addrs,
			      size_t mon_addrs_size,
			      char *options,
			      size_t options_size)
{
	size_t len;

	/* The first four tokens are required */

	len = copy_token(&buf, mon_addrs, mon_addrs_size);
	if (!len || len >= mon_addrs_size)
		return -EINVAL;

	len = copy_token(&buf, options, options_size);
	if (!len || len >= options_size)
		return -EINVAL;

	len = copy_token(&buf, rbd_dev->pool_name, sizeof (rbd_dev->pool_name));
	if (!len || len >= sizeof (rbd_dev->pool_name))
		return -EINVAL;

	len = copy_token(&buf, rbd_dev->obj, sizeof (rbd_dev->obj));
	if (!len || len >= sizeof (rbd_dev->obj))
		return -EINVAL;

	/* We have the object length in hand, save it. */

	rbd_dev->obj_len = len;

	/* obj fits (checked above), so obj + suffix fits in obj_md_name */
	BUILD_BUG_ON(RBD_MAX_MD_NAME_LEN
				< RBD_MAX_OBJ_NAME_LEN + sizeof (RBD_SUFFIX));
	sprintf(rbd_dev->obj_md_name, "%s%s", rbd_dev->obj, RBD_SUFFIX);

	/*
	 * The snapshot name is optional, but it's an error if it's
	 * too long.  If no snapshot is supplied, fill in the default.
	 */
	len = copy_token(&buf, rbd_dev->snap_name, sizeof (rbd_dev->snap_name));
	if (!len)
		memcpy(rbd_dev->snap_name, RBD_SNAP_HEAD_NAME,
		       sizeof (RBD_SNAP_HEAD_NAME));
	else if (len >= sizeof (rbd_dev->snap_name))
		return -EINVAL;

	return 0;
}
2331
/*
 * Handle a write to /sys/bus/rbd/add: parse "<mon_addrs> <options>
 * <pool> <obj> [<snap>]" from @buf, create and register a new rbd
 * device, set up its block device and register a watch on its header
 * object.  Returns @count on success or a negative errno.
 */
static ssize_t rbd_add(struct bus_type *bus,
		       const char *buf,
		       size_t count)
{
	struct rbd_device *rbd_dev;
	char *mon_addrs = NULL;
	char *options = NULL;
	struct ceph_osd_client *osdc;
	int rc = -ENOMEM;

	/* Hold a module reference for the life of the device;
	 * dropped by rbd_dev_release() or on the error paths below. */
	if (!try_module_get(THIS_MODULE))
		return -ENODEV;

	/* Scratch buffers of @count bytes are enough: no token parsed
	 * out of @buf can be longer than @buf itself. */
	rbd_dev = kzalloc(sizeof(*rbd_dev), GFP_KERNEL);
	if (!rbd_dev)
		goto err_nomem;
	mon_addrs = kmalloc(count, GFP_KERNEL);
	if (!mon_addrs)
		goto err_nomem;
	options = kmalloc(count, GFP_KERNEL);
	if (!options)
		goto err_nomem;

	/* static rbd_device initialization */
	spin_lock_init(&rbd_dev->lock);
	INIT_LIST_HEAD(&rbd_dev->node);
	INIT_LIST_HEAD(&rbd_dev->snaps);

	init_rwsem(&rbd_dev->header.snap_rwsem);

	/* generate unique id: find highest unique id, add one */
	rbd_id_get(rbd_dev);

	/* Fill in the device name, now that we have its id. */
	BUILD_BUG_ON(DEV_NAME_LEN
			< sizeof (RBD_DRV_NAME) + MAX_INT_FORMAT_WIDTH);
	sprintf(rbd_dev->name, "%s%d", RBD_DRV_NAME, rbd_dev->id);

	/* parse add command */
	rc = rbd_add_parse_args(rbd_dev, buf, mon_addrs, count,
				options, count);
	if (rc)
		goto err_put_id;

	rbd_dev->rbd_client = rbd_get_client(mon_addrs, options);
	if (IS_ERR(rbd_dev->rbd_client)) {
		rc = PTR_ERR(rbd_dev->rbd_client);
		goto err_put_id;
	}

	/* pick the pool */
	osdc = &rbd_dev->rbd_client->client->osdc;
	rc = ceph_pg_poolid_by_name(osdc->osdmap, rbd_dev->pool_name);
	if (rc < 0)
		goto err_out_client;
	rbd_dev->poolid = rc;

	/* register our block device */
	rc = register_blkdev(0, rbd_dev->name);
	if (rc < 0)
		goto err_out_client;
	rbd_dev->major = rc;

	rc = rbd_bus_add_dev(rbd_dev);
	if (rc)
		goto err_out_blkdev;

	/* set up and announce blkdev mapping */
	rc = rbd_init_disk(rbd_dev);
	if (rc)
		goto err_out_bus;

	rc = rbd_init_watch_dev(rbd_dev);
	if (rc)
		goto err_out_bus;

	return count;

err_out_bus:
	/* Once the device is on the bus, rbd_bus_del_dev() triggers
	 * rbd_dev_release(), which unregisters the blkdev, frees
	 * rbd_dev and drops the module reference — so this path must
	 * not fall through to the manual cleanup below. */
	rbd_id_put(rbd_dev);

	/* this will also clean up rest of rbd_dev stuff */

	rbd_bus_del_dev(rbd_dev);
	kfree(options);
	kfree(mon_addrs);
	return rc;

err_out_blkdev:
	unregister_blkdev(rbd_dev->major, rbd_dev->name);
err_out_client:
	rbd_put_client(rbd_dev);
err_put_id:
	rbd_id_put(rbd_dev);
err_nomem:
	/* kfree(NULL) is a no-op, so partial allocation is fine here */
	kfree(options);
	kfree(mon_addrs);
	kfree(rbd_dev);

	dout("Error adding device %s\n", buf);
	module_put(THIS_MODULE);

	return (ssize_t) rc;
}
2436
2437static struct rbd_device *__rbd_get_dev(unsigned long id)
2438{
2439 struct list_head *tmp;
2440 struct rbd_device *rbd_dev;
2441
Alex Eldere124a82f2012-01-29 13:57:44 -06002442 spin_lock(&rbd_dev_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002443 list_for_each(tmp, &rbd_dev_list) {
2444 rbd_dev = list_entry(tmp, struct rbd_device, node);
Alex Eldere124a82f2012-01-29 13:57:44 -06002445 if (rbd_dev->id == id) {
2446 spin_unlock(&rbd_dev_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002447 return rbd_dev;
Alex Eldere124a82f2012-01-29 13:57:44 -06002448 }
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002449 }
Alex Eldere124a82f2012-01-29 13:57:44 -06002450 spin_unlock(&rbd_dev_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002451 return NULL;
2452}
2453
/*
 * Device-model release callback for an rbd device: runs when the last
 * reference to rbd_dev->dev is dropped.  Tears down the lingering
 * watch request and watch event (if any), drops the ceph client
 * reference, frees the disk and major number, frees rbd_dev itself,
 * and finally releases the module reference taken in rbd_add().
 */
static void rbd_dev_release(struct device *dev)
{
	struct rbd_device *rbd_dev =
			container_of(dev, struct rbd_device, dev);

	if (rbd_dev->watch_request) {
		struct ceph_client *client = rbd_dev->rbd_client->client;

		ceph_osdc_unregister_linger_request(&client->osdc,
						    rbd_dev->watch_request);
	}
	if (rbd_dev->watch_event)
		rbd_req_sync_unwatch(rbd_dev, rbd_dev->obj_md_name);

	rbd_put_client(rbd_dev);

	/* clean up and free blkdev */
	rbd_free_disk(rbd_dev);
	unregister_blkdev(rbd_dev->major, rbd_dev->name);
	kfree(rbd_dev);

	/* release module ref */
	module_put(THIS_MODULE);
}
2478
/*
 * Handle a write to /sys/bus/rbd/remove: parse a device id from @buf
 * and tear that device down.  Releasing the id, removing snapshots
 * and unregistering from the bus all happen under ctl_mutex; the
 * final free is done by rbd_dev_release() once the device reference
 * count hits zero.  Returns @count on success or a negative errno.
 */
static ssize_t rbd_remove(struct bus_type *bus,
			  const char *buf,
			  size_t count)
{
	struct rbd_device *rbd_dev = NULL;
	int target_id, rc;
	unsigned long ul;
	int ret = count;

	rc = strict_strtoul(buf, 10, &ul);
	if (rc)
		return rc;

	/* convert to int; abort if we lost anything in the conversion */
	target_id = (int) ul;
	if (target_id != ul)
		return -EINVAL;

	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);

	rbd_dev = __rbd_get_dev(target_id);
	if (!rbd_dev) {
		ret = -ENOENT;
		goto done;
	}

	/* Unlink from the global list and recycle the id first */
	rbd_id_put(rbd_dev);

	__rbd_remove_all_snaps(rbd_dev);
	rbd_bus_del_dev(rbd_dev);

done:
	mutex_unlock(&ctl_mutex);
	return ret;
}
2514
/*
 * sysfs handler: create a new snapshot named by @buf on this device,
 * refresh the in-memory snapshot list, then (best-effort) notify
 * watchers of the header object.  Returns @count on success or a
 * negative errno.
 */
static ssize_t rbd_snap_add(struct device *dev,
			    struct device_attribute *attr,
			    const char *buf,
			    size_t count)
{
	struct rbd_device *rbd_dev = dev_to_rbd(dev);
	int ret;
	char *name = kmalloc(count + 1, GFP_KERNEL);
	if (!name)
		return -ENOMEM;

	/* Copies at most count - 1 bytes, dropping the final input byte
	 * (presumably the trailing newline from sysfs).  NOTE(review):
	 * confirm behavior for input that has no trailing newline. */
	snprintf(name, count, "%s", buf);

	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);

	ret = rbd_header_add_snap(rbd_dev,
				  name, GFP_KERNEL);
	if (ret < 0)
		goto err_unlock;

	ret = __rbd_update_snaps(rbd_dev);
	if (ret < 0)
		goto err_unlock;

	/* shouldn't hold ctl_mutex when notifying.. notify might
	   trigger a watch callback that would need to get that mutex */
	mutex_unlock(&ctl_mutex);

	/* make a best effort, don't error if failed */
	rbd_req_sync_notify(rbd_dev, rbd_dev->obj_md_name);

	ret = count;
	kfree(name);
	return ret;

err_unlock:
	mutex_unlock(&ctl_mutex);
	kfree(name);
	return ret;
}
2555
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002556/*
2557 * create control files in sysfs
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002558 * /sys/bus/rbd/...
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002559 */
2560static int rbd_sysfs_init(void)
2561{
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002562 int ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002563
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002564 ret = bus_register(&rbd_bus_type);
Alex Elder21079782012-01-24 10:08:36 -06002565 if (ret < 0)
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002566 return ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002567
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002568 ret = device_register(&rbd_root_dev);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002569
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002570 return ret;
2571}
2572
/* Tear down the sysfs control files in reverse order of rbd_sysfs_init() */
static void rbd_sysfs_cleanup(void)
{
	device_unregister(&rbd_root_dev);
	bus_unregister(&rbd_bus_type);
}
2578
2579int __init rbd_init(void)
2580{
2581 int rc;
2582
2583 rc = rbd_sysfs_init();
2584 if (rc)
2585 return rc;
Alex Elderf0f8cef2012-01-29 13:57:44 -06002586 pr_info("loaded " RBD_DRV_NAME_LONG "\n");
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002587 return 0;
2588}
2589
/* Module exit: remove the sysfs control files created by rbd_init() */
void __exit rbd_exit(void)
{
	rbd_sysfs_cleanup();
}
2594
/* Module entry/exit points and metadata */
module_init(rbd_init);
module_exit(rbd_exit);

MODULE_AUTHOR("Sage Weil <sage@newdream.net>");
MODULE_AUTHOR("Yehuda Sadeh <yehuda@hq.newdream.net>");
MODULE_DESCRIPTION("rados block device");

/* following authorship retained from original osdblk.c */
MODULE_AUTHOR("Jeff Garzik <jeff@garzik.org>");

MODULE_LICENSE("GPL");