blob: fe7a9e15b6f2b20573136201016b8d03da455aa1 [file] [log] [blame]
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001/*
2 rbd.c -- Export ceph rados objects as a Linux block device
3
4
5 based on drivers/block/osdblk.c:
6
7 Copyright 2009 Red Hat, Inc.
8
9 This program is free software; you can redistribute it and/or modify
10 it under the terms of the GNU General Public License as published by
11 the Free Software Foundation.
12
13 This program is distributed in the hope that it will be useful,
14 but WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 GNU General Public License for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with this program; see the file COPYING. If not, write to
20 the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
21
22
23
Yehuda Sadehdfc56062010-11-19 14:51:04 -080024 For usage instructions, please refer to:
Yehuda Sadeh602adf42010-08-12 16:11:25 -070025
Yehuda Sadehdfc56062010-11-19 14:51:04 -080026 Documentation/ABI/testing/sysfs-bus-rbd
Yehuda Sadeh602adf42010-08-12 16:11:25 -070027
28 */
29
30#include <linux/ceph/libceph.h>
31#include <linux/ceph/osd_client.h>
32#include <linux/ceph/mon_client.h>
33#include <linux/ceph/decode.h>
Yehuda Sadeh59c2be12011-03-21 15:10:11 -070034#include <linux/parser.h>
Yehuda Sadeh602adf42010-08-12 16:11:25 -070035
36#include <linux/kernel.h>
37#include <linux/device.h>
38#include <linux/module.h>
39#include <linux/fs.h>
40#include <linux/blkdev.h>
41
42#include "rbd_types.h"
43
Alex Elder593a9e72012-02-07 12:03:37 -060044/*
45 * The basic unit of block I/O is a sector. It is interpreted in a
46 * number of contexts in Linux (blk, bio, genhd), but the default is
47 * universally 512 bytes. These symbols are just slightly more
48 * meaningful than the bare numbers they represent.
49 */
50#define SECTOR_SHIFT 9
51#define SECTOR_SIZE (1ULL << SECTOR_SHIFT)
52
Alex Elderf0f8cef2012-01-29 13:57:44 -060053#define RBD_DRV_NAME "rbd"
54#define RBD_DRV_NAME_LONG "rbd (rados block device)"
Yehuda Sadeh602adf42010-08-12 16:11:25 -070055
56#define RBD_MINORS_PER_MAJOR 256 /* max minors per blkdev */
57
Yehuda Sadeh602adf42010-08-12 16:11:25 -070058#define RBD_MAX_SNAP_NAME_LEN 32
59#define RBD_MAX_OPT_LEN 1024
60
61#define RBD_SNAP_HEAD_NAME "-"
62
Alex Elder81a89792012-02-02 08:13:30 -060063/*
64 * An RBD device name will be "rbd#", where the "rbd" comes from
65 * RBD_DRV_NAME above, and # is a unique integer identifier.
66 * MAX_INT_FORMAT_WIDTH is used in ensuring DEV_NAME_LEN is big
67 * enough to hold all possible device names.
68 */
Yehuda Sadeh602adf42010-08-12 16:11:25 -070069#define DEV_NAME_LEN 32
Alex Elder81a89792012-02-02 08:13:30 -060070#define MAX_INT_FORMAT_WIDTH ((5 * sizeof (int)) / 2 + 1)
Yehuda Sadeh602adf42010-08-12 16:11:25 -070071
Yehuda Sadeh59c2be12011-03-21 15:10:11 -070072#define RBD_NOTIFY_TIMEOUT_DEFAULT 10
73
/*
 * block device image metadata (in-memory version)
 */
struct rbd_image_header {
	u64 image_size;		/* image size in bytes */
	char *object_prefix;	/* NUL-terminated data-object name prefix */
	__u8 obj_order;		/* object size is 1 << obj_order bytes */
	__u8 crypt_type;	/* on-disk encryption type */
	__u8 comp_type;		/* on-disk compression type */
	struct ceph_snap_context *snapc;	/* snapshot ids, newest first */
	size_t snap_names_len;	/* total size of the snap_names buffer */
	u32 total_snaps;	/* number of snapshots */

	char *snap_names;	/* packed NUL-separated names, parallel to snapc->snaps */
	u64 *snap_sizes;	/* per-snapshot image sizes, parallel to snapc->snaps */

	u64 obj_version;	/* header object version as of the last read */
};
92
/* rbd-specific (non-libceph) mapping options */
struct rbd_options {
	int notify_timeout;	/* seconds; default RBD_NOTIFY_TIMEOUT_DEFAULT */
};
96
/*
 * an instance of the client. multiple devices may share an rbd client.
 */
struct rbd_client {
	struct ceph_client *client;	/* libceph client handle */
	struct rbd_options *rbd_opts;	/* rbd-level options (owned; freed on release) */
	struct kref kref;		/* released via rbd_client_release() */
	struct list_head node;		/* entry on the global rbd_client_list */
};
106
/*
 * a request completion status
 */
struct rbd_req_status {
	int done;	/* nonzero once this sub-request has completed */
	int rc;		/* completion result: 0 or negative errno */
	u64 bytes;	/* number of bytes completed */
};
115
/*
 * a collection of requests (the per-segment sub-requests that make
 * up one block-layer request)
 */
struct rbd_req_coll {
	int total;			/* number of sub-requests */
	int num_done;			/* completed so far, in order */
	struct kref kref;		/* released via rbd_coll_release() */
	struct rbd_req_status status[0];	/* one slot per sub-request */
};
125
/*
 * a single io request
 */
struct rbd_request {
	struct request *rq;		/* blk layer request */
	struct bio *bio;		/* cloned bio */
	struct page **pages;		/* list of used pages */
	u64 len;			/* length of this sub-request in bytes */
	int coll_index;			/* index into coll->status[] */
	struct rbd_req_coll *coll;	/* owning collection; may be NULL */
};
137
/* in-memory state for one snapshot of an rbd image */
struct rbd_snap {
	struct device dev;	/* sysfs device representing the snapshot */
	const char *name;	/* snapshot name */
	u64 size;		/* image size at snapshot time */
	struct list_head node;	/* entry on rbd_dev->snaps */
	u64 id;			/* snapshot id */
};
145
/*
 * a single device
 */
struct rbd_device {
	int dev_id;		/* blkdev unique id */

	int major;		/* blkdev assigned major */
	struct gendisk *disk;		/* blkdev's gendisk and rq */
	struct request_queue *q;

	struct rbd_client *rbd_client;	/* shared, refcounted ceph client */

	char name[DEV_NAME_LEN]; /* blkdev name, e.g. rbd3 */

	spinlock_t lock;		/* queue lock */

	struct rbd_image_header header;	/* image metadata; see header_rwsem */
	char *image_name;	/* rbd image name within the pool */
	size_t image_name_len;
	char *header_name;	/* name of the image's header object */
	char *pool_name;
	int pool_id;

	struct ceph_osd_event *watch_event;	/* header-change notification */
	struct ceph_osd_request *watch_request;

	/* protects updating the header */
	struct rw_semaphore header_rwsem;
	/* name of the snapshot this device reads from */
	char *snap_name;
	/* id of the snapshot this device reads from */
	u64 snap_id;		/* current snapshot id */
	/* whether the snap_id this device reads from still exists */
	bool snap_exists;
	int read_only;		/* nonzero when mapping a snapshot */

	struct list_head node;	/* entry on the global rbd_dev_list */

	/* list of snapshots */
	struct list_head snaps;

	/* sysfs related */
	struct device dev;
};
190
static DEFINE_MUTEX(ctl_mutex);	  /* Serialize open/close/setup/teardown */

/* all mapped devices; guarded by rbd_dev_list_lock */
static LIST_HEAD(rbd_dev_list);    /* devices */
static DEFINE_SPINLOCK(rbd_dev_list_lock);

/* all active clients (shared between devices); guarded by rbd_client_list_lock */
static LIST_HEAD(rbd_client_list);		/* clients */
static DEFINE_SPINLOCK(rbd_client_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700198
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800199static int __rbd_init_snaps_header(struct rbd_device *rbd_dev);
200static void rbd_dev_release(struct device *dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800201static ssize_t rbd_snap_add(struct device *dev,
202 struct device_attribute *attr,
203 const char *buf,
204 size_t count);
205static void __rbd_remove_snap_dev(struct rbd_device *rbd_dev,
Justin P. Mattock69932482011-07-26 23:06:29 -0700206 struct rbd_snap *snap);
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800207
Alex Elderf0f8cef2012-01-29 13:57:44 -0600208static ssize_t rbd_add(struct bus_type *bus, const char *buf,
209 size_t count);
210static ssize_t rbd_remove(struct bus_type *bus, const char *buf,
211 size_t count);
212
/* /sys/bus/rbd/{add,remove}: write-only (root) control files */
static struct bus_attribute rbd_bus_attrs[] = {
	__ATTR(add, S_IWUSR, NULL, rbd_add),
	__ATTR(remove, S_IWUSR, NULL, rbd_remove),
	__ATTR_NULL
};
218
/* bus under which rbd devices appear in sysfs */
static struct bus_type rbd_bus_type = {
	.name		= "rbd",
	.bus_attrs	= rbd_bus_attrs,
};
223
/* no-op release: rbd_root_dev is static, there is nothing to free */
static void rbd_root_dev_release(struct device *dev)
{
}
227
/* parent sysfs device for all rbd devices */
static struct device rbd_root_dev = {
	.init_name =	"rbd",
	.release =	rbd_root_dev_release,
};
232
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800233
/* Take a reference on the rbd device's embedded struct device. */
static struct device *rbd_get_dev(struct rbd_device *rbd_dev)
{
	return get_device(&rbd_dev->dev);
}
238
/* Drop a reference taken by rbd_get_dev(). */
static void rbd_put_dev(struct rbd_device *rbd_dev)
{
	put_device(&rbd_dev->dev);
}
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700243
Josh Durgin263c6ca2011-12-05 10:43:42 -0800244static int __rbd_refresh_header(struct rbd_device *rbd_dev);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700245
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700246static int rbd_open(struct block_device *bdev, fmode_t mode)
247{
Alex Elderf0f8cef2012-01-29 13:57:44 -0600248 struct rbd_device *rbd_dev = bdev->bd_disk->private_data;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700249
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800250 rbd_get_dev(rbd_dev);
251
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700252 set_device_ro(bdev, rbd_dev->read_only);
253
254 if ((mode & FMODE_WRITE) && rbd_dev->read_only)
255 return -EROFS;
256
257 return 0;
258}
259
/*
 * Release the block device: drop the reference taken in rbd_open().
 */
static int rbd_release(struct gendisk *disk, fmode_t mode)
{
	struct rbd_device *rbd_dev = disk->private_data;

	rbd_put_dev(rbd_dev);

	return 0;
}
268
/* block device operations: open/release only; I/O goes via the request queue */
static const struct block_device_operations rbd_bd_ops = {
	.owner			= THIS_MODULE,
	.open			= rbd_open,
	.release		= rbd_release,
};
274
/*
 * Initialize an rbd client instance.
 * We own *ceph_opts.
 *
 * On success the new client is on rbd_client_list and owns both
 * @ceph_opts (via the ceph_client) and @rbd_opts.  On failure
 * ceph_opts is destroyed here (unless the ceph_client already took
 * it over) and ERR_PTR() is returned; the caller keeps rbd_opts.
 *
 * NOTE(review): when ceph_create_client() fails this returns
 * ERR_PTR(-ENOMEM) rather than PTR_ERR(rbdc->client) — confirm
 * whether the real error should be propagated.
 */
static struct rbd_client *rbd_client_create(struct ceph_options *ceph_opts,
					    struct rbd_options *rbd_opts)
{
	struct rbd_client *rbdc;
	int ret = -ENOMEM;

	dout("rbd_client_create\n");
	rbdc = kmalloc(sizeof(struct rbd_client), GFP_KERNEL);
	if (!rbdc)
		goto out_opt;

	kref_init(&rbdc->kref);
	INIT_LIST_HEAD(&rbdc->node);

	/* NOTE(review): SINGLE_DEPTH_NESTING suggests the caller already
	 * holds ctl_mutex — confirm against rbd_add() */
	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);

	rbdc->client = ceph_create_client(ceph_opts, rbdc, 0, 0);
	if (IS_ERR(rbdc->client))
		goto out_mutex;
	ceph_opts = NULL; /* Now rbdc->client is responsible for ceph_opts */

	ret = ceph_open_session(rbdc->client);
	if (ret < 0)
		goto out_err;

	rbdc->rbd_opts = rbd_opts;

	spin_lock(&rbd_client_list_lock);
	list_add_tail(&rbdc->node, &rbd_client_list);
	spin_unlock(&rbd_client_list_lock);

	mutex_unlock(&ctl_mutex);

	dout("rbd_client_create created %p\n", rbdc);
	return rbdc;

out_err:
	ceph_destroy_client(rbdc->client);
out_mutex:
	mutex_unlock(&ctl_mutex);
	kfree(rbdc);
out_opt:
	if (ceph_opts)
		ceph_destroy_options(ceph_opts);
	return ERR_PTR(ret);
}
325
/*
 * Find a ceph client with specific addr and configuration.
 *
 * Caller must hold rbd_client_list_lock (see rbd_get_client()).
 * Returns NULL when no match exists or when @ceph_opts asks for an
 * unshared client (CEPH_OPT_NOSHARE).
 */
static struct rbd_client *__rbd_client_find(struct ceph_options *ceph_opts)
{
	struct rbd_client *client_node;

	if (ceph_opts->flags & CEPH_OPT_NOSHARE)
		return NULL;

	list_for_each_entry(client_node, &rbd_client_list, node)
		if (!ceph_compare_options(ceph_opts, client_node->client))
			return client_node;
	return NULL;
}
341
/*
 * mount options
 *
 * Token classes are delimited by the Opt_last_* markers: tokens below
 * Opt_last_int take an integer argument, tokens between the markers
 * take a string argument.
 */
enum {
	Opt_notify_timeout,
	Opt_last_int,
	/* int args above */
	Opt_last_string,
	/* string args above */
};

/* match_token() table; entries must agree with the enum above */
static match_table_t rbd_opts_tokens = {
	{Opt_notify_timeout, "notify_timeout=%d"},
	/* int args above */
	/* string args above */
	{-1, NULL}
};
359
/*
 * Parse one rbd option (callback passed to ceph_parse_options()).
 *
 * @c: a single option string, e.g. "notify_timeout=10"
 * @private: the struct rbd_options being filled in
 *
 * Returns 0 on success, -EINVAL for an unknown token, or the
 * match_int() error for a malformed integer argument.
 */
static int parse_rbd_opts_token(char *c, void *private)
{
	struct rbd_options *rbd_opts = private;
	substring_t argstr[MAX_OPT_ARGS];
	int token, intval, ret;

	token = match_token(c, rbd_opts_tokens, argstr);
	if (token < 0)
		return -EINVAL;

	/* decode the argument according to the token's class */
	if (token < Opt_last_int) {
		ret = match_int(&argstr[0], &intval);
		if (ret < 0) {
			pr_err("bad mount option arg (not int) "
			       "at '%s'\n", c);
			return ret;
		}
		dout("got int token %d val %d\n", token, intval);
	} else if (token > Opt_last_int && token < Opt_last_string) {
		dout("got string token %d val %s\n", token,
		     argstr[0].from);
	} else {
		dout("got token %d\n", token);
	}

	switch (token) {
	case Opt_notify_timeout:
		rbd_opts->notify_timeout = intval;
		break;
	default:
		BUG_ON(token);	/* every token in the table must be handled */
	}
	return 0;
}
394
/*
 * Get a ceph client with specific addr and configuration, if one does
 * not exist create it.
 *
 * Returns a referenced client or ERR_PTR().  The parsed ceph options
 * are consumed on every path: destroyed here when an existing client
 * is reused or parsing fails, otherwise handed to rbd_client_create().
 */
static struct rbd_client *rbd_get_client(const char *mon_addr,
					 size_t mon_addr_len,
					 char *options)
{
	struct rbd_client *rbdc;
	struct ceph_options *ceph_opts;
	struct rbd_options *rbd_opts;

	rbd_opts = kzalloc(sizeof(*rbd_opts), GFP_KERNEL);
	if (!rbd_opts)
		return ERR_PTR(-ENOMEM);

	rbd_opts->notify_timeout = RBD_NOTIFY_TIMEOUT_DEFAULT;

	ceph_opts = ceph_parse_options(options, mon_addr,
				       mon_addr + mon_addr_len,
				       parse_rbd_opts_token, rbd_opts);
	if (IS_ERR(ceph_opts)) {
		kfree(rbd_opts);
		return ERR_CAST(ceph_opts);
	}

	spin_lock(&rbd_client_list_lock);
	rbdc = __rbd_client_find(ceph_opts);
	if (rbdc) {
		/* using an existing client */
		kref_get(&rbdc->kref);
		spin_unlock(&rbd_client_list_lock);

		/* the shared client has its own copies of both */
		ceph_destroy_options(ceph_opts);
		kfree(rbd_opts);

		return rbdc;
	}
	spin_unlock(&rbd_client_list_lock);

	rbdc = rbd_client_create(ceph_opts, rbd_opts);

	/* rbd_client_create() consumed ceph_opts; rbd_opts is still ours
	 * on failure */
	if (IS_ERR(rbdc))
		kfree(rbd_opts);

	return rbdc;
}
442
/*
 * Destroy ceph client (kref release callback).
 *
 * Takes rbd_client_list_lock itself to unlink the client from the
 * global list, so callers of kref_put() must NOT hold that lock.
 * (An earlier version required the caller to hold it; the old header
 * comment was stale.)
 */
static void rbd_client_release(struct kref *kref)
{
	struct rbd_client *rbdc = container_of(kref, struct rbd_client, kref);

	dout("rbd_release_client %p\n", rbdc);
	spin_lock(&rbd_client_list_lock);
	list_del(&rbdc->node);
	spin_unlock(&rbd_client_list_lock);

	ceph_destroy_client(rbdc->client);
	kfree(rbdc->rbd_opts);
	kfree(rbdc);
}
461
/*
 * Drop reference to ceph client node. If it's not referenced anymore, release
 * it.
 */
static void rbd_put_client(struct rbd_device *rbd_dev)
{
	kref_put(&rbd_dev->rbd_client->kref, rbd_client_release);
	rbd_dev->rbd_client = NULL;	/* guard against use after drop */
}
471
/*
 * Destroy requests collection (kref release callback).
 */
static void rbd_coll_release(struct kref *kref)
{
	struct rbd_req_coll *coll =
		container_of(kref, struct rbd_req_coll, kref);

	dout("rbd_coll_release %p\n", coll);
	kfree(coll);
}
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700483
Alex Elder8e94af82012-07-25 09:32:40 -0500484static bool rbd_dev_ondisk_valid(struct rbd_image_header_ondisk *ondisk)
485{
486 return !memcmp(&ondisk->text,
487 RBD_HEADER_TEXT, sizeof (RBD_HEADER_TEXT));
488}
489
/*
 * Create a new header structure, translate header format from the on-disk
 * header.
 *
 * @header: in-memory header to fill in (all buffers allocated here)
 * @ondisk: raw on-disk header, little-endian fields
 * @allocated_snaps: number of snapshot entries present in @ondisk
 * @gfp_flags: allocation flags
 *
 * Returns 0, -ENXIO for bad magic, -EINVAL for an absurd snapshot
 * count, or -ENOMEM.  On failure nothing remains allocated.
 *
 * NOTE(review): snap_names_len is taken from the on-disk header with
 * no upper-bound validation before kmalloc() — confirm a corrupt
 * header cannot force a huge allocation.
 * NOTE(review): when allocated_snaps != snap_count the snap id/size/
 * name buffers are left uninitialized; presumably the caller re-reads
 * the header with the right count — confirm against callers.
 */
static int rbd_header_from_disk(struct rbd_image_header *header,
				 struct rbd_image_header_ondisk *ondisk,
				 u32 allocated_snaps,
				 gfp_t gfp_flags)
{
	u32 i, snap_count;

	if (!rbd_dev_ondisk_valid(ondisk))
		return -ENXIO;

	snap_count = le32_to_cpu(ondisk->snap_count);
	/* reject counts that would overflow the snapc allocation below */
	if (snap_count > (UINT_MAX - sizeof(struct ceph_snap_context))
			 / sizeof (*ondisk))
		return -EINVAL;
	header->snapc = kmalloc(sizeof(struct ceph_snap_context) +
				snap_count * sizeof(u64),
				gfp_flags);
	if (!header->snapc)
		return -ENOMEM;

	header->snap_names_len = le64_to_cpu(ondisk->snap_names_len);
	if (snap_count) {
		header->snap_names = kmalloc(header->snap_names_len,
					     gfp_flags);
		if (!header->snap_names)
			goto err_snapc;
		header->snap_sizes = kmalloc(snap_count * sizeof(u64),
					     gfp_flags);
		if (!header->snap_sizes)
			goto err_names;
	} else {
		header->snap_names = NULL;
		header->snap_sizes = NULL;
	}

	/* object prefix is copied and explicitly NUL-terminated */
	header->object_prefix = kmalloc(sizeof (ondisk->block_name) + 1,
					gfp_flags);
	if (!header->object_prefix)
		goto err_sizes;

	memcpy(header->object_prefix, ondisk->block_name,
	       sizeof(ondisk->block_name));
	header->object_prefix[sizeof (ondisk->block_name)] = '\0';

	header->image_size = le64_to_cpu(ondisk->image_size);
	header->obj_order = ondisk->options.order;
	header->crypt_type = ondisk->options.crypt_type;
	header->comp_type = ondisk->options.comp_type;

	atomic_set(&header->snapc->nref, 1);
	header->snapc->seq = le64_to_cpu(ondisk->snap_seq);
	header->snapc->num_snaps = snap_count;
	header->total_snaps = snap_count;

	if (snap_count && allocated_snaps == snap_count) {
		for (i = 0; i < snap_count; i++) {
			header->snapc->snaps[i] =
				le64_to_cpu(ondisk->snaps[i].id);
			header->snap_sizes[i] =
				le64_to_cpu(ondisk->snaps[i].image_size);
		}

		/* copy snapshot names (packed right after the snap array) */
		memcpy(header->snap_names, &ondisk->snaps[i],
			header->snap_names_len);
	}

	return 0;

err_sizes:
	kfree(header->snap_sizes);
err_names:
	kfree(header->snap_names);
err_snapc:
	kfree(header->snapc);
	return -ENOMEM;
}
571
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700572static int snap_by_name(struct rbd_image_header *header, const char *snap_name,
573 u64 *seq, u64 *size)
574{
575 int i;
576 char *p = header->snap_names;
577
Alex Elder00f1f362012-02-07 12:03:36 -0600578 for (i = 0; i < header->total_snaps; i++) {
579 if (!strcmp(snap_name, p)) {
580
581 /* Found it. Pass back its id and/or size */
582
583 if (seq)
584 *seq = header->snapc->snaps[i];
585 if (size)
586 *size = header->snap_sizes[i];
587 return i;
588 }
589 p += strlen(p) + 1; /* Skip ahead to the next name */
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700590 }
Alex Elder00f1f362012-02-07 12:03:36 -0600591 return -ENOENT;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700592}
593
/*
 * Set the device's snapshot state from rbd_dev->snap_name.
 *
 * Mapping the special head name makes the device writable at the
 * current image size; any other name is looked up and mapped
 * read-only.  If @size is non-NULL it receives the mapped size.
 * Returns 0, or a negative errno from snap_by_name().
 */
static int rbd_header_set_snap(struct rbd_device *rbd_dev, u64 *size)
{
	int ret;

	down_write(&rbd_dev->header_rwsem);

	if (!memcmp(rbd_dev->snap_name, RBD_SNAP_HEAD_NAME,
		    sizeof (RBD_SNAP_HEAD_NAME))) {
		/* mapping the live image ("head"), not a snapshot */
		rbd_dev->snap_id = CEPH_NOSNAP;
		rbd_dev->snap_exists = false;
		rbd_dev->read_only = 0;
		if (size)
			*size = rbd_dev->header.image_size;
	} else {
		u64 snap_id = 0;

		ret = snap_by_name(&rbd_dev->header, rbd_dev->snap_name,
				   &snap_id, size);
		if (ret < 0)
			goto done;
		rbd_dev->snap_id = snap_id;
		rbd_dev->snap_exists = true;
		rbd_dev->read_only = 1;	/* snapshots are immutable */
	}

	ret = 0;
done:
	up_write(&rbd_dev->header_rwsem);
	return ret;
}
624
/* Free everything allocated by rbd_header_from_disk(). */
static void rbd_header_free(struct rbd_image_header *header)
{
	kfree(header->object_prefix);
	kfree(header->snap_sizes);
	kfree(header->snap_names);
	ceph_put_snap_context(header->snapc);	/* refcounted, not kfree'd */
}
632
633/*
634 * get the actual striped segment name, offset and length
635 */
636static u64 rbd_get_segment(struct rbd_image_header *header,
Alex Elderca1e49a2012-07-10 20:30:09 -0500637 const char *object_prefix,
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700638 u64 ofs, u64 len,
639 char *seg_name, u64 *segofs)
640{
641 u64 seg = ofs >> header->obj_order;
642
643 if (seg_name)
644 snprintf(seg_name, RBD_MAX_SEG_NAME_LEN,
Alex Elderca1e49a2012-07-10 20:30:09 -0500645 "%s.%012llx", object_prefix, seg);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700646
647 ofs = ofs & ((1 << header->obj_order) - 1);
648 len = min_t(u64, len, (1 << header->obj_order) - ofs);
649
650 if (segofs)
651 *segofs = ofs;
652
653 return len;
654}
655
Yehuda Sadeh1fec7092011-05-13 13:52:56 -0700656static int rbd_get_num_segments(struct rbd_image_header *header,
657 u64 ofs, u64 len)
658{
659 u64 start_seg = ofs >> header->obj_order;
660 u64 end_seg = (ofs + len - 1) >> header->obj_order;
661 return end_seg - start_seg + 1;
662}
663
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700664/*
Josh Durgin029bcbd2011-07-22 11:35:23 -0700665 * returns the size of an object in the image
666 */
667static u64 rbd_obj_bytes(struct rbd_image_header *header)
668{
669 return 1 << header->obj_order;
670}
671
672/*
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700673 * bio helpers
674 */
675
676static void bio_chain_put(struct bio *chain)
677{
678 struct bio *tmp;
679
680 while (chain) {
681 tmp = chain;
682 chain = chain->bi_next;
683 bio_put(tmp);
684 }
685}
686
/*
 * zeros a bio chain, starting at specific offset
 *
 * @start_ofs is a byte offset counted across the whole chain; data
 * before it is untouched, everything after is cleared.  Presumably
 * used to zero the tail of a short read — confirm against callers.
 */
static void zero_bio_chain(struct bio *chain, int start_ofs)
{
	struct bio_vec *bv;
	unsigned long flags;
	void *buf;
	int i;
	int pos = 0;	/* running byte offset across the chain */

	while (chain) {
		bio_for_each_segment(bv, chain, i) {
			if (pos + bv->bv_len > start_ofs) {
				/* zero from start_ofs (or segment start) onward */
				int remainder = max(start_ofs - pos, 0);
				buf = bvec_kmap_irq(bv, &flags);
				memset(buf + remainder, 0,
				       bv->bv_len - remainder);
				bvec_kunmap_irq(buf, &flags);
			}
			pos += bv->bv_len;
		}

		chain = chain->bi_next;
	}
}
713
/*
 * bio_chain_clone - clone a chain of bios up to a certain length.
 * might return a bio_pair that will need to be released.
 *
 * Clones bios from *old until @len bytes are covered.  On return,
 * *old points at the first unconsumed bio and *next at the
 * continuation point (the second half of a split, or the next bio).
 * Returns the new chain, or NULL on failure (the partial chain is
 * freed).
 *
 * NOTE(review): if bio_split() fails, the just-allocated tmp bio is
 * leaked — err_out only releases new_chain.
 * NOTE(review): the inner "struct bio_pair *bp" shadows the *bp
 * out-parameter, so the split pair is reachable only via *next and
 * *bp is never updated — confirm the caller's release path.
 */
static struct bio *bio_chain_clone(struct bio **old, struct bio **next,
				   struct bio_pair **bp,
				   int len, gfp_t gfpmask)
{
	struct bio *tmp, *old_chain = *old, *new_chain = NULL, *tail = NULL;
	int total = 0;

	/* release any split left over from a previous call */
	if (*bp) {
		bio_pair_release(*bp);
		*bp = NULL;
	}

	while (old_chain && (total < len)) {
		tmp = bio_kmalloc(gfpmask, old_chain->bi_max_vecs);
		if (!tmp)
			goto err_out;

		if (total + old_chain->bi_size > len) {
			struct bio_pair *bp;

			/*
			 * this split can only happen with a single paged bio,
			 * split_bio will BUG_ON if this is not the case
			 */
			dout("bio_chain_clone split! total=%d remaining=%d"
			     "bi_size=%u\n",
			     total, len - total, old_chain->bi_size);

			/* split the bio. We'll release it either in the next
			   call, or it will have to be released outside */
			bp = bio_split(old_chain, (len - total) / SECTOR_SIZE);
			if (!bp)
				goto err_out;

			__bio_clone(tmp, &bp->bio1);

			*next = &bp->bio2;
		} else {
			__bio_clone(tmp, old_chain);
			*next = old_chain->bi_next;
		}

		tmp->bi_bdev = NULL;
		/* only the first allocation may block */
		gfpmask &= ~__GFP_WAIT;
		tmp->bi_next = NULL;

		if (!new_chain) {
			new_chain = tail = tmp;
		} else {
			tail->bi_next = tmp;
			tail = tmp;
		}
		old_chain = old_chain->bi_next;

		total += tmp->bi_size;
	}

	BUG_ON(total < len);

	if (tail)
		tail->bi_next = NULL;

	*old = old_chain;

	return new_chain;

err_out:
	dout("bio_chain_clone with err\n");
	bio_chain_put(new_chain);
	return NULL;
}
789
790/*
791 * helpers for osd request op vectors.
792 */
793static int rbd_create_rw_ops(struct ceph_osd_req_op **ops,
794 int num_ops,
795 int opcode,
796 u32 payload_len)
797{
798 *ops = kzalloc(sizeof(struct ceph_osd_req_op) * (num_ops + 1),
799 GFP_NOIO);
800 if (!*ops)
801 return -ENOMEM;
802 (*ops)[0].op = opcode;
803 /*
804 * op extent offset and length will be set later on
805 * in calc_raw_layout()
806 */
807 (*ops)[0].payload_len = payload_len;
808 return 0;
809}
810
/* Free an op vector allocated by rbd_create_rw_ops(). */
static void rbd_destroy_ops(struct ceph_osd_req_op *ops)
{
	kfree(ops);
}
815
/*
 * Record completion of sub-request @index of @coll, then complete —
 * in order — any leading run of finished sub-requests against the
 * block request @rq.
 *
 * With no collection (single sub-request) the request is completed
 * immediately.  Each completed slot drops one collection reference.
 */
static void rbd_coll_end_req_index(struct request *rq,
				   struct rbd_req_coll *coll,
				   int index,
				   int ret, u64 len)
{
	struct request_queue *q;
	int min, max, i;

	dout("rbd_coll_end_req_index %p index %d ret %d len %llu\n",
	     coll, index, ret, (unsigned long long) len);

	if (!rq)
		return;

	if (!coll) {
		blk_end_request(rq, ret, len);
		return;
	}

	q = rq->q;

	/* queue_lock serializes status[] updates and in-order completion */
	spin_lock_irq(q->queue_lock);
	coll->status[index].done = 1;
	coll->status[index].rc = ret;
	coll->status[index].bytes = len;
	max = min = coll->num_done;
	/* extend the contiguous run of finished sub-requests */
	while (max < coll->total && coll->status[max].done)
		max++;

	for (i = min; i < max; i++) {
		__blk_end_request(rq, coll->status[i].rc,
				  coll->status[i].bytes);
		coll->num_done++;
		kref_put(&coll->kref, rbd_coll_release);
	}
	spin_unlock_irq(q->queue_lock);
}
853
/* Convenience wrapper: complete the sub-request described by @req. */
static void rbd_coll_end_req(struct rbd_request *req,
			     int ret, u64 len)
{
	rbd_coll_end_req_index(req->rq, req->coll, req->coll_index, ret, len);
}
859
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700860/*
861 * Send ceph osd request
862 */
863static int rbd_do_request(struct request *rq,
Alex Elder0ce1a792012-07-03 16:01:18 -0500864 struct rbd_device *rbd_dev,
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700865 struct ceph_snap_context *snapc,
866 u64 snapid,
Alex Elderaded07e2012-07-03 16:01:18 -0500867 const char *object_name, u64 ofs, u64 len,
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700868 struct bio *bio,
869 struct page **pages,
870 int num_pages,
871 int flags,
872 struct ceph_osd_req_op *ops,
Yehuda Sadeh1fec7092011-05-13 13:52:56 -0700873 struct rbd_req_coll *coll,
874 int coll_index,
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700875 void (*rbd_cb)(struct ceph_osd_request *req,
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700876 struct ceph_msg *msg),
877 struct ceph_osd_request **linger_req,
878 u64 *ver)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700879{
880 struct ceph_osd_request *req;
881 struct ceph_file_layout *layout;
882 int ret;
883 u64 bno;
884 struct timespec mtime = CURRENT_TIME;
885 struct rbd_request *req_data;
886 struct ceph_osd_request_head *reqhead;
Alex Elder1dbb4392012-01-24 10:08:37 -0600887 struct ceph_osd_client *osdc;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700888
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700889 req_data = kzalloc(sizeof(*req_data), GFP_NOIO);
Yehuda Sadeh1fec7092011-05-13 13:52:56 -0700890 if (!req_data) {
891 if (coll)
892 rbd_coll_end_req_index(rq, coll, coll_index,
893 -ENOMEM, len);
894 return -ENOMEM;
895 }
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700896
Yehuda Sadeh1fec7092011-05-13 13:52:56 -0700897 if (coll) {
898 req_data->coll = coll;
899 req_data->coll_index = coll_index;
900 }
901
Alex Elderbd919d42012-07-13 20:35:11 -0500902 dout("rbd_do_request object_name=%s ofs=%llu len=%llu\n", object_name,
903 (unsigned long long) ofs, (unsigned long long) len);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700904
Alex Elder0ce1a792012-07-03 16:01:18 -0500905 osdc = &rbd_dev->rbd_client->client->osdc;
Alex Elder1dbb4392012-01-24 10:08:37 -0600906 req = ceph_osdc_alloc_request(osdc, flags, snapc, ops,
907 false, GFP_NOIO, pages, bio);
Sage Weil4ad12622011-05-03 09:23:36 -0700908 if (!req) {
Sage Weil4ad12622011-05-03 09:23:36 -0700909 ret = -ENOMEM;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700910 goto done_pages;
911 }
912
913 req->r_callback = rbd_cb;
914
915 req_data->rq = rq;
916 req_data->bio = bio;
917 req_data->pages = pages;
918 req_data->len = len;
919
920 req->r_priv = req_data;
921
922 reqhead = req->r_request->front.iov_base;
923 reqhead->snapid = cpu_to_le64(CEPH_NOSNAP);
924
Alex Elderaded07e2012-07-03 16:01:18 -0500925 strncpy(req->r_oid, object_name, sizeof(req->r_oid));
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700926 req->r_oid_len = strlen(req->r_oid);
927
928 layout = &req->r_file_layout;
929 memset(layout, 0, sizeof(*layout));
930 layout->fl_stripe_unit = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
931 layout->fl_stripe_count = cpu_to_le32(1);
932 layout->fl_object_size = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
Alex Elder0ce1a792012-07-03 16:01:18 -0500933 layout->fl_pg_pool = cpu_to_le32(rbd_dev->pool_id);
Alex Elder1dbb4392012-01-24 10:08:37 -0600934 ceph_calc_raw_layout(osdc, layout, snapid, ofs, &len, &bno,
935 req, ops);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700936
937 ceph_osdc_build_request(req, ofs, &len,
938 ops,
939 snapc,
940 &mtime,
941 req->r_oid, req->r_oid_len);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700942
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700943 if (linger_req) {
Alex Elder1dbb4392012-01-24 10:08:37 -0600944 ceph_osdc_set_request_linger(osdc, req);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700945 *linger_req = req;
946 }
947
Alex Elder1dbb4392012-01-24 10:08:37 -0600948 ret = ceph_osdc_start_request(osdc, req, false);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700949 if (ret < 0)
950 goto done_err;
951
952 if (!rbd_cb) {
Alex Elder1dbb4392012-01-24 10:08:37 -0600953 ret = ceph_osdc_wait_request(osdc, req);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700954 if (ver)
955 *ver = le64_to_cpu(req->r_reassert_version.version);
Alex Elderbd919d42012-07-13 20:35:11 -0500956 dout("reassert_ver=%llu\n",
957 (unsigned long long)
958 le64_to_cpu(req->r_reassert_version.version));
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700959 ceph_osdc_put_request(req);
960 }
961 return ret;
962
963done_err:
964 bio_chain_put(req_data->bio);
965 ceph_osdc_put_request(req);
966done_pages:
Yehuda Sadeh1fec7092011-05-13 13:52:56 -0700967 rbd_coll_end_req(req_data, ret, len);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700968 kfree(req_data);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700969 return ret;
970}
971
/*
 * Ceph osd op callback
 *
 * Completion handler for asynchronous rbd requests: decode the reply,
 * patch up reads of missing/short objects by zero-filling the bio
 * chain, complete the collection entry, then release the bio chain,
 * the osd request, and the per-request bookkeeping structure.
 */
static void rbd_req_cb(struct ceph_osd_request *req, struct ceph_msg *msg)
{
	struct rbd_request *req_data = req->r_priv;
	struct ceph_osd_reply_head *replyhead;
	struct ceph_osd_op *op;
	__s32 rc;
	u64 bytes;
	int read_op;

	/* parse reply */
	replyhead = msg->front.iov_base;
	WARN_ON(le32_to_cpu(replyhead->num_ops) == 0);
	op = (void *)(replyhead + 1);
	rc = le32_to_cpu(replyhead->result);
	bytes = le64_to_cpu(op->extent.length);
	read_op = (le16_to_cpu(op->op) == CEPH_OSD_OP_READ);

	dout("rbd_req_cb bytes=%llu readop=%d rc=%d\n",
	     (unsigned long long) bytes, read_op, (int) rc);

	/* a read of a nonexistent object is all zeroes; a short read
	 * has its tail zero-filled and is reported as full length */
	if (rc == -ENOENT && read_op) {
		zero_bio_chain(req_data->bio, 0);
		rc = 0;
	} else if (rc == 0 && read_op && bytes < req_data->len) {
		zero_bio_chain(req_data->bio, bytes);
		bytes = req_data->len;
	}

	rbd_coll_end_req(req_data, rc, bytes);

	if (req_data->bio)
		bio_chain_put(req_data->bio);

	ceph_osdc_put_request(req);
	kfree(req_data);
}
1011
/*
 * Minimal completion callback: drop the osd request reference and
 * nothing else (used for fire-and-forget requests such as notify-ack).
 */
static void rbd_simple_req_cb(struct ceph_osd_request *req, struct ceph_msg *msg)
{
	ceph_osdc_put_request(req);
}
1016
/*
 * Do a synchronous ceph osd operation
 *
 * Allocates a page vector to carry the data, optionally builds a
 * single-op vector (when @orig_ops is NULL), copies @buf into the
 * pages for writes, issues the request synchronously through
 * rbd_do_request(), and copies read data back out into @buf.
 *
 * Ownership: an ops vector we allocated here is freed here; a
 * caller-supplied @orig_ops is left for the caller to free.
 * Returns bytes transferred (via rbd_do_request's return) or a
 * negative errno.
 */
static int rbd_req_sync_op(struct rbd_device *rbd_dev,
			   struct ceph_snap_context *snapc,
			   u64 snapid,
			   int opcode,
			   int flags,
			   struct ceph_osd_req_op *orig_ops,
			   const char *object_name,
			   u64 ofs, u64 len,
			   char *buf,
			   struct ceph_osd_request **linger_req,
			   u64 *ver)
{
	int ret;
	struct page **pages;
	int num_pages;
	struct ceph_osd_req_op *ops = orig_ops;
	u32 payload_len;

	num_pages = calc_pages_for(ofs , len);
	pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL);
	if (IS_ERR(pages))
		return PTR_ERR(pages);

	if (!orig_ops) {
		/* build a single read or write op ourselves */
		payload_len = (flags & CEPH_OSD_FLAG_WRITE ? len : 0);
		ret = rbd_create_rw_ops(&ops, 1, opcode, payload_len);
		if (ret < 0)
			goto done;

		if ((flags & CEPH_OSD_FLAG_WRITE) && buf) {
			ret = ceph_copy_to_page_vector(pages, buf, ofs, len);
			if (ret < 0)
				goto done_ops;
		}
	}

	/* rbd_cb == NULL makes rbd_do_request synchronous */
	ret = rbd_do_request(NULL, rbd_dev, snapc, snapid,
			  object_name, ofs, len, NULL,
			  pages, num_pages,
			  flags,
			  ops,
			  NULL, 0,
			  NULL,
			  linger_req, ver);
	if (ret < 0)
		goto done_ops;

	/* ret is the number of bytes the osd returned */
	if ((flags & CEPH_OSD_FLAG_READ) && buf)
		ret = ceph_copy_from_page_vector(pages, buf, ofs, ret);

done_ops:
	if (!orig_ops)
		rbd_destroy_ops(ops);
done:
	ceph_release_page_vector(pages, num_pages);
	return ret;
}
1077
/*
 * Do an asynchronous ceph osd operation
 *
 * Maps the byte range [@ofs, @ofs+@len) to its containing rbd object
 * (segment), builds a single-op request, and submits it with
 * rbd_req_cb as the completion handler.  The range must not cross a
 * segment boundary; the caller (rbd_rq_fn) has already split the bio
 * chain accordingly.
 */
static int rbd_do_op(struct request *rq,
		     struct rbd_device *rbd_dev,
		     struct ceph_snap_context *snapc,
		     u64 snapid,
		     int opcode, int flags,
		     u64 ofs, u64 len,
		     struct bio *bio,
		     struct rbd_req_coll *coll,
		     int coll_index)
{
	char *seg_name;
	u64 seg_ofs;
	u64 seg_len;
	int ret;
	struct ceph_osd_req_op *ops;
	u32 payload_len;

	seg_name = kmalloc(RBD_MAX_SEG_NAME_LEN + 1, GFP_NOIO);
	if (!seg_name)
		return -ENOMEM;

	/* translate image offset/length into object name + extent */
	seg_len = rbd_get_segment(&rbd_dev->header,
				  rbd_dev->header.object_prefix,
				  ofs, len,
				  seg_name, &seg_ofs);

	payload_len = (flags & CEPH_OSD_FLAG_WRITE ? seg_len : 0);

	ret = rbd_create_rw_ops(&ops, 1, opcode, payload_len);
	if (ret < 0)
		goto done;

	/* we've taken care of segment sizes earlier when we
	   cloned the bios. We should never have a segment
	   truncated at this point */
	BUG_ON(seg_len < len);

	ret = rbd_do_request(rq, rbd_dev, snapc, snapid,
			     seg_name, seg_ofs, seg_len,
			     bio,
			     NULL, 0,
			     flags,
			     ops,
			     coll, coll_index,
			     rbd_req_cb, 0, NULL);

	/* the ops vector is copied by the request machinery; free ours */
	rbd_destroy_ops(ops);
done:
	kfree(seg_name);
	return ret;
}
1132
1133/*
1134 * Request async osd write
1135 */
1136static int rbd_req_write(struct request *rq,
1137 struct rbd_device *rbd_dev,
1138 struct ceph_snap_context *snapc,
1139 u64 ofs, u64 len,
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001140 struct bio *bio,
1141 struct rbd_req_coll *coll,
1142 int coll_index)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001143{
1144 return rbd_do_op(rq, rbd_dev, snapc, CEPH_NOSNAP,
1145 CEPH_OSD_OP_WRITE,
1146 CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001147 ofs, len, bio, coll, coll_index);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001148}
1149
1150/*
1151 * Request async osd read
1152 */
1153static int rbd_req_read(struct request *rq,
1154 struct rbd_device *rbd_dev,
1155 u64 snapid,
1156 u64 ofs, u64 len,
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001157 struct bio *bio,
1158 struct rbd_req_coll *coll,
1159 int coll_index)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001160{
1161 return rbd_do_op(rq, rbd_dev, NULL,
Josh Durginb06e6a62011-11-21 18:16:52 -08001162 snapid,
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001163 CEPH_OSD_OP_READ,
1164 CEPH_OSD_FLAG_READ,
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001165 ofs, len, bio, coll, coll_index);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001166}
1167
1168/*
1169 * Request sync osd read
1170 */
Alex Elder0ce1a792012-07-03 16:01:18 -05001171static int rbd_req_sync_read(struct rbd_device *rbd_dev,
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001172 struct ceph_snap_context *snapc,
1173 u64 snapid,
Alex Elderaded07e2012-07-03 16:01:18 -05001174 const char *object_name,
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001175 u64 ofs, u64 len,
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001176 char *buf,
1177 u64 *ver)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001178{
Alex Elder0ce1a792012-07-03 16:01:18 -05001179 return rbd_req_sync_op(rbd_dev, NULL,
Josh Durginb06e6a62011-11-21 18:16:52 -08001180 snapid,
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001181 CEPH_OSD_OP_READ,
1182 CEPH_OSD_FLAG_READ,
1183 NULL,
Alex Elderd1f57ea2012-06-26 12:57:03 -07001184 object_name, ofs, len, buf, NULL, ver);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001185}
1186
/*
 * Acknowledge a notify received on @object_name (the header object).
 * Sent asynchronously with the fire-and-forget rbd_simple_req_cb,
 * despite the "sync" in the name.
 */
static int rbd_req_sync_notify_ack(struct rbd_device *rbd_dev,
				   u64 ver,
				   u64 notify_id,
				   const char *object_name)
{
	struct ceph_osd_req_op *ops;
	int ret;

	ret = rbd_create_rw_ops(&ops, 1, CEPH_OSD_OP_NOTIFY_ACK, 0);
	if (ret < 0)
		return ret;

	ops[0].watch.ver = cpu_to_le64(ver);
	/* NOTE(review): cookie is not byte-swapped here, unlike ver
	 * above — presumably notify_id arrives already wire-ordered;
	 * confirm against the osd_client notify path */
	ops[0].watch.cookie = notify_id;
	ops[0].watch.flag = 0;

	ret = rbd_do_request(NULL, rbd_dev, NULL, CEPH_NOSNAP,
			  object_name, 0, 0, NULL,
			  NULL, 0,
			  CEPH_OSD_FLAG_READ,
			  ops,
			  NULL, 0,
			  rbd_simple_req_cb, 0, NULL);

	rbd_destroy_ops(ops);
	return ret;
}
1217
1218static void rbd_watch_cb(u64 ver, u64 notify_id, u8 opcode, void *data)
1219{
Alex Elder0ce1a792012-07-03 16:01:18 -05001220 struct rbd_device *rbd_dev = (struct rbd_device *)data;
Josh Durgina71b8912011-12-05 18:10:44 -08001221 u64 hver;
Sage Weil13143d22011-05-12 16:08:30 -07001222 int rc;
1223
Alex Elder0ce1a792012-07-03 16:01:18 -05001224 if (!rbd_dev)
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001225 return;
1226
Alex Elderbd919d42012-07-13 20:35:11 -05001227 dout("rbd_watch_cb %s notify_id=%llu opcode=%u\n",
1228 rbd_dev->header_name, (unsigned long long) notify_id,
1229 (unsigned int) opcode);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001230 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
Alex Elder0ce1a792012-07-03 16:01:18 -05001231 rc = __rbd_refresh_header(rbd_dev);
Josh Durgina71b8912011-12-05 18:10:44 -08001232 hver = rbd_dev->header.obj_version;
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001233 mutex_unlock(&ctl_mutex);
Sage Weil13143d22011-05-12 16:08:30 -07001234 if (rc)
Alex Elderf0f8cef2012-01-29 13:57:44 -06001235 pr_warning(RBD_DRV_NAME "%d got notification but failed to "
Alex Elder0ce1a792012-07-03 16:01:18 -05001236 " update snaps: %d\n", rbd_dev->major, rc);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001237
Josh Durgina71b8912011-12-05 18:10:44 -08001238 rbd_req_sync_notify_ack(rbd_dev, hver, notify_id, rbd_dev->header_name);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001239}
1240
/*
 * Request sync osd watch
 *
 * Register a lingering watch on @object_name so rbd_watch_cb() runs
 * whenever another client updates the header.  Stores the event and
 * the lingering request in the rbd_device; on failure both are torn
 * down again.
 */
static int rbd_req_sync_watch(struct rbd_device *rbd_dev,
			      const char *object_name,
			      u64 ver)
{
	struct ceph_osd_req_op *ops;
	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;

	int ret = rbd_create_rw_ops(&ops, 1, CEPH_OSD_OP_WATCH, 0);
	if (ret < 0)
		return ret;

	/* the event delivers notifies to rbd_watch_cb with rbd_dev */
	ret = ceph_osdc_create_event(osdc, rbd_watch_cb, 0,
				     (void *)rbd_dev, &rbd_dev->watch_event);
	if (ret < 0)
		goto fail;

	ops[0].watch.ver = cpu_to_le64(ver);
	ops[0].watch.cookie = cpu_to_le64(rbd_dev->watch_event->cookie);
	ops[0].watch.flag = 1;	/* 1 == register the watch */

	/* lingering request: stays active so the watch persists */
	ret = rbd_req_sync_op(rbd_dev, NULL,
			      CEPH_NOSNAP,
			      0,
			      CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
			      ops,
			      object_name, 0, 0, NULL,
			      &rbd_dev->watch_request, NULL);

	if (ret < 0)
		goto fail_event;

	rbd_destroy_ops(ops);
	return 0;

fail_event:
	ceph_osdc_cancel_event(rbd_dev->watch_event);
	rbd_dev->watch_event = NULL;
fail:
	rbd_destroy_ops(ops);
	return ret;
}
1285
/*
 * Request sync osd unwatch
 *
 * Deregister the watch on @object_name (flag == 0) and cancel the
 * associated event.  Assumes rbd_req_sync_watch() succeeded earlier:
 * rbd_dev->watch_event is dereferenced unconditionally.
 */
static int rbd_req_sync_unwatch(struct rbd_device *rbd_dev,
				const char *object_name)
{
	struct ceph_osd_req_op *ops;

	int ret = rbd_create_rw_ops(&ops, 1, CEPH_OSD_OP_WATCH, 0);
	if (ret < 0)
		return ret;

	ops[0].watch.ver = 0;
	ops[0].watch.cookie = cpu_to_le64(rbd_dev->watch_event->cookie);
	ops[0].watch.flag = 0;	/* 0 == remove the watch */

	ret = rbd_req_sync_op(rbd_dev, NULL,
			      CEPH_NOSNAP,
			      0,
			      CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
			      ops,
			      object_name, 0, 0, NULL, NULL, NULL);

	/* event is cancelled even if the unwatch op failed */
	rbd_destroy_ops(ops);
	ceph_osdc_cancel_event(rbd_dev->watch_event);
	rbd_dev->watch_event = NULL;
	return ret;
}
1314
/*
 * Context passed (stack-allocated) to rbd_notify_cb() via
 * ceph_osdc_create_event() while rbd_req_sync_notify() waits for
 * the notify round trip to complete.
 */
struct rbd_notify_info {
	struct rbd_device *rbd_dev;	/* device that sent the notify */
};
1318
1319static void rbd_notify_cb(u64 ver, u64 notify_id, u8 opcode, void *data)
1320{
Alex Elder0ce1a792012-07-03 16:01:18 -05001321 struct rbd_device *rbd_dev = (struct rbd_device *)data;
1322 if (!rbd_dev)
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001323 return;
1324
Alex Elderbd919d42012-07-13 20:35:11 -05001325 dout("rbd_notify_cb %s notify_id=%llu opcode=%u\n",
1326 rbd_dev->header_name, (unsigned long long) notify_id,
1327 (unsigned int) opcode);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001328}
1329
/*
 * Request sync osd notify
 *
 * Send a NOTIFY op on @object_name and wait (bounded by the default
 * osd timeout) for all watchers to acknowledge it.
 *
 * NOTE(review): the return value of ceph_osdc_wait_event() is logged
 * but not propagated — the function returns 0 once the op itself
 * succeeded.  Also the event does not appear to be cancelled on the
 * success path; confirm ceph_osdc_wait_event() consumes it.
 */
static int rbd_req_sync_notify(struct rbd_device *rbd_dev,
			       const char *object_name)
{
	struct ceph_osd_req_op *ops;
	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
	struct ceph_osd_event *event;
	struct rbd_notify_info info;
	/* payload: two 32-bit fields (version + timeout) */
	int payload_len = sizeof(u32) + sizeof(u32);
	int ret;

	ret = rbd_create_rw_ops(&ops, 1, CEPH_OSD_OP_NOTIFY, payload_len);
	if (ret < 0)
		return ret;

	info.rbd_dev = rbd_dev;

	/* one-shot event (third arg == 1) completed by the notify ack */
	ret = ceph_osdc_create_event(osdc, rbd_notify_cb, 1,
				     (void *)&info, &event);
	if (ret < 0)
		goto fail;

	ops[0].watch.ver = 1;
	ops[0].watch.flag = 1;
	ops[0].watch.cookie = event->cookie;
	ops[0].watch.prot_ver = RADOS_NOTIFY_VER;
	ops[0].watch.timeout = 12;

	ret = rbd_req_sync_op(rbd_dev, NULL,
			      CEPH_NOSNAP,
			      0,
			      CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
			      ops,
			      object_name, 0, 0, NULL, NULL, NULL);
	if (ret < 0)
		goto fail_event;

	ret = ceph_osdc_wait_event(event, CEPH_OSD_TIMEOUT_DEFAULT);
	dout("ceph_osdc_wait_event returned %d\n", ret);
	rbd_destroy_ops(ops);
	return 0;

fail_event:
	ceph_osdc_cancel_event(event);
fail:
	rbd_destroy_ops(ops);
	return ret;
}
1380
/*
 * Synchronously execute an OSD class method (CEPH_OSD_OP_CALL) on
 * @object_name, e.g. class "rbd" method "snap_add", passing @data as
 * the method's input payload.  (The old comment said "Request sync
 * osd read", which was wrong.)
 *
 * NOTE(review): class_len/method_len are narrowed to __u8 — names
 * longer than 255 bytes would silently truncate; callers only pass
 * short literals.
 */
static int rbd_req_sync_exec(struct rbd_device *rbd_dev,
			     const char *object_name,
			     const char *class_name,
			     const char *method_name,
			     const char *data,
			     int len,
			     u64 *ver)
{
	struct ceph_osd_req_op *ops;
	int class_name_len = strlen(class_name);
	int method_name_len = strlen(method_name);
	int ret = rbd_create_rw_ops(&ops, 1, CEPH_OSD_OP_CALL,
				    class_name_len + method_name_len + len);
	if (ret < 0)
		return ret;

	ops[0].cls.class_name = class_name;
	ops[0].cls.class_len = (__u8) class_name_len;
	ops[0].cls.method_name = method_name;
	ops[0].cls.method_len = (__u8) method_name_len;
	ops[0].cls.argc = 0;
	ops[0].cls.indata = data;
	ops[0].cls.indata_len = len;

	ret = rbd_req_sync_op(rbd_dev, NULL,
			      CEPH_NOSNAP,
			      0,
			      CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
			      ops,
			      object_name, 0, 0, NULL, NULL, ver);

	rbd_destroy_ops(ops);

	dout("cls_exec returned %d\n", ret);
	return ret;
}
1420
Yehuda Sadeh1fec7092011-05-13 13:52:56 -07001421static struct rbd_req_coll *rbd_alloc_coll(int num_reqs)
1422{
1423 struct rbd_req_coll *coll =
1424 kzalloc(sizeof(struct rbd_req_coll) +
1425 sizeof(struct rbd_req_status) * num_reqs,
1426 GFP_ATOMIC);
1427
1428 if (!coll)
1429 return NULL;
1430 coll->total = num_reqs;
1431 kref_init(&coll->kref);
1432 return coll;
1433}
1434
/*
 * block device queue callback
 *
 * Entered with q->queue_lock held.  For each fetched request: filter
 * non-FS and read-only violations, drop the queue lock while doing
 * the (sleepable) snapshot checks and OSD submissions, split the
 * request's bio chain at rbd object boundaries, and issue one async
 * osd op per segment tracked by a shared collection.  The queue lock
 * is reacquired before looping.
 */
static void rbd_rq_fn(struct request_queue *q)
{
	struct rbd_device *rbd_dev = q->queuedata;
	struct request *rq;
	struct bio_pair *bp = NULL;

	while ((rq = blk_fetch_request(q))) {
		struct bio *bio;
		struct bio *rq_bio, *next_bio = NULL;
		bool do_write;
		unsigned int size;
		u64 op_size = 0;
		u64 ofs;
		int num_segs, cur_seg = 0;
		struct rbd_req_coll *coll;
		struct ceph_snap_context *snapc;

		/* peek at request from block layer */
		if (!rq)
			break;

		dout("fetched request\n");

		/* filter out block requests we don't understand */
		if ((rq->cmd_type != REQ_TYPE_FS)) {
			__blk_end_request_all(rq, 0);
			continue;
		}

		/* deduce our operation (read, write) */
		do_write = (rq_data_dir(rq) == WRITE);

		size = blk_rq_bytes(rq);
		ofs = blk_rq_pos(rq) * SECTOR_SIZE;
		rq_bio = rq->bio;
		if (do_write && rbd_dev->read_only) {
			__blk_end_request_all(rq, -EROFS);
			continue;
		}

		/* drop the queue lock: what follows can sleep */
		spin_unlock_irq(q->queue_lock);

		down_read(&rbd_dev->header_rwsem);

		/* mapped snapshot may have been deleted underneath us */
		if (rbd_dev->snap_id != CEPH_NOSNAP && !rbd_dev->snap_exists) {
			up_read(&rbd_dev->header_rwsem);
			dout("request for non-existent snapshot");
			spin_lock_irq(q->queue_lock);
			__blk_end_request_all(rq, -ENXIO);
			continue;
		}

		/* pin the snap context for the life of this request */
		snapc = ceph_get_snap_context(rbd_dev->header.snapc);

		up_read(&rbd_dev->header_rwsem);

		dout("%s 0x%x bytes at 0x%llx\n",
		     do_write ? "write" : "read",
		     size, (unsigned long long) blk_rq_pos(rq) * SECTOR_SIZE);

		num_segs = rbd_get_num_segments(&rbd_dev->header, ofs, size);
		coll = rbd_alloc_coll(num_segs);
		if (!coll) {
			spin_lock_irq(q->queue_lock);
			__blk_end_request_all(rq, -ENOMEM);
			ceph_put_snap_context(snapc);
			continue;
		}

		do {
			/* a bio clone to be passed down to OSD req */
			dout("rq->bio->bi_vcnt=%hu\n", rq->bio->bi_vcnt);
			op_size = rbd_get_segment(&rbd_dev->header,
						  rbd_dev->header.object_prefix,
						  ofs, size,
						  NULL, NULL);
			/* one collection ref per issued segment */
			kref_get(&coll->kref);
			bio = bio_chain_clone(&rq_bio, &next_bio, &bp,
					      op_size, GFP_ATOMIC);
			if (!bio) {
				/* fail just this segment, keep going */
				rbd_coll_end_req_index(rq, coll, cur_seg,
						       -ENOMEM, op_size);
				goto next_seg;
			}


			/* init OSD command: write or read */
			if (do_write)
				rbd_req_write(rq, rbd_dev,
					      snapc,
					      ofs,
					      op_size, bio,
					      coll, cur_seg);
			else
				rbd_req_read(rq, rbd_dev,
					     rbd_dev->snap_id,
					     ofs,
					     op_size, bio,
					     coll, cur_seg);

next_seg:
			size -= op_size;
			ofs += op_size;

			cur_seg++;
			rq_bio = next_bio;
		} while (size > 0);
		/* drop the allocation reference; in-flight segments
		 * each still hold one */
		kref_put(&coll->kref, rbd_coll_release);

		if (bp)
			bio_pair_release(bp);
		spin_lock_irq(q->queue_lock);

		ceph_put_snap_context(snapc);
	}
}
1554
/*
 * a queue callback. Makes sure that we don't create a bio that spans across
 * multiple osd objects. One exception would be with a single page bios,
 * which we handle later at bio_chain_clone
 *
 * Returns the number of bytes of @bvec that may be added to the bio
 * described by @bmd without crossing an object (chunk) boundary.
 */
static int rbd_merge_bvec(struct request_queue *q, struct bvec_merge_data *bmd,
			  struct bio_vec *bvec)
{
	struct rbd_device *rbd_dev = q->queuedata;
	unsigned int chunk_sectors;
	sector_t sector;
	unsigned int bio_sectors;
	int max;

	/* object size in sectors; obj_order is log2 of the byte size */
	chunk_sectors = 1 << (rbd_dev->header.obj_order - SECTOR_SHIFT);
	sector = bmd->bi_sector + get_start_sect(bmd->bi_bdev);
	bio_sectors = bmd->bi_size >> SECTOR_SHIFT;

	/* bytes remaining before the end of the current chunk */
	max = (chunk_sectors - ((sector & (chunk_sectors - 1))
				+ bio_sectors)) << SECTOR_SHIFT;
	if (max < 0)
		max = 0; /* bio_add cannot handle a negative return */
	/* an empty bio may always take at least one bvec */
	if (max <= bvec->bv_len && bio_sectors == 0)
		return bvec->bv_len;
	return max;
}
1581
/*
 * Tear down the gendisk and its queue for @rbd_dev, and free the
 * cached image header.  Safe to call when no disk was ever created.
 */
static void rbd_free_disk(struct rbd_device *rbd_dev)
{
	struct gendisk *disk = rbd_dev->disk;

	if (!disk)
		return;

	rbd_header_free(&rbd_dev->header);

	/* only unregister if the disk was actually added */
	if (disk->flags & GENHD_FL_UP)
		del_gendisk(disk);
	if (disk->queue)
		blk_cleanup_queue(disk->queue);
	put_disk(disk);
}
1597
/*
 * reload the ondisk the header
 *
 * Reads the on-disk header object into @header.  Because the snapshot
 * count can change between the sizing read and the full read, the
 * loop retries until the count read matches the count allocated for.
 */
static int rbd_read_header(struct rbd_device *rbd_dev,
			   struct rbd_image_header *header)
{
	ssize_t rc;
	struct rbd_image_header_ondisk *dh;
	u32 snap_count = 0;
	u64 ver;
	size_t len;

	/*
	 * First reads the fixed-size header to determine the number
	 * of snapshots, then re-reads it, along with all snapshot
	 * records as well as their stored names.
	 */
	len = sizeof (*dh);
	while (1) {
		dh = kmalloc(len, GFP_KERNEL);
		if (!dh)
			return -ENOMEM;

		rc = rbd_req_sync_read(rbd_dev,
				       NULL, CEPH_NOSNAP,
				       rbd_dev->header_name,
				       0, len,
				       (char *)dh, &ver);
		if (rc < 0)
			goto out_dh;

		rc = rbd_header_from_disk(header, dh, snap_count, GFP_KERNEL);
		if (rc < 0) {
			if (rc == -ENXIO)
				pr_warning("unrecognized header format"
					   " for image %s\n",
					   rbd_dev->image_name);
			goto out_dh;
		}

		/* done once the read covered every snapshot record */
		if (snap_count == header->total_snaps)
			break;

		/* resize the buffer for the now-known snapshot count */
		snap_count = header->total_snaps;
		len = sizeof (*dh) +
			snap_count * sizeof(struct rbd_image_snap_ondisk) +
			header->snap_names_len;

		rbd_header_free(header);
		kfree(dh);
	}
	header->obj_version = ver;

out_dh:
	kfree(dh);
	return rc;
}
1655
1656/*
1657 * create a snapshot
1658 */
Alex Elder0ce1a792012-07-03 16:01:18 -05001659static int rbd_header_add_snap(struct rbd_device *rbd_dev,
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001660 const char *snap_name,
1661 gfp_t gfp_flags)
1662{
1663 int name_len = strlen(snap_name);
1664 u64 new_snapid;
1665 int ret;
Sage Weil916d4d62011-05-12 16:10:50 -07001666 void *data, *p, *e;
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07001667 u64 ver;
Alex Elder1dbb4392012-01-24 10:08:37 -06001668 struct ceph_mon_client *monc;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001669
1670 /* we should create a snapshot only if we're pointing at the head */
Alex Elder0ce1a792012-07-03 16:01:18 -05001671 if (rbd_dev->snap_id != CEPH_NOSNAP)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001672 return -EINVAL;
1673
Alex Elder0ce1a792012-07-03 16:01:18 -05001674 monc = &rbd_dev->rbd_client->client->monc;
1675 ret = ceph_monc_create_snapid(monc, rbd_dev->pool_id, &new_snapid);
Alex Elderbd919d42012-07-13 20:35:11 -05001676 dout("created snapid=%llu\n", (unsigned long long) new_snapid);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001677 if (ret < 0)
1678 return ret;
1679
1680 data = kmalloc(name_len + 16, gfp_flags);
1681 if (!data)
1682 return -ENOMEM;
1683
Sage Weil916d4d62011-05-12 16:10:50 -07001684 p = data;
1685 e = data + name_len + 16;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001686
Sage Weil916d4d62011-05-12 16:10:50 -07001687 ceph_encode_string_safe(&p, e, snap_name, name_len, bad);
1688 ceph_encode_64_safe(&p, e, new_snapid, bad);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001689
Alex Elder0bed54d2012-07-03 16:01:18 -05001690 ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name,
Alex Elder0ce1a792012-07-03 16:01:18 -05001691 "rbd", "snap_add",
Sage Weil916d4d62011-05-12 16:10:50 -07001692 data, p - data, &ver);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001693
Sage Weil916d4d62011-05-12 16:10:50 -07001694 kfree(data);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001695
Alex Elder505cbb92012-07-19 08:49:18 -05001696 return ret < 0 ? ret : 0;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001697bad:
1698 return -ERANGE;
1699}
1700
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001701static void __rbd_remove_all_snaps(struct rbd_device *rbd_dev)
1702{
1703 struct rbd_snap *snap;
Alex Eldera0593292012-07-19 09:09:27 -05001704 struct rbd_snap *next;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001705
Alex Eldera0593292012-07-19 09:09:27 -05001706 list_for_each_entry_safe(snap, next, &rbd_dev->snaps, node)
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001707 __rbd_remove_snap_dev(rbd_dev, snap);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001708}
1709
/*
 * only read the first part of the ondisk header, without the snaps info
 *
 * Re-reads the on-disk image header and swaps the refreshed fields
 * into rbd_dev->header under header_rwsem, then rebuilds the
 * in-memory snapshot list to match.  Caller is expected to hold
 * ctl_mutex (see callers rbd_image_refresh() / rbd_init_watch_dev()).
 * Returns 0 on success or a negative errno.
 */
static int __rbd_refresh_header(struct rbd_device *rbd_dev)
{
	int ret;
	struct rbd_image_header h;

	/* Read into a local header first; only commit it under the lock */
	ret = rbd_read_header(rbd_dev, &h);
	if (ret < 0)
		return ret;

	down_write(&rbd_dev->header_rwsem);

	/* resized? */
	if (rbd_dev->snap_id == CEPH_NOSNAP) {
		sector_t size = (sector_t) h.image_size / SECTOR_SIZE;

		dout("setting size to %llu sectors", (unsigned long long) size);
		set_capacity(rbd_dev->disk, size);
	}

	/* rbd_dev->header.object_prefix shouldn't change */
	kfree(rbd_dev->header.snap_sizes);
	kfree(rbd_dev->header.snap_names);
	/* osd requests may still refer to snapc */
	ceph_put_snap_context(rbd_dev->header.snapc);

	/* Take ownership of the freshly-read header's allocations */
	rbd_dev->header.obj_version = h.obj_version;
	rbd_dev->header.image_size = h.image_size;
	rbd_dev->header.total_snaps = h.total_snaps;
	rbd_dev->header.snapc = h.snapc;
	rbd_dev->header.snap_names = h.snap_names;
	rbd_dev->header.snap_names_len = h.snap_names_len;
	rbd_dev->header.snap_sizes = h.snap_sizes;
	/* Free the extra copy of the object prefix */
	WARN_ON(strcmp(rbd_dev->header.object_prefix, h.object_prefix));
	kfree(h.object_prefix);

	/* Reconcile the snapshot device list with the new header */
	ret = __rbd_init_snaps_header(rbd_dev);

	up_write(&rbd_dev->header_rwsem);

	return ret;
}
1755
/*
 * Read the image header from the OSDs, build the snapshot list,
 * select the mapped snapshot, and create/announce the gendisk and
 * request queue for this rbd device.
 *
 * Returns 0 on success or a negative errno; on failure any disk
 * allocated here is released via the out_disk/out labels.
 */
static int rbd_init_disk(struct rbd_device *rbd_dev)
{
	struct gendisk *disk;
	struct request_queue *q;
	int rc;
	u64 segment_size;
	u64 total_size = 0;

	/* contact OSD, request size info about the object being mapped */
	rc = rbd_read_header(rbd_dev, &rbd_dev->header);
	if (rc)
		return rc;

	/* no need to lock here, as rbd_dev is not registered yet */
	rc = __rbd_init_snaps_header(rbd_dev);
	if (rc)
		return rc;

	/* pick the mapped snapshot; total_size becomes its byte size */
	rc = rbd_header_set_snap(rbd_dev, &total_size);
	if (rc)
		return rc;

	/* create gendisk info */
	rc = -ENOMEM;
	disk = alloc_disk(RBD_MINORS_PER_MAJOR);
	if (!disk)
		goto out;

	snprintf(disk->disk_name, sizeof(disk->disk_name), RBD_DRV_NAME "%d",
		 rbd_dev->dev_id);
	disk->major = rbd_dev->major;
	disk->first_minor = 0;
	disk->fops = &rbd_bd_ops;
	disk->private_data = rbd_dev;

	/* init rq */
	rc = -ENOMEM;
	q = blk_init_queue(rbd_rq_fn, &rbd_dev->lock);
	if (!q)
		goto out_disk;

	/* We use the default size, but let's be explicit about it. */
	blk_queue_physical_block_size(q, SECTOR_SIZE);

	/* set io sizes to object size */
	segment_size = rbd_obj_bytes(&rbd_dev->header);
	blk_queue_max_hw_sectors(q, segment_size / SECTOR_SIZE);
	blk_queue_max_segment_size(q, segment_size);
	blk_queue_io_min(q, segment_size);
	blk_queue_io_opt(q, segment_size);

	blk_queue_merge_bvec(q, rbd_merge_bvec);
	disk->queue = q;

	q->queuedata = rbd_dev;

	rbd_dev->disk = disk;
	rbd_dev->q = q;

	/* finally, announce the disk to the world */
	set_capacity(disk, total_size / SECTOR_SIZE);
	add_disk(disk);

	pr_info("%s: added with size 0x%llx\n",
		disk->disk_name, (unsigned long long)total_size);
	return 0;

out_disk:
	put_disk(disk);
out:
	return rc;
}
1828
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001829/*
1830 sysfs
1831*/
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001832
/* Map a sysfs struct device back to its containing rbd_device. */
static struct rbd_device *dev_to_rbd_dev(struct device *dev)
{
	return container_of(dev, struct rbd_device, dev);
}
1837
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001838static ssize_t rbd_size_show(struct device *dev,
1839 struct device_attribute *attr, char *buf)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001840{
Alex Elder593a9e72012-02-07 12:03:37 -06001841 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Josh Durgina51aa0c2011-12-05 10:35:04 -08001842 sector_t size;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001843
Josh Durgina51aa0c2011-12-05 10:35:04 -08001844 down_read(&rbd_dev->header_rwsem);
1845 size = get_capacity(rbd_dev->disk);
1846 up_read(&rbd_dev->header_rwsem);
1847
1848 return sprintf(buf, "%llu\n", (unsigned long long) size * SECTOR_SIZE);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001849}
1850
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001851static ssize_t rbd_major_show(struct device *dev,
1852 struct device_attribute *attr, char *buf)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001853{
Alex Elder593a9e72012-02-07 12:03:37 -06001854 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001855
1856 return sprintf(buf, "%d\n", rbd_dev->major);
1857}
1858
1859static ssize_t rbd_client_id_show(struct device *dev,
1860 struct device_attribute *attr, char *buf)
1861{
Alex Elder593a9e72012-02-07 12:03:37 -06001862 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001863
Alex Elder1dbb4392012-01-24 10:08:37 -06001864 return sprintf(buf, "client%lld\n",
1865 ceph_client_id(rbd_dev->rbd_client->client));
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001866}
1867
1868static ssize_t rbd_pool_show(struct device *dev,
1869 struct device_attribute *attr, char *buf)
1870{
Alex Elder593a9e72012-02-07 12:03:37 -06001871 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001872
1873 return sprintf(buf, "%s\n", rbd_dev->pool_name);
1874}
1875
Alex Elder9bb2f332012-07-12 10:46:35 -05001876static ssize_t rbd_pool_id_show(struct device *dev,
1877 struct device_attribute *attr, char *buf)
1878{
1879 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
1880
1881 return sprintf(buf, "%d\n", rbd_dev->pool_id);
1882}
1883
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001884static ssize_t rbd_name_show(struct device *dev,
1885 struct device_attribute *attr, char *buf)
1886{
Alex Elder593a9e72012-02-07 12:03:37 -06001887 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001888
Alex Elder0bed54d2012-07-03 16:01:18 -05001889 return sprintf(buf, "%s\n", rbd_dev->image_name);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001890}
1891
1892static ssize_t rbd_snap_show(struct device *dev,
1893 struct device_attribute *attr,
1894 char *buf)
1895{
Alex Elder593a9e72012-02-07 12:03:37 -06001896 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001897
1898 return sprintf(buf, "%s\n", rbd_dev->snap_name);
1899}
1900
1901static ssize_t rbd_image_refresh(struct device *dev,
1902 struct device_attribute *attr,
1903 const char *buf,
1904 size_t size)
1905{
Alex Elder593a9e72012-02-07 12:03:37 -06001906 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001907 int rc;
1908 int ret = size;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001909
1910 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
1911
Josh Durgin263c6ca2011-12-05 10:43:42 -08001912 rc = __rbd_refresh_header(rbd_dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001913 if (rc < 0)
1914 ret = rc;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001915
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001916 mutex_unlock(&ctl_mutex);
1917 return ret;
1918}
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001919
/* Per-device sysfs attributes (under /sys/bus/rbd/devices/<id>/) */
static DEVICE_ATTR(size, S_IRUGO, rbd_size_show, NULL);
static DEVICE_ATTR(major, S_IRUGO, rbd_major_show, NULL);
static DEVICE_ATTR(client_id, S_IRUGO, rbd_client_id_show, NULL);
static DEVICE_ATTR(pool, S_IRUGO, rbd_pool_show, NULL);
static DEVICE_ATTR(pool_id, S_IRUGO, rbd_pool_id_show, NULL);
static DEVICE_ATTR(name, S_IRUGO, rbd_name_show, NULL);
static DEVICE_ATTR(refresh, S_IWUSR, NULL, rbd_image_refresh);
static DEVICE_ATTR(current_snap, S_IRUGO, rbd_snap_show, NULL);
static DEVICE_ATTR(create_snap, S_IWUSR, NULL, rbd_snap_add);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001929
/* Attribute group wired into rbd_device_type below */
static struct attribute *rbd_attrs[] = {
	&dev_attr_size.attr,
	&dev_attr_major.attr,
	&dev_attr_client_id.attr,
	&dev_attr_pool.attr,
	&dev_attr_pool_id.attr,
	&dev_attr_name.attr,
	&dev_attr_current_snap.attr,
	&dev_attr_refresh.attr,
	&dev_attr_create_snap.attr,
	NULL
};

static struct attribute_group rbd_attr_group = {
	.attrs = rbd_attrs,
};

static const struct attribute_group *rbd_attr_groups[] = {
	&rbd_attr_group,
	NULL
};

/*
 * Empty release: the rbd_device's lifetime is managed elsewhere
 * (see rbd_dev_release set in rbd_bus_add_dev).
 */
static void rbd_sysfs_dev_release(struct device *dev)
{
}

static struct device_type rbd_device_type = {
	.name		= "rbd",
	.groups		= rbd_attr_groups,
	.release	= rbd_sysfs_dev_release,
};
1961
1962
1963/*
1964 sysfs - snapshots
1965*/
1966
1967static ssize_t rbd_snap_size_show(struct device *dev,
1968 struct device_attribute *attr,
1969 char *buf)
1970{
1971 struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
1972
Josh Durgin3591538f2011-12-05 18:25:13 -08001973 return sprintf(buf, "%llu\n", (unsigned long long)snap->size);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001974}
1975
1976static ssize_t rbd_snap_id_show(struct device *dev,
1977 struct device_attribute *attr,
1978 char *buf)
1979{
1980 struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
1981
Josh Durgin3591538f2011-12-05 18:25:13 -08001982 return sprintf(buf, "%llu\n", (unsigned long long)snap->id);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08001983}
1984
/* Per-snapshot sysfs attributes (under the parent rbd device) */
static DEVICE_ATTR(snap_size, S_IRUGO, rbd_snap_size_show, NULL);
static DEVICE_ATTR(snap_id, S_IRUGO, rbd_snap_id_show, NULL);

static struct attribute *rbd_snap_attrs[] = {
	&dev_attr_snap_size.attr,
	&dev_attr_snap_id.attr,
	NULL,
};

static struct attribute_group rbd_snap_attr_group = {
	.attrs = rbd_snap_attrs,
};

/*
 * Called by the device core when the snap device's last reference
 * is dropped; frees the rbd_snap and its name string.
 */
static void rbd_snap_dev_release(struct device *dev)
{
	struct rbd_snap *snap = container_of(dev, struct rbd_snap, dev);
	kfree(snap->name);
	kfree(snap);
}

static const struct attribute_group *rbd_snap_attr_groups[] = {
	&rbd_snap_attr_group,
	NULL
};

static struct device_type rbd_snap_device_type = {
	.groups		= rbd_snap_attr_groups,
	.release	= rbd_snap_dev_release,
};
2014
/*
 * Unlink a snapshot from rbd_dev->snaps and unregister its sysfs
 * device.  The rbd_snap itself is freed by rbd_snap_dev_release()
 * once the device core drops the last reference.
 */
static void __rbd_remove_snap_dev(struct rbd_device *rbd_dev,
				  struct rbd_snap *snap)
{
	list_del(&snap->node);
	device_unregister(&snap->dev);
}
2021
2022static int rbd_register_snap_dev(struct rbd_device *rbd_dev,
2023 struct rbd_snap *snap,
2024 struct device *parent)
2025{
2026 struct device *dev = &snap->dev;
2027 int ret;
2028
2029 dev->type = &rbd_snap_device_type;
2030 dev->parent = parent;
2031 dev->release = rbd_snap_dev_release;
2032 dev_set_name(dev, "snap_%s", snap->name);
2033 ret = device_register(dev);
2034
2035 return ret;
2036}
2037
2038static int __rbd_add_snap_dev(struct rbd_device *rbd_dev,
2039 int i, const char *name,
2040 struct rbd_snap **snapp)
2041{
2042 int ret;
2043 struct rbd_snap *snap = kzalloc(sizeof(*snap), GFP_KERNEL);
2044 if (!snap)
2045 return -ENOMEM;
2046 snap->name = kstrdup(name, GFP_KERNEL);
2047 snap->size = rbd_dev->header.snap_sizes[i];
2048 snap->id = rbd_dev->header.snapc->snaps[i];
2049 if (device_is_registered(&rbd_dev->dev)) {
2050 ret = rbd_register_snap_dev(rbd_dev, snap,
2051 &rbd_dev->dev);
2052 if (ret < 0)
2053 goto err;
2054 }
2055 *snapp = snap;
2056 return 0;
2057err:
2058 kfree(snap->name);
2059 kfree(snap);
2060 return ret;
2061}
2062
/*
 * Step backward to the previous name in a '\0'-delimited string
 * list.  "name" points just past a name's terminating NUL (or one
 * past the end of the list); "start" is the beginning of the list.
 * Returns a pointer to the previous name, or NULL when there is no
 * room for one.
 */
const char *rbd_prev_snap_name(const char *name, const char *start)
{
	const char *p;

	/* need at least one character plus its NUL before "name" */
	if (name - start < 2)
		return NULL;

	/* back over the previous NUL, then scan to the name's start */
	for (p = name - 2; *p; p--)
		if (p == start)
			return start;

	return p + 1;
}
2079
/*
 * compare the old list of snapshots that we have to what's in the header
 * and update it accordingly. Note that the header holds the snapshots
 * in a reverse order (from newest to oldest) and we need to go from
 * older to new so that we don't get a duplicate snap name when
 * doing the process (e.g., removed snapshot and recreated a new
 * one with the same name).
 *
 * Walks rbd_dev->snaps (oldest first, via list_for_each_prev_safe)
 * in lockstep with header.snapc->snaps / header.snap_names, removing
 * snapshots that disappeared and creating devices for new ones.
 * Returns 0 on success or a negative errno.
 */
static int __rbd_init_snaps_header(struct rbd_device *rbd_dev)
{
	const char *name, *first_name;
	/* i counts down through the header's snapshot slots */
	int i = rbd_dev->header.total_snaps;
	struct rbd_snap *snap, *old_snap = NULL;
	int ret;
	struct list_head *p, *n;

	first_name = rbd_dev->header.snap_names;
	/* start one past the end; rbd_prev_snap_name() walks backward */
	name = first_name + rbd_dev->header.snap_names_len;

	list_for_each_prev_safe(p, n, &rbd_dev->snaps) {
		u64 cur_id;

		old_snap = list_entry(p, struct rbd_snap, node);

		/* cur_id is only valid when i != 0; the !i test below
		 * short-circuits before the uninitialized read */
		if (i)
			cur_id = rbd_dev->header.snapc->snaps[i - 1];

		if (!i || old_snap->id < cur_id) {
			/*
			 * old_snap->id was skipped, thus was
			 * removed. If this rbd_dev is mapped to
			 * the removed snapshot, record that it no
			 * longer exists, to prevent further I/O.
			 */
			if (rbd_dev->snap_id == old_snap->id)
				rbd_dev->snap_exists = false;
			__rbd_remove_snap_dev(rbd_dev, old_snap);
			continue;
		}
		if (old_snap->id == cur_id) {
			/* we have this snapshot already */
			i--;
			name = rbd_prev_snap_name(name, first_name);
			continue;
		}
		for (; i > 0;
		     i--, name = rbd_prev_snap_name(name, first_name)) {
			if (!name) {
				WARN_ON(1);
				return -EINVAL;
			}
			/*
			 * NOTE(review): on the first pass here i may still
			 * equal total_snaps, making snaps[i] index one past
			 * the last populated slot — verify snapc's snaps[]
			 * allocation covers this before relying on it.
			 */
			cur_id = rbd_dev->header.snapc->snaps[i];
			/* snapshot removal? handle it above */
			if (cur_id >= old_snap->id)
				break;
			/* a new snapshot */
			ret = __rbd_add_snap_dev(rbd_dev, i - 1, name, &snap);
			if (ret < 0)
				return ret;

			/* note that we add it backward so using n and not p */
			list_add(&snap->node, n);
			p = &snap->node;
		}
	}
	/* we're done going over the old snap list, just add what's left */
	for (; i > 0; i--) {
		name = rbd_prev_snap_name(name, first_name);
		if (!name) {
			WARN_ON(1);
			return -EINVAL;
		}
		ret = __rbd_add_snap_dev(rbd_dev, i - 1, name, &snap);
		if (ret < 0)
			return ret;
		list_add(&snap->node, &rbd_dev->snaps);
	}

	return 0;
}
2160
/*
 * Register the rbd device on the rbd bus in sysfs, then register a
 * child device for each known snapshot.  All registration happens
 * under ctl_mutex.  Returns 0 on success or a negative errno; note
 * that a failure while registering a snapshot leaves the parent
 * device (and earlier snapshots) registered.
 */
static int rbd_bus_add_dev(struct rbd_device *rbd_dev)
{
	int ret;
	struct device *dev;
	struct rbd_snap *snap;

	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
	dev = &rbd_dev->dev;

	dev->bus = &rbd_bus_type;
	dev->type = &rbd_device_type;
	dev->parent = &rbd_root_dev;
	dev->release = rbd_dev_release;
	dev_set_name(dev, "%d", rbd_dev->dev_id);	/* /sys/bus/rbd/devices/<id> */
	ret = device_register(dev);
	if (ret < 0)
		goto out;

	list_for_each_entry(snap, &rbd_dev->snaps, node) {
		ret = rbd_register_snap_dev(rbd_dev, snap,
					     &rbd_dev->dev);
		if (ret < 0)
			break;
	}
out:
	mutex_unlock(&ctl_mutex);
	return ret;
}
2189
/* Remove the rbd device (and its sysfs presence) from the bus. */
static void rbd_bus_del_dev(struct rbd_device *rbd_dev)
{
	device_unregister(&rbd_dev->dev);
}
2194
/*
 * Set up a watch on the image header object so we get notified of
 * header changes.  If the OSD rejects our header version with
 * -ERANGE, refresh the header (under ctl_mutex) and retry until the
 * watch sticks.  Returns 0 on success or a negative errno.
 */
static int rbd_init_watch_dev(struct rbd_device *rbd_dev)
{
	int ret, rc;

	do {
		ret = rbd_req_sync_watch(rbd_dev, rbd_dev->header_name,
					 rbd_dev->header.obj_version);
		if (ret == -ERANGE) {
			/* our header version is stale; refresh and retry */
			mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
			rc = __rbd_refresh_header(rbd_dev);
			mutex_unlock(&ctl_mutex);
			if (rc < 0)
				return rc;
		}
	} while (ret == -ERANGE);

	return ret;
}
2213
/* Highest device id ever handed out; ids start at 1. */
static atomic64_t rbd_id_max = ATOMIC64_INIT(0);

/*
 * Get a unique rbd identifier for the given new rbd_dev, and add
 * the rbd_dev to the global list.  The minimum rbd id is 1.
 */
static void rbd_id_get(struct rbd_device *rbd_dev)
{
	rbd_dev->dev_id = atomic64_inc_return(&rbd_id_max);

	spin_lock(&rbd_dev_list_lock);
	list_add_tail(&rbd_dev->node, &rbd_dev_list);
	spin_unlock(&rbd_dev_list_lock);
}
Alex Elderb7f23c32012-01-29 13:57:43 -06002228
Alex Elder1ddbe942012-01-29 13:57:44 -06002229/*
Alex Elder499afd52012-02-02 08:13:29 -06002230 * Remove an rbd_dev from the global list, and record that its
2231 * identifier is no longer in use.
Alex Elder1ddbe942012-01-29 13:57:44 -06002232 */
Alex Elder499afd52012-02-02 08:13:29 -06002233static void rbd_id_put(struct rbd_device *rbd_dev)
Alex Elder1ddbe942012-01-29 13:57:44 -06002234{
Alex Elderd184f6b2012-01-29 13:57:44 -06002235 struct list_head *tmp;
Alex Elderde71a292012-07-03 16:01:19 -05002236 int rbd_id = rbd_dev->dev_id;
Alex Elderd184f6b2012-01-29 13:57:44 -06002237 int max_id;
2238
2239 BUG_ON(rbd_id < 1);
Alex Elder499afd52012-02-02 08:13:29 -06002240
2241 spin_lock(&rbd_dev_list_lock);
2242 list_del_init(&rbd_dev->node);
Alex Elderd184f6b2012-01-29 13:57:44 -06002243
2244 /*
2245 * If the id being "put" is not the current maximum, there
2246 * is nothing special we need to do.
2247 */
2248 if (rbd_id != atomic64_read(&rbd_id_max)) {
2249 spin_unlock(&rbd_dev_list_lock);
2250 return;
2251 }
2252
2253 /*
2254 * We need to update the current maximum id. Search the
2255 * list to find out what it is. We're more likely to find
2256 * the maximum at the end, so search the list backward.
2257 */
2258 max_id = 0;
2259 list_for_each_prev(tmp, &rbd_dev_list) {
2260 struct rbd_device *rbd_dev;
2261
2262 rbd_dev = list_entry(tmp, struct rbd_device, node);
2263 if (rbd_id > max_id)
2264 max_id = rbd_id;
2265 }
Alex Elder499afd52012-02-02 08:13:29 -06002266 spin_unlock(&rbd_dev_list_lock);
Alex Elderb7f23c32012-01-29 13:57:43 -06002267
Alex Elder1ddbe942012-01-29 13:57:44 -06002268 /*
Alex Elderd184f6b2012-01-29 13:57:44 -06002269 * The max id could have been updated by rbd_id_get(), in
2270 * which case it now accurately reflects the new maximum.
2271 * Be careful not to overwrite the maximum value in that
2272 * case.
Alex Elder1ddbe942012-01-29 13:57:44 -06002273 */
Alex Elderd184f6b2012-01-29 13:57:44 -06002274 atomic64_cmpxchg(&rbd_id_max, rbd_id, max_id);
Alex Elderb7f23c32012-01-29 13:57:43 -06002275}
2276
/*
 * Skips over white space at *buf, and updates *buf to point to the
 * first found non-space character (if any).  Returns the length of
 * the token (string of non-white space characters) found.  Note
 * that *buf must be terminated with '\0'.
 */
static inline size_t next_token(const char **buf)
{
	/*
	 * These are the characters that produce nonzero for
	 * isspace() in the "C" and "POSIX" locales.
	 */
	static const char spaces[] = " \f\n\r\t\v";

	*buf += strspn(*buf, spaces);	/* skip leading whitespace */

	return strcspn(*buf, spaces);	/* length of the token */
}
2295
/*
 * Finds the next token in *buf, and if the provided token buffer is
 * big enough, copies the found token into it.  The result, if
 * copied, is guaranteed to be terminated with '\0'.  Note that *buf
 * must be terminated with '\0' on entry.
 *
 * Returns the length of the token found (not including the '\0').
 * Return value will be 0 if no token is found, and it will be >=
 * token_size if the token would not fit.
 *
 * The *buf pointer will be updated to point beyond the end of the
 * found token.  Note that this occurs even if the token buffer is
 * too small to hold it.
 */
static inline size_t copy_token(const char **buf,
				char *token,
				size_t token_size)
{
	size_t len = next_token(buf);

	if (len < token_size) {
		memcpy(token, *buf, len);
		token[len] = '\0';
	}
	*buf += len;	/* consume the token even when it didn't fit */

	return len;
}
2325
2326/*
Alex Elderea3352f2012-07-09 21:04:23 -05002327 * Finds the next token in *buf, dynamically allocates a buffer big
2328 * enough to hold a copy of it, and copies the token into the new
2329 * buffer. The copy is guaranteed to be terminated with '\0'. Note
2330 * that a duplicate buffer is created even for a zero-length token.
2331 *
2332 * Returns a pointer to the newly-allocated duplicate, or a null
2333 * pointer if memory for the duplicate was not available. If
2334 * the lenp argument is a non-null pointer, the length of the token
2335 * (not including the '\0') is returned in *lenp.
2336 *
2337 * If successful, the *buf pointer will be updated to point beyond
2338 * the end of the found token.
2339 *
2340 * Note: uses GFP_KERNEL for allocation.
2341 */
2342static inline char *dup_token(const char **buf, size_t *lenp)
2343{
2344 char *dup;
2345 size_t len;
2346
2347 len = next_token(buf);
2348 dup = kmalloc(len + 1, GFP_KERNEL);
2349 if (!dup)
2350 return NULL;
2351
2352 memcpy(dup, *buf, len);
2353 *(dup + len) = '\0';
2354 *buf += len;
2355
2356 if (lenp)
2357 *lenp = len;
2358
2359 return dup;
2360}
2361
/*
 * This fills in the pool_name, image_name, image_name_len, snap_name,
 * rbd_dev, rbd_md_name, and name fields of the given rbd_dev, based
 * on the list of monitor addresses and other options provided via
 * /sys/bus/rbd/add.
 *
 * Note: rbd_dev is assumed to have been initially zero-filled.
 *
 * Input format: "<mon_addrs> <options> <pool> <image> [<snapshot>]".
 * On success *mon_addrs/*mon_addrs_size describe the (unterminated)
 * monitor list inside buf, options holds the option token, and the
 * duplicated name strings become owned by rbd_dev.  Returns 0 or a
 * negative errno; on error, partially-allocated names are freed here.
 */
static int rbd_add_parse_args(struct rbd_device *rbd_dev,
			      const char *buf,
			      const char **mon_addrs,
			      size_t *mon_addrs_size,
			      char *options,
			      size_t options_size)
{
	size_t len;
	int ret;

	/* The first four tokens are required */

	len = next_token(&buf);
	if (!len)
		return -EINVAL;
	/* +1 leaves room for the NUL the caller will add */
	*mon_addrs_size = len + 1;
	*mon_addrs = buf;

	buf += len;

	len = copy_token(&buf, options, options_size);
	if (!len || len >= options_size)
		return -EINVAL;

	/* every failure past this point is an allocation failure */
	ret = -ENOMEM;
	rbd_dev->pool_name = dup_token(&buf, NULL);
	if (!rbd_dev->pool_name)
		goto out_err;

	rbd_dev->image_name = dup_token(&buf, &rbd_dev->image_name_len);
	if (!rbd_dev->image_name)
		goto out_err;

	/* Create the name of the header object */

	rbd_dev->header_name = kmalloc(rbd_dev->image_name_len
						+ sizeof (RBD_SUFFIX),
					GFP_KERNEL);
	if (!rbd_dev->header_name)
		goto out_err;
	sprintf(rbd_dev->header_name, "%s%s", rbd_dev->image_name, RBD_SUFFIX);

	/*
	 * The snapshot name is optional.  If none is supplied,
	 * we use the default value.
	 */
	rbd_dev->snap_name = dup_token(&buf, &len);
	if (!rbd_dev->snap_name)
		goto out_err;
	if (!len) {
		/* Replace the empty name with the default */
		kfree(rbd_dev->snap_name);
		rbd_dev->snap_name
			= kmalloc(sizeof (RBD_SNAP_HEAD_NAME), GFP_KERNEL);
		if (!rbd_dev->snap_name)
			goto out_err;

		memcpy(rbd_dev->snap_name, RBD_SNAP_HEAD_NAME,
			sizeof (RBD_SNAP_HEAD_NAME));
	}

	return 0;

out_err:
	/* kfree(NULL) is safe for any field not yet allocated */
	kfree(rbd_dev->header_name);
	kfree(rbd_dev->image_name);
	kfree(rbd_dev->pool_name);
	rbd_dev->pool_name = NULL;

	return ret;
}
2441
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07002442static ssize_t rbd_add(struct bus_type *bus,
2443 const char *buf,
2444 size_t count)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002445{
Alex Eldercb8627c2012-07-09 21:04:23 -05002446 char *options;
2447 struct rbd_device *rbd_dev = NULL;
Alex Elder7ef32142012-02-02 08:13:30 -06002448 const char *mon_addrs = NULL;
2449 size_t mon_addrs_size = 0;
Alex Elder27cc2592012-02-02 08:13:30 -06002450 struct ceph_osd_client *osdc;
2451 int rc = -ENOMEM;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002452
2453 if (!try_module_get(THIS_MODULE))
2454 return -ENODEV;
2455
Alex Elder27cc2592012-02-02 08:13:30 -06002456 options = kmalloc(count, GFP_KERNEL);
2457 if (!options)
2458 goto err_nomem;
Alex Eldercb8627c2012-07-09 21:04:23 -05002459 rbd_dev = kzalloc(sizeof(*rbd_dev), GFP_KERNEL);
2460 if (!rbd_dev)
2461 goto err_nomem;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002462
2463 /* static rbd_device initialization */
2464 spin_lock_init(&rbd_dev->lock);
2465 INIT_LIST_HEAD(&rbd_dev->node);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002466 INIT_LIST_HEAD(&rbd_dev->snaps);
Josh Durginc6666012011-11-21 17:11:12 -08002467 init_rwsem(&rbd_dev->header_rwsem);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002468
Alex Elderd184f6b2012-01-29 13:57:44 -06002469 /* generate unique id: find highest unique id, add one */
Alex Elder499afd52012-02-02 08:13:29 -06002470 rbd_id_get(rbd_dev);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002471
Alex Eldera725f65e2012-02-02 08:13:30 -06002472 /* Fill in the device name, now that we have its id. */
Alex Elder81a89792012-02-02 08:13:30 -06002473 BUILD_BUG_ON(DEV_NAME_LEN
2474 < sizeof (RBD_DRV_NAME) + MAX_INT_FORMAT_WIDTH);
Alex Elderde71a292012-07-03 16:01:19 -05002475 sprintf(rbd_dev->name, "%s%d", RBD_DRV_NAME, rbd_dev->dev_id);
Alex Eldere124a82f2012-01-29 13:57:44 -06002476
Alex Eldera725f65e2012-02-02 08:13:30 -06002477 /* parse add command */
Alex Elder7ef32142012-02-02 08:13:30 -06002478 rc = rbd_add_parse_args(rbd_dev, buf, &mon_addrs, &mon_addrs_size,
Alex Eldere28fff262012-02-02 08:13:30 -06002479 options, count);
Alex Eldera725f65e2012-02-02 08:13:30 -06002480 if (rc)
2481 goto err_put_id;
2482
Alex Elder5214ecc2012-02-02 08:13:30 -06002483 rbd_dev->rbd_client = rbd_get_client(mon_addrs, mon_addrs_size - 1,
2484 options);
Alex Elderd720bcb2012-02-02 08:13:30 -06002485 if (IS_ERR(rbd_dev->rbd_client)) {
2486 rc = PTR_ERR(rbd_dev->rbd_client);
Alex Elderf0f8cef2012-01-29 13:57:44 -06002487 goto err_put_id;
Alex Elderd720bcb2012-02-02 08:13:30 -06002488 }
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002489
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002490 /* pick the pool */
Alex Elder1dbb4392012-01-24 10:08:37 -06002491 osdc = &rbd_dev->rbd_client->client->osdc;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002492 rc = ceph_pg_poolid_by_name(osdc->osdmap, rbd_dev->pool_name);
2493 if (rc < 0)
2494 goto err_out_client;
Alex Elder9bb2f332012-07-12 10:46:35 -05002495 rbd_dev->pool_id = rc;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002496
2497 /* register our block device */
Alex Elder27cc2592012-02-02 08:13:30 -06002498 rc = register_blkdev(0, rbd_dev->name);
2499 if (rc < 0)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002500 goto err_out_client;
Alex Elder27cc2592012-02-02 08:13:30 -06002501 rbd_dev->major = rc;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002502
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002503 rc = rbd_bus_add_dev(rbd_dev);
2504 if (rc)
Yehuda Sadeh766fc432011-01-07 14:58:42 -08002505 goto err_out_blkdev;
2506
Alex Elder32eec682012-02-08 16:11:14 -06002507 /*
2508 * At this point cleanup in the event of an error is the job
2509 * of the sysfs code (initiated by rbd_bus_del_dev()).
2510 *
2511 * Set up and announce blkdev mapping.
2512 */
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002513 rc = rbd_init_disk(rbd_dev);
2514 if (rc)
Yehuda Sadeh766fc432011-01-07 14:58:42 -08002515 goto err_out_bus;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002516
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07002517 rc = rbd_init_watch_dev(rbd_dev);
2518 if (rc)
2519 goto err_out_bus;
2520
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002521 return count;
2522
Yehuda Sadeh766fc432011-01-07 14:58:42 -08002523err_out_bus:
Yehuda Sadeh766fc432011-01-07 14:58:42 -08002524 /* this will also clean up rest of rbd_dev stuff */
2525
2526 rbd_bus_del_dev(rbd_dev);
2527 kfree(options);
Yehuda Sadeh766fc432011-01-07 14:58:42 -08002528 return rc;
2529
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002530err_out_blkdev:
2531 unregister_blkdev(rbd_dev->major, rbd_dev->name);
2532err_out_client:
2533 rbd_put_client(rbd_dev);
Alex Elderf0f8cef2012-01-29 13:57:44 -06002534err_put_id:
Alex Eldercb8627c2012-07-09 21:04:23 -05002535 if (rbd_dev->pool_name) {
Alex Elder820a5f32012-07-09 21:04:24 -05002536 kfree(rbd_dev->snap_name);
Alex Elder0bed54d2012-07-03 16:01:18 -05002537 kfree(rbd_dev->header_name);
2538 kfree(rbd_dev->image_name);
Alex Eldercb8627c2012-07-09 21:04:23 -05002539 kfree(rbd_dev->pool_name);
2540 }
Alex Elder499afd52012-02-02 08:13:29 -06002541 rbd_id_put(rbd_dev);
Alex Elder27cc2592012-02-02 08:13:30 -06002542err_nomem:
Alex Elder27cc2592012-02-02 08:13:30 -06002543 kfree(rbd_dev);
Alex Eldercb8627c2012-07-09 21:04:23 -05002544 kfree(options);
Alex Elder27cc2592012-02-02 08:13:30 -06002545
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002546 dout("Error adding device %s\n", buf);
2547 module_put(THIS_MODULE);
Alex Elder27cc2592012-02-02 08:13:30 -06002548
2549 return (ssize_t) rc;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002550}
2551
Alex Elderde71a292012-07-03 16:01:19 -05002552static struct rbd_device *__rbd_get_dev(unsigned long dev_id)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002553{
2554 struct list_head *tmp;
2555 struct rbd_device *rbd_dev;
2556
Alex Eldere124a82f2012-01-29 13:57:44 -06002557 spin_lock(&rbd_dev_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002558 list_for_each(tmp, &rbd_dev_list) {
2559 rbd_dev = list_entry(tmp, struct rbd_device, node);
Alex Elderde71a292012-07-03 16:01:19 -05002560 if (rbd_dev->dev_id == dev_id) {
Alex Eldere124a82f2012-01-29 13:57:44 -06002561 spin_unlock(&rbd_dev_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002562 return rbd_dev;
Alex Eldere124a82f2012-01-29 13:57:44 -06002563 }
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002564 }
Alex Eldere124a82f2012-01-29 13:57:44 -06002565 spin_unlock(&rbd_dev_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002566 return NULL;
2567}
2568
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002569static void rbd_dev_release(struct device *dev)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002570{
Alex Elder593a9e72012-02-07 12:03:37 -06002571 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002572
Alex Elder1dbb4392012-01-24 10:08:37 -06002573 if (rbd_dev->watch_request) {
2574 struct ceph_client *client = rbd_dev->rbd_client->client;
2575
2576 ceph_osdc_unregister_linger_request(&client->osdc,
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07002577 rbd_dev->watch_request);
Alex Elder1dbb4392012-01-24 10:08:37 -06002578 }
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07002579 if (rbd_dev->watch_event)
Alex Elder0bed54d2012-07-03 16:01:18 -05002580 rbd_req_sync_unwatch(rbd_dev, rbd_dev->header_name);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07002581
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002582 rbd_put_client(rbd_dev);
2583
2584 /* clean up and free blkdev */
2585 rbd_free_disk(rbd_dev);
2586 unregister_blkdev(rbd_dev->major, rbd_dev->name);
Alex Elder32eec682012-02-08 16:11:14 -06002587
2588 /* done with the id, and with the rbd_dev */
Alex Elder820a5f32012-07-09 21:04:24 -05002589 kfree(rbd_dev->snap_name);
Alex Elder0bed54d2012-07-03 16:01:18 -05002590 kfree(rbd_dev->header_name);
Alex Elderd22f76e2012-07-12 10:46:35 -05002591 kfree(rbd_dev->pool_name);
Alex Elder0bed54d2012-07-03 16:01:18 -05002592 kfree(rbd_dev->image_name);
Alex Elder32eec682012-02-08 16:11:14 -06002593 rbd_id_put(rbd_dev);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002594 kfree(rbd_dev);
2595
2596 /* release module ref */
2597 module_put(THIS_MODULE);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002598}
2599
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002600static ssize_t rbd_remove(struct bus_type *bus,
2601 const char *buf,
2602 size_t count)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002603{
2604 struct rbd_device *rbd_dev = NULL;
2605 int target_id, rc;
2606 unsigned long ul;
2607 int ret = count;
2608
2609 rc = strict_strtoul(buf, 10, &ul);
2610 if (rc)
2611 return rc;
2612
2613 /* convert to int; abort if we lost anything in the conversion */
2614 target_id = (int) ul;
2615 if (target_id != ul)
2616 return -EINVAL;
2617
2618 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
2619
2620 rbd_dev = __rbd_get_dev(target_id);
2621 if (!rbd_dev) {
2622 ret = -ENOENT;
2623 goto done;
2624 }
2625
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002626 __rbd_remove_all_snaps(rbd_dev);
2627 rbd_bus_del_dev(rbd_dev);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002628
2629done:
2630 mutex_unlock(&ctl_mutex);
2631 return ret;
2632}
2633
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002634static ssize_t rbd_snap_add(struct device *dev,
2635 struct device_attribute *attr,
2636 const char *buf,
2637 size_t count)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002638{
Alex Elder593a9e72012-02-07 12:03:37 -06002639 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002640 int ret;
2641 char *name = kmalloc(count + 1, GFP_KERNEL);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002642 if (!name)
2643 return -ENOMEM;
2644
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002645 snprintf(name, count, "%s", buf);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002646
2647 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
2648
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002649 ret = rbd_header_add_snap(rbd_dev,
2650 name, GFP_KERNEL);
2651 if (ret < 0)
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07002652 goto err_unlock;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002653
Josh Durgin263c6ca2011-12-05 10:43:42 -08002654 ret = __rbd_refresh_header(rbd_dev);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002655 if (ret < 0)
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07002656 goto err_unlock;
2657
2658 /* shouldn't hold ctl_mutex when notifying.. notify might
2659 trigger a watch callback that would need to get that mutex */
2660 mutex_unlock(&ctl_mutex);
2661
2662 /* make a best effort, don't error if failed */
Alex Elder0bed54d2012-07-03 16:01:18 -05002663 rbd_req_sync_notify(rbd_dev, rbd_dev->header_name);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002664
2665 ret = count;
Yehuda Sadeh59c2be12011-03-21 15:10:11 -07002666 kfree(name);
2667 return ret;
2668
2669err_unlock:
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002670 mutex_unlock(&ctl_mutex);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002671 kfree(name);
2672 return ret;
2673}
2674
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002675/*
2676 * create control files in sysfs
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002677 * /sys/bus/rbd/...
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002678 */
2679static int rbd_sysfs_init(void)
2680{
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002681 int ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002682
Alex Elderfed4c142012-02-07 12:03:36 -06002683 ret = device_register(&rbd_root_dev);
Alex Elder21079782012-01-24 10:08:36 -06002684 if (ret < 0)
Yehuda Sadehdfc56062010-11-19 14:51:04 -08002685 return ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002686
Alex Elderfed4c142012-02-07 12:03:36 -06002687 ret = bus_register(&rbd_bus_type);
2688 if (ret < 0)
2689 device_unregister(&rbd_root_dev);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002690
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002691 return ret;
2692}
2693
/* Tear down the sysfs interface, reversing rbd_sysfs_init()'s order. */
static void rbd_sysfs_cleanup(void)
{
	bus_unregister(&rbd_bus_type);
	device_unregister(&rbd_root_dev);
}
2699
2700int __init rbd_init(void)
2701{
2702 int rc;
2703
2704 rc = rbd_sysfs_init();
2705 if (rc)
2706 return rc;
Alex Elderf0f8cef2012-01-29 13:57:44 -06002707 pr_info("loaded " RBD_DRV_NAME_LONG "\n");
Yehuda Sadeh602adf42010-08-12 16:11:25 -07002708 return 0;
2709}
2710
/* Module unload entry point: remove the sysfs control interface. */
void __exit rbd_exit(void)
{
	rbd_sysfs_cleanup();
}
2715
/* Module entry/exit registration and metadata. */
module_init(rbd_init);
module_exit(rbd_exit);

MODULE_AUTHOR("Sage Weil <sage@newdream.net>");
MODULE_AUTHOR("Yehuda Sadeh <yehuda@hq.newdream.net>");
MODULE_DESCRIPTION("rados block device");

/* following authorship retained from original osdblk.c */
MODULE_AUTHOR("Jeff Garzik <jeff@garzik.org>");

MODULE_LICENSE("GPL");