
/*
   rbd.c -- Export ceph rados objects as a Linux block device


   based on drivers/block/osdblk.c:

   Copyright 2009 Red Hat, Inc.

   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; see the file COPYING.  If not, write to
   the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.



   For usage instructions, please refer to:

		 Documentation/ABI/testing/sysfs-bus-rbd

 */

#include <linux/ceph/libceph.h>
#include <linux/ceph/osd_client.h>
#include <linux/ceph/mon_client.h>
#include <linux/ceph/cls_lock_client.h>
#include <linux/ceph/decode.h>
#include <linux/parser.h>
#include <linux/bsearch.h>

#include <linux/kernel.h>
#include <linux/device.h>
#include <linux/module.h>
#include <linux/blk-mq.h>
#include <linux/fs.h>
#include <linux/blkdev.h>
#include <linux/slab.h>
#include <linux/idr.h>
#include <linux/workqueue.h>

#include "rbd_types.h"

#define RBD_DEBUG	/* Activate rbd_assert() calls */

/*
 * The basic unit of block I/O is a sector.  It is interpreted in a
 * number of contexts in Linux (blk, bio, genhd), but the default is
 * universally 512 bytes.  These symbols are just slightly more
 * meaningful than the bare numbers they represent.
 */
#define	SECTOR_SHIFT	9
#define	SECTOR_SIZE	(1ULL << SECTOR_SHIFT)

/*
 * Increment the given counter and return its updated value.
 * If the counter is already 0, it will not be incremented.
 * If the counter is already at its maximum value, this returns
 * -EINVAL without updating it.
 */
static int atomic_inc_return_safe(atomic_t *v)
{
	unsigned int counter;

	counter = (unsigned int)__atomic_add_unless(v, 1, 0);
	if (counter <= (unsigned int)INT_MAX)
		return (int)counter;

	atomic_dec(v);

	return -EINVAL;
}

/* Decrement the counter.  Return the resulting value, or -EINVAL */
static int atomic_dec_return_safe(atomic_t *v)
{
	int counter;

	counter = atomic_dec_return(v);
	if (counter >= 0)
		return counter;

	atomic_inc(v);

	return -EINVAL;
}
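
/*
 * Illustrative note (not from the original source): these helpers suit
 * counters where 0 means "no longer usable", such as parent_ref below.
 * A minimal usage sketch, assuming an atomic_t initialized to 1:
 *
 *	if (atomic_inc_return_safe(&counter) > 0) {
 *		...				// reference acquired
 *		atomic_dec_return_safe(&counter);
 *	}
 *
 * A return of 0 means the counter was already 0 (nothing acquired);
 * -EINVAL means it was saturated.
 */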

#define RBD_DRV_NAME "rbd"

#define RBD_MINORS_PER_MAJOR		256
#define RBD_SINGLE_MAJOR_PART_SHIFT	4

#define RBD_MAX_PARENT_CHAIN_LEN	16

#define RBD_SNAP_DEV_NAME_PREFIX	"snap_"
#define RBD_MAX_SNAP_NAME_LEN	\
			(NAME_MAX - (sizeof (RBD_SNAP_DEV_NAME_PREFIX) - 1))

#define RBD_MAX_SNAP_COUNT	510	/* allows max snapc to fit in 4KB */

#define RBD_SNAP_HEAD_NAME	"-"

#define	BAD_SNAP_INDEX	U32_MAX		/* invalid index into snap array */

/* This allows a single page to hold an image name sent by OSD */
#define RBD_IMAGE_NAME_LEN_MAX	(PAGE_SIZE - sizeof (__le32) - 1)
#define RBD_IMAGE_ID_LEN_MAX	64

#define RBD_OBJ_PREFIX_LEN_MAX	64

#define RBD_NOTIFY_TIMEOUT	5	/* seconds */
#define RBD_RETRY_DELAY		msecs_to_jiffies(1000)

/* Feature bits */

#define RBD_FEATURE_LAYERING		(1<<0)
#define RBD_FEATURE_STRIPINGV2		(1<<1)
#define RBD_FEATURE_EXCLUSIVE_LOCK	(1<<2)
#define RBD_FEATURE_DATA_POOL		(1<<7)
#define RBD_FEATURES_ALL	(RBD_FEATURE_LAYERING |		\
				 RBD_FEATURE_STRIPINGV2 |	\
				 RBD_FEATURE_EXCLUSIVE_LOCK |	\
				 RBD_FEATURE_DATA_POOL)

/* Features supported by this (client software) implementation. */

#define RBD_FEATURES_SUPPORTED	(RBD_FEATURES_ALL)

/*
 * An RBD device name will be "rbd#", where the "rbd" comes from
 * RBD_DRV_NAME above, and # is a unique integer identifier.
 */
#define DEV_NAME_LEN		32

/*
 * block device image metadata (in-memory version)
 */
struct rbd_image_header {
	/* These six fields never change for a given rbd image */
	char *object_prefix;
	__u8 obj_order;
	u64 stripe_unit;
	u64 stripe_count;
	s64 data_pool_id;
	u64 features;		/* Might be changeable someday? */

	/* The remaining fields need to be updated occasionally */
	u64 image_size;
	struct ceph_snap_context *snapc;
	char *snap_names;	/* format 1 only */
	u64 *snap_sizes;	/* format 1 only */
};

/*
 * An rbd image specification.
 *
 * The tuple (pool_id, image_id, snap_id) is sufficient to uniquely
 * identify an image.  Each rbd_dev structure includes a pointer to
 * an rbd_spec structure that encapsulates this identity.
 *
 * Each of the id's in an rbd_spec has an associated name.  For a
 * user-mapped image, the names are supplied and the id's associated
 * with them are looked up.  For a layered image, a parent image is
 * defined by the tuple, and the names are looked up.
 *
 * An rbd_dev structure contains a parent_spec pointer which is
 * non-null if the image it represents is a child in a layered
 * image.  This pointer will refer to the rbd_spec structure used
 * by the parent rbd_dev for its own identity (i.e., the structure
 * is shared between the parent and child).
 *
 * Since these structures are populated once, during the discovery
 * phase of image construction, they are effectively immutable so
 * we make no effort to synchronize access to them.
 *
 * Note that code herein does not assume the image name is known (it
 * could be a null pointer).
 */
struct rbd_spec {
	u64		pool_id;
	const char	*pool_name;

	const char	*image_id;
	const char	*image_name;

	u64		snap_id;
	const char	*snap_name;

	struct kref	kref;
};

/*
 * an instance of the client.  multiple devices may share an rbd client.
 */
struct rbd_client {
	struct ceph_client	*client;
	struct kref		kref;
	struct list_head	node;
};

struct rbd_img_request;
typedef void (*rbd_img_callback_t)(struct rbd_img_request *);

#define	BAD_WHICH	U32_MAX		/* Good which or bad which, which? */

struct rbd_obj_request;
typedef void (*rbd_obj_callback_t)(struct rbd_obj_request *);

enum obj_request_type {
	OBJ_REQUEST_NODATA, OBJ_REQUEST_BIO, OBJ_REQUEST_PAGES
};

enum obj_operation_type {
	OBJ_OP_WRITE,
	OBJ_OP_READ,
	OBJ_OP_DISCARD,
};

enum obj_req_flags {
	OBJ_REQ_DONE,		/* completion flag: not done = 0, done = 1 */
	OBJ_REQ_IMG_DATA,	/* object usage: standalone = 0, image = 1 */
	OBJ_REQ_KNOWN,		/* EXISTS flag valid: no = 0, yes = 1 */
	OBJ_REQ_EXISTS,		/* target exists: no = 0, yes = 1 */
};

struct rbd_obj_request {
	const char		*object_name;
	u64			offset;		/* object start byte */
	u64			length;		/* bytes from offset */
	unsigned long		flags;

	/*
	 * An object request associated with an image will have its
	 * img_data flag set; a standalone object request will not.
	 *
	 * A standalone object request will have which == BAD_WHICH
	 * and a null obj_request pointer.
	 *
	 * An object request initiated in support of a layered image
	 * object (to check for its existence before a write) will
	 * have which == BAD_WHICH and a non-null obj_request pointer.
	 *
	 * Finally, an object request for rbd image data will have
	 * which != BAD_WHICH, and will have a non-null img_request
	 * pointer.  The value of which will be in the range
	 * 0..(img_request->obj_request_count-1).
	 */
	union {
		struct rbd_obj_request	*obj_request;	/* STAT op */
		struct {
			struct rbd_img_request	*img_request;
			u64			img_offset;
			/* links for img_request->obj_requests list */
			struct list_head	links;
		};
	};
	u32			which;		/* posn image request list */

	enum obj_request_type	type;
	union {
		struct bio	*bio_list;
		struct {
			struct page	**pages;
			u32		page_count;
		};
	};
	struct page		**copyup_pages;
	u32			copyup_page_count;

	struct ceph_osd_request	*osd_req;

	u64			xferred;	/* bytes transferred */
	int			result;

	rbd_obj_callback_t	callback;
	struct completion	completion;

	struct kref		kref;
};

enum img_req_flags {
	IMG_REQ_WRITE,		/* I/O direction: read = 0, write = 1 */
	IMG_REQ_CHILD,		/* initiator: block = 0, child image = 1 */
	IMG_REQ_LAYERED,	/* ENOENT handling: normal = 0, layered = 1 */
	IMG_REQ_DISCARD,	/* discard: normal = 0, discard request = 1 */
};

struct rbd_img_request {
	struct rbd_device	*rbd_dev;
	u64			offset;	/* starting image byte offset */
	u64			length;	/* byte count from offset */
	unsigned long		flags;
	union {
		u64			snap_id;	/* for reads */
		struct ceph_snap_context *snapc;	/* for writes */
	};
	union {
		struct request		*rq;		/* block request */
		struct rbd_obj_request	*obj_request;	/* obj req initiator */
	};
	struct page		**copyup_pages;
	u32			copyup_page_count;
	spinlock_t		completion_lock;/* protects next_completion */
	u32			next_completion;
	rbd_img_callback_t	callback;
	u64			xferred;/* aggregate bytes transferred */
	int			result;	/* first nonzero obj_request result */

	u32			obj_request_count;
	struct list_head	obj_requests;	/* rbd_obj_request structs */

	struct kref		kref;
};

#define for_each_obj_request(ireq, oreq) \
	list_for_each_entry(oreq, &(ireq)->obj_requests, links)
#define for_each_obj_request_from(ireq, oreq) \
	list_for_each_entry_from(oreq, &(ireq)->obj_requests, links)
#define for_each_obj_request_safe(ireq, oreq, n) \
	list_for_each_entry_safe_reverse(oreq, n, &(ireq)->obj_requests, links)
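
/*
 * Illustrative note (not from the original source): a minimal sketch of
 * how the iterators above are used, mirroring the submit loop later in
 * this file:
 *
 *	struct rbd_obj_request *obj_request;
 *	struct rbd_obj_request *next_obj_request;
 *
 *	for_each_obj_request_safe(img_request, obj_request, next_obj_request) {
 *		ret = rbd_img_obj_request_submit(obj_request);
 *		if (ret)
 *			break;
 *	}
 */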
Alex Elderbf0d5f502012-11-22 00:00:08 -0600328
Ilya Dryomov99d16942016-08-12 16:11:41 +0200329enum rbd_watch_state {
330 RBD_WATCH_STATE_UNREGISTERED,
331 RBD_WATCH_STATE_REGISTERED,
332 RBD_WATCH_STATE_ERROR,
333};
334
Ilya Dryomoved95b212016-08-12 16:40:02 +0200335enum rbd_lock_state {
336 RBD_LOCK_STATE_UNLOCKED,
337 RBD_LOCK_STATE_LOCKED,
338 RBD_LOCK_STATE_RELEASING,
339};
340
341/* WatchNotify::ClientId */
342struct rbd_client_id {
343 u64 gid;
344 u64 handle;
345};
346
Alex Elderf84344f2012-08-31 17:29:51 -0500347struct rbd_mapping {
Alex Elder99c1f082012-08-30 14:42:15 -0500348 u64 size;
Alex Elder34b13182012-07-13 20:35:12 -0500349 u64 features;
Alex Elderf84344f2012-08-31 17:29:51 -0500350 bool read_only;
351};
352
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700353/*
354 * a single device
355 */
356struct rbd_device {
Alex Elderde71a292012-07-03 16:01:19 -0500357 int dev_id; /* blkdev unique id */
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700358
359 int major; /* blkdev assigned major */
Ilya Dryomovdd82fff2013-12-13 15:28:57 +0200360 int minor;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700361 struct gendisk *disk; /* blkdev's gendisk and rq */
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700362
Alex Eldera30b71b2012-07-10 20:30:11 -0500363 u32 image_format; /* Either 1 or 2 */
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700364 struct rbd_client *rbd_client;
365
366 char name[DEV_NAME_LEN]; /* blkdev name, e.g. rbd3 */
367
Alex Elderb82d1672013-01-14 12:43:31 -0600368 spinlock_t lock; /* queue, flags, open_count */
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700369
370 struct rbd_image_header header;
Alex Elderb82d1672013-01-14 12:43:31 -0600371 unsigned long flags; /* possibly lock protected */
Alex Elder0d7dbfc2012-10-25 23:34:41 -0500372 struct rbd_spec *spec;
Ilya Dryomovd1475432015-06-22 13:24:48 +0300373 struct rbd_options *opts;
Mike Christie0d6d1e9c2016-08-18 18:38:45 +0200374 char *config_info; /* add{,_single_major} string */
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700375
Ilya Dryomovc41d13a2016-04-29 20:01:25 +0200376 struct ceph_object_id header_oid;
Ilya Dryomov922dab62016-05-26 01:15:02 +0200377 struct ceph_object_locator header_oloc;
Alex Elder971f8392012-10-25 23:34:41 -0500378
Ilya Dryomov1643dfa2016-08-12 15:45:52 +0200379 struct ceph_file_layout layout; /* used for all rbd requests */
Alex Elder0903e872012-11-14 12:25:19 -0600380
Ilya Dryomov99d16942016-08-12 16:11:41 +0200381 struct mutex watch_mutex;
382 enum rbd_watch_state watch_state;
Ilya Dryomov922dab62016-05-26 01:15:02 +0200383 struct ceph_osd_linger_request *watch_handle;
Ilya Dryomov99d16942016-08-12 16:11:41 +0200384 u64 watch_cookie;
385 struct delayed_work watch_dwork;
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700386
Ilya Dryomoved95b212016-08-12 16:40:02 +0200387 struct rw_semaphore lock_rwsem;
388 enum rbd_lock_state lock_state;
389 struct rbd_client_id owner_cid;
390 struct work_struct acquired_lock_work;
391 struct work_struct released_lock_work;
392 struct delayed_work lock_dwork;
393 struct work_struct unlock_work;
394 wait_queue_head_t lock_waitq;
395
Ilya Dryomov1643dfa2016-08-12 15:45:52 +0200396 struct workqueue_struct *task_wq;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700397
Alex Elder86b00e02012-10-25 23:34:42 -0500398 struct rbd_spec *parent_spec;
399 u64 parent_overlap;
Alex Eldera2acd002013-05-08 22:50:04 -0500400 atomic_t parent_ref;
Alex Elder2f82ee52012-10-30 19:40:33 -0500401 struct rbd_device *parent;
Alex Elder86b00e02012-10-25 23:34:42 -0500402
Christoph Hellwig7ad18af2015-01-13 17:20:04 +0100403 /* Block layer tags. */
404 struct blk_mq_tag_set tag_set;
405
Josh Durginc6666012011-11-21 17:11:12 -0800406 /* protects updating the header */
407 struct rw_semaphore header_rwsem;
Alex Elderf84344f2012-08-31 17:29:51 -0500408
409 struct rbd_mapping mapping;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700410
411 struct list_head node;
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800412
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800413 /* sysfs related */
414 struct device dev;
Alex Elderb82d1672013-01-14 12:43:31 -0600415 unsigned long open_count; /* protected by lock */
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800416};
417
Alex Elderb82d1672013-01-14 12:43:31 -0600418/*
Ilya Dryomov87c0fde2016-09-29 13:41:05 +0200419 * Flag bits for rbd_dev->flags:
420 * - REMOVING (which is coupled with rbd_dev->open_count) is protected
421 * by rbd_dev->lock
422 * - BLACKLISTED is protected by rbd_dev->lock_rwsem
Alex Elderb82d1672013-01-14 12:43:31 -0600423 */
Alex Elder6d292902013-01-14 12:43:31 -0600424enum rbd_dev_flags {
425 RBD_DEV_FLAG_EXISTS, /* mapped snapshot has not been deleted */
Alex Elderb82d1672013-01-14 12:43:31 -0600426 RBD_DEV_FLAG_REMOVING, /* this mapping is being removed */
Ilya Dryomov87c0fde2016-09-29 13:41:05 +0200427 RBD_DEV_FLAG_BLACKLISTED, /* our ceph_client is blacklisted */
Alex Elder6d292902013-01-14 12:43:31 -0600428};
429
Alex Eldercfbf6372013-05-31 17:40:45 -0500430static DEFINE_MUTEX(client_mutex); /* Serialize client creation */
Alex Eldere124a82f2012-01-29 13:57:44 -0600431
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700432static LIST_HEAD(rbd_dev_list); /* devices */
Alex Eldere124a82f2012-01-29 13:57:44 -0600433static DEFINE_SPINLOCK(rbd_dev_list_lock);
434
Alex Elder432b8582012-01-29 13:57:44 -0600435static LIST_HEAD(rbd_client_list); /* clients */
436static DEFINE_SPINLOCK(rbd_client_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700437
Alex Elder78c2a442013-05-01 12:43:04 -0500438/* Slab caches for frequently-allocated structures */
439
Alex Elder1c2a9df2013-05-01 12:43:03 -0500440static struct kmem_cache *rbd_img_request_cache;
Alex Elder868311b2013-05-01 12:43:03 -0500441static struct kmem_cache *rbd_obj_request_cache;
Alex Elder78c2a442013-05-01 12:43:04 -0500442static struct kmem_cache *rbd_segment_name_cache;
Alex Elder1c2a9df2013-05-01 12:43:03 -0500443
Ilya Dryomov9b60e702013-12-13 15:28:57 +0200444static int rbd_major;
Ilya Dryomovf8a22fc2013-12-13 15:28:57 +0200445static DEFINE_IDA(rbd_dev_id_ida);
446
Ilya Dryomovf5ee37b2014-10-09 17:06:01 +0400447static struct workqueue_struct *rbd_wq;
448
Ilya Dryomov9b60e702013-12-13 15:28:57 +0200449/*
450 * Default to false for now, as single-major requires >= 0.75 version of
451 * userspace rbd utility.
452 */
453static bool single_major = false;
454module_param(single_major, bool, S_IRUGO);
455MODULE_PARM_DESC(single_major, "Use a single major number for all rbd devices (default: false)");
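
/*
 * Illustrative note (not from the original source): this parameter is
 * set at module load time, e.g.:
 *
 *	modprobe rbd single_major=Y
 *
 * With it enabled, all images share one major number and per-device
 * minors are carved out via RBD_SINGLE_MAJOR_PART_SHIFT (see the
 * helpers below).
 */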

static int rbd_img_request_submit(struct rbd_img_request *img_request);

static ssize_t rbd_add(struct bus_type *bus, const char *buf,
		       size_t count);
static ssize_t rbd_remove(struct bus_type *bus, const char *buf,
			  size_t count);
static ssize_t rbd_add_single_major(struct bus_type *bus, const char *buf,
				    size_t count);
static ssize_t rbd_remove_single_major(struct bus_type *bus, const char *buf,
				       size_t count);
static int rbd_dev_image_probe(struct rbd_device *rbd_dev, int depth);
static void rbd_spec_put(struct rbd_spec *spec);

static int rbd_dev_id_to_minor(int dev_id)
{
	return dev_id << RBD_SINGLE_MAJOR_PART_SHIFT;
}

static int minor_to_rbd_dev_id(int minor)
{
	return minor >> RBD_SINGLE_MAJOR_PART_SHIFT;
}
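
/*
 * Illustrative note (not from the original source): with
 * RBD_SINGLE_MAJOR_PART_SHIFT == 4, each device owns 16 consecutive
 * minors.  For example, dev_id 1 maps to minor 16 (1 << 4), minors
 * 16..31 then address that device and its partitions, and minor 20
 * maps back to dev_id 1 (20 >> 4).
 */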

static bool rbd_is_lock_supported(struct rbd_device *rbd_dev)
{
	return (rbd_dev->header.features & RBD_FEATURE_EXCLUSIVE_LOCK) &&
	       rbd_dev->spec->snap_id == CEPH_NOSNAP &&
	       !rbd_dev->mapping.read_only;
}

static bool __rbd_is_lock_owner(struct rbd_device *rbd_dev)
{
	return rbd_dev->lock_state == RBD_LOCK_STATE_LOCKED ||
	       rbd_dev->lock_state == RBD_LOCK_STATE_RELEASING;
}

static bool rbd_is_lock_owner(struct rbd_device *rbd_dev)
{
	bool is_lock_owner;

	down_read(&rbd_dev->lock_rwsem);
	is_lock_owner = __rbd_is_lock_owner(rbd_dev);
	up_read(&rbd_dev->lock_rwsem);
	return is_lock_owner;
}

static BUS_ATTR(add, S_IWUSR, NULL, rbd_add);
static BUS_ATTR(remove, S_IWUSR, NULL, rbd_remove);
static BUS_ATTR(add_single_major, S_IWUSR, NULL, rbd_add_single_major);
static BUS_ATTR(remove_single_major, S_IWUSR, NULL, rbd_remove_single_major);

static struct attribute *rbd_bus_attrs[] = {
	&bus_attr_add.attr,
	&bus_attr_remove.attr,
	&bus_attr_add_single_major.attr,
	&bus_attr_remove_single_major.attr,
	NULL,
};

static umode_t rbd_bus_is_visible(struct kobject *kobj,
				  struct attribute *attr, int index)
{
	if (!single_major &&
	    (attr == &bus_attr_add_single_major.attr ||
	     attr == &bus_attr_remove_single_major.attr))
		return 0;

	return attr->mode;
}

static const struct attribute_group rbd_bus_group = {
	.attrs = rbd_bus_attrs,
	.is_visible = rbd_bus_is_visible,
};
__ATTRIBUTE_GROUPS(rbd_bus);

static struct bus_type rbd_bus_type = {
	.name		= "rbd",
	.bus_groups	= rbd_bus_groups,
};

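/*
 * Illustrative note (not from the original source): the bus attributes
 * above are the userspace entry points for mapping and unmapping,
 * normally driven by the rbd CLI.  A hypothetical hand-driven mapping
 * might look like:
 *
 *	echo "1.2.3.4:6789 name=admin rbd foo -" > /sys/bus/rbd/add
 *	echo 0 > /sys/bus/rbd/remove
 *
 * See Documentation/ABI/testing/sysfs-bus-rbd for the exact format.
 */
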
static void rbd_root_dev_release(struct device *dev)
{
}

static struct device rbd_root_dev = {
	.init_name =    "rbd",
	.release =      rbd_root_dev_release,
};

static __printf(2, 3)
void rbd_warn(struct rbd_device *rbd_dev, const char *fmt, ...)
{
	struct va_format vaf;
	va_list args;

	va_start(args, fmt);
	vaf.fmt = fmt;
	vaf.va = &args;

	if (!rbd_dev)
		printk(KERN_WARNING "%s: %pV\n", RBD_DRV_NAME, &vaf);
	else if (rbd_dev->disk)
		printk(KERN_WARNING "%s: %s: %pV\n",
			RBD_DRV_NAME, rbd_dev->disk->disk_name, &vaf);
	else if (rbd_dev->spec && rbd_dev->spec->image_name)
		printk(KERN_WARNING "%s: image %s: %pV\n",
			RBD_DRV_NAME, rbd_dev->spec->image_name, &vaf);
	else if (rbd_dev->spec && rbd_dev->spec->image_id)
		printk(KERN_WARNING "%s: id %s: %pV\n",
			RBD_DRV_NAME, rbd_dev->spec->image_id, &vaf);
	else	/* punt */
		printk(KERN_WARNING "%s: rbd_dev %p: %pV\n",
			RBD_DRV_NAME, rbd_dev, &vaf);
	va_end(args);
}

#ifdef RBD_DEBUG
#define rbd_assert(expr)						\
		if (unlikely(!(expr))) {				\
			printk(KERN_ERR "\nAssertion failure in %s() "	\
						"at line %d:\n\n"	\
					"\trbd_assert(%s);\n\n",	\
					__func__, __LINE__, #expr);	\
			BUG();						\
		}
#else /* !RBD_DEBUG */
#  define rbd_assert(expr)	((void) 0)
#endif /* !RBD_DEBUG */
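
/*
 * Illustrative note (not from the original source): rbd_assert() guards
 * internal invariants, e.g. as used later in this file:
 *
 *	rbd_assert(which < rbd_dev->header.snapc->num_snaps);
 *
 * With RBD_DEBUG unset it compiles away to nothing.
 */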
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800586
Ilya Dryomov27617132015-07-16 17:36:11 +0300587static void rbd_osd_copyup_callback(struct rbd_obj_request *obj_request);
Alex Elderb454e362013-04-19 15:34:50 -0500588static int rbd_img_obj_request_submit(struct rbd_obj_request *obj_request);
Alex Elder05a46af2013-04-26 15:44:36 -0500589static void rbd_img_parent_read(struct rbd_obj_request *obj_request);
590static void rbd_dev_remove_parent(struct rbd_device *rbd_dev);
Alex Elder8b3e1a52013-01-24 16:13:36 -0600591
Alex Eldercc4a38bd2013-04-30 00:44:33 -0500592static int rbd_dev_refresh(struct rbd_device *rbd_dev);
Alex Elder2df3fac2013-05-06 09:51:30 -0500593static int rbd_dev_v2_header_onetime(struct rbd_device *rbd_dev);
Ilya Dryomova720ae02014-07-23 17:11:19 +0400594static int rbd_dev_header_info(struct rbd_device *rbd_dev);
Ilya Dryomove8f59b52014-07-24 10:42:13 +0400595static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev);
Alex Elder54cac612013-04-30 00:44:33 -0500596static const char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev,
597 u64 snap_id);
Alex Elder2ad3d712013-04-30 00:44:33 -0500598static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id,
599 u8 *order, u64 *snap_size);
600static int _rbd_dev_v2_snap_features(struct rbd_device *rbd_dev, u64 snap_id,
601 u64 *snap_features);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700602
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700603static int rbd_open(struct block_device *bdev, fmode_t mode)
604{
Alex Elderf0f8cef2012-01-29 13:57:44 -0600605 struct rbd_device *rbd_dev = bdev->bd_disk->private_data;
Alex Elderb82d1672013-01-14 12:43:31 -0600606 bool removing = false;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700607
Alex Elderf84344f2012-08-31 17:29:51 -0500608 if ((mode & FMODE_WRITE) && rbd_dev->mapping.read_only)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700609 return -EROFS;
610
Alex Eldera14ea262013-02-05 13:23:12 -0600611 spin_lock_irq(&rbd_dev->lock);
Alex Elderb82d1672013-01-14 12:43:31 -0600612 if (test_bit(RBD_DEV_FLAG_REMOVING, &rbd_dev->flags))
613 removing = true;
614 else
615 rbd_dev->open_count++;
Alex Eldera14ea262013-02-05 13:23:12 -0600616 spin_unlock_irq(&rbd_dev->lock);
Alex Elderb82d1672013-01-14 12:43:31 -0600617 if (removing)
618 return -ENOENT;
619
Alex Elderc3e946c2012-11-16 09:29:16 -0600620 (void) get_device(&rbd_dev->dev);
Alex Elder340c7a22012-08-10 13:12:07 -0700621
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700622 return 0;
623}
624
Al Virodb2a1442013-05-05 21:52:57 -0400625static void rbd_release(struct gendisk *disk, fmode_t mode)
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800626{
627 struct rbd_device *rbd_dev = disk->private_data;
Alex Elderb82d1672013-01-14 12:43:31 -0600628 unsigned long open_count_before;
629
Alex Eldera14ea262013-02-05 13:23:12 -0600630 spin_lock_irq(&rbd_dev->lock);
Alex Elderb82d1672013-01-14 12:43:31 -0600631 open_count_before = rbd_dev->open_count--;
Alex Eldera14ea262013-02-05 13:23:12 -0600632 spin_unlock_irq(&rbd_dev->lock);
Alex Elderb82d1672013-01-14 12:43:31 -0600633 rbd_assert(open_count_before > 0);
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800634
Alex Elderc3e946c2012-11-16 09:29:16 -0600635 put_device(&rbd_dev->dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800636}
637
Guangliang Zhao131fd9f2013-09-24 11:25:36 +0800638static int rbd_ioctl_set_ro(struct rbd_device *rbd_dev, unsigned long arg)
639{
Josh Durgin77f33c02013-09-30 17:09:54 -0700640 int ret = 0;
Guangliang Zhao131fd9f2013-09-24 11:25:36 +0800641 int val;
642 bool ro;
Josh Durgin77f33c02013-09-30 17:09:54 -0700643 bool ro_changed = false;
Guangliang Zhao131fd9f2013-09-24 11:25:36 +0800644
Josh Durgin77f33c02013-09-30 17:09:54 -0700645 /* get_user() may sleep, so call it before taking rbd_dev->lock */
Guangliang Zhao131fd9f2013-09-24 11:25:36 +0800646 if (get_user(val, (int __user *)(arg)))
647 return -EFAULT;
648
649 ro = val ? true : false;
650 /* Snapshot doesn't allow to write*/
651 if (rbd_dev->spec->snap_id != CEPH_NOSNAP && !ro)
652 return -EROFS;
653
Josh Durgin77f33c02013-09-30 17:09:54 -0700654 spin_lock_irq(&rbd_dev->lock);
655 /* prevent others open this device */
656 if (rbd_dev->open_count > 1) {
657 ret = -EBUSY;
658 goto out;
Guangliang Zhao131fd9f2013-09-24 11:25:36 +0800659 }
660
Josh Durgin77f33c02013-09-30 17:09:54 -0700661 if (rbd_dev->mapping.read_only != ro) {
662 rbd_dev->mapping.read_only = ro;
663 ro_changed = true;
664 }
665
666out:
667 spin_unlock_irq(&rbd_dev->lock);
668 /* set_disk_ro() may sleep, so call it after releasing rbd_dev->lock */
669 if (ret == 0 && ro_changed)
670 set_disk_ro(rbd_dev->disk, ro ? 1 : 0);
671
672 return ret;
Guangliang Zhao131fd9f2013-09-24 11:25:36 +0800673}
674
675static int rbd_ioctl(struct block_device *bdev, fmode_t mode,
676 unsigned int cmd, unsigned long arg)
677{
678 struct rbd_device *rbd_dev = bdev->bd_disk->private_data;
679 int ret = 0;
680
Guangliang Zhao131fd9f2013-09-24 11:25:36 +0800681 switch (cmd) {
682 case BLKROSET:
683 ret = rbd_ioctl_set_ro(rbd_dev, arg);
684 break;
685 default:
686 ret = -ENOTTY;
687 }
688
Guangliang Zhao131fd9f2013-09-24 11:25:36 +0800689 return ret;
690}
691
692#ifdef CONFIG_COMPAT
693static int rbd_compat_ioctl(struct block_device *bdev, fmode_t mode,
694 unsigned int cmd, unsigned long arg)
695{
696 return rbd_ioctl(bdev, mode, cmd, arg);
697}
698#endif /* CONFIG_COMPAT */
699
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700700static const struct block_device_operations rbd_bd_ops = {
701 .owner = THIS_MODULE,
702 .open = rbd_open,
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800703 .release = rbd_release,
Guangliang Zhao131fd9f2013-09-24 11:25:36 +0800704 .ioctl = rbd_ioctl,
705#ifdef CONFIG_COMPAT
706 .compat_ioctl = rbd_compat_ioctl,
707#endif
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700708};
709
710/*
Alex Elder7262cfc2013-05-16 15:04:20 -0500711 * Initialize an rbd client instance. Success or not, this function
Alex Eldercfbf6372013-05-31 17:40:45 -0500712 * consumes ceph_opts. Caller holds client_mutex.
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700713 */
Alex Elderf8c38922012-08-10 13:12:07 -0700714static struct rbd_client *rbd_client_create(struct ceph_options *ceph_opts)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700715{
716 struct rbd_client *rbdc;
717 int ret = -ENOMEM;
718
Alex Elder37206ee2013-02-20 17:32:08 -0600719 dout("%s:\n", __func__);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700720 rbdc = kmalloc(sizeof(struct rbd_client), GFP_KERNEL);
721 if (!rbdc)
722 goto out_opt;
723
724 kref_init(&rbdc->kref);
725 INIT_LIST_HEAD(&rbdc->node);
726
Alex Elder43ae4702012-07-03 16:01:18 -0500727 rbdc->client = ceph_create_client(ceph_opts, rbdc, 0, 0);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700728 if (IS_ERR(rbdc->client))
Alex Elder08f75462013-05-29 11:19:00 -0500729 goto out_rbdc;
Alex Elder43ae4702012-07-03 16:01:18 -0500730 ceph_opts = NULL; /* Now rbdc->client is responsible for ceph_opts */
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700731
732 ret = ceph_open_session(rbdc->client);
733 if (ret < 0)
Alex Elder08f75462013-05-29 11:19:00 -0500734 goto out_client;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700735
Alex Elder432b8582012-01-29 13:57:44 -0600736 spin_lock(&rbd_client_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700737 list_add_tail(&rbdc->node, &rbd_client_list);
Alex Elder432b8582012-01-29 13:57:44 -0600738 spin_unlock(&rbd_client_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700739
Alex Elder37206ee2013-02-20 17:32:08 -0600740 dout("%s: rbdc %p\n", __func__, rbdc);
Alex Elderbc534d82012-01-29 13:57:44 -0600741
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700742 return rbdc;
Alex Elder08f75462013-05-29 11:19:00 -0500743out_client:
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700744 ceph_destroy_client(rbdc->client);
Alex Elder08f75462013-05-29 11:19:00 -0500745out_rbdc:
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700746 kfree(rbdc);
747out_opt:
Alex Elder43ae4702012-07-03 16:01:18 -0500748 if (ceph_opts)
749 ceph_destroy_options(ceph_opts);
Alex Elder37206ee2013-02-20 17:32:08 -0600750 dout("%s: error %d\n", __func__, ret);
751
Vasiliy Kulikov28f259b2010-09-26 12:59:37 +0400752 return ERR_PTR(ret);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700753}
754
Alex Elder2f82ee52012-10-30 19:40:33 -0500755static struct rbd_client *__rbd_get_client(struct rbd_client *rbdc)
756{
757 kref_get(&rbdc->kref);
758
759 return rbdc;
760}
761
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700762/*
Alex Elder1f7ba332012-08-10 13:12:07 -0700763 * Find a ceph client with specific addr and configuration. If
764 * found, bump its reference count.
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700765 */
Alex Elder1f7ba332012-08-10 13:12:07 -0700766static struct rbd_client *rbd_client_find(struct ceph_options *ceph_opts)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700767{
768 struct rbd_client *client_node;
Alex Elder1f7ba332012-08-10 13:12:07 -0700769 bool found = false;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700770
Alex Elder43ae4702012-07-03 16:01:18 -0500771 if (ceph_opts->flags & CEPH_OPT_NOSHARE)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700772 return NULL;
773
Alex Elder1f7ba332012-08-10 13:12:07 -0700774 spin_lock(&rbd_client_list_lock);
775 list_for_each_entry(client_node, &rbd_client_list, node) {
776 if (!ceph_compare_options(ceph_opts, client_node->client)) {
Alex Elder2f82ee52012-10-30 19:40:33 -0500777 __rbd_get_client(client_node);
778
Alex Elder1f7ba332012-08-10 13:12:07 -0700779 found = true;
780 break;
781 }
782 }
783 spin_unlock(&rbd_client_list_lock);
784
785 return found ? client_node : NULL;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700786}
787
788/*
Ilya Dryomov210c1042015-06-22 13:24:48 +0300789 * (Per device) rbd map options
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700790 */
791enum {
Ilya Dryomovb5584182015-06-23 16:21:19 +0300792 Opt_queue_depth,
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700793 Opt_last_int,
794 /* int args above */
795 Opt_last_string,
796 /* string args above */
Alex Eldercc0538b2012-08-10 13:12:07 -0700797 Opt_read_only,
798 Opt_read_write,
Ilya Dryomov80de1912016-09-20 14:23:17 +0200799 Opt_lock_on_read,
Ilya Dryomov210c1042015-06-22 13:24:48 +0300800 Opt_err
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700801};
802
Alex Elder43ae4702012-07-03 16:01:18 -0500803static match_table_t rbd_opts_tokens = {
Ilya Dryomovb5584182015-06-23 16:21:19 +0300804 {Opt_queue_depth, "queue_depth=%d"},
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700805 /* int args above */
806 /* string args above */
Alex Elderbe466c12012-10-22 11:31:26 -0500807 {Opt_read_only, "read_only"},
Alex Eldercc0538b2012-08-10 13:12:07 -0700808 {Opt_read_only, "ro"}, /* Alternate spelling */
809 {Opt_read_write, "read_write"},
810 {Opt_read_write, "rw"}, /* Alternate spelling */
Ilya Dryomov80de1912016-09-20 14:23:17 +0200811 {Opt_lock_on_read, "lock_on_read"},
Ilya Dryomov210c1042015-06-22 13:24:48 +0300812 {Opt_err, NULL}
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700813};
814
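/*
 * Illustrative note (not from the original source): these per-mapping
 * options travel in the options field of the add string; with the
 * userspace rbd tool that is typically something like:
 *
 *	rbd map foo -o queue_depth=128,lock_on_read
 *	rbd map foo -o ro
 */
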
struct rbd_options {
	int	queue_depth;
	bool	read_only;
	bool	lock_on_read;
};

#define RBD_QUEUE_DEPTH_DEFAULT	BLKDEV_MAX_RQ
#define RBD_READ_ONLY_DEFAULT	false
#define RBD_LOCK_ON_READ_DEFAULT false

static int parse_rbd_opts_token(char *c, void *private)
{
	struct rbd_options *rbd_opts = private;
	substring_t argstr[MAX_OPT_ARGS];
	int token, intval, ret;

	token = match_token(c, rbd_opts_tokens, argstr);
	if (token < Opt_last_int) {
		ret = match_int(&argstr[0], &intval);
		if (ret < 0) {
			pr_err("bad mount option arg (not int) at '%s'\n", c);
			return ret;
		}
		dout("got int token %d val %d\n", token, intval);
	} else if (token > Opt_last_int && token < Opt_last_string) {
		dout("got string token %d val %s\n", token, argstr[0].from);
	} else {
		dout("got token %d\n", token);
	}

	switch (token) {
	case Opt_queue_depth:
		if (intval < 1) {
			pr_err("queue_depth out of range\n");
			return -EINVAL;
		}
		rbd_opts->queue_depth = intval;
		break;
	case Opt_read_only:
		rbd_opts->read_only = true;
		break;
	case Opt_read_write:
		rbd_opts->read_only = false;
		break;
	case Opt_lock_on_read:
		rbd_opts->lock_on_read = true;
		break;
	default:
		/* libceph prints "bad option" msg */
		return -EINVAL;
	}

	return 0;
}

static char *obj_op_name(enum obj_operation_type op_type)
{
	switch (op_type) {
	case OBJ_OP_READ:
		return "read";
	case OBJ_OP_WRITE:
		return "write";
	case OBJ_OP_DISCARD:
		return "discard";
	default:
		return "???";
	}
}

/*
 * Get a ceph client with specific addr and configuration; if one does
 * not exist, create it.  Either way, ceph_opts is consumed by this
 * function.
 */
static struct rbd_client *rbd_get_client(struct ceph_options *ceph_opts)
{
	struct rbd_client *rbdc;

	mutex_lock_nested(&client_mutex, SINGLE_DEPTH_NESTING);
	rbdc = rbd_client_find(ceph_opts);
	if (rbdc)	/* using an existing client */
		ceph_destroy_options(ceph_opts);
	else
		rbdc = rbd_client_create(ceph_opts);
	mutex_unlock(&client_mutex);

	return rbdc;
}

/*
 * Destroy ceph client
 *
 * Takes rbd_client_list_lock, so the caller must not hold it.
 */
static void rbd_client_release(struct kref *kref)
{
	struct rbd_client *rbdc = container_of(kref, struct rbd_client, kref);

	dout("%s: rbdc %p\n", __func__, rbdc);
	spin_lock(&rbd_client_list_lock);
	list_del(&rbdc->node);
	spin_unlock(&rbd_client_list_lock);

	ceph_destroy_client(rbdc->client);
	kfree(rbdc);
}

/*
 * Drop reference to ceph client node. If it's not referenced anymore, release
 * it.
 */
static void rbd_put_client(struct rbd_client *rbdc)
{
	if (rbdc)
		kref_put(&rbdc->kref, rbd_client_release);
}

static bool rbd_image_format_valid(u32 image_format)
{
	return image_format == 1 || image_format == 2;
}

static bool rbd_dev_ondisk_valid(struct rbd_image_header_ondisk *ondisk)
{
	size_t size;
	u32 snap_count;

	/* The header has to start with the magic rbd header text */
	if (memcmp(&ondisk->text, RBD_HEADER_TEXT, sizeof (RBD_HEADER_TEXT)))
		return false;

	/* The bio layer requires at least sector-sized I/O */

	if (ondisk->options.order < SECTOR_SHIFT)
		return false;

	/* If we use u64 in a few spots we may be able to loosen this */

	if (ondisk->options.order > 8 * sizeof (int) - 1)
		return false;

	/*
	 * The size of a snapshot header has to fit in a size_t, and
	 * that limits the number of snapshots.
	 */
	snap_count = le32_to_cpu(ondisk->snap_count);
	size = SIZE_MAX - sizeof (struct ceph_snap_context);
	if (snap_count > size / sizeof (__le64))
		return false;

	/*
	 * Not only that, but the size of the entire snapshot
	 * header must also be representable in a size_t.
	 */
	size -= snap_count * sizeof (__le64);
	if ((u64) size < le64_to_cpu(ondisk->snap_names_len))
		return false;

	return true;
}

/*
 * returns the size of an object in the image
 */
static u32 rbd_obj_bytes(struct rbd_image_header *header)
{
	return 1U << header->obj_order;
}
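
/*
 * Illustrative note (not from the original source): obj_order is the
 * log2 of the object size, so an image created with the common default
 * order of 22 is striped over 1U << 22 = 4 MiB objects.
 */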

static void rbd_init_layout(struct rbd_device *rbd_dev)
{
	if (rbd_dev->header.stripe_unit == 0 ||
	    rbd_dev->header.stripe_count == 0) {
		rbd_dev->header.stripe_unit = rbd_obj_bytes(&rbd_dev->header);
		rbd_dev->header.stripe_count = 1;
	}

	rbd_dev->layout.stripe_unit = rbd_dev->header.stripe_unit;
	rbd_dev->layout.stripe_count = rbd_dev->header.stripe_count;
	rbd_dev->layout.object_size = rbd_obj_bytes(&rbd_dev->header);
	rbd_dev->layout.pool_id = rbd_dev->header.data_pool_id == CEPH_NOPOOL ?
			  rbd_dev->spec->pool_id : rbd_dev->header.data_pool_id;
	RCU_INIT_POINTER(rbd_dev->layout.pool_ns, NULL);
}

/*
 * Fill an rbd image header with information from the given format 1
 * on-disk header.
 */
static int rbd_header_from_disk(struct rbd_device *rbd_dev,
				 struct rbd_image_header_ondisk *ondisk)
{
	struct rbd_image_header *header = &rbd_dev->header;
	bool first_time = header->object_prefix == NULL;
	struct ceph_snap_context *snapc;
	char *object_prefix = NULL;
	char *snap_names = NULL;
	u64 *snap_sizes = NULL;
	u32 snap_count;
	int ret = -ENOMEM;
	u32 i;

	/* Allocate this now to avoid having to handle failure below */

	if (first_time) {
		object_prefix = kstrndup(ondisk->object_prefix,
					 sizeof(ondisk->object_prefix),
					 GFP_KERNEL);
		if (!object_prefix)
			return -ENOMEM;
	}

	/* Allocate the snapshot context and fill it in */

	snap_count = le32_to_cpu(ondisk->snap_count);
	snapc = ceph_create_snap_context(snap_count, GFP_KERNEL);
	if (!snapc)
		goto out_err;
	snapc->seq = le64_to_cpu(ondisk->snap_seq);
	if (snap_count) {
		struct rbd_image_snap_ondisk *snaps;
		u64 snap_names_len = le64_to_cpu(ondisk->snap_names_len);

		/* We'll keep a copy of the snapshot names... */

		if (snap_names_len > (u64)SIZE_MAX)
			goto out_2big;
		snap_names = kmalloc(snap_names_len, GFP_KERNEL);
		if (!snap_names)
			goto out_err;

		/* ...as well as the array of their sizes. */
		snap_sizes = kmalloc_array(snap_count,
					   sizeof(*header->snap_sizes),
					   GFP_KERNEL);
		if (!snap_sizes)
			goto out_err;

		/*
		 * Copy the names, and fill in each snapshot's id
		 * and size.
		 *
		 * Note that rbd_dev_v1_header_info() guarantees the
		 * ondisk buffer we're working with has
		 * snap_names_len bytes beyond the end of the
		 * snapshot id array, so this memcpy() is safe.
		 */
		memcpy(snap_names, &ondisk->snaps[snap_count], snap_names_len);
		snaps = ondisk->snaps;
		for (i = 0; i < snap_count; i++) {
			snapc->snaps[i] = le64_to_cpu(snaps[i].id);
			snap_sizes[i] = le64_to_cpu(snaps[i].image_size);
		}
	}

	/* We won't fail any more, fill in the header */

	if (first_time) {
		header->object_prefix = object_prefix;
		header->obj_order = ondisk->options.order;
		rbd_init_layout(rbd_dev);
	} else {
		ceph_put_snap_context(header->snapc);
		kfree(header->snap_names);
		kfree(header->snap_sizes);
	}

	/* The remaining fields always get updated (when we refresh) */

	header->image_size = le64_to_cpu(ondisk->image_size);
	header->snapc = snapc;
	header->snap_names = snap_names;
	header->snap_sizes = snap_sizes;

	return 0;
out_2big:
	ret = -EIO;
out_err:
	kfree(snap_sizes);
	kfree(snap_names);
	ceph_put_snap_context(snapc);
	kfree(object_prefix);

	return ret;
}

static const char *_rbd_dev_v1_snap_name(struct rbd_device *rbd_dev, u32 which)
{
	const char *snap_name;

	rbd_assert(which < rbd_dev->header.snapc->num_snaps);

	/* Skip over names until we find the one we are looking for */

	snap_name = rbd_dev->header.snap_names;
	while (which--)
		snap_name += strlen(snap_name) + 1;

	return kstrdup(snap_name, GFP_KERNEL);
}

/*
 * Snapshot id comparison function for use with qsort()/bsearch().
 * Note that result is for snapshots in *descending* order.
 */
static int snapid_compare_reverse(const void *s1, const void *s2)
{
	u64 snap_id1 = *(u64 *)s1;
	u64 snap_id2 = *(u64 *)s2;

	if (snap_id1 < snap_id2)
		return 1;
	return snap_id1 == snap_id2 ? 0 : -1;
}
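
/*
 * Illustrative note (not from the original source): with a descending
 * snaps array {10, 7, 3}, comparing key 7 against element 10 returns 1
 * (look later in the array) and against element 3 returns -1 (look
 * earlier), which is exactly what bsearch() needs for this ordering.
 */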

/*
 * Search a snapshot context to see if the given snapshot id is
 * present.
 *
 * Returns the position of the snapshot id in the array if it's found,
 * or BAD_SNAP_INDEX otherwise.
 *
 * Note: The snapshot array is kept sorted (by the osd) in
 * reverse order, highest snapshot id first.
 */
static u32 rbd_dev_snap_index(struct rbd_device *rbd_dev, u64 snap_id)
{
	struct ceph_snap_context *snapc = rbd_dev->header.snapc;
	u64 *found;

	found = bsearch(&snap_id, &snapc->snaps, snapc->num_snaps,
				sizeof (snap_id), snapid_compare_reverse);

	return found ? (u32)(found - &snapc->snaps[0]) : BAD_SNAP_INDEX;
}

static const char *rbd_dev_v1_snap_name(struct rbd_device *rbd_dev,
					u64 snap_id)
{
	u32 which;
	const char *snap_name;

	which = rbd_dev_snap_index(rbd_dev, snap_id);
	if (which == BAD_SNAP_INDEX)
		return ERR_PTR(-ENOENT);

	snap_name = _rbd_dev_v1_snap_name(rbd_dev, which);
	return snap_name ? snap_name : ERR_PTR(-ENOMEM);
}

static const char *rbd_snap_name(struct rbd_device *rbd_dev, u64 snap_id)
{
	if (snap_id == CEPH_NOSNAP)
		return RBD_SNAP_HEAD_NAME;

	rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
	if (rbd_dev->image_format == 1)
		return rbd_dev_v1_snap_name(rbd_dev, snap_id);

	return rbd_dev_v2_snap_name(rbd_dev, snap_id);
}

static int rbd_snap_size(struct rbd_device *rbd_dev, u64 snap_id,
				u64 *snap_size)
{
	rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
	if (snap_id == CEPH_NOSNAP) {
		*snap_size = rbd_dev->header.image_size;
	} else if (rbd_dev->image_format == 1) {
		u32 which;

		which = rbd_dev_snap_index(rbd_dev, snap_id);
		if (which == BAD_SNAP_INDEX)
			return -ENOENT;

		*snap_size = rbd_dev->header.snap_sizes[which];
	} else {
		u64 size = 0;
		int ret;

		ret = _rbd_dev_v2_snap_size(rbd_dev, snap_id, NULL, &size);
		if (ret)
			return ret;

		*snap_size = size;
	}
	return 0;
}

static int rbd_snap_features(struct rbd_device *rbd_dev, u64 snap_id,
			u64 *snap_features)
{
	rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
	if (snap_id == CEPH_NOSNAP) {
		*snap_features = rbd_dev->header.features;
	} else if (rbd_dev->image_format == 1) {
		*snap_features = 0;	/* No features for format 1 */
	} else {
		u64 features = 0;
		int ret;

		ret = _rbd_dev_v2_snap_features(rbd_dev, snap_id, &features);
		if (ret)
			return ret;

		*snap_features = features;
	}
	return 0;
}

static int rbd_dev_mapping_set(struct rbd_device *rbd_dev)
{
	u64 snap_id = rbd_dev->spec->snap_id;
	u64 size = 0;
	u64 features = 0;
	int ret;

	ret = rbd_snap_size(rbd_dev, snap_id, &size);
	if (ret)
		return ret;
	ret = rbd_snap_features(rbd_dev, snap_id, &features);
	if (ret)
		return ret;

	rbd_dev->mapping.size = size;
	rbd_dev->mapping.features = features;

	return 0;
}

static void rbd_dev_mapping_clear(struct rbd_device *rbd_dev)
{
	rbd_dev->mapping.size = 0;
	rbd_dev->mapping.features = 0;
}

static void rbd_segment_name_free(const char *name)
{
	/* The explicit cast here is needed to drop the const qualifier */

	kmem_cache_free(rbd_segment_name_cache, (void *)name);
}

static const char *rbd_segment_name(struct rbd_device *rbd_dev, u64 offset)
{
	char *name;
	u64 segment;
	int ret;
	char *name_format;

	name = kmem_cache_alloc(rbd_segment_name_cache, GFP_NOIO);
	if (!name)
		return NULL;
	segment = offset >> rbd_dev->header.obj_order;
	name_format = "%s.%012llx";
	if (rbd_dev->image_format == 2)
		name_format = "%s.%016llx";
	ret = snprintf(name, CEPH_MAX_OID_NAME_LEN + 1, name_format,
			rbd_dev->header.object_prefix, segment);
	if (ret < 0 || ret > CEPH_MAX_OID_NAME_LEN) {
		pr_err("error formatting segment name for #%llu (%d)\n",
			segment, ret);
		rbd_segment_name_free(name);
		name = NULL;
	}

	return name;
}
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001283
Alex Elder65ccfe22012-08-09 10:33:26 -07001284static u64 rbd_segment_offset(struct rbd_device *rbd_dev, u64 offset)
1285{
Ilya Dryomov5bc3fb12017-01-25 18:16:22 +01001286 u64 segment_size = rbd_obj_bytes(&rbd_dev->header);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001287
Alex Elder65ccfe22012-08-09 10:33:26 -07001288 return offset & (segment_size - 1);
1289}
1290
1291static u64 rbd_segment_length(struct rbd_device *rbd_dev,
1292 u64 offset, u64 length)
1293{
Ilya Dryomov5bc3fb12017-01-25 18:16:22 +01001294 u64 segment_size = rbd_obj_bytes(&rbd_dev->header);
Alex Elder65ccfe22012-08-09 10:33:26 -07001295
1296 offset &= segment_size - 1;
1297
Alex Elderaafb2302012-09-06 16:00:54 -05001298 rbd_assert(length <= U64_MAX - offset);
Alex Elder65ccfe22012-08-09 10:33:26 -07001299 if (offset + length > segment_size)
1300 length = segment_size - offset;
1301
1302 return length;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001303}
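
/*
 * Worked example of the segment math above (values assumed for
 * illustration): with the default object order of 22 (4 MiB objects),
 * a format 2 image whose object_prefix is "rbd_data.101a6b8b4567",
 * and an image offset of 10 MiB (0xa00000):
 *
 *	segment              = 0xa00000 >> 22             = 2
 *	rbd_segment_name()  -> "rbd_data.101a6b8b4567.0000000000000002"
 *	rbd_segment_offset() = 0xa00000 & (0x400000 - 1)  = 0x200000 (2 MiB)
 *
 * and rbd_segment_length() clamps a 3 MiB request starting there to
 * 2 MiB, the space left in that object; the remaining 1 MiB falls
 * into segment 3.
 */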

/*
 * bio helpers
 */

static void bio_chain_put(struct bio *chain)
{
	struct bio *tmp;

	while (chain) {
		tmp = chain;
		chain = chain->bi_next;
		bio_put(tmp);
	}
}

/*
 * Zero a bio chain, starting at a specific offset
 */
static void zero_bio_chain(struct bio *chain, int start_ofs)
{
	struct bio_vec bv;
	struct bvec_iter iter;
	unsigned long flags;
	void *buf;
	int pos = 0;

	while (chain) {
		bio_for_each_segment(bv, chain, iter) {
			if (pos + bv.bv_len > start_ofs) {
				int remainder = max(start_ofs - pos, 0);
				buf = bvec_kmap_irq(&bv, &flags);
				memset(buf + remainder, 0,
				       bv.bv_len - remainder);
				flush_dcache_page(bv.bv_page);
				bvec_kunmap_irq(buf, &flags);
			}
			pos += bv.bv_len;
		}

		chain = chain->bi_next;
	}
}

/*
 * Similar to zero_bio_chain(), zero data defined by a page array,
 * starting at the given byte offset from the start of the array and
 * continuing up to the given end offset.  The pages array is
 * assumed to be big enough to hold all bytes up to the end.
 */
static void zero_pages(struct page **pages, u64 offset, u64 end)
{
	struct page **page = &pages[offset >> PAGE_SHIFT];

	rbd_assert(end > offset);
	rbd_assert(end - offset <= (u64)SIZE_MAX);
	while (offset < end) {
		size_t page_offset;
		size_t length;
		unsigned long flags;
		void *kaddr;

		page_offset = offset & ~PAGE_MASK;
		length = min_t(size_t, PAGE_SIZE - page_offset, end - offset);
		local_irq_save(flags);
		kaddr = kmap_atomic(*page);
		memset(kaddr + page_offset, 0, length);
		flush_dcache_page(*page);
		kunmap_atomic(kaddr);
		local_irq_restore(flags);

		offset += length;
		page++;
	}
}
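
/*
 * Illustrative sketch (assumed sizes): a 4096-byte read that came back
 * short, with only xferred = 1024 bytes valid, is cleaned up as
 *
 *	zero_bio_chain(obj_request->bio_list, 1024);	// bio-backed
 *	zero_pages(obj_request->pages, 1024, 4096);	// page-backed
 *
 * so every byte past the short read reaches the caller as zeroes.
 * This is exactly what rbd_img_obj_request_read_callback() does
 * further below.
 */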

/*
 * Clone a portion of a bio, starting at the given byte offset
 * and continuing for the number of bytes indicated.
 */
static struct bio *bio_clone_range(struct bio *bio_src,
					unsigned int offset,
					unsigned int len,
					gfp_t gfpmask)
{
	struct bio *bio;

	bio = bio_clone(bio_src, gfpmask);
	if (!bio)
		return NULL;	/* ENOMEM */

	bio_advance(bio, offset);
	bio->bi_iter.bi_size = len;

	return bio;
}

/*
 * Clone a portion of a bio chain, starting at the given byte offset
 * into the first bio in the source chain and continuing for the
 * number of bytes indicated.  The result is another bio chain of
 * exactly the given length, or a null pointer on error.
 *
 * The bio_src and offset parameters are both in-out.  On entry they
 * refer to the first source bio and the offset into that bio where
 * the start of data to be cloned is located.
 *
 * On return, bio_src is updated to refer to the bio in the source
 * chain that contains the first un-cloned byte, and *offset will
 * contain the offset of that byte within that bio.
 */
static struct bio *bio_chain_clone_range(struct bio **bio_src,
					unsigned int *offset,
					unsigned int len,
					gfp_t gfpmask)
{
	struct bio *bi = *bio_src;
	unsigned int off = *offset;
	struct bio *chain = NULL;
	struct bio **end;

	/* Build up a chain of clone bios up to the limit */

	if (!bi || off >= bi->bi_iter.bi_size || !len)
		return NULL;		/* Nothing to clone */

	end = &chain;
	while (len) {
		unsigned int bi_size;
		struct bio *bio;

		if (!bi) {
			rbd_warn(NULL, "bio_chain exhausted with %u left", len);
			goto out_err;	/* EINVAL; ran out of bios */
		}
		bi_size = min_t(unsigned int, bi->bi_iter.bi_size - off, len);
		bio = bio_clone_range(bi, off, bi_size, gfpmask);
		if (!bio)
			goto out_err;	/* ENOMEM */

		*end = bio;
		end = &bio->bi_next;

		off += bi_size;
		if (off == bi->bi_iter.bi_size) {
			bi = bi->bi_next;
			off = 0;
		}
		len -= bi_size;
	}
	*bio_src = bi;
	*offset = off;

	return chain;
out_err:
	bio_chain_put(chain);

	return NULL;
}
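
/*
 * Usage sketch (assumed sizes): rbd_img_request_fill() below walks a
 * request's bio chain with this helper, peeling off one object's worth
 * at a time.  With 4 MiB objects and a 6 MiB source chain:
 *
 *	struct bio *bi = original_chain;	// hypothetical locals
 *	unsigned int off = 0;
 *
 *	clone1 = bio_chain_clone_range(&bi, &off, 4 << 20, GFP_NOIO);
 *	// clone1 covers bytes 0..4M; bi/off now point at byte 4M
 *	clone2 = bio_chain_clone_range(&bi, &off, 2 << 20, GFP_NOIO);
 *	// clone2 covers bytes 4M..6M
 */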

/*
 * The default/initial value for all object request flags is 0.  For
 * each flag, once its value is set to 1 it is never reset to 0
 * again.
 */
static void obj_request_img_data_set(struct rbd_obj_request *obj_request)
{
	if (test_and_set_bit(OBJ_REQ_IMG_DATA, &obj_request->flags)) {
		struct rbd_device *rbd_dev;

		rbd_dev = obj_request->img_request->rbd_dev;
		rbd_warn(rbd_dev, "obj_request %p already marked img_data",
			obj_request);
	}
}

static bool obj_request_img_data_test(struct rbd_obj_request *obj_request)
{
	smp_mb();
	return test_bit(OBJ_REQ_IMG_DATA, &obj_request->flags) != 0;
}

static void obj_request_done_set(struct rbd_obj_request *obj_request)
{
	if (test_and_set_bit(OBJ_REQ_DONE, &obj_request->flags)) {
		struct rbd_device *rbd_dev = NULL;

		if (obj_request_img_data_test(obj_request))
			rbd_dev = obj_request->img_request->rbd_dev;
		rbd_warn(rbd_dev, "obj_request %p already marked done",
			obj_request);
	}
}

static bool obj_request_done_test(struct rbd_obj_request *obj_request)
{
	smp_mb();
	return test_bit(OBJ_REQ_DONE, &obj_request->flags) != 0;
}

/*
 * This sets the KNOWN flag after (possibly) setting the EXISTS
 * flag.  The latter is set based on the "exists" value provided.
 *
 * Note that for our purposes once an object exists it never goes
 * away again.  It's possible that the responses from two existence
 * checks are separated by the creation of the target object, and
 * the first ("doesn't exist") response arrives *after* the second
 * ("does exist").  In that case we ignore the second one.
 */
static void obj_request_existence_set(struct rbd_obj_request *obj_request,
				bool exists)
{
	if (exists)
		set_bit(OBJ_REQ_EXISTS, &obj_request->flags);
	set_bit(OBJ_REQ_KNOWN, &obj_request->flags);
	smp_mb();
}

static bool obj_request_known_test(struct rbd_obj_request *obj_request)
{
	smp_mb();
	return test_bit(OBJ_REQ_KNOWN, &obj_request->flags) != 0;
}

static bool obj_request_exists_test(struct rbd_obj_request *obj_request)
{
	smp_mb();
	return test_bit(OBJ_REQ_EXISTS, &obj_request->flags) != 0;
}
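
/*
 * Timeline illustrating the race described above (hypothetical):
 *
 *	stat A sent ---- target object created ---- stat B sent
 *	reply B ("exists") arrives, then reply A ("doesn't exist")
 *
 * obj_request_existence_set(req, true) followed by
 * obj_request_existence_set(req, false) leaves EXISTS set, because
 * the EXISTS bit is only ever set, never cleared; the late "doesn't
 * exist" answer merely re-sets KNOWN.
 */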

static bool obj_request_overlaps_parent(struct rbd_obj_request *obj_request)
{
	struct rbd_device *rbd_dev = obj_request->img_request->rbd_dev;

	return obj_request->img_offset <
	    round_up(rbd_dev->parent_overlap, rbd_obj_bytes(&rbd_dev->header));
}

static void rbd_obj_request_get(struct rbd_obj_request *obj_request)
{
	dout("%s: obj %p (was %d)\n", __func__, obj_request,
		atomic_read(&obj_request->kref.refcount));
	kref_get(&obj_request->kref);
}

static void rbd_obj_request_destroy(struct kref *kref);
static void rbd_obj_request_put(struct rbd_obj_request *obj_request)
{
	rbd_assert(obj_request != NULL);
	dout("%s: obj %p (was %d)\n", __func__, obj_request,
		atomic_read(&obj_request->kref.refcount));
	kref_put(&obj_request->kref, rbd_obj_request_destroy);
}

static void rbd_img_request_get(struct rbd_img_request *img_request)
{
	dout("%s: img %p (was %d)\n", __func__, img_request,
	     atomic_read(&img_request->kref.refcount));
	kref_get(&img_request->kref);
}

static bool img_request_child_test(struct rbd_img_request *img_request);
static void rbd_parent_request_destroy(struct kref *kref);
static void rbd_img_request_destroy(struct kref *kref);
static void rbd_img_request_put(struct rbd_img_request *img_request)
{
	rbd_assert(img_request != NULL);
	dout("%s: img %p (was %d)\n", __func__, img_request,
		atomic_read(&img_request->kref.refcount));
	if (img_request_child_test(img_request))
		kref_put(&img_request->kref, rbd_parent_request_destroy);
	else
		kref_put(&img_request->kref, rbd_img_request_destroy);
}

static inline void rbd_img_obj_request_add(struct rbd_img_request *img_request,
					struct rbd_obj_request *obj_request)
{
	rbd_assert(obj_request->img_request == NULL);

	/* Image request now owns object's original reference */
	obj_request->img_request = img_request;
	obj_request->which = img_request->obj_request_count;
	rbd_assert(!obj_request_img_data_test(obj_request));
	obj_request_img_data_set(obj_request);
	rbd_assert(obj_request->which != BAD_WHICH);
	img_request->obj_request_count++;
	list_add_tail(&obj_request->links, &img_request->obj_requests);
	dout("%s: img %p obj %p w=%u\n", __func__, img_request, obj_request,
		obj_request->which);
}

static inline void rbd_img_obj_request_del(struct rbd_img_request *img_request,
					struct rbd_obj_request *obj_request)
{
	rbd_assert(obj_request->which != BAD_WHICH);

	dout("%s: img %p obj %p w=%u\n", __func__, img_request, obj_request,
		obj_request->which);
	list_del(&obj_request->links);
	rbd_assert(img_request->obj_request_count > 0);
	img_request->obj_request_count--;
	rbd_assert(obj_request->which == img_request->obj_request_count);
	obj_request->which = BAD_WHICH;
	rbd_assert(obj_request_img_data_test(obj_request));
	rbd_assert(obj_request->img_request == img_request);
	obj_request->img_request = NULL;
	obj_request->callback = NULL;
	rbd_obj_request_put(obj_request);
}

static bool obj_request_type_valid(enum obj_request_type type)
{
	switch (type) {
	case OBJ_REQUEST_NODATA:
	case OBJ_REQUEST_BIO:
	case OBJ_REQUEST_PAGES:
		return true;
	default:
		return false;
	}
}

static void rbd_img_obj_callback(struct rbd_obj_request *obj_request);

static void rbd_obj_request_submit(struct rbd_obj_request *obj_request)
{
	struct ceph_osd_request *osd_req = obj_request->osd_req;

	dout("%s %p \"%s\" %llu~%llu osd_req %p\n", __func__,
	     obj_request, obj_request->object_name, obj_request->offset,
	     obj_request->length, osd_req);
	if (obj_request_img_data_test(obj_request)) {
		WARN_ON(obj_request->callback != rbd_img_obj_callback);
		rbd_img_request_get(obj_request->img_request);
	}
	ceph_osdc_start_request(osd_req->r_osdc, osd_req, false);
}

static void rbd_img_request_complete(struct rbd_img_request *img_request)
{
	dout("%s: img %p\n", __func__, img_request);

	/*
	 * If no error occurred, compute the aggregate transfer
	 * count for the image request.  We could instead use
	 * atomic64_cmpxchg() to update it as each object request
	 * completes; it's not clear offhand which way is better.
	 */
	if (!img_request->result) {
		struct rbd_obj_request *obj_request;
		u64 xferred = 0;

		for_each_obj_request(img_request, obj_request)
			xferred += obj_request->xferred;
		img_request->xferred = xferred;
	}

	if (img_request->callback)
		img_request->callback(img_request);
	else
		rbd_img_request_put(img_request);
}

/*
 * The default/initial value for all image request flags is 0.  Each
 * is conditionally set to 1 at image request initialization time
 * and currently never changes thereafter.
 */
static void img_request_write_set(struct rbd_img_request *img_request)
{
	set_bit(IMG_REQ_WRITE, &img_request->flags);
	smp_mb();
}

static bool img_request_write_test(struct rbd_img_request *img_request)
{
	smp_mb();
	return test_bit(IMG_REQ_WRITE, &img_request->flags) != 0;
}

/*
 * Set the discard flag when the img_request is a discard request
 */
static void img_request_discard_set(struct rbd_img_request *img_request)
{
	set_bit(IMG_REQ_DISCARD, &img_request->flags);
	smp_mb();
}

static bool img_request_discard_test(struct rbd_img_request *img_request)
{
	smp_mb();
	return test_bit(IMG_REQ_DISCARD, &img_request->flags) != 0;
}

static void img_request_child_set(struct rbd_img_request *img_request)
{
	set_bit(IMG_REQ_CHILD, &img_request->flags);
	smp_mb();
}

static void img_request_child_clear(struct rbd_img_request *img_request)
{
	clear_bit(IMG_REQ_CHILD, &img_request->flags);
	smp_mb();
}

static bool img_request_child_test(struct rbd_img_request *img_request)
{
	smp_mb();
	return test_bit(IMG_REQ_CHILD, &img_request->flags) != 0;
}

static void img_request_layered_set(struct rbd_img_request *img_request)
{
	set_bit(IMG_REQ_LAYERED, &img_request->flags);
	smp_mb();
}

static void img_request_layered_clear(struct rbd_img_request *img_request)
{
	clear_bit(IMG_REQ_LAYERED, &img_request->flags);
	smp_mb();
}

static bool img_request_layered_test(struct rbd_img_request *img_request)
{
	smp_mb();
	return test_bit(IMG_REQ_LAYERED, &img_request->flags) != 0;
}

static enum obj_operation_type
rbd_img_request_op_type(struct rbd_img_request *img_request)
{
	if (img_request_write_test(img_request))
		return OBJ_OP_WRITE;
	else if (img_request_discard_test(img_request))
		return OBJ_OP_DISCARD;
	else
		return OBJ_OP_READ;
}

static void
rbd_img_obj_request_read_callback(struct rbd_obj_request *obj_request)
{
	u64 xferred = obj_request->xferred;
	u64 length = obj_request->length;

	dout("%s: obj %p img %p result %d %llu/%llu\n", __func__,
		obj_request, obj_request->img_request, obj_request->result,
		xferred, length);
	/*
	 * ENOENT means a hole in the image.  We zero-fill the entire
	 * length of the request.  A short read also implies zero-fill
	 * to the end of the request.  An error requires the whole
	 * length of the request to be reported finished with an error
	 * to the block layer.  In each case we update the xferred
	 * count to indicate the whole request was satisfied.
	 */
	rbd_assert(obj_request->type != OBJ_REQUEST_NODATA);
	if (obj_request->result == -ENOENT) {
		if (obj_request->type == OBJ_REQUEST_BIO)
			zero_bio_chain(obj_request->bio_list, 0);
		else
			zero_pages(obj_request->pages, 0, length);
		obj_request->result = 0;
	} else if (xferred < length && !obj_request->result) {
		if (obj_request->type == OBJ_REQUEST_BIO)
			zero_bio_chain(obj_request->bio_list, xferred);
		else
			zero_pages(obj_request->pages, xferred, length);
	}
	obj_request->xferred = length;
	obj_request_done_set(obj_request);
}

static void rbd_obj_request_complete(struct rbd_obj_request *obj_request)
{
	dout("%s: obj %p cb %p\n", __func__, obj_request,
		obj_request->callback);
	if (obj_request->callback)
		obj_request->callback(obj_request);
	else
		complete_all(&obj_request->completion);
}

static void rbd_obj_request_error(struct rbd_obj_request *obj_request, int err)
{
	obj_request->result = err;
	obj_request->xferred = 0;
	/*
	 * kludge - mirror rbd_obj_request_submit() to match a put in
	 * rbd_img_obj_callback()
	 */
	if (obj_request_img_data_test(obj_request)) {
		WARN_ON(obj_request->callback != rbd_img_obj_callback);
		rbd_img_request_get(obj_request->img_request);
	}
	obj_request_done_set(obj_request);
	rbd_obj_request_complete(obj_request);
}

static void rbd_osd_read_callback(struct rbd_obj_request *obj_request)
{
	struct rbd_img_request *img_request = NULL;
	struct rbd_device *rbd_dev = NULL;
	bool layered = false;

	if (obj_request_img_data_test(obj_request)) {
		img_request = obj_request->img_request;
		layered = img_request && img_request_layered_test(img_request);
		rbd_dev = img_request->rbd_dev;
	}

	dout("%s: obj %p img %p result %d %llu/%llu\n", __func__,
		obj_request, img_request, obj_request->result,
		obj_request->xferred, obj_request->length);
	if (layered && obj_request->result == -ENOENT &&
			obj_request->img_offset < rbd_dev->parent_overlap)
		rbd_img_parent_read(obj_request);
	else if (img_request)
		rbd_img_obj_request_read_callback(obj_request);
	else
		obj_request_done_set(obj_request);
}

static void rbd_osd_write_callback(struct rbd_obj_request *obj_request)
{
	dout("%s: obj %p result %d %llu\n", __func__, obj_request,
		obj_request->result, obj_request->length);
	/*
	 * There is no such thing as a successful short write.  Set
	 * it to our originally-requested length.
	 */
	obj_request->xferred = obj_request->length;
	obj_request_done_set(obj_request);
}

static void rbd_osd_discard_callback(struct rbd_obj_request *obj_request)
{
	dout("%s: obj %p result %d %llu\n", __func__, obj_request,
		obj_request->result, obj_request->length);
	/*
	 * There is no such thing as a successful short discard.  Set
	 * it to our originally-requested length.
	 */
	obj_request->xferred = obj_request->length;
	/* discarding a non-existent object is not a problem */
	if (obj_request->result == -ENOENT)
		obj_request->result = 0;
	obj_request_done_set(obj_request);
}

/*
 * For a simple stat call there's nothing to do.  We'll do more if
 * this is part of a write sequence for a layered image.
 */
static void rbd_osd_stat_callback(struct rbd_obj_request *obj_request)
{
	dout("%s: obj %p\n", __func__, obj_request);
	obj_request_done_set(obj_request);
}

static void rbd_osd_call_callback(struct rbd_obj_request *obj_request)
{
	dout("%s: obj %p\n", __func__, obj_request);

	if (obj_request_img_data_test(obj_request))
		rbd_osd_copyup_callback(obj_request);
	else
		obj_request_done_set(obj_request);
}

static void rbd_osd_req_callback(struct ceph_osd_request *osd_req)
{
	struct rbd_obj_request *obj_request = osd_req->r_priv;
	u16 opcode;

	dout("%s: osd_req %p\n", __func__, osd_req);
	rbd_assert(osd_req == obj_request->osd_req);
	if (obj_request_img_data_test(obj_request)) {
		rbd_assert(obj_request->img_request);
		rbd_assert(obj_request->which != BAD_WHICH);
	} else {
		rbd_assert(obj_request->which == BAD_WHICH);
	}

	if (osd_req->r_result < 0)
		obj_request->result = osd_req->r_result;

	/*
	 * We support a 64-bit length, but ultimately it has to be
	 * passed to the block layer, which just supports a 32-bit
	 * length field.
	 */
	obj_request->xferred = osd_req->r_ops[0].outdata_len;
	rbd_assert(obj_request->xferred < (u64)UINT_MAX);

	opcode = osd_req->r_ops[0].op;
	switch (opcode) {
	case CEPH_OSD_OP_READ:
		rbd_osd_read_callback(obj_request);
		break;
	case CEPH_OSD_OP_SETALLOCHINT:
		rbd_assert(osd_req->r_ops[1].op == CEPH_OSD_OP_WRITE ||
			   osd_req->r_ops[1].op == CEPH_OSD_OP_WRITEFULL);
		/* fall through */
	case CEPH_OSD_OP_WRITE:
	case CEPH_OSD_OP_WRITEFULL:
		rbd_osd_write_callback(obj_request);
		break;
	case CEPH_OSD_OP_STAT:
		rbd_osd_stat_callback(obj_request);
		break;
	case CEPH_OSD_OP_DELETE:
	case CEPH_OSD_OP_TRUNCATE:
	case CEPH_OSD_OP_ZERO:
		rbd_osd_discard_callback(obj_request);
		break;
	case CEPH_OSD_OP_CALL:
		rbd_osd_call_callback(obj_request);
		break;
	default:
		rbd_warn(NULL, "%s: unsupported op %hu",
			obj_request->object_name, (unsigned short) opcode);
		break;
	}

	if (obj_request_done_test(obj_request))
		rbd_obj_request_complete(obj_request);
}

static void rbd_osd_req_format_read(struct rbd_obj_request *obj_request)
{
	struct ceph_osd_request *osd_req = obj_request->osd_req;

	rbd_assert(obj_request_img_data_test(obj_request));
	osd_req->r_snapid = obj_request->img_request->snap_id;
}

static void rbd_osd_req_format_write(struct rbd_obj_request *obj_request)
{
	struct ceph_osd_request *osd_req = obj_request->osd_req;

	osd_req->r_mtime = CURRENT_TIME;
	osd_req->r_data_offset = obj_request->offset;
}

static struct ceph_osd_request *
__rbd_osd_req_create(struct rbd_device *rbd_dev,
		     struct ceph_snap_context *snapc,
		     int num_ops, unsigned int flags,
		     struct rbd_obj_request *obj_request)
{
	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
	struct ceph_osd_request *req;

	req = ceph_osdc_alloc_request(osdc, snapc, num_ops, false, GFP_NOIO);
	if (!req)
		return NULL;

	req->r_flags = flags;
	req->r_callback = rbd_osd_req_callback;
	req->r_priv = obj_request;

	req->r_base_oloc.pool = rbd_dev->layout.pool_id;
	if (ceph_oid_aprintf(&req->r_base_oid, GFP_NOIO, "%s",
			     obj_request->object_name))
		goto err_req;

	if (ceph_osdc_alloc_messages(req, GFP_NOIO))
		goto err_req;

	return req;

err_req:
	ceph_osdc_put_request(req);
	return NULL;
}

/*
 * Create an osd request.  A read request has one osd op (read).
 * A write request has either one (watch) or two (hint+write) osd ops.
 * (All rbd data writes are prefixed with an allocation hint op, but
 * technically osd watch is a write request, hence this distinction.)
 */
static struct ceph_osd_request *rbd_osd_req_create(
					struct rbd_device *rbd_dev,
					enum obj_operation_type op_type,
					unsigned int num_ops,
					struct rbd_obj_request *obj_request)
{
	struct ceph_snap_context *snapc = NULL;

	if (obj_request_img_data_test(obj_request) &&
		(op_type == OBJ_OP_DISCARD || op_type == OBJ_OP_WRITE)) {
		struct rbd_img_request *img_request = obj_request->img_request;
		if (op_type == OBJ_OP_WRITE) {
			rbd_assert(img_request_write_test(img_request));
		} else {
			rbd_assert(img_request_discard_test(img_request));
		}
		snapc = img_request->snapc;
	}

	rbd_assert(num_ops == 1 || ((op_type == OBJ_OP_WRITE) && num_ops == 2));

	return __rbd_osd_req_create(rbd_dev, snapc, num_ops,
	    (op_type == OBJ_OP_WRITE || op_type == OBJ_OP_DISCARD) ?
	    CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK : CEPH_OSD_FLAG_READ,
	    obj_request);
}
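
/*
 * Call sketch: per the comment above, a plain data read and a data
 * write differ only in op count and flags:
 *
 *	osd_req = rbd_osd_req_create(rbd_dev, OBJ_OP_READ, 1, obj_request);
 *	osd_req = rbd_osd_req_create(rbd_dev, OBJ_OP_WRITE, 2, obj_request);
 *
 * The write passes 2 because rbd_img_obj_request_fill() adds an
 * allocation hint op in front of the write op; writes and discards get
 * CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK, reads get
 * CEPH_OSD_FLAG_READ.
 */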

/*
 * Create a copyup osd request based on the information in the object
 * request supplied.  A copyup request has two or three osd ops, a
 * copyup method call, potentially a hint op, and a write or truncate
 * or zero op.
 */
static struct ceph_osd_request *
rbd_osd_req_create_copyup(struct rbd_obj_request *obj_request)
{
	struct rbd_img_request *img_request;
	int num_osd_ops = 3;

	rbd_assert(obj_request_img_data_test(obj_request));
	img_request = obj_request->img_request;
	rbd_assert(img_request);
	rbd_assert(img_request_write_test(img_request) ||
			img_request_discard_test(img_request));

	if (img_request_discard_test(img_request))
		num_osd_ops = 2;

	return __rbd_osd_req_create(img_request->rbd_dev,
				    img_request->snapc, num_osd_ops,
				    CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
				    obj_request);
}

static void rbd_osd_req_destroy(struct ceph_osd_request *osd_req)
{
	ceph_osdc_put_request(osd_req);
}

/* object_name is assumed to be a non-null pointer and NUL-terminated */

static struct rbd_obj_request *rbd_obj_request_create(const char *object_name,
						enum obj_request_type type)
{
	struct rbd_obj_request *obj_request;
	size_t size;
	char *name;

	rbd_assert(obj_request_type_valid(type));

	size = strlen(object_name) + 1;
	name = kmalloc(size, GFP_NOIO);
	if (!name)
		return NULL;

	obj_request = kmem_cache_zalloc(rbd_obj_request_cache, GFP_NOIO);
	if (!obj_request) {
		kfree(name);
		return NULL;
	}

	obj_request->object_name = memcpy(name, object_name, size);
	obj_request->which = BAD_WHICH;
	obj_request->type = type;
	INIT_LIST_HEAD(&obj_request->links);
	init_completion(&obj_request->completion);
	kref_init(&obj_request->kref);

	dout("%s %p\n", __func__, obj_request);
	return obj_request;
}
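
/*
 * Lifecycle sketch for a standalone (non-image-data) request; the
 * object name here is hypothetical:
 *
 *	obj_request = rbd_obj_request_create("rbd_header.101a6b8b4567",
 *					     OBJ_REQUEST_NODATA);
 *	obj_request->osd_req = rbd_osd_req_create(rbd_dev, OBJ_OP_READ, 1,
 *						  obj_request);
 *	rbd_obj_request_submit(obj_request);
 *	// ... wait on obj_request->completion, which is signalled by
 *	// complete_all() in rbd_obj_request_complete() ...
 *	rbd_obj_request_put(obj_request);	// drops the initial kref
 */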

static void rbd_obj_request_destroy(struct kref *kref)
{
	struct rbd_obj_request *obj_request;

	obj_request = container_of(kref, struct rbd_obj_request, kref);

	dout("%s: obj %p\n", __func__, obj_request);

	rbd_assert(obj_request->img_request == NULL);
	rbd_assert(obj_request->which == BAD_WHICH);

	if (obj_request->osd_req)
		rbd_osd_req_destroy(obj_request->osd_req);

	rbd_assert(obj_request_type_valid(obj_request->type));
	switch (obj_request->type) {
	case OBJ_REQUEST_NODATA:
		break;		/* Nothing to do */
	case OBJ_REQUEST_BIO:
		if (obj_request->bio_list)
			bio_chain_put(obj_request->bio_list);
		break;
	case OBJ_REQUEST_PAGES:
		/* img_data requests don't own their page array */
		if (obj_request->pages &&
		    !obj_request_img_data_test(obj_request))
			ceph_release_page_vector(obj_request->pages,
						obj_request->page_count);
		break;
	}

	kfree(obj_request->object_name);
	obj_request->object_name = NULL;
	kmem_cache_free(rbd_obj_request_cache, obj_request);
}

/* It's OK to call this for a device with no parent */

static void rbd_spec_put(struct rbd_spec *spec);
static void rbd_dev_unparent(struct rbd_device *rbd_dev)
{
	rbd_dev_remove_parent(rbd_dev);
	rbd_spec_put(rbd_dev->parent_spec);
	rbd_dev->parent_spec = NULL;
	rbd_dev->parent_overlap = 0;
}

/*
 * Parent image reference counting is used to determine when an
 * image's parent fields can be safely torn down--after there are no
 * more in-flight requests to the parent image.  When the last
 * reference is dropped, cleaning them up is safe.
 */
static void rbd_dev_parent_put(struct rbd_device *rbd_dev)
{
	int counter;

	if (!rbd_dev->parent_spec)
		return;

	counter = atomic_dec_return_safe(&rbd_dev->parent_ref);
	if (counter > 0)
		return;

	/* Last reference; clean up parent data structures */

	if (!counter)
		rbd_dev_unparent(rbd_dev);
	else
		rbd_warn(rbd_dev, "parent reference underflow");
}

/*
 * If an image has a non-zero parent overlap, get a reference to its
 * parent.
 *
 * Returns true if the rbd device has a parent with a non-zero
 * overlap and a reference for it was successfully taken, or
 * false otherwise.
 */
static bool rbd_dev_parent_get(struct rbd_device *rbd_dev)
{
	int counter = 0;

	if (!rbd_dev->parent_spec)
		return false;

	down_read(&rbd_dev->header_rwsem);
	if (rbd_dev->parent_overlap)
		counter = atomic_inc_return_safe(&rbd_dev->parent_ref);
	up_read(&rbd_dev->header_rwsem);

	if (counter < 0)
		rbd_warn(rbd_dev, "parent reference overflow");

	return counter > 0;
}
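
/*
 * Pairing sketch: rbd_img_request_create() below takes the reference
 * via rbd_dev_parent_get() when it sets IMG_REQ_LAYERED, and
 * rbd_img_request_destroy() drops it via rbd_dev_parent_put().  So for
 * a layered image the count tracks in-flight image requests, e.g.
 * (counts illustrative):
 *
 *	parent_ref: 1 -> 2 -> 3   while layered requests are issued
 *	parent_ref: 3 -> 2 -> 1   as they complete and are destroyed
 *
 * and the parent metadata is only torn down once the count hits 0.
 */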

/*
 * Caller is responsible for filling in the list of object requests
 * that comprises the image request, and the Linux request pointer
 * (if there is one).
 */
static struct rbd_img_request *rbd_img_request_create(
					struct rbd_device *rbd_dev,
					u64 offset, u64 length,
					enum obj_operation_type op_type,
					struct ceph_snap_context *snapc)
{
	struct rbd_img_request *img_request;

	img_request = kmem_cache_alloc(rbd_img_request_cache, GFP_NOIO);
	if (!img_request)
		return NULL;

	img_request->rq = NULL;
	img_request->rbd_dev = rbd_dev;
	img_request->offset = offset;
	img_request->length = length;
	img_request->flags = 0;
	if (op_type == OBJ_OP_DISCARD) {
		img_request_discard_set(img_request);
		img_request->snapc = snapc;
	} else if (op_type == OBJ_OP_WRITE) {
		img_request_write_set(img_request);
		img_request->snapc = snapc;
	} else {
		img_request->snap_id = rbd_dev->spec->snap_id;
	}
	if (rbd_dev_parent_get(rbd_dev))
		img_request_layered_set(img_request);
	spin_lock_init(&img_request->completion_lock);
	img_request->next_completion = 0;
	img_request->callback = NULL;
	img_request->result = 0;
	img_request->obj_request_count = 0;
	INIT_LIST_HEAD(&img_request->obj_requests);
	kref_init(&img_request->kref);

	dout("%s: rbd_dev %p %s %llu/%llu -> img %p\n", __func__, rbd_dev,
		obj_op_name(op_type), offset, length, img_request);

	return img_request;
}
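
/*
 * Usage sketch (assumed values): the block-layer entry point builds an
 * image request roughly like this for a 1 MiB write at image offset
 * 4 MiB:
 *
 *	img_request = rbd_img_request_create(rbd_dev, 4 << 20, 1 << 20,
 *					     OBJ_OP_WRITE, snapc);
 *	result = rbd_img_request_fill(img_request, OBJ_REQUEST_BIO,
 *				      rq_bio);	// hypothetical bio chain
 *	if (!result)
 *		result = rbd_img_request_submit(img_request);
 *
 * Reads pass OBJ_OP_READ and a NULL snapc; the snapshot context is
 * only needed for writes and discards.
 */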

static void rbd_img_request_destroy(struct kref *kref)
{
	struct rbd_img_request *img_request;
	struct rbd_obj_request *obj_request;
	struct rbd_obj_request *next_obj_request;

	img_request = container_of(kref, struct rbd_img_request, kref);

	dout("%s: img %p\n", __func__, img_request);

	for_each_obj_request_safe(img_request, obj_request, next_obj_request)
		rbd_img_obj_request_del(img_request, obj_request);
	rbd_assert(img_request->obj_request_count == 0);

	if (img_request_layered_test(img_request)) {
		img_request_layered_clear(img_request);
		rbd_dev_parent_put(img_request->rbd_dev);
	}

	if (img_request_write_test(img_request) ||
	    img_request_discard_test(img_request))
		ceph_put_snap_context(img_request->snapc);

	kmem_cache_free(rbd_img_request_cache, img_request);
}

static struct rbd_img_request *rbd_parent_request_create(
					struct rbd_obj_request *obj_request,
					u64 img_offset, u64 length)
{
	struct rbd_img_request *parent_request;
	struct rbd_device *rbd_dev;

	rbd_assert(obj_request->img_request);
	rbd_dev = obj_request->img_request->rbd_dev;

	parent_request = rbd_img_request_create(rbd_dev->parent, img_offset,
						length, OBJ_OP_READ, NULL);
	if (!parent_request)
		return NULL;

	img_request_child_set(parent_request);
	rbd_obj_request_get(obj_request);
	parent_request->obj_request = obj_request;

	return parent_request;
}

static void rbd_parent_request_destroy(struct kref *kref)
{
	struct rbd_img_request *parent_request;
	struct rbd_obj_request *orig_request;

	parent_request = container_of(kref, struct rbd_img_request, kref);
	orig_request = parent_request->obj_request;

	parent_request->obj_request = NULL;
	rbd_obj_request_put(orig_request);
	img_request_child_clear(parent_request);

	rbd_img_request_destroy(kref);
}

static bool rbd_img_obj_end_request(struct rbd_obj_request *obj_request)
{
	struct rbd_img_request *img_request;
	unsigned int xferred;
	int result;
	bool more;

	rbd_assert(obj_request_img_data_test(obj_request));
	img_request = obj_request->img_request;

	rbd_assert(obj_request->xferred <= (u64)UINT_MAX);
	xferred = (unsigned int)obj_request->xferred;
	result = obj_request->result;
	if (result) {
		struct rbd_device *rbd_dev = img_request->rbd_dev;
		enum obj_operation_type op_type;

		if (img_request_discard_test(img_request))
			op_type = OBJ_OP_DISCARD;
		else if (img_request_write_test(img_request))
			op_type = OBJ_OP_WRITE;
		else
			op_type = OBJ_OP_READ;

		rbd_warn(rbd_dev, "%s %llx at %llx (%llx)",
			obj_op_name(op_type), obj_request->length,
			obj_request->img_offset, obj_request->offset);
		rbd_warn(rbd_dev, "  result %d xferred %x",
			result, xferred);
		if (!img_request->result)
			img_request->result = result;
		/*
		 * Need to end I/O on the entire obj_request worth of
		 * bytes in case of error.
		 */
		xferred = obj_request->length;
	}

	if (img_request_child_test(img_request)) {
		rbd_assert(img_request->obj_request != NULL);
		more = obj_request->which < img_request->obj_request_count - 1;
	} else {
		rbd_assert(img_request->rq != NULL);

		more = blk_update_request(img_request->rq, result, xferred);
		if (!more)
			__blk_mq_end_request(img_request->rq, result);
	}

	return more;
}

static void rbd_img_obj_callback(struct rbd_obj_request *obj_request)
{
	struct rbd_img_request *img_request;
	u32 which = obj_request->which;
	bool more = true;

	rbd_assert(obj_request_img_data_test(obj_request));
	img_request = obj_request->img_request;

	dout("%s: img %p obj %p\n", __func__, img_request, obj_request);
	rbd_assert(img_request != NULL);
	rbd_assert(img_request->obj_request_count > 0);
	rbd_assert(which != BAD_WHICH);
	rbd_assert(which < img_request->obj_request_count);

	spin_lock_irq(&img_request->completion_lock);
	if (which != img_request->next_completion)
		goto out;

	for_each_obj_request_from(img_request, obj_request) {
		rbd_assert(more);
		rbd_assert(which < img_request->obj_request_count);

		if (!obj_request_done_test(obj_request))
			break;
		more = rbd_img_obj_end_request(obj_request);
		which++;
	}

	rbd_assert(more ^ (which == img_request->obj_request_count));
	img_request->next_completion = which;
out:
	spin_unlock_irq(&img_request->completion_lock);
	rbd_img_request_put(img_request);

	if (!more)
		rbd_img_request_complete(img_request);
}
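
/*
 * In-order completion sketch (hypothetical): suppose an image request
 * has three object requests (which = 0, 1, 2) and the OSD replies land
 * as 1, then 0, then 2:
 *
 *	callback for 1: which(1) != next_completion(0), nothing ended
 *	callback for 0: ends 0, sees 1 already done, ends 1; next = 2
 *	callback for 2: ends 2; more becomes false, image completes
 *
 * so blk_update_request() always sees the objects in offset order
 * regardless of OSD reply order.
 */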

/*
 * Add individual osd ops to the given ceph_osd_request and prepare
 * them for submission.  num_ops is the current number of osd
 * operations already added to the object request.
 */
static void rbd_img_obj_request_fill(struct rbd_obj_request *obj_request,
				struct ceph_osd_request *osd_request,
				enum obj_operation_type op_type,
				unsigned int num_ops)
{
	struct rbd_img_request *img_request = obj_request->img_request;
	struct rbd_device *rbd_dev = img_request->rbd_dev;
	u64 object_size = rbd_obj_bytes(&rbd_dev->header);
	u64 offset = obj_request->offset;
	u64 length = obj_request->length;
	u64 img_end;
	u16 opcode;

	if (op_type == OBJ_OP_DISCARD) {
		if (!offset && length == object_size &&
		    (!img_request_layered_test(img_request) ||
		     !obj_request_overlaps_parent(obj_request))) {
			opcode = CEPH_OSD_OP_DELETE;
		} else if ((offset + length == object_size)) {
			opcode = CEPH_OSD_OP_TRUNCATE;
		} else {
			down_read(&rbd_dev->header_rwsem);
			img_end = rbd_dev->header.image_size;
			up_read(&rbd_dev->header_rwsem);

			if (obj_request->img_offset + length == img_end)
				opcode = CEPH_OSD_OP_TRUNCATE;
			else
				opcode = CEPH_OSD_OP_ZERO;
		}
	} else if (op_type == OBJ_OP_WRITE) {
		if (!offset && length == object_size)
			opcode = CEPH_OSD_OP_WRITEFULL;
		else
			opcode = CEPH_OSD_OP_WRITE;
		osd_req_op_alloc_hint_init(osd_request, num_ops,
					object_size, object_size);
		num_ops++;
	} else {
		opcode = CEPH_OSD_OP_READ;
	}

	if (opcode == CEPH_OSD_OP_DELETE)
		osd_req_op_init(osd_request, num_ops, opcode, 0);
	else
		osd_req_op_extent_init(osd_request, num_ops, opcode,
				       offset, length, 0, 0);

	if (obj_request->type == OBJ_REQUEST_BIO)
		osd_req_op_extent_osd_data_bio(osd_request, num_ops,
					obj_request->bio_list, length);
	else if (obj_request->type == OBJ_REQUEST_PAGES)
		osd_req_op_extent_osd_data_pages(osd_request, num_ops,
					obj_request->pages, length,
					offset & ~PAGE_MASK, false, false);

	/* Discards are also writes */
	if (op_type == OBJ_OP_WRITE || op_type == OBJ_OP_DISCARD)
		rbd_osd_req_format_write(obj_request);
	else
		rbd_osd_req_format_read(obj_request);
}
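
/*
 * Opcode selection above, summarized:
 *
 *	discard of a whole object w/o parent data  -> DELETE
 *	discard reaching the object or image end   -> TRUNCATE
 *	any other discard                          -> ZERO
 *	write covering a whole object              -> SETALLOCHINT + WRITEFULL
 *	any other write                            -> SETALLOCHINT + WRITE
 *	read                                       -> READ
 */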
2451
2452/*
Alex Elderf1a47392013-04-19 15:34:50 -05002453 * Split up an image request into one or more object requests, each
2454 * to a different object. The "type" parameter indicates whether
2455 * "data_desc" is the pointer to the head of a list of bio
2456 * structures, or the base of a page array. In either case this
2457 * function assumes data_desc describes memory sufficient to hold
2458 * all data described by the image request.
2459 */
2460static int rbd_img_request_fill(struct rbd_img_request *img_request,
2461 enum obj_request_type type,
2462 void *data_desc)
Alex Elderbf0d5f502012-11-22 00:00:08 -06002463{
2464 struct rbd_device *rbd_dev = img_request->rbd_dev;
2465 struct rbd_obj_request *obj_request = NULL;
2466 struct rbd_obj_request *next_obj_request;
Jingoo Hana1580732013-08-09 13:04:35 +09002467 struct bio *bio_list = NULL;
Alex Elderf1a47392013-04-19 15:34:50 -05002468 unsigned int bio_offset = 0;
Jingoo Hana1580732013-08-09 13:04:35 +09002469 struct page **pages = NULL;
Guangliang Zhao6d2940c2014-03-13 11:21:35 +08002470 enum obj_operation_type op_type;
Alex Elder7da22d22013-01-24 16:13:36 -06002471 u64 img_offset;
Alex Elderbf0d5f502012-11-22 00:00:08 -06002472 u64 resid;
Alex Elderbf0d5f502012-11-22 00:00:08 -06002473
Alex Elderf1a47392013-04-19 15:34:50 -05002474 dout("%s: img %p type %d data_desc %p\n", __func__, img_request,
2475 (int)type, data_desc);
Alex Elder37206ee2013-02-20 17:32:08 -06002476
Alex Elder7da22d22013-01-24 16:13:36 -06002477 img_offset = img_request->offset;
Alex Elderbf0d5f502012-11-22 00:00:08 -06002478 resid = img_request->length;
Alex Elder4dda41d2013-02-20 21:59:33 -06002479 rbd_assert(resid > 0);
Josh Durgin3b434a2a2014-04-04 17:32:15 -07002480 op_type = rbd_img_request_op_type(img_request);
Alex Elderf1a47392013-04-19 15:34:50 -05002481
2482 if (type == OBJ_REQUEST_BIO) {
2483 bio_list = data_desc;
Kent Overstreet4f024f32013-10-11 15:44:27 -07002484 rbd_assert(img_offset ==
2485 bio_list->bi_iter.bi_sector << SECTOR_SHIFT);
Guangliang Zhao90e98c52014-04-01 22:22:16 +08002486 } else if (type == OBJ_REQUEST_PAGES) {
Alex Elderf1a47392013-04-19 15:34:50 -05002487 pages = data_desc;
2488 }
2489
Alex Elderbf0d5f502012-11-22 00:00:08 -06002490 while (resid) {
Alex Elder2fa12322013-04-05 01:27:12 -05002491 struct ceph_osd_request *osd_req;
Alex Elderbf0d5f502012-11-22 00:00:08 -06002492 const char *object_name;
Ilya Dryomov67e2b652017-01-25 18:16:22 +01002493 u64 offset = rbd_segment_offset(rbd_dev, img_offset);
2494 u64 length = rbd_segment_length(rbd_dev, img_offset, resid);
Alex Elderbf0d5f502012-11-22 00:00:08 -06002495
Alex Elder7da22d22013-01-24 16:13:36 -06002496 object_name = rbd_segment_name(rbd_dev, img_offset);
Alex Elderbf0d5f502012-11-22 00:00:08 -06002497 if (!object_name)
2498 goto out_unwind;
Ilya Dryomov67e2b652017-01-25 18:16:22 +01002499 obj_request = rbd_obj_request_create(object_name, type);
Alex Elder78c2a442013-05-01 12:43:04 -05002500 /* object request has its own copy of the object name */
2501 rbd_segment_name_free(object_name);
Alex Elderbf0d5f502012-11-22 00:00:08 -06002502 if (!obj_request)
2503 goto out_unwind;
Ilya Dryomov62054da2014-03-04 11:57:17 +02002504
Ilya Dryomov67e2b652017-01-25 18:16:22 +01002505 obj_request->offset = offset;
2506 obj_request->length = length;
2507
Josh Durgin03507db2013-08-27 14:45:46 -07002508 /*
2509 * set obj_request->img_request before creating the
2510 * osd_request so that it gets the right snapc
2511 */
2512 rbd_img_obj_request_add(img_request, obj_request);
Alex Elderbf0d5f502012-11-22 00:00:08 -06002513
Alex Elderf1a47392013-04-19 15:34:50 -05002514 if (type == OBJ_REQUEST_BIO) {
2515 unsigned int clone_size;
2516
2517 rbd_assert(length <= (u64)UINT_MAX);
2518 clone_size = (unsigned int)length;
2519 obj_request->bio_list =
2520 bio_chain_clone_range(&bio_list,
2521 &bio_offset,
2522 clone_size,
David Disseldorp2224d872016-04-05 11:13:39 +02002523 GFP_NOIO);
Alex Elderf1a47392013-04-19 15:34:50 -05002524 if (!obj_request->bio_list)
Ilya Dryomov62054da2014-03-04 11:57:17 +02002525 goto out_unwind;
Guangliang Zhao90e98c52014-04-01 22:22:16 +08002526 } else if (type == OBJ_REQUEST_PAGES) {
Alex Elderf1a47392013-04-19 15:34:50 -05002527 unsigned int page_count;
2528
2529 obj_request->pages = pages;
2530 page_count = (u32)calc_pages_for(offset, length);
2531 obj_request->page_count = page_count;
2532 if ((offset + length) & ~PAGE_MASK)
2533 page_count--; /* more on last page */
2534 pages += page_count;
2535 }
Alex Elderbf0d5f502012-11-22 00:00:08 -06002536
Guangliang Zhao6d2940c2014-03-13 11:21:35 +08002537 osd_req = rbd_osd_req_create(rbd_dev, op_type,
2538 (op_type == OBJ_OP_WRITE) ? 2 : 1,
2539 obj_request);
Alex Elder2fa12322013-04-05 01:27:12 -05002540 if (!osd_req)
Ilya Dryomov62054da2014-03-04 11:57:17 +02002541 goto out_unwind;
Josh Durgin3b434a2a2014-04-04 17:32:15 -07002542
Alex Elder2fa12322013-04-05 01:27:12 -05002543 obj_request->osd_req = osd_req;
Alex Elder21692382013-04-05 01:27:12 -05002544 obj_request->callback = rbd_img_obj_callback;
Alex Elder7da22d22013-01-24 16:13:36 -06002545 obj_request->img_offset = img_offset;
Alex Elderbf0d5f502012-11-22 00:00:08 -06002546
Josh Durgin3b434a2a2014-04-04 17:32:15 -07002547 rbd_img_obj_request_fill(obj_request, osd_req, op_type, 0);
2548
Alex Elder7da22d22013-01-24 16:13:36 -06002549 img_offset += length;
Alex Elderbf0d5f502012-11-22 00:00:08 -06002550 resid -= length;
2551 }
2552
2553 return 0;
2554
Alex Elderbf0d5f502012-11-22 00:00:08 -06002555out_unwind:
2556 for_each_obj_request_safe(img_request, obj_request, next_obj_request)
Ilya Dryomov42dd0372014-03-04 11:57:17 +02002557 rbd_img_obj_request_del(img_request, obj_request);
Alex Elderbf0d5f502012-11-22 00:00:08 -06002558
2559 return -ENOMEM;
2560}
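/*
 * A minimal sketch of the segmenting done by the loop above (added
 * for exposition, not part of the original file).  rbd objects are
 * 1 << obj_order bytes (4 MiB by default), and rbd_segment_offset()/
 * rbd_segment_length() clamp each object request so it never crosses
 * an object boundary.  The example_* helpers are hypothetical:
 */
static inline u64 example_segment_offset(u64 img_offset, u8 obj_order)
{
	/* byte offset within the containing object */
	return img_offset & ((1ULL << obj_order) - 1);
}

static inline u64 example_segment_length(u64 img_offset, u64 resid,
					 u8 obj_order)
{
	u64 room = (1ULL << obj_order) -
		   example_segment_offset(img_offset, obj_order);

	/* stop at the object boundary; the loop advances and repeats */
	return min(resid, room);
}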
2561
Alex Elder3d7efd12013-04-19 15:34:50 -05002562static void
Ilya Dryomov27617132015-07-16 17:36:11 +03002563rbd_osd_copyup_callback(struct rbd_obj_request *obj_request)
Alex Elder0eefd472013-04-19 15:34:50 -05002564{
2565 struct rbd_img_request *img_request;
2566 struct rbd_device *rbd_dev;
Alex Elderebda6402013-05-10 16:29:22 -05002567 struct page **pages;
Alex Elder0eefd472013-04-19 15:34:50 -05002568 u32 page_count;
2569
Ilya Dryomov27617132015-07-16 17:36:11 +03002570 dout("%s: obj %p\n", __func__, obj_request);
2571
Josh Durgind3246fb2014-04-07 16:49:21 -07002572 rbd_assert(obj_request->type == OBJ_REQUEST_BIO ||
2573 obj_request->type == OBJ_REQUEST_NODATA);
Alex Elder0eefd472013-04-19 15:34:50 -05002574 rbd_assert(obj_request_img_data_test(obj_request));
2575 img_request = obj_request->img_request;
2576 rbd_assert(img_request);
2577
2578 rbd_dev = img_request->rbd_dev;
2579 rbd_assert(rbd_dev);
Alex Elder0eefd472013-04-19 15:34:50 -05002580
Alex Elderebda6402013-05-10 16:29:22 -05002581 pages = obj_request->copyup_pages;
2582 rbd_assert(pages != NULL);
Alex Elder0eefd472013-04-19 15:34:50 -05002583 obj_request->copyup_pages = NULL;
Alex Elderebda6402013-05-10 16:29:22 -05002584 page_count = obj_request->copyup_page_count;
2585 rbd_assert(page_count);
2586 obj_request->copyup_page_count = 0;
2587 ceph_release_page_vector(pages, page_count);
Alex Elder0eefd472013-04-19 15:34:50 -05002588
2589 /*
2590 * We want the transfer count to reflect the size of the
2591 * original write request. There is no such thing as a
2592 * successful short write, so if the request was successful
2593 * we can just set it to the originally-requested length.
2594 */
2595 if (!obj_request->result)
2596 obj_request->xferred = obj_request->length;
2597
Ilya Dryomov27617132015-07-16 17:36:11 +03002598 obj_request_done_set(obj_request);
Alex Elder0eefd472013-04-19 15:34:50 -05002599}
2600
2601static void
Alex Elder3d7efd12013-04-19 15:34:50 -05002602rbd_img_obj_parent_read_full_callback(struct rbd_img_request *img_request)
2603{
2604 struct rbd_obj_request *orig_request;
Alex Elder0eefd472013-04-19 15:34:50 -05002605 struct ceph_osd_request *osd_req;
Alex Elder0eefd472013-04-19 15:34:50 -05002606 struct rbd_device *rbd_dev;
Alex Elder3d7efd12013-04-19 15:34:50 -05002607 struct page **pages;
Josh Durgind3246fb2014-04-07 16:49:21 -07002608 enum obj_operation_type op_type;
Alex Elderebda6402013-05-10 16:29:22 -05002609 u32 page_count;
Alex Elderbbea1c12013-05-06 17:40:33 -05002610 int img_result;
Alex Elderebda6402013-05-10 16:29:22 -05002611 u64 parent_length;
Alex Elder3d7efd12013-04-19 15:34:50 -05002612
2613 rbd_assert(img_request_child_test(img_request));
2614
2615 /* First get what we need from the image request */
2616
2617 pages = img_request->copyup_pages;
2618 rbd_assert(pages != NULL);
2619 img_request->copyup_pages = NULL;
Alex Elderebda6402013-05-10 16:29:22 -05002620 page_count = img_request->copyup_page_count;
2621 rbd_assert(page_count);
2622 img_request->copyup_page_count = 0;
Alex Elder3d7efd12013-04-19 15:34:50 -05002623
2624 orig_request = img_request->obj_request;
2625 rbd_assert(orig_request != NULL);
Alex Elderb91f09f2013-05-10 16:29:22 -05002626 rbd_assert(obj_request_type_valid(orig_request->type));
Alex Elderbbea1c12013-05-06 17:40:33 -05002627 img_result = img_request->result;
Alex Elderebda6402013-05-10 16:29:22 -05002628 parent_length = img_request->length;
Ilya Dryomovfa355112016-09-16 15:20:42 +02002629 rbd_assert(img_result || parent_length == img_request->xferred);
Alex Elder3d7efd12013-04-19 15:34:50 -05002630 rbd_img_request_put(img_request);
2631
Alex Elder91c6feb2013-05-06 17:40:32 -05002632 rbd_assert(orig_request->img_request);
2633 rbd_dev = orig_request->img_request->rbd_dev;
Alex Elder3d7efd12013-04-19 15:34:50 -05002634 rbd_assert(rbd_dev);
Alex Elder3d7efd12013-04-19 15:34:50 -05002635
Alex Elderbbea1c12013-05-06 17:40:33 -05002636 /*
2637 * If the overlap has become 0 (most likely because the
2638 * image has been flattened) we need to free the pages
2639 * and re-submit the original write request.
2640 */
2641 if (!rbd_dev->parent_overlap) {
Alex Elderbbea1c12013-05-06 17:40:33 -05002642 ceph_release_page_vector(pages, page_count);
Ilya Dryomov980917f2016-09-12 18:59:42 +02002643 rbd_obj_request_submit(orig_request);
2644 return;
Alex Elderbbea1c12013-05-06 17:40:33 -05002645 }
2646
2647 if (img_result)
Alex Elder0eefd472013-04-19 15:34:50 -05002648 goto out_err;
Alex Elder3d7efd12013-04-19 15:34:50 -05002649
Alex Elder8785b1d2013-05-09 10:08:49 -05002650 /*
2651	 * The original osd request is of no use to us any more.
Ilya Dryomov0ccd5922014-02-25 16:22:28 +02002652 * We need a new one that can hold the three ops in a copyup
Alex Elder8785b1d2013-05-09 10:08:49 -05002653 * request. Allocate the new copyup osd request for the
2654 * original request, and release the old one.
2655 */
Alex Elderbbea1c12013-05-06 17:40:33 -05002656 img_result = -ENOMEM;
Alex Elder0eefd472013-04-19 15:34:50 -05002657 osd_req = rbd_osd_req_create_copyup(orig_request);
2658 if (!osd_req)
2659 goto out_err;
Alex Elder8785b1d2013-05-09 10:08:49 -05002660 rbd_osd_req_destroy(orig_request->osd_req);
Alex Elder0eefd472013-04-19 15:34:50 -05002661 orig_request->osd_req = osd_req;
2662 orig_request->copyup_pages = pages;
Alex Elderebda6402013-05-10 16:29:22 -05002663 orig_request->copyup_page_count = page_count;
Alex Elder3d7efd12013-04-19 15:34:50 -05002664
Alex Elder0eefd472013-04-19 15:34:50 -05002665 /* Initialize the copyup op */
2666
2667 osd_req_op_cls_init(osd_req, 0, CEPH_OSD_OP_CALL, "rbd", "copyup");
Alex Elderebda6402013-05-10 16:29:22 -05002668 osd_req_op_cls_request_data_pages(osd_req, 0, pages, parent_length, 0,
Alex Elder0eefd472013-04-19 15:34:50 -05002669 false, false);
2670
Josh Durgind3246fb2014-04-07 16:49:21 -07002671 /* Add the other op(s) */
Ilya Dryomov0ccd5922014-02-25 16:22:28 +02002672
Josh Durgind3246fb2014-04-07 16:49:21 -07002673 op_type = rbd_img_request_op_type(orig_request->img_request);
2674 rbd_img_obj_request_fill(orig_request, osd_req, op_type, 1);
Alex Elder0eefd472013-04-19 15:34:50 -05002675
2676 /* All set, send it off. */
2677
Ilya Dryomov980917f2016-09-12 18:59:42 +02002678 rbd_obj_request_submit(orig_request);
2679 return;
Alex Elder0eefd472013-04-19 15:34:50 -05002680
Alex Elder0eefd472013-04-19 15:34:50 -05002681out_err:
Ilya Dryomovfa355112016-09-16 15:20:42 +02002682 ceph_release_page_vector(pages, page_count);
Ilya Dryomov0dcc6852016-09-26 15:43:52 +02002683 rbd_obj_request_error(orig_request, img_result);
Alex Elder3d7efd12013-04-19 15:34:50 -05002684}
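/*
 * Illustrative layout (added for exposition, not part of the original
 * file) of the copyup request assembled above, assuming the original
 * request was a plain write:
 *
 *   op[0]: CALL rbd.copyup   <- parent data attached as request payload
 *   op[1]: SETALLOCHINT      <- added by rbd_img_obj_request_fill()
 *   op[2]: WRITE             <- the original write
 *
 * Passing 1 as the last argument to rbd_img_obj_request_fill() is what
 * shifts the data op(s) past the copyup call at index 0.
 */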
2685
2686/*
2687 * Read from the parent image the range of data that covers the
2688 * entire target of the given object request. This is used for
2689 * satisfying a layered image write request when the target of an
2690 * object request from the image request does not exist.
2691 *
2692 * A page array big enough to hold the returned data is allocated
2693 * and supplied to rbd_img_request_fill() as the "data descriptor."
2694 * When the read completes, this page array will be transferred to
2695 * the original object request for the copyup operation.
2696 *
Ilya Dryomovc2e82412016-09-13 20:18:01 +02002697 * If an error occurs, it is recorded as the result of the original
2698 * object request in rbd_img_obj_exists_callback().
Alex Elder3d7efd12013-04-19 15:34:50 -05002699 */
2700static int rbd_img_obj_parent_read_full(struct rbd_obj_request *obj_request)
2701{
Ilya Dryomov058aa992016-09-12 14:44:45 +02002702 struct rbd_device *rbd_dev = obj_request->img_request->rbd_dev;
Alex Elder3d7efd12013-04-19 15:34:50 -05002703 struct rbd_img_request *parent_request = NULL;
Alex Elder3d7efd12013-04-19 15:34:50 -05002704 u64 img_offset;
2705 u64 length;
2706 struct page **pages = NULL;
2707 u32 page_count;
2708 int result;
2709
Alex Elder3d7efd12013-04-19 15:34:50 -05002710 rbd_assert(rbd_dev->parent != NULL);
2711
2712 /*
2713 * Determine the byte range covered by the object in the
2714 * child image to which the original request was to be sent.
2715 */
2716 img_offset = obj_request->img_offset - obj_request->offset;
Ilya Dryomov5bc3fb12017-01-25 18:16:22 +01002717 length = rbd_obj_bytes(&rbd_dev->header);
Alex Elder3d7efd12013-04-19 15:34:50 -05002718
2719 /*
Alex Eldera9e8ba2c2013-04-21 00:32:07 -05002720 * There is no defined parent data beyond the parent
2721 * overlap, so limit what we read at that boundary if
2722 * necessary.
2723 */
2724 if (img_offset + length > rbd_dev->parent_overlap) {
2725 rbd_assert(img_offset < rbd_dev->parent_overlap);
2726 length = rbd_dev->parent_overlap - img_offset;
2727 }
2728
2729 /*
Alex Elder3d7efd12013-04-19 15:34:50 -05002730 * Allocate a page array big enough to receive the data read
2731 * from the parent.
2732 */
2733 page_count = (u32)calc_pages_for(0, length);
2734 pages = ceph_alloc_page_vector(page_count, GFP_KERNEL);
2735 if (IS_ERR(pages)) {
2736 result = PTR_ERR(pages);
2737 pages = NULL;
2738 goto out_err;
2739 }
2740
2741 result = -ENOMEM;
Alex Eldere93f3152013-05-08 22:50:04 -05002742 parent_request = rbd_parent_request_create(obj_request,
2743 img_offset, length);
Alex Elder3d7efd12013-04-19 15:34:50 -05002744 if (!parent_request)
2745 goto out_err;
Alex Elder3d7efd12013-04-19 15:34:50 -05002746
2747 result = rbd_img_request_fill(parent_request, OBJ_REQUEST_PAGES, pages);
2748 if (result)
2749 goto out_err;
Ilya Dryomov058aa992016-09-12 14:44:45 +02002750
Alex Elder3d7efd12013-04-19 15:34:50 -05002751 parent_request->copyup_pages = pages;
Alex Elderebda6402013-05-10 16:29:22 -05002752 parent_request->copyup_page_count = page_count;
Alex Elder3d7efd12013-04-19 15:34:50 -05002753 parent_request->callback = rbd_img_obj_parent_read_full_callback;
Ilya Dryomov058aa992016-09-12 14:44:45 +02002754
Alex Elder3d7efd12013-04-19 15:34:50 -05002755 result = rbd_img_request_submit(parent_request);
2756 if (!result)
2757 return 0;
2758
2759 parent_request->copyup_pages = NULL;
Alex Elderebda6402013-05-10 16:29:22 -05002760 parent_request->copyup_page_count = 0;
Alex Elder3d7efd12013-04-19 15:34:50 -05002761 parent_request->obj_request = NULL;
2762 rbd_obj_request_put(obj_request);
2763out_err:
2764 if (pages)
2765 ceph_release_page_vector(pages, page_count);
2766 if (parent_request)
2767 rbd_img_request_put(parent_request);
Alex Elder3d7efd12013-04-19 15:34:50 -05002768 return result;
2769}
2770
Alex Elderc5b5ef62013-02-11 12:33:24 -06002771static void rbd_img_obj_exists_callback(struct rbd_obj_request *obj_request)
2772{
Alex Elderc5b5ef62013-02-11 12:33:24 -06002773 struct rbd_obj_request *orig_request;
Alex Elder638f5ab2013-05-06 17:40:33 -05002774 struct rbd_device *rbd_dev;
Alex Elderc5b5ef62013-02-11 12:33:24 -06002775 int result;
2776
2777 rbd_assert(!obj_request_img_data_test(obj_request));
2778
2779 /*
2780 * All we need from the object request is the original
2781 * request and the result of the STAT op. Grab those, then
2782 * we're done with the request.
2783 */
2784 orig_request = obj_request->obj_request;
2785 obj_request->obj_request = NULL;
Alex Elder912c3172013-05-13 20:35:38 -05002786 rbd_obj_request_put(orig_request);
Alex Elderc5b5ef62013-02-11 12:33:24 -06002787 rbd_assert(orig_request);
2788 rbd_assert(orig_request->img_request);
2789
2790 result = obj_request->result;
2791 obj_request->result = 0;
2792
2793 dout("%s: obj %p for obj %p result %d %llu/%llu\n", __func__,
2794 obj_request, orig_request, result,
2795 obj_request->xferred, obj_request->length);
2796 rbd_obj_request_put(obj_request);
2797
Alex Elder638f5ab2013-05-06 17:40:33 -05002798 /*
2799 * If the overlap has become 0 (most likely because the
Ilya Dryomov980917f2016-09-12 18:59:42 +02002800 * image has been flattened) we need to re-submit the
2801 * original request.
Alex Elder638f5ab2013-05-06 17:40:33 -05002802 */
2803 rbd_dev = orig_request->img_request->rbd_dev;
2804 if (!rbd_dev->parent_overlap) {
Ilya Dryomov980917f2016-09-12 18:59:42 +02002805 rbd_obj_request_submit(orig_request);
2806 return;
Alex Elder638f5ab2013-05-06 17:40:33 -05002807 }
Alex Elderc5b5ef62013-02-11 12:33:24 -06002808
2809 /*
2810 * Our only purpose here is to determine whether the object
2811 * exists, and we don't want to treat the non-existence as
2812 * an error. If something else comes back, transfer the
2813 * error to the original request and complete it now.
2814 */
2815 if (!result) {
2816 obj_request_existence_set(orig_request, true);
2817 } else if (result == -ENOENT) {
2818 obj_request_existence_set(orig_request, false);
Ilya Dryomovc2e82412016-09-13 20:18:01 +02002819 } else {
2820 goto fail_orig_request;
Alex Elderc5b5ef62013-02-11 12:33:24 -06002821 }
2822
2823 /*
2824 * Resubmit the original request now that we have recorded
2825 * whether the target object exists.
2826 */
Ilya Dryomovc2e82412016-09-13 20:18:01 +02002827 result = rbd_img_obj_request_submit(orig_request);
2828 if (result)
2829 goto fail_orig_request;
2830
2831 return;
2832
2833fail_orig_request:
Ilya Dryomov0dcc6852016-09-26 15:43:52 +02002834 rbd_obj_request_error(orig_request, result);
Alex Elderc5b5ef62013-02-11 12:33:24 -06002835}
2836
2837static int rbd_img_obj_exists_submit(struct rbd_obj_request *obj_request)
2838{
Ilya Dryomov058aa992016-09-12 14:44:45 +02002839 struct rbd_device *rbd_dev = obj_request->img_request->rbd_dev;
Alex Elderc5b5ef62013-02-11 12:33:24 -06002840 struct rbd_obj_request *stat_request;
Ilya Dryomov710214e2016-09-15 17:53:32 +02002841 struct page **pages;
Alex Elderc5b5ef62013-02-11 12:33:24 -06002842 u32 page_count;
2843 size_t size;
2844 int ret;
2845
Ilya Dryomov67e2b652017-01-25 18:16:22 +01002846 stat_request = rbd_obj_request_create(obj_request->object_name,
Ilya Dryomov710214e2016-09-15 17:53:32 +02002847 OBJ_REQUEST_PAGES);
2848 if (!stat_request)
2849 return -ENOMEM;
2850
2851 stat_request->osd_req = rbd_osd_req_create(rbd_dev, OBJ_OP_READ, 1,
2852 stat_request);
2853 if (!stat_request->osd_req) {
2854 ret = -ENOMEM;
2855 goto fail_stat_request;
2856 }
2857
Alex Elderc5b5ef62013-02-11 12:33:24 -06002858 /*
2859 * The response data for a STAT call consists of:
2860 * le64 length;
2861 * struct {
2862 * le32 tv_sec;
2863 * le32 tv_nsec;
2864 * } mtime;
2865 */
2866 size = sizeof (__le64) + sizeof (__le32) + sizeof (__le32);
2867 page_count = (u32)calc_pages_for(0, size);
2868 pages = ceph_alloc_page_vector(page_count, GFP_KERNEL);
Ilya Dryomov710214e2016-09-15 17:53:32 +02002869 if (IS_ERR(pages)) {
2870 ret = PTR_ERR(pages);
2871 goto fail_stat_request;
2872 }
Alex Elderc5b5ef62013-02-11 12:33:24 -06002873
Ilya Dryomov710214e2016-09-15 17:53:32 +02002874 osd_req_op_init(stat_request->osd_req, 0, CEPH_OSD_OP_STAT, 0);
2875 osd_req_op_raw_data_in_pages(stat_request->osd_req, 0, pages, size, 0,
2876 false, false);
Alex Elderc5b5ef62013-02-11 12:33:24 -06002877
2878 rbd_obj_request_get(obj_request);
2879 stat_request->obj_request = obj_request;
2880 stat_request->pages = pages;
2881 stat_request->page_count = page_count;
Alex Elderc5b5ef62013-02-11 12:33:24 -06002882 stat_request->callback = rbd_img_obj_exists_callback;
2883
Ilya Dryomov980917f2016-09-12 18:59:42 +02002884 rbd_obj_request_submit(stat_request);
2885 return 0;
Alex Elderc5b5ef62013-02-11 12:33:24 -06002886
Ilya Dryomov710214e2016-09-15 17:53:32 +02002887fail_stat_request:
2888 rbd_obj_request_put(stat_request);
Alex Elderc5b5ef62013-02-11 12:33:24 -06002889 return ret;
2890}
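/*
 * A hedged sketch (not part of the original file) of how the STAT
 * reply laid out in the comment above could be decoded; 8 + 4 + 4
 * gives the 16-byte size computed there, so a single reply page
 * always suffices.  example_decode_stat_reply() is hypothetical:
 */
static void example_decode_stat_reply(struct page *reply_page,
				      u64 *size, struct timespec *mtime)
{
	void *p = page_address(reply_page);

	*size = le64_to_cpu(*(__le64 *)p);
	mtime->tv_sec = le32_to_cpu(*(__le32 *)(p + 8));
	mtime->tv_nsec = le32_to_cpu(*(__le32 *)(p + 12));
}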
2891
Ilya Dryomov70d045f2014-09-12 16:02:01 +04002892static bool img_obj_request_simple(struct rbd_obj_request *obj_request)
Alex Elderb454e362013-04-19 15:34:50 -05002893{
Ilya Dryomov058aa992016-09-12 14:44:45 +02002894 struct rbd_img_request *img_request = obj_request->img_request;
2895 struct rbd_device *rbd_dev = img_request->rbd_dev;
Alex Elderb454e362013-04-19 15:34:50 -05002896
Ilya Dryomov70d045f2014-09-12 16:02:01 +04002897 /* Reads */
Josh Durgin1c220882014-04-04 17:49:12 -07002898 if (!img_request_write_test(img_request) &&
2899 !img_request_discard_test(img_request))
Ilya Dryomov70d045f2014-09-12 16:02:01 +04002900 return true;
Alex Elderb454e362013-04-19 15:34:50 -05002901
Ilya Dryomov70d045f2014-09-12 16:02:01 +04002902 /* Non-layered writes */
2903 if (!img_request_layered_test(img_request))
2904 return true;
2905
2906 /*
2907 * Layered writes outside of the parent overlap range don't
2908 * share any data with the parent.
2909 */
2910 if (!obj_request_overlaps_parent(obj_request))
2911 return true;
2912
2913 /*
Guangliang Zhaoc622d222014-04-01 22:22:15 +08002914 * Entire-object layered writes - we will overwrite whatever
2915 * parent data there is anyway.
2916 */
2917 if (!obj_request->offset &&
2918 obj_request->length == rbd_obj_bytes(&rbd_dev->header))
2919 return true;
2920
2921 /*
Ilya Dryomov70d045f2014-09-12 16:02:01 +04002922 * If the object is known to already exist, its parent data has
2923 * already been copied.
2924 */
2925 if (obj_request_known_test(obj_request) &&
2926 obj_request_exists_test(obj_request))
2927 return true;
2928
2929 return false;
2930}
2931
2932static int rbd_img_obj_request_submit(struct rbd_obj_request *obj_request)
2933{
Ilya Dryomov058aa992016-09-12 14:44:45 +02002934 rbd_assert(obj_request_img_data_test(obj_request));
2935 rbd_assert(obj_request_type_valid(obj_request->type));
2936 rbd_assert(obj_request->img_request);
2937
Ilya Dryomov70d045f2014-09-12 16:02:01 +04002938 if (img_obj_request_simple(obj_request)) {
Ilya Dryomov980917f2016-09-12 18:59:42 +02002939 rbd_obj_request_submit(obj_request);
2940 return 0;
Alex Elderb454e362013-04-19 15:34:50 -05002941 }
2942
2943 /*
Alex Elder3d7efd12013-04-19 15:34:50 -05002944 * It's a layered write. The target object might exist but
2945 * we may not know that yet. If we know it doesn't exist,
2946 * start by reading the data for the full target object from
2947 * the parent so we can use it for a copyup to the target.
Alex Elderb454e362013-04-19 15:34:50 -05002948 */
Ilya Dryomov70d045f2014-09-12 16:02:01 +04002949 if (obj_request_known_test(obj_request))
Alex Elder3d7efd12013-04-19 15:34:50 -05002950 return rbd_img_obj_parent_read_full(obj_request);
2951
2952 /* We don't know whether the target exists. Go find out. */
Alex Elderb454e362013-04-19 15:34:50 -05002953
2954 return rbd_img_obj_exists_submit(obj_request);
2955}
2956
Alex Elderbf0d5f502012-11-22 00:00:08 -06002957static int rbd_img_request_submit(struct rbd_img_request *img_request)
2958{
Alex Elderbf0d5f502012-11-22 00:00:08 -06002959 struct rbd_obj_request *obj_request;
Alex Elder46faeed2013-04-10 17:47:46 -05002960 struct rbd_obj_request *next_obj_request;
Ilya Dryomov663ae2c2016-05-16 13:18:57 +02002961 int ret = 0;
Alex Elderbf0d5f502012-11-22 00:00:08 -06002962
Alex Elder37206ee2013-02-20 17:32:08 -06002963 dout("%s: img %p\n", __func__, img_request);
Alex Elderbf0d5f502012-11-22 00:00:08 -06002964
Ilya Dryomov663ae2c2016-05-16 13:18:57 +02002965 rbd_img_request_get(img_request);
2966 for_each_obj_request_safe(img_request, obj_request, next_obj_request) {
Alex Elderb454e362013-04-19 15:34:50 -05002967 ret = rbd_img_obj_request_submit(obj_request);
Alex Elderbf0d5f502012-11-22 00:00:08 -06002968 if (ret)
Ilya Dryomov663ae2c2016-05-16 13:18:57 +02002969 goto out_put_ireq;
Alex Elderbf0d5f502012-11-22 00:00:08 -06002970 }
2971
Ilya Dryomov663ae2c2016-05-16 13:18:57 +02002972out_put_ireq:
2973 rbd_img_request_put(img_request);
2974 return ret;
Alex Elderbf0d5f502012-11-22 00:00:08 -06002975}
2976
Alex Elder8b3e1a52013-01-24 16:13:36 -06002977static void rbd_img_parent_read_callback(struct rbd_img_request *img_request)
2978{
2979 struct rbd_obj_request *obj_request;
Alex Eldera9e8ba2c2013-04-21 00:32:07 -05002980 struct rbd_device *rbd_dev;
2981 u64 obj_end;
Alex Elder02c74fb2013-05-06 17:40:33 -05002982 u64 img_xferred;
2983 int img_result;
Alex Elder8b3e1a52013-01-24 16:13:36 -06002984
2985 rbd_assert(img_request_child_test(img_request));
2986
Alex Elder02c74fb2013-05-06 17:40:33 -05002987 /* First get what we need from the image request and release it */
2988
Alex Elder8b3e1a52013-01-24 16:13:36 -06002989 obj_request = img_request->obj_request;
Alex Elder02c74fb2013-05-06 17:40:33 -05002990 img_xferred = img_request->xferred;
2991 img_result = img_request->result;
2992 rbd_img_request_put(img_request);
2993
2994 /*
2995 * If the overlap has become 0 (most likely because the
2996 * image has been flattened) we need to re-submit the
2997 * original request.
2998 */
Alex Eldera9e8ba2c2013-04-21 00:32:07 -05002999 rbd_assert(obj_request);
3000 rbd_assert(obj_request->img_request);
Alex Elder02c74fb2013-05-06 17:40:33 -05003001 rbd_dev = obj_request->img_request->rbd_dev;
3002 if (!rbd_dev->parent_overlap) {
Ilya Dryomov980917f2016-09-12 18:59:42 +02003003 rbd_obj_request_submit(obj_request);
3004 return;
Alex Elder02c74fb2013-05-06 17:40:33 -05003005 }
3006
3007 obj_request->result = img_result;
Alex Eldera9e8ba2c2013-04-21 00:32:07 -05003008 if (obj_request->result)
3009 goto out;
3010
3011 /*
3012 * We need to zero anything beyond the parent overlap
3013 * boundary. Since rbd_img_obj_request_read_callback()
3014 * will zero anything beyond the end of a short read, an
3015 * easy way to do this is to pretend the data from the
3016 * parent came up short--ending at the overlap boundary.
3017 */
3018 rbd_assert(obj_request->img_offset < U64_MAX - obj_request->length);
3019 obj_end = obj_request->img_offset + obj_request->length;
Alex Eldera9e8ba2c2013-04-21 00:32:07 -05003020 if (obj_end > rbd_dev->parent_overlap) {
3021 u64 xferred = 0;
3022
3023 if (obj_request->img_offset < rbd_dev->parent_overlap)
3024 xferred = rbd_dev->parent_overlap -
3025 obj_request->img_offset;
3026
Alex Elder02c74fb2013-05-06 17:40:33 -05003027 obj_request->xferred = min(img_xferred, xferred);
Alex Eldera9e8ba2c2013-04-21 00:32:07 -05003028 } else {
Alex Elder02c74fb2013-05-06 17:40:33 -05003029 obj_request->xferred = img_xferred;
Alex Eldera9e8ba2c2013-04-21 00:32:07 -05003030 }
3031out:
Alex Elder8b3e1a52013-01-24 16:13:36 -06003032 rbd_img_obj_request_read_callback(obj_request);
3033 rbd_obj_request_complete(obj_request);
3034}
3035
3036static void rbd_img_parent_read(struct rbd_obj_request *obj_request)
3037{
Alex Elder8b3e1a52013-01-24 16:13:36 -06003038 struct rbd_img_request *img_request;
3039 int result;
3040
3041 rbd_assert(obj_request_img_data_test(obj_request));
3042 rbd_assert(obj_request->img_request != NULL);
3043 rbd_assert(obj_request->result == (s32) -ENOENT);
Alex Elder5b2ab722013-05-06 17:40:33 -05003044 rbd_assert(obj_request_type_valid(obj_request->type));
Alex Elder8b3e1a52013-01-24 16:13:36 -06003045
Alex Elder8b3e1a52013-01-24 16:13:36 -06003046 /* rbd_read_finish(obj_request, obj_request->length); */
Alex Eldere93f3152013-05-08 22:50:04 -05003047 img_request = rbd_parent_request_create(obj_request,
Alex Elder8b3e1a52013-01-24 16:13:36 -06003048 obj_request->img_offset,
Alex Eldere93f3152013-05-08 22:50:04 -05003049 obj_request->length);
Alex Elder8b3e1a52013-01-24 16:13:36 -06003050 result = -ENOMEM;
3051 if (!img_request)
3052 goto out_err;
3053
Alex Elder5b2ab722013-05-06 17:40:33 -05003054 if (obj_request->type == OBJ_REQUEST_BIO)
3055 result = rbd_img_request_fill(img_request, OBJ_REQUEST_BIO,
3056 obj_request->bio_list);
3057 else
3058 result = rbd_img_request_fill(img_request, OBJ_REQUEST_PAGES,
3059 obj_request->pages);
Alex Elder8b3e1a52013-01-24 16:13:36 -06003060 if (result)
3061 goto out_err;
3062
3063 img_request->callback = rbd_img_parent_read_callback;
3064 result = rbd_img_request_submit(img_request);
3065 if (result)
3066 goto out_err;
3067
3068 return;
3069out_err:
3070 if (img_request)
3071 rbd_img_request_put(img_request);
3072 obj_request->result = result;
3073 obj_request->xferred = 0;
3074 obj_request_done_set(obj_request);
3075}
3076
Ilya Dryomoved95b212016-08-12 16:40:02 +02003077static const struct rbd_client_id rbd_empty_cid;
3078
3079static bool rbd_cid_equal(const struct rbd_client_id *lhs,
3080 const struct rbd_client_id *rhs)
3081{
3082 return lhs->gid == rhs->gid && lhs->handle == rhs->handle;
3083}
3084
3085static struct rbd_client_id rbd_get_cid(struct rbd_device *rbd_dev)
3086{
3087 struct rbd_client_id cid;
3088
3089 mutex_lock(&rbd_dev->watch_mutex);
3090 cid.gid = ceph_client_gid(rbd_dev->rbd_client->client);
3091 cid.handle = rbd_dev->watch_cookie;
3092 mutex_unlock(&rbd_dev->watch_mutex);
3093 return cid;
3094}
3095
3096/*
3097 * lock_rwsem must be held for write
3098 */
3099static void rbd_set_owner_cid(struct rbd_device *rbd_dev,
3100 const struct rbd_client_id *cid)
3101{
3102 dout("%s rbd_dev %p %llu-%llu -> %llu-%llu\n", __func__, rbd_dev,
3103 rbd_dev->owner_cid.gid, rbd_dev->owner_cid.handle,
3104 cid->gid, cid->handle);
3105 rbd_dev->owner_cid = *cid; /* struct */
3106}
3107
3108static void format_lock_cookie(struct rbd_device *rbd_dev, char *buf)
3109{
3110 mutex_lock(&rbd_dev->watch_mutex);
3111 sprintf(buf, "%s %llu", RBD_LOCK_COOKIE_PREFIX, rbd_dev->watch_cookie);
3112 mutex_unlock(&rbd_dev->watch_mutex);
3113}
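/*
 * Illustrative note (added for exposition, not part of the original
 * file): with RBD_LOCK_COOKIE_PREFIX defined as "auto" (as in
 * mainline), a device whose watch cookie is 18446744073709551615
 * formats the cookie "auto 18446744073709551615" - at most
 * 5 + 20 + 1 bytes, which is why the 32-byte cookie buffers in
 * rbd_lock()/rbd_unlock() below suffice.
 */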
3114
3115/*
3116 * lock_rwsem must be held for write
3117 */
3118static int rbd_lock(struct rbd_device *rbd_dev)
3119{
3120 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
3121 struct rbd_client_id cid = rbd_get_cid(rbd_dev);
3122 char cookie[32];
3123 int ret;
3124
3125 WARN_ON(__rbd_is_lock_owner(rbd_dev));
3126
3127 format_lock_cookie(rbd_dev, cookie);
3128 ret = ceph_cls_lock(osdc, &rbd_dev->header_oid, &rbd_dev->header_oloc,
3129 RBD_LOCK_NAME, CEPH_CLS_LOCK_EXCLUSIVE, cookie,
3130 RBD_LOCK_TAG, "", 0);
3131 if (ret)
3132 return ret;
3133
3134 rbd_dev->lock_state = RBD_LOCK_STATE_LOCKED;
3135 rbd_set_owner_cid(rbd_dev, &cid);
3136 queue_work(rbd_dev->task_wq, &rbd_dev->acquired_lock_work);
3137 return 0;
3138}
3139
3140/*
3141 * lock_rwsem must be held for write
3142 */
3143static int rbd_unlock(struct rbd_device *rbd_dev)
3144{
3145 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
3146 char cookie[32];
3147 int ret;
3148
3149 WARN_ON(!__rbd_is_lock_owner(rbd_dev));
3150
3151 rbd_dev->lock_state = RBD_LOCK_STATE_UNLOCKED;
3152
3153 format_lock_cookie(rbd_dev, cookie);
3154 ret = ceph_cls_unlock(osdc, &rbd_dev->header_oid, &rbd_dev->header_oloc,
3155 RBD_LOCK_NAME, cookie);
3156 if (ret && ret != -ENOENT) {
3157 rbd_warn(rbd_dev, "cls_unlock failed: %d", ret);
3158 return ret;
3159 }
3160
3161 rbd_set_owner_cid(rbd_dev, &rbd_empty_cid);
3162 queue_work(rbd_dev->task_wq, &rbd_dev->released_lock_work);
3163 return 0;
3164}
3165
3166static int __rbd_notify_op_lock(struct rbd_device *rbd_dev,
3167 enum rbd_notify_op notify_op,
3168 struct page ***preply_pages,
3169 size_t *preply_len)
3170{
3171 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
3172 struct rbd_client_id cid = rbd_get_cid(rbd_dev);
3173 int buf_size = 4 + 8 + 8 + CEPH_ENCODING_START_BLK_LEN;
3174 char buf[buf_size];
3175 void *p = buf;
3176
3177 dout("%s rbd_dev %p notify_op %d\n", __func__, rbd_dev, notify_op);
3178
3179 /* encode *LockPayload NotifyMessage (op + ClientId) */
3180 ceph_start_encoding(&p, 2, 1, buf_size - CEPH_ENCODING_START_BLK_LEN);
3181 ceph_encode_32(&p, notify_op);
3182 ceph_encode_64(&p, cid.gid);
3183 ceph_encode_64(&p, cid.handle);
3184
3185 return ceph_osdc_notify(osdc, &rbd_dev->header_oid,
3186 &rbd_dev->header_oloc, buf, buf_size,
3187 RBD_NOTIFY_TIMEOUT, preply_pages, preply_len);
3188}
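/*
 * Illustrative wire layout (added for exposition, not part of the
 * original file) of the NotifyMessage encoded above, assuming
 * CEPH_ENCODING_START_BLK_LEN is the usual 6-byte preamble:
 *
 *   u8   struct_v      = 2
 *   u8   struct_compat = 1
 *   le32 struct_len    = 20   (4 + 8 + 8)
 *   le32 notify_op
 *   le64 cid.gid
 *   le64 cid.handle
 */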
3189
3190static void rbd_notify_op_lock(struct rbd_device *rbd_dev,
3191 enum rbd_notify_op notify_op)
3192{
3193 struct page **reply_pages;
3194 size_t reply_len;
3195
3196 __rbd_notify_op_lock(rbd_dev, notify_op, &reply_pages, &reply_len);
3197 ceph_release_page_vector(reply_pages, calc_pages_for(0, reply_len));
3198}
3199
3200static void rbd_notify_acquired_lock(struct work_struct *work)
3201{
3202 struct rbd_device *rbd_dev = container_of(work, struct rbd_device,
3203 acquired_lock_work);
3204
3205 rbd_notify_op_lock(rbd_dev, RBD_NOTIFY_OP_ACQUIRED_LOCK);
3206}
3207
3208static void rbd_notify_released_lock(struct work_struct *work)
3209{
3210 struct rbd_device *rbd_dev = container_of(work, struct rbd_device,
3211 released_lock_work);
3212
3213 rbd_notify_op_lock(rbd_dev, RBD_NOTIFY_OP_RELEASED_LOCK);
3214}
3215
3216static int rbd_request_lock(struct rbd_device *rbd_dev)
3217{
3218 struct page **reply_pages;
3219 size_t reply_len;
3220 bool lock_owner_responded = false;
3221 int ret;
3222
3223 dout("%s rbd_dev %p\n", __func__, rbd_dev);
3224
3225 ret = __rbd_notify_op_lock(rbd_dev, RBD_NOTIFY_OP_REQUEST_LOCK,
3226 &reply_pages, &reply_len);
3227 if (ret && ret != -ETIMEDOUT) {
3228 rbd_warn(rbd_dev, "failed to request lock: %d", ret);
3229 goto out;
3230 }
3231
3232 if (reply_len > 0 && reply_len <= PAGE_SIZE) {
3233 void *p = page_address(reply_pages[0]);
3234 void *const end = p + reply_len;
3235 u32 n;
3236
3237 ceph_decode_32_safe(&p, end, n, e_inval); /* num_acks */
3238 while (n--) {
3239 u8 struct_v;
3240 u32 len;
3241
3242 ceph_decode_need(&p, end, 8 + 8, e_inval);
3243 p += 8 + 8; /* skip gid and cookie */
3244
3245 ceph_decode_32_safe(&p, end, len, e_inval);
3246 if (!len)
3247 continue;
3248
3249 if (lock_owner_responded) {
3250 rbd_warn(rbd_dev,
3251 "duplicate lock owners detected");
3252 ret = -EIO;
3253 goto out;
3254 }
3255
3256 lock_owner_responded = true;
3257 ret = ceph_start_decoding(&p, end, 1, "ResponseMessage",
3258 &struct_v, &len);
3259 if (ret) {
3260 rbd_warn(rbd_dev,
3261 "failed to decode ResponseMessage: %d",
3262 ret);
3263 goto e_inval;
3264 }
3265
3266 ret = ceph_decode_32(&p);
3267 }
3268 }
3269
3270 if (!lock_owner_responded) {
3271 rbd_warn(rbd_dev, "no lock owners detected");
3272 ret = -ETIMEDOUT;
3273 }
3274
3275out:
3276 ceph_release_page_vector(reply_pages, calc_pages_for(0, reply_len));
3277 return ret;
3278
3279e_inval:
3280 ret = -EINVAL;
3281 goto out;
3282}
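/*
 * Illustrative note (added for exposition, not part of the original
 * file): as decoded above, the notify reply is an le32 ack count
 * followed, per acking watcher, by
 *
 *   le64 gid, le64 cookie, le32 payload_len, payload
 *
 * Only the lock owner replies with a non-empty payload (a
 * ResponseMessage), which is why exactly one responder is expected
 * in the loop.
 */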
3283
3284static void wake_requests(struct rbd_device *rbd_dev, bool wake_all)
3285{
3286 dout("%s rbd_dev %p wake_all %d\n", __func__, rbd_dev, wake_all);
3287
3288 cancel_delayed_work(&rbd_dev->lock_dwork);
3289 if (wake_all)
3290 wake_up_all(&rbd_dev->lock_waitq);
3291 else
3292 wake_up(&rbd_dev->lock_waitq);
3293}
3294
3295static int get_lock_owner_info(struct rbd_device *rbd_dev,
3296 struct ceph_locker **lockers, u32 *num_lockers)
3297{
3298 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
3299 u8 lock_type;
3300 char *lock_tag;
3301 int ret;
3302
3303 dout("%s rbd_dev %p\n", __func__, rbd_dev);
3304
3305 ret = ceph_cls_lock_info(osdc, &rbd_dev->header_oid,
3306 &rbd_dev->header_oloc, RBD_LOCK_NAME,
3307 &lock_type, &lock_tag, lockers, num_lockers);
3308 if (ret)
3309 return ret;
3310
3311 if (*num_lockers == 0) {
3312 dout("%s rbd_dev %p no lockers detected\n", __func__, rbd_dev);
3313 goto out;
3314 }
3315
3316 if (strcmp(lock_tag, RBD_LOCK_TAG)) {
3317 rbd_warn(rbd_dev, "locked by external mechanism, tag %s",
3318 lock_tag);
3319 ret = -EBUSY;
3320 goto out;
3321 }
3322
3323 if (lock_type == CEPH_CLS_LOCK_SHARED) {
3324 rbd_warn(rbd_dev, "shared lock type detected");
3325 ret = -EBUSY;
3326 goto out;
3327 }
3328
3329 if (strncmp((*lockers)[0].id.cookie, RBD_LOCK_COOKIE_PREFIX,
3330 strlen(RBD_LOCK_COOKIE_PREFIX))) {
3331 rbd_warn(rbd_dev, "locked by external mechanism, cookie %s",
3332 (*lockers)[0].id.cookie);
3333 ret = -EBUSY;
3334 goto out;
3335 }
3336
3337out:
3338 kfree(lock_tag);
3339 return ret;
3340}
3341
3342static int find_watcher(struct rbd_device *rbd_dev,
3343 const struct ceph_locker *locker)
3344{
3345 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
3346 struct ceph_watch_item *watchers;
3347 u32 num_watchers;
3348 u64 cookie;
3349 int i;
3350 int ret;
3351
3352 ret = ceph_osdc_list_watchers(osdc, &rbd_dev->header_oid,
3353 &rbd_dev->header_oloc, &watchers,
3354 &num_watchers);
3355 if (ret)
3356 return ret;
3357
3358 sscanf(locker->id.cookie, RBD_LOCK_COOKIE_PREFIX " %llu", &cookie);
3359 for (i = 0; i < num_watchers; i++) {
3360 if (!memcmp(&watchers[i].addr, &locker->info.addr,
3361 sizeof(locker->info.addr)) &&
3362 watchers[i].cookie == cookie) {
3363 struct rbd_client_id cid = {
3364 .gid = le64_to_cpu(watchers[i].name.num),
3365 .handle = cookie,
3366 };
3367
3368 dout("%s rbd_dev %p found cid %llu-%llu\n", __func__,
3369 rbd_dev, cid.gid, cid.handle);
3370 rbd_set_owner_cid(rbd_dev, &cid);
3371 ret = 1;
3372 goto out;
3373 }
3374 }
3375
3376 dout("%s rbd_dev %p no watchers\n", __func__, rbd_dev);
3377 ret = 0;
3378out:
3379 kfree(watchers);
3380 return ret;
3381}
3382
3383/*
3384 * lock_rwsem must be held for write
3385 */
3386static int rbd_try_lock(struct rbd_device *rbd_dev)
3387{
3388 struct ceph_client *client = rbd_dev->rbd_client->client;
3389 struct ceph_locker *lockers;
3390 u32 num_lockers;
3391 int ret;
3392
3393 for (;;) {
3394 ret = rbd_lock(rbd_dev);
3395 if (ret != -EBUSY)
3396 return ret;
3397
3398 /* determine if the current lock holder is still alive */
3399 ret = get_lock_owner_info(rbd_dev, &lockers, &num_lockers);
3400 if (ret)
3401 return ret;
3402
3403 if (num_lockers == 0)
3404 goto again;
3405
3406 ret = find_watcher(rbd_dev, lockers);
3407 if (ret) {
3408 if (ret > 0)
3409 ret = 0; /* have to request lock */
3410 goto out;
3411 }
3412
3413 rbd_warn(rbd_dev, "%s%llu seems dead, breaking lock",
3414 ENTITY_NAME(lockers[0].id.name));
3415
3416 ret = ceph_monc_blacklist_add(&client->monc,
3417 &lockers[0].info.addr);
3418 if (ret) {
3419 rbd_warn(rbd_dev, "blacklist of %s%llu failed: %d",
3420 ENTITY_NAME(lockers[0].id.name), ret);
3421 goto out;
3422 }
3423
3424 ret = ceph_cls_break_lock(&client->osdc, &rbd_dev->header_oid,
3425 &rbd_dev->header_oloc, RBD_LOCK_NAME,
3426 lockers[0].id.cookie,
3427 &lockers[0].id.name);
3428 if (ret && ret != -ENOENT)
3429 goto out;
3430
3431again:
3432 ceph_free_lockers(lockers, num_lockers);
3433 }
3434
3435out:
3436 ceph_free_lockers(lockers, num_lockers);
3437 return ret;
3438}
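/*
 * Summary of the loop above (added for exposition, not part of the
 * original file): try to take the lock; on -EBUSY, look up the
 * current holder.  If the holder still has a watch established it is
 * presumed alive and we fall back to requesting the lock; otherwise
 * it is presumed dead, so we blacklist it, break its lock and retry.
 */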
3439
3440/*
3441 * ret is set only if lock_state is RBD_LOCK_STATE_UNLOCKED
3442 */
3443static enum rbd_lock_state rbd_try_acquire_lock(struct rbd_device *rbd_dev,
3444 int *pret)
3445{
3446 enum rbd_lock_state lock_state;
3447
3448 down_read(&rbd_dev->lock_rwsem);
3449 dout("%s rbd_dev %p read lock_state %d\n", __func__, rbd_dev,
3450 rbd_dev->lock_state);
3451 if (__rbd_is_lock_owner(rbd_dev)) {
3452 lock_state = rbd_dev->lock_state;
3453 up_read(&rbd_dev->lock_rwsem);
3454 return lock_state;
3455 }
3456
3457 up_read(&rbd_dev->lock_rwsem);
3458 down_write(&rbd_dev->lock_rwsem);
3459 dout("%s rbd_dev %p write lock_state %d\n", __func__, rbd_dev,
3460 rbd_dev->lock_state);
3461 if (!__rbd_is_lock_owner(rbd_dev)) {
3462 *pret = rbd_try_lock(rbd_dev);
3463 if (*pret)
3464 rbd_warn(rbd_dev, "failed to acquire lock: %d", *pret);
3465 }
3466
3467 lock_state = rbd_dev->lock_state;
3468 up_write(&rbd_dev->lock_rwsem);
3469 return lock_state;
3470}
3471
3472static void rbd_acquire_lock(struct work_struct *work)
3473{
3474 struct rbd_device *rbd_dev = container_of(to_delayed_work(work),
3475 struct rbd_device, lock_dwork);
3476 enum rbd_lock_state lock_state;
3477 int ret;
3478
3479 dout("%s rbd_dev %p\n", __func__, rbd_dev);
3480again:
3481 lock_state = rbd_try_acquire_lock(rbd_dev, &ret);
3482 if (lock_state != RBD_LOCK_STATE_UNLOCKED || ret == -EBLACKLISTED) {
3483 if (lock_state == RBD_LOCK_STATE_LOCKED)
3484 wake_requests(rbd_dev, true);
3485 dout("%s rbd_dev %p lock_state %d ret %d - done\n", __func__,
3486 rbd_dev, lock_state, ret);
3487 return;
3488 }
3489
3490 ret = rbd_request_lock(rbd_dev);
3491 if (ret == -ETIMEDOUT) {
3492 goto again; /* treat this as a dead client */
3493 } else if (ret < 0) {
3494 rbd_warn(rbd_dev, "error requesting lock: %d", ret);
3495 mod_delayed_work(rbd_dev->task_wq, &rbd_dev->lock_dwork,
3496 RBD_RETRY_DELAY);
3497 } else {
3498 /*
3499 * lock owner acked, but resend if we don't see them
3500 * release the lock
3501 */
3502 dout("%s rbd_dev %p requeueing lock_dwork\n", __func__,
3503 rbd_dev);
3504 mod_delayed_work(rbd_dev->task_wq, &rbd_dev->lock_dwork,
3505 msecs_to_jiffies(2 * RBD_NOTIFY_TIMEOUT * MSEC_PER_SEC));
3506 }
3507}
3508
3509/*
3510 * lock_rwsem must be held for write
3511 */
3512static bool rbd_release_lock(struct rbd_device *rbd_dev)
3513{
3514 dout("%s rbd_dev %p read lock_state %d\n", __func__, rbd_dev,
3515 rbd_dev->lock_state);
3516 if (rbd_dev->lock_state != RBD_LOCK_STATE_LOCKED)
3517 return false;
3518
3519 rbd_dev->lock_state = RBD_LOCK_STATE_RELEASING;
3520 downgrade_write(&rbd_dev->lock_rwsem);
3521 /*
3522 * Ensure that all in-flight IO is flushed.
3523 *
3524 * FIXME: ceph_osdc_sync() flushes the entire OSD client, which
3525 * may be shared with other devices.
3526 */
3527 ceph_osdc_sync(&rbd_dev->rbd_client->client->osdc);
3528 up_read(&rbd_dev->lock_rwsem);
3529
3530 down_write(&rbd_dev->lock_rwsem);
3531 dout("%s rbd_dev %p write lock_state %d\n", __func__, rbd_dev,
3532 rbd_dev->lock_state);
3533 if (rbd_dev->lock_state != RBD_LOCK_STATE_RELEASING)
3534 return false;
3535
3536 if (!rbd_unlock(rbd_dev))
3537 /*
3538 * Give others a chance to grab the lock - we would re-acquire
3539 * almost immediately if we got new IO during ceph_osdc_sync()
3540 * otherwise. We need to ack our own notifications, so this
3541 * lock_dwork will be requeued from rbd_wait_state_locked()
3542 * after wake_requests() in rbd_handle_released_lock().
3543 */
3544 cancel_delayed_work(&rbd_dev->lock_dwork);
3545
3546 return true;
3547}
3548
3549static void rbd_release_lock_work(struct work_struct *work)
3550{
3551 struct rbd_device *rbd_dev = container_of(work, struct rbd_device,
3552 unlock_work);
3553
3554 down_write(&rbd_dev->lock_rwsem);
3555 rbd_release_lock(rbd_dev);
3556 up_write(&rbd_dev->lock_rwsem);
3557}
3558
3559static void rbd_handle_acquired_lock(struct rbd_device *rbd_dev, u8 struct_v,
3560 void **p)
3561{
3562 struct rbd_client_id cid = { 0 };
3563
3564 if (struct_v >= 2) {
3565 cid.gid = ceph_decode_64(p);
3566 cid.handle = ceph_decode_64(p);
3567 }
3568
3569 dout("%s rbd_dev %p cid %llu-%llu\n", __func__, rbd_dev, cid.gid,
3570 cid.handle);
3571 if (!rbd_cid_equal(&cid, &rbd_empty_cid)) {
3572 down_write(&rbd_dev->lock_rwsem);
3573 if (rbd_cid_equal(&cid, &rbd_dev->owner_cid)) {
3574 /*
3575 * we already know that the remote client is
3576 * the owner
3577 */
3578 up_write(&rbd_dev->lock_rwsem);
3579 return;
3580 }
3581
3582 rbd_set_owner_cid(rbd_dev, &cid);
3583 downgrade_write(&rbd_dev->lock_rwsem);
3584 } else {
3585 down_read(&rbd_dev->lock_rwsem);
3586 }
3587
3588 if (!__rbd_is_lock_owner(rbd_dev))
3589 wake_requests(rbd_dev, false);
3590 up_read(&rbd_dev->lock_rwsem);
3591}
3592
3593static void rbd_handle_released_lock(struct rbd_device *rbd_dev, u8 struct_v,
3594 void **p)
3595{
3596 struct rbd_client_id cid = { 0 };
3597
3598 if (struct_v >= 2) {
3599 cid.gid = ceph_decode_64(p);
3600 cid.handle = ceph_decode_64(p);
3601 }
3602
3603 dout("%s rbd_dev %p cid %llu-%llu\n", __func__, rbd_dev, cid.gid,
3604 cid.handle);
3605 if (!rbd_cid_equal(&cid, &rbd_empty_cid)) {
3606 down_write(&rbd_dev->lock_rwsem);
3607 if (!rbd_cid_equal(&cid, &rbd_dev->owner_cid)) {
3608 dout("%s rbd_dev %p unexpected owner, cid %llu-%llu != owner_cid %llu-%llu\n",
3609 __func__, rbd_dev, cid.gid, cid.handle,
3610 rbd_dev->owner_cid.gid, rbd_dev->owner_cid.handle);
3611 up_write(&rbd_dev->lock_rwsem);
3612 return;
3613 }
3614
3615 rbd_set_owner_cid(rbd_dev, &rbd_empty_cid);
3616 downgrade_write(&rbd_dev->lock_rwsem);
3617 } else {
3618 down_read(&rbd_dev->lock_rwsem);
3619 }
3620
3621 if (!__rbd_is_lock_owner(rbd_dev))
3622 wake_requests(rbd_dev, false);
3623 up_read(&rbd_dev->lock_rwsem);
3624}
3625
3626static bool rbd_handle_request_lock(struct rbd_device *rbd_dev, u8 struct_v,
3627 void **p)
3628{
3629 struct rbd_client_id my_cid = rbd_get_cid(rbd_dev);
3630 struct rbd_client_id cid = { 0 };
3631 bool need_to_send;
3632
3633 if (struct_v >= 2) {
3634 cid.gid = ceph_decode_64(p);
3635 cid.handle = ceph_decode_64(p);
3636 }
3637
3638 dout("%s rbd_dev %p cid %llu-%llu\n", __func__, rbd_dev, cid.gid,
3639 cid.handle);
3640 if (rbd_cid_equal(&cid, &my_cid))
3641 return false;
3642
3643 down_read(&rbd_dev->lock_rwsem);
3644 need_to_send = __rbd_is_lock_owner(rbd_dev);
3645 if (rbd_dev->lock_state == RBD_LOCK_STATE_LOCKED) {
3646 if (!rbd_cid_equal(&rbd_dev->owner_cid, &rbd_empty_cid)) {
3647 dout("%s rbd_dev %p queueing unlock_work\n", __func__,
3648 rbd_dev);
3649 queue_work(rbd_dev->task_wq, &rbd_dev->unlock_work);
3650 }
3651 }
3652 up_read(&rbd_dev->lock_rwsem);
3653 return need_to_send;
3654}
3655
3656static void __rbd_acknowledge_notify(struct rbd_device *rbd_dev,
3657 u64 notify_id, u64 cookie, s32 *result)
3658{
3659 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
3660 int buf_size = 4 + CEPH_ENCODING_START_BLK_LEN;
3661 char buf[buf_size];
3662 int ret;
3663
3664 if (result) {
3665 void *p = buf;
3666
3667 /* encode ResponseMessage */
3668 ceph_start_encoding(&p, 1, 1,
3669 buf_size - CEPH_ENCODING_START_BLK_LEN);
3670 ceph_encode_32(&p, *result);
3671 } else {
3672 buf_size = 0;
3673 }
3674
3675 ret = ceph_osdc_notify_ack(osdc, &rbd_dev->header_oid,
3676 &rbd_dev->header_oloc, notify_id, cookie,
3677 buf, buf_size);
3678 if (ret)
3679 rbd_warn(rbd_dev, "acknowledge_notify failed: %d", ret);
3680}
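/*
 * Illustrative layout (added for exposition, not part of the original
 * file) of the ResponseMessage encoded above when a result is
 * supplied:
 *
 *   u8   struct_v      = 1
 *   u8   struct_compat = 1
 *   le32 struct_len    = 4
 *   le32 result
 *
 * With no result to report, buf_size is forced to 0 and a bare ack
 * is sent instead.
 */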
3681
3682static void rbd_acknowledge_notify(struct rbd_device *rbd_dev, u64 notify_id,
3683 u64 cookie)
3684{
3685 dout("%s rbd_dev %p\n", __func__, rbd_dev);
3686 __rbd_acknowledge_notify(rbd_dev, notify_id, cookie, NULL);
3687}
3688
3689static void rbd_acknowledge_notify_result(struct rbd_device *rbd_dev,
3690 u64 notify_id, u64 cookie, s32 result)
3691{
3692 dout("%s rbd_dev %p result %d\n", __func__, rbd_dev, result);
3693 __rbd_acknowledge_notify(rbd_dev, notify_id, cookie, &result);
3694}
Ilya Dryomov922dab62016-05-26 01:15:02 +02003695
3696static void rbd_watch_cb(void *arg, u64 notify_id, u64 cookie,
3697 u64 notifier_id, void *data, size_t data_len)
Alex Elderb8d70032012-11-30 17:53:04 -06003698{
Ilya Dryomov922dab62016-05-26 01:15:02 +02003699 struct rbd_device *rbd_dev = arg;
Ilya Dryomoved95b212016-08-12 16:40:02 +02003700 void *p = data;
3701 void *const end = p + data_len;
Ilya Dryomovd4c22692016-09-06 11:15:48 +02003702 u8 struct_v = 0;
Ilya Dryomoved95b212016-08-12 16:40:02 +02003703 u32 len;
3704 u32 notify_op;
Alex Elderb8d70032012-11-30 17:53:04 -06003705 int ret;
3706
Ilya Dryomoved95b212016-08-12 16:40:02 +02003707 dout("%s rbd_dev %p cookie %llu notify_id %llu data_len %zu\n",
3708 __func__, rbd_dev, cookie, notify_id, data_len);
3709 if (data_len) {
3710 ret = ceph_start_decoding(&p, end, 1, "NotifyMessage",
3711 &struct_v, &len);
3712 if (ret) {
3713 rbd_warn(rbd_dev, "failed to decode NotifyMessage: %d",
3714 ret);
3715 return;
3716 }
Ilya Dryomov52bb1f92014-07-23 17:11:20 +04003717
Ilya Dryomoved95b212016-08-12 16:40:02 +02003718 notify_op = ceph_decode_32(&p);
3719 } else {
3720 /* legacy notification for header updates */
3721 notify_op = RBD_NOTIFY_OP_HEADER_UPDATE;
3722 len = 0;
3723 }
Alex Elderb8d70032012-11-30 17:53:04 -06003724
Ilya Dryomoved95b212016-08-12 16:40:02 +02003725 dout("%s rbd_dev %p notify_op %u\n", __func__, rbd_dev, notify_op);
3726 switch (notify_op) {
3727 case RBD_NOTIFY_OP_ACQUIRED_LOCK:
3728 rbd_handle_acquired_lock(rbd_dev, struct_v, &p);
3729 rbd_acknowledge_notify(rbd_dev, notify_id, cookie);
3730 break;
3731 case RBD_NOTIFY_OP_RELEASED_LOCK:
3732 rbd_handle_released_lock(rbd_dev, struct_v, &p);
3733 rbd_acknowledge_notify(rbd_dev, notify_id, cookie);
3734 break;
3735 case RBD_NOTIFY_OP_REQUEST_LOCK:
3736 if (rbd_handle_request_lock(rbd_dev, struct_v, &p))
3737 /*
3738 * send ResponseMessage(0) back so the client
3739 * can detect a missing owner
3740 */
3741 rbd_acknowledge_notify_result(rbd_dev, notify_id,
3742 cookie, 0);
3743 else
3744 rbd_acknowledge_notify(rbd_dev, notify_id, cookie);
3745 break;
3746 case RBD_NOTIFY_OP_HEADER_UPDATE:
3747 ret = rbd_dev_refresh(rbd_dev);
3748 if (ret)
3749 rbd_warn(rbd_dev, "refresh failed: %d", ret);
3750
3751 rbd_acknowledge_notify(rbd_dev, notify_id, cookie);
3752 break;
3753 default:
3754 if (rbd_is_lock_owner(rbd_dev))
3755 rbd_acknowledge_notify_result(rbd_dev, notify_id,
3756 cookie, -EOPNOTSUPP);
3757 else
3758 rbd_acknowledge_notify(rbd_dev, notify_id, cookie);
3759 break;
3760 }
Alex Elderb8d70032012-11-30 17:53:04 -06003761}
3762
Ilya Dryomov99d16942016-08-12 16:11:41 +02003763static void __rbd_unregister_watch(struct rbd_device *rbd_dev);
3764
Ilya Dryomov922dab62016-05-26 01:15:02 +02003765static void rbd_watch_errcb(void *arg, u64 cookie, int err)
Ilya Dryomovbb040aa2014-06-19 11:38:14 +04003766{
Ilya Dryomov922dab62016-05-26 01:15:02 +02003767 struct rbd_device *rbd_dev = arg;
Ilya Dryomovbb040aa2014-06-19 11:38:14 +04003768
Ilya Dryomov922dab62016-05-26 01:15:02 +02003769 rbd_warn(rbd_dev, "encountered watch error: %d", err);
Ilya Dryomovbb040aa2014-06-19 11:38:14 +04003770
Ilya Dryomoved95b212016-08-12 16:40:02 +02003771 down_write(&rbd_dev->lock_rwsem);
3772 rbd_set_owner_cid(rbd_dev, &rbd_empty_cid);
3773 up_write(&rbd_dev->lock_rwsem);
Ilya Dryomovbb040aa2014-06-19 11:38:14 +04003774
Ilya Dryomov99d16942016-08-12 16:11:41 +02003775 mutex_lock(&rbd_dev->watch_mutex);
3776 if (rbd_dev->watch_state == RBD_WATCH_STATE_REGISTERED) {
3777 __rbd_unregister_watch(rbd_dev);
3778 rbd_dev->watch_state = RBD_WATCH_STATE_ERROR;
Ilya Dryomovbb040aa2014-06-19 11:38:14 +04003779
Ilya Dryomov99d16942016-08-12 16:11:41 +02003780 queue_delayed_work(rbd_dev->task_wq, &rbd_dev->watch_dwork, 0);
Ilya Dryomovbb040aa2014-06-19 11:38:14 +04003781 }
Ilya Dryomov99d16942016-08-12 16:11:41 +02003782 mutex_unlock(&rbd_dev->watch_mutex);
Ilya Dryomovbb040aa2014-06-19 11:38:14 +04003783}
3784
3785/*
Ilya Dryomov99d16942016-08-12 16:11:41 +02003786 * watch_mutex must be locked
Alex Elder9969ebc2013-01-18 12:31:10 -06003787 */
Ilya Dryomov99d16942016-08-12 16:11:41 +02003788static int __rbd_register_watch(struct rbd_device *rbd_dev)
Alex Elder9969ebc2013-01-18 12:31:10 -06003789{
3790 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
Ilya Dryomov922dab62016-05-26 01:15:02 +02003791 struct ceph_osd_linger_request *handle;
Alex Elder9969ebc2013-01-18 12:31:10 -06003792
Ilya Dryomov922dab62016-05-26 01:15:02 +02003793 rbd_assert(!rbd_dev->watch_handle);
Ilya Dryomov99d16942016-08-12 16:11:41 +02003794 dout("%s rbd_dev %p\n", __func__, rbd_dev);
Alex Elder9969ebc2013-01-18 12:31:10 -06003795
Ilya Dryomov922dab62016-05-26 01:15:02 +02003796 handle = ceph_osdc_watch(osdc, &rbd_dev->header_oid,
3797 &rbd_dev->header_oloc, rbd_watch_cb,
3798 rbd_watch_errcb, rbd_dev);
3799 if (IS_ERR(handle))
3800 return PTR_ERR(handle);
Alex Elder9969ebc2013-01-18 12:31:10 -06003801
Ilya Dryomov922dab62016-05-26 01:15:02 +02003802 rbd_dev->watch_handle = handle;
Ilya Dryomovb30a01f2014-05-22 19:28:52 +04003803 return 0;
Alex Elder9969ebc2013-01-18 12:31:10 -06003804}
3805
Ilya Dryomov99d16942016-08-12 16:11:41 +02003806/*
3807 * watch_mutex must be locked
3808 */
3809static void __rbd_unregister_watch(struct rbd_device *rbd_dev)
Ilya Dryomovfca27062013-12-16 18:02:40 +02003810{
Ilya Dryomov922dab62016-05-26 01:15:02 +02003811 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
3812 int ret;
Ilya Dryomovb30a01f2014-05-22 19:28:52 +04003813
Ilya Dryomov99d16942016-08-12 16:11:41 +02003814 rbd_assert(rbd_dev->watch_handle);
3815 dout("%s rbd_dev %p\n", __func__, rbd_dev);
Ilya Dryomovb30a01f2014-05-22 19:28:52 +04003816
Ilya Dryomov922dab62016-05-26 01:15:02 +02003817 ret = ceph_osdc_unwatch(osdc, rbd_dev->watch_handle);
3818 if (ret)
3819 rbd_warn(rbd_dev, "failed to unwatch: %d", ret);
Ilya Dryomovb30a01f2014-05-22 19:28:52 +04003820
Ilya Dryomov922dab62016-05-26 01:15:02 +02003821 rbd_dev->watch_handle = NULL;
Ilya Dryomovc525f032016-04-28 16:07:26 +02003822}
3823
Ilya Dryomov99d16942016-08-12 16:11:41 +02003824static int rbd_register_watch(struct rbd_device *rbd_dev)
Ilya Dryomovc525f032016-04-28 16:07:26 +02003825{
Ilya Dryomov99d16942016-08-12 16:11:41 +02003826 int ret;
Ilya Dryomov811c6682016-04-15 16:22:16 +02003827
Ilya Dryomov99d16942016-08-12 16:11:41 +02003828 mutex_lock(&rbd_dev->watch_mutex);
3829 rbd_assert(rbd_dev->watch_state == RBD_WATCH_STATE_UNREGISTERED);
3830 ret = __rbd_register_watch(rbd_dev);
3831 if (ret)
3832 goto out;
3833
3834 rbd_dev->watch_state = RBD_WATCH_STATE_REGISTERED;
3835 rbd_dev->watch_cookie = rbd_dev->watch_handle->linger_id;
3836
3837out:
3838 mutex_unlock(&rbd_dev->watch_mutex);
3839 return ret;
3840}
3841
3842static void cancel_tasks_sync(struct rbd_device *rbd_dev)
3843{
3844 dout("%s rbd_dev %p\n", __func__, rbd_dev);
3845
3846 cancel_delayed_work_sync(&rbd_dev->watch_dwork);
Ilya Dryomoved95b212016-08-12 16:40:02 +02003847 cancel_work_sync(&rbd_dev->acquired_lock_work);
3848 cancel_work_sync(&rbd_dev->released_lock_work);
3849 cancel_delayed_work_sync(&rbd_dev->lock_dwork);
3850 cancel_work_sync(&rbd_dev->unlock_work);
Ilya Dryomov99d16942016-08-12 16:11:41 +02003851}
3852
3853static void rbd_unregister_watch(struct rbd_device *rbd_dev)
3854{
Ilya Dryomoved95b212016-08-12 16:40:02 +02003855 WARN_ON(waitqueue_active(&rbd_dev->lock_waitq));
Ilya Dryomov99d16942016-08-12 16:11:41 +02003856 cancel_tasks_sync(rbd_dev);
3857
3858 mutex_lock(&rbd_dev->watch_mutex);
3859 if (rbd_dev->watch_state == RBD_WATCH_STATE_REGISTERED)
3860 __rbd_unregister_watch(rbd_dev);
3861 rbd_dev->watch_state = RBD_WATCH_STATE_UNREGISTERED;
3862 mutex_unlock(&rbd_dev->watch_mutex);
3863
Ilya Dryomov811c6682016-04-15 16:22:16 +02003864 ceph_osdc_flush_notifies(&rbd_dev->rbd_client->client->osdc);
Ilya Dryomovfca27062013-12-16 18:02:40 +02003865}
3866
Ilya Dryomov99d16942016-08-12 16:11:41 +02003867static void rbd_reregister_watch(struct work_struct *work)
3868{
3869 struct rbd_device *rbd_dev = container_of(to_delayed_work(work),
3870 struct rbd_device, watch_dwork);
Ilya Dryomoved95b212016-08-12 16:40:02 +02003871 bool was_lock_owner = false;
Ilya Dryomov87c0fde2016-09-29 13:41:05 +02003872 bool need_to_wake = false;
Ilya Dryomov99d16942016-08-12 16:11:41 +02003873 int ret;
3874
3875 dout("%s rbd_dev %p\n", __func__, rbd_dev);
3876
Ilya Dryomoved95b212016-08-12 16:40:02 +02003877 down_write(&rbd_dev->lock_rwsem);
3878 if (rbd_dev->lock_state == RBD_LOCK_STATE_LOCKED)
3879 was_lock_owner = rbd_release_lock(rbd_dev);
3880
Ilya Dryomov99d16942016-08-12 16:11:41 +02003881 mutex_lock(&rbd_dev->watch_mutex);
Ilya Dryomov87c0fde2016-09-29 13:41:05 +02003882 if (rbd_dev->watch_state != RBD_WATCH_STATE_ERROR) {
3883 mutex_unlock(&rbd_dev->watch_mutex);
3884 goto out;
3885 }
Ilya Dryomov99d16942016-08-12 16:11:41 +02003886
3887 ret = __rbd_register_watch(rbd_dev);
3888 if (ret) {
3889 rbd_warn(rbd_dev, "failed to reregister watch: %d", ret);
Ilya Dryomov4d736442016-09-29 14:23:12 +02003890 if (ret == -EBLACKLISTED || ret == -ENOENT) {
Ilya Dryomov87c0fde2016-09-29 13:41:05 +02003891 set_bit(RBD_DEV_FLAG_BLACKLISTED, &rbd_dev->flags);
3892 need_to_wake = true;
3893 } else {
Ilya Dryomov99d16942016-08-12 16:11:41 +02003894 queue_delayed_work(rbd_dev->task_wq,
3895 &rbd_dev->watch_dwork,
3896 RBD_RETRY_DELAY);
Ilya Dryomov87c0fde2016-09-29 13:41:05 +02003897 }
3898 mutex_unlock(&rbd_dev->watch_mutex);
3899 goto out;
Ilya Dryomov99d16942016-08-12 16:11:41 +02003900 }
3901
Ilya Dryomov87c0fde2016-09-29 13:41:05 +02003902 need_to_wake = true;
Ilya Dryomov99d16942016-08-12 16:11:41 +02003903 rbd_dev->watch_state = RBD_WATCH_STATE_REGISTERED;
3904 rbd_dev->watch_cookie = rbd_dev->watch_handle->linger_id;
3905 mutex_unlock(&rbd_dev->watch_mutex);
3906
3907 ret = rbd_dev_refresh(rbd_dev);
3908 if (ret)
3909		rbd_warn(rbd_dev, "reregistration refresh failed: %d", ret);
3910
Ilya Dryomoved95b212016-08-12 16:40:02 +02003911 if (was_lock_owner) {
3912 ret = rbd_try_lock(rbd_dev);
3913 if (ret)
3914			rbd_warn(rbd_dev, "reregistration lock failed: %d",
3915 ret);
3916 }
3917
Ilya Dryomov87c0fde2016-09-29 13:41:05 +02003918out:
Ilya Dryomoved95b212016-08-12 16:40:02 +02003919 up_write(&rbd_dev->lock_rwsem);
Ilya Dryomov87c0fde2016-09-29 13:41:05 +02003920 if (need_to_wake)
3921 wake_requests(rbd_dev, true);
Ilya Dryomov99d16942016-08-12 16:11:41 +02003922}
3923
Alex Elder36be9a72013-01-19 00:30:28 -06003924/*
Alex Elderf40eb342013-04-25 15:09:42 -05003925 * Synchronous osd object method call. Returns the number of bytes
3926 * returned in the outbound buffer, or a negative error code.
Alex Elder36be9a72013-01-19 00:30:28 -06003927 */
3928static int rbd_obj_method_sync(struct rbd_device *rbd_dev,
Ilya Dryomovecd4a682017-01-25 18:16:21 +01003929 struct ceph_object_id *oid,
3930 struct ceph_object_locator *oloc,
Alex Elder36be9a72013-01-19 00:30:28 -06003931 const char *method_name,
Alex Elder41579762013-04-21 12:14:45 -05003932 const void *outbound,
Alex Elder36be9a72013-01-19 00:30:28 -06003933 size_t outbound_size,
Alex Elder41579762013-04-21 12:14:45 -05003934 void *inbound,
Alex Eldere2a58ee2013-04-30 00:44:33 -05003935 size_t inbound_size)
Alex Elder36be9a72013-01-19 00:30:28 -06003936{
Ilya Dryomovecd4a682017-01-25 18:16:21 +01003937 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
3938 struct page *req_page = NULL;
3939 struct page *reply_page;
Alex Elder36be9a72013-01-19 00:30:28 -06003940 int ret;
3941
3942 /*
Alex Elder6010a452013-04-05 01:27:11 -05003943 * Method calls are ultimately read operations. The result
3944	 * should be placed into the inbound buffer provided. They
3945 * also supply outbound data--parameters for the object
3946 * method. Currently if this is present it will be a
3947 * snapshot id.
Alex Elder36be9a72013-01-19 00:30:28 -06003948 */
Ilya Dryomovecd4a682017-01-25 18:16:21 +01003949 if (outbound) {
3950 if (outbound_size > PAGE_SIZE)
3951 return -E2BIG;
Alex Elder36be9a72013-01-19 00:30:28 -06003952
Ilya Dryomovecd4a682017-01-25 18:16:21 +01003953 req_page = alloc_page(GFP_KERNEL);
3954 if (!req_page)
3955 return -ENOMEM;
Alex Elder36be9a72013-01-19 00:30:28 -06003956
Ilya Dryomovecd4a682017-01-25 18:16:21 +01003957 memcpy(page_address(req_page), outbound, outbound_size);
Alex Elder04017e22013-04-05 14:46:02 -05003958 }
Alex Elder430c28c2013-04-03 21:32:51 -05003959
Ilya Dryomovecd4a682017-01-25 18:16:21 +01003960 reply_page = alloc_page(GFP_KERNEL);
3961 if (!reply_page) {
3962 if (req_page)
3963 __free_page(req_page);
3964 return -ENOMEM;
3965 }
Alex Elder36be9a72013-01-19 00:30:28 -06003966
Ilya Dryomovecd4a682017-01-25 18:16:21 +01003967 ret = ceph_osdc_call(osdc, oid, oloc, RBD_DRV_NAME, method_name,
3968 CEPH_OSD_FLAG_READ, req_page, outbound_size,
3969 reply_page, &inbound_size);
3970 if (!ret) {
3971 memcpy(inbound, page_address(reply_page), inbound_size);
3972 ret = inbound_size;
3973 }
Alex Elder57385b52013-04-21 12:14:45 -05003974
Ilya Dryomovecd4a682017-01-25 18:16:21 +01003975 if (req_page)
3976 __free_page(req_page);
3977 __free_page(reply_page);
Alex Elder36be9a72013-01-19 00:30:28 -06003978 return ret;
3979}
3980
Ilya Dryomoved95b212016-08-12 16:40:02 +02003981/*
3982 * lock_rwsem must be held for read
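 * (it is dropped and re-acquired around each sleep); returns with it
 * still held for read once the lock has been acquired or the client
 * has been blacklisted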
3983 */
3984static void rbd_wait_state_locked(struct rbd_device *rbd_dev)
3985{
3986 DEFINE_WAIT(wait);
3987
3988 do {
3989 /*
3990 * Note the use of mod_delayed_work() in rbd_acquire_lock()
3991 * and cancel_delayed_work() in wake_requests().
3992 */
3993 dout("%s rbd_dev %p queueing lock_dwork\n", __func__, rbd_dev);
3994 queue_delayed_work(rbd_dev->task_wq, &rbd_dev->lock_dwork, 0);
3995 prepare_to_wait_exclusive(&rbd_dev->lock_waitq, &wait,
3996 TASK_UNINTERRUPTIBLE);
3997 up_read(&rbd_dev->lock_rwsem);
3998 schedule();
3999 down_read(&rbd_dev->lock_rwsem);
Ilya Dryomov87c0fde2016-09-29 13:41:05 +02004000 } while (rbd_dev->lock_state != RBD_LOCK_STATE_LOCKED &&
4001 !test_bit(RBD_DEV_FLAG_BLACKLISTED, &rbd_dev->flags));
4002
Ilya Dryomoved95b212016-08-12 16:40:02 +02004003 finish_wait(&rbd_dev->lock_waitq, &wait);
4004}
4005
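/*
 * Worker for a single block layer request; one work_struct is embedded
 * in each request's pdu.  Validates the request, waits for the
 * exclusive lock if the mapping requires it, then builds an image
 * request and submits it to the osd client.
 */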
Christoph Hellwig7ad18af2015-01-13 17:20:04 +01004006static void rbd_queue_workfn(struct work_struct *work)
Ilya Dryomovbc1ecc62014-08-04 18:04:39 +04004007{
Christoph Hellwig7ad18af2015-01-13 17:20:04 +01004008 struct request *rq = blk_mq_rq_from_pdu(work);
4009 struct rbd_device *rbd_dev = rq->q->queuedata;
Ilya Dryomovbc1ecc62014-08-04 18:04:39 +04004010 struct rbd_img_request *img_request;
Josh Durgin4e752f02014-04-08 11:12:11 -07004011 struct ceph_snap_context *snapc = NULL;
Ilya Dryomovbc1ecc62014-08-04 18:04:39 +04004012 u64 offset = (u64)blk_rq_pos(rq) << SECTOR_SHIFT;
4013 u64 length = blk_rq_bytes(rq);
Guangliang Zhao6d2940c2014-03-13 11:21:35 +08004014 enum obj_operation_type op_type;
Josh Durgin4e752f02014-04-08 11:12:11 -07004015 u64 mapping_size;
Ilya Dryomov80de1912016-09-20 14:23:17 +02004016 bool must_be_locked;
Ilya Dryomovbc1ecc62014-08-04 18:04:39 +04004017 int result;
4018
Christoph Hellwig7ad18af2015-01-13 17:20:04 +01004019 if (rq->cmd_type != REQ_TYPE_FS) {
4020 dout("%s: non-fs request type %d\n", __func__,
4021 (int) rq->cmd_type);
4022 result = -EIO;
4023 goto err;
4024 }
4025
Mike Christiec2df40d2016-06-05 14:32:17 -05004026 if (req_op(rq) == REQ_OP_DISCARD)
Guangliang Zhao90e98c52014-04-01 22:22:16 +08004027 op_type = OBJ_OP_DISCARD;
Mike Christiec2df40d2016-06-05 14:32:17 -05004028 else if (req_op(rq) == REQ_OP_WRITE)
Guangliang Zhao6d2940c2014-03-13 11:21:35 +08004029 op_type = OBJ_OP_WRITE;
4030 else
4031 op_type = OBJ_OP_READ;
4032
Ilya Dryomovbc1ecc62014-08-04 18:04:39 +04004033 /* Ignore/skip any zero-length requests */
4034
4035 if (!length) {
4036 dout("%s: zero-length request\n", __func__);
4037 result = 0;
4038 goto err_rq;
4039 }
4040
Guangliang Zhao6d2940c2014-03-13 11:21:35 +08004041 /* Only reads are allowed to a read-only device */
Ilya Dryomovbc1ecc62014-08-04 18:04:39 +04004042
Guangliang Zhao6d2940c2014-03-13 11:21:35 +08004043 if (op_type != OBJ_OP_READ) {
Ilya Dryomovbc1ecc62014-08-04 18:04:39 +04004044 if (rbd_dev->mapping.read_only) {
4045 result = -EROFS;
4046 goto err_rq;
4047 }
4048 rbd_assert(rbd_dev->spec->snap_id == CEPH_NOSNAP);
4049 }
4050
4051 /*
4052 * Quit early if the mapped snapshot no longer exists. It's
4053 * still possible the snapshot will have disappeared by the
4054 * time our request arrives at the osd, but there's no sense in
4055 * sending it if we already know.
4056 */
4057 if (!test_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags)) {
4058 dout("request for non-existent snapshot\n");
4059 rbd_assert(rbd_dev->spec->snap_id != CEPH_NOSNAP);
4060 result = -ENXIO;
4061 goto err_rq;
4062 }
4063
4064 if (offset && length > U64_MAX - offset + 1) {
4065 rbd_warn(rbd_dev, "bad request range (%llu~%llu)", offset,
4066 length);
4067 result = -EINVAL;
4068 goto err_rq; /* Shouldn't happen */
4069 }
4070
Christoph Hellwig7ad18af2015-01-13 17:20:04 +01004071 blk_mq_start_request(rq);
4072
Josh Durgin4e752f02014-04-08 11:12:11 -07004073 down_read(&rbd_dev->header_rwsem);
4074 mapping_size = rbd_dev->mapping.size;
Guangliang Zhao6d2940c2014-03-13 11:21:35 +08004075 if (op_type != OBJ_OP_READ) {
Josh Durgin4e752f02014-04-08 11:12:11 -07004076 snapc = rbd_dev->header.snapc;
4077 ceph_get_snap_context(snapc);
Ilya Dryomoved95b212016-08-12 16:40:02 +02004078 must_be_locked = rbd_is_lock_supported(rbd_dev);
Ilya Dryomov80de1912016-09-20 14:23:17 +02004079 } else {
4080 must_be_locked = rbd_dev->opts->lock_on_read &&
4081 rbd_is_lock_supported(rbd_dev);
Josh Durgin4e752f02014-04-08 11:12:11 -07004082 }
4083 up_read(&rbd_dev->header_rwsem);
4084
4085 if (offset + length > mapping_size) {
Ilya Dryomovbc1ecc62014-08-04 18:04:39 +04004086 rbd_warn(rbd_dev, "beyond EOD (%llu~%llu > %llu)", offset,
Josh Durgin4e752f02014-04-08 11:12:11 -07004087 length, mapping_size);
Ilya Dryomovbc1ecc62014-08-04 18:04:39 +04004088 result = -EIO;
4089 goto err_rq;
4090 }
4091
Ilya Dryomoved95b212016-08-12 16:40:02 +02004092 if (must_be_locked) {
4093 down_read(&rbd_dev->lock_rwsem);
Ilya Dryomov87c0fde2016-09-29 13:41:05 +02004094 if (rbd_dev->lock_state != RBD_LOCK_STATE_LOCKED &&
4095 !test_bit(RBD_DEV_FLAG_BLACKLISTED, &rbd_dev->flags))
Ilya Dryomoved95b212016-08-12 16:40:02 +02004096 rbd_wait_state_locked(rbd_dev);
Ilya Dryomov87c0fde2016-09-29 13:41:05 +02004097
4098 WARN_ON((rbd_dev->lock_state == RBD_LOCK_STATE_LOCKED) ^
4099 !test_bit(RBD_DEV_FLAG_BLACKLISTED, &rbd_dev->flags));
4100 if (test_bit(RBD_DEV_FLAG_BLACKLISTED, &rbd_dev->flags)) {
4101 result = -EBLACKLISTED;
4102 goto err_unlock;
4103 }
Ilya Dryomoved95b212016-08-12 16:40:02 +02004104 }
4105
Guangliang Zhao6d2940c2014-03-13 11:21:35 +08004106 img_request = rbd_img_request_create(rbd_dev, offset, length, op_type,
Josh Durgin4e752f02014-04-08 11:12:11 -07004107 snapc);
Ilya Dryomovbc1ecc62014-08-04 18:04:39 +04004108 if (!img_request) {
4109 result = -ENOMEM;
Ilya Dryomoved95b212016-08-12 16:40:02 +02004110 goto err_unlock;
Ilya Dryomovbc1ecc62014-08-04 18:04:39 +04004111 }
4112 img_request->rq = rq;
Ilya Dryomov70b16db2015-11-27 19:23:24 +01004113 snapc = NULL; /* img_request consumes a ref */
Ilya Dryomovbc1ecc62014-08-04 18:04:39 +04004114
Guangliang Zhao90e98c52014-04-01 22:22:16 +08004115 if (op_type == OBJ_OP_DISCARD)
4116 result = rbd_img_request_fill(img_request, OBJ_REQUEST_NODATA,
4117 NULL);
4118 else
4119 result = rbd_img_request_fill(img_request, OBJ_REQUEST_BIO,
4120 rq->bio);
Ilya Dryomovbc1ecc62014-08-04 18:04:39 +04004121 if (result)
4122 goto err_img_request;
4123
4124 result = rbd_img_request_submit(img_request);
4125 if (result)
4126 goto err_img_request;
4127
Ilya Dryomoved95b212016-08-12 16:40:02 +02004128 if (must_be_locked)
4129 up_read(&rbd_dev->lock_rwsem);
Ilya Dryomovbc1ecc62014-08-04 18:04:39 +04004130 return;
4131
4132err_img_request:
4133 rbd_img_request_put(img_request);
Ilya Dryomoved95b212016-08-12 16:40:02 +02004134err_unlock:
4135 if (must_be_locked)
4136 up_read(&rbd_dev->lock_rwsem);
Ilya Dryomovbc1ecc62014-08-04 18:04:39 +04004137err_rq:
4138 if (result)
4139 rbd_warn(rbd_dev, "%s %llx at %llx result %d",
Guangliang Zhao6d2940c2014-03-13 11:21:35 +08004140 obj_op_name(op_type), length, offset, result);
SF Markus Elfringe96a6502014-11-02 15:20:59 +01004141 ceph_put_snap_context(snapc);
Christoph Hellwig7ad18af2015-01-13 17:20:04 +01004142err:
4143 blk_mq_end_request(rq, result);
Ilya Dryomovbc1ecc62014-08-04 18:04:39 +04004144}
4145
Christoph Hellwig7ad18af2015-01-13 17:20:04 +01004146static int rbd_queue_rq(struct blk_mq_hw_ctx *hctx,
4147 const struct blk_mq_queue_data *bd)
Ilya Dryomovbc1ecc62014-08-04 18:04:39 +04004148{
Christoph Hellwig7ad18af2015-01-13 17:20:04 +01004149 struct request *rq = bd->rq;
4150 struct work_struct *work = blk_mq_rq_to_pdu(rq);
Ilya Dryomovbc1ecc62014-08-04 18:04:39 +04004151
Christoph Hellwig7ad18af2015-01-13 17:20:04 +01004152 queue_work(rbd_wq, work);
4153 return BLK_MQ_RQ_QUEUE_OK;
Alex Elderbf0d5f502012-11-22 00:00:08 -06004154}
4155
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004156static void rbd_free_disk(struct rbd_device *rbd_dev)
4157{
4158 struct gendisk *disk = rbd_dev->disk;
4159
4160 if (!disk)
4161 return;
4162
Alex Eldera0cab922013-04-25 23:15:08 -05004163 rbd_dev->disk = NULL;
4164 if (disk->flags & GENHD_FL_UP) {
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004165 del_gendisk(disk);
Alex Eldera0cab922013-04-25 23:15:08 -05004166 if (disk->queue)
4167 blk_cleanup_queue(disk->queue);
Christoph Hellwig7ad18af2015-01-13 17:20:04 +01004168 blk_mq_free_tag_set(&rbd_dev->tag_set);
Alex Eldera0cab922013-04-25 23:15:08 -05004169 }
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004170 put_disk(disk);
4171}
4172
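/*
 * Synchronously read up to buf_len bytes from the start of the given
 * object into buf.  Returns the number of bytes read, or a negative
 * error code.
 */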
Alex Elder788e2df2013-01-17 12:25:27 -06004173static int rbd_obj_read_sync(struct rbd_device *rbd_dev,
Ilya Dryomovfe5478e2017-01-25 18:16:21 +01004174 struct ceph_object_id *oid,
4175 struct ceph_object_locator *oloc,
4176 void *buf, int buf_len)
Alex Elder788e2df2013-01-17 12:25:27 -06004177
4178{
Ilya Dryomovfe5478e2017-01-25 18:16:21 +01004179 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
4180 struct ceph_osd_request *req;
4181 struct page **pages;
4182 int num_pages = calc_pages_for(0, buf_len);
Alex Elder788e2df2013-01-17 12:25:27 -06004183 int ret;
4184
Ilya Dryomovfe5478e2017-01-25 18:16:21 +01004185 req = ceph_osdc_alloc_request(osdc, NULL, 1, false, GFP_KERNEL);
4186 if (!req)
4187 return -ENOMEM;
Alex Elder788e2df2013-01-17 12:25:27 -06004188
Ilya Dryomovfe5478e2017-01-25 18:16:21 +01004189 ceph_oid_copy(&req->r_base_oid, oid);
4190 ceph_oloc_copy(&req->r_base_oloc, oloc);
4191 req->r_flags = CEPH_OSD_FLAG_READ;
Alex Elder788e2df2013-01-17 12:25:27 -06004192
Ilya Dryomovfe5478e2017-01-25 18:16:21 +01004193 ret = ceph_osdc_alloc_messages(req, GFP_KERNEL);
Alex Elder788e2df2013-01-17 12:25:27 -06004194 if (ret)
Ilya Dryomovfe5478e2017-01-25 18:16:21 +01004195 goto out_req;
Alex Elder788e2df2013-01-17 12:25:27 -06004196
Ilya Dryomovfe5478e2017-01-25 18:16:21 +01004197 pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL);
4198 if (IS_ERR(pages)) {
4199 ret = PTR_ERR(pages);
4200 goto out_req;
4201 }
Alex Elder1ceae7e2013-02-06 13:11:38 -06004202
Ilya Dryomovfe5478e2017-01-25 18:16:21 +01004203 osd_req_op_extent_init(req, 0, CEPH_OSD_OP_READ, 0, buf_len, 0, 0);
4204 osd_req_op_extent_osd_data_pages(req, 0, pages, buf_len, 0, false,
4205 true);
Alex Elder788e2df2013-01-17 12:25:27 -06004206
Ilya Dryomovfe5478e2017-01-25 18:16:21 +01004207 ceph_osdc_start_request(osdc, req, false);
4208 ret = ceph_osdc_wait_request(osdc, req);
4209 if (ret >= 0)
4210 ceph_copy_from_page_vector(pages, buf, 0, ret);
4211
4212out_req:
4213 ceph_osdc_put_request(req);
Alex Elder788e2df2013-01-17 12:25:27 -06004214 return ret;
4215}
4216
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004217/*
Alex Elder662518b2013-05-06 09:51:29 -05004218 * Read the complete header for the given rbd device. On successful
4219 * return, the rbd_dev->header field will contain up-to-date
4220 * information about the image.
Alex Elder4156d992012-08-02 11:29:46 -05004221 */
Alex Elder99a41eb2013-05-06 09:51:30 -05004222static int rbd_dev_v1_header_info(struct rbd_device *rbd_dev)
Alex Elder4156d992012-08-02 11:29:46 -05004223{
4224 struct rbd_image_header_ondisk *ondisk = NULL;
4225 u32 snap_count = 0;
4226 u64 names_size = 0;
4227 u32 want_count;
4228 int ret;
4229
4230 /*
4231 * The complete header will include an array of its 64-bit
4232 * snapshot ids, followed by the names of those snapshots as
4233 * a contiguous block of NUL-terminated strings. Note that
4234 * the number of snapshots could change by the time we read
4235 * it in, in which case we re-read it.
4236 */
4237 do {
4238 size_t size;
4239
4240 kfree(ondisk);
4241
4242 size = sizeof (*ondisk);
4243 size += snap_count * sizeof (struct rbd_image_snap_ondisk);
4244 size += names_size;
4245 ondisk = kmalloc(size, GFP_KERNEL);
4246 if (!ondisk)
Alex Elder662518b2013-05-06 09:51:29 -05004247 return -ENOMEM;
Alex Elder4156d992012-08-02 11:29:46 -05004248
Ilya Dryomovfe5478e2017-01-25 18:16:21 +01004249 ret = rbd_obj_read_sync(rbd_dev, &rbd_dev->header_oid,
4250 &rbd_dev->header_oloc, ondisk, size);
Alex Elder4156d992012-08-02 11:29:46 -05004251 if (ret < 0)
Alex Elder662518b2013-05-06 09:51:29 -05004252 goto out;
Alex Elderc0cd10db2013-04-26 09:43:47 -05004253 if ((size_t)ret < size) {
Alex Elder4156d992012-08-02 11:29:46 -05004254 ret = -ENXIO;
Alex Elder06ecc6c2012-11-01 10:17:15 -05004255 rbd_warn(rbd_dev, "short header read (want %zd got %d)",
4256 size, ret);
Alex Elder662518b2013-05-06 09:51:29 -05004257 goto out;
Alex Elder4156d992012-08-02 11:29:46 -05004258 }
4259 if (!rbd_dev_ondisk_valid(ondisk)) {
4260 ret = -ENXIO;
Alex Elder06ecc6c2012-11-01 10:17:15 -05004261 rbd_warn(rbd_dev, "invalid header");
Alex Elder662518b2013-05-06 09:51:29 -05004262 goto out;
Alex Elder4156d992012-08-02 11:29:46 -05004263 }
4264
4265 names_size = le64_to_cpu(ondisk->snap_names_len);
4266 want_count = snap_count;
4267 snap_count = le32_to_cpu(ondisk->snap_count);
4268 } while (snap_count != want_count);
4269
Alex Elder662518b2013-05-06 09:51:29 -05004270 ret = rbd_header_from_disk(rbd_dev, ondisk);
4271out:
Alex Elder4156d992012-08-02 11:29:46 -05004272 kfree(ondisk);
4273
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004274 return ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004275}
4276
Alex Elder15228ed2013-05-01 12:43:03 -05004277/*
4278 * Clear the rbd device's EXISTS flag if the snapshot it's mapped to
4279 * has disappeared from the (just updated) snapshot context.
4280 */
4281static void rbd_exists_validate(struct rbd_device *rbd_dev)
4282{
4283 u64 snap_id;
4284
4285 if (!test_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags))
4286 return;
4287
4288 snap_id = rbd_dev->spec->snap_id;
4289 if (snap_id == CEPH_NOSNAP)
4290 return;
4291
4292 if (rbd_dev_snap_index(rbd_dev, snap_id) == BAD_SNAP_INDEX)
4293 clear_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags);
4294}
4295
Josh Durgin98752012013-08-29 17:26:31 -07004296static void rbd_dev_update_size(struct rbd_device *rbd_dev)
4297{
4298 sector_t size;
Josh Durgin98752012013-08-29 17:26:31 -07004299
4300 /*
Ilya Dryomov811c6682016-04-15 16:22:16 +02004301 * If EXISTS is not set, rbd_dev->disk may be NULL, so don't
4302 * try to update its size. If REMOVING is set, updating size
4303 * is just useless work since the device can't be opened.
Josh Durgin98752012013-08-29 17:26:31 -07004304 */
Ilya Dryomov811c6682016-04-15 16:22:16 +02004305 if (test_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags) &&
4306 !test_bit(RBD_DEV_FLAG_REMOVING, &rbd_dev->flags)) {
Josh Durgin98752012013-08-29 17:26:31 -07004307 size = (sector_t)rbd_dev->mapping.size / SECTOR_SIZE;
4308 dout("setting size to %llu sectors\n", (unsigned long long)size);
4309 set_capacity(rbd_dev->disk, size);
4310 revalidate_disk(rbd_dev->disk);
4311 }
4312}
4313
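/*
 * Re-read the image header and apply the result: refresh the parent
 * info (if any), update the mapping size or revalidate the mapped
 * snapshot's EXISTS flag, and resize the block device if the mapping
 * size changed.
 */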
Alex Eldercc4a38bd2013-04-30 00:44:33 -05004314static int rbd_dev_refresh(struct rbd_device *rbd_dev)
Alex Elder1fe5e992012-07-25 09:32:41 -05004315{
Alex Eldere627db02013-05-06 07:40:30 -05004316 u64 mapping_size;
Alex Elder1fe5e992012-07-25 09:32:41 -05004317 int ret;
4318
Alex Eldercfbf6372013-05-31 17:40:45 -05004319 down_write(&rbd_dev->header_rwsem);
Alex Elder3b5cf2a2013-05-29 11:18:59 -05004320 mapping_size = rbd_dev->mapping.size;
Ilya Dryomova720ae02014-07-23 17:11:19 +04004321
4322 ret = rbd_dev_header_info(rbd_dev);
Ilya Dryomov52bb1f92014-07-23 17:11:20 +04004323 if (ret)
Ilya Dryomov73e39e42015-01-08 20:18:22 +03004324 goto out;
Alex Elder15228ed2013-05-01 12:43:03 -05004325
Ilya Dryomove8f59b52014-07-24 10:42:13 +04004326 /*
4327 * If there is a parent, see if it has disappeared due to the
4328 * mapped image getting flattened.
4329 */
4330 if (rbd_dev->parent) {
4331 ret = rbd_dev_v2_parent_info(rbd_dev);
4332 if (ret)
Ilya Dryomov73e39e42015-01-08 20:18:22 +03004333 goto out;
Ilya Dryomove8f59b52014-07-24 10:42:13 +04004334 }
4335
Ilya Dryomov5ff11082014-07-23 17:11:21 +04004336 if (rbd_dev->spec->snap_id == CEPH_NOSNAP) {
Ilya Dryomov73e39e42015-01-08 20:18:22 +03004337 rbd_dev->mapping.size = rbd_dev->header.image_size;
Ilya Dryomov5ff11082014-07-23 17:11:21 +04004338 } else {
4339 /* validate mapped snapshot's EXISTS flag */
4340 rbd_exists_validate(rbd_dev);
4341 }
Alex Elder15228ed2013-05-01 12:43:03 -05004342
Ilya Dryomov73e39e42015-01-08 20:18:22 +03004343out:
Alex Eldercfbf6372013-05-31 17:40:45 -05004344 up_write(&rbd_dev->header_rwsem);
Ilya Dryomov73e39e42015-01-08 20:18:22 +03004345 if (!ret && mapping_size != rbd_dev->mapping.size)
Josh Durgin98752012013-08-29 17:26:31 -07004346 rbd_dev_update_size(rbd_dev);
Alex Elder1fe5e992012-07-25 09:32:41 -05004347
Ilya Dryomov73e39e42015-01-08 20:18:22 +03004348 return ret;
Alex Elder1fe5e992012-07-25 09:32:41 -05004349}
4350
Christoph Hellwig7ad18af2015-01-13 17:20:04 +01004351static int rbd_init_request(void *data, struct request *rq,
4352 unsigned int hctx_idx, unsigned int request_idx,
4353 unsigned int numa_node)
4354{
4355 struct work_struct *work = blk_mq_rq_to_pdu(rq);
4356
4357 INIT_WORK(work, rbd_queue_workfn);
4358 return 0;
4359}
4360
4361static struct blk_mq_ops rbd_mq_ops = {
4362 .queue_rq = rbd_queue_rq,
Christoph Hellwig7ad18af2015-01-13 17:20:04 +01004363 .init_request = rbd_init_request,
4364};
4365
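/*
 * Set up the gendisk and its blk-mq queue for rbd_dev, with all I/O
 * limits derived from the image's object size.
 */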
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004366static int rbd_init_disk(struct rbd_device *rbd_dev)
4367{
4368 struct gendisk *disk;
4369 struct request_queue *q;
Alex Elder593a9e72012-02-07 12:03:37 -06004370 u64 segment_size;
Christoph Hellwig7ad18af2015-01-13 17:20:04 +01004371 int err;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004372
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004373 /* create gendisk info */
Ilya Dryomov7e513d42013-12-16 19:26:32 +02004374 disk = alloc_disk(single_major ?
4375 (1 << RBD_SINGLE_MAJOR_PART_SHIFT) :
4376 RBD_MINORS_PER_MAJOR);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004377 if (!disk)
Alex Elder1fcdb8a2012-08-29 17:11:06 -05004378 return -ENOMEM;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004379
Alex Elderf0f8cef2012-01-29 13:57:44 -06004380 snprintf(disk->disk_name, sizeof(disk->disk_name), RBD_DRV_NAME "%d",
Alex Elderde71a292012-07-03 16:01:19 -05004381 rbd_dev->dev_id);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004382 disk->major = rbd_dev->major;
Ilya Dryomovdd82fff2013-12-13 15:28:57 +02004383 disk->first_minor = rbd_dev->minor;
Ilya Dryomov7e513d42013-12-16 19:26:32 +02004384 if (single_major)
4385 disk->flags |= GENHD_FL_EXT_DEVT;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004386 disk->fops = &rbd_bd_ops;
4387 disk->private_data = rbd_dev;
4388
Christoph Hellwig7ad18af2015-01-13 17:20:04 +01004389 memset(&rbd_dev->tag_set, 0, sizeof(rbd_dev->tag_set));
4390 rbd_dev->tag_set.ops = &rbd_mq_ops;
Ilya Dryomovb5584182015-06-23 16:21:19 +03004391 rbd_dev->tag_set.queue_depth = rbd_dev->opts->queue_depth;
Christoph Hellwig7ad18af2015-01-13 17:20:04 +01004392 rbd_dev->tag_set.numa_node = NUMA_NO_NODE;
Ilya Dryomovb5584182015-06-23 16:21:19 +03004393 rbd_dev->tag_set.flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_SG_MERGE;
Christoph Hellwig7ad18af2015-01-13 17:20:04 +01004394 rbd_dev->tag_set.nr_hw_queues = 1;
4395 rbd_dev->tag_set.cmd_size = sizeof(struct work_struct);
4396
4397 err = blk_mq_alloc_tag_set(&rbd_dev->tag_set);
4398 if (err)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004399 goto out_disk;
Josh Durgin029bcbd2011-07-22 11:35:23 -07004400
Christoph Hellwig7ad18af2015-01-13 17:20:04 +01004401 q = blk_mq_init_queue(&rbd_dev->tag_set);
4402 if (IS_ERR(q)) {
4403 err = PTR_ERR(q);
4404 goto out_tag_set;
4405 }
4406
Ilya Dryomovd8a2c892015-03-24 16:15:17 +03004407 queue_flag_set_unlocked(QUEUE_FLAG_NONROT, q);
4408 /* QUEUE_FLAG_ADD_RANDOM is off by default for blk-mq */
Alex Elder593a9e72012-02-07 12:03:37 -06004409
Josh Durgin029bcbd2011-07-22 11:35:23 -07004410 /* set io sizes to object size */
Alex Elder593a9e72012-02-07 12:03:37 -06004411 segment_size = rbd_obj_bytes(&rbd_dev->header);
4412 blk_queue_max_hw_sectors(q, segment_size / SECTOR_SIZE);
Ilya Dryomov0d9fde42015-10-07 16:09:35 +02004413 q->limits.max_sectors = queue_max_hw_sectors(q);
Ilya Dryomovd3834fe2015-06-12 19:19:02 +03004414 blk_queue_max_segments(q, segment_size / SECTOR_SIZE);
Alex Elder593a9e72012-02-07 12:03:37 -06004415 blk_queue_max_segment_size(q, segment_size);
4416 blk_queue_io_min(q, segment_size);
4417 blk_queue_io_opt(q, segment_size);
Josh Durgin029bcbd2011-07-22 11:35:23 -07004418
Guangliang Zhao90e98c52014-04-01 22:22:16 +08004419 /* enable the discard support */
4420 queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, q);
4421 q->limits.discard_granularity = segment_size;
4422 q->limits.discard_alignment = segment_size;
Jens Axboe2bb4cd52015-07-14 08:15:12 -06004423 blk_queue_max_discard_sectors(q, segment_size / SECTOR_SIZE);
Josh Durginb76f8232014-04-07 16:52:03 -07004424 q->limits.discard_zeroes_data = 1;
Guangliang Zhao90e98c52014-04-01 22:22:16 +08004425
Ronny Hegewaldbae818e2015-10-15 18:50:46 +00004426 if (!ceph_test_opt(rbd_dev->rbd_client->client, NOCRC))
4427 q->backing_dev_info.capabilities |= BDI_CAP_STABLE_WRITES;
4428
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004429 disk->queue = q;
4430
4431 q->queuedata = rbd_dev;
4432
4433 rbd_dev->disk = disk;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004434
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004435 return 0;
Christoph Hellwig7ad18af2015-01-13 17:20:04 +01004436out_tag_set:
4437 blk_mq_free_tag_set(&rbd_dev->tag_set);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004438out_disk:
4439 put_disk(disk);
Christoph Hellwig7ad18af2015-01-13 17:20:04 +01004440 return err;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004441}
4442
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004443/*
4444 sysfs
4445*/
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004446
Alex Elder593a9e72012-02-07 12:03:37 -06004447static struct rbd_device *dev_to_rbd_dev(struct device *dev)
4448{
4449 return container_of(dev, struct rbd_device, dev);
4450}
4451
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004452static ssize_t rbd_size_show(struct device *dev,
4453 struct device_attribute *attr, char *buf)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004454{
Alex Elder593a9e72012-02-07 12:03:37 -06004455 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004456
Alex Elderfc71d832013-04-26 15:44:36 -05004457 return sprintf(buf, "%llu\n",
4458 (unsigned long long)rbd_dev->mapping.size);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004459}
4460
Alex Elder34b13182012-07-13 20:35:12 -05004461/*
4462 * Note this shows the features for whatever's mapped, which is not
4463 * necessarily the base image.
4464 */
4465static ssize_t rbd_features_show(struct device *dev,
4466 struct device_attribute *attr, char *buf)
4467{
4468 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
4469
4470 return sprintf(buf, "0x%016llx\n",
Alex Elderfc71d832013-04-26 15:44:36 -05004471 (unsigned long long)rbd_dev->mapping.features);
Alex Elder34b13182012-07-13 20:35:12 -05004472}
4473
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004474static ssize_t rbd_major_show(struct device *dev,
4475 struct device_attribute *attr, char *buf)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004476{
Alex Elder593a9e72012-02-07 12:03:37 -06004477 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004478
Alex Elderfc71d832013-04-26 15:44:36 -05004479 if (rbd_dev->major)
4480 return sprintf(buf, "%d\n", rbd_dev->major);
4481
4482 return sprintf(buf, "(none)\n");
Ilya Dryomovdd82fff2013-12-13 15:28:57 +02004483}
Alex Elderfc71d832013-04-26 15:44:36 -05004484
Ilya Dryomovdd82fff2013-12-13 15:28:57 +02004485static ssize_t rbd_minor_show(struct device *dev,
4486 struct device_attribute *attr, char *buf)
4487{
4488 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
4489
4490 return sprintf(buf, "%d\n", rbd_dev->minor);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004491}
4492
Ilya Dryomov005a07bf2016-08-18 18:38:43 +02004493static ssize_t rbd_client_addr_show(struct device *dev,
4494 struct device_attribute *attr, char *buf)
4495{
4496 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
4497 struct ceph_entity_addr *client_addr =
4498 ceph_client_addr(rbd_dev->rbd_client->client);
4499
4500 return sprintf(buf, "%pISpc/%u\n", &client_addr->in_addr,
4501 le32_to_cpu(client_addr->nonce));
4502}
4503
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004504static ssize_t rbd_client_id_show(struct device *dev,
4505 struct device_attribute *attr, char *buf)
4506{
Alex Elder593a9e72012-02-07 12:03:37 -06004507 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004508
Alex Elder1dbb4392012-01-24 10:08:37 -06004509 return sprintf(buf, "client%lld\n",
Ilya Dryomov033268a2016-08-12 14:59:58 +02004510 ceph_client_gid(rbd_dev->rbd_client->client));
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004511}
4512
Mike Christie267fb902016-08-18 18:38:43 +02004513static ssize_t rbd_cluster_fsid_show(struct device *dev,
4514 struct device_attribute *attr, char *buf)
4515{
4516 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
4517
4518 return sprintf(buf, "%pU\n", &rbd_dev->rbd_client->client->fsid);
4519}
4520
Mike Christie0d6d1e9c2016-08-18 18:38:45 +02004521static ssize_t rbd_config_info_show(struct device *dev,
4522 struct device_attribute *attr, char *buf)
4523{
4524 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
4525
4526 return sprintf(buf, "%s\n", rbd_dev->config_info);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004527}
4528
4529static ssize_t rbd_pool_show(struct device *dev,
4530 struct device_attribute *attr, char *buf)
4531{
Alex Elder593a9e72012-02-07 12:03:37 -06004532 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004533
Alex Elder0d7dbfc2012-10-25 23:34:41 -05004534 return sprintf(buf, "%s\n", rbd_dev->spec->pool_name);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004535}
4536
Alex Elder9bb2f332012-07-12 10:46:35 -05004537static ssize_t rbd_pool_id_show(struct device *dev,
4538 struct device_attribute *attr, char *buf)
4539{
4540 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
4541
Alex Elder0d7dbfc2012-10-25 23:34:41 -05004542 return sprintf(buf, "%llu\n",
Alex Elderfc71d832013-04-26 15:44:36 -05004543 (unsigned long long) rbd_dev->spec->pool_id);
Alex Elder9bb2f332012-07-12 10:46:35 -05004544}
4545
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004546static ssize_t rbd_name_show(struct device *dev,
4547 struct device_attribute *attr, char *buf)
4548{
Alex Elder593a9e72012-02-07 12:03:37 -06004549 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004550
Alex Eldera92ffdf2012-10-30 19:40:33 -05004551 if (rbd_dev->spec->image_name)
4552 return sprintf(buf, "%s\n", rbd_dev->spec->image_name);
4553
4554 return sprintf(buf, "(unknown)\n");
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004555}
4556
Alex Elder589d30e2012-07-10 20:30:11 -05004557static ssize_t rbd_image_id_show(struct device *dev,
4558 struct device_attribute *attr, char *buf)
4559{
4560 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
4561
Alex Elder0d7dbfc2012-10-25 23:34:41 -05004562 return sprintf(buf, "%s\n", rbd_dev->spec->image_id);
Alex Elder589d30e2012-07-10 20:30:11 -05004563}
4564
Alex Elder34b13182012-07-13 20:35:12 -05004565/*
4566 * Shows the name of the currently-mapped snapshot (or
4567 * RBD_SNAP_HEAD_NAME for the base image).
4568 */
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004569static ssize_t rbd_snap_show(struct device *dev,
4570 struct device_attribute *attr,
4571 char *buf)
4572{
Alex Elder593a9e72012-02-07 12:03:37 -06004573 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004574
Alex Elder0d7dbfc2012-10-25 23:34:41 -05004575 return sprintf(buf, "%s\n", rbd_dev->spec->snap_name);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004576}
4577
Mike Christie92a58672016-08-18 18:38:44 +02004578static ssize_t rbd_snap_id_show(struct device *dev,
4579 struct device_attribute *attr, char *buf)
4580{
4581 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
4582
4583 return sprintf(buf, "%llu\n", rbd_dev->spec->snap_id);
4584}
4585
Alex Elder86b00e02012-10-25 23:34:42 -05004586/*
Ilya Dryomovff961282014-07-22 21:53:07 +04004587 * For a v2 image, shows the chain of parent images, separated by empty
4588 * lines. For v1 images or if there is no parent, shows "(no parent
4589 * image)".
Alex Elder86b00e02012-10-25 23:34:42 -05004590 */
4591static ssize_t rbd_parent_show(struct device *dev,
Ilya Dryomovff961282014-07-22 21:53:07 +04004592 struct device_attribute *attr,
4593 char *buf)
Alex Elder86b00e02012-10-25 23:34:42 -05004594{
4595 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Ilya Dryomovff961282014-07-22 21:53:07 +04004596 ssize_t count = 0;
Alex Elder86b00e02012-10-25 23:34:42 -05004597
Ilya Dryomovff961282014-07-22 21:53:07 +04004598 if (!rbd_dev->parent)
Alex Elder86b00e02012-10-25 23:34:42 -05004599 return sprintf(buf, "(no parent image)\n");
4600
Ilya Dryomovff961282014-07-22 21:53:07 +04004601 for ( ; rbd_dev->parent; rbd_dev = rbd_dev->parent) {
4602 struct rbd_spec *spec = rbd_dev->parent_spec;
Alex Elder86b00e02012-10-25 23:34:42 -05004603
Ilya Dryomovff961282014-07-22 21:53:07 +04004604 count += sprintf(&buf[count], "%s"
4605 "pool_id %llu\npool_name %s\n"
4606 "image_id %s\nimage_name %s\n"
4607 "snap_id %llu\nsnap_name %s\n"
4608 "overlap %llu\n",
4609 !count ? "" : "\n", /* first? */
4610 spec->pool_id, spec->pool_name,
4611 spec->image_id, spec->image_name ?: "(unknown)",
4612 spec->snap_id, spec->snap_name,
4613 rbd_dev->parent_overlap);
4614 }
Alex Elder86b00e02012-10-25 23:34:42 -05004615
Ilya Dryomovff961282014-07-22 21:53:07 +04004616 return count;
Alex Elder86b00e02012-10-25 23:34:42 -05004617}
4618
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004619static ssize_t rbd_image_refresh(struct device *dev,
4620 struct device_attribute *attr,
4621 const char *buf,
4622 size_t size)
4623{
Alex Elder593a9e72012-02-07 12:03:37 -06004624 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Alex Elderb8136232012-07-25 09:32:41 -05004625 int ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004626
Alex Eldercc4a38bd2013-04-30 00:44:33 -05004627 ret = rbd_dev_refresh(rbd_dev);
Alex Eldere627db02013-05-06 07:40:30 -05004628 if (ret)
Ilya Dryomov52bb1f92014-07-23 17:11:20 +04004629 return ret;
Alex Elderb8136232012-07-25 09:32:41 -05004630
Ilya Dryomov52bb1f92014-07-23 17:11:20 +04004631 return size;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004632}
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004633
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004634static DEVICE_ATTR(size, S_IRUGO, rbd_size_show, NULL);
Alex Elder34b13182012-07-13 20:35:12 -05004635static DEVICE_ATTR(features, S_IRUGO, rbd_features_show, NULL);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004636static DEVICE_ATTR(major, S_IRUGO, rbd_major_show, NULL);
Ilya Dryomovdd82fff2013-12-13 15:28:57 +02004637static DEVICE_ATTR(minor, S_IRUGO, rbd_minor_show, NULL);
Ilya Dryomov005a07bf2016-08-18 18:38:43 +02004638static DEVICE_ATTR(client_addr, S_IRUGO, rbd_client_addr_show, NULL);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004639static DEVICE_ATTR(client_id, S_IRUGO, rbd_client_id_show, NULL);
Mike Christie267fb902016-08-18 18:38:43 +02004640static DEVICE_ATTR(cluster_fsid, S_IRUGO, rbd_cluster_fsid_show, NULL);
Mike Christie0d6d1e9c2016-08-18 18:38:45 +02004641static DEVICE_ATTR(config_info, S_IRUSR, rbd_config_info_show, NULL);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004642static DEVICE_ATTR(pool, S_IRUGO, rbd_pool_show, NULL);
Alex Elder9bb2f332012-07-12 10:46:35 -05004643static DEVICE_ATTR(pool_id, S_IRUGO, rbd_pool_id_show, NULL);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004644static DEVICE_ATTR(name, S_IRUGO, rbd_name_show, NULL);
Alex Elder589d30e2012-07-10 20:30:11 -05004645static DEVICE_ATTR(image_id, S_IRUGO, rbd_image_id_show, NULL);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004646static DEVICE_ATTR(refresh, S_IWUSR, NULL, rbd_image_refresh);
4647static DEVICE_ATTR(current_snap, S_IRUGO, rbd_snap_show, NULL);
Mike Christie92a58672016-08-18 18:38:44 +02004648static DEVICE_ATTR(snap_id, S_IRUGO, rbd_snap_id_show, NULL);
Alex Elder86b00e02012-10-25 23:34:42 -05004649static DEVICE_ATTR(parent, S_IRUGO, rbd_parent_show, NULL);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004650
4651static struct attribute *rbd_attrs[] = {
4652 &dev_attr_size.attr,
Alex Elder34b13182012-07-13 20:35:12 -05004653 &dev_attr_features.attr,
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004654 &dev_attr_major.attr,
Ilya Dryomovdd82fff2013-12-13 15:28:57 +02004655 &dev_attr_minor.attr,
Ilya Dryomov005a07bf2016-08-18 18:38:43 +02004656 &dev_attr_client_addr.attr,
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004657 &dev_attr_client_id.attr,
Mike Christie267fb902016-08-18 18:38:43 +02004658 &dev_attr_cluster_fsid.attr,
Mike Christie0d6d1e9c2016-08-18 18:38:45 +02004659 &dev_attr_config_info.attr,
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004660 &dev_attr_pool.attr,
Alex Elder9bb2f332012-07-12 10:46:35 -05004661 &dev_attr_pool_id.attr,
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004662 &dev_attr_name.attr,
Alex Elder589d30e2012-07-10 20:30:11 -05004663 &dev_attr_image_id.attr,
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004664 &dev_attr_current_snap.attr,
Mike Christie92a58672016-08-18 18:38:44 +02004665 &dev_attr_snap_id.attr,
Alex Elder86b00e02012-10-25 23:34:42 -05004666 &dev_attr_parent.attr,
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004667 &dev_attr_refresh.attr,
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004668 NULL
4669};
4670
4671static struct attribute_group rbd_attr_group = {
4672 .attrs = rbd_attrs,
4673};
4674
4675static const struct attribute_group *rbd_attr_groups[] = {
4676 &rbd_attr_group,
4677 NULL
4678};
4679
Ilya Dryomov6cac4692015-10-16 20:11:25 +02004680static void rbd_dev_release(struct device *dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004681
4682static struct device_type rbd_device_type = {
4683 .name = "rbd",
4684 .groups = rbd_attr_groups,
Ilya Dryomov6cac4692015-10-16 20:11:25 +02004685 .release = rbd_dev_release,
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004686};
4687
Alex Elder8b8fb992012-10-26 17:25:24 -05004688static struct rbd_spec *rbd_spec_get(struct rbd_spec *spec)
4689{
4690 kref_get(&spec->kref);
4691
4692 return spec;
4693}
4694
4695static void rbd_spec_free(struct kref *kref);
4696static void rbd_spec_put(struct rbd_spec *spec)
4697{
4698 if (spec)
4699 kref_put(&spec->kref, rbd_spec_free);
4700}
4701
4702static struct rbd_spec *rbd_spec_alloc(void)
4703{
4704 struct rbd_spec *spec;
4705
4706 spec = kzalloc(sizeof (*spec), GFP_KERNEL);
4707 if (!spec)
4708 return NULL;
Ilya Dryomov04077592014-07-23 17:11:20 +04004709
4710 spec->pool_id = CEPH_NOPOOL;
4711 spec->snap_id = CEPH_NOSNAP;
Alex Elder8b8fb992012-10-26 17:25:24 -05004712 kref_init(&spec->kref);
4713
Alex Elder8b8fb992012-10-26 17:25:24 -05004714 return spec;
4715}
4716
4717static void rbd_spec_free(struct kref *kref)
4718{
4719 struct rbd_spec *spec = container_of(kref, struct rbd_spec, kref);
4720
4721 kfree(spec->pool_name);
4722 kfree(spec->image_id);
4723 kfree(spec->image_name);
4724 kfree(spec->snap_name);
4725 kfree(spec);
4726}
4727
Ilya Dryomov1643dfa2016-08-12 15:45:52 +02004728static void rbd_dev_free(struct rbd_device *rbd_dev)
Ilya Dryomovdd5ac322015-10-16 17:09:24 +02004729{
Ilya Dryomov99d16942016-08-12 16:11:41 +02004730 WARN_ON(rbd_dev->watch_state != RBD_WATCH_STATE_UNREGISTERED);
Ilya Dryomoved95b212016-08-12 16:40:02 +02004731 WARN_ON(rbd_dev->lock_state != RBD_LOCK_STATE_UNLOCKED);
Ilya Dryomovdd5ac322015-10-16 17:09:24 +02004732
Ilya Dryomovc41d13a2016-04-29 20:01:25 +02004733 ceph_oid_destroy(&rbd_dev->header_oid);
Ilya Dryomov6b6dddb2016-08-05 16:15:38 +02004734 ceph_oloc_destroy(&rbd_dev->header_oloc);
Mike Christie0d6d1e9c2016-08-18 18:38:45 +02004735 kfree(rbd_dev->config_info);
Ilya Dryomovc41d13a2016-04-29 20:01:25 +02004736
Ilya Dryomovdd5ac322015-10-16 17:09:24 +02004737 rbd_put_client(rbd_dev->rbd_client);
4738 rbd_spec_put(rbd_dev->spec);
4739 kfree(rbd_dev->opts);
4740 kfree(rbd_dev);
Ilya Dryomov1643dfa2016-08-12 15:45:52 +02004741}
4742
4743static void rbd_dev_release(struct device *dev)
4744{
4745 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
4746 bool need_put = !!rbd_dev->opts;
4747
4748 if (need_put) {
4749 destroy_workqueue(rbd_dev->task_wq);
4750 ida_simple_remove(&rbd_dev_id_ida, rbd_dev->dev_id);
4751 }
4752
4753 rbd_dev_free(rbd_dev);
Ilya Dryomovdd5ac322015-10-16 17:09:24 +02004754
4755 /*
4756 * This is racy, but way better than putting the module ref outside of
4757 * the release callback. The race window is pretty small, so
4758 * doing something similar to dm (dm-builtin.c) is overkill.
4759 */
4760 if (need_put)
4761 module_put(THIS_MODULE);
4762}
4763
Ilya Dryomov1643dfa2016-08-12 15:45:52 +02004764static struct rbd_device *__rbd_dev_create(struct rbd_client *rbdc,
4765 struct rbd_spec *spec)
Alex Elderc53d5892012-10-25 23:34:42 -05004766{
4767 struct rbd_device *rbd_dev;
4768
Ilya Dryomov1643dfa2016-08-12 15:45:52 +02004769 rbd_dev = kzalloc(sizeof(*rbd_dev), GFP_KERNEL);
Alex Elderc53d5892012-10-25 23:34:42 -05004770 if (!rbd_dev)
4771 return NULL;
4772
4773 spin_lock_init(&rbd_dev->lock);
4774 INIT_LIST_HEAD(&rbd_dev->node);
Alex Elderc53d5892012-10-25 23:34:42 -05004775 init_rwsem(&rbd_dev->header_rwsem);
4776
Ilya Dryomov7e973322017-01-25 18:16:22 +01004777 rbd_dev->header.data_pool_id = CEPH_NOPOOL;
Ilya Dryomovc41d13a2016-04-29 20:01:25 +02004778 ceph_oid_init(&rbd_dev->header_oid);
Ilya Dryomov431a02c2017-01-25 18:16:21 +01004779 rbd_dev->header_oloc.pool = spec->pool_id;
Ilya Dryomovc41d13a2016-04-29 20:01:25 +02004780
Ilya Dryomov99d16942016-08-12 16:11:41 +02004781 mutex_init(&rbd_dev->watch_mutex);
4782 rbd_dev->watch_state = RBD_WATCH_STATE_UNREGISTERED;
4783 INIT_DELAYED_WORK(&rbd_dev->watch_dwork, rbd_reregister_watch);
4784
Ilya Dryomoved95b212016-08-12 16:40:02 +02004785 init_rwsem(&rbd_dev->lock_rwsem);
4786 rbd_dev->lock_state = RBD_LOCK_STATE_UNLOCKED;
4787 INIT_WORK(&rbd_dev->acquired_lock_work, rbd_notify_acquired_lock);
4788 INIT_WORK(&rbd_dev->released_lock_work, rbd_notify_released_lock);
4789 INIT_DELAYED_WORK(&rbd_dev->lock_dwork, rbd_acquire_lock);
4790 INIT_WORK(&rbd_dev->unlock_work, rbd_release_lock_work);
4791 init_waitqueue_head(&rbd_dev->lock_waitq);
4792
Ilya Dryomovdd5ac322015-10-16 17:09:24 +02004793 rbd_dev->dev.bus = &rbd_bus_type;
4794 rbd_dev->dev.type = &rbd_device_type;
4795 rbd_dev->dev.parent = &rbd_root_dev;
Ilya Dryomovdd5ac322015-10-16 17:09:24 +02004796 device_initialize(&rbd_dev->dev);
4797
Alex Elderc53d5892012-10-25 23:34:42 -05004798 rbd_dev->rbd_client = rbdc;
Ilya Dryomovd1475432015-06-22 13:24:48 +03004799 rbd_dev->spec = spec;
Alex Elder0903e872012-11-14 12:25:19 -06004800
Alex Elderc53d5892012-10-25 23:34:42 -05004801 return rbd_dev;
4802}
4803
Ilya Dryomov1643dfa2016-08-12 15:45:52 +02004804/*
4805 * Create a mapping rbd_dev.
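 * Beyond __rbd_dev_create(), this allocates the device id and the
 * per-device task workqueue and pins the module; rbd_dev_release()
 * undoes all of that.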
4806 */
4807static struct rbd_device *rbd_dev_create(struct rbd_client *rbdc,
4808 struct rbd_spec *spec,
4809 struct rbd_options *opts)
4810{
4811 struct rbd_device *rbd_dev;
4812
4813 rbd_dev = __rbd_dev_create(rbdc, spec);
4814 if (!rbd_dev)
4815 return NULL;
4816
4817 rbd_dev->opts = opts;
4818
4819 /* get an id and fill in device name */
4820 rbd_dev->dev_id = ida_simple_get(&rbd_dev_id_ida, 0,
4821 minor_to_rbd_dev_id(1 << MINORBITS),
4822 GFP_KERNEL);
4823 if (rbd_dev->dev_id < 0)
4824 goto fail_rbd_dev;
4825
4826 sprintf(rbd_dev->name, RBD_DRV_NAME "%d", rbd_dev->dev_id);
4827 rbd_dev->task_wq = alloc_ordered_workqueue("%s-tasks", WQ_MEM_RECLAIM,
4828 rbd_dev->name);
4829 if (!rbd_dev->task_wq)
4830 goto fail_dev_id;
4831
4832 /* we have a ref from do_rbd_add() */
4833 __module_get(THIS_MODULE);
4834
4835 dout("%s rbd_dev %p dev_id %d\n", __func__, rbd_dev, rbd_dev->dev_id);
4836 return rbd_dev;
4837
4838fail_dev_id:
4839 ida_simple_remove(&rbd_dev_id_ida, rbd_dev->dev_id);
4840fail_rbd_dev:
4841 rbd_dev_free(rbd_dev);
4842 return NULL;
4843}
4844
Alex Elderc53d5892012-10-25 23:34:42 -05004845static void rbd_dev_destroy(struct rbd_device *rbd_dev)
4846{
Ilya Dryomovdd5ac322015-10-16 17:09:24 +02004847 if (rbd_dev)
4848 put_device(&rbd_dev->dev);
Alex Elderc53d5892012-10-25 23:34:42 -05004849}
4850
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004851/*
Alex Elder9d475de2012-07-03 16:01:19 -05004852 * Get the size and object order for an image snapshot, or if
4853 * snap_id is CEPH_NOSNAP, gets this information for the base
4854 * image.
4855 */
4856static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id,
4857 u8 *order, u64 *snap_size)
4858{
4859 __le64 snapid = cpu_to_le64(snap_id);
4860 int ret;
4861 struct {
4862 u8 order;
4863 __le64 size;
4864 } __attribute__ ((packed)) size_buf = { 0 };
4865
Ilya Dryomovecd4a682017-01-25 18:16:21 +01004866 ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
4867 &rbd_dev->header_oloc, "get_size",
4868 &snapid, sizeof(snapid),
4869 &size_buf, sizeof(size_buf));
Alex Elder36be9a72013-01-19 00:30:28 -06004870 dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
Alex Elder9d475de2012-07-03 16:01:19 -05004871 if (ret < 0)
4872 return ret;
Alex Elder57385b52013-04-21 12:14:45 -05004873 if (ret < sizeof (size_buf))
4874 return -ERANGE;
Alex Elder9d475de2012-07-03 16:01:19 -05004875
Josh Durginc3545572013-08-28 17:08:10 -07004876 if (order) {
Alex Elderc86f86e2013-04-25 15:09:41 -05004877 *order = size_buf.order;
Josh Durginc3545572013-08-28 17:08:10 -07004878 dout(" order %u", (unsigned int)*order);
4879 }
Alex Elder9d475de2012-07-03 16:01:19 -05004880 *snap_size = le64_to_cpu(size_buf.size);
4881
Josh Durginc3545572013-08-28 17:08:10 -07004882 dout(" snap_id 0x%016llx snap_size = %llu\n",
4883 (unsigned long long)snap_id,
Alex Elder57385b52013-04-21 12:14:45 -05004884 (unsigned long long)*snap_size);
Alex Elder9d475de2012-07-03 16:01:19 -05004885
4886 return 0;
4887}
4888
4889static int rbd_dev_v2_image_size(struct rbd_device *rbd_dev)
4890{
4891 return _rbd_dev_v2_snap_size(rbd_dev, CEPH_NOSNAP,
4892 &rbd_dev->header.obj_order,
4893 &rbd_dev->header.image_size);
4894}
4895
Alex Elder1e130192012-07-03 16:01:19 -05004896static int rbd_dev_v2_object_prefix(struct rbd_device *rbd_dev)
4897{
4898 void *reply_buf;
4899 int ret;
4900 void *p;
4901
4902 reply_buf = kzalloc(RBD_OBJ_PREFIX_LEN_MAX, GFP_KERNEL);
4903 if (!reply_buf)
4904 return -ENOMEM;
4905
Ilya Dryomovecd4a682017-01-25 18:16:21 +01004906 ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
4907 &rbd_dev->header_oloc, "get_object_prefix",
4908 NULL, 0, reply_buf, RBD_OBJ_PREFIX_LEN_MAX);
Alex Elder36be9a72013-01-19 00:30:28 -06004909 dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
Alex Elder1e130192012-07-03 16:01:19 -05004910 if (ret < 0)
4911 goto out;
4912
4913 p = reply_buf;
4914 rbd_dev->header.object_prefix = ceph_extract_encoded_string(&p,
Alex Elder57385b52013-04-21 12:14:45 -05004915 p + ret, NULL, GFP_NOIO);
4916 ret = 0;
Alex Elder1e130192012-07-03 16:01:19 -05004917
4918 if (IS_ERR(rbd_dev->header.object_prefix)) {
4919 ret = PTR_ERR(rbd_dev->header.object_prefix);
4920 rbd_dev->header.object_prefix = NULL;
4921 } else {
4922 dout(" object_prefix = %s\n", rbd_dev->header.object_prefix);
4923 }
Alex Elder1e130192012-07-03 16:01:19 -05004924out:
4925 kfree(reply_buf);
4926
4927 return ret;
4928}
4929
Alex Elderb1b54022012-07-03 16:01:19 -05004930static int _rbd_dev_v2_snap_features(struct rbd_device *rbd_dev, u64 snap_id,
4931 u64 *snap_features)
4932{
4933 __le64 snapid = cpu_to_le64(snap_id);
4934 struct {
4935 __le64 features;
4936 __le64 incompat;
Alex Elder41579762013-04-21 12:14:45 -05004937 } __attribute__ ((packed)) features_buf = { 0 };
Ilya Dryomovd3767f02016-04-13 14:15:50 +02004938 u64 unsup;
Alex Elderb1b54022012-07-03 16:01:19 -05004939 int ret;
4940
Ilya Dryomovecd4a682017-01-25 18:16:21 +01004941 ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
4942 &rbd_dev->header_oloc, "get_features",
4943 &snapid, sizeof(snapid),
4944 &features_buf, sizeof(features_buf));
Alex Elder36be9a72013-01-19 00:30:28 -06004945 dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
Alex Elderb1b54022012-07-03 16:01:19 -05004946 if (ret < 0)
4947 return ret;
Alex Elder57385b52013-04-21 12:14:45 -05004948 if (ret < sizeof (features_buf))
4949 return -ERANGE;
Alex Elderd8891402012-10-09 13:50:17 -07004950
Ilya Dryomovd3767f02016-04-13 14:15:50 +02004951 unsup = le64_to_cpu(features_buf.incompat) & ~RBD_FEATURES_SUPPORTED;
4952 if (unsup) {
4953 rbd_warn(rbd_dev, "image uses unsupported features: 0x%llx",
4954 unsup);
Alex Elderb8f5c6e2012-11-01 08:39:26 -05004955 return -ENXIO;
Ilya Dryomovd3767f02016-04-13 14:15:50 +02004956 }
Alex Elderd8891402012-10-09 13:50:17 -07004957
Alex Elderb1b54022012-07-03 16:01:19 -05004958 *snap_features = le64_to_cpu(features_buf.features);
4959
4960 dout(" snap_id 0x%016llx features = 0x%016llx incompat = 0x%016llx\n",
Alex Elder57385b52013-04-21 12:14:45 -05004961 (unsigned long long)snap_id,
4962 (unsigned long long)*snap_features,
4963 (unsigned long long)le64_to_cpu(features_buf.incompat));
Alex Elderb1b54022012-07-03 16:01:19 -05004964
4965 return 0;
4966}
4967
4968static int rbd_dev_v2_features(struct rbd_device *rbd_dev)
4969{
4970 return _rbd_dev_v2_snap_features(rbd_dev, CEPH_NOSNAP,
4971 &rbd_dev->header.features);
4972}
4973
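/*
 * Fetch the parent spec and overlap for a v2 image with the
 * "get_parent" class method and record them in rbd_dev, handling the
 * case where a clone has been flattened since the last refresh.
 */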
Alex Elder86b00e02012-10-25 23:34:42 -05004974static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev)
4975{
4976 struct rbd_spec *parent_spec;
4977 size_t size;
4978 void *reply_buf = NULL;
4979 __le64 snapid;
4980 void *p;
4981 void *end;
Alex Elder642a2532013-05-06 17:40:33 -05004982 u64 pool_id;
Alex Elder86b00e02012-10-25 23:34:42 -05004983 char *image_id;
Alex Elder3b5cf2a2013-05-29 11:18:59 -05004984 u64 snap_id;
Alex Elder86b00e02012-10-25 23:34:42 -05004985 u64 overlap;
Alex Elder86b00e02012-10-25 23:34:42 -05004986 int ret;
4987
4988 parent_spec = rbd_spec_alloc();
4989 if (!parent_spec)
4990 return -ENOMEM;
4991
4992 size = sizeof (__le64) + /* pool_id */
4993 sizeof (__le32) + RBD_IMAGE_ID_LEN_MAX + /* image_id */
4994 sizeof (__le64) + /* snap_id */
4995 sizeof (__le64); /* overlap */
4996 reply_buf = kmalloc(size, GFP_KERNEL);
4997 if (!reply_buf) {
4998 ret = -ENOMEM;
4999 goto out_err;
5000 }
5001
Ilya Dryomov4d9b67c2014-07-24 10:42:13 +04005002 snapid = cpu_to_le64(rbd_dev->spec->snap_id);
Ilya Dryomovecd4a682017-01-25 18:16:21 +01005003 ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
5004 &rbd_dev->header_oloc, "get_parent",
5005 &snapid, sizeof(snapid), reply_buf, size);
Alex Elder36be9a72013-01-19 00:30:28 -06005006 dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
Alex Elder86b00e02012-10-25 23:34:42 -05005007 if (ret < 0)
5008 goto out_err;
5009
Alex Elder86b00e02012-10-25 23:34:42 -05005010 p = reply_buf;
Alex Elder57385b52013-04-21 12:14:45 -05005011 end = reply_buf + ret;
5012 ret = -ERANGE;
Alex Elder642a2532013-05-06 17:40:33 -05005013 ceph_decode_64_safe(&p, end, pool_id, out_err);
Alex Elder392a9da2013-05-06 17:40:33 -05005014 if (pool_id == CEPH_NOPOOL) {
5015 /*
5016 * Either the parent never existed, or we have
5017 * a record of it, but the image got flattened so it no
5018 * longer has a parent. When the parent of a
5019 * layered image disappears we immediately set the
5020 * overlap to 0. The effect of this is that all new
5021 * requests will be treated as if the image had no
5022 * parent.
5023 */
5024 if (rbd_dev->parent_overlap) {
5025 rbd_dev->parent_overlap = 0;
Alex Elder392a9da2013-05-06 17:40:33 -05005026 rbd_dev_parent_put(rbd_dev);
5027 pr_info("%s: clone image has been flattened\n",
5028 rbd_dev->disk->disk_name);
5029 }
5030
Alex Elder86b00e02012-10-25 23:34:42 -05005031 goto out; /* No parent? No problem. */
Alex Elder392a9da2013-05-06 17:40:33 -05005032 }
Alex Elder86b00e02012-10-25 23:34:42 -05005033
Alex Elder0903e872012-11-14 12:25:19 -06005034 /* The ceph file layout needs to fit pool id in 32 bits */
5035
5036 ret = -EIO;
Alex Elder642a2532013-05-06 17:40:33 -05005037 if (pool_id > (u64)U32_MAX) {
Ilya Dryomov9584d502014-07-11 12:11:20 +04005038 rbd_warn(NULL, "parent pool id too large (%llu > %u)",
Alex Elder642a2532013-05-06 17:40:33 -05005039 (unsigned long long)pool_id, U32_MAX);
Alex Elder57385b52013-04-21 12:14:45 -05005040 goto out_err;
Alex Elderc0cd10db2013-04-26 09:43:47 -05005041 }
Alex Elder0903e872012-11-14 12:25:19 -06005042
Alex Elder979ed482012-11-01 08:39:26 -05005043 image_id = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL);
Alex Elder86b00e02012-10-25 23:34:42 -05005044 if (IS_ERR(image_id)) {
5045 ret = PTR_ERR(image_id);
5046 goto out_err;
5047 }
Alex Elder3b5cf2a2013-05-29 11:18:59 -05005048 ceph_decode_64_safe(&p, end, snap_id, out_err);
Alex Elder86b00e02012-10-25 23:34:42 -05005049 ceph_decode_64_safe(&p, end, overlap, out_err);
5050
Alex Elder3b5cf2a2013-05-29 11:18:59 -05005051 /*
5052 * The parent won't change (except when the clone is
5053 * flattened, which is handled above). So we only need to
5054 * record the parent spec if we have not already done so.
5055 */
5056 if (!rbd_dev->parent_spec) {
5057 parent_spec->pool_id = pool_id;
5058 parent_spec->image_id = image_id;
5059 parent_spec->snap_id = snap_id;
Alex Elder70cf49c2013-05-06 17:40:33 -05005060 rbd_dev->parent_spec = parent_spec;
5061 parent_spec = NULL; /* rbd_dev now owns this */
Ilya Dryomovfbba11b2014-06-27 21:46:33 +04005062 } else {
5063 kfree(image_id);
Alex Elder3b5cf2a2013-05-29 11:18:59 -05005064 }
5065
5066 /*
Ilya Dryomovcf32bd92015-01-19 22:57:39 +03005067 * We always update the parent overlap. If it's zero we issue
5068 * a warning, as we will proceed as if there was no parent.
Alex Elder3b5cf2a2013-05-29 11:18:59 -05005069 */
Alex Elder3b5cf2a2013-05-29 11:18:59 -05005070 if (!overlap) {
Alex Elder3b5cf2a2013-05-29 11:18:59 -05005071 if (parent_spec) {
Ilya Dryomovcf32bd92015-01-19 22:57:39 +03005072 /* refresh, careful to warn just once */
5073 if (rbd_dev->parent_overlap)
5074 rbd_warn(rbd_dev,
5075 "clone now standalone (overlap became 0)");
Alex Elder3b5cf2a2013-05-29 11:18:59 -05005076 } else {
Ilya Dryomovcf32bd92015-01-19 22:57:39 +03005077 /* initial probe */
5078 rbd_warn(rbd_dev, "clone is standalone (overlap 0)");
Alex Elder3b5cf2a2013-05-29 11:18:59 -05005079 }
Alex Elder70cf49c2013-05-06 17:40:33 -05005080 }
Ilya Dryomovcf32bd92015-01-19 22:57:39 +03005081 rbd_dev->parent_overlap = overlap;
5082
Alex Elder86b00e02012-10-25 23:34:42 -05005083out:
5084 ret = 0;
5085out_err:
5086 kfree(reply_buf);
5087 rbd_spec_put(parent_spec);
5088
5089 return ret;
5090}
5091
Alex Eldercc070d52013-04-21 12:14:45 -05005092static int rbd_dev_v2_striping_info(struct rbd_device *rbd_dev)
5093{
5094 struct {
5095 __le64 stripe_unit;
5096 __le64 stripe_count;
5097 } __attribute__ ((packed)) striping_info_buf = { 0 };
5098 size_t size = sizeof (striping_info_buf);
5099 void *p;
5100 u64 obj_size;
5101 u64 stripe_unit;
5102 u64 stripe_count;
5103 int ret;
5104
Ilya Dryomovecd4a682017-01-25 18:16:21 +01005105 ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
5106 &rbd_dev->header_oloc, "get_stripe_unit_count",
5107 NULL, 0, &striping_info_buf, size);
Alex Eldercc070d52013-04-21 12:14:45 -05005108 dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
5109 if (ret < 0)
5110 return ret;
5111 if (ret < size)
5112 return -ERANGE;
5113
5114 /*
5115 * We don't actually support the "fancy striping" feature
5116 * (STRIPINGV2) yet, but if the striping sizes are the
5117 * defaults the behavior is the same as before. So find
5118 * out, and only fail if the image has non-default values.
5119 */
5120 ret = -EINVAL;
Ilya Dryomov5bc3fb12017-01-25 18:16:22 +01005121 obj_size = rbd_obj_bytes(&rbd_dev->header);
Alex Eldercc070d52013-04-21 12:14:45 -05005122 p = &striping_info_buf;
5123 stripe_unit = ceph_decode_64(&p);
5124 if (stripe_unit != obj_size) {
5125 rbd_warn(rbd_dev, "unsupported stripe unit "
5126 "(got %llu want %llu)",
5127 stripe_unit, obj_size);
5128 return -EINVAL;
5129 }
5130 stripe_count = ceph_decode_64(&p);
5131 if (stripe_count != 1) {
5132 rbd_warn(rbd_dev, "unsupported stripe count "
5133 "(got %llu want 1)", stripe_count);
5134 return -EINVAL;
5135 }
Alex Elder500d0c02013-04-26 09:43:47 -05005136 rbd_dev->header.stripe_unit = stripe_unit;
5137 rbd_dev->header.stripe_count = stripe_count;
Alex Eldercc070d52013-04-21 12:14:45 -05005138
5139 return 0;
5140}
5141
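/*
 * Fetch the id of the separate data pool, for images that have one
 * configured.
 */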
Ilya Dryomov7e973322017-01-25 18:16:22 +01005142static int rbd_dev_v2_data_pool(struct rbd_device *rbd_dev)
5143{
5144 __le64 data_pool_id;
5145 int ret;
5146
5147 ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
5148 &rbd_dev->header_oloc, "get_data_pool",
5149 NULL, 0, &data_pool_id, sizeof(data_pool_id));
5150 if (ret < 0)
5151 return ret;
5152 if (ret < sizeof(data_pool_id))
5153 return -EBADMSG;
5154
5155 rbd_dev->header.data_pool_id = le64_to_cpu(data_pool_id);
5156 WARN_ON(rbd_dev->header.data_pool_id == CEPH_NOPOOL);
5157 return 0;
5158}
5159
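/*
 * Look up the image name for rbd_dev's image id in the pool's
 * RBD_DIRECTORY object using the "dir_get_name" class method.  Returns
 * a dynamically allocated name, or NULL on any failure.
 */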
Alex Elder9e15b772012-10-30 19:40:33 -05005160static char *rbd_dev_image_name(struct rbd_device *rbd_dev)
5161{
Ilya Dryomovecd4a682017-01-25 18:16:21 +01005162 CEPH_DEFINE_OID_ONSTACK(oid);
Alex Elder9e15b772012-10-30 19:40:33 -05005163 size_t image_id_size;
5164 char *image_id;
5165 void *p;
5166 void *end;
5167 size_t size;
5168 void *reply_buf = NULL;
5169 size_t len = 0;
5170 char *image_name = NULL;
5171 int ret;
5172
5173 rbd_assert(!rbd_dev->spec->image_name);
5174
Alex Elder69e7a022012-11-01 08:39:26 -05005175 len = strlen(rbd_dev->spec->image_id);
5176 image_id_size = sizeof (__le32) + len;
Alex Elder9e15b772012-10-30 19:40:33 -05005177 image_id = kmalloc(image_id_size, GFP_KERNEL);
5178 if (!image_id)
5179 return NULL;
5180
5181 p = image_id;
Alex Elder41579762013-04-21 12:14:45 -05005182 end = image_id + image_id_size;
Alex Elder57385b52013-04-21 12:14:45 -05005183 ceph_encode_string(&p, end, rbd_dev->spec->image_id, (u32)len);
Alex Elder9e15b772012-10-30 19:40:33 -05005184
5185 size = sizeof (__le32) + RBD_IMAGE_NAME_LEN_MAX;
5186 reply_buf = kmalloc(size, GFP_KERNEL);
5187 if (!reply_buf)
5188 goto out;
5189
Ilya Dryomovecd4a682017-01-25 18:16:21 +01005190 ceph_oid_printf(&oid, "%s", RBD_DIRECTORY);
5191 ret = rbd_obj_method_sync(rbd_dev, &oid, &rbd_dev->header_oloc,
5192 "dir_get_name", image_id, image_id_size,
5193 reply_buf, size);
Alex Elder9e15b772012-10-30 19:40:33 -05005194 if (ret < 0)
5195 goto out;
5196 p = reply_buf;
Alex Elderf40eb342013-04-25 15:09:42 -05005197 end = reply_buf + ret;
5198
Alex Elder9e15b772012-10-30 19:40:33 -05005199 image_name = ceph_extract_encoded_string(&p, end, &len, GFP_KERNEL);
5200 if (IS_ERR(image_name))
5201 image_name = NULL;
5202 else
5203 dout("%s: name is %s len is %zd\n", __func__, image_name, len);
5204out:
5205 kfree(reply_buf);
5206 kfree(image_id);
5207
5208 return image_name;
5209}
5210
Alex Elder2ad3d712013-04-30 00:44:33 -05005211static u64 rbd_v1_snap_id_by_name(struct rbd_device *rbd_dev, const char *name)
5212{
5213 struct ceph_snap_context *snapc = rbd_dev->header.snapc;
5214 const char *snap_name;
5215 u32 which = 0;
5216
5217 /* Skip over names until we find the one we are looking for */
5218
5219 snap_name = rbd_dev->header.snap_names;
5220 while (which < snapc->num_snaps) {
5221 if (!strcmp(name, snap_name))
5222 return snapc->snaps[which];
5223 snap_name += strlen(snap_name) + 1;
5224 which++;
5225 }
5226 return CEPH_NOSNAP;
5227}
5228
5229static u64 rbd_v2_snap_id_by_name(struct rbd_device *rbd_dev, const char *name)
5230{
5231 struct ceph_snap_context *snapc = rbd_dev->header.snapc;
5232 u32 which;
5233 bool found = false;
5234 u64 snap_id;
5235
5236 for (which = 0; !found && which < snapc->num_snaps; which++) {
5237 const char *snap_name;
5238
5239 snap_id = snapc->snaps[which];
5240 snap_name = rbd_dev_v2_snap_name(rbd_dev, snap_id);
Josh Durginefadc982013-08-29 19:16:42 -07005241 if (IS_ERR(snap_name)) {
5242 /* ignore no-longer existing snapshots */
5243 if (PTR_ERR(snap_name) == -ENOENT)
5244 continue;
5245 else
5246 break;
5247 }
Alex Elder2ad3d712013-04-30 00:44:33 -05005248 found = !strcmp(name, snap_name);
5249 kfree(snap_name);
5250 }
5251 return found ? snap_id : CEPH_NOSNAP;
5252}
5253
5254/*
5255 * Assumes name is never RBD_SNAP_HEAD_NAME; returns CEPH_NOSNAP if
5256 * no snapshot by that name is found, or if an error occurs.
5257 */
5258static u64 rbd_snap_id_by_name(struct rbd_device *rbd_dev, const char *name)
5259{
5260 if (rbd_dev->image_format == 1)
5261 return rbd_v1_snap_id_by_name(rbd_dev, name);
5262
5263 return rbd_v2_snap_id_by_name(rbd_dev, name);
5264}
5265
Alex Elder9e15b772012-10-30 19:40:33 -05005266/*
Ilya Dryomov04077592014-07-23 17:11:20 +04005267 * An image being mapped will have everything but the snap id.
Alex Elder9e15b772012-10-30 19:40:33 -05005268 */
Ilya Dryomov04077592014-07-23 17:11:20 +04005269static int rbd_spec_fill_snap_id(struct rbd_device *rbd_dev)
5270{
5271 struct rbd_spec *spec = rbd_dev->spec;
5272
5273 rbd_assert(spec->pool_id != CEPH_NOPOOL && spec->pool_name);
5274 rbd_assert(spec->image_id && spec->image_name);
5275 rbd_assert(spec->snap_name);
5276
5277 if (strcmp(spec->snap_name, RBD_SNAP_HEAD_NAME)) {
5278 u64 snap_id;
5279
5280 snap_id = rbd_snap_id_by_name(rbd_dev, spec->snap_name);
5281 if (snap_id == CEPH_NOSNAP)
5282 return -ENOENT;
5283
5284 spec->snap_id = snap_id;
5285 } else {
5286 spec->snap_id = CEPH_NOSNAP;
5287 }
5288
5289 return 0;
5290}
5291
5292/*
5293 * A parent image will have all ids but none of the names.
5294 *
5295 * All names in an rbd spec are dynamically allocated. It's OK if we
5296 * can't figure out the name for an image id.
5297 */
5298static int rbd_spec_fill_names(struct rbd_device *rbd_dev)
Alex Elder9e15b772012-10-30 19:40:33 -05005299{
Alex Elder2e9f7f12013-04-26 09:43:48 -05005300 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
5301 struct rbd_spec *spec = rbd_dev->spec;
5302 const char *pool_name;
5303 const char *image_name;
5304 const char *snap_name;
Alex Elder9e15b772012-10-30 19:40:33 -05005305 int ret;
5306
Ilya Dryomov04077592014-07-23 17:11:20 +04005307 rbd_assert(spec->pool_id != CEPH_NOPOOL);
5308 rbd_assert(spec->image_id);
5309 rbd_assert(spec->snap_id != CEPH_NOSNAP);
Alex Elder9e15b772012-10-30 19:40:33 -05005310
Alex Elder2e9f7f12013-04-26 09:43:48 -05005311 /* Get the pool name; we have to make our own copy of this */
Alex Elder9e15b772012-10-30 19:40:33 -05005312
Alex Elder2e9f7f12013-04-26 09:43:48 -05005313 pool_name = ceph_pg_pool_name_by_id(osdc->osdmap, spec->pool_id);
5314 if (!pool_name) {
5315 rbd_warn(rbd_dev, "no pool with id %llu", spec->pool_id);
Alex Elder935dc892012-11-01 10:17:15 -05005316 return -EIO;
5317 }
Alex Elder2e9f7f12013-04-26 09:43:48 -05005318 pool_name = kstrdup(pool_name, GFP_KERNEL);
5319 if (!pool_name)
Alex Elder9e15b772012-10-30 19:40:33 -05005320 return -ENOMEM;
5321
5322 /* Fetch the image name; tolerate failure here */
5323
Alex Elder2e9f7f12013-04-26 09:43:48 -05005324 image_name = rbd_dev_image_name(rbd_dev);
5325 if (!image_name)
Alex Elder06ecc6c2012-11-01 10:17:15 -05005326 rbd_warn(rbd_dev, "unable to get image name");
Alex Elder9e15b772012-10-30 19:40:33 -05005327
Ilya Dryomov04077592014-07-23 17:11:20 +04005328 /* Fetch the snapshot name */
Alex Elder9e15b772012-10-30 19:40:33 -05005329
Alex Elder2e9f7f12013-04-26 09:43:48 -05005330 snap_name = rbd_snap_name(rbd_dev, spec->snap_id);
Josh Durginda6a6b62013-09-04 17:57:31 -07005331 if (IS_ERR(snap_name)) {
5332 ret = PTR_ERR(snap_name);
Alex Elder9e15b772012-10-30 19:40:33 -05005333 goto out_err;
Alex Elder2e9f7f12013-04-26 09:43:48 -05005334 }
5335
5336 spec->pool_name = pool_name;
5337 spec->image_name = image_name;
5338 spec->snap_name = snap_name;
Alex Elder9e15b772012-10-30 19:40:33 -05005339
5340 return 0;
Ilya Dryomov04077592014-07-23 17:11:20 +04005341
Alex Elder9e15b772012-10-30 19:40:33 -05005342out_err:
Alex Elder2e9f7f12013-04-26 09:43:48 -05005343 kfree(image_name);
5344 kfree(pool_name);
Alex Elder9e15b772012-10-30 19:40:33 -05005345 return ret;
5346}
5347
Alex Eldercc4a38bd2013-04-30 00:44:33 -05005348static int rbd_dev_v2_snap_context(struct rbd_device *rbd_dev)
Alex Elder35d489f2012-07-03 16:01:19 -05005349{
5350 size_t size;
5351 int ret;
5352 void *reply_buf;
5353 void *p;
5354 void *end;
5355 u64 seq;
5356 u32 snap_count;
5357 struct ceph_snap_context *snapc;
5358 u32 i;
5359
5360 /*
5361 * We'll need room for the seq value (maximum snapshot id),
5362 * snapshot count, and array of that many snapshot ids.
5363 * For now we have a fixed upper limit on the number we're
5364 * prepared to receive.
5365 */
5366 size = sizeof (__le64) + sizeof (__le32) +
5367 RBD_MAX_SNAP_COUNT * sizeof (__le64);
5368 reply_buf = kzalloc(size, GFP_KERNEL);
5369 if (!reply_buf)
5370 return -ENOMEM;
5371
Ilya Dryomovecd4a682017-01-25 18:16:21 +01005372 ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
5373 &rbd_dev->header_oloc, "get_snapcontext",
5374 NULL, 0, reply_buf, size);
Alex Elder36be9a72013-01-19 00:30:28 -06005375 dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
Alex Elder35d489f2012-07-03 16:01:19 -05005376 if (ret < 0)
5377 goto out;
5378
Alex Elder35d489f2012-07-03 16:01:19 -05005379 p = reply_buf;
Alex Elder57385b52013-04-21 12:14:45 -05005380 end = reply_buf + ret;
5381 ret = -ERANGE;
Alex Elder35d489f2012-07-03 16:01:19 -05005382 ceph_decode_64_safe(&p, end, seq, out);
5383 ceph_decode_32_safe(&p, end, snap_count, out);
5384
5385 /*
5386 * Make sure the reported number of snapshot ids wouldn't go
5387 * beyond the end of our buffer. But before checking that,
5388 * make sure the computed size of the snapshot context we
5389 * allocate is representable in a size_t.
5390 */
5391 if (snap_count > (SIZE_MAX - sizeof (struct ceph_snap_context))
5392 / sizeof (u64)) {
5393 ret = -EINVAL;
5394 goto out;
5395 }
5396 if (!ceph_has_room(&p, end, snap_count * sizeof (__le64)))
5397 goto out;
Alex Elder468521c2013-04-26 09:43:47 -05005398 ret = 0;
Alex Elder35d489f2012-07-03 16:01:19 -05005399
Alex Elder812164f82013-04-30 00:44:32 -05005400 snapc = ceph_create_snap_context(snap_count, GFP_KERNEL);
Alex Elder35d489f2012-07-03 16:01:19 -05005401 if (!snapc) {
5402 ret = -ENOMEM;
5403 goto out;
5404 }
Alex Elder35d489f2012-07-03 16:01:19 -05005405 snapc->seq = seq;
Alex Elder35d489f2012-07-03 16:01:19 -05005406 for (i = 0; i < snap_count; i++)
5407 snapc->snaps[i] = ceph_decode_64(&p);
5408
Alex Elder49ece552013-05-06 08:37:00 -05005409 ceph_put_snap_context(rbd_dev->header.snapc);
Alex Elder35d489f2012-07-03 16:01:19 -05005410 rbd_dev->header.snapc = snapc;
5411
5412 dout(" snap context seq = %llu, snap_count = %u\n",
Alex Elder57385b52013-04-21 12:14:45 -05005413 (unsigned long long)seq, (unsigned int)snap_count);
Alex Elder35d489f2012-07-03 16:01:19 -05005414out:
5415 kfree(reply_buf);
5416
Alex Elder57385b52013-04-21 12:14:45 -05005417 return ret;
Alex Elder35d489f2012-07-03 16:01:19 -05005418}
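/*
 * Illustration (added): the "get_snapcontext" reply decoded above
 * is laid out as
 *
 *	__le64 seq			highest snapshot id seen
 *	__le32 snap_count
 *	__le64 snaps[snap_count]
 *
 * so an image with snapshot ids 4 and 7 might answer with
 * seq = 7, snap_count = 2, snaps = { 7, 4 }.
 */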
5419
Alex Elder54cac612013-04-30 00:44:33 -05005420static const char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev,
5421 u64 snap_id)
Alex Elderb8b1e2d2012-07-03 16:01:19 -05005422{
5423 size_t size;
5424 void *reply_buf;
Alex Elder54cac612013-04-30 00:44:33 -05005425 __le64 snapid;
Alex Elderb8b1e2d2012-07-03 16:01:19 -05005426 int ret;
5427 void *p;
5428 void *end;
Alex Elderb8b1e2d2012-07-03 16:01:19 -05005429 char *snap_name;
5430
5431 size = sizeof (__le32) + RBD_MAX_SNAP_NAME_LEN;
5432 reply_buf = kmalloc(size, GFP_KERNEL);
5433 if (!reply_buf)
5434 return ERR_PTR(-ENOMEM);
5435
Alex Elder54cac612013-04-30 00:44:33 -05005436 snapid = cpu_to_le64(snap_id);
Ilya Dryomovecd4a682017-01-25 18:16:21 +01005437 ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
5438 &rbd_dev->header_oloc, "get_snapshot_name",
5439 &snapid, sizeof(snapid), reply_buf, size);
Alex Elder36be9a72013-01-19 00:30:28 -06005440 dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
Alex Elderf40eb342013-04-25 15:09:42 -05005441 if (ret < 0) {
5442 snap_name = ERR_PTR(ret);
Alex Elderb8b1e2d2012-07-03 16:01:19 -05005443 goto out;
Alex Elderf40eb342013-04-25 15:09:42 -05005444 }
Alex Elderb8b1e2d2012-07-03 16:01:19 -05005445
5446 p = reply_buf;
Alex Elderf40eb342013-04-25 15:09:42 -05005447 end = reply_buf + ret;
Alex Eldere5c35532012-10-25 23:34:41 -05005448 snap_name = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL);
Alex Elderf40eb342013-04-25 15:09:42 -05005449 if (IS_ERR(snap_name))
Alex Elderb8b1e2d2012-07-03 16:01:19 -05005450 goto out;
Alex Elderb8b1e2d2012-07-03 16:01:19 -05005451
Alex Elderf40eb342013-04-25 15:09:42 -05005452 dout(" snap_id 0x%016llx snap_name = %s\n",
Alex Elder54cac612013-04-30 00:44:33 -05005453 (unsigned long long)snap_id, snap_name);
Alex Elderb8b1e2d2012-07-03 16:01:19 -05005454out:
5455 kfree(reply_buf);
5456
Alex Elderf40eb342013-04-25 15:09:42 -05005457 return snap_name;
Alex Elderb8b1e2d2012-07-03 16:01:19 -05005458}
5459
Alex Elder2df3fac2013-05-06 09:51:30 -05005460static int rbd_dev_v2_header_info(struct rbd_device *rbd_dev)
Alex Elder117973f2012-08-31 17:29:55 -05005461{
Alex Elder2df3fac2013-05-06 09:51:30 -05005462 bool first_time = rbd_dev->header.object_prefix == NULL;
Alex Elder117973f2012-08-31 17:29:55 -05005463 int ret;
Alex Elder117973f2012-08-31 17:29:55 -05005464
Josh Durgin1617e402013-06-12 14:43:10 -07005465 ret = rbd_dev_v2_image_size(rbd_dev);
5466 if (ret)
Alex Eldercfbf6372013-05-31 17:40:45 -05005467 return ret;
Josh Durgin1617e402013-06-12 14:43:10 -07005468
Alex Elder2df3fac2013-05-06 09:51:30 -05005469 if (first_time) {
5470 ret = rbd_dev_v2_header_onetime(rbd_dev);
5471 if (ret)
Alex Eldercfbf6372013-05-31 17:40:45 -05005472 return ret;
Alex Elder2df3fac2013-05-06 09:51:30 -05005473 }
5474
Alex Eldercc4a38bd2013-04-30 00:44:33 -05005475 ret = rbd_dev_v2_snap_context(rbd_dev);
Ilya Dryomovd194cd12015-08-31 18:22:10 +03005476 if (ret && first_time) {
5477 kfree(rbd_dev->header.object_prefix);
5478 rbd_dev->header.object_prefix = NULL;
5479 }
Alex Elder117973f2012-08-31 17:29:55 -05005480
5481 return ret;
5482}
5483
Ilya Dryomova720ae02014-07-23 17:11:19 +04005484static int rbd_dev_header_info(struct rbd_device *rbd_dev)
5485{
5486 rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
5487
5488 if (rbd_dev->image_format == 1)
5489 return rbd_dev_v1_header_info(rbd_dev);
5490
5491 return rbd_dev_v2_header_info(rbd_dev);
5492}
5493
Alex Elder1ddbe942012-01-29 13:57:44 -06005494/*
Alex Eldere28fff262012-02-02 08:13:30 -06005495 * Skips over white space at *buf, and updates *buf to point to the
5496 * first found non-space character (if any). Returns the length of
Alex Elder593a9e72012-02-07 12:03:37 -06005497 * the token (string of non-white space characters) found. Note
5498 * that *buf must be terminated with '\0'.
Alex Eldere28fff262012-02-02 08:13:30 -06005499 */
5500static inline size_t next_token(const char **buf)
5501{
5502 /*
5503 * These are the characters that produce nonzero for
5504 * isspace() in the "C" and "POSIX" locales.
5505 */
5506 const char *spaces = " \f\n\r\t\v";
5507
5508 *buf += strspn(*buf, spaces); /* Find start of token */
5509
5510 return strcspn(*buf, spaces); /* Return token length */
5511}
5512
5513/*
Alex Elderea3352f2012-07-09 21:04:23 -05005514 * Finds the next token in *buf, dynamically allocates a buffer big
5515 * enough to hold a copy of it, and copies the token into the new
5516 * buffer. The copy is guaranteed to be terminated with '\0'. Note
5517 * that a duplicate buffer is created even for a zero-length token.
5518 *
5519 * Returns a pointer to the newly-allocated duplicate, or a null
5520 * pointer if memory for the duplicate was not available. If
5521 * the lenp argument is a non-null pointer, the length of the token
5522 * (not including the '\0') is returned in *lenp.
5523 *
5524 * If successful, the *buf pointer will be updated to point beyond
5525 * the end of the found token.
5526 *
5527 * Note: uses GFP_KERNEL for allocation.
5528 */
5529static inline char *dup_token(const char **buf, size_t *lenp)
5530{
5531 char *dup;
5532 size_t len;
5533
5534 len = next_token(buf);
Alex Elder4caf35f2012-11-01 08:39:27 -05005535 dup = kmemdup(*buf, len + 1, GFP_KERNEL);
Alex Elderea3352f2012-07-09 21:04:23 -05005536 if (!dup)
5537 return NULL;
Alex Elderea3352f2012-07-09 21:04:23 -05005538 *(dup + len) = '\0';
5539 *buf += len;
5540
5541 if (lenp)
5542 *lenp = len;
5543
5544 return dup;
5545}
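/*
 * Worked example (added): with buf pointing at "  rbd foo\n",
 * next_token() advances buf to "rbd foo\n" and returns 3, while
 * dup_token() additionally returns a kmalloc'd copy "rbd" and
 * leaves buf pointing at " foo\n".
 */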
5546
5547/*
Alex Elder859c31d2012-10-25 23:34:42 -05005548 * Parse the options provided for an "rbd add" (i.e., rbd image
5549 * mapping) request. These arrive via a write to /sys/bus/rbd/add,
5550 * and the data written is passed here via a NUL-terminated buffer.
5551 * Returns 0 if successful or an error code otherwise.
Alex Elderd22f76e2012-07-12 10:46:35 -05005552 *
Alex Elder859c31d2012-10-25 23:34:42 -05005553 * The information extracted from these options is recorded in
5554 * the other parameters which return dynamically-allocated
5555 * structures:
5556 * ceph_opts
5557 * The address of a pointer that will refer to a ceph options
5558 * structure. Caller must release the returned pointer using
5559 * ceph_destroy_options() when it is no longer needed.
5560 * rbd_opts
5561 * Address of an rbd options pointer. Fully initialized by
5562 * this function; caller must release with kfree().
5563 * spec
5564 * Address of an rbd image specification pointer. Fully
5565 * initialized by this function based on parsed options.
5566 * Caller must release with rbd_spec_put().
5567 *
5568 * The options passed take this form:
5569 * <mon_addrs> <options> <pool_name> <image_name> [<snap_name>]
5570 * where:
5571 * <mon_addrs>
5572 * A comma-separated list of one or more monitor addresses.
5573 * A monitor address is an ip address, optionally followed
5574 * by a port number (separated by a colon).
5575 * I.e.: ip1[:port1][,ip2[:port2]...]
5576 * <options>
5577 * A comma-separated list of ceph and/or rbd options.
5578 * <pool_name>
5579 * The name of the rados pool containing the rbd image.
5580 * <image_name>
5581 * The name of the image in that pool to map.
5582 * <snap_name>
5583 * An optional snapshot name. If provided, the mapping will
5584 * present data from the image at the time that snapshot was
5585 * created. The image head is used if no snapshot name is
5586 * provided. Snapshot mappings are always read-only.
Alex Eldera725f65e2012-02-02 08:13:30 -06005587 */
Alex Elder859c31d2012-10-25 23:34:42 -05005588static int rbd_add_parse_args(const char *buf,
Alex Elderdc79b112012-10-25 23:34:41 -05005589 struct ceph_options **ceph_opts,
Alex Elder859c31d2012-10-25 23:34:42 -05005590 struct rbd_options **opts,
5591 struct rbd_spec **rbd_spec)
Alex Eldera725f65e2012-02-02 08:13:30 -06005592{
Alex Elderd22f76e2012-07-12 10:46:35 -05005593 size_t len;
Alex Elder859c31d2012-10-25 23:34:42 -05005594 char *options;
Alex Elder0ddebc02012-10-25 23:34:41 -05005595 const char *mon_addrs;
Alex Elderecb4dc222013-04-26 09:43:47 -05005596 char *snap_name;
Alex Elder0ddebc02012-10-25 23:34:41 -05005597 size_t mon_addrs_size;
Alex Elder859c31d2012-10-25 23:34:42 -05005598 struct rbd_spec *spec = NULL;
Alex Elder4e9afeb2012-10-25 23:34:41 -05005599 struct rbd_options *rbd_opts = NULL;
Alex Elder859c31d2012-10-25 23:34:42 -05005600 struct ceph_options *copts;
Alex Elderdc79b112012-10-25 23:34:41 -05005601 int ret;
Alex Eldere28fff262012-02-02 08:13:30 -06005602
5603 /* The first four tokens are required */
5604
Alex Elder7ef32142012-02-02 08:13:30 -06005605 len = next_token(&buf);
Alex Elder4fb5d6712012-11-01 10:17:15 -05005606 if (!len) {
5607 rbd_warn(NULL, "no monitor address(es) provided");
5608 return -EINVAL;
5609 }
Alex Elder0ddebc02012-10-25 23:34:41 -05005610 mon_addrs = buf;
Alex Elderf28e5652012-10-25 23:34:41 -05005611 mon_addrs_size = len + 1;
Alex Elder7ef32142012-02-02 08:13:30 -06005612 buf += len;
Alex Eldera725f65e2012-02-02 08:13:30 -06005613
Alex Elderdc79b112012-10-25 23:34:41 -05005614 ret = -EINVAL;
Alex Elderf28e5652012-10-25 23:34:41 -05005615 options = dup_token(&buf, NULL);
5616 if (!options)
Alex Elderdc79b112012-10-25 23:34:41 -05005617 return -ENOMEM;
Alex Elder4fb5d6712012-11-01 10:17:15 -05005618 if (!*options) {
5619 rbd_warn(NULL, "no options provided");
5620 goto out_err;
5621 }
Alex Eldera725f65e2012-02-02 08:13:30 -06005622
Alex Elder859c31d2012-10-25 23:34:42 -05005623 spec = rbd_spec_alloc();
5624 if (!spec)
Alex Elderf28e5652012-10-25 23:34:41 -05005625 goto out_mem;
Alex Elder859c31d2012-10-25 23:34:42 -05005626
5627 spec->pool_name = dup_token(&buf, NULL);
5628 if (!spec->pool_name)
5629 goto out_mem;
Alex Elder4fb5d6712012-11-01 10:17:15 -05005630 if (!*spec->pool_name) {
5631 rbd_warn(NULL, "no pool name provided");
5632 goto out_err;
5633 }
Alex Eldere28fff262012-02-02 08:13:30 -06005634
Alex Elder69e7a022012-11-01 08:39:26 -05005635 spec->image_name = dup_token(&buf, NULL);
Alex Elder859c31d2012-10-25 23:34:42 -05005636 if (!spec->image_name)
Alex Elderf28e5652012-10-25 23:34:41 -05005637 goto out_mem;
Alex Elder4fb5d6712012-11-01 10:17:15 -05005638 if (!*spec->image_name) {
5639 rbd_warn(NULL, "no image name provided");
5640 goto out_err;
5641 }
Alex Eldere28fff262012-02-02 08:13:30 -06005642
Alex Elderf28e5652012-10-25 23:34:41 -05005643 /*
5644 * Snapshot name is optional; default is to use "-"
5645 * (indicating the head/no snapshot).
5646 */
Alex Elder3feeb8942012-08-31 17:29:52 -05005647 len = next_token(&buf);
Alex Elder820a5f32012-07-09 21:04:24 -05005648 if (!len) {
Alex Elder3feeb8942012-08-31 17:29:52 -05005649 buf = RBD_SNAP_HEAD_NAME; /* No snapshot supplied */
5650 len = sizeof (RBD_SNAP_HEAD_NAME) - 1;
Alex Elderf28e5652012-10-25 23:34:41 -05005651 } else if (len > RBD_MAX_SNAP_NAME_LEN) {
Alex Elderdc79b112012-10-25 23:34:41 -05005652 ret = -ENAMETOOLONG;
Alex Elderf28e5652012-10-25 23:34:41 -05005653 goto out_err;
Alex Elder849b4262012-07-09 21:04:24 -05005654 }
Alex Elderecb4dc222013-04-26 09:43:47 -05005655 snap_name = kmemdup(buf, len + 1, GFP_KERNEL);
5656 if (!snap_name)
Alex Elderf28e5652012-10-25 23:34:41 -05005657 goto out_mem;
Alex Elderecb4dc222013-04-26 09:43:47 -05005658 *(snap_name + len) = '\0';
5659 spec->snap_name = snap_name;
Alex Eldere5c35532012-10-25 23:34:41 -05005660
Alex Elder0ddebc02012-10-25 23:34:41 -05005661 /* Initialize all rbd options to the defaults */
Alex Eldere28fff262012-02-02 08:13:30 -06005662
Alex Elder4e9afeb2012-10-25 23:34:41 -05005663 rbd_opts = kzalloc(sizeof (*rbd_opts), GFP_KERNEL);
5664 if (!rbd_opts)
5665 goto out_mem;
5666
5667 rbd_opts->read_only = RBD_READ_ONLY_DEFAULT;
Ilya Dryomovb5584182015-06-23 16:21:19 +03005668 rbd_opts->queue_depth = RBD_QUEUE_DEPTH_DEFAULT;
Ilya Dryomov80de1912016-09-20 14:23:17 +02005669 rbd_opts->lock_on_read = RBD_LOCK_ON_READ_DEFAULT;
Alex Elderd22f76e2012-07-12 10:46:35 -05005670
Alex Elder859c31d2012-10-25 23:34:42 -05005671 copts = ceph_parse_options(options, mon_addrs,
Alex Elder0ddebc02012-10-25 23:34:41 -05005672 mon_addrs + mon_addrs_size - 1,
Alex Elder4e9afeb2012-10-25 23:34:41 -05005673 parse_rbd_opts_token, rbd_opts);
Alex Elder859c31d2012-10-25 23:34:42 -05005674 if (IS_ERR(copts)) {
5675 ret = PTR_ERR(copts);
Alex Elderdc79b112012-10-25 23:34:41 -05005676 goto out_err;
5677 }
Alex Elder859c31d2012-10-25 23:34:42 -05005678 kfree(options);
5679
5680 *ceph_opts = copts;
Alex Elder4e9afeb2012-10-25 23:34:41 -05005681 *opts = rbd_opts;
Alex Elder859c31d2012-10-25 23:34:42 -05005682 *rbd_spec = spec;
Alex Elder0ddebc02012-10-25 23:34:41 -05005683
Alex Elderdc79b112012-10-25 23:34:41 -05005684 return 0;
Alex Elderf28e5652012-10-25 23:34:41 -05005685out_mem:
Alex Elderdc79b112012-10-25 23:34:41 -05005686 ret = -ENOMEM;
Alex Elderd22f76e2012-07-12 10:46:35 -05005687out_err:
Alex Elder859c31d2012-10-25 23:34:42 -05005688 kfree(rbd_opts);
5689 rbd_spec_put(spec);
Alex Elderf28e5652012-10-25 23:34:41 -05005690 kfree(options);
Alex Elderd22f76e2012-07-12 10:46:35 -05005691
Alex Elderdc79b112012-10-25 23:34:41 -05005692 return ret;
Alex Eldera725f65e2012-02-02 08:13:30 -06005693}
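/*
 * Example (added; the monitor address, key and names are made up):
 * an add string accepted by the parser above, as written to
 * /sys/bus/rbd/add, looks like
 *
 *	1.2.3.4:6789 name=admin,secret=AQBzzzzz rbd myimage mysnap
 *
 * which maps snapshot "mysnap" of image "myimage" in pool "rbd"
 * (read-only, as for any snapshot); leaving the last token off, or
 * passing "-", maps the image head.
 */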
5694
Alex Elder589d30e2012-07-10 20:30:11 -05005695/*
Ilya Dryomov30ba1f02014-05-13 11:19:27 +04005696 * Return pool id (>= 0) or a negative error code.
5697 */
5698static int rbd_add_get_pool_id(struct rbd_client *rbdc, const char *pool_name)
5699{
Ilya Dryomova319bf52015-05-15 12:02:17 +03005700 struct ceph_options *opts = rbdc->client->options;
Ilya Dryomov30ba1f02014-05-13 11:19:27 +04005701 u64 newest_epoch;
Ilya Dryomov30ba1f02014-05-13 11:19:27 +04005702 int tries = 0;
5703 int ret;
5704
5705again:
5706 ret = ceph_pg_poolid_by_name(rbdc->client->osdc.osdmap, pool_name);
5707 if (ret == -ENOENT && tries++ < 1) {
Ilya Dryomovd0b19702016-04-28 16:07:27 +02005708 ret = ceph_monc_get_version(&rbdc->client->monc, "osdmap",
5709 &newest_epoch);
Ilya Dryomov30ba1f02014-05-13 11:19:27 +04005710 if (ret < 0)
5711 return ret;
5712
5713 if (rbdc->client->osdc.osdmap->epoch < newest_epoch) {
Ilya Dryomov7cca78c2016-04-28 16:07:28 +02005714 ceph_osdc_maybe_request_map(&rbdc->client->osdc);
Ilya Dryomov30ba1f02014-05-13 11:19:27 +04005715 (void) ceph_monc_wait_osdmap(&rbdc->client->monc,
Ilya Dryomova319bf52015-05-15 12:02:17 +03005716 newest_epoch,
5717 opts->mount_timeout);
Ilya Dryomov30ba1f02014-05-13 11:19:27 +04005718 goto again;
5719 } else {
5720 /* the osdmap we have is new enough */
5721 return -ENOENT;
5722 }
5723 }
5724
5725 return ret;
5726}
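/*
 * Illustration (added): if a pool was created after our osdmap was
 * last refreshed, the first ceph_pg_poolid_by_name() lookup above
 * can return -ENOENT against the stale map; the single retry asks
 * the monitor for the newest map epoch, waits for that map, and
 * repeats the lookup once before reporting -ENOENT for real.
 */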
5727
5728/*
Alex Elder589d30e2012-07-10 20:30:11 -05005729 * An rbd format 2 image has a unique identifier, distinct from the
5730 * name given to it by the user. Internally, that identifier is
5731 * what's used to specify the names of objects related to the image.
5732 *
5733 * A special "rbd id" object is used to map an rbd image name to its
5734 * id. If that object doesn't exist, then there is no v2 rbd image
5735 * with the supplied name.
5736 *
5737 * This function will record the given rbd_dev's image_id field if
5738 * it can be determined, and in that case will return 0. If any
5739 * errors occur a negative errno will be returned and the rbd_dev's
5740 * image_id field will be unchanged (and should be NULL).
5741 */
5742static int rbd_dev_image_id(struct rbd_device *rbd_dev)
5743{
5744 int ret;
5745 size_t size;
Ilya Dryomovecd4a682017-01-25 18:16:21 +01005746 CEPH_DEFINE_OID_ONSTACK(oid);
Alex Elder589d30e2012-07-10 20:30:11 -05005747 void *response;
Alex Elderc0fba362013-04-25 23:15:08 -05005748 char *image_id;
Alex Elder2f82ee52012-10-30 19:40:33 -05005749
Alex Elder589d30e2012-07-10 20:30:11 -05005750 /*
Alex Elder2c0d0a12012-10-30 19:40:33 -05005751 * When probing a parent image, the image id is already
5752 * known (and the image name likely is not). There's no
Alex Elderc0fba362013-04-25 23:15:08 -05005753 * need to fetch the image id again in this case. We
5754 * do still need to set the image format though.
Alex Elder2c0d0a12012-10-30 19:40:33 -05005755 */
Alex Elderc0fba362013-04-25 23:15:08 -05005756 if (rbd_dev->spec->image_id) {
5757 rbd_dev->image_format = *rbd_dev->spec->image_id ? 2 : 1;
5758
Alex Elder2c0d0a12012-10-30 19:40:33 -05005759 return 0;
Alex Elderc0fba362013-04-25 23:15:08 -05005760 }
Alex Elder2c0d0a12012-10-30 19:40:33 -05005761
5762 /*
Alex Elder589d30e2012-07-10 20:30:11 -05005763 * First, see if the format 2 image id file exists, and if
5764 * so, get the image's persistent id from it.
5765 */
Ilya Dryomovecd4a682017-01-25 18:16:21 +01005766 ret = ceph_oid_aprintf(&oid, GFP_KERNEL, "%s%s", RBD_ID_PREFIX,
5767 rbd_dev->spec->image_name);
5768 if (ret)
5769 return ret;
5770
5771 dout("rbd id object name is %s\n", oid.name);
Alex Elder589d30e2012-07-10 20:30:11 -05005772
5773 /* Response will be an encoded string, which includes a length */
5774
5775 size = sizeof (__le32) + RBD_IMAGE_ID_LEN_MAX;
5776 response = kzalloc(size, GFP_NOIO);
5777 if (!response) {
5778 ret = -ENOMEM;
5779 goto out;
5780 }
5781
Alex Elderc0fba362013-04-25 23:15:08 -05005782 /* If it doesn't exist we'll assume it's a format 1 image */
5783
Ilya Dryomovecd4a682017-01-25 18:16:21 +01005784 ret = rbd_obj_method_sync(rbd_dev, &oid, &rbd_dev->header_oloc,
5785 "get_id", NULL, 0,
5786 response, RBD_IMAGE_ID_LEN_MAX);
Alex Elder36be9a72013-01-19 00:30:28 -06005787 dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
Alex Elderc0fba362013-04-25 23:15:08 -05005788 if (ret == -ENOENT) {
5789 image_id = kstrdup("", GFP_KERNEL);
5790 ret = image_id ? 0 : -ENOMEM;
5791 if (!ret)
5792 rbd_dev->image_format = 1;
Ilya Dryomov7dd440c2014-09-11 18:49:18 +04005793 } else if (ret >= 0) {
Alex Elderc0fba362013-04-25 23:15:08 -05005794 void *p = response;
Alex Elder589d30e2012-07-10 20:30:11 -05005795
Alex Elderc0fba362013-04-25 23:15:08 -05005796 image_id = ceph_extract_encoded_string(&p, p + ret,
Alex Elder979ed482012-11-01 08:39:26 -05005797 NULL, GFP_NOIO);
Duan Jiong461f7582014-04-11 16:38:12 +08005798 ret = PTR_ERR_OR_ZERO(image_id);
Alex Elderc0fba362013-04-25 23:15:08 -05005799 if (!ret)
5800 rbd_dev->image_format = 2;
Alex Elderc0fba362013-04-25 23:15:08 -05005801 }
5802
5803 if (!ret) {
5804 rbd_dev->spec->image_id = image_id;
5805 dout("image_id is %s\n", image_id);
Alex Elder589d30e2012-07-10 20:30:11 -05005806 }
5807out:
5808 kfree(response);
Ilya Dryomovecd4a682017-01-25 18:16:21 +01005809 ceph_oid_destroy(&oid);
Alex Elder589d30e2012-07-10 20:30:11 -05005810 return ret;
5811}
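/*
 * Illustration (added; the id value is made up): for image
 * "myimage" the id object probed above is RBD_ID_PREFIX plus the
 * image name, "rbd_id.myimage" in mainline.  A v2 image answers
 * "get_id" with a ceph-encoded string such as "1014b2ae8944a";
 * -ENOENT means a format 1 image, recorded with an empty image id.
 */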
5812
Alex Elder3abef3b2013-05-13 20:35:37 -05005813/*
5814 * Undo whatever state changes are made by v1 or v2 header info
5815 * call.
5816 */
Alex Elder6fd48b32013-04-28 23:32:34 -05005817static void rbd_dev_unprobe(struct rbd_device *rbd_dev)
5818{
5819 struct rbd_image_header *header;
5820
Ilya Dryomove69b8d42015-01-19 12:06:14 +03005821 rbd_dev_parent_put(rbd_dev);
Alex Elder6fd48b32013-04-28 23:32:34 -05005822
5823 /* Free dynamic fields from the header, then zero it out */
5824
5825 header = &rbd_dev->header;
Alex Elder812164f82013-04-30 00:44:32 -05005826 ceph_put_snap_context(header->snapc);
Alex Elder6fd48b32013-04-28 23:32:34 -05005827 kfree(header->snap_sizes);
5828 kfree(header->snap_names);
5829 kfree(header->object_prefix);
5830 memset(header, 0, sizeof (*header));
5831}
5832
Alex Elder2df3fac2013-05-06 09:51:30 -05005833static int rbd_dev_v2_header_onetime(struct rbd_device *rbd_dev)
Alex Eldera30b71b2012-07-10 20:30:11 -05005834{
5835 int ret;
Alex Eldera30b71b2012-07-10 20:30:11 -05005836
Alex Elder1e130192012-07-03 16:01:19 -05005837 ret = rbd_dev_v2_object_prefix(rbd_dev);
Alex Elder57385b52013-04-21 12:14:45 -05005838 if (ret)
Alex Elder1e130192012-07-03 16:01:19 -05005839 goto out_err;
Alex Elderb1b54022012-07-03 16:01:19 -05005840
Alex Elder2df3fac2013-05-06 09:51:30 -05005841 /*
5842 * Get and check the features for the image. Currently the
5843 * features are assumed to never change.
5844 */
Alex Elderb1b54022012-07-03 16:01:19 -05005845 ret = rbd_dev_v2_features(rbd_dev);
Alex Elder57385b52013-04-21 12:14:45 -05005846 if (ret)
Alex Elderb1b54022012-07-03 16:01:19 -05005847 goto out_err;
Alex Elder35d489f2012-07-03 16:01:19 -05005848
Alex Eldercc070d52013-04-21 12:14:45 -05005849 /* If the image supports fancy striping, get its parameters */
5850
5851 if (rbd_dev->header.features & RBD_FEATURE_STRIPINGV2) {
5852 ret = rbd_dev_v2_striping_info(rbd_dev);
5853 if (ret < 0)
5854 goto out_err;
5855 }
Alex Eldera30b71b2012-07-10 20:30:11 -05005856
Ilya Dryomov7e973322017-01-25 18:16:22 +01005857 if (rbd_dev->header.features & RBD_FEATURE_DATA_POOL) {
5858 ret = rbd_dev_v2_data_pool(rbd_dev);
5859 if (ret)
5860 goto out_err;
5861 }
5862
Ilya Dryomov263423f2017-01-25 18:16:22 +01005863 rbd_init_layout(rbd_dev);
Alex Elder35152972012-08-31 17:29:55 -05005864 return 0;
Ilya Dryomov263423f2017-01-25 18:16:22 +01005865
Alex Elder9d475de2012-07-03 16:01:19 -05005866out_err:
Alex Elder642a2532013-05-06 17:40:33 -05005867 rbd_dev->header.features = 0;
Alex Elder1e130192012-07-03 16:01:19 -05005868 kfree(rbd_dev->header.object_prefix);
5869 rbd_dev->header.object_prefix = NULL;
Alex Elder9d475de2012-07-03 16:01:19 -05005870 return ret;
Alex Eldera30b71b2012-07-10 20:30:11 -05005871}
5872
Ilya Dryomov6d69bb532015-10-11 19:38:00 +02005873/*
5874 * @depth is rbd_dev_image_probe() -> rbd_dev_probe_parent() ->
5875 * rbd_dev_image_probe() recursion depth, which means it's also the
5876 * length of the already discovered part of the parent chain.
5877 */
5878static int rbd_dev_probe_parent(struct rbd_device *rbd_dev, int depth)
Alex Elder83a06262012-10-30 15:47:17 -05005879{
Alex Elder2f82ee52012-10-30 19:40:33 -05005880 struct rbd_device *parent = NULL;
Alex Elder124afba2013-04-26 15:44:36 -05005881 int ret;
5882
5883 if (!rbd_dev->parent_spec)
5884 return 0;
Alex Elder124afba2013-04-26 15:44:36 -05005885
Ilya Dryomov6d69bb532015-10-11 19:38:00 +02005886 if (++depth > RBD_MAX_PARENT_CHAIN_LEN) {
5887 pr_info("parent chain is too long (%d)\n", depth);
5888 ret = -EINVAL;
5889 goto out_err;
5890 }
5891
Ilya Dryomov1643dfa2016-08-12 15:45:52 +02005892 parent = __rbd_dev_create(rbd_dev->rbd_client, rbd_dev->parent_spec);
Ilya Dryomov1f2c6652015-10-11 19:38:00 +02005893 if (!parent) {
5894 ret = -ENOMEM;
Alex Elder124afba2013-04-26 15:44:36 -05005895 goto out_err;
Ilya Dryomov1f2c6652015-10-11 19:38:00 +02005896 }
5897
5898 /*
5899 * Images related by parent/child relationships always share
5900 * rbd_client and spec/parent_spec, so bump their refcounts.
5901 */
5902 __rbd_get_client(rbd_dev->rbd_client);
5903 rbd_spec_get(rbd_dev->parent_spec);
Alex Elder124afba2013-04-26 15:44:36 -05005904
Ilya Dryomov6d69bb532015-10-11 19:38:00 +02005905 ret = rbd_dev_image_probe(parent, depth);
Alex Elder124afba2013-04-26 15:44:36 -05005906 if (ret < 0)
5907 goto out_err;
Ilya Dryomov1f2c6652015-10-11 19:38:00 +02005908
Alex Elder124afba2013-04-26 15:44:36 -05005909 rbd_dev->parent = parent;
Alex Eldera2acd002013-05-08 22:50:04 -05005910 atomic_set(&rbd_dev->parent_ref, 1);
Alex Elder124afba2013-04-26 15:44:36 -05005911 return 0;
Alex Elder124afba2013-04-26 15:44:36 -05005912
Ilya Dryomov1f2c6652015-10-11 19:38:00 +02005913out_err:
5914 rbd_dev_unparent(rbd_dev);
Markus Elfring1761b222015-11-23 20:16:45 +01005915 rbd_dev_destroy(parent);
Alex Elder124afba2013-04-26 15:44:36 -05005916 return ret;
5917}
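/*
 * Illustration (added): for a clone chain base <- mid <- mapped,
 * probing "mapped" starts at depth 0; rbd_dev_probe_parent() then
 * probes "mid" at depth 1 and "base" at depth 2.  The walk fails
 * with -EINVAL as soon as the depth would exceed
 * RBD_MAX_PARENT_CHAIN_LEN.
 */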
5918
Ilya Dryomov811c6682016-04-15 16:22:16 +02005919/*
5920 * rbd_dev->header_rwsem must be locked for write and will be unlocked
5921 * upon return.
5922 */
Alex Elder200a6a82013-04-28 23:32:34 -05005923static int rbd_dev_device_setup(struct rbd_device *rbd_dev)
Alex Elder124afba2013-04-26 15:44:36 -05005924{
Alex Elder83a06262012-10-30 15:47:17 -05005925 int ret;
Alex Elder83a06262012-10-30 15:47:17 -05005926
Ilya Dryomov9b60e702013-12-13 15:28:57 +02005927 /* Record our major and minor device numbers. */
Alex Elder83a06262012-10-30 15:47:17 -05005928
Ilya Dryomov9b60e702013-12-13 15:28:57 +02005929 if (!single_major) {
5930 ret = register_blkdev(0, rbd_dev->name);
5931 if (ret < 0)
Ilya Dryomov1643dfa2016-08-12 15:45:52 +02005932 goto err_out_unlock;
Ilya Dryomov9b60e702013-12-13 15:28:57 +02005933
5934 rbd_dev->major = ret;
5935 rbd_dev->minor = 0;
5936 } else {
5937 rbd_dev->major = rbd_major;
5938 rbd_dev->minor = rbd_dev_id_to_minor(rbd_dev->dev_id);
5939 }
Alex Elder83a06262012-10-30 15:47:17 -05005940
5941 /* Set up the blkdev mapping. */
5942
5943 ret = rbd_init_disk(rbd_dev);
5944 if (ret)
5945 goto err_out_blkdev;
5946
Alex Elderf35a4de2013-05-06 09:51:29 -05005947 ret = rbd_dev_mapping_set(rbd_dev);
Alex Elder83a06262012-10-30 15:47:17 -05005948 if (ret)
5949 goto err_out_disk;
Ilya Dryomovbc1ecc62014-08-04 18:04:39 +04005950
Alex Elderf35a4de2013-05-06 09:51:29 -05005951 set_capacity(rbd_dev->disk, rbd_dev->mapping.size / SECTOR_SIZE);
Josh Durgin22001f62013-09-30 20:10:04 -07005952 set_disk_ro(rbd_dev->disk, rbd_dev->mapping.read_only);
Alex Elderf35a4de2013-05-06 09:51:29 -05005953
Ilya Dryomovdd5ac322015-10-16 17:09:24 +02005954 dev_set_name(&rbd_dev->dev, "%d", rbd_dev->dev_id);
5955 ret = device_add(&rbd_dev->dev);
Alex Elderf35a4de2013-05-06 09:51:29 -05005956 if (ret)
Ilya Dryomovf5ee37b2014-10-09 17:06:01 +04005957 goto err_out_mapping;
Alex Elder83a06262012-10-30 15:47:17 -05005958
Alex Elder83a06262012-10-30 15:47:17 -05005959 /* Everything's ready. Announce the disk to the world. */
5960
Alex Elder129b79d2013-04-26 15:44:36 -05005961 set_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags);
Ilya Dryomov811c6682016-04-15 16:22:16 +02005962 up_write(&rbd_dev->header_rwsem);
Alex Elder83a06262012-10-30 15:47:17 -05005963
Ilya Dryomov1643dfa2016-08-12 15:45:52 +02005964 spin_lock(&rbd_dev_list_lock);
5965 list_add_tail(&rbd_dev->node, &rbd_dev_list);
5966 spin_unlock(&rbd_dev_list_lock);
5967
Ilya Dryomov811c6682016-04-15 16:22:16 +02005968 add_disk(rbd_dev->disk);
Ilya Dryomovca7909e2016-08-18 18:38:41 +02005969 pr_info("%s: capacity %llu features 0x%llx\n", rbd_dev->disk->disk_name,
5970 (unsigned long long)get_capacity(rbd_dev->disk) << SECTOR_SHIFT,
5971 rbd_dev->header.features);
Alex Elder83a06262012-10-30 15:47:17 -05005972
5973 return ret;
Alex Elder2f82ee52012-10-30 19:40:33 -05005974
Alex Elderf35a4de2013-05-06 09:51:29 -05005975err_out_mapping:
5976 rbd_dev_mapping_clear(rbd_dev);
Alex Elder83a06262012-10-30 15:47:17 -05005977err_out_disk:
5978 rbd_free_disk(rbd_dev);
5979err_out_blkdev:
Ilya Dryomov9b60e702013-12-13 15:28:57 +02005980 if (!single_major)
5981 unregister_blkdev(rbd_dev->major, rbd_dev->name);
Ilya Dryomov811c6682016-04-15 16:22:16 +02005982err_out_unlock:
5983 up_write(&rbd_dev->header_rwsem);
Alex Elder83a06262012-10-30 15:47:17 -05005984 return ret;
5985}
5986
Alex Elder332bb122013-04-27 09:59:30 -05005987static int rbd_dev_header_name(struct rbd_device *rbd_dev)
5988{
5989 struct rbd_spec *spec = rbd_dev->spec;
Ilya Dryomovc41d13a2016-04-29 20:01:25 +02005990 int ret;
Alex Elder332bb122013-04-27 09:59:30 -05005991
5992 /* Record the header object name for this rbd image. */
5993
5994 rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
Alex Elder332bb122013-04-27 09:59:30 -05005995 if (rbd_dev->image_format == 1)
Ilya Dryomovc41d13a2016-04-29 20:01:25 +02005996 ret = ceph_oid_aprintf(&rbd_dev->header_oid, GFP_KERNEL, "%s%s",
5997 spec->image_name, RBD_SUFFIX);
Alex Elder332bb122013-04-27 09:59:30 -05005998 else
Ilya Dryomovc41d13a2016-04-29 20:01:25 +02005999 ret = ceph_oid_aprintf(&rbd_dev->header_oid, GFP_KERNEL, "%s%s",
6000 RBD_HEADER_PREFIX, spec->image_id);
Alex Elder332bb122013-04-27 09:59:30 -05006001
Ilya Dryomovc41d13a2016-04-29 20:01:25 +02006002 return ret;
Alex Elder332bb122013-04-27 09:59:30 -05006003}
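/*
 * Illustration (added; assumes mainline's RBD_SUFFIX ".rbd" and
 * RBD_HEADER_PREFIX "rbd_header."): image "myimage" with image id
 * "1014b2ae8944a" gets header object "myimage.rbd" as format 1 and
 * "rbd_header.1014b2ae8944a" as format 2.
 */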
6004
Alex Elder200a6a82013-04-28 23:32:34 -05006005static void rbd_dev_image_release(struct rbd_device *rbd_dev)
6006{
Alex Elder6fd48b32013-04-28 23:32:34 -05006007 rbd_dev_unprobe(rbd_dev);
Alex Elder6fd48b32013-04-28 23:32:34 -05006008 rbd_dev->image_format = 0;
6009 kfree(rbd_dev->spec->image_id);
6010 rbd_dev->spec->image_id = NULL;
6011
Alex Elder200a6a82013-04-28 23:32:34 -05006012 rbd_dev_destroy(rbd_dev);
6013}
6014
Alex Eldera30b71b2012-07-10 20:30:11 -05006015/*
6016 * Probe for the existence of the header object for the given rbd
Alex Elder1f3ef782013-05-06 17:40:33 -05006017 * device. If this image is the one being mapped (i.e., not a
6018 * parent), initiate a watch on its header object before using that
6019 * object to get detailed information about the rbd image.
Alex Eldera30b71b2012-07-10 20:30:11 -05006020 */
Ilya Dryomov6d69bb532015-10-11 19:38:00 +02006021static int rbd_dev_image_probe(struct rbd_device *rbd_dev, int depth)
Alex Eldera30b71b2012-07-10 20:30:11 -05006022{
6023 int ret;
6024
6025 /*
Alex Elder3abef3b2013-05-13 20:35:37 -05006026 * Get the id from the image id object. Unless there's an
6027 * error, rbd_dev->spec->image_id will be filled in with
6028 * a dynamically-allocated string, and rbd_dev->image_format
6029 * will be set to either 1 or 2.
Alex Eldera30b71b2012-07-10 20:30:11 -05006030 */
6031 ret = rbd_dev_image_id(rbd_dev);
6032 if (ret)
Alex Elderc0fba362013-04-25 23:15:08 -05006033 return ret;
Alex Elderc0fba362013-04-25 23:15:08 -05006034
Alex Elder332bb122013-04-27 09:59:30 -05006035 ret = rbd_dev_header_name(rbd_dev);
6036 if (ret)
6037 goto err_out_format;
6038
Ilya Dryomov6d69bb532015-10-11 19:38:00 +02006039 if (!depth) {
Ilya Dryomov99d16942016-08-12 16:11:41 +02006040 ret = rbd_register_watch(rbd_dev);
Ilya Dryomov1fe48022015-03-05 10:47:22 +03006041 if (ret) {
6042 if (ret == -ENOENT)
6043 pr_info("image %s/%s does not exist\n",
6044 rbd_dev->spec->pool_name,
6045 rbd_dev->spec->image_name);
Ilya Dryomovc41d13a2016-04-29 20:01:25 +02006046 goto err_out_format;
Ilya Dryomov1fe48022015-03-05 10:47:22 +03006047 }
Alex Elder1f3ef782013-05-06 17:40:33 -05006048 }
Alex Elderb644de22013-04-27 09:59:31 -05006049
Ilya Dryomova720ae02014-07-23 17:11:19 +04006050 ret = rbd_dev_header_info(rbd_dev);
Alex Elder5655c4d2013-04-25 23:15:08 -05006051 if (ret)
Alex Elderb644de22013-04-27 09:59:31 -05006052 goto err_out_watch;
Alex Elder83a06262012-10-30 15:47:17 -05006053
Ilya Dryomov04077592014-07-23 17:11:20 +04006054 /*
6055 * If this image is the one being mapped, we have pool name and
6056 * id, image name and id, and snap name - need to fill snap id.
6057 * Otherwise this is a parent image, identified by pool, image
6058 * and snap ids - need to fill in names for those ids.
6059 */
Ilya Dryomov6d69bb532015-10-11 19:38:00 +02006060 if (!depth)
Ilya Dryomov04077592014-07-23 17:11:20 +04006061 ret = rbd_spec_fill_snap_id(rbd_dev);
6062 else
6063 ret = rbd_spec_fill_names(rbd_dev);
Ilya Dryomov1fe48022015-03-05 10:47:22 +03006064 if (ret) {
6065 if (ret == -ENOENT)
6066 pr_info("snap %s/%s@%s does not exist\n",
6067 rbd_dev->spec->pool_name,
6068 rbd_dev->spec->image_name,
6069 rbd_dev->spec->snap_name);
Alex Elder33dca392013-04-30 00:44:33 -05006070 goto err_out_probe;
Ilya Dryomov1fe48022015-03-05 10:47:22 +03006071 }
Alex Elder9bb81c92013-04-27 09:59:30 -05006072
Ilya Dryomove8f59b52014-07-24 10:42:13 +04006073 if (rbd_dev->header.features & RBD_FEATURE_LAYERING) {
6074 ret = rbd_dev_v2_parent_info(rbd_dev);
6075 if (ret)
6076 goto err_out_probe;
6077
6078 /*
6079 * Need to warn users if this image is the one being
6080 * mapped and has a parent.
6081 */
Ilya Dryomov6d69bb532015-10-11 19:38:00 +02006082 if (!depth && rbd_dev->parent_spec)
Ilya Dryomove8f59b52014-07-24 10:42:13 +04006083 rbd_warn(rbd_dev,
6084 "WARNING: kernel layering is EXPERIMENTAL!");
6085 }
6086
Ilya Dryomov6d69bb532015-10-11 19:38:00 +02006087 ret = rbd_dev_probe_parent(rbd_dev, depth);
Alex Elder30d60ba2013-05-06 09:51:30 -05006088 if (ret)
6089 goto err_out_probe;
Alex Elder83a06262012-10-30 15:47:17 -05006090
Alex Elder30d60ba2013-05-06 09:51:30 -05006091 dout("discovered format %u image, header name is %s\n",
Ilya Dryomovc41d13a2016-04-29 20:01:25 +02006092 rbd_dev->image_format, rbd_dev->header_oid.name);
Alex Elder30d60ba2013-05-06 09:51:30 -05006093 return 0;
Ilya Dryomove8f59b52014-07-24 10:42:13 +04006094
Alex Elder6fd48b32013-04-28 23:32:34 -05006095err_out_probe:
6096 rbd_dev_unprobe(rbd_dev);
Alex Elderb644de22013-04-27 09:59:31 -05006097err_out_watch:
Ilya Dryomov6d69bb532015-10-11 19:38:00 +02006098 if (!depth)
Ilya Dryomov99d16942016-08-12 16:11:41 +02006099 rbd_unregister_watch(rbd_dev);
Alex Elder332bb122013-04-27 09:59:30 -05006100err_out_format:
6101 rbd_dev->image_format = 0;
Alex Elder5655c4d2013-04-25 23:15:08 -05006102 kfree(rbd_dev->spec->image_id);
6103 rbd_dev->spec->image_id = NULL;
Alex Elder5655c4d2013-04-25 23:15:08 -05006104 return ret;
Alex Eldera30b71b2012-07-10 20:30:11 -05006105}
6106
Ilya Dryomov9b60e702013-12-13 15:28:57 +02006107static ssize_t do_rbd_add(struct bus_type *bus,
6108 const char *buf,
6109 size_t count)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07006110{
Alex Eldercb8627c2012-07-09 21:04:23 -05006111 struct rbd_device *rbd_dev = NULL;
Alex Elderdc79b112012-10-25 23:34:41 -05006112 struct ceph_options *ceph_opts = NULL;
Alex Elder4e9afeb2012-10-25 23:34:41 -05006113 struct rbd_options *rbd_opts = NULL;
Alex Elder859c31d2012-10-25 23:34:42 -05006114 struct rbd_spec *spec = NULL;
Alex Elder9d3997f2012-10-25 23:34:42 -05006115 struct rbd_client *rbdc;
Alex Elder51344a32013-05-06 07:40:30 -05006116 bool read_only;
Ilya Dryomovb51c83c2015-10-15 15:38:57 +02006117 int rc;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07006118
6119 if (!try_module_get(THIS_MODULE))
6120 return -ENODEV;
6121
Alex Eldera725f65e2012-02-02 08:13:30 -06006122 /* parse add command */
Alex Elder859c31d2012-10-25 23:34:42 -05006123 rc = rbd_add_parse_args(buf, &ceph_opts, &rbd_opts, &spec);
Alex Elderdc79b112012-10-25 23:34:41 -05006124 if (rc < 0)
Ilya Dryomovdd5ac322015-10-16 17:09:24 +02006125 goto out;
Alex Eldera725f65e2012-02-02 08:13:30 -06006126
Alex Elder9d3997f2012-10-25 23:34:42 -05006127 rbdc = rbd_get_client(ceph_opts);
6128 if (IS_ERR(rbdc)) {
6129 rc = PTR_ERR(rbdc);
Alex Elder0ddebc02012-10-25 23:34:41 -05006130 goto err_out_args;
Alex Elder9d3997f2012-10-25 23:34:42 -05006131 }
Yehuda Sadeh602adf42010-08-12 16:11:25 -07006132
Yehuda Sadeh602adf42010-08-12 16:11:25 -07006133 /* pick the pool */
Ilya Dryomov30ba1f02014-05-13 11:19:27 +04006134 rc = rbd_add_get_pool_id(rbdc, spec->pool_name);
Ilya Dryomov1fe48022015-03-05 10:47:22 +03006135 if (rc < 0) {
6136 if (rc == -ENOENT)
6137 pr_info("pool %s does not exist\n", spec->pool_name);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07006138 goto err_out_client;
Ilya Dryomov1fe48022015-03-05 10:47:22 +03006139 }
Alex Elderc0cd10db2013-04-26 09:43:47 -05006140 spec->pool_id = (u64)rc;
Alex Elder859c31d2012-10-25 23:34:42 -05006141
Ilya Dryomovd1475432015-06-22 13:24:48 +03006142 rbd_dev = rbd_dev_create(rbdc, spec, rbd_opts);
Ilya Dryomovb51c83c2015-10-15 15:38:57 +02006143 if (!rbd_dev) {
6144 rc = -ENOMEM;
Alex Elderbd4ba652012-10-25 23:34:42 -05006145 goto err_out_client;
Ilya Dryomovb51c83c2015-10-15 15:38:57 +02006146 }
Alex Elderc53d5892012-10-25 23:34:42 -05006147 rbdc = NULL; /* rbd_dev now owns this */
6148 spec = NULL; /* rbd_dev now owns this */
Ilya Dryomovd1475432015-06-22 13:24:48 +03006149 rbd_opts = NULL; /* rbd_dev now owns this */
Yehuda Sadeh602adf42010-08-12 16:11:25 -07006150
Mike Christie0d6d1e9c2016-08-18 18:38:45 +02006151 rbd_dev->config_info = kstrdup(buf, GFP_KERNEL);
6152 if (!rbd_dev->config_info) {
6153 rc = -ENOMEM;
6154 goto err_out_rbd_dev;
6155 }
6156
Ilya Dryomov811c6682016-04-15 16:22:16 +02006157 down_write(&rbd_dev->header_rwsem);
Ilya Dryomov6d69bb532015-10-11 19:38:00 +02006158 rc = rbd_dev_image_probe(rbd_dev, 0);
Mike Christie0d6d1e9c2016-08-18 18:38:45 +02006159 if (rc < 0) {
6160 up_write(&rbd_dev->header_rwsem);
Alex Elderc53d5892012-10-25 23:34:42 -05006161 goto err_out_rbd_dev;
Mike Christie0d6d1e9c2016-08-18 18:38:45 +02006162 }
Alex Elder05fd6f62012-08-29 17:11:07 -05006163
Alex Elder7ce4eef2013-05-06 17:40:33 -05006164 /* If we are mapping a snapshot it must be marked read-only */
6165
Ilya Dryomovd1475432015-06-22 13:24:48 +03006166 read_only = rbd_dev->opts->read_only;
Alex Elder7ce4eef2013-05-06 17:40:33 -05006167 if (rbd_dev->spec->snap_id != CEPH_NOSNAP)
6168 read_only = true;
6169 rbd_dev->mapping.read_only = read_only;
6170
Alex Elderb536f692013-04-28 23:32:34 -05006171 rc = rbd_dev_device_setup(rbd_dev);
Alex Elder3abef3b2013-05-13 20:35:37 -05006172 if (rc) {
Ilya Dryomove37180c2013-12-16 18:02:41 +02006173 /*
Ilya Dryomov99d16942016-08-12 16:11:41 +02006174 * rbd_unregister_watch() can't be moved into
Ilya Dryomove37180c2013-12-16 18:02:41 +02006175 * rbd_dev_image_release() without refactoring, see
6176 * commit 1f3ef78861ac.
6177 */
Ilya Dryomov99d16942016-08-12 16:11:41 +02006178 rbd_unregister_watch(rbd_dev);
Alex Elder3abef3b2013-05-13 20:35:37 -05006179 rbd_dev_image_release(rbd_dev);
Ilya Dryomovdd5ac322015-10-16 17:09:24 +02006180 goto out;
Alex Elder3abef3b2013-05-13 20:35:37 -05006181 }
Alex Elderb536f692013-04-28 23:32:34 -05006182
Ilya Dryomovdd5ac322015-10-16 17:09:24 +02006183 rc = count;
6184out:
6185 module_put(THIS_MODULE);
6186 return rc;
Alex Elder3abef3b2013-05-13 20:35:37 -05006187
Alex Elderc53d5892012-10-25 23:34:42 -05006188err_out_rbd_dev:
6189 rbd_dev_destroy(rbd_dev);
Alex Elderbd4ba652012-10-25 23:34:42 -05006190err_out_client:
Alex Elder9d3997f2012-10-25 23:34:42 -05006191 rbd_put_client(rbdc);
Alex Elder0ddebc02012-10-25 23:34:41 -05006192err_out_args:
Alex Elder859c31d2012-10-25 23:34:42 -05006193 rbd_spec_put(spec);
Ilya Dryomovd1475432015-06-22 13:24:48 +03006194 kfree(rbd_opts);
Ilya Dryomovdd5ac322015-10-16 17:09:24 +02006195 goto out;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07006196}
6197
Ilya Dryomov9b60e702013-12-13 15:28:57 +02006198static ssize_t rbd_add(struct bus_type *bus,
6199 const char *buf,
6200 size_t count)
6201{
6202 if (single_major)
6203 return -EINVAL;
6204
6205 return do_rbd_add(bus, buf, count);
6206}
6207
6208static ssize_t rbd_add_single_major(struct bus_type *bus,
6209 const char *buf,
6210 size_t count)
6211{
6212 return do_rbd_add(bus, buf, count);
6213}
6214
Ilya Dryomovdd5ac322015-10-16 17:09:24 +02006215static void rbd_dev_device_release(struct rbd_device *rbd_dev)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07006216{
Yehuda Sadeh602adf42010-08-12 16:11:25 -07006217 rbd_free_disk(rbd_dev);
Ilya Dryomov1643dfa2016-08-12 15:45:52 +02006218
6219 spin_lock(&rbd_dev_list_lock);
6220 list_del_init(&rbd_dev->node);
6221 spin_unlock(&rbd_dev_list_lock);
6222
Alex Elder200a6a82013-04-28 23:32:34 -05006223 clear_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags);
Ilya Dryomovdd5ac322015-10-16 17:09:24 +02006224 device_del(&rbd_dev->dev);
Alex Elder6d80b132013-05-06 07:40:30 -05006225 rbd_dev_mapping_clear(rbd_dev);
Ilya Dryomov9b60e702013-12-13 15:28:57 +02006226 if (!single_major)
6227 unregister_blkdev(rbd_dev->major, rbd_dev->name);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07006228}
6229
Alex Elder05a46af2013-04-26 15:44:36 -05006230static void rbd_dev_remove_parent(struct rbd_device *rbd_dev)
6231{
Alex Elderad945fc2013-04-26 15:44:36 -05006232 while (rbd_dev->parent) {
Alex Elder05a46af2013-04-26 15:44:36 -05006233 struct rbd_device *first = rbd_dev;
6234 struct rbd_device *second = first->parent;
6235 struct rbd_device *third;
6236
6237 /*
6238 * Follow to the parent with no grandparent and
6239 * remove it.
6240 */
6241 while (second && (third = second->parent)) {
6242 first = second;
6243 second = third;
6244 }
Alex Elderad945fc2013-04-26 15:44:36 -05006245 rbd_assert(second);
Alex Elder8ad42cd2013-04-28 23:32:34 -05006246 rbd_dev_image_release(second);
Alex Elderad945fc2013-04-26 15:44:36 -05006247 first->parent = NULL;
6248 first->parent_overlap = 0;
6249
6250 rbd_assert(first->parent_spec);
Alex Elder05a46af2013-04-26 15:44:36 -05006251 rbd_spec_put(first->parent_spec);
6252 first->parent_spec = NULL;
Alex Elder05a46af2013-04-26 15:44:36 -05006253 }
6254}
6255
Ilya Dryomov9b60e702013-12-13 15:28:57 +02006256static ssize_t do_rbd_remove(struct bus_type *bus,
6257 const char *buf,
6258 size_t count)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07006259{
6260 struct rbd_device *rbd_dev = NULL;
Alex Elder751cc0e2013-05-31 15:17:01 -05006261 struct list_head *tmp;
6262 int dev_id;
Mike Christie0276dca2016-08-18 18:38:45 +02006263 char opt_buf[6];
Alex Elder82a442d2013-05-31 17:40:44 -05006264 bool already = false;
Mike Christie0276dca2016-08-18 18:38:45 +02006265 bool force = false;
Alex Elder0d8189e2013-04-27 09:59:30 -05006266 int ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07006267
Mike Christie0276dca2016-08-18 18:38:45 +02006268 dev_id = -1;
6269 opt_buf[0] = '\0';
6270 sscanf(buf, "%d %5s", &dev_id, opt_buf);
6271 if (dev_id < 0) {
6272 pr_err("dev_id out of range\n");
Yehuda Sadeh602adf42010-08-12 16:11:25 -07006273 return -EINVAL;
Mike Christie0276dca2016-08-18 18:38:45 +02006274 }
6275 if (opt_buf[0] != '\0') {
6276 if (!strcmp(opt_buf, "force")) {
6277 force = true;
6278 } else {
6279 pr_err("bad remove option at '%s'\n", opt_buf);
6280 return -EINVAL;
6281 }
6282 }
Yehuda Sadeh602adf42010-08-12 16:11:25 -07006283
Alex Elder751cc0e2013-05-31 15:17:01 -05006284 ret = -ENOENT;
6285 spin_lock(&rbd_dev_list_lock);
6286 list_for_each(tmp, &rbd_dev_list) {
6287 rbd_dev = list_entry(tmp, struct rbd_device, node);
6288 if (rbd_dev->dev_id == dev_id) {
6289 ret = 0;
6290 break;
6291 }
Yehuda Sadeh602adf42010-08-12 16:11:25 -07006292 }
Alex Elder751cc0e2013-05-31 15:17:01 -05006293 if (!ret) {
6294 spin_lock_irq(&rbd_dev->lock);
Mike Christie0276dca2016-08-18 18:38:45 +02006295 if (rbd_dev->open_count && !force)
Alex Elder751cc0e2013-05-31 15:17:01 -05006296 ret = -EBUSY;
6297 else
Alex Elder82a442d2013-05-31 17:40:44 -05006298 already = test_and_set_bit(RBD_DEV_FLAG_REMOVING,
6299 &rbd_dev->flags);
Alex Elder751cc0e2013-05-31 15:17:01 -05006300 spin_unlock_irq(&rbd_dev->lock);
6301 }
6302 spin_unlock(&rbd_dev_list_lock);
Alex Elder82a442d2013-05-31 17:40:44 -05006303 if (ret < 0 || already)
Alex Elder1ba0f1e2013-05-31 15:17:01 -05006304 return ret;
Alex Elder751cc0e2013-05-31 15:17:01 -05006305
Mike Christie0276dca2016-08-18 18:38:45 +02006306 if (force) {
6307 /*
6308 * Prevent new IO from being queued and wait for existing
6309 * IO to complete/fail.
6310 */
6311 blk_mq_freeze_queue(rbd_dev->disk->queue);
6312 blk_set_queue_dying(rbd_dev->disk->queue);
6313 }
6314
Ilya Dryomoved95b212016-08-12 16:40:02 +02006315 down_write(&rbd_dev->lock_rwsem);
6316 if (__rbd_is_lock_owner(rbd_dev))
6317 rbd_unlock(rbd_dev);
6318 up_write(&rbd_dev->lock_rwsem);
Ilya Dryomov99d16942016-08-12 16:11:41 +02006319 rbd_unregister_watch(rbd_dev);
Ilya Dryomovfca27062013-12-16 18:02:40 +02006320
Josh Durgin98752012013-08-29 17:26:31 -07006321 /*
6322 * Don't free anything from rbd_dev->disk until after all
6323 * notifies are completely processed. Otherwise
6324 * rbd_bus_del_dev() will race with rbd_watch_cb(), resulting
6325 * in a potential use after free of rbd_dev->disk or rbd_dev.
6326 */
Ilya Dryomovdd5ac322015-10-16 17:09:24 +02006327 rbd_dev_device_release(rbd_dev);
Alex Elder8ad42cd2013-04-28 23:32:34 -05006328 rbd_dev_image_release(rbd_dev);
Alex Elderaafb2302012-09-06 16:00:54 -05006329
Alex Elder1ba0f1e2013-05-31 15:17:01 -05006330 return count;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07006331}
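/*
 * Example (added): remove strings accepted by the parsing above,
 * written to /sys/bus/rbd/remove:
 *
 *	echo "0" > /sys/bus/rbd/remove		fails with -EBUSY if
 *						rbd0 is still open
 *	echo "0 force" > /sys/bus/rbd/remove	unmaps anyway, failing
 *						outstanding I/O first
 */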
6332
Ilya Dryomov9b60e702013-12-13 15:28:57 +02006333static ssize_t rbd_remove(struct bus_type *bus,
6334 const char *buf,
6335 size_t count)
6336{
6337 if (single_major)
6338 return -EINVAL;
6339
6340 return do_rbd_remove(bus, buf, count);
6341}
6342
6343static ssize_t rbd_remove_single_major(struct bus_type *bus,
6344 const char *buf,
6345 size_t count)
6346{
6347 return do_rbd_remove(bus, buf, count);
6348}
6349
Yehuda Sadeh602adf42010-08-12 16:11:25 -07006350/*
6351 * create control files in sysfs
Yehuda Sadehdfc56062010-11-19 14:51:04 -08006352 * /sys/bus/rbd/...
Yehuda Sadeh602adf42010-08-12 16:11:25 -07006353 */
6354static int rbd_sysfs_init(void)
6355{
Yehuda Sadehdfc56062010-11-19 14:51:04 -08006356 int ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07006357
Alex Elderfed4c142012-02-07 12:03:36 -06006358 ret = device_register(&rbd_root_dev);
Alex Elder21079782012-01-24 10:08:36 -06006359 if (ret < 0)
Yehuda Sadehdfc56062010-11-19 14:51:04 -08006360 return ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07006361
Alex Elderfed4c142012-02-07 12:03:36 -06006362 ret = bus_register(&rbd_bus_type);
6363 if (ret < 0)
6364 device_unregister(&rbd_root_dev);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07006365
Yehuda Sadeh602adf42010-08-12 16:11:25 -07006366 return ret;
6367}
6368
6369static void rbd_sysfs_cleanup(void)
6370{
Yehuda Sadehdfc56062010-11-19 14:51:04 -08006371 bus_unregister(&rbd_bus_type);
Alex Elderfed4c142012-02-07 12:03:36 -06006372 device_unregister(&rbd_root_dev);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07006373}
6374
Alex Elder1c2a9df2013-05-01 12:43:03 -05006375static int rbd_slab_init(void)
6376{
6377 rbd_assert(!rbd_img_request_cache);
Geliang Tang03d94402016-03-13 15:17:32 +08006378 rbd_img_request_cache = KMEM_CACHE(rbd_img_request, 0);
Alex Elder868311b2013-05-01 12:43:03 -05006379 if (!rbd_img_request_cache)
6380 return -ENOMEM;
6381
6382 rbd_assert(!rbd_obj_request_cache);
Geliang Tang03d94402016-03-13 15:17:32 +08006383 rbd_obj_request_cache = KMEM_CACHE(rbd_obj_request, 0);
Alex Elder78c2a442013-05-01 12:43:04 -05006384 if (!rbd_obj_request_cache)
6385 goto out_err;
6386
6387 rbd_assert(!rbd_segment_name_cache);
6388 rbd_segment_name_cache = kmem_cache_create("rbd_segment_name",
Ilya Dryomov2d0ebc52014-01-27 17:40:18 +02006389 CEPH_MAX_OID_NAME_LEN + 1, 1, 0, NULL);
Alex Elder78c2a442013-05-01 12:43:04 -05006390 if (rbd_segment_name_cache)
Alex Elder1c2a9df2013-05-01 12:43:03 -05006391 return 0;
Alex Elder78c2a442013-05-01 12:43:04 -05006392out_err:
Julia Lawall13bf2832015-09-13 14:15:26 +02006393 kmem_cache_destroy(rbd_obj_request_cache);
6394 rbd_obj_request_cache = NULL;
Alex Elder1c2a9df2013-05-01 12:43:03 -05006395
Alex Elder868311b2013-05-01 12:43:03 -05006396 kmem_cache_destroy(rbd_img_request_cache);
6397 rbd_img_request_cache = NULL;
6398
Alex Elder1c2a9df2013-05-01 12:43:03 -05006399 return -ENOMEM;
6400}
6401
6402static void rbd_slab_exit(void)
6403{
Alex Elder78c2a442013-05-01 12:43:04 -05006404 rbd_assert(rbd_segment_name_cache);
6405 kmem_cache_destroy(rbd_segment_name_cache);
6406 rbd_segment_name_cache = NULL;
6407
Alex Elder868311b2013-05-01 12:43:03 -05006408 rbd_assert(rbd_obj_request_cache);
6409 kmem_cache_destroy(rbd_obj_request_cache);
6410 rbd_obj_request_cache = NULL;
6411
Alex Elder1c2a9df2013-05-01 12:43:03 -05006412 rbd_assert(rbd_img_request_cache);
6413 kmem_cache_destroy(rbd_img_request_cache);
6414 rbd_img_request_cache = NULL;
6415}
6416
Alex Eldercc344fa2013-02-19 12:25:56 -06006417static int __init rbd_init(void)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07006418{
6419 int rc;
6420
Alex Elder1e32d342013-01-30 11:13:33 -06006421 if (!libceph_compatible(NULL)) {
6422 rbd_warn(NULL, "libceph incompatibility (quitting)");
Alex Elder1e32d342013-01-30 11:13:33 -06006423 return -EINVAL;
6424 }
Ilya Dryomove1b4d962013-12-13 15:28:57 +02006425
Alex Elder1c2a9df2013-05-01 12:43:03 -05006426 rc = rbd_slab_init();
Yehuda Sadeh602adf42010-08-12 16:11:25 -07006427 if (rc)
6428 return rc;
Ilya Dryomove1b4d962013-12-13 15:28:57 +02006429
Ilya Dryomovf5ee37b2014-10-09 17:06:01 +04006430 /*
6431 * The number of active work items is limited by the number of
Ilya Dryomovf77303b2015-04-22 18:28:13 +03006432 * rbd devices * queue depth, so leave @max_active at default.
Ilya Dryomovf5ee37b2014-10-09 17:06:01 +04006433 */
6434 rbd_wq = alloc_workqueue(RBD_DRV_NAME, WQ_MEM_RECLAIM, 0);
6435 if (!rbd_wq) {
6436 rc = -ENOMEM;
6437 goto err_out_slab;
6438 }
6439
Ilya Dryomov9b60e702013-12-13 15:28:57 +02006440 if (single_major) {
6441 rbd_major = register_blkdev(0, RBD_DRV_NAME);
6442 if (rbd_major < 0) {
6443 rc = rbd_major;
Ilya Dryomovf5ee37b2014-10-09 17:06:01 +04006444 goto err_out_wq;
Ilya Dryomov9b60e702013-12-13 15:28:57 +02006445 }
6446 }
6447
Alex Elder1c2a9df2013-05-01 12:43:03 -05006448 rc = rbd_sysfs_init();
6449 if (rc)
Ilya Dryomov9b60e702013-12-13 15:28:57 +02006450 goto err_out_blkdev;
Alex Elder1c2a9df2013-05-01 12:43:03 -05006451
Ilya Dryomov9b60e702013-12-13 15:28:57 +02006452 if (single_major)
6453 pr_info("loaded (major %d)\n", rbd_major);
6454 else
6455 pr_info("loaded\n");
6456
Ilya Dryomove1b4d962013-12-13 15:28:57 +02006457 return 0;
6458
Ilya Dryomov9b60e702013-12-13 15:28:57 +02006459err_out_blkdev:
6460 if (single_major)
6461 unregister_blkdev(rbd_major, RBD_DRV_NAME);
Ilya Dryomovf5ee37b2014-10-09 17:06:01 +04006462err_out_wq:
6463 destroy_workqueue(rbd_wq);
Ilya Dryomove1b4d962013-12-13 15:28:57 +02006464err_out_slab:
6465 rbd_slab_exit();
Alex Elder1c2a9df2013-05-01 12:43:03 -05006466 return rc;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07006467}
6468
Alex Eldercc344fa2013-02-19 12:25:56 -06006469static void __exit rbd_exit(void)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07006470{
Ilya Dryomovffe312c2014-05-20 15:46:04 +04006471 ida_destroy(&rbd_dev_id_ida);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07006472 rbd_sysfs_cleanup();
Ilya Dryomov9b60e702013-12-13 15:28:57 +02006473 if (single_major)
6474 unregister_blkdev(rbd_major, RBD_DRV_NAME);
Ilya Dryomovf5ee37b2014-10-09 17:06:01 +04006475 destroy_workqueue(rbd_wq);
Alex Elder1c2a9df2013-05-01 12:43:03 -05006476 rbd_slab_exit();
Yehuda Sadeh602adf42010-08-12 16:11:25 -07006477}
6478
6479module_init(rbd_init);
6480module_exit(rbd_exit);
6481
Alex Elderd552c612013-05-31 20:13:09 -05006482MODULE_AUTHOR("Alex Elder <elder@inktank.com>");
Yehuda Sadeh602adf42010-08-12 16:11:25 -07006483MODULE_AUTHOR("Sage Weil <sage@newdream.net>");
6484MODULE_AUTHOR("Yehuda Sadeh <yehuda@hq.newdream.net>");
Yehuda Sadeh602adf42010-08-12 16:11:25 -07006485/* following authorship retained from original osdblk.c */
6486MODULE_AUTHOR("Jeff Garzik <jeff@garzik.org>");
6487
Ilya Dryomov90da2582013-12-13 15:28:56 +02006488MODULE_DESCRIPTION("RADOS Block Device (RBD) driver");
Yehuda Sadeh602adf42010-08-12 16:11:25 -07006489MODULE_LICENSE("GPL");