/*
   rbd.c -- Export ceph rados objects as a Linux block device


   based on drivers/block/osdblk.c:

   Copyright 2009 Red Hat, Inc.

   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; see the file COPYING.  If not, write to
   the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.



   For usage instructions, please refer to:

                 Documentation/ABI/testing/sysfs-bus-rbd

 */

#include <linux/ceph/libceph.h>
#include <linux/ceph/osd_client.h>
#include <linux/ceph/mon_client.h>
#include <linux/ceph/cls_lock_client.h>
#include <linux/ceph/decode.h>
#include <linux/parser.h>
#include <linux/bsearch.h>

#include <linux/kernel.h>
#include <linux/device.h>
#include <linux/module.h>
#include <linux/blk-mq.h>
#include <linux/fs.h>
#include <linux/blkdev.h>
#include <linux/slab.h>
#include <linux/idr.h>
#include <linux/workqueue.h>

#include "rbd_types.h"

#define RBD_DEBUG	/* Activate rbd_assert() calls */

/*
 * The basic unit of block I/O is a sector.  It is interpreted in a
 * number of contexts in Linux (blk, bio, genhd), but the default is
 * universally 512 bytes.  These symbols are just slightly more
 * meaningful than the bare numbers they represent.
 */
#define	SECTOR_SHIFT	9
#define	SECTOR_SIZE	(1ULL << SECTOR_SHIFT)

/*
 * Increment the given counter and return its updated value.
 * If the counter is already 0 it will not be incremented.
 * If the counter is already at its maximum value returns
 * -EINVAL without updating it.
 */
static int atomic_inc_return_safe(atomic_t *v)
{
	unsigned int counter;

	counter = (unsigned int)__atomic_add_unless(v, 1, 0);
	if (counter <= (unsigned int)INT_MAX)
		return (int)counter;

	atomic_dec(v);

	return -EINVAL;
}

/* Decrement the counter.  Return the resulting value, or -EINVAL */
static int atomic_dec_return_safe(atomic_t *v)
{
	int counter;

	counter = atomic_dec_return(v);
	if (counter >= 0)
		return counter;

	atomic_inc(v);

	return -EINVAL;
}
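
/*
 * Hedged usage sketch (not part of the driver): together, the two
 * helpers above act as a saturating reference count.  A counter that
 * has dropped to 0 can never be re-incremented, so a stale user
 * cannot resurrect a dying object:
 *
 *	atomic_t refs = ATOMIC_INIT(1);
 *
 *	if (atomic_inc_return_safe(&refs) > 0) {
 *		// got a reference, safe to use the object
 *		if (atomic_dec_return_safe(&refs) == 0)
 *			// last reference just went away
 *	}
 *
 * This is the pattern used for the parent_ref counter declared in
 * struct rbd_device below.
 */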

#define RBD_DRV_NAME "rbd"

#define RBD_MINORS_PER_MAJOR		256
#define RBD_SINGLE_MAJOR_PART_SHIFT	4

#define RBD_MAX_PARENT_CHAIN_LEN	16

#define RBD_SNAP_DEV_NAME_PREFIX	"snap_"
#define RBD_MAX_SNAP_NAME_LEN	\
			(NAME_MAX - (sizeof (RBD_SNAP_DEV_NAME_PREFIX) - 1))

#define RBD_MAX_SNAP_COUNT	510	/* allows max snapc to fit in 4KB */

#define RBD_SNAP_HEAD_NAME	"-"

#define	BAD_SNAP_INDEX	U32_MAX		/* invalid index into snap array */

/* This allows a single page to hold an image name sent by OSD */
#define RBD_IMAGE_NAME_LEN_MAX	(PAGE_SIZE - sizeof (__le32) - 1)
#define RBD_IMAGE_ID_LEN_MAX	64

#define RBD_OBJ_PREFIX_LEN_MAX	64

#define RBD_NOTIFY_TIMEOUT	5	/* seconds */
#define RBD_RETRY_DELAY		msecs_to_jiffies(1000)

/* Feature bits */

#define RBD_FEATURE_LAYERING		(1ULL<<0)
#define RBD_FEATURE_STRIPINGV2		(1ULL<<1)
#define RBD_FEATURE_EXCLUSIVE_LOCK	(1ULL<<2)
#define RBD_FEATURE_DATA_POOL		(1ULL<<7)
#define RBD_FEATURE_OPERATIONS		(1ULL<<8)

#define RBD_FEATURES_ALL	(RBD_FEATURE_LAYERING |		\
				 RBD_FEATURE_STRIPINGV2 |	\
				 RBD_FEATURE_EXCLUSIVE_LOCK |	\
				 RBD_FEATURE_DATA_POOL |	\
				 RBD_FEATURE_OPERATIONS)

/* Features supported by this (client software) implementation. */

#define RBD_FEATURES_SUPPORTED	(RBD_FEATURES_ALL)

/*
 * An RBD device name will be "rbd#", where the "rbd" comes from
 * RBD_DRV_NAME above, and # is a unique integer identifier.
 */
#define DEV_NAME_LEN		32

/*
 * block device image metadata (in-memory version)
 */
struct rbd_image_header {
	/* These six fields never change for a given rbd image */
	char *object_prefix;
	__u8 obj_order;
	u64 stripe_unit;
	u64 stripe_count;
	s64 data_pool_id;
	u64 features;		/* Might be changeable someday? */

	/* The remaining fields need to be updated occasionally */
	u64 image_size;
	struct ceph_snap_context *snapc;
	char *snap_names;	/* format 1 only */
	u64 *snap_sizes;	/* format 1 only */
};

/*
 * An rbd image specification.
 *
 * The tuple (pool_id, image_id, snap_id) is sufficient to uniquely
 * identify an image.  Each rbd_dev structure includes a pointer to
 * an rbd_spec structure that encapsulates this identity.
 *
 * Each of the ids in an rbd_spec has an associated name.  For a
 * user-mapped image, the names are supplied and the ids associated
 * with them are looked up.  For a layered image, a parent image is
 * defined by the tuple, and the names are looked up.
 *
 * An rbd_dev structure contains a parent_spec pointer which is
 * non-null if the image it represents is a child in a layered
 * image.  This pointer will refer to the rbd_spec structure used
 * by the parent rbd_dev for its own identity (i.e., the structure
 * is shared between the parent and child).
 *
 * Since these structures are populated once, during the discovery
 * phase of image construction, they are effectively immutable so
 * we make no effort to synchronize access to them.
 *
 * Note that code herein does not assume the image name is known (it
 * could be a null pointer).
 */
struct rbd_spec {
	u64		pool_id;
	const char	*pool_name;

	const char	*image_id;
	const char	*image_name;

	u64		snap_id;
	const char	*snap_name;

	struct kref	kref;
};

/*
 * an instance of the client.  multiple devices may share an rbd client.
 */
struct rbd_client {
	struct ceph_client	*client;
	struct kref		kref;
	struct list_head	node;
};

struct rbd_img_request;
typedef void (*rbd_img_callback_t)(struct rbd_img_request *);

#define	BAD_WHICH	U32_MAX		/* Good which or bad which, which? */

struct rbd_obj_request;
typedef void (*rbd_obj_callback_t)(struct rbd_obj_request *);

enum obj_request_type {
	OBJ_REQUEST_NODATA = 1,
	OBJ_REQUEST_BIO,	/* pointer into provided bio (list) */
	OBJ_REQUEST_BVECS,	/* pointer into provided bio_vec array */
};

enum obj_operation_type {
	OBJ_OP_READ = 1,
	OBJ_OP_WRITE,
	OBJ_OP_DISCARD,
};

enum obj_req_flags {
	OBJ_REQ_DONE,		/* completion flag: not done = 0, done = 1 */
	OBJ_REQ_IMG_DATA,	/* object usage: standalone = 0, image = 1 */
};

/*
 * Writes go through the following state machine to deal with
 * layering:
 *
 *                       need copyup
 * RBD_OBJ_WRITE_GUARD ---------------> RBD_OBJ_WRITE_COPYUP
 *            |     ^                              |
 *            v     \------------------------------/
 *          done
 *            ^
 *            |
 * RBD_OBJ_WRITE_FLAT
 *
 * Writes start in RBD_OBJ_WRITE_GUARD or _FLAT, depending on whether
 * there is a parent or not.
 */
enum rbd_obj_write_state {
	RBD_OBJ_WRITE_FLAT = 1,
	RBD_OBJ_WRITE_GUARD,
	RBD_OBJ_WRITE_COPYUP,
};
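
/*
 * Illustrative sketch only: the real transitions are driven from the
 * OSD request completion paths later in this file, and the shape of
 * this switch is an assumption for exposition, not the driver's code:
 *
 *	switch (obj_req->write_state) {
 *	case RBD_OBJ_WRITE_FLAT:
 *		// no parent to worry about: plain write, then done
 *		break;
 *	case RBD_OBJ_WRITE_GUARD:
 *		// guarded write; -ENOENT means the object does not
 *		// exist yet, so parent data must be copied up first
 *		break;
 *	case RBD_OBJ_WRITE_COPYUP:
 *		// copyup in flight; on completion the write is done
 *		break;
 *	}
 */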

struct rbd_obj_request {
	u64			object_no;
	u64			offset;		/* object start byte */
	u64			length;		/* bytes from offset */
	unsigned long		flags;
	union {
		bool			tried_parent;	/* for reads */
		enum rbd_obj_write_state write_state;	/* for writes */
	};

	/*
	 * An object request associated with an image will have its
	 * img_data flag set; a standalone object request will not.
	 *
	 * Finally, an object request for rbd image data will have
	 * which != BAD_WHICH, and will have a non-null img_request
	 * pointer.  The value of which will be in the range
	 * 0..(img_request->obj_request_count-1).
	 */
	struct rbd_img_request	*img_request;
	u64			img_offset;
	/* links for img_request->obj_requests list */
	struct list_head	links;
	u32			which;		/* posn image request list */

	enum obj_request_type	type;
	union {
		struct ceph_bio_iter	bio_pos;
		struct {
			struct ceph_bvec_iter	bvec_pos;
			u32			bvec_count;
		};
	};
	struct bio_vec		*copyup_bvecs;
	u32			copyup_bvec_count;

	struct ceph_osd_request	*osd_req;

	u64			xferred;	/* bytes transferred */
	int			result;

	rbd_obj_callback_t	callback;

	struct kref		kref;
};

enum img_req_flags {
	IMG_REQ_CHILD,		/* initiator: block = 0, child image = 1 */
	IMG_REQ_LAYERED,	/* ENOENT handling: normal = 0, layered = 1 */
};

struct rbd_img_request {
	struct rbd_device	*rbd_dev;
	enum obj_operation_type	op_type;
	u64			offset;	/* starting image byte offset */
	u64			length;	/* byte count from offset */
	unsigned long		flags;
	union {
		u64			snap_id;	/* for reads */
		struct ceph_snap_context *snapc;	/* for writes */
	};
	union {
		struct request		*rq;		/* block request */
		struct rbd_obj_request	*obj_request;	/* obj req initiator */
	};
	spinlock_t		completion_lock;/* protects next_completion */
	u32			next_completion;
	rbd_img_callback_t	callback;
	u64			xferred;/* aggregate bytes transferred */
	int			result;	/* first nonzero obj_request result */

	u32			obj_request_count;
	struct list_head	obj_requests;	/* rbd_obj_request structs */

	struct kref		kref;
};

#define for_each_obj_request(ireq, oreq) \
	list_for_each_entry(oreq, &(ireq)->obj_requests, links)
#define for_each_obj_request_from(ireq, oreq) \
	list_for_each_entry_from(oreq, &(ireq)->obj_requests, links)
#define for_each_obj_request_safe(ireq, oreq, n) \
	list_for_each_entry_safe_reverse(oreq, n, &(ireq)->obj_requests, links)
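
/*
 * Example (illustrative only) of walking an image request's object
 * requests with the iterators above:
 *
 *	struct rbd_obj_request *obj_req;
 *
 *	for_each_obj_request(img_req, obj_req)
 *		dout("%s: obj_req %p\n", __func__, obj_req);
 *
 * The _safe variant iterates in reverse and, like
 * list_for_each_entry_safe_reverse(), tolerates removal of the
 * current entry while walking.
 */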

enum rbd_watch_state {
	RBD_WATCH_STATE_UNREGISTERED,
	RBD_WATCH_STATE_REGISTERED,
	RBD_WATCH_STATE_ERROR,
};

enum rbd_lock_state {
	RBD_LOCK_STATE_UNLOCKED,
	RBD_LOCK_STATE_LOCKED,
	RBD_LOCK_STATE_RELEASING,
};

/* WatchNotify::ClientId */
struct rbd_client_id {
	u64 gid;
	u64 handle;
};

struct rbd_mapping {
	u64			size;
	u64			features;
};

/*
 * a single device
 */
struct rbd_device {
	int			dev_id;		/* blkdev unique id */

	int			major;		/* blkdev assigned major */
	int			minor;
	struct gendisk		*disk;		/* blkdev's gendisk and rq */

	u32			image_format;	/* Either 1 or 2 */
	struct rbd_client	*rbd_client;

	char			name[DEV_NAME_LEN]; /* blkdev name, e.g. rbd3 */

	spinlock_t		lock;		/* queue, flags, open_count */

	struct rbd_image_header	header;
	unsigned long		flags;		/* possibly lock protected */
	struct rbd_spec		*spec;
	struct rbd_options	*opts;
	char			*config_info;	/* add{,_single_major} string */

	struct ceph_object_id	header_oid;
	struct ceph_object_locator header_oloc;

	struct ceph_file_layout	layout;		/* used for all rbd requests */

	struct mutex		watch_mutex;
	enum rbd_watch_state	watch_state;
	struct ceph_osd_linger_request *watch_handle;
	u64			watch_cookie;
	struct delayed_work	watch_dwork;

	struct rw_semaphore	lock_rwsem;
	enum rbd_lock_state	lock_state;
	char			lock_cookie[32];
	struct rbd_client_id	owner_cid;
	struct work_struct	acquired_lock_work;
	struct work_struct	released_lock_work;
	struct delayed_work	lock_dwork;
	struct work_struct	unlock_work;
	wait_queue_head_t	lock_waitq;

	struct workqueue_struct	*task_wq;

	struct rbd_spec		*parent_spec;
	u64			parent_overlap;
	atomic_t		parent_ref;
	struct rbd_device	*parent;

	/* Block layer tags. */
	struct blk_mq_tag_set	tag_set;

	/* protects updating the header */
	struct rw_semaphore	header_rwsem;

	struct rbd_mapping	mapping;

	struct list_head	node;

	/* sysfs related */
	struct device		dev;
	unsigned long		open_count;	/* protected by lock */
};

/*
 * Flag bits for rbd_dev->flags:
 * - REMOVING (which is coupled with rbd_dev->open_count) is protected
 *   by rbd_dev->lock
 * - BLACKLISTED is protected by rbd_dev->lock_rwsem
 */
enum rbd_dev_flags {
	RBD_DEV_FLAG_EXISTS,	/* mapped snapshot has not been deleted */
	RBD_DEV_FLAG_REMOVING,	/* this mapping is being removed */
	RBD_DEV_FLAG_BLACKLISTED, /* our ceph_client is blacklisted */
};

static DEFINE_MUTEX(client_mutex);	/* Serialize client creation */

static LIST_HEAD(rbd_dev_list);		/* devices */
static DEFINE_SPINLOCK(rbd_dev_list_lock);

static LIST_HEAD(rbd_client_list);	/* clients */
static DEFINE_SPINLOCK(rbd_client_list_lock);

/* Slab caches for frequently-allocated structures */

static struct kmem_cache	*rbd_img_request_cache;
static struct kmem_cache	*rbd_obj_request_cache;

static int rbd_major;
static DEFINE_IDA(rbd_dev_id_ida);

static struct workqueue_struct *rbd_wq;

/*
 * single-major requires >= 0.75 version of userspace rbd utility.
 */
static bool single_major = true;
module_param(single_major, bool, S_IRUGO);
MODULE_PARM_DESC(single_major, "Use a single major number for all rbd devices (default: true)");

static ssize_t rbd_add(struct bus_type *bus, const char *buf,
		       size_t count);
static ssize_t rbd_remove(struct bus_type *bus, const char *buf,
			  size_t count);
static ssize_t rbd_add_single_major(struct bus_type *bus, const char *buf,
				    size_t count);
static ssize_t rbd_remove_single_major(struct bus_type *bus, const char *buf,
				       size_t count);
static int rbd_dev_image_probe(struct rbd_device *rbd_dev, int depth);
static void rbd_spec_put(struct rbd_spec *spec);

static int rbd_dev_id_to_minor(int dev_id)
{
	return dev_id << RBD_SINGLE_MAJOR_PART_SHIFT;
}

static int minor_to_rbd_dev_id(int minor)
{
	return minor >> RBD_SINGLE_MAJOR_PART_SHIFT;
}
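
/*
 * Worked example for the two helpers above: with
 * RBD_SINGLE_MAJOR_PART_SHIFT == 4, dev_id 3 maps to minor 48, and
 * the 16 minors 48..63 are reserved for that device and its
 * partitions, so minor_to_rbd_dev_id() recovers 3 for any of them.
 */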

static bool __rbd_is_lock_owner(struct rbd_device *rbd_dev)
{
	return rbd_dev->lock_state == RBD_LOCK_STATE_LOCKED ||
	       rbd_dev->lock_state == RBD_LOCK_STATE_RELEASING;
}

static bool rbd_is_lock_owner(struct rbd_device *rbd_dev)
{
	bool is_lock_owner;

	down_read(&rbd_dev->lock_rwsem);
	is_lock_owner = __rbd_is_lock_owner(rbd_dev);
	up_read(&rbd_dev->lock_rwsem);
	return is_lock_owner;
}

static ssize_t rbd_supported_features_show(struct bus_type *bus, char *buf)
{
	return sprintf(buf, "0x%llx\n", RBD_FEATURES_SUPPORTED);
}

static BUS_ATTR(add, S_IWUSR, NULL, rbd_add);
static BUS_ATTR(remove, S_IWUSR, NULL, rbd_remove);
static BUS_ATTR(add_single_major, S_IWUSR, NULL, rbd_add_single_major);
static BUS_ATTR(remove_single_major, S_IWUSR, NULL, rbd_remove_single_major);
static BUS_ATTR(supported_features, S_IRUGO, rbd_supported_features_show, NULL);

static struct attribute *rbd_bus_attrs[] = {
	&bus_attr_add.attr,
	&bus_attr_remove.attr,
	&bus_attr_add_single_major.attr,
	&bus_attr_remove_single_major.attr,
	&bus_attr_supported_features.attr,
	NULL,
};

static umode_t rbd_bus_is_visible(struct kobject *kobj,
				  struct attribute *attr, int index)
{
	if (!single_major &&
	    (attr == &bus_attr_add_single_major.attr ||
	     attr == &bus_attr_remove_single_major.attr))
		return 0;

	return attr->mode;
}

static const struct attribute_group rbd_bus_group = {
	.attrs = rbd_bus_attrs,
	.is_visible = rbd_bus_is_visible,
};
__ATTRIBUTE_GROUPS(rbd_bus);

static struct bus_type rbd_bus_type = {
	.name		= "rbd",
	.bus_groups	= rbd_bus_groups,
};

static void rbd_root_dev_release(struct device *dev)
{
}

static struct device rbd_root_dev = {
	.init_name =    "rbd",
	.release =      rbd_root_dev_release,
};

static __printf(2, 3)
void rbd_warn(struct rbd_device *rbd_dev, const char *fmt, ...)
{
	struct va_format vaf;
	va_list args;

	va_start(args, fmt);
	vaf.fmt = fmt;
	vaf.va = &args;

	if (!rbd_dev)
		printk(KERN_WARNING "%s: %pV\n", RBD_DRV_NAME, &vaf);
	else if (rbd_dev->disk)
		printk(KERN_WARNING "%s: %s: %pV\n",
			RBD_DRV_NAME, rbd_dev->disk->disk_name, &vaf);
	else if (rbd_dev->spec && rbd_dev->spec->image_name)
		printk(KERN_WARNING "%s: image %s: %pV\n",
			RBD_DRV_NAME, rbd_dev->spec->image_name, &vaf);
	else if (rbd_dev->spec && rbd_dev->spec->image_id)
		printk(KERN_WARNING "%s: id %s: %pV\n",
			RBD_DRV_NAME, rbd_dev->spec->image_id, &vaf);
	else	/* punt */
		printk(KERN_WARNING "%s: rbd_dev %p: %pV\n",
			RBD_DRV_NAME, rbd_dev, &vaf);
	va_end(args);
}

#ifdef RBD_DEBUG
#define rbd_assert(expr)						\
		if (unlikely(!(expr))) {				\
			printk(KERN_ERR "\nAssertion failure in %s() "	\
						"at line %d:\n\n"	\
					"\trbd_assert(%s);\n\n",	\
					__func__, __LINE__, #expr);	\
			BUG();						\
		}
#else /* !RBD_DEBUG */
#  define rbd_assert(expr)	((void) 0)
#endif /* !RBD_DEBUG */

static void rbd_dev_remove_parent(struct rbd_device *rbd_dev);

static int rbd_dev_refresh(struct rbd_device *rbd_dev);
static int rbd_dev_v2_header_onetime(struct rbd_device *rbd_dev);
static int rbd_dev_header_info(struct rbd_device *rbd_dev);
static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev);
static const char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev,
					u64 snap_id);
static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id,
		u8 *order, u64 *snap_size);
static int _rbd_dev_v2_snap_features(struct rbd_device *rbd_dev, u64 snap_id,
		u64 *snap_features);

static int rbd_open(struct block_device *bdev, fmode_t mode)
{
	struct rbd_device *rbd_dev = bdev->bd_disk->private_data;
	bool removing = false;

	spin_lock_irq(&rbd_dev->lock);
	if (test_bit(RBD_DEV_FLAG_REMOVING, &rbd_dev->flags))
		removing = true;
	else
		rbd_dev->open_count++;
	spin_unlock_irq(&rbd_dev->lock);
	if (removing)
		return -ENOENT;

	(void) get_device(&rbd_dev->dev);

	return 0;
}

static void rbd_release(struct gendisk *disk, fmode_t mode)
{
	struct rbd_device *rbd_dev = disk->private_data;
	unsigned long open_count_before;

	spin_lock_irq(&rbd_dev->lock);
	open_count_before = rbd_dev->open_count--;
	spin_unlock_irq(&rbd_dev->lock);
	rbd_assert(open_count_before > 0);

	put_device(&rbd_dev->dev);
}

static int rbd_ioctl_set_ro(struct rbd_device *rbd_dev, unsigned long arg)
{
	int ro;

	if (get_user(ro, (int __user *)arg))
		return -EFAULT;

	/* Snapshots can't be marked read-write */
	if (rbd_dev->spec->snap_id != CEPH_NOSNAP && !ro)
		return -EROFS;

	/* Let blkdev_roset() handle it */
	return -ENOTTY;
}

static int rbd_ioctl(struct block_device *bdev, fmode_t mode,
			unsigned int cmd, unsigned long arg)
{
	struct rbd_device *rbd_dev = bdev->bd_disk->private_data;
	int ret;

	switch (cmd) {
	case BLKROSET:
		ret = rbd_ioctl_set_ro(rbd_dev, arg);
		break;
	default:
		ret = -ENOTTY;
	}

	return ret;
}

#ifdef CONFIG_COMPAT
static int rbd_compat_ioctl(struct block_device *bdev, fmode_t mode,
				unsigned int cmd, unsigned long arg)
{
	return rbd_ioctl(bdev, mode, cmd, arg);
}
#endif /* CONFIG_COMPAT */

static const struct block_device_operations rbd_bd_ops = {
	.owner			= THIS_MODULE,
	.open			= rbd_open,
	.release		= rbd_release,
	.ioctl			= rbd_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl		= rbd_compat_ioctl,
#endif
};

/*
 * Initialize an rbd client instance.  Success or not, this function
 * consumes ceph_opts.  Caller holds client_mutex.
 */
static struct rbd_client *rbd_client_create(struct ceph_options *ceph_opts)
{
	struct rbd_client *rbdc;
	int ret = -ENOMEM;

	dout("%s:\n", __func__);
	rbdc = kmalloc(sizeof(struct rbd_client), GFP_KERNEL);
	if (!rbdc)
		goto out_opt;

	kref_init(&rbdc->kref);
	INIT_LIST_HEAD(&rbdc->node);

	rbdc->client = ceph_create_client(ceph_opts, rbdc);
	if (IS_ERR(rbdc->client))
		goto out_rbdc;
	ceph_opts = NULL; /* Now rbdc->client is responsible for ceph_opts */

	ret = ceph_open_session(rbdc->client);
	if (ret < 0)
		goto out_client;

	spin_lock(&rbd_client_list_lock);
	list_add_tail(&rbdc->node, &rbd_client_list);
	spin_unlock(&rbd_client_list_lock);

	dout("%s: rbdc %p\n", __func__, rbdc);

	return rbdc;
out_client:
	ceph_destroy_client(rbdc->client);
out_rbdc:
	kfree(rbdc);
out_opt:
	if (ceph_opts)
		ceph_destroy_options(ceph_opts);
	dout("%s: error %d\n", __func__, ret);

	return ERR_PTR(ret);
}

static struct rbd_client *__rbd_get_client(struct rbd_client *rbdc)
{
	kref_get(&rbdc->kref);

	return rbdc;
}

/*
 * Find a ceph client with specific addr and configuration.  If
 * found, bump its reference count.
 */
static struct rbd_client *rbd_client_find(struct ceph_options *ceph_opts)
{
	struct rbd_client *client_node;
	bool found = false;

	if (ceph_opts->flags & CEPH_OPT_NOSHARE)
		return NULL;

	spin_lock(&rbd_client_list_lock);
	list_for_each_entry(client_node, &rbd_client_list, node) {
		if (!ceph_compare_options(ceph_opts, client_node->client)) {
			__rbd_get_client(client_node);

			found = true;
			break;
		}
	}
	spin_unlock(&rbd_client_list_lock);

	return found ? client_node : NULL;
}

/*
 * (Per device) rbd map options
 */
enum {
	Opt_queue_depth,
	Opt_last_int,
	/* int args above */
	Opt_last_string,
	/* string args above */
	Opt_read_only,
	Opt_read_write,
	Opt_lock_on_read,
	Opt_exclusive,
	Opt_err
};

static match_table_t rbd_opts_tokens = {
	{Opt_queue_depth, "queue_depth=%d"},
	/* int args above */
	/* string args above */
	{Opt_read_only, "read_only"},
	{Opt_read_only, "ro"},		/* Alternate spelling */
	{Opt_read_write, "read_write"},
	{Opt_read_write, "rw"},		/* Alternate spelling */
	{Opt_lock_on_read, "lock_on_read"},
	{Opt_exclusive, "exclusive"},
	{Opt_err, NULL}
};

struct rbd_options {
	int	queue_depth;
	bool	read_only;
	bool	lock_on_read;
	bool	exclusive;
};

#define RBD_QUEUE_DEPTH_DEFAULT	BLKDEV_MAX_RQ
#define RBD_READ_ONLY_DEFAULT	false
#define RBD_LOCK_ON_READ_DEFAULT false
#define RBD_EXCLUSIVE_DEFAULT	false

static int parse_rbd_opts_token(char *c, void *private)
{
	struct rbd_options *rbd_opts = private;
	substring_t argstr[MAX_OPT_ARGS];
	int token, intval, ret;

	token = match_token(c, rbd_opts_tokens, argstr);
	if (token < Opt_last_int) {
		ret = match_int(&argstr[0], &intval);
		if (ret < 0) {
			pr_err("bad mount option arg (not int) at '%s'\n", c);
			return ret;
		}
		dout("got int token %d val %d\n", token, intval);
	} else if (token > Opt_last_int && token < Opt_last_string) {
		dout("got string token %d val %s\n", token, argstr[0].from);
	} else {
		dout("got token %d\n", token);
	}

	switch (token) {
	case Opt_queue_depth:
		if (intval < 1) {
			pr_err("queue_depth out of range\n");
			return -EINVAL;
		}
		rbd_opts->queue_depth = intval;
		break;
	case Opt_read_only:
		rbd_opts->read_only = true;
		break;
	case Opt_read_write:
		rbd_opts->read_only = false;
		break;
	case Opt_lock_on_read:
		rbd_opts->lock_on_read = true;
		break;
	case Opt_exclusive:
		rbd_opts->exclusive = true;
		break;
	default:
		/* libceph prints "bad option" msg */
		return -EINVAL;
	}

	return 0;
}
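
/*
 * For illustration (hedged; see Documentation/ABI/testing/sysfs-bus-rbd
 * for the authoritative add syntax), a mapping that exercises the
 * options above might be requested from userspace as:
 *
 *	$ echo "1.2.3.4:6789 name=admin,queue_depth=128,lock_on_read rbd foo" > /sys/bus/rbd/add
 *
 * libceph consumes the option tokens it recognizes (e.g. name=admin);
 * each remaining comma-separated token is handed to the function
 * above, so "queue_depth=128" hits Opt_queue_depth and "lock_on_read"
 * sets rbd_opts->lock_on_read.
 */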

static char* obj_op_name(enum obj_operation_type op_type)
{
	switch (op_type) {
	case OBJ_OP_READ:
		return "read";
	case OBJ_OP_WRITE:
		return "write";
	case OBJ_OP_DISCARD:
		return "discard";
	default:
		return "???";
	}
}

/*
 * Get a ceph client with specific addr and configuration, if one does
 * not exist create it.  Either way, ceph_opts is consumed by this
 * function.
 */
static struct rbd_client *rbd_get_client(struct ceph_options *ceph_opts)
{
	struct rbd_client *rbdc;

	mutex_lock_nested(&client_mutex, SINGLE_DEPTH_NESTING);
	rbdc = rbd_client_find(ceph_opts);
	if (rbdc)	/* using an existing client */
		ceph_destroy_options(ceph_opts);
	else
		rbdc = rbd_client_create(ceph_opts);
	mutex_unlock(&client_mutex);

	return rbdc;
}

/*
 * Destroy ceph client.
 *
 * Takes and releases rbd_client_list_lock itself to drop the client
 * off the client list, so the caller must not hold that lock.
 */
static void rbd_client_release(struct kref *kref)
{
	struct rbd_client *rbdc = container_of(kref, struct rbd_client, kref);

	dout("%s: rbdc %p\n", __func__, rbdc);
	spin_lock(&rbd_client_list_lock);
	list_del(&rbdc->node);
	spin_unlock(&rbd_client_list_lock);

	ceph_destroy_client(rbdc->client);
	kfree(rbdc);
}

/*
 * Drop reference to ceph client node.  If it's not referenced anymore,
 * release it.
 */
static void rbd_put_client(struct rbd_client *rbdc)
{
	if (rbdc)
		kref_put(&rbdc->kref, rbd_client_release);
}

static bool rbd_image_format_valid(u32 image_format)
{
	return image_format == 1 || image_format == 2;
}

static bool rbd_dev_ondisk_valid(struct rbd_image_header_ondisk *ondisk)
{
	size_t size;
	u32 snap_count;

	/* The header has to start with the magic rbd header text */
	if (memcmp(&ondisk->text, RBD_HEADER_TEXT, sizeof (RBD_HEADER_TEXT)))
		return false;

	/* The bio layer requires at least sector-sized I/O */

	if (ondisk->options.order < SECTOR_SHIFT)
		return false;

	/* If we use u64 in a few spots we may be able to loosen this */

	if (ondisk->options.order > 8 * sizeof (int) - 1)
		return false;

	/*
	 * The size of a snapshot header has to fit in a size_t, and
	 * that limits the number of snapshots.
	 */
	snap_count = le32_to_cpu(ondisk->snap_count);
	size = SIZE_MAX - sizeof (struct ceph_snap_context);
	if (snap_count > size / sizeof (__le64))
		return false;

	/*
	 * Not only that, but the size of the entire snapshot
	 * header must also be representable in a size_t.
	 */
	size -= snap_count * sizeof (__le64);
	if ((u64) size < le64_to_cpu(ondisk->snap_names_len))
		return false;

	return true;
}
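
/*
 * In concrete terms, the two order checks above bound order to
 * [SECTOR_SHIFT, 31], i.e. objects of at least one 512-byte sector
 * and at most 1U << 31 bytes (2 GiB); the common rbd default of
 * order 22 (4 MiB objects) sits comfortably inside that range.
 */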

/*
 * returns the size of an object in the image
 */
static u32 rbd_obj_bytes(struct rbd_image_header *header)
{
	return 1U << header->obj_order;
}

static void rbd_init_layout(struct rbd_device *rbd_dev)
{
	if (rbd_dev->header.stripe_unit == 0 ||
	    rbd_dev->header.stripe_count == 0) {
		rbd_dev->header.stripe_unit = rbd_obj_bytes(&rbd_dev->header);
		rbd_dev->header.stripe_count = 1;
	}

	rbd_dev->layout.stripe_unit = rbd_dev->header.stripe_unit;
	rbd_dev->layout.stripe_count = rbd_dev->header.stripe_count;
	rbd_dev->layout.object_size = rbd_obj_bytes(&rbd_dev->header);
	rbd_dev->layout.pool_id = rbd_dev->header.data_pool_id == CEPH_NOPOOL ?
			  rbd_dev->spec->pool_id : rbd_dev->header.data_pool_id;
	RCU_INIT_POINTER(rbd_dev->layout.pool_ns, NULL);
}
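
/*
 * E.g. an image created without STRIPINGV2 parameters and the common
 * order of 22 ends up with stripe_unit = object_size = 4 MiB and
 * stripe_count = 1: data is laid out one full object at a time.
 */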

/*
 * Fill an rbd image header with information from the given format 1
 * on-disk header.
 */
static int rbd_header_from_disk(struct rbd_device *rbd_dev,
				 struct rbd_image_header_ondisk *ondisk)
{
	struct rbd_image_header *header = &rbd_dev->header;
	bool first_time = header->object_prefix == NULL;
	struct ceph_snap_context *snapc;
	char *object_prefix = NULL;
	char *snap_names = NULL;
	u64 *snap_sizes = NULL;
	u32 snap_count;
	int ret = -ENOMEM;
	u32 i;

	/* Allocate this now to avoid having to handle failure below */

	if (first_time) {
		object_prefix = kstrndup(ondisk->object_prefix,
					 sizeof(ondisk->object_prefix),
					 GFP_KERNEL);
		if (!object_prefix)
			return -ENOMEM;
	}

	/* Allocate the snapshot context and fill it in */

	snap_count = le32_to_cpu(ondisk->snap_count);
	snapc = ceph_create_snap_context(snap_count, GFP_KERNEL);
	if (!snapc)
		goto out_err;
	snapc->seq = le64_to_cpu(ondisk->snap_seq);
	if (snap_count) {
		struct rbd_image_snap_ondisk *snaps;
		u64 snap_names_len = le64_to_cpu(ondisk->snap_names_len);

		/* We'll keep a copy of the snapshot names... */

		if (snap_names_len > (u64)SIZE_MAX)
			goto out_2big;
		snap_names = kmalloc(snap_names_len, GFP_KERNEL);
		if (!snap_names)
			goto out_err;

		/* ...as well as the array of their sizes. */
		snap_sizes = kmalloc_array(snap_count,
					   sizeof(*header->snap_sizes),
					   GFP_KERNEL);
		if (!snap_sizes)
			goto out_err;

		/*
		 * Copy the names, and fill in each snapshot's id
		 * and size.
		 *
		 * Note that rbd_dev_v1_header_info() guarantees the
		 * ondisk buffer we're working with has
		 * snap_names_len bytes beyond the end of the
		 * snapshot id array, this memcpy() is safe.
		 */
		memcpy(snap_names, &ondisk->snaps[snap_count], snap_names_len);
		snaps = ondisk->snaps;
		for (i = 0; i < snap_count; i++) {
			snapc->snaps[i] = le64_to_cpu(snaps[i].id);
			snap_sizes[i] = le64_to_cpu(snaps[i].image_size);
		}
	}

	/* We won't fail any more, fill in the header */

	if (first_time) {
		header->object_prefix = object_prefix;
		header->obj_order = ondisk->options.order;
		rbd_init_layout(rbd_dev);
	} else {
		ceph_put_snap_context(header->snapc);
		kfree(header->snap_names);
		kfree(header->snap_sizes);
	}

	/* The remaining fields always get updated (when we refresh) */

	header->image_size = le64_to_cpu(ondisk->image_size);
	header->snapc = snapc;
	header->snap_names = snap_names;
	header->snap_sizes = snap_sizes;

	return 0;
out_2big:
	ret = -EIO;
out_err:
	kfree(snap_sizes);
	kfree(snap_names);
	ceph_put_snap_context(snapc);
	kfree(object_prefix);

	return ret;
}

static const char *_rbd_dev_v1_snap_name(struct rbd_device *rbd_dev, u32 which)
{
	const char *snap_name;

	rbd_assert(which < rbd_dev->header.snapc->num_snaps);

	/* Skip over names until we find the one we are looking for */

	snap_name = rbd_dev->header.snap_names;
	while (which--)
		snap_name += strlen(snap_name) + 1;

	return kstrdup(snap_name, GFP_KERNEL);
}
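
/*
 * The walk above relies on the format 1 layout of snap_names: all
 * snapshot names concatenated, NUL-separated, in the same order as
 * the snapshot id array.  For example (illustrative):
 *
 *	snap_names = "monday\0tuesday\0wednesday\0"
 *
 * which == 1 skips "monday" (6 characters plus its NUL) and returns
 * a copy of "tuesday".
 */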

/*
 * Snapshot id comparison function for use with qsort()/bsearch().
 * Note that result is for snapshots in *descending* order.
 */
static int snapid_compare_reverse(const void *s1, const void *s2)
{
	u64 snap_id1 = *(u64 *)s1;
	u64 snap_id2 = *(u64 *)s2;

	if (snap_id1 < snap_id2)
		return 1;
	return snap_id1 == snap_id2 ? 0 : -1;
}

/*
 * Search a snapshot context to see if the given snapshot id is
 * present.
 *
 * Returns the position of the snapshot id in the array if it's found,
 * or BAD_SNAP_INDEX otherwise.
 *
 * Note: The snapshot array is kept sorted (by the osd) in
 * reverse order, highest snapshot id first.
 */
static u32 rbd_dev_snap_index(struct rbd_device *rbd_dev, u64 snap_id)
{
	struct ceph_snap_context *snapc = rbd_dev->header.snapc;
	u64 *found;

	found = bsearch(&snap_id, &snapc->snaps, snapc->num_snaps,
				sizeof (snap_id), snapid_compare_reverse);

	return found ? (u32)(found - &snapc->snaps[0]) : BAD_SNAP_INDEX;
}
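
/*
 * Worked example: with snapc->snaps = { 12, 7, 3 } (descending, as
 * the osd keeps it), a lookup of snap_id 7 lands bsearch() on
 * &snaps[1] via snapid_compare_reverse(), so this returns 1; a
 * lookup of snap_id 5 finds nothing and returns BAD_SNAP_INDEX.
 */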
1138
Alex Elder2ad3d712013-04-30 00:44:33 -05001139static const char *rbd_dev_v1_snap_name(struct rbd_device *rbd_dev,
1140 u64 snap_id)
Alex Elder54cac612013-04-30 00:44:33 -05001141{
1142 u32 which;
Josh Durginda6a6b62013-09-04 17:57:31 -07001143 const char *snap_name;
Alex Elder54cac612013-04-30 00:44:33 -05001144
1145 which = rbd_dev_snap_index(rbd_dev, snap_id);
1146 if (which == BAD_SNAP_INDEX)
Josh Durginda6a6b62013-09-04 17:57:31 -07001147 return ERR_PTR(-ENOENT);
Alex Elder54cac612013-04-30 00:44:33 -05001148
Josh Durginda6a6b62013-09-04 17:57:31 -07001149 snap_name = _rbd_dev_v1_snap_name(rbd_dev, which);
1150 return snap_name ? snap_name : ERR_PTR(-ENOMEM);
Alex Elder54cac612013-04-30 00:44:33 -05001151}
1152
Alex Elder9e15b772012-10-30 19:40:33 -05001153static const char *rbd_snap_name(struct rbd_device *rbd_dev, u64 snap_id)
1154{
Alex Elder9e15b772012-10-30 19:40:33 -05001155 if (snap_id == CEPH_NOSNAP)
1156 return RBD_SNAP_HEAD_NAME;
1157
Alex Elder54cac612013-04-30 00:44:33 -05001158 rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
1159 if (rbd_dev->image_format == 1)
1160 return rbd_dev_v1_snap_name(rbd_dev, snap_id);
Alex Elder9e15b772012-10-30 19:40:33 -05001161
Alex Elder54cac612013-04-30 00:44:33 -05001162 return rbd_dev_v2_snap_name(rbd_dev, snap_id);
Alex Elder9e15b772012-10-30 19:40:33 -05001163}
1164
Alex Elder2ad3d712013-04-30 00:44:33 -05001165static int rbd_snap_size(struct rbd_device *rbd_dev, u64 snap_id,
1166 u64 *snap_size)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001167{
Alex Elder2ad3d712013-04-30 00:44:33 -05001168 rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
1169 if (snap_id == CEPH_NOSNAP) {
1170 *snap_size = rbd_dev->header.image_size;
1171 } else if (rbd_dev->image_format == 1) {
1172 u32 which;
Alex Elder00f1f362012-02-07 12:03:36 -06001173
Alex Elder2ad3d712013-04-30 00:44:33 -05001174 which = rbd_dev_snap_index(rbd_dev, snap_id);
1175 if (which == BAD_SNAP_INDEX)
1176 return -ENOENT;
Alex Elder00f1f362012-02-07 12:03:36 -06001177
Alex Elder2ad3d712013-04-30 00:44:33 -05001178 *snap_size = rbd_dev->header.snap_sizes[which];
1179 } else {
1180 u64 size = 0;
1181 int ret;
1182
1183 ret = _rbd_dev_v2_snap_size(rbd_dev, snap_id, NULL, &size);
1184 if (ret)
1185 return ret;
1186
1187 *snap_size = size;
1188 }
1189 return 0;
1190}
1191
1192static int rbd_snap_features(struct rbd_device *rbd_dev, u64 snap_id,
1193 u64 *snap_features)
1194{
1195 rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
1196 if (snap_id == CEPH_NOSNAP) {
1197 *snap_features = rbd_dev->header.features;
1198 } else if (rbd_dev->image_format == 1) {
1199 *snap_features = 0; /* No features for format 1 */
1200 } else {
1201 u64 features = 0;
1202 int ret;
1203
1204 ret = _rbd_dev_v2_snap_features(rbd_dev, snap_id, &features);
1205 if (ret)
1206 return ret;
1207
1208 *snap_features = features;
1209 }
1210 return 0;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001211}
1212
Alex Elderd1cf5782013-04-27 09:59:30 -05001213static int rbd_dev_mapping_set(struct rbd_device *rbd_dev)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001214{
Alex Elder8f4b7d92013-05-06 07:40:30 -05001215 u64 snap_id = rbd_dev->spec->snap_id;
Alex Elder2ad3d712013-04-30 00:44:33 -05001216 u64 size = 0;
1217 u64 features = 0;
1218 int ret;
Alex Elder8b0241f2013-04-25 23:15:08 -05001219
Alex Elder2ad3d712013-04-30 00:44:33 -05001220 ret = rbd_snap_size(rbd_dev, snap_id, &size);
1221 if (ret)
1222 return ret;
1223 ret = rbd_snap_features(rbd_dev, snap_id, &features);
1224 if (ret)
1225 return ret;
1226
1227 rbd_dev->mapping.size = size;
1228 rbd_dev->mapping.features = features;
1229
Alex Elder8b0241f2013-04-25 23:15:08 -05001230 return 0;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001231}
1232
Alex Elderd1cf5782013-04-27 09:59:30 -05001233static void rbd_dev_mapping_clear(struct rbd_device *rbd_dev)
1234{
1235 rbd_dev->mapping.size = 0;
1236 rbd_dev->mapping.features = 0;
Alex Elder200a6a82013-04-28 23:32:34 -05001237}
1238
Alex Elder65ccfe22012-08-09 10:33:26 -07001239static u64 rbd_segment_offset(struct rbd_device *rbd_dev, u64 offset)
1240{
Ilya Dryomov5bc3fb12017-01-25 18:16:22 +01001241 u64 segment_size = rbd_obj_bytes(&rbd_dev->header);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001242
Alex Elder65ccfe22012-08-09 10:33:26 -07001243 return offset & (segment_size - 1);
1244}
1245
1246static u64 rbd_segment_length(struct rbd_device *rbd_dev,
1247 u64 offset, u64 length)
1248{
Ilya Dryomov5bc3fb12017-01-25 18:16:22 +01001249 u64 segment_size = rbd_obj_bytes(&rbd_dev->header);
Alex Elder65ccfe22012-08-09 10:33:26 -07001250
1251 offset &= segment_size - 1;
1252
Alex Elderaafb2302012-09-06 16:00:54 -05001253 rbd_assert(length <= U64_MAX - offset);
Alex Elder65ccfe22012-08-09 10:33:26 -07001254 if (offset + length > segment_size)
1255 length = segment_size - offset;
1256
1257 return length;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001258}
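/*
 * Worked example (illustrative, assuming the default 4 MiB object
 * size, i.e. segment_size == 0x400000):
 *
 *	rbd_segment_offset(rbd_dev, 0x500000) == 0x100000
 *	  (5 MiB into the image is 1 MiB into the second object)
 *	rbd_segment_length(rbd_dev, 0x500000, 0x400000) == 0x300000
 *	  (a 4 MiB span starting there is clipped to the 3 MiB left in
 *	   that object; the caller covers the rest with more requests)
 *
 * Both helpers rely on segment_size being a power of two.
 */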
1259
Ilya Dryomov5359a172018-01-20 10:30:10 +01001260static void zero_bvec(struct bio_vec *bv)
1261{
1262 void *buf;
1263 unsigned long flags;
1264
1265 buf = bvec_kmap_irq(bv, &flags);
1266 memset(buf, 0, bv->bv_len);
1267 flush_dcache_page(bv->bv_page);
1268 bvec_kunmap_irq(buf, &flags);
1269}
1270
1271static void zero_bios(struct ceph_bio_iter *bio_pos, u32 off, u32 bytes)
1272{
1273 struct ceph_bio_iter it = *bio_pos;
1274
1275 ceph_bio_iter_advance(&it, off);
1276 ceph_bio_iter_advance_step(&it, bytes, ({
1277 zero_bvec(&bv);
1278 }));
1279}
1280
Ilya Dryomov7e07efb2018-01-20 10:30:11 +01001281static void zero_bvecs(struct ceph_bvec_iter *bvec_pos, u32 off, u32 bytes)
Alex Elderb9434c52013-04-19 15:34:50 -05001282{
Ilya Dryomov7e07efb2018-01-20 10:30:11 +01001283 struct ceph_bvec_iter it = *bvec_pos;
Alex Elderb9434c52013-04-19 15:34:50 -05001284
Ilya Dryomov7e07efb2018-01-20 10:30:11 +01001285 ceph_bvec_iter_advance(&it, off);
1286 ceph_bvec_iter_advance_step(&it, bytes, ({
1287 zero_bvec(&bv);
1288 }));
Alex Elderb9434c52013-04-19 15:34:50 -05001289}
1290
1291/*
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001292 * Zero a range in @obj_req data buffer defined by a bio (list) or
1293 * bio_vec array.
1294 *
1295 * @off is relative to the start of the data buffer.
1296 */
1297static void rbd_obj_zero_range(struct rbd_obj_request *obj_req, u32 off,
1298 u32 bytes)
1299{
1300 switch (obj_req->type) {
1301 case OBJ_REQUEST_BIO:
1302 zero_bios(&obj_req->bio_pos, off, bytes);
1303 break;
1304 case OBJ_REQUEST_BVECS:
1305 zero_bvecs(&obj_req->bvec_pos, off, bytes);
1306 break;
1307 default:
1308 rbd_assert(0);
1309 }
1310}
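/*
 * These zeroing helpers serve the read path: rbd_obj_handle_read()
 * uses rbd_obj_zero_range() to fill holes (-ENOENT from the OSD) and
 * the tail of short reads, so the block layer always sees fully
 * populated buffers.
 */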
1311
1312/*
Alex Elder926f9b32013-02-11 12:33:24 -06001313 * The default/initial value for all object request flags is 0. For
1314 * each flag, once its value is set to 1 it is never reset to 0
1315 * again.
1316 */
Alex Elder6365d332013-02-11 12:33:24 -06001317static void obj_request_img_data_set(struct rbd_obj_request *obj_request)
1318{
1319 if (test_and_set_bit(OBJ_REQ_IMG_DATA, &obj_request->flags)) {
Alex Elder6365d332013-02-11 12:33:24 -06001320 struct rbd_device *rbd_dev;
1321
Alex Elder57acbaa2013-02-11 12:33:24 -06001322 rbd_dev = obj_request->img_request->rbd_dev;
Ilya Dryomov9584d502014-07-11 12:11:20 +04001323 rbd_warn(rbd_dev, "obj_request %p already marked img_data",
Alex Elder6365d332013-02-11 12:33:24 -06001324 obj_request);
1325 }
1326}
1327
1328static bool obj_request_img_data_test(struct rbd_obj_request *obj_request)
1329{
1330 smp_mb();
1331 return test_bit(OBJ_REQ_IMG_DATA, &obj_request->flags) != 0;
1332}
1333
Alex Elder57acbaa2013-02-11 12:33:24 -06001334static void obj_request_done_set(struct rbd_obj_request *obj_request)
1335{
1336 if (test_and_set_bit(OBJ_REQ_DONE, &obj_request->flags)) {
1337 struct rbd_device *rbd_dev = NULL;
1338
1339 if (obj_request_img_data_test(obj_request))
1340 rbd_dev = obj_request->img_request->rbd_dev;
Ilya Dryomov9584d502014-07-11 12:11:20 +04001341 rbd_warn(rbd_dev, "obj_request %p already marked done",
Alex Elder57acbaa2013-02-11 12:33:24 -06001342 obj_request);
1343 }
1344}
1345
1346static bool obj_request_done_test(struct rbd_obj_request *obj_request)
1347{
1348 smp_mb();
1349 return test_bit(OBJ_REQ_DONE, &obj_request->flags) != 0;
1350}
1351
Ilya Dryomov96385562014-06-10 13:53:29 +04001352static bool obj_request_overlaps_parent(struct rbd_obj_request *obj_request)
1353{
1354 struct rbd_device *rbd_dev = obj_request->img_request->rbd_dev;
1355
1356 return obj_request->img_offset <
1357 round_up(rbd_dev->parent_overlap, rbd_obj_bytes(&rbd_dev->header));
1358}
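/*
 * Example (illustrative): with 4 MiB objects and a parent_overlap of
 * 5 MiB, the overlap rounds up to 8 MiB, so object requests at
 * img_offset 0 and 4 MiB are treated as overlapping the parent (the
 * second object straddles the 5 MiB boundary), while a request at
 * img_offset 8 MiB is not.
 */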
1359
Alex Elderbf0d5f502012-11-22 00:00:08 -06001360static void rbd_obj_request_get(struct rbd_obj_request *obj_request)
1361{
Alex Elder37206ee2013-02-20 17:32:08 -06001362 dout("%s: obj %p (was %d)\n", __func__, obj_request,
Peter Zijlstra2c935bc2016-11-14 17:29:48 +01001363 kref_read(&obj_request->kref));
Alex Elderbf0d5f502012-11-22 00:00:08 -06001364 kref_get(&obj_request->kref);
1365}
1366
1367static void rbd_obj_request_destroy(struct kref *kref);
1368static void rbd_obj_request_put(struct rbd_obj_request *obj_request)
1369{
1370 rbd_assert(obj_request != NULL);
Alex Elder37206ee2013-02-20 17:32:08 -06001371 dout("%s: obj %p (was %d)\n", __func__, obj_request,
Peter Zijlstra2c935bc2016-11-14 17:29:48 +01001372 kref_read(&obj_request->kref));
Alex Elderbf0d5f502012-11-22 00:00:08 -06001373 kref_put(&obj_request->kref, rbd_obj_request_destroy);
1374}
1375
Alex Elder0f2d5be2014-04-26 14:21:44 +04001376static void rbd_img_request_get(struct rbd_img_request *img_request)
1377{
1378 dout("%s: img %p (was %d)\n", __func__, img_request,
Peter Zijlstra2c935bc2016-11-14 17:29:48 +01001379 kref_read(&img_request->kref));
Alex Elder0f2d5be2014-04-26 14:21:44 +04001380 kref_get(&img_request->kref);
1381}
1382
Alex Eldere93f3152013-05-08 22:50:04 -05001383static bool img_request_child_test(struct rbd_img_request *img_request);
1384static void rbd_parent_request_destroy(struct kref *kref);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001385static void rbd_img_request_destroy(struct kref *kref);
1386static void rbd_img_request_put(struct rbd_img_request *img_request)
1387{
1388 rbd_assert(img_request != NULL);
Alex Elder37206ee2013-02-20 17:32:08 -06001389 dout("%s: img %p (was %d)\n", __func__, img_request,
Peter Zijlstra2c935bc2016-11-14 17:29:48 +01001390 kref_read(&img_request->kref));
Alex Eldere93f3152013-05-08 22:50:04 -05001391 if (img_request_child_test(img_request))
1392 kref_put(&img_request->kref, rbd_parent_request_destroy);
1393 else
1394 kref_put(&img_request->kref, rbd_img_request_destroy);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001395}
1396
1397static inline void rbd_img_obj_request_add(struct rbd_img_request *img_request,
1398 struct rbd_obj_request *obj_request)
1399{
Alex Elder25dcf952013-01-25 17:08:55 -06001400 rbd_assert(obj_request->img_request == NULL);
1401
Alex Elderb155e862013-04-15 14:50:37 -05001402 /* Image request now owns object's original reference */
Alex Elderbf0d5f502012-11-22 00:00:08 -06001403 obj_request->img_request = img_request;
Alex Elder25dcf952013-01-25 17:08:55 -06001404 obj_request->which = img_request->obj_request_count;
Alex Elder6365d332013-02-11 12:33:24 -06001405 rbd_assert(!obj_request_img_data_test(obj_request));
1406 obj_request_img_data_set(obj_request);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001407 rbd_assert(obj_request->which != BAD_WHICH);
Alex Elder25dcf952013-01-25 17:08:55 -06001408 img_request->obj_request_count++;
1409 list_add_tail(&obj_request->links, &img_request->obj_requests);
Alex Elder37206ee2013-02-20 17:32:08 -06001410 dout("%s: img %p obj %p w=%u\n", __func__, img_request, obj_request,
1411 obj_request->which);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001412}
1413
1414static inline void rbd_img_obj_request_del(struct rbd_img_request *img_request,
1415 struct rbd_obj_request *obj_request)
1416{
1417 rbd_assert(obj_request->which != BAD_WHICH);
Alex Elder25dcf952013-01-25 17:08:55 -06001418
Alex Elder37206ee2013-02-20 17:32:08 -06001419 dout("%s: img %p obj %p w=%u\n", __func__, img_request, obj_request,
1420 obj_request->which);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001421 list_del(&obj_request->links);
Alex Elder25dcf952013-01-25 17:08:55 -06001422 rbd_assert(img_request->obj_request_count > 0);
1423 img_request->obj_request_count--;
1424 rbd_assert(obj_request->which == img_request->obj_request_count);
1425 obj_request->which = BAD_WHICH;
Alex Elder6365d332013-02-11 12:33:24 -06001426 rbd_assert(obj_request_img_data_test(obj_request));
Alex Elderbf0d5f502012-11-22 00:00:08 -06001427 rbd_assert(obj_request->img_request == img_request);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001428 obj_request->img_request = NULL;
Alex Elder25dcf952013-01-25 17:08:55 -06001429 obj_request->callback = NULL;
Alex Elderbf0d5f502012-11-22 00:00:08 -06001430 rbd_obj_request_put(obj_request);
1431}
1432
1433static bool obj_request_type_valid(enum obj_request_type type)
1434{
1435 switch (type) {
Alex Elder9969ebc2013-01-18 12:31:10 -06001436 case OBJ_REQUEST_NODATA:
Alex Elderbf0d5f502012-11-22 00:00:08 -06001437 case OBJ_REQUEST_BIO:
Ilya Dryomov7e07efb2018-01-20 10:30:11 +01001438 case OBJ_REQUEST_BVECS:
Alex Elderbf0d5f502012-11-22 00:00:08 -06001439 return true;
1440 default:
1441 return false;
1442 }
1443}
1444
Ilya Dryomov4a17dad2016-09-13 21:08:10 +02001445static void rbd_img_obj_callback(struct rbd_obj_request *obj_request);
1446
Ilya Dryomov980917f2016-09-12 18:59:42 +02001447static void rbd_obj_request_submit(struct rbd_obj_request *obj_request)
Alex Elderbf0d5f502012-11-22 00:00:08 -06001448{
Ilya Dryomov980917f2016-09-12 18:59:42 +02001449 struct ceph_osd_request *osd_req = obj_request->osd_req;
1450
Ilya Dryomova90bb0c2017-01-25 18:16:23 +01001451 dout("%s %p object_no %016llx %llu~%llu osd_req %p\n", __func__,
1452 obj_request, obj_request->object_no, obj_request->offset,
Ilya Dryomov67e2b652017-01-25 18:16:22 +01001453 obj_request->length, osd_req);
Ilya Dryomov4a17dad2016-09-13 21:08:10 +02001454 if (obj_request_img_data_test(obj_request)) {
1455 WARN_ON(obj_request->callback != rbd_img_obj_callback);
1456 rbd_img_request_get(obj_request->img_request);
1457 }
Ilya Dryomov980917f2016-09-12 18:59:42 +02001458 ceph_osdc_start_request(osd_req->r_osdc, osd_req, false);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001459}
1460
1461static void rbd_img_request_complete(struct rbd_img_request *img_request)
1462{
Alex Elder55f27e02013-04-10 12:34:25 -05001463
Alex Elder37206ee2013-02-20 17:32:08 -06001464 dout("%s: img %p\n", __func__, img_request);
Alex Elder55f27e02013-04-10 12:34:25 -05001465
1466 /*
1467 * If no error occurred, compute the aggregate transfer
1468 * count for the image request. We could instead use
1469 * atomic64_cmpxchg() to update it as each object request
1470	 * completes; it's not clear offhand which way is better.
1471 */
1472 if (!img_request->result) {
1473 struct rbd_obj_request *obj_request;
1474 u64 xferred = 0;
1475
1476 for_each_obj_request(img_request, obj_request)
1477 xferred += obj_request->xferred;
1478 img_request->xferred = xferred;
1479 }
1480
Alex Elderbf0d5f502012-11-22 00:00:08 -06001481 if (img_request->callback)
1482 img_request->callback(img_request);
1483 else
1484 rbd_img_request_put(img_request);
1485}
1486
Alex Elder0c425242013-02-08 09:55:49 -06001487/*
1488 * The default/initial value for all image request flags is 0. Each
1489 * is conditionally set to 1 at image request initialization time
1490	 * and currently never changes thereafter.
1491 */
Alex Elder9849e982013-01-24 16:13:36 -06001492static void img_request_child_set(struct rbd_img_request *img_request)
1493{
1494 set_bit(IMG_REQ_CHILD, &img_request->flags);
1495 smp_mb();
1496}
1497
Alex Eldere93f3152013-05-08 22:50:04 -05001498static void img_request_child_clear(struct rbd_img_request *img_request)
1499{
1500 clear_bit(IMG_REQ_CHILD, &img_request->flags);
1501 smp_mb();
1502}
1503
Alex Elder9849e982013-01-24 16:13:36 -06001504static bool img_request_child_test(struct rbd_img_request *img_request)
1505{
1506 smp_mb();
1507 return test_bit(IMG_REQ_CHILD, &img_request->flags) != 0;
1508}
1509
Alex Elderd0b2e942013-01-24 16:13:36 -06001510static void img_request_layered_set(struct rbd_img_request *img_request)
1511{
1512 set_bit(IMG_REQ_LAYERED, &img_request->flags);
1513 smp_mb();
1514}
1515
Alex Eldera2acd002013-05-08 22:50:04 -05001516static void img_request_layered_clear(struct rbd_img_request *img_request)
1517{
1518 clear_bit(IMG_REQ_LAYERED, &img_request->flags);
1519 smp_mb();
1520}
1521
Alex Elderd0b2e942013-01-24 16:13:36 -06001522static bool img_request_layered_test(struct rbd_img_request *img_request)
1523{
1524 smp_mb();
1525 return test_bit(IMG_REQ_LAYERED, &img_request->flags) != 0;
1526}
1527
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001528static bool rbd_obj_is_entire(struct rbd_obj_request *obj_req)
1529{
1530 struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
1531
1532 return !obj_req->offset &&
1533 obj_req->length == rbd_dev->layout.object_size;
1534}
1535
1536static bool rbd_obj_is_tail(struct rbd_obj_request *obj_req)
1537{
1538 struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
1539
1540 return obj_req->offset + obj_req->length ==
1541 rbd_dev->layout.object_size;
1542}
1543
1544static bool rbd_img_is_write(struct rbd_img_request *img_req)
1545{
Ilya Dryomov9bb02482018-01-30 17:52:10 +01001546 switch (img_req->op_type) {
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001547 case OBJ_OP_READ:
1548 return false;
1549 case OBJ_OP_WRITE:
1550 case OBJ_OP_DISCARD:
1551 return true;
1552 default:
1553 rbd_assert(0);
1554 }
1555}
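/*
 * Note that OBJ_OP_DISCARD counts as a write here: discards modify
 * the image, so they carry a snapshot context (see
 * rbd_img_request_create()) and are formatted via
 * rbd_osd_req_format_write().
 */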
1556
Alex Elderbf0d5f502012-11-22 00:00:08 -06001557static void rbd_obj_request_complete(struct rbd_obj_request *obj_request)
1558{
Alex Elder37206ee2013-02-20 17:32:08 -06001559 dout("%s: obj %p cb %p\n", __func__, obj_request,
1560 obj_request->callback);
Ilya Dryomov2e584bc2018-01-15 17:24:51 +01001561 obj_request->callback(obj_request);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001562}
1563
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001564static void rbd_obj_handle_request(struct rbd_obj_request *obj_req);
1565
Ilya Dryomov85e084f2016-04-28 16:07:24 +02001566static void rbd_osd_req_callback(struct ceph_osd_request *osd_req)
Alex Elderbf0d5f502012-11-22 00:00:08 -06001567{
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001568 struct rbd_obj_request *obj_req = osd_req->r_priv;
Alex Elderbf0d5f502012-11-22 00:00:08 -06001569
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001570 dout("%s osd_req %p result %d for obj_req %p\n", __func__, osd_req,
1571 osd_req->r_result, obj_req);
1572 rbd_assert(osd_req == obj_req->osd_req);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001573
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001574 obj_req->result = osd_req->r_result < 0 ? osd_req->r_result : 0;
1575 if (!obj_req->result && !rbd_img_is_write(obj_req->img_request))
1576 obj_req->xferred = osd_req->r_result;
1577 else
1578 /*
1579 * Writes aren't allowed to return a data payload. In some
1580 * guarded write cases (e.g. stat + zero on an empty object)
1581 * a stat response makes it through, but we don't care.
1582 */
1583 obj_req->xferred = 0;
Alex Elderbf0d5f502012-11-22 00:00:08 -06001584
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001585 rbd_obj_handle_request(obj_req);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001586}
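/*
 * Completion flow, sketched: the OSD client invokes the callback
 * above, which normalizes r_result into obj_req->result and
 * obj_req->xferred and hands the request to rbd_obj_handle_request().
 * That runs the per-op state machine (__rbd_obj_handle_request())
 * and, once the object request is complete, fires
 * obj_request->callback (rbd_img_obj_callback() for image I/O).
 */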
1587
Alex Elder9d4df012013-04-19 15:34:50 -05001588static void rbd_osd_req_format_read(struct rbd_obj_request *obj_request)
Alex Elder430c28c2013-04-03 21:32:51 -05001589{
Alex Elder8c042b02013-04-03 01:28:58 -05001590 struct ceph_osd_request *osd_req = obj_request->osd_req;
Alex Elder430c28c2013-04-03 21:32:51 -05001591
Ilya Dryomov7c848832016-09-15 17:56:39 +02001592 rbd_assert(obj_request_img_data_test(obj_request));
Ilya Dryomova162b302018-01-30 17:52:10 +01001593 osd_req->r_flags = CEPH_OSD_FLAG_READ;
Ilya Dryomov7c848832016-09-15 17:56:39 +02001594 osd_req->r_snapid = obj_request->img_request->snap_id;
Alex Elder9d4df012013-04-19 15:34:50 -05001595}
1596
1597static void rbd_osd_req_format_write(struct rbd_obj_request *obj_request)
1598{
Alex Elder9d4df012013-04-19 15:34:50 -05001599 struct ceph_osd_request *osd_req = obj_request->osd_req;
Alex Elder9d4df012013-04-19 15:34:50 -05001600
Ilya Dryomova162b302018-01-30 17:52:10 +01001601 osd_req->r_flags = CEPH_OSD_FLAG_WRITE;
Deepa Dinamani1134e092017-05-08 15:59:19 -07001602 ktime_get_real_ts(&osd_req->r_mtime);
Ilya Dryomovbb873b5392016-05-26 00:29:52 +02001603 osd_req->r_data_offset = obj_request->offset;
Alex Elder430c28c2013-04-03 21:32:51 -05001604}
1605
Ilya Dryomovbc812072017-01-25 18:16:23 +01001606static struct ceph_osd_request *
Ilya Dryomova162b302018-01-30 17:52:10 +01001607rbd_osd_req_create(struct rbd_obj_request *obj_req, unsigned int num_ops)
Ilya Dryomovbc812072017-01-25 18:16:23 +01001608{
Ilya Dryomova162b302018-01-30 17:52:10 +01001609 struct rbd_img_request *img_req = obj_req->img_request;
1610 struct rbd_device *rbd_dev = img_req->rbd_dev;
Ilya Dryomovbc812072017-01-25 18:16:23 +01001611 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
1612 struct ceph_osd_request *req;
Ilya Dryomova90bb0c2017-01-25 18:16:23 +01001613 const char *name_format = rbd_dev->image_format == 1 ?
1614 RBD_V1_DATA_FORMAT : RBD_V2_DATA_FORMAT;
Ilya Dryomovbc812072017-01-25 18:16:23 +01001615
Ilya Dryomova162b302018-01-30 17:52:10 +01001616 req = ceph_osdc_alloc_request(osdc,
1617 (rbd_img_is_write(img_req) ? img_req->snapc : NULL),
1618 num_ops, false, GFP_NOIO);
Ilya Dryomovbc812072017-01-25 18:16:23 +01001619 if (!req)
1620 return NULL;
1621
Ilya Dryomovbc812072017-01-25 18:16:23 +01001622 req->r_callback = rbd_osd_req_callback;
Ilya Dryomova162b302018-01-30 17:52:10 +01001623 req->r_priv = obj_req;
Ilya Dryomovbc812072017-01-25 18:16:23 +01001624
1625 req->r_base_oloc.pool = rbd_dev->layout.pool_id;
Ilya Dryomova90bb0c2017-01-25 18:16:23 +01001626 if (ceph_oid_aprintf(&req->r_base_oid, GFP_NOIO, name_format,
Ilya Dryomova162b302018-01-30 17:52:10 +01001627 rbd_dev->header.object_prefix, obj_req->object_no))
Ilya Dryomovbc812072017-01-25 18:16:23 +01001628 goto err_req;
1629
1630 if (ceph_osdc_alloc_messages(req, GFP_NOIO))
1631 goto err_req;
1632
1633 return req;
1634
1635err_req:
1636 ceph_osdc_put_request(req);
1637 return NULL;
1638}
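/*
 * Object name sketch (digit counts assume the usual
 * RBD_V1_DATA_FORMAT/RBD_V2_DATA_FORMAT strings): for object_no 5, a
 * format 1 image yields "<object_prefix>.000000000005" (12 hex
 * digits) and a format 2 image yields
 * "<object_prefix>.0000000000000005" (16 hex digits), where a
 * format 2 object_prefix looks like "rbd_data.<image id>".
 */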
1639
Alex Elderbf0d5f502012-11-22 00:00:08 -06001640static void rbd_osd_req_destroy(struct ceph_osd_request *osd_req)
1641{
1642 ceph_osdc_put_request(osd_req);
1643}
1644
Ilya Dryomov6c696d82017-01-25 18:16:23 +01001645static struct rbd_obj_request *
1646rbd_obj_request_create(enum obj_request_type type)
Alex Elderbf0d5f502012-11-22 00:00:08 -06001647{
1648 struct rbd_obj_request *obj_request;
Alex Elderbf0d5f502012-11-22 00:00:08 -06001649
1650 rbd_assert(obj_request_type_valid(type));
1651
Ilya Dryomov5a60e872015-06-24 17:24:33 +03001652 obj_request = kmem_cache_zalloc(rbd_obj_request_cache, GFP_NOIO);
Ilya Dryomov6c696d82017-01-25 18:16:23 +01001653 if (!obj_request)
Alex Elderf907ad52013-05-01 12:43:03 -05001654 return NULL;
Alex Elderf907ad52013-05-01 12:43:03 -05001655
Alex Elderbf0d5f502012-11-22 00:00:08 -06001656 obj_request->which = BAD_WHICH;
1657 obj_request->type = type;
1658 INIT_LIST_HEAD(&obj_request->links);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001659 kref_init(&obj_request->kref);
1660
Ilya Dryomov67e2b652017-01-25 18:16:22 +01001661 dout("%s %p\n", __func__, obj_request);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001662 return obj_request;
1663}
1664
1665static void rbd_obj_request_destroy(struct kref *kref)
1666{
1667 struct rbd_obj_request *obj_request;
Ilya Dryomov7e07efb2018-01-20 10:30:11 +01001668 u32 i;
Alex Elderbf0d5f502012-11-22 00:00:08 -06001669
1670 obj_request = container_of(kref, struct rbd_obj_request, kref);
1671
Alex Elder37206ee2013-02-20 17:32:08 -06001672 dout("%s: obj %p\n", __func__, obj_request);
1673
Alex Elderbf0d5f502012-11-22 00:00:08 -06001674 rbd_assert(obj_request->img_request == NULL);
1675 rbd_assert(obj_request->which == BAD_WHICH);
1676
1677 if (obj_request->osd_req)
1678 rbd_osd_req_destroy(obj_request->osd_req);
1679
Alex Elderbf0d5f502012-11-22 00:00:08 -06001680 switch (obj_request->type) {
Alex Elder9969ebc2013-01-18 12:31:10 -06001681 case OBJ_REQUEST_NODATA:
Alex Elderbf0d5f502012-11-22 00:00:08 -06001682 case OBJ_REQUEST_BIO:
Ilya Dryomov7e07efb2018-01-20 10:30:11 +01001683 case OBJ_REQUEST_BVECS:
Ilya Dryomov5359a172018-01-20 10:30:10 +01001684 break; /* Nothing to do */
Ilya Dryomov7e07efb2018-01-20 10:30:11 +01001685 default:
1686 rbd_assert(0);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001687 }
1688
Ilya Dryomov7e07efb2018-01-20 10:30:11 +01001689 if (obj_request->copyup_bvecs) {
1690 for (i = 0; i < obj_request->copyup_bvec_count; i++) {
1691 if (obj_request->copyup_bvecs[i].bv_page)
1692 __free_page(obj_request->copyup_bvecs[i].bv_page);
1693 }
1694 kfree(obj_request->copyup_bvecs);
1695 }
Ilya Dryomovf9dcbc42018-01-20 10:30:11 +01001696
Alex Elder868311b2013-05-01 12:43:03 -05001697 kmem_cache_free(rbd_obj_request_cache, obj_request);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001698}
1699
Alex Elderfb65d2282013-05-08 22:50:04 -05001700/* It's OK to call this for a device with no parent */
1701
1702static void rbd_spec_put(struct rbd_spec *spec);
1703static void rbd_dev_unparent(struct rbd_device *rbd_dev)
1704{
1705 rbd_dev_remove_parent(rbd_dev);
1706 rbd_spec_put(rbd_dev->parent_spec);
1707 rbd_dev->parent_spec = NULL;
1708 rbd_dev->parent_overlap = 0;
1709}
1710
Alex Elderbf0d5f502012-11-22 00:00:08 -06001711/*
Alex Eldera2acd002013-05-08 22:50:04 -05001712 * Parent image reference counting is used to determine when an
1713 * image's parent fields can be safely torn down--after there are no
1714 * more in-flight requests to the parent image. When the last
1715 * reference is dropped, cleaning them up is safe.
1716 */
1717static void rbd_dev_parent_put(struct rbd_device *rbd_dev)
1718{
1719 int counter;
1720
1721 if (!rbd_dev->parent_spec)
1722 return;
1723
1724 counter = atomic_dec_return_safe(&rbd_dev->parent_ref);
1725 if (counter > 0)
1726 return;
1727
1728 /* Last reference; clean up parent data structures */
1729
1730 if (!counter)
1731 rbd_dev_unparent(rbd_dev);
1732 else
Ilya Dryomov9584d502014-07-11 12:11:20 +04001733 rbd_warn(rbd_dev, "parent reference underflow");
Alex Eldera2acd002013-05-08 22:50:04 -05001734}
1735
1736/*
1737 * If an image has a non-zero parent overlap, get a reference to its
1738 * parent.
1739 *
1740 * Returns true if the rbd device has a parent with a non-zero
1741 * overlap and a reference for it was successfully taken, or
1742 * false otherwise.
1743 */
1744static bool rbd_dev_parent_get(struct rbd_device *rbd_dev)
1745{
Ilya Dryomovae43e9d2015-01-19 18:13:43 +03001746 int counter = 0;
Alex Eldera2acd002013-05-08 22:50:04 -05001747
1748 if (!rbd_dev->parent_spec)
1749 return false;
1750
Ilya Dryomovae43e9d2015-01-19 18:13:43 +03001751 down_read(&rbd_dev->header_rwsem);
1752 if (rbd_dev->parent_overlap)
1753 counter = atomic_inc_return_safe(&rbd_dev->parent_ref);
1754 up_read(&rbd_dev->header_rwsem);
Alex Eldera2acd002013-05-08 22:50:04 -05001755
1756 if (counter < 0)
Ilya Dryomov9584d502014-07-11 12:11:20 +04001757 rbd_warn(rbd_dev, "parent reference overflow");
Alex Eldera2acd002013-05-08 22:50:04 -05001758
Ilya Dryomovae43e9d2015-01-19 18:13:43 +03001759 return counter > 0;
Alex Eldera2acd002013-05-08 22:50:04 -05001760}
1761
Alex Elderbf0d5f502012-11-22 00:00:08 -06001762/*
1763 * Caller is responsible for filling in the list of object requests
1764 * that comprises the image request, and the Linux request pointer
1765 * (if there is one).
1766 */
Alex Eldercc344fa2013-02-19 12:25:56 -06001767static struct rbd_img_request *rbd_img_request_create(
1768 struct rbd_device *rbd_dev,
Alex Elderbf0d5f502012-11-22 00:00:08 -06001769 u64 offset, u64 length,
Guangliang Zhao6d2940c2014-03-13 11:21:35 +08001770 enum obj_operation_type op_type,
Josh Durgin4e752f02014-04-08 11:12:11 -07001771 struct ceph_snap_context *snapc)
Alex Elderbf0d5f502012-11-22 00:00:08 -06001772{
1773 struct rbd_img_request *img_request;
Alex Elderbf0d5f502012-11-22 00:00:08 -06001774
Ilya Dryomova0c58952018-01-22 16:03:06 +01001775 img_request = kmem_cache_zalloc(rbd_img_request_cache, GFP_NOIO);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001776 if (!img_request)
1777 return NULL;
1778
Alex Elderbf0d5f502012-11-22 00:00:08 -06001779 img_request->rbd_dev = rbd_dev;
Ilya Dryomov9bb02482018-01-30 17:52:10 +01001780 img_request->op_type = op_type;
Alex Elderbf0d5f502012-11-22 00:00:08 -06001781 img_request->offset = offset;
1782 img_request->length = length;
Ilya Dryomov9bb02482018-01-30 17:52:10 +01001783 if (!rbd_img_is_write(img_request))
Alex Elderbf0d5f502012-11-22 00:00:08 -06001784 img_request->snap_id = rbd_dev->spec->snap_id;
Ilya Dryomov9bb02482018-01-30 17:52:10 +01001785 else
1786 img_request->snapc = snapc;
1787
Alex Eldera2acd002013-05-08 22:50:04 -05001788 if (rbd_dev_parent_get(rbd_dev))
Alex Elderd0b2e942013-01-24 16:13:36 -06001789 img_request_layered_set(img_request);
Ilya Dryomova0c58952018-01-22 16:03:06 +01001790
Alex Elderbf0d5f502012-11-22 00:00:08 -06001791 spin_lock_init(&img_request->completion_lock);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001792 INIT_LIST_HEAD(&img_request->obj_requests);
1793 kref_init(&img_request->kref);
1794
Alex Elder37206ee2013-02-20 17:32:08 -06001795 dout("%s: rbd_dev %p %s %llu/%llu -> img %p\n", __func__, rbd_dev,
Guangliang Zhao6d2940c2014-03-13 11:21:35 +08001796 obj_op_name(op_type), offset, length, img_request);
Alex Elder37206ee2013-02-20 17:32:08 -06001797
Alex Elderbf0d5f502012-11-22 00:00:08 -06001798 return img_request;
1799}
1800
1801static void rbd_img_request_destroy(struct kref *kref)
1802{
1803 struct rbd_img_request *img_request;
1804 struct rbd_obj_request *obj_request;
1805 struct rbd_obj_request *next_obj_request;
1806
1807 img_request = container_of(kref, struct rbd_img_request, kref);
1808
Alex Elder37206ee2013-02-20 17:32:08 -06001809 dout("%s: img %p\n", __func__, img_request);
1810
Alex Elderbf0d5f502012-11-22 00:00:08 -06001811 for_each_obj_request_safe(img_request, obj_request, next_obj_request)
1812 rbd_img_obj_request_del(img_request, obj_request);
Alex Elder25dcf952013-01-25 17:08:55 -06001813 rbd_assert(img_request->obj_request_count == 0);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001814
Alex Eldera2acd002013-05-08 22:50:04 -05001815 if (img_request_layered_test(img_request)) {
1816 img_request_layered_clear(img_request);
1817 rbd_dev_parent_put(img_request->rbd_dev);
1818 }
1819
Ilya Dryomov9bb02482018-01-30 17:52:10 +01001820 if (rbd_img_is_write(img_request))
Alex Elder812164f82013-04-30 00:44:32 -05001821 ceph_put_snap_context(img_request->snapc);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001822
Alex Elder1c2a9df2013-05-01 12:43:03 -05001823 kmem_cache_free(rbd_img_request_cache, img_request);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001824}
1825
Alex Eldere93f3152013-05-08 22:50:04 -05001826static struct rbd_img_request *rbd_parent_request_create(
1827 struct rbd_obj_request *obj_request,
1828 u64 img_offset, u64 length)
1829{
1830 struct rbd_img_request *parent_request;
1831 struct rbd_device *rbd_dev;
1832
1833 rbd_assert(obj_request->img_request);
1834 rbd_dev = obj_request->img_request->rbd_dev;
1835
Josh Durgin4e752f02014-04-08 11:12:11 -07001836 parent_request = rbd_img_request_create(rbd_dev->parent, img_offset,
Guangliang Zhao6d2940c2014-03-13 11:21:35 +08001837 length, OBJ_OP_READ, NULL);
Alex Eldere93f3152013-05-08 22:50:04 -05001838 if (!parent_request)
1839 return NULL;
1840
1841 img_request_child_set(parent_request);
1842 rbd_obj_request_get(obj_request);
1843 parent_request->obj_request = obj_request;
1844
1845 return parent_request;
1846}
1847
1848static void rbd_parent_request_destroy(struct kref *kref)
1849{
1850 struct rbd_img_request *parent_request;
1851 struct rbd_obj_request *orig_request;
1852
1853 parent_request = container_of(kref, struct rbd_img_request, kref);
1854 orig_request = parent_request->obj_request;
1855
1856 parent_request->obj_request = NULL;
1857 rbd_obj_request_put(orig_request);
1858 img_request_child_clear(parent_request);
1859
1860 rbd_img_request_destroy(kref);
1861}
1862
Alex Elder12178572013-02-08 09:55:49 -06001863static bool rbd_img_obj_end_request(struct rbd_obj_request *obj_request)
1864{
Alex Elder6365d332013-02-11 12:33:24 -06001865 struct rbd_img_request *img_request;
Alex Elder12178572013-02-08 09:55:49 -06001866 unsigned int xferred;
1867 int result;
Alex Elder8b3e1a52013-01-24 16:13:36 -06001868 bool more;
Alex Elder12178572013-02-08 09:55:49 -06001869
Alex Elder6365d332013-02-11 12:33:24 -06001870 rbd_assert(obj_request_img_data_test(obj_request));
1871 img_request = obj_request->img_request;
1872
Alex Elder12178572013-02-08 09:55:49 -06001873 rbd_assert(obj_request->xferred <= (u64)UINT_MAX);
1874 xferred = (unsigned int)obj_request->xferred;
1875 result = obj_request->result;
1876 if (result) {
1877 struct rbd_device *rbd_dev = img_request->rbd_dev;
1878
Ilya Dryomov9584d502014-07-11 12:11:20 +04001879 rbd_warn(rbd_dev, "%s %llx at %llx (%llx)",
Ilya Dryomov9bb02482018-01-30 17:52:10 +01001880 obj_op_name(img_request->op_type), obj_request->length,
Guangliang Zhao6d2940c2014-03-13 11:21:35 +08001881 obj_request->img_offset, obj_request->offset);
Ilya Dryomov9584d502014-07-11 12:11:20 +04001882 rbd_warn(rbd_dev, " result %d xferred %x",
Alex Elder12178572013-02-08 09:55:49 -06001883 result, xferred);
1884 if (!img_request->result)
1885 img_request->result = result;
Ilya Dryomov082a75d2015-04-25 15:56:15 +03001886 /*
1887 * Need to end I/O on the entire obj_request worth of
1888 * bytes in case of error.
1889 */
1890 xferred = obj_request->length;
Alex Elder12178572013-02-08 09:55:49 -06001891 }
1892
Alex Elder8b3e1a52013-01-24 16:13:36 -06001893 if (img_request_child_test(img_request)) {
1894 rbd_assert(img_request->obj_request != NULL);
1895 more = obj_request->which < img_request->obj_request_count - 1;
1896 } else {
Christoph Hellwig2a842ac2017-06-03 09:38:04 +02001897 blk_status_t status = errno_to_blk_status(result);
1898
Alex Elder8b3e1a52013-01-24 16:13:36 -06001899 rbd_assert(img_request->rq != NULL);
Christoph Hellwig7ad18af2015-01-13 17:20:04 +01001900
Christoph Hellwig2a842ac2017-06-03 09:38:04 +02001901 more = blk_update_request(img_request->rq, status, xferred);
Christoph Hellwig7ad18af2015-01-13 17:20:04 +01001902 if (!more)
Christoph Hellwig2a842ac2017-06-03 09:38:04 +02001903 __blk_mq_end_request(img_request->rq, status);
Alex Elder8b3e1a52013-01-24 16:13:36 -06001904 }
1905
1906 return more;
Alex Elder12178572013-02-08 09:55:49 -06001907}
1908
Alex Elder21692382013-04-05 01:27:12 -05001909static void rbd_img_obj_callback(struct rbd_obj_request *obj_request)
1910{
1911 struct rbd_img_request *img_request;
1912 u32 which = obj_request->which;
1913 bool more = true;
1914
Alex Elder6365d332013-02-11 12:33:24 -06001915 rbd_assert(obj_request_img_data_test(obj_request));
Alex Elder21692382013-04-05 01:27:12 -05001916 img_request = obj_request->img_request;
1917
1918 dout("%s: img %p obj %p\n", __func__, img_request, obj_request);
1919 rbd_assert(img_request != NULL);
Alex Elder21692382013-04-05 01:27:12 -05001920 rbd_assert(img_request->obj_request_count > 0);
1921 rbd_assert(which != BAD_WHICH);
1922 rbd_assert(which < img_request->obj_request_count);
Alex Elder21692382013-04-05 01:27:12 -05001923
1924 spin_lock_irq(&img_request->completion_lock);
1925 if (which != img_request->next_completion)
1926 goto out;
1927
1928 for_each_obj_request_from(img_request, obj_request) {
Alex Elder21692382013-04-05 01:27:12 -05001929 rbd_assert(more);
1930 rbd_assert(which < img_request->obj_request_count);
1931
1932 if (!obj_request_done_test(obj_request))
1933 break;
Alex Elder12178572013-02-08 09:55:49 -06001934 more = rbd_img_obj_end_request(obj_request);
Alex Elder21692382013-04-05 01:27:12 -05001935 which++;
1936 }
1937
1938 rbd_assert(more ^ (which == img_request->obj_request_count));
1939 img_request->next_completion = which;
1940out:
1941 spin_unlock_irq(&img_request->completion_lock);
Alex Elder0f2d5be2014-04-26 14:21:44 +04001942 rbd_img_request_put(img_request);
Alex Elder21692382013-04-05 01:27:12 -05001943
1944 if (!more)
1945 rbd_img_request_complete(img_request);
1946}
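/*
 * Note the strict ordering above: next_completion only advances past
 * object requests that are already done, so the block layer sees the
 * image request's bytes completed in offset order even when OSD
 * replies arrive out of order.
 */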
1947
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001948static void rbd_osd_req_setup_data(struct rbd_obj_request *obj_req, u32 which)
1949{
1950 switch (obj_req->type) {
1951 case OBJ_REQUEST_BIO:
1952 osd_req_op_extent_osd_data_bio(obj_req->osd_req, which,
1953 &obj_req->bio_pos,
1954 obj_req->length);
1955 break;
1956 case OBJ_REQUEST_BVECS:
1957 rbd_assert(obj_req->bvec_pos.iter.bi_size ==
1958 obj_req->length);
1959 osd_req_op_extent_osd_data_bvec_pos(obj_req->osd_req, which,
1960 &obj_req->bvec_pos);
1961 break;
1962 default:
1963 rbd_assert(0);
1964 }
1965}
1966
1967static int rbd_obj_setup_read(struct rbd_obj_request *obj_req)
1968{
Ilya Dryomova162b302018-01-30 17:52:10 +01001969 obj_req->osd_req = rbd_osd_req_create(obj_req, 1);
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001970 if (!obj_req->osd_req)
1971 return -ENOMEM;
1972
1973 osd_req_op_extent_init(obj_req->osd_req, 0, CEPH_OSD_OP_READ,
1974 obj_req->offset, obj_req->length, 0, 0);
1975 rbd_osd_req_setup_data(obj_req, 0);
1976
1977 rbd_osd_req_format_read(obj_req);
1978 return 0;
1979}
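/*
 * A read is thus always a single CEPH_OSD_OP_READ with the request's
 * bio/bvec position attached as the data buffer; snapshot reads only
 * differ in the r_snapid set by rbd_osd_req_format_read().
 */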
1980
1981static int __rbd_obj_setup_stat(struct rbd_obj_request *obj_req,
1982 unsigned int which)
1983{
1984 struct page **pages;
1985
1986 /*
1987 * The response data for a STAT call consists of:
1988 * le64 length;
1989 * struct {
1990 * le32 tv_sec;
1991 * le32 tv_nsec;
1992 * } mtime;
1993 */
1994 pages = ceph_alloc_page_vector(1, GFP_NOIO);
1995 if (IS_ERR(pages))
1996 return PTR_ERR(pages);
1997
1998 osd_req_op_init(obj_req->osd_req, which, CEPH_OSD_OP_STAT, 0);
1999 osd_req_op_raw_data_in_pages(obj_req->osd_req, which, pages,
2000 8 + sizeof(struct ceph_timespec),
2001 0, false, true);
2002 return 0;
2003}
2004
2005static void __rbd_obj_setup_write(struct rbd_obj_request *obj_req,
2006 unsigned int which)
2007{
2008 struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
2009 u16 opcode;
2010
2011 osd_req_op_alloc_hint_init(obj_req->osd_req, which++,
2012 rbd_dev->layout.object_size,
2013 rbd_dev->layout.object_size);
2014
2015 if (rbd_obj_is_entire(obj_req))
2016 opcode = CEPH_OSD_OP_WRITEFULL;
2017 else
2018 opcode = CEPH_OSD_OP_WRITE;
2019
2020 osd_req_op_extent_init(obj_req->osd_req, which, opcode,
2021 obj_req->offset, obj_req->length, 0, 0);
2022 rbd_osd_req_setup_data(obj_req, which++);
2023
2024 rbd_assert(which == obj_req->osd_req->r_num_ops);
2025 rbd_osd_req_format_write(obj_req);
2026}
2027
2028static int rbd_obj_setup_write(struct rbd_obj_request *obj_req)
2029{
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002030 unsigned int num_osd_ops, which = 0;
2031 int ret;
2032
2033 if (obj_request_overlaps_parent(obj_req)) {
2034 obj_req->write_state = RBD_OBJ_WRITE_GUARD;
2035 num_osd_ops = 3; /* stat + setallochint + write/writefull */
2036 } else {
2037 obj_req->write_state = RBD_OBJ_WRITE_FLAT;
2038 num_osd_ops = 2; /* setallochint + write/writefull */
2039 }
2040
Ilya Dryomova162b302018-01-30 17:52:10 +01002041 obj_req->osd_req = rbd_osd_req_create(obj_req, num_osd_ops);
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002042 if (!obj_req->osd_req)
2043 return -ENOMEM;
2044
2045 if (obj_request_overlaps_parent(obj_req)) {
2046 ret = __rbd_obj_setup_stat(obj_req, which++);
2047 if (ret)
2048 return ret;
2049 }
2050
2051 __rbd_obj_setup_write(obj_req, which);
2052 return 0;
2053}
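/*
 * Resulting op vectors, sketched: a plain write is
 * [setallochint, write|writefull]; a guarded write (the object may
 * need a copyup from the parent first) is
 * [stat, setallochint, write|writefull].  WRITEFULL is chosen when
 * the request covers the entire object.
 */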
2054
2055static void __rbd_obj_setup_discard(struct rbd_obj_request *obj_req,
2056 unsigned int which)
2057{
2058 u16 opcode;
2059
2060 if (rbd_obj_is_entire(obj_req)) {
2061 if (obj_request_overlaps_parent(obj_req)) {
2062 opcode = CEPH_OSD_OP_TRUNCATE;
2063 } else {
2064 osd_req_op_init(obj_req->osd_req, which++,
2065 CEPH_OSD_OP_DELETE, 0);
2066 opcode = 0;
2067 }
2068 } else if (rbd_obj_is_tail(obj_req)) {
2069 opcode = CEPH_OSD_OP_TRUNCATE;
2070 } else {
2071 opcode = CEPH_OSD_OP_ZERO;
2072 }
2073
2074 if (opcode)
2075 osd_req_op_extent_init(obj_req->osd_req, which++, opcode,
2076 obj_req->offset, obj_req->length,
2077 0, 0);
2078
2079 rbd_assert(which == obj_req->osd_req->r_num_ops);
2080 rbd_osd_req_format_write(obj_req);
2081}
2082
2083static int rbd_obj_setup_discard(struct rbd_obj_request *obj_req)
2084{
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002085 unsigned int num_osd_ops, which = 0;
2086 int ret;
2087
2088 if (rbd_obj_is_entire(obj_req)) {
2089 obj_req->write_state = RBD_OBJ_WRITE_FLAT;
2090 num_osd_ops = 1; /* truncate/delete */
2091 } else {
2092 if (obj_request_overlaps_parent(obj_req)) {
2093 obj_req->write_state = RBD_OBJ_WRITE_GUARD;
2094 num_osd_ops = 2; /* stat + truncate/zero */
2095 } else {
2096 obj_req->write_state = RBD_OBJ_WRITE_FLAT;
2097 num_osd_ops = 1; /* truncate/zero */
2098 }
2099 }
2100
Ilya Dryomova162b302018-01-30 17:52:10 +01002101 obj_req->osd_req = rbd_osd_req_create(obj_req, num_osd_ops);
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002102 if (!obj_req->osd_req)
2103 return -ENOMEM;
2104
2105 if (!rbd_obj_is_entire(obj_req) &&
2106 obj_request_overlaps_parent(obj_req)) {
2107 ret = __rbd_obj_setup_stat(obj_req, which++);
2108 if (ret)
2109 return ret;
2110 }
2111
2112 __rbd_obj_setup_discard(obj_req, which);
2113 return 0;
2114}
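/*
 * Discard op selection, summarized:
 *
 *	entire object, no parent overlap	-> delete
 *	entire object, overlaps parent		-> truncate
 *	tail of object				-> truncate
 *	anywhere else				-> zero
 *
 * Only the guarded case (not entire, overlaps parent) gets a leading
 * stat op.
 */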
2115
2116/*
2117 * For each object request in @img_req, allocate an OSD request, add
2118 * individual OSD ops and prepare them for submission. The number of
2119 * OSD ops depends on op_type and the overlap point (if any).
2120 */
2121static int __rbd_img_fill_request(struct rbd_img_request *img_req)
2122{
2123 struct rbd_obj_request *obj_req;
2124 int ret;
2125
2126 for_each_obj_request(img_req, obj_req) {
Ilya Dryomov9bb02482018-01-30 17:52:10 +01002127 switch (img_req->op_type) {
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002128 case OBJ_OP_READ:
2129 ret = rbd_obj_setup_read(obj_req);
2130 break;
2131 case OBJ_OP_WRITE:
2132 ret = rbd_obj_setup_write(obj_req);
2133 break;
2134 case OBJ_OP_DISCARD:
2135 ret = rbd_obj_setup_discard(obj_req);
2136 break;
2137 default:
2138 rbd_assert(0);
2139 }
2140 if (ret)
2141 return ret;
2142 }
2143
2144 return 0;
2145}
2146
Josh Durgin3b434a2a2014-04-04 17:32:15 -07002147/*
Alex Elderf1a47392013-04-19 15:34:50 -05002148 * Split up an image request into one or more object requests, each
2149 * to a different object.  The "type" parameter indicates whether
2150 * "data_desc" points to a ceph_bio_iter (OBJ_REQUEST_BIO) or to a
2151 * ceph_bvec_iter (OBJ_REQUEST_BVECS).  In either case this
2152 * function assumes data_desc describes memory sufficient to hold
2153 * all data described by the image request.
2154 */
2155static int rbd_img_request_fill(struct rbd_img_request *img_request,
2156 enum obj_request_type type,
2157 void *data_desc)
Alex Elderbf0d5f502012-11-22 00:00:08 -06002158{
2159 struct rbd_device *rbd_dev = img_request->rbd_dev;
2160 struct rbd_obj_request *obj_request = NULL;
2161 struct rbd_obj_request *next_obj_request;
Ilya Dryomov5359a172018-01-20 10:30:10 +01002162 struct ceph_bio_iter bio_it;
Ilya Dryomov7e07efb2018-01-20 10:30:11 +01002163 struct ceph_bvec_iter bvec_it;
Alex Elder7da22d22013-01-24 16:13:36 -06002164 u64 img_offset;
Alex Elderbf0d5f502012-11-22 00:00:08 -06002165 u64 resid;
Alex Elderbf0d5f502012-11-22 00:00:08 -06002166
Alex Elderf1a47392013-04-19 15:34:50 -05002167 dout("%s: img %p type %d data_desc %p\n", __func__, img_request,
2168 (int)type, data_desc);
Alex Elder37206ee2013-02-20 17:32:08 -06002169
Alex Elder7da22d22013-01-24 16:13:36 -06002170 img_offset = img_request->offset;
Alex Elderbf0d5f502012-11-22 00:00:08 -06002171 resid = img_request->length;
Alex Elder4dda41d2013-02-20 21:59:33 -06002172 rbd_assert(resid > 0);
Alex Elderf1a47392013-04-19 15:34:50 -05002173
2174 if (type == OBJ_REQUEST_BIO) {
Ilya Dryomov5359a172018-01-20 10:30:10 +01002175 bio_it = *(struct ceph_bio_iter *)data_desc;
Kent Overstreet4f024f32013-10-11 15:44:27 -07002176 rbd_assert(img_offset ==
Ilya Dryomov5359a172018-01-20 10:30:10 +01002177 bio_it.iter.bi_sector << SECTOR_SHIFT);
Ilya Dryomov7e07efb2018-01-20 10:30:11 +01002178 } else if (type == OBJ_REQUEST_BVECS) {
2179 bvec_it = *(struct ceph_bvec_iter *)data_desc;
Alex Elderf1a47392013-04-19 15:34:50 -05002180 }
2181
Alex Elderbf0d5f502012-11-22 00:00:08 -06002182 while (resid) {
Ilya Dryomova90bb0c2017-01-25 18:16:23 +01002183 u64 object_no = img_offset >> rbd_dev->header.obj_order;
Ilya Dryomov67e2b652017-01-25 18:16:22 +01002184 u64 offset = rbd_segment_offset(rbd_dev, img_offset);
2185 u64 length = rbd_segment_length(rbd_dev, img_offset, resid);
Alex Elderbf0d5f502012-11-22 00:00:08 -06002186
Ilya Dryomov6c696d82017-01-25 18:16:23 +01002187 obj_request = rbd_obj_request_create(type);
Alex Elderbf0d5f502012-11-22 00:00:08 -06002188 if (!obj_request)
2189 goto out_unwind;
Ilya Dryomov62054da2014-03-04 11:57:17 +02002190
Ilya Dryomova90bb0c2017-01-25 18:16:23 +01002191 obj_request->object_no = object_no;
Ilya Dryomov67e2b652017-01-25 18:16:22 +01002192 obj_request->offset = offset;
2193 obj_request->length = length;
2194
Josh Durgin03507db2013-08-27 14:45:46 -07002195 /*
2196 * set obj_request->img_request before creating the
2197 * osd_request so that it gets the right snapc
2198 */
2199 rbd_img_obj_request_add(img_request, obj_request);
Alex Elderbf0d5f502012-11-22 00:00:08 -06002200
Alex Elderf1a47392013-04-19 15:34:50 -05002201 if (type == OBJ_REQUEST_BIO) {
Ilya Dryomov5359a172018-01-20 10:30:10 +01002202 obj_request->bio_pos = bio_it;
2203 ceph_bio_iter_advance(&bio_it, length);
Ilya Dryomov7e07efb2018-01-20 10:30:11 +01002204 } else if (type == OBJ_REQUEST_BVECS) {
2205 obj_request->bvec_pos = bvec_it;
2206 ceph_bvec_iter_shorten(&obj_request->bvec_pos, length);
2207 ceph_bvec_iter_advance(&bvec_it, length);
Alex Elderf1a47392013-04-19 15:34:50 -05002208 }
Alex Elderbf0d5f502012-11-22 00:00:08 -06002209
Alex Elder21692382013-04-05 01:27:12 -05002210 obj_request->callback = rbd_img_obj_callback;
Alex Elder7da22d22013-01-24 16:13:36 -06002211 obj_request->img_offset = img_offset;
Alex Elderbf0d5f502012-11-22 00:00:08 -06002212
Alex Elder7da22d22013-01-24 16:13:36 -06002213 img_offset += length;
Alex Elderbf0d5f502012-11-22 00:00:08 -06002214 resid -= length;
2215 }
2216
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002217 return __rbd_img_fill_request(img_request);
Alex Elderbf0d5f502012-11-22 00:00:08 -06002218
Alex Elderbf0d5f502012-11-22 00:00:08 -06002219out_unwind:
2220 for_each_obj_request_safe(img_request, obj_request, next_obj_request)
Ilya Dryomov42dd0372014-03-04 11:57:17 +02002221 rbd_img_obj_request_del(img_request, obj_request);
Alex Elderbf0d5f502012-11-22 00:00:08 -06002222
2223 return -ENOMEM;
2224}
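/*
 * Splitting example (illustrative, 4 MiB objects): an image request
 * for offset 3 MiB, length 3 MiB becomes two object requests:
 *
 *	object_no 0: offset 3 MiB, length 1 MiB (tail of object 0)
 *	object_no 1: offset 0,     length 2 MiB (head of object 1)
 *
 * The bio/bvec iterator is advanced by each object's length, so
 * consecutive object requests cover consecutive image data.
 */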
2225
Alex Elderbf0d5f502012-11-22 00:00:08 -06002226static int rbd_img_request_submit(struct rbd_img_request *img_request)
2227{
Alex Elderbf0d5f502012-11-22 00:00:08 -06002228 struct rbd_obj_request *obj_request;
Alex Elder46faeed2013-04-10 17:47:46 -05002229 struct rbd_obj_request *next_obj_request;
Ilya Dryomov663ae2c2016-05-16 13:18:57 +02002230 int ret = 0;
Alex Elderbf0d5f502012-11-22 00:00:08 -06002231
Alex Elder37206ee2013-02-20 17:32:08 -06002232 dout("%s: img %p\n", __func__, img_request);
Alex Elderbf0d5f502012-11-22 00:00:08 -06002233
Ilya Dryomov663ae2c2016-05-16 13:18:57 +02002234 rbd_img_request_get(img_request);
2235 for_each_obj_request_safe(img_request, obj_request, next_obj_request) {
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002236 rbd_obj_request_submit(obj_request);
Alex Elderbf0d5f502012-11-22 00:00:08 -06002237 }
2238
Ilya Dryomov663ae2c2016-05-16 13:18:57 +02002239 rbd_img_request_put(img_request);
2240 return ret;
Alex Elderbf0d5f502012-11-22 00:00:08 -06002241}
2242
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002243static void rbd_img_end_child_request(struct rbd_img_request *img_req);
2244
2245static int rbd_obj_read_from_parent(struct rbd_obj_request *obj_req,
2246 u64 img_offset, u32 bytes)
2247{
2248 struct rbd_img_request *img_req = obj_req->img_request;
2249 struct rbd_img_request *child_img_req;
2250 int ret;
2251
2252 child_img_req = rbd_parent_request_create(obj_req, img_offset, bytes);
2253 if (!child_img_req)
2254 return -ENOMEM;
2255
2256 child_img_req->callback = rbd_img_end_child_request;
2257
2258 if (!rbd_img_is_write(img_req)) {
2259 switch (obj_req->type) {
2260 case OBJ_REQUEST_BIO:
2261 ret = rbd_img_request_fill(child_img_req,
2262 OBJ_REQUEST_BIO,
2263 &obj_req->bio_pos);
2264 break;
2265 case OBJ_REQUEST_BVECS:
2266 ret = rbd_img_request_fill(child_img_req,
2267 OBJ_REQUEST_BVECS,
2268 &obj_req->bvec_pos);
2269 break;
2270 default:
2271 rbd_assert(0);
2272 }
2273 } else {
2274 struct ceph_bvec_iter it = {
2275 .bvecs = obj_req->copyup_bvecs,
2276 .iter = { .bi_size = bytes },
2277 };
2278
2279 ret = rbd_img_request_fill(child_img_req, OBJ_REQUEST_BVECS,
2280 &it);
2281 }
2282 if (ret) {
2283 rbd_img_request_put(child_img_req);
2284 return ret;
2285 }
2286
2287 rbd_img_request_submit(child_img_req);
2288 return 0;
2289}
2290
2291static bool rbd_obj_handle_read(struct rbd_obj_request *obj_req)
2292{
2293 struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
2294 int ret;
2295
2296 if (obj_req->result == -ENOENT &&
2297 obj_req->img_offset < rbd_dev->parent_overlap &&
2298 !obj_req->tried_parent) {
2299 u64 obj_overlap = min(obj_req->length,
2300 rbd_dev->parent_overlap - obj_req->img_offset);
2301
2302 obj_req->tried_parent = true;
2303 ret = rbd_obj_read_from_parent(obj_req, obj_req->img_offset,
2304 obj_overlap);
2305 if (ret) {
2306 obj_req->result = ret;
2307 return true;
2308 }
2309 return false;
2310 }
2311
2312 /*
2313 * -ENOENT means a hole in the image -- zero-fill the entire
2314 * length of the request. A short read also implies zero-fill
2315	 * to the end of the request.  In both cases we update the
2316	 * xferred count to indicate the whole request was satisfied.
2317 */
2318 if (obj_req->result == -ENOENT ||
2319 (!obj_req->result && obj_req->xferred < obj_req->length)) {
2320 rbd_assert(!obj_req->xferred || !obj_req->result);
2321 rbd_obj_zero_range(obj_req, obj_req->xferred,
2322 obj_req->length - obj_req->xferred);
2323 obj_req->result = 0;
2324 obj_req->xferred = obj_req->length;
2325 }
2326
2327 return true;
2328}
2329
2330/*
2331 * copyup_bvecs pages are never highmem pages
2332 */
2333static bool is_zero_bvecs(struct bio_vec *bvecs, u32 bytes)
2334{
2335 struct ceph_bvec_iter it = {
2336 .bvecs = bvecs,
2337 .iter = { .bi_size = bytes },
2338 };
2339
2340 ceph_bvec_iter_advance_step(&it, bytes, ({
2341 if (memchr_inv(page_address(bv.bv_page) + bv.bv_offset, 0,
2342 bv.bv_len))
2343 return false;
2344 }));
2345 return true;
2346}
2347
2348static int rbd_obj_issue_copyup(struct rbd_obj_request *obj_req, u32 bytes)
2349{
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002350 unsigned int num_osd_ops = obj_req->osd_req->r_num_ops;
2351
2352 dout("%s obj_req %p bytes %u\n", __func__, obj_req, bytes);
2353 rbd_assert(obj_req->osd_req->r_ops[0].op == CEPH_OSD_OP_STAT);
2354 rbd_osd_req_destroy(obj_req->osd_req);
2355
2356 /*
2357 * Create a copyup request with the same number of OSD ops as
2358 * the original request. The original request was stat + op(s),
2359 * the new copyup request will be copyup + the same op(s).
2360 */
Ilya Dryomova162b302018-01-30 17:52:10 +01002361 obj_req->osd_req = rbd_osd_req_create(obj_req, num_osd_ops);
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002362 if (!obj_req->osd_req)
2363 return -ENOMEM;
2364
2365 /*
2366 * Only send non-zero copyup data to save some I/O and network
2367 * bandwidth -- zero copyup data is equivalent to the object not
2368 * existing.
2369 */
2370 if (is_zero_bvecs(obj_req->copyup_bvecs, bytes)) {
2371 dout("%s obj_req %p detected zeroes\n", __func__, obj_req);
2372 bytes = 0;
2373 }
2374
2375 osd_req_op_cls_init(obj_req->osd_req, 0, CEPH_OSD_OP_CALL, "rbd",
2376 "copyup");
2377 osd_req_op_cls_request_data_bvecs(obj_req->osd_req, 0,
2378 obj_req->copyup_bvecs, bytes);
2379
Ilya Dryomov9bb02482018-01-30 17:52:10 +01002380 switch (obj_req->img_request->op_type) {
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002381 case OBJ_OP_WRITE:
2382 __rbd_obj_setup_write(obj_req, 1);
2383 break;
2384 case OBJ_OP_DISCARD:
2385 rbd_assert(!rbd_obj_is_entire(obj_req));
2386 __rbd_obj_setup_discard(obj_req, 1);
2387 break;
2388 default:
2389 rbd_assert(0);
2390 }
2391
2392 rbd_obj_request_submit(obj_req);
2393 /* FIXME: in lieu of rbd_img_obj_callback() */
2394 rbd_img_request_put(obj_req->img_request);
2395 return 0;
2396}
2397
Ilya Dryomov7e07efb2018-01-20 10:30:11 +01002398static int setup_copyup_bvecs(struct rbd_obj_request *obj_req, u64 obj_overlap)
2399{
2400 u32 i;
2401
2402 rbd_assert(!obj_req->copyup_bvecs);
2403 obj_req->copyup_bvec_count = calc_pages_for(0, obj_overlap);
2404 obj_req->copyup_bvecs = kcalloc(obj_req->copyup_bvec_count,
2405 sizeof(*obj_req->copyup_bvecs),
2406 GFP_NOIO);
2407 if (!obj_req->copyup_bvecs)
2408 return -ENOMEM;
2409
2410 for (i = 0; i < obj_req->copyup_bvec_count; i++) {
2411 unsigned int len = min(obj_overlap, (u64)PAGE_SIZE);
2412
2413 obj_req->copyup_bvecs[i].bv_page = alloc_page(GFP_NOIO);
2414 if (!obj_req->copyup_bvecs[i].bv_page)
2415 return -ENOMEM;
2416
2417 obj_req->copyup_bvecs[i].bv_offset = 0;
2418 obj_req->copyup_bvecs[i].bv_len = len;
2419 obj_overlap -= len;
2420 }
2421
2422 rbd_assert(!obj_overlap);
2423 return 0;
2424}
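/*
 * Sizing example (illustrative, 4 KiB pages): a full 4 MiB object
 * copyup allocates calc_pages_for(0, 4 MiB) == 1024 single-page
 * bvecs; a 5 KiB overlap allocates two, with bv_len 4096 and 1024
 * respectively.
 */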
2425
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002426static int rbd_obj_handle_write_guard(struct rbd_obj_request *obj_req)
2427{
2428 struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
2429 u64 img_offset;
2430 u64 obj_overlap;
2431 int ret;
2432
2433 if (!obj_request_overlaps_parent(obj_req)) {
2434 /*
2435 * The overlap has become 0 (most likely because the
2436 * image has been flattened). Use rbd_obj_issue_copyup()
2437 * to re-submit the original write request -- the copyup
2438 * operation itself will be a no-op, since someone must
2439 * have populated the child object while we weren't
2440 * looking. Move to WRITE_FLAT state as we'll be done
2441 * with the operation once the null copyup completes.
2442 */
2443 obj_req->write_state = RBD_OBJ_WRITE_FLAT;
2444 return rbd_obj_issue_copyup(obj_req, 0);
2445 }
2446
2447 /*
2448 * Determine the byte range covered by the object in the
2449 * child image to which the original request was to be sent.
2450 */
2451 img_offset = obj_req->img_offset - obj_req->offset;
2452 obj_overlap = rbd_dev->layout.object_size;
2453
2454 /*
2455 * There is no defined parent data beyond the parent
2456 * overlap, so limit what we read at that boundary if
2457 * necessary.
2458 */
2459 if (img_offset + obj_overlap > rbd_dev->parent_overlap) {
2460 rbd_assert(img_offset < rbd_dev->parent_overlap);
2461 obj_overlap = rbd_dev->parent_overlap - img_offset;
2462 }
2463
2464 ret = setup_copyup_bvecs(obj_req, obj_overlap);
2465 if (ret)
2466 return ret;
2467
2468 obj_req->write_state = RBD_OBJ_WRITE_COPYUP;
2469 return rbd_obj_read_from_parent(obj_req, img_offset, obj_overlap);
2470}
2471
2472static bool rbd_obj_handle_write(struct rbd_obj_request *obj_req)
2473{
2474 int ret;
2475
2476again:
2477 switch (obj_req->write_state) {
2478 case RBD_OBJ_WRITE_GUARD:
2479 rbd_assert(!obj_req->xferred);
2480 if (obj_req->result == -ENOENT) {
2481 /*
2482 * The target object doesn't exist. Read the data for
2483 * the entire target object up to the overlap point (if
2484 * any) from the parent, so we can use it for a copyup.
2485 */
2486 ret = rbd_obj_handle_write_guard(obj_req);
2487 if (ret) {
2488 obj_req->result = ret;
2489 return true;
2490 }
2491 return false;
2492 }
2493 /* fall through */
2494 case RBD_OBJ_WRITE_FLAT:
2495 if (!obj_req->result)
2496 /*
2497 * There is no such thing as a successful short
2498 * write -- indicate the whole request was satisfied.
2499 */
2500 obj_req->xferred = obj_req->length;
2501 return true;
2502 case RBD_OBJ_WRITE_COPYUP:
2503 obj_req->write_state = RBD_OBJ_WRITE_GUARD;
2504 if (obj_req->result)
2505 goto again;
2506
2507 rbd_assert(obj_req->xferred);
2508 ret = rbd_obj_issue_copyup(obj_req, obj_req->xferred);
2509 if (ret) {
2510 obj_req->result = ret;
2511 return true;
2512 }
2513 return false;
2514 default:
2515 rbd_assert(0);
2516 }
2517}
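/*
 * Write state machine, roughly:
 *
 *	FLAT:	plain write completed; the request is done.
 *	GUARD:	-ENOENT means the object doesn't exist yet, so read
 *		the parent data (state -> COPYUP); any other result
 *		is final.
 *	COPYUP:	the parent read finished; issue copyup + the original
 *		op(s) and run through GUARD handling again when that
 *		completes.
 */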
2518
2519/*
2520 * Returns true if @obj_req is completed, or false otherwise.
2521 */
2522static bool __rbd_obj_handle_request(struct rbd_obj_request *obj_req)
2523{
Ilya Dryomov9bb02482018-01-30 17:52:10 +01002524 switch (obj_req->img_request->op_type) {
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002525 case OBJ_OP_READ:
2526 return rbd_obj_handle_read(obj_req);
2527 case OBJ_OP_WRITE:
2528 return rbd_obj_handle_write(obj_req);
2529 case OBJ_OP_DISCARD:
2530 if (rbd_obj_handle_write(obj_req)) {
2531 /*
2532 * Hide -ENOENT from delete/truncate/zero -- discarding
2533 * a non-existent object is not a problem.
2534 */
2535 if (obj_req->result == -ENOENT) {
2536 obj_req->result = 0;
2537 obj_req->xferred = obj_req->length;
2538 }
2539 return true;
2540 }
2541 return false;
2542 default:
2543 rbd_assert(0);
2544 }
2545}
2546
2547static void rbd_img_end_child_request(struct rbd_img_request *img_req)
2548{
2549 struct rbd_obj_request *obj_req = img_req->obj_request;
2550
2551 rbd_assert(test_bit(IMG_REQ_CHILD, &img_req->flags));
2552
2553 obj_req->result = img_req->result;
2554 obj_req->xferred = img_req->xferred;
2555 rbd_img_request_put(img_req);
2556
2557 rbd_obj_handle_request(obj_req);
2558}
2559
2560static void rbd_obj_handle_request(struct rbd_obj_request *obj_req)
2561{
2562 if (!__rbd_obj_handle_request(obj_req))
2563 return;
2564
2565 obj_request_done_set(obj_req);
2566 rbd_obj_request_complete(obj_req);
2567}
2568
Ilya Dryomoved95b212016-08-12 16:40:02 +02002569static const struct rbd_client_id rbd_empty_cid;
2570
2571static bool rbd_cid_equal(const struct rbd_client_id *lhs,
2572 const struct rbd_client_id *rhs)
2573{
2574 return lhs->gid == rhs->gid && lhs->handle == rhs->handle;
2575}
2576
2577static struct rbd_client_id rbd_get_cid(struct rbd_device *rbd_dev)
2578{
2579 struct rbd_client_id cid;
2580
2581 mutex_lock(&rbd_dev->watch_mutex);
2582 cid.gid = ceph_client_gid(rbd_dev->rbd_client->client);
2583 cid.handle = rbd_dev->watch_cookie;
2584 mutex_unlock(&rbd_dev->watch_mutex);
2585 return cid;
2586}
2587
2588/*
2589 * lock_rwsem must be held for write
2590 */
2591static void rbd_set_owner_cid(struct rbd_device *rbd_dev,
2592 const struct rbd_client_id *cid)
2593{
2594 dout("%s rbd_dev %p %llu-%llu -> %llu-%llu\n", __func__, rbd_dev,
2595 rbd_dev->owner_cid.gid, rbd_dev->owner_cid.handle,
2596 cid->gid, cid->handle);
2597 rbd_dev->owner_cid = *cid; /* struct */
2598}
2599
2600static void format_lock_cookie(struct rbd_device *rbd_dev, char *buf)
2601{
2602 mutex_lock(&rbd_dev->watch_mutex);
2603 sprintf(buf, "%s %llu", RBD_LOCK_COOKIE_PREFIX, rbd_dev->watch_cookie);
2604 mutex_unlock(&rbd_dev->watch_mutex);
2605}
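/*
 * A resulting cookie looks like "auto 94066055690568" (assuming
 * RBD_LOCK_COOKIE_PREFIX is "auto"): the prefix plus the current
 * watch cookie, which ties the lock to its owner's watch.
 */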
2606
Florian Margaineedd8ca82017-12-13 16:43:59 +01002607static void __rbd_lock(struct rbd_device *rbd_dev, const char *cookie)
2608{
2609 struct rbd_client_id cid = rbd_get_cid(rbd_dev);
2610
2611 strcpy(rbd_dev->lock_cookie, cookie);
2612 rbd_set_owner_cid(rbd_dev, &cid);
2613 queue_work(rbd_dev->task_wq, &rbd_dev->acquired_lock_work);
2614}
2615
Ilya Dryomoved95b212016-08-12 16:40:02 +02002616/*
2617 * lock_rwsem must be held for write
2618 */
2619static int rbd_lock(struct rbd_device *rbd_dev)
2620{
2621 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
Ilya Dryomoved95b212016-08-12 16:40:02 +02002622 char cookie[32];
2623 int ret;
2624
Ilya Dryomovcbbfb0f2017-04-13 12:17:38 +02002625 WARN_ON(__rbd_is_lock_owner(rbd_dev) ||
2626 rbd_dev->lock_cookie[0] != '\0');
Ilya Dryomoved95b212016-08-12 16:40:02 +02002627
2628 format_lock_cookie(rbd_dev, cookie);
2629 ret = ceph_cls_lock(osdc, &rbd_dev->header_oid, &rbd_dev->header_oloc,
2630 RBD_LOCK_NAME, CEPH_CLS_LOCK_EXCLUSIVE, cookie,
2631 RBD_LOCK_TAG, "", 0);
2632 if (ret)
2633 return ret;
2634
2635 rbd_dev->lock_state = RBD_LOCK_STATE_LOCKED;
Florian Margaineedd8ca82017-12-13 16:43:59 +01002636 __rbd_lock(rbd_dev, cookie);
Ilya Dryomoved95b212016-08-12 16:40:02 +02002637 return 0;
2638}
2639
2640/*
2641 * lock_rwsem must be held for write
2642 */
Ilya Dryomovbbead742017-04-13 12:17:38 +02002643static void rbd_unlock(struct rbd_device *rbd_dev)
Ilya Dryomoved95b212016-08-12 16:40:02 +02002644{
2645 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
Ilya Dryomoved95b212016-08-12 16:40:02 +02002646 int ret;
2647
Ilya Dryomovcbbfb0f2017-04-13 12:17:38 +02002648 WARN_ON(!__rbd_is_lock_owner(rbd_dev) ||
2649 rbd_dev->lock_cookie[0] == '\0');
Ilya Dryomoved95b212016-08-12 16:40:02 +02002650
Ilya Dryomoved95b212016-08-12 16:40:02 +02002651 ret = ceph_cls_unlock(osdc, &rbd_dev->header_oid, &rbd_dev->header_oloc,
Ilya Dryomovcbbfb0f2017-04-13 12:17:38 +02002652 RBD_LOCK_NAME, rbd_dev->lock_cookie);
Ilya Dryomovbbead742017-04-13 12:17:38 +02002653 if (ret && ret != -ENOENT)
2654 rbd_warn(rbd_dev, "failed to unlock: %d", ret);
Ilya Dryomoved95b212016-08-12 16:40:02 +02002655
Ilya Dryomovbbead742017-04-13 12:17:38 +02002656	/* treat errors as if the image is now unlocked */
2657 rbd_dev->lock_state = RBD_LOCK_STATE_UNLOCKED;
Ilya Dryomovcbbfb0f2017-04-13 12:17:38 +02002658 rbd_dev->lock_cookie[0] = '\0';
Ilya Dryomoved95b212016-08-12 16:40:02 +02002659 rbd_set_owner_cid(rbd_dev, &rbd_empty_cid);
2660 queue_work(rbd_dev->task_wq, &rbd_dev->released_lock_work);
Ilya Dryomoved95b212016-08-12 16:40:02 +02002661}
2662
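/*
 * Notify peers through the header object.  The payload is a
 * NotifyMessage: a ceph encoding header, the 32-bit notify op and
 * the sender's 64-bit gid/handle pair, which is what the buf_size
 * arithmetic below accounts for.
 */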
2663static int __rbd_notify_op_lock(struct rbd_device *rbd_dev,
2664 enum rbd_notify_op notify_op,
2665 struct page ***preply_pages,
2666 size_t *preply_len)
2667{
2668 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
2669 struct rbd_client_id cid = rbd_get_cid(rbd_dev);
2670 int buf_size = 4 + 8 + 8 + CEPH_ENCODING_START_BLK_LEN;
2671 char buf[buf_size];
2672 void *p = buf;
2673
2674 dout("%s rbd_dev %p notify_op %d\n", __func__, rbd_dev, notify_op);
2675
2676 /* encode *LockPayload NotifyMessage (op + ClientId) */
2677 ceph_start_encoding(&p, 2, 1, buf_size - CEPH_ENCODING_START_BLK_LEN);
2678 ceph_encode_32(&p, notify_op);
2679 ceph_encode_64(&p, cid.gid);
2680 ceph_encode_64(&p, cid.handle);
2681
2682 return ceph_osdc_notify(osdc, &rbd_dev->header_oid,
2683 &rbd_dev->header_oloc, buf, buf_size,
2684 RBD_NOTIFY_TIMEOUT, preply_pages, preply_len);
2685}
2686
2687static void rbd_notify_op_lock(struct rbd_device *rbd_dev,
2688 enum rbd_notify_op notify_op)
2689{
2690 struct page **reply_pages;
2691 size_t reply_len;
2692
2693 __rbd_notify_op_lock(rbd_dev, notify_op, &reply_pages, &reply_len);
2694 ceph_release_page_vector(reply_pages, calc_pages_for(0, reply_len));
2695}
2696
2697static void rbd_notify_acquired_lock(struct work_struct *work)
2698{
2699 struct rbd_device *rbd_dev = container_of(work, struct rbd_device,
2700 acquired_lock_work);
2701
2702 rbd_notify_op_lock(rbd_dev, RBD_NOTIFY_OP_ACQUIRED_LOCK);
2703}
2704
2705static void rbd_notify_released_lock(struct work_struct *work)
2706{
2707 struct rbd_device *rbd_dev = container_of(work, struct rbd_device,
2708 released_lock_work);
2709
2710 rbd_notify_op_lock(rbd_dev, RBD_NOTIFY_OP_RELEASED_LOCK);
2711}
2712
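/*
 * Ask the current lock owner to release the lock.  Each watcher acks
 * the REQUEST_LOCK notify; only the owner replies with a non-empty
 * ResponseMessage payload, whose result is 0 if it will release the
 * lock or an error such as -EROFS if it refuses (see
 * rbd_handle_request_lock()).
 */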
2713static int rbd_request_lock(struct rbd_device *rbd_dev)
2714{
2715 struct page **reply_pages;
2716 size_t reply_len;
2717 bool lock_owner_responded = false;
2718 int ret;
2719
2720 dout("%s rbd_dev %p\n", __func__, rbd_dev);
2721
2722 ret = __rbd_notify_op_lock(rbd_dev, RBD_NOTIFY_OP_REQUEST_LOCK,
2723 &reply_pages, &reply_len);
2724 if (ret && ret != -ETIMEDOUT) {
2725 rbd_warn(rbd_dev, "failed to request lock: %d", ret);
2726 goto out;
2727 }
2728
2729 if (reply_len > 0 && reply_len <= PAGE_SIZE) {
2730 void *p = page_address(reply_pages[0]);
2731 void *const end = p + reply_len;
2732 u32 n;
2733
2734 ceph_decode_32_safe(&p, end, n, e_inval); /* num_acks */
2735 while (n--) {
2736 u8 struct_v;
2737 u32 len;
2738
2739 ceph_decode_need(&p, end, 8 + 8, e_inval);
2740 p += 8 + 8; /* skip gid and cookie */
2741
2742 ceph_decode_32_safe(&p, end, len, e_inval);
2743 if (!len)
2744 continue;
2745
2746 if (lock_owner_responded) {
2747 rbd_warn(rbd_dev,
2748 "duplicate lock owners detected");
2749 ret = -EIO;
2750 goto out;
2751 }
2752
2753 lock_owner_responded = true;
2754 ret = ceph_start_decoding(&p, end, 1, "ResponseMessage",
2755 &struct_v, &len);
2756 if (ret) {
2757 rbd_warn(rbd_dev,
2758 "failed to decode ResponseMessage: %d",
2759 ret);
2760 goto e_inval;
2761 }
2762
2763 ret = ceph_decode_32(&p);
2764 }
2765 }
2766
2767 if (!lock_owner_responded) {
2768 rbd_warn(rbd_dev, "no lock owners detected");
2769 ret = -ETIMEDOUT;
2770 }
2771
2772out:
2773 ceph_release_page_vector(reply_pages, calc_pages_for(0, reply_len));
2774 return ret;
2775
2776e_inval:
2777 ret = -EINVAL;
2778 goto out;
2779}
2780
2781static void wake_requests(struct rbd_device *rbd_dev, bool wake_all)
2782{
2783 dout("%s rbd_dev %p wake_all %d\n", __func__, rbd_dev, wake_all);
2784
2785 cancel_delayed_work(&rbd_dev->lock_dwork);
2786 if (wake_all)
2787 wake_up_all(&rbd_dev->lock_waitq);
2788 else
2789 wake_up(&rbd_dev->lock_waitq);
2790}
2791
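/*
 * Look up the current locker of the header object.  A foreign lock
 * tag, a shared lock type or a cookie without the expected prefix
 * means the image is locked by some external mechanism and is
 * reported as -EBUSY.
 */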
2792static int get_lock_owner_info(struct rbd_device *rbd_dev,
2793 struct ceph_locker **lockers, u32 *num_lockers)
2794{
2795 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
2796 u8 lock_type;
2797 char *lock_tag;
2798 int ret;
2799
2800 dout("%s rbd_dev %p\n", __func__, rbd_dev);
2801
2802 ret = ceph_cls_lock_info(osdc, &rbd_dev->header_oid,
2803 &rbd_dev->header_oloc, RBD_LOCK_NAME,
2804 &lock_type, &lock_tag, lockers, num_lockers);
2805 if (ret)
2806 return ret;
2807
2808 if (*num_lockers == 0) {
2809 dout("%s rbd_dev %p no lockers detected\n", __func__, rbd_dev);
2810 goto out;
2811 }
2812
2813 if (strcmp(lock_tag, RBD_LOCK_TAG)) {
2814 rbd_warn(rbd_dev, "locked by external mechanism, tag %s",
2815 lock_tag);
2816 ret = -EBUSY;
2817 goto out;
2818 }
2819
2820 if (lock_type == CEPH_CLS_LOCK_SHARED) {
2821 rbd_warn(rbd_dev, "shared lock type detected");
2822 ret = -EBUSY;
2823 goto out;
2824 }
2825
2826 if (strncmp((*lockers)[0].id.cookie, RBD_LOCK_COOKIE_PREFIX,
2827 strlen(RBD_LOCK_COOKIE_PREFIX))) {
2828 rbd_warn(rbd_dev, "locked by external mechanism, cookie %s",
2829 (*lockers)[0].id.cookie);
2830 ret = -EBUSY;
2831 goto out;
2832 }
2833
2834out:
2835 kfree(lock_tag);
2836 return ret;
2837}
2838
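/*
 * Check whether the locker is still alive: it is considered alive if
 * its address and cookie match one of the watchers on the header
 * object.  Returns 1 (recording the owner cid) if a matching watcher
 * is found, 0 otherwise.
 */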
2839static int find_watcher(struct rbd_device *rbd_dev,
2840 const struct ceph_locker *locker)
2841{
2842 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
2843 struct ceph_watch_item *watchers;
2844 u32 num_watchers;
2845 u64 cookie;
2846 int i;
2847 int ret;
2848
2849 ret = ceph_osdc_list_watchers(osdc, &rbd_dev->header_oid,
2850 &rbd_dev->header_oloc, &watchers,
2851 &num_watchers);
2852 if (ret)
2853 return ret;
2854
2855 sscanf(locker->id.cookie, RBD_LOCK_COOKIE_PREFIX " %llu", &cookie);
2856 for (i = 0; i < num_watchers; i++) {
2857 if (!memcmp(&watchers[i].addr, &locker->info.addr,
2858 sizeof(locker->info.addr)) &&
2859 watchers[i].cookie == cookie) {
2860 struct rbd_client_id cid = {
2861 .gid = le64_to_cpu(watchers[i].name.num),
2862 .handle = cookie,
2863 };
2864
2865 dout("%s rbd_dev %p found cid %llu-%llu\n", __func__,
2866 rbd_dev, cid.gid, cid.handle);
2867 rbd_set_owner_cid(rbd_dev, &cid);
2868 ret = 1;
2869 goto out;
2870 }
2871 }
2872
2873 dout("%s rbd_dev %p no watchers\n", __func__, rbd_dev);
2874 ret = 0;
2875out:
2876 kfree(watchers);
2877 return ret;
2878}
2879
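/*
 * Try to acquire the lock, breaking it if the current holder appears
 * dead (it holds the lock but no longer has an established watch):
 * the stale client is blacklisted, its lock is broken and the
 * acquisition is retried.  A return of 0 means the lock was either
 * acquired or is held by a live peer that must be asked to release
 * it.
 */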
2880/*
2881 * lock_rwsem must be held for write
2882 */
2883static int rbd_try_lock(struct rbd_device *rbd_dev)
2884{
2885 struct ceph_client *client = rbd_dev->rbd_client->client;
2886 struct ceph_locker *lockers;
2887 u32 num_lockers;
2888 int ret;
2889
2890 for (;;) {
2891 ret = rbd_lock(rbd_dev);
2892 if (ret != -EBUSY)
2893 return ret;
2894
2895 /* determine if the current lock holder is still alive */
2896 ret = get_lock_owner_info(rbd_dev, &lockers, &num_lockers);
2897 if (ret)
2898 return ret;
2899
2900 if (num_lockers == 0)
2901 goto again;
2902
2903 ret = find_watcher(rbd_dev, lockers);
2904 if (ret) {
2905 if (ret > 0)
2906 ret = 0; /* have to request lock */
2907 goto out;
2908 }
2909
2910 rbd_warn(rbd_dev, "%s%llu seems dead, breaking lock",
2911 ENTITY_NAME(lockers[0].id.name));
2912
2913 ret = ceph_monc_blacklist_add(&client->monc,
2914 &lockers[0].info.addr);
2915 if (ret) {
2916 rbd_warn(rbd_dev, "blacklist of %s%llu failed: %d",
2917 ENTITY_NAME(lockers[0].id.name), ret);
2918 goto out;
2919 }
2920
2921 ret = ceph_cls_break_lock(&client->osdc, &rbd_dev->header_oid,
2922 &rbd_dev->header_oloc, RBD_LOCK_NAME,
2923 lockers[0].id.cookie,
2924 &lockers[0].id.name);
2925 if (ret && ret != -ENOENT)
2926 goto out;
2927
2928again:
2929 ceph_free_lockers(lockers, num_lockers);
2930 }
2931
2932out:
2933 ceph_free_lockers(lockers, num_lockers);
2934 return ret;
2935}
2936
2937/*
2938 * ret is set only if lock_state is RBD_LOCK_STATE_UNLOCKED
2939 */
2940static enum rbd_lock_state rbd_try_acquire_lock(struct rbd_device *rbd_dev,
2941 int *pret)
2942{
2943 enum rbd_lock_state lock_state;
2944
2945 down_read(&rbd_dev->lock_rwsem);
2946 dout("%s rbd_dev %p read lock_state %d\n", __func__, rbd_dev,
2947 rbd_dev->lock_state);
2948 if (__rbd_is_lock_owner(rbd_dev)) {
2949 lock_state = rbd_dev->lock_state;
2950 up_read(&rbd_dev->lock_rwsem);
2951 return lock_state;
2952 }
2953
2954 up_read(&rbd_dev->lock_rwsem);
2955 down_write(&rbd_dev->lock_rwsem);
2956 dout("%s rbd_dev %p write lock_state %d\n", __func__, rbd_dev,
2957 rbd_dev->lock_state);
2958 if (!__rbd_is_lock_owner(rbd_dev)) {
2959 *pret = rbd_try_lock(rbd_dev);
2960 if (*pret)
2961 rbd_warn(rbd_dev, "failed to acquire lock: %d", *pret);
2962 }
2963
2964 lock_state = rbd_dev->lock_state;
2965 up_write(&rbd_dev->lock_rwsem);
2966 return lock_state;
2967}
2968
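/*
 * Acquisition worker (lock_dwork): try to take the lock and, failing
 * that, ask the live owner for it via REQUEST_LOCK.  A timed out
 * request is treated as a dead owner and retried immediately; other
 * errors rearm the work after a delay.
 */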
2969static void rbd_acquire_lock(struct work_struct *work)
2970{
2971 struct rbd_device *rbd_dev = container_of(to_delayed_work(work),
2972 struct rbd_device, lock_dwork);
2973 enum rbd_lock_state lock_state;
Kefeng Wang37f13252017-07-13 15:46:35 +08002974 int ret = 0;
Ilya Dryomoved95b212016-08-12 16:40:02 +02002975
2976 dout("%s rbd_dev %p\n", __func__, rbd_dev);
2977again:
2978 lock_state = rbd_try_acquire_lock(rbd_dev, &ret);
2979 if (lock_state != RBD_LOCK_STATE_UNLOCKED || ret == -EBLACKLISTED) {
2980 if (lock_state == RBD_LOCK_STATE_LOCKED)
2981 wake_requests(rbd_dev, true);
2982 dout("%s rbd_dev %p lock_state %d ret %d - done\n", __func__,
2983 rbd_dev, lock_state, ret);
2984 return;
2985 }
2986
2987 ret = rbd_request_lock(rbd_dev);
2988 if (ret == -ETIMEDOUT) {
2989 goto again; /* treat this as a dead client */
Ilya Dryomove010dd02017-04-13 12:17:39 +02002990 } else if (ret == -EROFS) {
2991 rbd_warn(rbd_dev, "peer will not release lock");
2992 /*
2993 * If this is rbd_add_acquire_lock(), we want to fail
2994 * immediately -- reuse BLACKLISTED flag. Otherwise we
2995 * want to block.
2996 */
2997 if (!(rbd_dev->disk->flags & GENHD_FL_UP)) {
2998 set_bit(RBD_DEV_FLAG_BLACKLISTED, &rbd_dev->flags);
2999 /* wake "rbd map --exclusive" process */
3000 wake_requests(rbd_dev, false);
3001 }
Ilya Dryomoved95b212016-08-12 16:40:02 +02003002 } else if (ret < 0) {
3003 rbd_warn(rbd_dev, "error requesting lock: %d", ret);
3004 mod_delayed_work(rbd_dev->task_wq, &rbd_dev->lock_dwork,
3005 RBD_RETRY_DELAY);
3006 } else {
3007 /*
3008 * lock owner acked, but resend if we don't see them
3009 * release the lock
3010 */
3011 dout("%s rbd_dev %p requeueing lock_dwork\n", __func__,
3012 rbd_dev);
3013 mod_delayed_work(rbd_dev->task_wq, &rbd_dev->lock_dwork,
3014 msecs_to_jiffies(2 * RBD_NOTIFY_TIMEOUT * MSEC_PER_SEC));
3015 }
3016}
3017
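/*
 * Release sequence: flip the state to RELEASING, downgrade to a read
 * lock while in-flight IO drains via ceph_osdc_sync(), then retake
 * the write lock and unlock for real.  Returns true if the lock was
 * actually released.
 */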
3018/*
3019 * lock_rwsem must be held for write
3020 */
3021static bool rbd_release_lock(struct rbd_device *rbd_dev)
3022{
3023 dout("%s rbd_dev %p read lock_state %d\n", __func__, rbd_dev,
3024 rbd_dev->lock_state);
3025 if (rbd_dev->lock_state != RBD_LOCK_STATE_LOCKED)
3026 return false;
3027
3028 rbd_dev->lock_state = RBD_LOCK_STATE_RELEASING;
3029 downgrade_write(&rbd_dev->lock_rwsem);
3030 /*
3031 * Ensure that all in-flight IO is flushed.
3032 *
3033 * FIXME: ceph_osdc_sync() flushes the entire OSD client, which
3034 * may be shared with other devices.
3035 */
3036 ceph_osdc_sync(&rbd_dev->rbd_client->client->osdc);
3037 up_read(&rbd_dev->lock_rwsem);
3038
3039 down_write(&rbd_dev->lock_rwsem);
3040 dout("%s rbd_dev %p write lock_state %d\n", __func__, rbd_dev,
3041 rbd_dev->lock_state);
3042 if (rbd_dev->lock_state != RBD_LOCK_STATE_RELEASING)
3043 return false;
3044
Ilya Dryomovbbead742017-04-13 12:17:38 +02003045 rbd_unlock(rbd_dev);
3046 /*
3047 * Give others a chance to grab the lock - we would re-acquire
3048 * almost immediately if we got new IO during ceph_osdc_sync()
3049 * otherwise. We need to ack our own notifications, so this
3050 * lock_dwork will be requeued from rbd_wait_state_locked()
3051 * after wake_requests() in rbd_handle_released_lock().
3052 */
3053 cancel_delayed_work(&rbd_dev->lock_dwork);
Ilya Dryomoved95b212016-08-12 16:40:02 +02003054 return true;
3055}
3056
3057static void rbd_release_lock_work(struct work_struct *work)
3058{
3059 struct rbd_device *rbd_dev = container_of(work, struct rbd_device,
3060 unlock_work);
3061
3062 down_write(&rbd_dev->lock_rwsem);
3063 rbd_release_lock(rbd_dev);
3064 up_write(&rbd_dev->lock_rwsem);
3065}
3066
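/*
 * Handlers for ACQUIRED_LOCK/RELEASED_LOCK notifications from peers.
 * struct_v >= 2 payloads carry the sender's client id; older senders
 * are treated as an unknown (empty) cid.
 */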
3067static void rbd_handle_acquired_lock(struct rbd_device *rbd_dev, u8 struct_v,
3068 void **p)
3069{
3070 struct rbd_client_id cid = { 0 };
3071
3072 if (struct_v >= 2) {
3073 cid.gid = ceph_decode_64(p);
3074 cid.handle = ceph_decode_64(p);
3075 }
3076
3077 dout("%s rbd_dev %p cid %llu-%llu\n", __func__, rbd_dev, cid.gid,
3078 cid.handle);
3079 if (!rbd_cid_equal(&cid, &rbd_empty_cid)) {
3080 down_write(&rbd_dev->lock_rwsem);
3081 if (rbd_cid_equal(&cid, &rbd_dev->owner_cid)) {
3082 /*
3083 * we already know that the remote client is
3084 * the owner
3085 */
3086 up_write(&rbd_dev->lock_rwsem);
3087 return;
3088 }
3089
3090 rbd_set_owner_cid(rbd_dev, &cid);
3091 downgrade_write(&rbd_dev->lock_rwsem);
3092 } else {
3093 down_read(&rbd_dev->lock_rwsem);
3094 }
3095
3096 if (!__rbd_is_lock_owner(rbd_dev))
3097 wake_requests(rbd_dev, false);
3098 up_read(&rbd_dev->lock_rwsem);
3099}
3100
3101static void rbd_handle_released_lock(struct rbd_device *rbd_dev, u8 struct_v,
3102 void **p)
3103{
3104 struct rbd_client_id cid = { 0 };
3105
3106 if (struct_v >= 2) {
3107 cid.gid = ceph_decode_64(p);
3108 cid.handle = ceph_decode_64(p);
3109 }
3110
3111 dout("%s rbd_dev %p cid %llu-%llu\n", __func__, rbd_dev, cid.gid,
3112 cid.handle);
3113 if (!rbd_cid_equal(&cid, &rbd_empty_cid)) {
3114 down_write(&rbd_dev->lock_rwsem);
3115 if (!rbd_cid_equal(&cid, &rbd_dev->owner_cid)) {
3116 dout("%s rbd_dev %p unexpected owner, cid %llu-%llu != owner_cid %llu-%llu\n",
3117 __func__, rbd_dev, cid.gid, cid.handle,
3118 rbd_dev->owner_cid.gid, rbd_dev->owner_cid.handle);
3119 up_write(&rbd_dev->lock_rwsem);
3120 return;
3121 }
3122
3123 rbd_set_owner_cid(rbd_dev, &rbd_empty_cid);
3124 downgrade_write(&rbd_dev->lock_rwsem);
3125 } else {
3126 down_read(&rbd_dev->lock_rwsem);
3127 }
3128
3129 if (!__rbd_is_lock_owner(rbd_dev))
3130 wake_requests(rbd_dev, false);
3131 up_read(&rbd_dev->lock_rwsem);
3132}
3133
Ilya Dryomov3b77faa2017-04-13 12:17:39 +02003134/*
3135 * Returns result for ResponseMessage to be encoded (<= 0), or 1 if no
3136 * ResponseMessage is needed.
3137 */
3138static int rbd_handle_request_lock(struct rbd_device *rbd_dev, u8 struct_v,
3139 void **p)
Ilya Dryomoved95b212016-08-12 16:40:02 +02003140{
3141 struct rbd_client_id my_cid = rbd_get_cid(rbd_dev);
3142 struct rbd_client_id cid = { 0 };
Ilya Dryomov3b77faa2017-04-13 12:17:39 +02003143 int result = 1;
Ilya Dryomoved95b212016-08-12 16:40:02 +02003144
3145 if (struct_v >= 2) {
3146 cid.gid = ceph_decode_64(p);
3147 cid.handle = ceph_decode_64(p);
3148 }
3149
3150 dout("%s rbd_dev %p cid %llu-%llu\n", __func__, rbd_dev, cid.gid,
3151 cid.handle);
3152 if (rbd_cid_equal(&cid, &my_cid))
Ilya Dryomov3b77faa2017-04-13 12:17:39 +02003153 return result;
Ilya Dryomoved95b212016-08-12 16:40:02 +02003154
3155 down_read(&rbd_dev->lock_rwsem);
Ilya Dryomov3b77faa2017-04-13 12:17:39 +02003156 if (__rbd_is_lock_owner(rbd_dev)) {
3157 if (rbd_dev->lock_state == RBD_LOCK_STATE_LOCKED &&
3158 rbd_cid_equal(&rbd_dev->owner_cid, &rbd_empty_cid))
3159 goto out_unlock;
3160
3161 /*
3162 * encode ResponseMessage(0) so the peer can detect
3163 * a missing owner
3164 */
3165 result = 0;
3166
3167 if (rbd_dev->lock_state == RBD_LOCK_STATE_LOCKED) {
Ilya Dryomove010dd02017-04-13 12:17:39 +02003168 if (!rbd_dev->opts->exclusive) {
3169 dout("%s rbd_dev %p queueing unlock_work\n",
3170 __func__, rbd_dev);
3171 queue_work(rbd_dev->task_wq,
3172 &rbd_dev->unlock_work);
3173 } else {
3174 /* refuse to release the lock */
3175 result = -EROFS;
3176 }
Ilya Dryomoved95b212016-08-12 16:40:02 +02003177 }
3178 }
Ilya Dryomov3b77faa2017-04-13 12:17:39 +02003179
3180out_unlock:
Ilya Dryomoved95b212016-08-12 16:40:02 +02003181 up_read(&rbd_dev->lock_rwsem);
Ilya Dryomov3b77faa2017-04-13 12:17:39 +02003182 return result;
Ilya Dryomoved95b212016-08-12 16:40:02 +02003183}
3184
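/*
 * Ack a notify, optionally encoding a ResponseMessage with the given
 * result for the notifier to collect (see rbd_request_lock()).
 */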
3185static void __rbd_acknowledge_notify(struct rbd_device *rbd_dev,
3186 u64 notify_id, u64 cookie, s32 *result)
3187{
3188 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
3189 int buf_size = 4 + CEPH_ENCODING_START_BLK_LEN;
3190 char buf[buf_size];
3191 int ret;
3192
3193 if (result) {
3194 void *p = buf;
3195
3196 /* encode ResponseMessage */
3197 ceph_start_encoding(&p, 1, 1,
3198 buf_size - CEPH_ENCODING_START_BLK_LEN);
3199 ceph_encode_32(&p, *result);
3200 } else {
3201 buf_size = 0;
3202 }
3203
3204 ret = ceph_osdc_notify_ack(osdc, &rbd_dev->header_oid,
3205 &rbd_dev->header_oloc, notify_id, cookie,
3206 buf, buf_size);
3207 if (ret)
3208 rbd_warn(rbd_dev, "acknowledge_notify failed: %d", ret);
3209}
3210
3211static void rbd_acknowledge_notify(struct rbd_device *rbd_dev, u64 notify_id,
3212 u64 cookie)
3213{
3214 dout("%s rbd_dev %p\n", __func__, rbd_dev);
3215 __rbd_acknowledge_notify(rbd_dev, notify_id, cookie, NULL);
3216}
3217
3218static void rbd_acknowledge_notify_result(struct rbd_device *rbd_dev,
3219 u64 notify_id, u64 cookie, s32 result)
3220{
3221 dout("%s rbd_dev %p result %d\n", __func__, rbd_dev, result);
3222 __rbd_acknowledge_notify(rbd_dev, notify_id, cookie, &result);
3223}
Ilya Dryomov922dab62016-05-26 01:15:02 +02003224
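/*
 * Watch callback: decode the NotifyMessage header and dispatch on
 * the notify op.  A zero-length payload is treated as a legacy
 * header update notification.
 */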
3225static void rbd_watch_cb(void *arg, u64 notify_id, u64 cookie,
3226 u64 notifier_id, void *data, size_t data_len)
Alex Elderb8d70032012-11-30 17:53:04 -06003227{
Ilya Dryomov922dab62016-05-26 01:15:02 +02003228 struct rbd_device *rbd_dev = arg;
Ilya Dryomoved95b212016-08-12 16:40:02 +02003229 void *p = data;
3230 void *const end = p + data_len;
Ilya Dryomovd4c22692016-09-06 11:15:48 +02003231 u8 struct_v = 0;
Ilya Dryomoved95b212016-08-12 16:40:02 +02003232 u32 len;
3233 u32 notify_op;
Alex Elderb8d70032012-11-30 17:53:04 -06003234 int ret;
3235
Ilya Dryomoved95b212016-08-12 16:40:02 +02003236 dout("%s rbd_dev %p cookie %llu notify_id %llu data_len %zu\n",
3237 __func__, rbd_dev, cookie, notify_id, data_len);
3238 if (data_len) {
3239 ret = ceph_start_decoding(&p, end, 1, "NotifyMessage",
3240 &struct_v, &len);
3241 if (ret) {
3242 rbd_warn(rbd_dev, "failed to decode NotifyMessage: %d",
3243 ret);
3244 return;
3245 }
Ilya Dryomov52bb1f92014-07-23 17:11:20 +04003246
Ilya Dryomoved95b212016-08-12 16:40:02 +02003247 notify_op = ceph_decode_32(&p);
3248 } else {
3249 /* legacy notification for header updates */
3250 notify_op = RBD_NOTIFY_OP_HEADER_UPDATE;
3251 len = 0;
3252 }
Alex Elderb8d70032012-11-30 17:53:04 -06003253
Ilya Dryomoved95b212016-08-12 16:40:02 +02003254 dout("%s rbd_dev %p notify_op %u\n", __func__, rbd_dev, notify_op);
3255 switch (notify_op) {
3256 case RBD_NOTIFY_OP_ACQUIRED_LOCK:
3257 rbd_handle_acquired_lock(rbd_dev, struct_v, &p);
3258 rbd_acknowledge_notify(rbd_dev, notify_id, cookie);
3259 break;
3260 case RBD_NOTIFY_OP_RELEASED_LOCK:
3261 rbd_handle_released_lock(rbd_dev, struct_v, &p);
3262 rbd_acknowledge_notify(rbd_dev, notify_id, cookie);
3263 break;
3264 case RBD_NOTIFY_OP_REQUEST_LOCK:
Ilya Dryomov3b77faa2017-04-13 12:17:39 +02003265 ret = rbd_handle_request_lock(rbd_dev, struct_v, &p);
3266 if (ret <= 0)
Ilya Dryomoved95b212016-08-12 16:40:02 +02003267 rbd_acknowledge_notify_result(rbd_dev, notify_id,
Ilya Dryomov3b77faa2017-04-13 12:17:39 +02003268 cookie, ret);
Ilya Dryomoved95b212016-08-12 16:40:02 +02003269 else
3270 rbd_acknowledge_notify(rbd_dev, notify_id, cookie);
3271 break;
3272 case RBD_NOTIFY_OP_HEADER_UPDATE:
3273 ret = rbd_dev_refresh(rbd_dev);
3274 if (ret)
3275 rbd_warn(rbd_dev, "refresh failed: %d", ret);
3276
3277 rbd_acknowledge_notify(rbd_dev, notify_id, cookie);
3278 break;
3279 default:
3280 if (rbd_is_lock_owner(rbd_dev))
3281 rbd_acknowledge_notify_result(rbd_dev, notify_id,
3282 cookie, -EOPNOTSUPP);
3283 else
3284 rbd_acknowledge_notify(rbd_dev, notify_id, cookie);
3285 break;
3286 }
Alex Elderb8d70032012-11-30 17:53:04 -06003287}
3288
Ilya Dryomov99d16942016-08-12 16:11:41 +02003289static void __rbd_unregister_watch(struct rbd_device *rbd_dev);
3290
Ilya Dryomov922dab62016-05-26 01:15:02 +02003291static void rbd_watch_errcb(void *arg, u64 cookie, int err)
Ilya Dryomovbb040aa2014-06-19 11:38:14 +04003292{
Ilya Dryomov922dab62016-05-26 01:15:02 +02003293 struct rbd_device *rbd_dev = arg;
Ilya Dryomovbb040aa2014-06-19 11:38:14 +04003294
Ilya Dryomov922dab62016-05-26 01:15:02 +02003295 rbd_warn(rbd_dev, "encountered watch error: %d", err);
Ilya Dryomovbb040aa2014-06-19 11:38:14 +04003296
Ilya Dryomoved95b212016-08-12 16:40:02 +02003297 down_write(&rbd_dev->lock_rwsem);
3298 rbd_set_owner_cid(rbd_dev, &rbd_empty_cid);
3299 up_write(&rbd_dev->lock_rwsem);
Ilya Dryomovbb040aa2014-06-19 11:38:14 +04003300
Ilya Dryomov99d16942016-08-12 16:11:41 +02003301 mutex_lock(&rbd_dev->watch_mutex);
3302 if (rbd_dev->watch_state == RBD_WATCH_STATE_REGISTERED) {
3303 __rbd_unregister_watch(rbd_dev);
3304 rbd_dev->watch_state = RBD_WATCH_STATE_ERROR;
Ilya Dryomovbb040aa2014-06-19 11:38:14 +04003305
Ilya Dryomov99d16942016-08-12 16:11:41 +02003306 queue_delayed_work(rbd_dev->task_wq, &rbd_dev->watch_dwork, 0);
Ilya Dryomovbb040aa2014-06-19 11:38:14 +04003307 }
Ilya Dryomov99d16942016-08-12 16:11:41 +02003308 mutex_unlock(&rbd_dev->watch_mutex);
Ilya Dryomovbb040aa2014-06-19 11:38:14 +04003309}
3310
3311/*
Ilya Dryomov99d16942016-08-12 16:11:41 +02003312 * watch_mutex must be locked
Alex Elder9969ebc2013-01-18 12:31:10 -06003313 */
Ilya Dryomov99d16942016-08-12 16:11:41 +02003314static int __rbd_register_watch(struct rbd_device *rbd_dev)
Alex Elder9969ebc2013-01-18 12:31:10 -06003315{
3316 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
Ilya Dryomov922dab62016-05-26 01:15:02 +02003317 struct ceph_osd_linger_request *handle;
Alex Elder9969ebc2013-01-18 12:31:10 -06003318
Ilya Dryomov922dab62016-05-26 01:15:02 +02003319 rbd_assert(!rbd_dev->watch_handle);
Ilya Dryomov99d16942016-08-12 16:11:41 +02003320 dout("%s rbd_dev %p\n", __func__, rbd_dev);
Alex Elder9969ebc2013-01-18 12:31:10 -06003321
Ilya Dryomov922dab62016-05-26 01:15:02 +02003322 handle = ceph_osdc_watch(osdc, &rbd_dev->header_oid,
3323 &rbd_dev->header_oloc, rbd_watch_cb,
3324 rbd_watch_errcb, rbd_dev);
3325 if (IS_ERR(handle))
3326 return PTR_ERR(handle);
Alex Elder9969ebc2013-01-18 12:31:10 -06003327
Ilya Dryomov922dab62016-05-26 01:15:02 +02003328 rbd_dev->watch_handle = handle;
Ilya Dryomovb30a01f2014-05-22 19:28:52 +04003329 return 0;
Alex Elder9969ebc2013-01-18 12:31:10 -06003330}
3331
Ilya Dryomov99d16942016-08-12 16:11:41 +02003332/*
3333 * watch_mutex must be locked
3334 */
3335static void __rbd_unregister_watch(struct rbd_device *rbd_dev)
Ilya Dryomovfca27062013-12-16 18:02:40 +02003336{
Ilya Dryomov922dab62016-05-26 01:15:02 +02003337 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
3338 int ret;
Ilya Dryomovb30a01f2014-05-22 19:28:52 +04003339
Ilya Dryomov99d16942016-08-12 16:11:41 +02003340 rbd_assert(rbd_dev->watch_handle);
3341 dout("%s rbd_dev %p\n", __func__, rbd_dev);
Ilya Dryomovb30a01f2014-05-22 19:28:52 +04003342
Ilya Dryomov922dab62016-05-26 01:15:02 +02003343 ret = ceph_osdc_unwatch(osdc, rbd_dev->watch_handle);
3344 if (ret)
3345 rbd_warn(rbd_dev, "failed to unwatch: %d", ret);
Ilya Dryomovb30a01f2014-05-22 19:28:52 +04003346
Ilya Dryomov922dab62016-05-26 01:15:02 +02003347 rbd_dev->watch_handle = NULL;
Ilya Dryomovc525f032016-04-28 16:07:26 +02003348}
3349
Ilya Dryomov99d16942016-08-12 16:11:41 +02003350static int rbd_register_watch(struct rbd_device *rbd_dev)
Ilya Dryomovc525f032016-04-28 16:07:26 +02003351{
Ilya Dryomov99d16942016-08-12 16:11:41 +02003352 int ret;
Ilya Dryomov811c6682016-04-15 16:22:16 +02003353
Ilya Dryomov99d16942016-08-12 16:11:41 +02003354 mutex_lock(&rbd_dev->watch_mutex);
3355 rbd_assert(rbd_dev->watch_state == RBD_WATCH_STATE_UNREGISTERED);
3356 ret = __rbd_register_watch(rbd_dev);
3357 if (ret)
3358 goto out;
3359
3360 rbd_dev->watch_state = RBD_WATCH_STATE_REGISTERED;
3361 rbd_dev->watch_cookie = rbd_dev->watch_handle->linger_id;
3362
3363out:
3364 mutex_unlock(&rbd_dev->watch_mutex);
3365 return ret;
3366}
3367
3368static void cancel_tasks_sync(struct rbd_device *rbd_dev)
3369{
3370 dout("%s rbd_dev %p\n", __func__, rbd_dev);
3371
3372 cancel_delayed_work_sync(&rbd_dev->watch_dwork);
Ilya Dryomoved95b212016-08-12 16:40:02 +02003373 cancel_work_sync(&rbd_dev->acquired_lock_work);
3374 cancel_work_sync(&rbd_dev->released_lock_work);
3375 cancel_delayed_work_sync(&rbd_dev->lock_dwork);
3376 cancel_work_sync(&rbd_dev->unlock_work);
Ilya Dryomov99d16942016-08-12 16:11:41 +02003377}
3378
3379static void rbd_unregister_watch(struct rbd_device *rbd_dev)
3380{
Ilya Dryomoved95b212016-08-12 16:40:02 +02003381 WARN_ON(waitqueue_active(&rbd_dev->lock_waitq));
Ilya Dryomov99d16942016-08-12 16:11:41 +02003382 cancel_tasks_sync(rbd_dev);
3383
3384 mutex_lock(&rbd_dev->watch_mutex);
3385 if (rbd_dev->watch_state == RBD_WATCH_STATE_REGISTERED)
3386 __rbd_unregister_watch(rbd_dev);
3387 rbd_dev->watch_state = RBD_WATCH_STATE_UNREGISTERED;
3388 mutex_unlock(&rbd_dev->watch_mutex);
3389
Ilya Dryomov811c6682016-04-15 16:22:16 +02003390 ceph_osdc_flush_notifies(&rbd_dev->rbd_client->client->osdc);
Ilya Dryomovfca27062013-12-16 18:02:40 +02003391}
3392
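/*
 * A watch re-registration changes the watch cookie, so the lock
 * cookie must be updated to match.  Older OSDs don't support
 * updating the cookie (-EOPNOTSUPP); on any failure the lock is
 * released and re-acquired from scratch.
 */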
Ilya Dryomov14bb2112017-04-13 12:17:38 +02003393/*
3394 * lock_rwsem must be held for write
3395 */
3396static void rbd_reacquire_lock(struct rbd_device *rbd_dev)
3397{
3398 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
3399 char cookie[32];
3400 int ret;
3401
3402 WARN_ON(rbd_dev->lock_state != RBD_LOCK_STATE_LOCKED);
3403
3404 format_lock_cookie(rbd_dev, cookie);
3405 ret = ceph_cls_set_cookie(osdc, &rbd_dev->header_oid,
3406 &rbd_dev->header_oloc, RBD_LOCK_NAME,
3407 CEPH_CLS_LOCK_EXCLUSIVE, rbd_dev->lock_cookie,
3408 RBD_LOCK_TAG, cookie);
3409 if (ret) {
3410 if (ret != -EOPNOTSUPP)
3411 rbd_warn(rbd_dev, "failed to update lock cookie: %d",
3412 ret);
3413
3414 /*
3415 * Lock cookie cannot be updated on older OSDs, so do
3416 * a manual release and queue an acquire.
3417 */
3418 if (rbd_release_lock(rbd_dev))
3419 queue_delayed_work(rbd_dev->task_wq,
3420 &rbd_dev->lock_dwork, 0);
3421 } else {
Florian Margaineedd8ca82017-12-13 16:43:59 +01003422 __rbd_lock(rbd_dev, cookie);
Ilya Dryomov14bb2112017-04-13 12:17:38 +02003423 }
3424}
3425
Ilya Dryomov99d16942016-08-12 16:11:41 +02003426static void rbd_reregister_watch(struct work_struct *work)
3427{
3428 struct rbd_device *rbd_dev = container_of(to_delayed_work(work),
3429 struct rbd_device, watch_dwork);
3430 int ret;
3431
3432 dout("%s rbd_dev %p\n", __func__, rbd_dev);
3433
3434 mutex_lock(&rbd_dev->watch_mutex);
Ilya Dryomov87c0fde2016-09-29 13:41:05 +02003435 if (rbd_dev->watch_state != RBD_WATCH_STATE_ERROR) {
3436 mutex_unlock(&rbd_dev->watch_mutex);
Ilya Dryomov14bb2112017-04-13 12:17:38 +02003437 return;
Ilya Dryomov87c0fde2016-09-29 13:41:05 +02003438 }
Ilya Dryomov99d16942016-08-12 16:11:41 +02003439
3440 ret = __rbd_register_watch(rbd_dev);
3441 if (ret) {
3442 rbd_warn(rbd_dev, "failed to reregister watch: %d", ret);
Ilya Dryomov4d736442016-09-29 14:23:12 +02003443 if (ret == -EBLACKLISTED || ret == -ENOENT) {
Ilya Dryomov87c0fde2016-09-29 13:41:05 +02003444 set_bit(RBD_DEV_FLAG_BLACKLISTED, &rbd_dev->flags);
Ilya Dryomov14bb2112017-04-13 12:17:38 +02003445 wake_requests(rbd_dev, true);
Ilya Dryomov87c0fde2016-09-29 13:41:05 +02003446 } else {
Ilya Dryomov99d16942016-08-12 16:11:41 +02003447 queue_delayed_work(rbd_dev->task_wq,
3448 &rbd_dev->watch_dwork,
3449 RBD_RETRY_DELAY);
Ilya Dryomov87c0fde2016-09-29 13:41:05 +02003450 }
3451 mutex_unlock(&rbd_dev->watch_mutex);
Ilya Dryomov14bb2112017-04-13 12:17:38 +02003452 return;
Ilya Dryomov99d16942016-08-12 16:11:41 +02003453 }
3454
3455 rbd_dev->watch_state = RBD_WATCH_STATE_REGISTERED;
3456 rbd_dev->watch_cookie = rbd_dev->watch_handle->linger_id;
3457 mutex_unlock(&rbd_dev->watch_mutex);
3458
Ilya Dryomov14bb2112017-04-13 12:17:38 +02003459 down_write(&rbd_dev->lock_rwsem);
3460 if (rbd_dev->lock_state == RBD_LOCK_STATE_LOCKED)
3461 rbd_reacquire_lock(rbd_dev);
3462 up_write(&rbd_dev->lock_rwsem);
3463
Ilya Dryomov99d16942016-08-12 16:11:41 +02003464 ret = rbd_dev_refresh(rbd_dev);
3465 if (ret)
3466 rbd_warn(rbd_dev, "reregisteration refresh failed: %d", ret);
Ilya Dryomov99d16942016-08-12 16:11:41 +02003467}
3468
Alex Elder36be9a72013-01-19 00:30:28 -06003469/*
Alex Elderf40eb342013-04-25 15:09:42 -05003470 * Synchronous osd object method call. Returns the number of bytes
3471 * returned in the inbound buffer, or a negative error code.
Alex Elder36be9a72013-01-19 00:30:28 -06003472 */
3473static int rbd_obj_method_sync(struct rbd_device *rbd_dev,
Ilya Dryomovecd4a682017-01-25 18:16:21 +01003474 struct ceph_object_id *oid,
3475 struct ceph_object_locator *oloc,
Alex Elder36be9a72013-01-19 00:30:28 -06003476 const char *method_name,
Alex Elder41579762013-04-21 12:14:45 -05003477 const void *outbound,
Alex Elder36be9a72013-01-19 00:30:28 -06003478 size_t outbound_size,
Alex Elder41579762013-04-21 12:14:45 -05003479 void *inbound,
Alex Eldere2a58ee2013-04-30 00:44:33 -05003480 size_t inbound_size)
Alex Elder36be9a72013-01-19 00:30:28 -06003481{
Ilya Dryomovecd4a682017-01-25 18:16:21 +01003482 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
3483 struct page *req_page = NULL;
3484 struct page *reply_page;
Alex Elder36be9a72013-01-19 00:30:28 -06003485 int ret;
3486
3487 /*
Alex Elder6010a452013-04-05 01:27:11 -05003488 * Method calls are ultimately read operations. The result
3489 * should be placed into the inbound buffer provided.  They
3490 * also supply outbound data--parameters for the object
3491 * method. Currently if this is present it will be a
3492 * snapshot id.
Alex Elder36be9a72013-01-19 00:30:28 -06003493 */
Ilya Dryomovecd4a682017-01-25 18:16:21 +01003494 if (outbound) {
3495 if (outbound_size > PAGE_SIZE)
3496 return -E2BIG;
Alex Elder36be9a72013-01-19 00:30:28 -06003497
Ilya Dryomovecd4a682017-01-25 18:16:21 +01003498 req_page = alloc_page(GFP_KERNEL);
3499 if (!req_page)
3500 return -ENOMEM;
Alex Elder36be9a72013-01-19 00:30:28 -06003501
Ilya Dryomovecd4a682017-01-25 18:16:21 +01003502 memcpy(page_address(req_page), outbound, outbound_size);
Alex Elder04017e22013-04-05 14:46:02 -05003503 }
Alex Elder430c28c2013-04-03 21:32:51 -05003504
Ilya Dryomovecd4a682017-01-25 18:16:21 +01003505 reply_page = alloc_page(GFP_KERNEL);
3506 if (!reply_page) {
3507 if (req_page)
3508 __free_page(req_page);
3509 return -ENOMEM;
3510 }
Alex Elder36be9a72013-01-19 00:30:28 -06003511
Ilya Dryomovecd4a682017-01-25 18:16:21 +01003512 ret = ceph_osdc_call(osdc, oid, oloc, RBD_DRV_NAME, method_name,
3513 CEPH_OSD_FLAG_READ, req_page, outbound_size,
3514 reply_page, &inbound_size);
3515 if (!ret) {
3516 memcpy(inbound, page_address(reply_page), inbound_size);
3517 ret = inbound_size;
3518 }
Alex Elder57385b52013-04-21 12:14:45 -05003519
Ilya Dryomovecd4a682017-01-25 18:16:21 +01003520 if (req_page)
3521 __free_page(req_page);
3522 __free_page(reply_page);
Alex Elder36be9a72013-01-19 00:30:28 -06003523 return ret;
3524}
3525
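/*
 * Wait until we own the lock or get blacklisted: (re)queue lock_dwork
 * and sleep on lock_waitq, dropping lock_rwsem across each wait.
 */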
Ilya Dryomoved95b212016-08-12 16:40:02 +02003526/*
3527 * lock_rwsem must be held for read
3528 */
3529static void rbd_wait_state_locked(struct rbd_device *rbd_dev)
3530{
3531 DEFINE_WAIT(wait);
3532
3533 do {
3534 /*
3535 * Note the use of mod_delayed_work() in rbd_acquire_lock()
3536 * and cancel_delayed_work() in wake_requests().
3537 */
3538 dout("%s rbd_dev %p queueing lock_dwork\n", __func__, rbd_dev);
3539 queue_delayed_work(rbd_dev->task_wq, &rbd_dev->lock_dwork, 0);
3540 prepare_to_wait_exclusive(&rbd_dev->lock_waitq, &wait,
3541 TASK_UNINTERRUPTIBLE);
3542 up_read(&rbd_dev->lock_rwsem);
3543 schedule();
3544 down_read(&rbd_dev->lock_rwsem);
Ilya Dryomov87c0fde2016-09-29 13:41:05 +02003545 } while (rbd_dev->lock_state != RBD_LOCK_STATE_LOCKED &&
3546 !test_bit(RBD_DEV_FLAG_BLACKLISTED, &rbd_dev->flags));
3547
Ilya Dryomoved95b212016-08-12 16:40:02 +02003548 finish_wait(&rbd_dev->lock_waitq, &wait);
3549}
3550
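/*
 * Request worker: each blk-mq request carries a work_struct in its
 * pdu (set up in rbd_init_request()) and is processed here, where it
 * is safe to sleep, e.g. in rbd_wait_state_locked().
 */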
Christoph Hellwig7ad18af2015-01-13 17:20:04 +01003551static void rbd_queue_workfn(struct work_struct *work)
Ilya Dryomovbc1ecc62014-08-04 18:04:39 +04003552{
Christoph Hellwig7ad18af2015-01-13 17:20:04 +01003553 struct request *rq = blk_mq_rq_from_pdu(work);
3554 struct rbd_device *rbd_dev = rq->q->queuedata;
Ilya Dryomovbc1ecc62014-08-04 18:04:39 +04003555 struct rbd_img_request *img_request;
Josh Durgin4e752f02014-04-08 11:12:11 -07003556 struct ceph_snap_context *snapc = NULL;
Ilya Dryomovbc1ecc62014-08-04 18:04:39 +04003557 u64 offset = (u64)blk_rq_pos(rq) << SECTOR_SHIFT;
3558 u64 length = blk_rq_bytes(rq);
Guangliang Zhao6d2940c2014-03-13 11:21:35 +08003559 enum obj_operation_type op_type;
Josh Durgin4e752f02014-04-08 11:12:11 -07003560 u64 mapping_size;
Ilya Dryomov80de1912016-09-20 14:23:17 +02003561 bool must_be_locked;
Ilya Dryomovbc1ecc62014-08-04 18:04:39 +04003562 int result;
3563
Christoph Hellwigaebf5262017-01-31 16:57:31 +01003564 switch (req_op(rq)) {
3565 case REQ_OP_DISCARD:
Ilya Dryomov6ac56952017-05-22 19:59:24 +02003566 case REQ_OP_WRITE_ZEROES:
Christoph Hellwigaebf5262017-01-31 16:57:31 +01003567 op_type = OBJ_OP_DISCARD;
3568 break;
3569 case REQ_OP_WRITE:
3570 op_type = OBJ_OP_WRITE;
3571 break;
3572 case REQ_OP_READ:
3573 op_type = OBJ_OP_READ;
3574 break;
3575 default:
3576 dout("%s: non-fs request type %d\n", __func__, req_op(rq));
Christoph Hellwig7ad18af2015-01-13 17:20:04 +01003577 result = -EIO;
3578 goto err;
3579 }
3580
Ilya Dryomovbc1ecc62014-08-04 18:04:39 +04003581 /* Ignore/skip any zero-length requests */
3582
3583 if (!length) {
3584 dout("%s: zero-length request\n", __func__);
3585 result = 0;
3586 goto err_rq;
3587 }
3588
Ilya Dryomov9568c932017-10-12 12:35:19 +02003589 rbd_assert(op_type == OBJ_OP_READ ||
3590 rbd_dev->spec->snap_id == CEPH_NOSNAP);
Ilya Dryomovbc1ecc62014-08-04 18:04:39 +04003591
3592 /*
3593 * Quit early if the mapped snapshot no longer exists. It's
3594 * still possible the snapshot will have disappeared by the
3595 * time our request arrives at the osd, but there's no sense in
3596 * sending it if we already know.
3597 */
3598 if (!test_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags)) {
3599 dout("request for non-existent snapshot");
3600 rbd_assert(rbd_dev->spec->snap_id != CEPH_NOSNAP);
3601 result = -ENXIO;
3602 goto err_rq;
3603 }
3604
3605 if (offset && length > U64_MAX - offset + 1) {
3606 rbd_warn(rbd_dev, "bad request range (%llu~%llu)", offset,
3607 length);
3608 result = -EINVAL;
3609 goto err_rq; /* Shouldn't happen */
3610 }
3611
Christoph Hellwig7ad18af2015-01-13 17:20:04 +01003612 blk_mq_start_request(rq);
3613
Josh Durgin4e752f02014-04-08 11:12:11 -07003614 down_read(&rbd_dev->header_rwsem);
3615 mapping_size = rbd_dev->mapping.size;
Guangliang Zhao6d2940c2014-03-13 11:21:35 +08003616 if (op_type != OBJ_OP_READ) {
Josh Durgin4e752f02014-04-08 11:12:11 -07003617 snapc = rbd_dev->header.snapc;
3618 ceph_get_snap_context(snapc);
3619 }
3620 up_read(&rbd_dev->header_rwsem);
3621
3622 if (offset + length > mapping_size) {
Ilya Dryomovbc1ecc62014-08-04 18:04:39 +04003623 rbd_warn(rbd_dev, "beyond EOD (%llu~%llu > %llu)", offset,
Josh Durgin4e752f02014-04-08 11:12:11 -07003624 length, mapping_size);
Ilya Dryomovbc1ecc62014-08-04 18:04:39 +04003625 result = -EIO;
3626 goto err_rq;
3627 }
3628
Ilya Dryomovf9bebd52017-04-13 12:17:39 +02003629 must_be_locked =
3630 (rbd_dev->header.features & RBD_FEATURE_EXCLUSIVE_LOCK) &&
3631 (op_type != OBJ_OP_READ || rbd_dev->opts->lock_on_read);
Ilya Dryomoved95b212016-08-12 16:40:02 +02003632 if (must_be_locked) {
3633 down_read(&rbd_dev->lock_rwsem);
Ilya Dryomov87c0fde2016-09-29 13:41:05 +02003634 if (rbd_dev->lock_state != RBD_LOCK_STATE_LOCKED &&
Ilya Dryomove010dd02017-04-13 12:17:39 +02003635 !test_bit(RBD_DEV_FLAG_BLACKLISTED, &rbd_dev->flags)) {
3636 if (rbd_dev->opts->exclusive) {
3637 rbd_warn(rbd_dev, "exclusive lock required");
3638 result = -EROFS;
3639 goto err_unlock;
3640 }
Ilya Dryomoved95b212016-08-12 16:40:02 +02003641 rbd_wait_state_locked(rbd_dev);
Ilya Dryomove010dd02017-04-13 12:17:39 +02003642 }
Ilya Dryomov87c0fde2016-09-29 13:41:05 +02003643 if (test_bit(RBD_DEV_FLAG_BLACKLISTED, &rbd_dev->flags)) {
3644 result = -EBLACKLISTED;
3645 goto err_unlock;
3646 }
Ilya Dryomoved95b212016-08-12 16:40:02 +02003647 }
3648
Guangliang Zhao6d2940c2014-03-13 11:21:35 +08003649 img_request = rbd_img_request_create(rbd_dev, offset, length, op_type,
Josh Durgin4e752f02014-04-08 11:12:11 -07003650 snapc);
Ilya Dryomovbc1ecc62014-08-04 18:04:39 +04003651 if (!img_request) {
3652 result = -ENOMEM;
Ilya Dryomoved95b212016-08-12 16:40:02 +02003653 goto err_unlock;
Ilya Dryomovbc1ecc62014-08-04 18:04:39 +04003654 }
3655 img_request->rq = rq;
Ilya Dryomov70b16db2015-11-27 19:23:24 +01003656 snapc = NULL; /* img_request consumes a ref */
Ilya Dryomovbc1ecc62014-08-04 18:04:39 +04003657
Guangliang Zhao90e98c52014-04-01 22:22:16 +08003658 if (op_type == OBJ_OP_DISCARD)
3659 result = rbd_img_request_fill(img_request, OBJ_REQUEST_NODATA,
3660 NULL);
Ilya Dryomov5359a172018-01-20 10:30:10 +01003661 else {
3662 struct ceph_bio_iter bio_it = { .bio = rq->bio,
3663 .iter = rq->bio->bi_iter };
3664
Guangliang Zhao90e98c52014-04-01 22:22:16 +08003665 result = rbd_img_request_fill(img_request, OBJ_REQUEST_BIO,
Ilya Dryomov5359a172018-01-20 10:30:10 +01003666 &bio_it);
3667 }
Ilya Dryomovbc1ecc62014-08-04 18:04:39 +04003668 if (result)
3669 goto err_img_request;
3670
3671 result = rbd_img_request_submit(img_request);
3672 if (result)
3673 goto err_img_request;
3674
Ilya Dryomoved95b212016-08-12 16:40:02 +02003675 if (must_be_locked)
3676 up_read(&rbd_dev->lock_rwsem);
Ilya Dryomovbc1ecc62014-08-04 18:04:39 +04003677 return;
3678
3679err_img_request:
3680 rbd_img_request_put(img_request);
Ilya Dryomoved95b212016-08-12 16:40:02 +02003681err_unlock:
3682 if (must_be_locked)
3683 up_read(&rbd_dev->lock_rwsem);
Ilya Dryomovbc1ecc62014-08-04 18:04:39 +04003684err_rq:
3685 if (result)
3686 rbd_warn(rbd_dev, "%s %llx at %llx result %d",
Guangliang Zhao6d2940c2014-03-13 11:21:35 +08003687 obj_op_name(op_type), length, offset, result);
SF Markus Elfringe96a6502014-11-02 15:20:59 +01003688 ceph_put_snap_context(snapc);
Christoph Hellwig7ad18af2015-01-13 17:20:04 +01003689err:
Christoph Hellwig2a842ac2017-06-03 09:38:04 +02003690 blk_mq_end_request(rq, errno_to_blk_status(result));
Ilya Dryomovbc1ecc62014-08-04 18:04:39 +04003691}
3692
Christoph Hellwigfc17b652017-06-03 09:38:05 +02003693static blk_status_t rbd_queue_rq(struct blk_mq_hw_ctx *hctx,
Christoph Hellwig7ad18af2015-01-13 17:20:04 +01003694 const struct blk_mq_queue_data *bd)
Ilya Dryomovbc1ecc62014-08-04 18:04:39 +04003695{
Christoph Hellwig7ad18af2015-01-13 17:20:04 +01003696 struct request *rq = bd->rq;
3697 struct work_struct *work = blk_mq_rq_to_pdu(rq);
Ilya Dryomovbc1ecc62014-08-04 18:04:39 +04003698
Christoph Hellwig7ad18af2015-01-13 17:20:04 +01003699 queue_work(rbd_wq, work);
Christoph Hellwigfc17b652017-06-03 09:38:05 +02003700 return BLK_STS_OK;
Alex Elderbf0d5f502012-11-22 00:00:08 -06003701}
3702
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003703static void rbd_free_disk(struct rbd_device *rbd_dev)
3704{
Ilya Dryomov5769ed02017-04-13 12:17:38 +02003705 blk_cleanup_queue(rbd_dev->disk->queue);
3706 blk_mq_free_tag_set(&rbd_dev->tag_set);
3707 put_disk(rbd_dev->disk);
Alex Eldera0cab922013-04-25 23:15:08 -05003708 rbd_dev->disk = NULL;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003709}
3710
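/*
 * Synchronously read buf_len bytes from offset 0 of the given
 * object, used below to fetch the v1 on-disk image header.  Returns
 * the number of bytes read, or a negative error code.
 */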
Alex Elder788e2df2013-01-17 12:25:27 -06003711static int rbd_obj_read_sync(struct rbd_device *rbd_dev,
Ilya Dryomovfe5478e2017-01-25 18:16:21 +01003712 struct ceph_object_id *oid,
3713 struct ceph_object_locator *oloc,
3714 void *buf, int buf_len)
Alex Elder788e2df2013-01-17 12:25:27 -06003715
3716{
Ilya Dryomovfe5478e2017-01-25 18:16:21 +01003717 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
3718 struct ceph_osd_request *req;
3719 struct page **pages;
3720 int num_pages = calc_pages_for(0, buf_len);
Alex Elder788e2df2013-01-17 12:25:27 -06003721 int ret;
3722
Ilya Dryomovfe5478e2017-01-25 18:16:21 +01003723 req = ceph_osdc_alloc_request(osdc, NULL, 1, false, GFP_KERNEL);
3724 if (!req)
3725 return -ENOMEM;
Alex Elder788e2df2013-01-17 12:25:27 -06003726
Ilya Dryomovfe5478e2017-01-25 18:16:21 +01003727 ceph_oid_copy(&req->r_base_oid, oid);
3728 ceph_oloc_copy(&req->r_base_oloc, oloc);
3729 req->r_flags = CEPH_OSD_FLAG_READ;
Alex Elder788e2df2013-01-17 12:25:27 -06003730
Ilya Dryomovfe5478e2017-01-25 18:16:21 +01003731 ret = ceph_osdc_alloc_messages(req, GFP_KERNEL);
Alex Elder788e2df2013-01-17 12:25:27 -06003732 if (ret)
Ilya Dryomovfe5478e2017-01-25 18:16:21 +01003733 goto out_req;
Alex Elder788e2df2013-01-17 12:25:27 -06003734
Ilya Dryomovfe5478e2017-01-25 18:16:21 +01003735 pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL);
3736 if (IS_ERR(pages)) {
3737 ret = PTR_ERR(pages);
3738 goto out_req;
3739 }
Alex Elder1ceae7e2013-02-06 13:11:38 -06003740
Ilya Dryomovfe5478e2017-01-25 18:16:21 +01003741 osd_req_op_extent_init(req, 0, CEPH_OSD_OP_READ, 0, buf_len, 0, 0);
3742 osd_req_op_extent_osd_data_pages(req, 0, pages, buf_len, 0, false,
3743 true);
Alex Elder788e2df2013-01-17 12:25:27 -06003744
Ilya Dryomovfe5478e2017-01-25 18:16:21 +01003745 ceph_osdc_start_request(osdc, req, false);
3746 ret = ceph_osdc_wait_request(osdc, req);
3747 if (ret >= 0)
3748 ceph_copy_from_page_vector(pages, buf, 0, ret);
3749
3750out_req:
3751 ceph_osdc_put_request(req);
Alex Elder788e2df2013-01-17 12:25:27 -06003752 return ret;
3753}
3754
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003755/*
Alex Elder662518b2013-05-06 09:51:29 -05003756 * Read the complete header for the given rbd device. On successful
3757 * return, the rbd_dev->header field will contain up-to-date
3758 * information about the image.
Alex Elder4156d992012-08-02 11:29:46 -05003759 */
Alex Elder99a41eb2013-05-06 09:51:30 -05003760static int rbd_dev_v1_header_info(struct rbd_device *rbd_dev)
Alex Elder4156d992012-08-02 11:29:46 -05003761{
3762 struct rbd_image_header_ondisk *ondisk = NULL;
3763 u32 snap_count = 0;
3764 u64 names_size = 0;
3765 u32 want_count;
3766 int ret;
3767
3768 /*
3769 * The complete header will include an array of its 64-bit
3770 * snapshot ids, followed by the names of those snapshots as
3771 * a contiguous block of NUL-terminated strings. Note that
3772 * the number of snapshots could change by the time we read
3773 * it in, in which case we re-read it.
3774 */
3775 do {
3776 size_t size;
3777
3778 kfree(ondisk);
3779
3780 size = sizeof (*ondisk);
3781 size += snap_count * sizeof (struct rbd_image_snap_ondisk);
3782 size += names_size;
3783 ondisk = kmalloc(size, GFP_KERNEL);
3784 if (!ondisk)
Alex Elder662518b2013-05-06 09:51:29 -05003785 return -ENOMEM;
Alex Elder4156d992012-08-02 11:29:46 -05003786
Ilya Dryomovfe5478e2017-01-25 18:16:21 +01003787 ret = rbd_obj_read_sync(rbd_dev, &rbd_dev->header_oid,
3788 &rbd_dev->header_oloc, ondisk, size);
Alex Elder4156d992012-08-02 11:29:46 -05003789 if (ret < 0)
Alex Elder662518b2013-05-06 09:51:29 -05003790 goto out;
Alex Elderc0cd10db2013-04-26 09:43:47 -05003791 if ((size_t)ret < size) {
Alex Elder4156d992012-08-02 11:29:46 -05003792 ret = -ENXIO;
Alex Elder06ecc6c2012-11-01 10:17:15 -05003793 rbd_warn(rbd_dev, "short header read (want %zd got %d)",
3794 size, ret);
Alex Elder662518b2013-05-06 09:51:29 -05003795 goto out;
Alex Elder4156d992012-08-02 11:29:46 -05003796 }
3797 if (!rbd_dev_ondisk_valid(ondisk)) {
3798 ret = -ENXIO;
Alex Elder06ecc6c2012-11-01 10:17:15 -05003799 rbd_warn(rbd_dev, "invalid header");
Alex Elder662518b2013-05-06 09:51:29 -05003800 goto out;
Alex Elder4156d992012-08-02 11:29:46 -05003801 }
3802
3803 names_size = le64_to_cpu(ondisk->snap_names_len);
3804 want_count = snap_count;
3805 snap_count = le32_to_cpu(ondisk->snap_count);
3806 } while (snap_count != want_count);
3807
Alex Elder662518b2013-05-06 09:51:29 -05003808 ret = rbd_header_from_disk(rbd_dev, ondisk);
3809out:
Alex Elder4156d992012-08-02 11:29:46 -05003810 kfree(ondisk);
3811
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003812 return ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003813}
3814
Alex Elder15228ed2013-05-01 12:43:03 -05003815/*
3816 * Clear the rbd device's EXISTS flag if the snapshot it's mapped to
3817 * has disappeared from the (just updated) snapshot context.
3818 */
3819static void rbd_exists_validate(struct rbd_device *rbd_dev)
3820{
3821 u64 snap_id;
3822
3823 if (!test_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags))
3824 return;
3825
3826 snap_id = rbd_dev->spec->snap_id;
3827 if (snap_id == CEPH_NOSNAP)
3828 return;
3829
3830 if (rbd_dev_snap_index(rbd_dev, snap_id) == BAD_SNAP_INDEX)
3831 clear_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags);
3832}
3833
Josh Durgin98752012013-08-29 17:26:31 -07003834static void rbd_dev_update_size(struct rbd_device *rbd_dev)
3835{
3836 sector_t size;
Josh Durgin98752012013-08-29 17:26:31 -07003837
3838 /*
Ilya Dryomov811c6682016-04-15 16:22:16 +02003839 * If EXISTS is not set, rbd_dev->disk may be NULL, so don't
3840 * try to update its size. If REMOVING is set, updating size
3841 * is just useless work since the device can't be opened.
Josh Durgin98752012013-08-29 17:26:31 -07003842 */
Ilya Dryomov811c6682016-04-15 16:22:16 +02003843 if (test_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags) &&
3844 !test_bit(RBD_DEV_FLAG_REMOVING, &rbd_dev->flags)) {
Josh Durgin98752012013-08-29 17:26:31 -07003845 size = (sector_t)rbd_dev->mapping.size / SECTOR_SIZE;
3846 dout("setting size to %llu sectors", (unsigned long long)size);
3847 set_capacity(rbd_dev->disk, size);
3848 revalidate_disk(rbd_dev->disk);
3849 }
3850}
3851
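/*
 * Re-read the image header and propagate the result: refresh parent
 * info for layered images, revalidate the mapped snapshot's EXISTS
 * flag and update the block device size if it changed.
 */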
Alex Eldercc4a38bd2013-04-30 00:44:33 -05003852static int rbd_dev_refresh(struct rbd_device *rbd_dev)
Alex Elder1fe5e992012-07-25 09:32:41 -05003853{
Alex Eldere627db02013-05-06 07:40:30 -05003854 u64 mapping_size;
Alex Elder1fe5e992012-07-25 09:32:41 -05003855 int ret;
3856
Alex Eldercfbf6372013-05-31 17:40:45 -05003857 down_write(&rbd_dev->header_rwsem);
Alex Elder3b5cf2a2013-05-29 11:18:59 -05003858 mapping_size = rbd_dev->mapping.size;
Ilya Dryomova720ae02014-07-23 17:11:19 +04003859
3860 ret = rbd_dev_header_info(rbd_dev);
Ilya Dryomov52bb1f92014-07-23 17:11:20 +04003861 if (ret)
Ilya Dryomov73e39e42015-01-08 20:18:22 +03003862 goto out;
Alex Elder15228ed2013-05-01 12:43:03 -05003863
Ilya Dryomove8f59b52014-07-24 10:42:13 +04003864 /*
3865 * If there is a parent, see if it has disappeared due to the
3866 * mapped image getting flattened.
3867 */
3868 if (rbd_dev->parent) {
3869 ret = rbd_dev_v2_parent_info(rbd_dev);
3870 if (ret)
Ilya Dryomov73e39e42015-01-08 20:18:22 +03003871 goto out;
Ilya Dryomove8f59b52014-07-24 10:42:13 +04003872 }
3873
Ilya Dryomov5ff11082014-07-23 17:11:21 +04003874 if (rbd_dev->spec->snap_id == CEPH_NOSNAP) {
Ilya Dryomov73e39e42015-01-08 20:18:22 +03003875 rbd_dev->mapping.size = rbd_dev->header.image_size;
Ilya Dryomov5ff11082014-07-23 17:11:21 +04003876 } else {
3877 /* validate mapped snapshot's EXISTS flag */
3878 rbd_exists_validate(rbd_dev);
3879 }
Alex Elder15228ed2013-05-01 12:43:03 -05003880
Ilya Dryomov73e39e42015-01-08 20:18:22 +03003881out:
Alex Eldercfbf6372013-05-31 17:40:45 -05003882 up_write(&rbd_dev->header_rwsem);
Ilya Dryomov73e39e42015-01-08 20:18:22 +03003883 if (!ret && mapping_size != rbd_dev->mapping.size)
Josh Durgin98752012013-08-29 17:26:31 -07003884 rbd_dev_update_size(rbd_dev);
Alex Elder1fe5e992012-07-25 09:32:41 -05003885
Ilya Dryomov73e39e42015-01-08 20:18:22 +03003886 return ret;
Alex Elder1fe5e992012-07-25 09:32:41 -05003887}
3888
Christoph Hellwigd6296d392017-05-01 10:19:08 -06003889static int rbd_init_request(struct blk_mq_tag_set *set, struct request *rq,
3890 unsigned int hctx_idx, unsigned int numa_node)
Christoph Hellwig7ad18af2015-01-13 17:20:04 +01003891{
3892 struct work_struct *work = blk_mq_rq_to_pdu(rq);
3893
3894 INIT_WORK(work, rbd_queue_workfn);
3895 return 0;
3896}
3897
Eric Biggersf363b082017-03-30 13:39:16 -07003898static const struct blk_mq_ops rbd_mq_ops = {
Christoph Hellwig7ad18af2015-01-13 17:20:04 +01003899 .queue_rq = rbd_queue_rq,
Christoph Hellwig7ad18af2015-01-13 17:20:04 +01003900 .init_request = rbd_init_request,
3901};
3902
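/*
 * Set up the gendisk and its blk-mq queue: a single hardware queue
 * with the depth given by the queue_depth map option, and queue
 * limits derived from the object size.
 */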
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003903static int rbd_init_disk(struct rbd_device *rbd_dev)
3904{
3905 struct gendisk *disk;
3906 struct request_queue *q;
Alex Elder593a9e72012-02-07 12:03:37 -06003907 u64 segment_size;
Christoph Hellwig7ad18af2015-01-13 17:20:04 +01003908 int err;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003909
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003910 /* create gendisk info */
Ilya Dryomov7e513d42013-12-16 19:26:32 +02003911 disk = alloc_disk(single_major ?
3912 (1 << RBD_SINGLE_MAJOR_PART_SHIFT) :
3913 RBD_MINORS_PER_MAJOR);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003914 if (!disk)
Alex Elder1fcdb8a2012-08-29 17:11:06 -05003915 return -ENOMEM;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003916
Alex Elderf0f8cef2012-01-29 13:57:44 -06003917 snprintf(disk->disk_name, sizeof(disk->disk_name), RBD_DRV_NAME "%d",
Alex Elderde71a292012-07-03 16:01:19 -05003918 rbd_dev->dev_id);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003919 disk->major = rbd_dev->major;
Ilya Dryomovdd82fff2013-12-13 15:28:57 +02003920 disk->first_minor = rbd_dev->minor;
Ilya Dryomov7e513d42013-12-16 19:26:32 +02003921 if (single_major)
3922 disk->flags |= GENHD_FL_EXT_DEVT;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003923 disk->fops = &rbd_bd_ops;
3924 disk->private_data = rbd_dev;
3925
Christoph Hellwig7ad18af2015-01-13 17:20:04 +01003926 memset(&rbd_dev->tag_set, 0, sizeof(rbd_dev->tag_set));
3927 rbd_dev->tag_set.ops = &rbd_mq_ops;
Ilya Dryomovb5584182015-06-23 16:21:19 +03003928 rbd_dev->tag_set.queue_depth = rbd_dev->opts->queue_depth;
Christoph Hellwig7ad18af2015-01-13 17:20:04 +01003929 rbd_dev->tag_set.numa_node = NUMA_NO_NODE;
Ilya Dryomovb5584182015-06-23 16:21:19 +03003930 rbd_dev->tag_set.flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_SG_MERGE;
Christoph Hellwig7ad18af2015-01-13 17:20:04 +01003931 rbd_dev->tag_set.nr_hw_queues = 1;
3932 rbd_dev->tag_set.cmd_size = sizeof(struct work_struct);
3933
3934 err = blk_mq_alloc_tag_set(&rbd_dev->tag_set);
3935 if (err)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003936 goto out_disk;
Josh Durgin029bcbd2011-07-22 11:35:23 -07003937
Christoph Hellwig7ad18af2015-01-13 17:20:04 +01003938 q = blk_mq_init_queue(&rbd_dev->tag_set);
3939 if (IS_ERR(q)) {
3940 err = PTR_ERR(q);
3941 goto out_tag_set;
3942 }
3943
Ilya Dryomovd8a2c892015-03-24 16:15:17 +03003944 queue_flag_set_unlocked(QUEUE_FLAG_NONROT, q);
3945 /* QUEUE_FLAG_ADD_RANDOM is off by default for blk-mq */
Alex Elder593a9e72012-02-07 12:03:37 -06003946
Josh Durgin029bcbd2011-07-22 11:35:23 -07003947 /* set io sizes to object size */
Alex Elder593a9e72012-02-07 12:03:37 -06003948 segment_size = rbd_obj_bytes(&rbd_dev->header);
3949 blk_queue_max_hw_sectors(q, segment_size / SECTOR_SIZE);
Ilya Dryomov0d9fde42015-10-07 16:09:35 +02003950 q->limits.max_sectors = queue_max_hw_sectors(q);
Ilya Dryomov21acdf42017-12-21 15:35:11 +01003951 blk_queue_max_segments(q, USHRT_MAX);
Ilya Dryomov24f1df62018-01-12 17:22:10 +01003952 blk_queue_max_segment_size(q, UINT_MAX);
Alex Elder593a9e72012-02-07 12:03:37 -06003953 blk_queue_io_min(q, segment_size);
3954 blk_queue_io_opt(q, segment_size);
Josh Durgin029bcbd2011-07-22 11:35:23 -07003955
Guangliang Zhao90e98c52014-04-01 22:22:16 +08003956 /* enable the discard support */
3957 queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, q);
3958 q->limits.discard_granularity = segment_size;
Jens Axboe2bb4cd52015-07-14 08:15:12 -06003959 blk_queue_max_discard_sectors(q, segment_size / SECTOR_SIZE);
Ilya Dryomov6ac56952017-05-22 19:59:24 +02003960 blk_queue_max_write_zeroes_sectors(q, segment_size / SECTOR_SIZE);
Guangliang Zhao90e98c52014-04-01 22:22:16 +08003961
Ronny Hegewaldbae818e2015-10-15 18:50:46 +00003962 if (!ceph_test_opt(rbd_dev->rbd_client->client, NOCRC))
Jan Karadc3b17c2017-02-02 15:56:50 +01003963 q->backing_dev_info->capabilities |= BDI_CAP_STABLE_WRITES;
Ronny Hegewaldbae818e2015-10-15 18:50:46 +00003964
Ilya Dryomov5769ed02017-04-13 12:17:38 +02003965 /*
3966 * disk_release() expects a queue ref from add_disk() and will
3967 * put it. Hold an extra ref until add_disk() is called.
3968 */
3969 WARN_ON(!blk_get_queue(q));
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003970 disk->queue = q;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003971 q->queuedata = rbd_dev;
3972
3973 rbd_dev->disk = disk;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003974
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003975 return 0;
Christoph Hellwig7ad18af2015-01-13 17:20:04 +01003976out_tag_set:
3977 blk_mq_free_tag_set(&rbd_dev->tag_set);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003978out_disk:
3979 put_disk(disk);
Christoph Hellwig7ad18af2015-01-13 17:20:04 +01003980 return err;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003981}
3982
Yehuda Sadehdfc56062010-11-19 14:51:04 -08003983/*
3984 sysfs
 */

static struct rbd_device *dev_to_rbd_dev(struct device *dev)
{
	return container_of(dev, struct rbd_device, dev);
}

static ssize_t rbd_size_show(struct device *dev,
			     struct device_attribute *attr, char *buf)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);

	return sprintf(buf, "%llu\n",
		       (unsigned long long)rbd_dev->mapping.size);
}

/*
 * Note this shows the features for whatever's mapped, which is not
 * necessarily the base image.
 */
static ssize_t rbd_features_show(struct device *dev,
				 struct device_attribute *attr, char *buf)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);

	return sprintf(buf, "0x%016llx\n",
		       (unsigned long long)rbd_dev->mapping.features);
}

static ssize_t rbd_major_show(struct device *dev,
			      struct device_attribute *attr, char *buf)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);

	if (rbd_dev->major)
		return sprintf(buf, "%d\n", rbd_dev->major);

	return sprintf(buf, "(none)\n");
}

static ssize_t rbd_minor_show(struct device *dev,
			      struct device_attribute *attr, char *buf)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);

	return sprintf(buf, "%d\n", rbd_dev->minor);
}

static ssize_t rbd_client_addr_show(struct device *dev,
				    struct device_attribute *attr, char *buf)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
	struct ceph_entity_addr *client_addr =
	    ceph_client_addr(rbd_dev->rbd_client->client);

	return sprintf(buf, "%pISpc/%u\n", &client_addr->in_addr,
		       le32_to_cpu(client_addr->nonce));
}

static ssize_t rbd_client_id_show(struct device *dev,
				  struct device_attribute *attr, char *buf)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);

	return sprintf(buf, "client%lld\n",
		       ceph_client_gid(rbd_dev->rbd_client->client));
}

static ssize_t rbd_cluster_fsid_show(struct device *dev,
				     struct device_attribute *attr, char *buf)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);

	return sprintf(buf, "%pU\n", &rbd_dev->rbd_client->client->fsid);
}

static ssize_t rbd_config_info_show(struct device *dev,
				    struct device_attribute *attr, char *buf)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);

	return sprintf(buf, "%s\n", rbd_dev->config_info);
}

static ssize_t rbd_pool_show(struct device *dev,
			     struct device_attribute *attr, char *buf)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);

	return sprintf(buf, "%s\n", rbd_dev->spec->pool_name);
}

static ssize_t rbd_pool_id_show(struct device *dev,
				struct device_attribute *attr, char *buf)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);

	return sprintf(buf, "%llu\n",
		       (unsigned long long) rbd_dev->spec->pool_id);
}

static ssize_t rbd_name_show(struct device *dev,
			     struct device_attribute *attr, char *buf)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);

	if (rbd_dev->spec->image_name)
		return sprintf(buf, "%s\n", rbd_dev->spec->image_name);

	return sprintf(buf, "(unknown)\n");
}

static ssize_t rbd_image_id_show(struct device *dev,
				 struct device_attribute *attr, char *buf)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);

	return sprintf(buf, "%s\n", rbd_dev->spec->image_id);
}

/*
 * Shows the name of the currently-mapped snapshot (or
 * RBD_SNAP_HEAD_NAME for the base image).
 */
static ssize_t rbd_snap_show(struct device *dev,
			     struct device_attribute *attr,
			     char *buf)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);

	return sprintf(buf, "%s\n", rbd_dev->spec->snap_name);
}

static ssize_t rbd_snap_id_show(struct device *dev,
				struct device_attribute *attr, char *buf)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);

	return sprintf(buf, "%llu\n", rbd_dev->spec->snap_id);
}

/*
 * For a v2 image, shows the chain of parent images, separated by empty
 * lines.  For v1 images or if there is no parent, shows "(no parent
 * image)".
 */
static ssize_t rbd_parent_show(struct device *dev,
			       struct device_attribute *attr,
			       char *buf)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
	ssize_t count = 0;

	if (!rbd_dev->parent)
		return sprintf(buf, "(no parent image)\n");

	for ( ; rbd_dev->parent; rbd_dev = rbd_dev->parent) {
		struct rbd_spec *spec = rbd_dev->parent_spec;

		count += sprintf(&buf[count], "%s"
			    "pool_id %llu\npool_name %s\n"
			    "image_id %s\nimage_name %s\n"
			    "snap_id %llu\nsnap_name %s\n"
			    "overlap %llu\n",
			    !count ? "" : "\n", /* first? */
			    spec->pool_id, spec->pool_name,
			    spec->image_id, spec->image_name ?: "(unknown)",
			    spec->snap_id, spec->snap_name,
			    rbd_dev->parent_overlap);
	}

	return count;
}
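 
/*
 * For illustration, a one-level clone chain read back through the
 * "parent" attribute would look roughly like this (all field values
 * here are hypothetical):
 *
 *	pool_id 2
 *	pool_name rbd
 *	image_id 1022779baf92
 *	image_name base
 *	snap_id 4
 *	snap_name snap1
 *	overlap 1073741824
 *
 * with each additional ancestor appended after an empty line.
 */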

static ssize_t rbd_image_refresh(struct device *dev,
				 struct device_attribute *attr,
				 const char *buf,
				 size_t size)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
	int ret;

	ret = rbd_dev_refresh(rbd_dev);
	if (ret)
		return ret;

	return size;
}

static DEVICE_ATTR(size, S_IRUGO, rbd_size_show, NULL);
static DEVICE_ATTR(features, S_IRUGO, rbd_features_show, NULL);
static DEVICE_ATTR(major, S_IRUGO, rbd_major_show, NULL);
static DEVICE_ATTR(minor, S_IRUGO, rbd_minor_show, NULL);
static DEVICE_ATTR(client_addr, S_IRUGO, rbd_client_addr_show, NULL);
static DEVICE_ATTR(client_id, S_IRUGO, rbd_client_id_show, NULL);
static DEVICE_ATTR(cluster_fsid, S_IRUGO, rbd_cluster_fsid_show, NULL);
static DEVICE_ATTR(config_info, S_IRUSR, rbd_config_info_show, NULL);
static DEVICE_ATTR(pool, S_IRUGO, rbd_pool_show, NULL);
static DEVICE_ATTR(pool_id, S_IRUGO, rbd_pool_id_show, NULL);
static DEVICE_ATTR(name, S_IRUGO, rbd_name_show, NULL);
static DEVICE_ATTR(image_id, S_IRUGO, rbd_image_id_show, NULL);
static DEVICE_ATTR(refresh, S_IWUSR, NULL, rbd_image_refresh);
static DEVICE_ATTR(current_snap, S_IRUGO, rbd_snap_show, NULL);
static DEVICE_ATTR(snap_id, S_IRUGO, rbd_snap_id_show, NULL);
static DEVICE_ATTR(parent, S_IRUGO, rbd_parent_show, NULL);

static struct attribute *rbd_attrs[] = {
	&dev_attr_size.attr,
	&dev_attr_features.attr,
	&dev_attr_major.attr,
	&dev_attr_minor.attr,
	&dev_attr_client_addr.attr,
	&dev_attr_client_id.attr,
	&dev_attr_cluster_fsid.attr,
	&dev_attr_config_info.attr,
	&dev_attr_pool.attr,
	&dev_attr_pool_id.attr,
	&dev_attr_name.attr,
	&dev_attr_image_id.attr,
	&dev_attr_current_snap.attr,
	&dev_attr_snap_id.attr,
	&dev_attr_parent.attr,
	&dev_attr_refresh.attr,
	NULL
};

static struct attribute_group rbd_attr_group = {
	.attrs = rbd_attrs,
};

static const struct attribute_group *rbd_attr_groups[] = {
	&rbd_attr_group,
	NULL
};
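
/*
 * These attributes surface under /sys/bus/rbd/devices/<dev-id>/, since
 * each rbd_dev is parented to rbd_root_dev on rbd_bus_type below.  A
 * minimal usage sketch from userspace (device id and output value are
 * hypothetical; refresh requires root):
 *
 *	$ cat /sys/bus/rbd/devices/0/size
 *	1073741824
 *	# echo 1 > /sys/bus/rbd/devices/0/refresh
 */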

static void rbd_dev_release(struct device *dev);

static const struct device_type rbd_device_type = {
	.name		= "rbd",
	.groups		= rbd_attr_groups,
	.release	= rbd_dev_release,
};

static struct rbd_spec *rbd_spec_get(struct rbd_spec *spec)
{
	kref_get(&spec->kref);

	return spec;
}

static void rbd_spec_free(struct kref *kref);
static void rbd_spec_put(struct rbd_spec *spec)
{
	if (spec)
		kref_put(&spec->kref, rbd_spec_free);
}

static struct rbd_spec *rbd_spec_alloc(void)
{
	struct rbd_spec *spec;

	spec = kzalloc(sizeof (*spec), GFP_KERNEL);
	if (!spec)
		return NULL;

	spec->pool_id = CEPH_NOPOOL;
	spec->snap_id = CEPH_NOSNAP;
	kref_init(&spec->kref);

	return spec;
}

static void rbd_spec_free(struct kref *kref)
{
	struct rbd_spec *spec = container_of(kref, struct rbd_spec, kref);

	kfree(spec->pool_name);
	kfree(spec->image_id);
	kfree(spec->image_name);
	kfree(spec->snap_name);
	kfree(spec);
}

static void rbd_dev_free(struct rbd_device *rbd_dev)
{
	WARN_ON(rbd_dev->watch_state != RBD_WATCH_STATE_UNREGISTERED);
	WARN_ON(rbd_dev->lock_state != RBD_LOCK_STATE_UNLOCKED);

	ceph_oid_destroy(&rbd_dev->header_oid);
	ceph_oloc_destroy(&rbd_dev->header_oloc);
	kfree(rbd_dev->config_info);

	rbd_put_client(rbd_dev->rbd_client);
	rbd_spec_put(rbd_dev->spec);
	kfree(rbd_dev->opts);
	kfree(rbd_dev);
}

static void rbd_dev_release(struct device *dev)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
	bool need_put = !!rbd_dev->opts;

	if (need_put) {
		destroy_workqueue(rbd_dev->task_wq);
		ida_simple_remove(&rbd_dev_id_ida, rbd_dev->dev_id);
	}

	rbd_dev_free(rbd_dev);

	/*
	 * This is racy, but way better than putting module outside of
	 * the release callback.  The race window is pretty small, so
	 * doing something similar to dm (dm-builtin.c) is overkill.
	 */
	if (need_put)
		module_put(THIS_MODULE);
}

static struct rbd_device *__rbd_dev_create(struct rbd_client *rbdc,
					   struct rbd_spec *spec)
{
	struct rbd_device *rbd_dev;

	rbd_dev = kzalloc(sizeof(*rbd_dev), GFP_KERNEL);
	if (!rbd_dev)
		return NULL;

	spin_lock_init(&rbd_dev->lock);
	INIT_LIST_HEAD(&rbd_dev->node);
	init_rwsem(&rbd_dev->header_rwsem);

	rbd_dev->header.data_pool_id = CEPH_NOPOOL;
	ceph_oid_init(&rbd_dev->header_oid);
	rbd_dev->header_oloc.pool = spec->pool_id;

	mutex_init(&rbd_dev->watch_mutex);
	rbd_dev->watch_state = RBD_WATCH_STATE_UNREGISTERED;
	INIT_DELAYED_WORK(&rbd_dev->watch_dwork, rbd_reregister_watch);

	init_rwsem(&rbd_dev->lock_rwsem);
	rbd_dev->lock_state = RBD_LOCK_STATE_UNLOCKED;
	INIT_WORK(&rbd_dev->acquired_lock_work, rbd_notify_acquired_lock);
	INIT_WORK(&rbd_dev->released_lock_work, rbd_notify_released_lock);
	INIT_DELAYED_WORK(&rbd_dev->lock_dwork, rbd_acquire_lock);
	INIT_WORK(&rbd_dev->unlock_work, rbd_release_lock_work);
	init_waitqueue_head(&rbd_dev->lock_waitq);

	rbd_dev->dev.bus = &rbd_bus_type;
	rbd_dev->dev.type = &rbd_device_type;
	rbd_dev->dev.parent = &rbd_root_dev;
	device_initialize(&rbd_dev->dev);

	rbd_dev->rbd_client = rbdc;
	rbd_dev->spec = spec;

	return rbd_dev;
}

/*
 * Create a mapping rbd_dev.
 */
static struct rbd_device *rbd_dev_create(struct rbd_client *rbdc,
					 struct rbd_spec *spec,
					 struct rbd_options *opts)
{
	struct rbd_device *rbd_dev;

	rbd_dev = __rbd_dev_create(rbdc, spec);
	if (!rbd_dev)
		return NULL;

	rbd_dev->opts = opts;

	/* get an id and fill in device name */
	rbd_dev->dev_id = ida_simple_get(&rbd_dev_id_ida, 0,
					 minor_to_rbd_dev_id(1 << MINORBITS),
					 GFP_KERNEL);
	if (rbd_dev->dev_id < 0)
		goto fail_rbd_dev;

	sprintf(rbd_dev->name, RBD_DRV_NAME "%d", rbd_dev->dev_id);
	rbd_dev->task_wq = alloc_ordered_workqueue("%s-tasks", WQ_MEM_RECLAIM,
						   rbd_dev->name);
	if (!rbd_dev->task_wq)
		goto fail_dev_id;

	/* we have a ref from do_rbd_add() */
	__module_get(THIS_MODULE);

	dout("%s rbd_dev %p dev_id %d\n", __func__, rbd_dev, rbd_dev->dev_id);
	return rbd_dev;

fail_dev_id:
	ida_simple_remove(&rbd_dev_id_ida, rbd_dev->dev_id);
fail_rbd_dev:
	rbd_dev_free(rbd_dev);
	return NULL;
}

static void rbd_dev_destroy(struct rbd_device *rbd_dev)
{
	if (rbd_dev)
		put_device(&rbd_dev->dev);
}

/*
 * Get the size and object order for an image snapshot, or if
 * snap_id is CEPH_NOSNAP, get this information for the base
 * image.
 */
static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id,
				 u8 *order, u64 *snap_size)
{
	__le64 snapid = cpu_to_le64(snap_id);
	int ret;
	struct {
		u8 order;
		__le64 size;
	} __attribute__ ((packed)) size_buf = { 0 };

	ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
				  &rbd_dev->header_oloc, "get_size",
				  &snapid, sizeof(snapid),
				  &size_buf, sizeof(size_buf));
	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
	if (ret < 0)
		return ret;
	if (ret < sizeof (size_buf))
		return -ERANGE;

	if (order) {
		*order = size_buf.order;
		dout(" order %u", (unsigned int)*order);
	}
	*snap_size = le64_to_cpu(size_buf.size);

	dout(" snap_id 0x%016llx snap_size = %llu\n",
	     (unsigned long long)snap_id,
	     (unsigned long long)*snap_size);

	return 0;
}

static int rbd_dev_v2_image_size(struct rbd_device *rbd_dev)
{
	return _rbd_dev_v2_snap_size(rbd_dev, CEPH_NOSNAP,
				     &rbd_dev->header.obj_order,
				     &rbd_dev->header.image_size);
}
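
/*
 * A note on "order": the order byte in the get_size reply is the log2
 * of the image's object size (rbd_obj_bytes() computes 1 << obj_order),
 * so e.g. an order of 22 means the image is carved into 4 MiB objects.
 * The value 22 here is just an illustration, not a requirement.
 */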

static int rbd_dev_v2_object_prefix(struct rbd_device *rbd_dev)
{
	void *reply_buf;
	int ret;
	void *p;

	reply_buf = kzalloc(RBD_OBJ_PREFIX_LEN_MAX, GFP_KERNEL);
	if (!reply_buf)
		return -ENOMEM;

	ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
				  &rbd_dev->header_oloc, "get_object_prefix",
				  NULL, 0, reply_buf, RBD_OBJ_PREFIX_LEN_MAX);
	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
	if (ret < 0)
		goto out;

	p = reply_buf;
	rbd_dev->header.object_prefix = ceph_extract_encoded_string(&p,
						p + ret, NULL, GFP_NOIO);
	ret = 0;

	if (IS_ERR(rbd_dev->header.object_prefix)) {
		ret = PTR_ERR(rbd_dev->header.object_prefix);
		rbd_dev->header.object_prefix = NULL;
	} else {
		dout(" object_prefix = %s\n", rbd_dev->header.object_prefix);
	}
out:
	kfree(reply_buf);

	return ret;
}

static int _rbd_dev_v2_snap_features(struct rbd_device *rbd_dev, u64 snap_id,
				     u64 *snap_features)
{
	__le64 snapid = cpu_to_le64(snap_id);
	struct {
		__le64 features;
		__le64 incompat;
	} __attribute__ ((packed)) features_buf = { 0 };
	u64 unsup;
	int ret;

	ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
				  &rbd_dev->header_oloc, "get_features",
				  &snapid, sizeof(snapid),
				  &features_buf, sizeof(features_buf));
	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
	if (ret < 0)
		return ret;
	if (ret < sizeof (features_buf))
		return -ERANGE;

	unsup = le64_to_cpu(features_buf.incompat) & ~RBD_FEATURES_SUPPORTED;
	if (unsup) {
		rbd_warn(rbd_dev, "image uses unsupported features: 0x%llx",
			 unsup);
		return -ENXIO;
	}

	*snap_features = le64_to_cpu(features_buf.features);

	dout(" snap_id 0x%016llx features = 0x%016llx incompat = 0x%016llx\n",
	     (unsigned long long)snap_id,
	     (unsigned long long)*snap_features,
	     (unsigned long long)le64_to_cpu(features_buf.incompat));

	return 0;
}

static int rbd_dev_v2_features(struct rbd_device *rbd_dev)
{
	return _rbd_dev_v2_snap_features(rbd_dev, CEPH_NOSNAP,
					 &rbd_dev->header.features);
}
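
/*
 * Sketch of the semantics assumed above: "features" is the image's
 * full feature bit mask, while "incompat" is the subset a client must
 * understand in order to map the image at all.  Mapping is refused
 * only when an incompat bit falls outside RBD_FEATURES_SUPPORTED.
 * For example (bit assignments hypothetical):
 *
 *	features = LAYERING | EXCLUSIVE_LOCK | FAST_DIFF
 *	incompat = LAYERING | EXCLUSIVE_LOCK
 *
 * maps fine on a client that supports the two incompat bits, even if
 * it knows nothing about FAST_DIFF.
 */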

static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev)
{
	struct rbd_spec *parent_spec;
	size_t size;
	void *reply_buf = NULL;
	__le64 snapid;
	void *p;
	void *end;
	u64 pool_id;
	char *image_id;
	u64 snap_id;
	u64 overlap;
	int ret;

	parent_spec = rbd_spec_alloc();
	if (!parent_spec)
		return -ENOMEM;

	size = sizeof (__le64) +				/* pool_id */
		sizeof (__le32) + RBD_IMAGE_ID_LEN_MAX +	/* image_id */
		sizeof (__le64) +				/* snap_id */
		sizeof (__le64);				/* overlap */
	reply_buf = kmalloc(size, GFP_KERNEL);
	if (!reply_buf) {
		ret = -ENOMEM;
		goto out_err;
	}

	snapid = cpu_to_le64(rbd_dev->spec->snap_id);
	ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
				  &rbd_dev->header_oloc, "get_parent",
				  &snapid, sizeof(snapid), reply_buf, size);
	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
	if (ret < 0)
		goto out_err;

	p = reply_buf;
	end = reply_buf + ret;
	ret = -ERANGE;
	ceph_decode_64_safe(&p, end, pool_id, out_err);
	if (pool_id == CEPH_NOPOOL) {
		/*
		 * Either the parent never existed, or we have a
		 * record of it but the image got flattened so it no
		 * longer has a parent.  When the parent of a
		 * layered image disappears we immediately set the
		 * overlap to 0.  The effect of this is that all new
		 * requests will be treated as if the image had no
		 * parent.
		 */
		if (rbd_dev->parent_overlap) {
			rbd_dev->parent_overlap = 0;
			rbd_dev_parent_put(rbd_dev);
			pr_info("%s: clone image has been flattened\n",
				rbd_dev->disk->disk_name);
		}

		goto out;	/* No parent?  No problem. */
	}

	/* The ceph file layout needs to fit pool id in 32 bits */

	ret = -EIO;
	if (pool_id > (u64)U32_MAX) {
		rbd_warn(NULL, "parent pool id too large (%llu > %u)",
			 (unsigned long long)pool_id, U32_MAX);
		goto out_err;
	}

	image_id = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL);
	if (IS_ERR(image_id)) {
		ret = PTR_ERR(image_id);
		goto out_err;
	}
	ceph_decode_64_safe(&p, end, snap_id, out_err);
	ceph_decode_64_safe(&p, end, overlap, out_err);

	/*
	 * The parent won't change (except when the clone is
	 * flattened, already handled that).  So we only need to
	 * record the parent spec if we have not already done so.
	 */
	if (!rbd_dev->parent_spec) {
		parent_spec->pool_id = pool_id;
		parent_spec->image_id = image_id;
		parent_spec->snap_id = snap_id;
		rbd_dev->parent_spec = parent_spec;
		parent_spec = NULL;	/* rbd_dev now owns this */
	} else {
		kfree(image_id);
	}

	/*
	 * We always update the parent overlap.  If it's zero we issue
	 * a warning, as we will proceed as if there was no parent.
	 */
	if (!overlap) {
		if (parent_spec) {
			/* refresh, careful to warn just once */
			if (rbd_dev->parent_overlap)
				rbd_warn(rbd_dev,
				    "clone now standalone (overlap became 0)");
		} else {
			/* initial probe */
			rbd_warn(rbd_dev, "clone is standalone (overlap 0)");
		}
	}
	rbd_dev->parent_overlap = overlap;

out:
	ret = 0;
out_err:
	kfree(reply_buf);
	rbd_spec_put(parent_spec);

	return ret;
}

static int rbd_dev_v2_striping_info(struct rbd_device *rbd_dev)
{
	struct {
		__le64 stripe_unit;
		__le64 stripe_count;
	} __attribute__ ((packed)) striping_info_buf = { 0 };
	size_t size = sizeof (striping_info_buf);
	void *p;
	u64 obj_size;
	u64 stripe_unit;
	u64 stripe_count;
	int ret;

	ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
				  &rbd_dev->header_oloc,
				  "get_stripe_unit_count",
				  NULL, 0, &striping_info_buf, size);
	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
	if (ret < 0)
		return ret;
	if (ret < size)
		return -ERANGE;

	/*
	 * We don't actually support the "fancy striping" feature
	 * (STRIPINGV2) yet, but if the striping sizes are the
	 * defaults the behavior is the same as before.  So find
	 * out, and only fail if the image has non-default values.
	 */
	ret = -EINVAL;
	obj_size = rbd_obj_bytes(&rbd_dev->header);
	p = &striping_info_buf;
	stripe_unit = ceph_decode_64(&p);
	if (stripe_unit != obj_size) {
		rbd_warn(rbd_dev, "unsupported stripe unit "
			 "(got %llu want %llu)",
			 stripe_unit, obj_size);
		return -EINVAL;
	}
	stripe_count = ceph_decode_64(&p);
	if (stripe_count != 1) {
		rbd_warn(rbd_dev, "unsupported stripe count "
			 "(got %llu want 1)", stripe_count);
		return -EINVAL;
	}
	rbd_dev->header.stripe_unit = stripe_unit;
	rbd_dev->header.stripe_count = stripe_count;

	return 0;
}

static int rbd_dev_v2_data_pool(struct rbd_device *rbd_dev)
{
	__le64 data_pool_id;
	int ret;

	ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
				  &rbd_dev->header_oloc, "get_data_pool",
				  NULL, 0, &data_pool_id, sizeof(data_pool_id));
	if (ret < 0)
		return ret;
	if (ret < sizeof(data_pool_id))
		return -EBADMSG;

	rbd_dev->header.data_pool_id = le64_to_cpu(data_pool_id);
	WARN_ON(rbd_dev->header.data_pool_id == CEPH_NOPOOL);
	return 0;
}
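
/*
 * Usage note (a sketch, not taken from this file): a separate data
 * pool typically comes from creating the image with something like
 *
 *	$ rbd create --size 1G --data-pool ec_pool rbd/foo
 *
 * in which case the header object stays in the metadata pool while
 * data objects land in "ec_pool"; the data_pool_id recorded above is
 * what redirects the data path.  Pool and image names here are made up.
 */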

static char *rbd_dev_image_name(struct rbd_device *rbd_dev)
{
	CEPH_DEFINE_OID_ONSTACK(oid);
	size_t image_id_size;
	char *image_id;
	void *p;
	void *end;
	size_t size;
	void *reply_buf = NULL;
	size_t len = 0;
	char *image_name = NULL;
	int ret;

	rbd_assert(!rbd_dev->spec->image_name);

	len = strlen(rbd_dev->spec->image_id);
	image_id_size = sizeof (__le32) + len;
	image_id = kmalloc(image_id_size, GFP_KERNEL);
	if (!image_id)
		return NULL;

	p = image_id;
	end = image_id + image_id_size;
	ceph_encode_string(&p, end, rbd_dev->spec->image_id, (u32)len);

	size = sizeof (__le32) + RBD_IMAGE_NAME_LEN_MAX;
	reply_buf = kmalloc(size, GFP_KERNEL);
	if (!reply_buf)
		goto out;

	ceph_oid_printf(&oid, "%s", RBD_DIRECTORY);
	ret = rbd_obj_method_sync(rbd_dev, &oid, &rbd_dev->header_oloc,
				  "dir_get_name", image_id, image_id_size,
				  reply_buf, size);
	if (ret < 0)
		goto out;
	p = reply_buf;
	end = reply_buf + ret;

	image_name = ceph_extract_encoded_string(&p, end, &len, GFP_KERNEL);
	if (IS_ERR(image_name))
		image_name = NULL;
	else
		dout("%s: name is %s len is %zd\n", __func__, image_name, len);
out:
	kfree(reply_buf);
	kfree(image_id);

	return image_name;
}

static u64 rbd_v1_snap_id_by_name(struct rbd_device *rbd_dev, const char *name)
{
	struct ceph_snap_context *snapc = rbd_dev->header.snapc;
	const char *snap_name;
	u32 which = 0;

	/* Skip over names until we find the one we are looking for */

	snap_name = rbd_dev->header.snap_names;
	while (which < snapc->num_snaps) {
		if (!strcmp(name, snap_name))
			return snapc->snaps[which];
		snap_name += strlen(snap_name) + 1;
		which++;
	}
	return CEPH_NOSNAP;
}

static u64 rbd_v2_snap_id_by_name(struct rbd_device *rbd_dev, const char *name)
{
	struct ceph_snap_context *snapc = rbd_dev->header.snapc;
	u32 which;
	bool found = false;
	u64 snap_id;

	for (which = 0; !found && which < snapc->num_snaps; which++) {
		const char *snap_name;

		snap_id = snapc->snaps[which];
		snap_name = rbd_dev_v2_snap_name(rbd_dev, snap_id);
		if (IS_ERR(snap_name)) {
			/* ignore no-longer existing snapshots */
			if (PTR_ERR(snap_name) == -ENOENT)
				continue;
			else
				break;
		}
		found = !strcmp(name, snap_name);
		kfree(snap_name);
	}
	return found ? snap_id : CEPH_NOSNAP;
}

/*
 * Assumes name is never RBD_SNAP_HEAD_NAME; returns CEPH_NOSNAP if
 * no snapshot by that name is found, or if an error occurs.
 */
static u64 rbd_snap_id_by_name(struct rbd_device *rbd_dev, const char *name)
{
	if (rbd_dev->image_format == 1)
		return rbd_v1_snap_id_by_name(rbd_dev, name);

	return rbd_v2_snap_id_by_name(rbd_dev, name);
}
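
/*
 * To illustrate the difference between the two lookups above: a v1
 * header keeps all snapshot names in one in-memory blob of
 * NUL-terminated strings, so the v1 path is a pure string walk over
 * snap_names.  A v2 image stores names per snapshot, so the v2 path
 * issues a "get_snapshot_name" class method call per snapshot id until
 * a name matches; snapshots deleted in the meantime come back as
 * -ENOENT and are simply skipped.
 */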

/*
 * An image being mapped will have everything but the snap id.
 */
static int rbd_spec_fill_snap_id(struct rbd_device *rbd_dev)
{
	struct rbd_spec *spec = rbd_dev->spec;

	rbd_assert(spec->pool_id != CEPH_NOPOOL && spec->pool_name);
	rbd_assert(spec->image_id && spec->image_name);
	rbd_assert(spec->snap_name);

	if (strcmp(spec->snap_name, RBD_SNAP_HEAD_NAME)) {
		u64 snap_id;

		snap_id = rbd_snap_id_by_name(rbd_dev, spec->snap_name);
		if (snap_id == CEPH_NOSNAP)
			return -ENOENT;

		spec->snap_id = snap_id;
	} else {
		spec->snap_id = CEPH_NOSNAP;
	}

	return 0;
}

/*
 * A parent image will have all ids but none of the names.
 *
 * All names in an rbd spec are dynamically allocated.  It's OK if we
 * can't figure out the name for an image id.
 */
static int rbd_spec_fill_names(struct rbd_device *rbd_dev)
{
	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
	struct rbd_spec *spec = rbd_dev->spec;
	const char *pool_name;
	const char *image_name;
	const char *snap_name;
	int ret;

	rbd_assert(spec->pool_id != CEPH_NOPOOL);
	rbd_assert(spec->image_id);
	rbd_assert(spec->snap_id != CEPH_NOSNAP);

	/* Get the pool name; we have to make our own copy of this */

	pool_name = ceph_pg_pool_name_by_id(osdc->osdmap, spec->pool_id);
	if (!pool_name) {
		rbd_warn(rbd_dev, "no pool with id %llu", spec->pool_id);
		return -EIO;
	}
	pool_name = kstrdup(pool_name, GFP_KERNEL);
	if (!pool_name)
		return -ENOMEM;

	/* Fetch the image name; tolerate failure here */

	image_name = rbd_dev_image_name(rbd_dev);
	if (!image_name)
		rbd_warn(rbd_dev, "unable to get image name");

	/* Fetch the snapshot name */

	snap_name = rbd_snap_name(rbd_dev, spec->snap_id);
	if (IS_ERR(snap_name)) {
		ret = PTR_ERR(snap_name);
		goto out_err;
	}

	spec->pool_name = pool_name;
	spec->image_name = image_name;
	spec->snap_name = snap_name;

	return 0;

out_err:
	kfree(image_name);
	kfree(pool_name);
	return ret;
}

static int rbd_dev_v2_snap_context(struct rbd_device *rbd_dev)
{
	size_t size;
	int ret;
	void *reply_buf;
	void *p;
	void *end;
	u64 seq;
	u32 snap_count;
	struct ceph_snap_context *snapc;
	u32 i;

	/*
	 * We'll need room for the seq value (maximum snapshot id),
	 * snapshot count, and array of that many snapshot ids.
	 * For now we have a fixed upper limit on the number we're
	 * prepared to receive.
	 */
	size = sizeof (__le64) + sizeof (__le32) +
			RBD_MAX_SNAP_COUNT * sizeof (__le64);
	reply_buf = kzalloc(size, GFP_KERNEL);
	if (!reply_buf)
		return -ENOMEM;

	ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
				  &rbd_dev->header_oloc, "get_snapcontext",
				  NULL, 0, reply_buf, size);
	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
	if (ret < 0)
		goto out;

	p = reply_buf;
	end = reply_buf + ret;
	ret = -ERANGE;
	ceph_decode_64_safe(&p, end, seq, out);
	ceph_decode_32_safe(&p, end, snap_count, out);

	/*
	 * Make sure the reported number of snapshot ids wouldn't go
	 * beyond the end of our buffer.  But before checking that,
	 * make sure the computed size of the snapshot context we
	 * allocate is representable in a size_t.
	 */
	if (snap_count > (SIZE_MAX - sizeof (struct ceph_snap_context))
				 / sizeof (u64)) {
		ret = -EINVAL;
		goto out;
	}
	if (!ceph_has_room(&p, end, snap_count * sizeof (__le64)))
		goto out;
	ret = 0;

	snapc = ceph_create_snap_context(snap_count, GFP_KERNEL);
	if (!snapc) {
		ret = -ENOMEM;
		goto out;
	}
	snapc->seq = seq;
	for (i = 0; i < snap_count; i++)
		snapc->snaps[i] = ceph_decode_64(&p);

	ceph_put_snap_context(rbd_dev->header.snapc);
	rbd_dev->header.snapc = snapc;

	dout(" snap context seq = %llu, snap_count = %u\n",
	     (unsigned long long)seq, (unsigned int)snap_count);
out:
	kfree(reply_buf);

	return ret;
}
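
/*
 * For reference, the get_snapcontext reply decoded above is laid out
 * as (sizes in bytes, all fields little-endian):
 *
 *	seq         8	highest snapshot id ever issued
 *	snap_count  4	number of ids that follow
 *	snaps[]     8*N	snapshot ids, held newest-first in the
 *			resulting ceph_snap_context
 *
 * which is exactly the sequence the ceph_decode_* calls walk through.
 */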

static const char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev,
					u64 snap_id)
{
	size_t size;
	void *reply_buf;
	__le64 snapid;
	int ret;
	void *p;
	void *end;
	char *snap_name;

	size = sizeof (__le32) + RBD_MAX_SNAP_NAME_LEN;
	reply_buf = kmalloc(size, GFP_KERNEL);
	if (!reply_buf)
		return ERR_PTR(-ENOMEM);

	snapid = cpu_to_le64(snap_id);
	ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
				  &rbd_dev->header_oloc, "get_snapshot_name",
				  &snapid, sizeof(snapid), reply_buf, size);
	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
	if (ret < 0) {
		snap_name = ERR_PTR(ret);
		goto out;
	}

	p = reply_buf;
	end = reply_buf + ret;
	snap_name = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL);
	if (IS_ERR(snap_name))
		goto out;

	dout(" snap_id 0x%016llx snap_name = %s\n",
	     (unsigned long long)snap_id, snap_name);
out:
	kfree(reply_buf);

	return snap_name;
}

static int rbd_dev_v2_header_info(struct rbd_device *rbd_dev)
{
	bool first_time = rbd_dev->header.object_prefix == NULL;
	int ret;

	ret = rbd_dev_v2_image_size(rbd_dev);
	if (ret)
		return ret;

	if (first_time) {
		ret = rbd_dev_v2_header_onetime(rbd_dev);
		if (ret)
			return ret;
	}

	ret = rbd_dev_v2_snap_context(rbd_dev);
	if (ret && first_time) {
		kfree(rbd_dev->header.object_prefix);
		rbd_dev->header.object_prefix = NULL;
	}

	return ret;
}

static int rbd_dev_header_info(struct rbd_device *rbd_dev)
{
	rbd_assert(rbd_image_format_valid(rbd_dev->image_format));

	if (rbd_dev->image_format == 1)
		return rbd_dev_v1_header_info(rbd_dev);

	return rbd_dev_v2_header_info(rbd_dev);
}

/*
 * Skips over white space at *buf, and updates *buf to point to the
 * first found non-space character (if any).  Returns the length of
 * the token (string of non-white space characters) found.  Note
 * that *buf must be terminated with '\0'.
 */
static inline size_t next_token(const char **buf)
{
	/*
	 * These are the characters that produce nonzero for
	 * isspace() in the "C" and "POSIX" locales.
	 */
	const char *spaces = " \f\n\r\t\v";

	*buf += strspn(*buf, spaces);	/* Find start of token */

	return strcspn(*buf, spaces);	/* Return token length */
}
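
/*
 * For example, with *buf pointing at "  pool image -", next_token()
 * advances *buf to "pool image -" and returns 4 -- the caller then
 * consumes those four bytes and calls again for the next token.
 */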

/*
 * Finds the next token in *buf, dynamically allocates a buffer big
 * enough to hold a copy of it, and copies the token into the new
 * buffer.  The copy is guaranteed to be terminated with '\0'.  Note
 * that a duplicate buffer is created even for a zero-length token.
 *
 * Returns a pointer to the newly-allocated duplicate, or a null
 * pointer if memory for the duplicate was not available.  If
 * the lenp argument is a non-null pointer, the length of the token
 * (not including the '\0') is returned in *lenp.
 *
 * If successful, the *buf pointer will be updated to point beyond
 * the end of the found token.
 *
 * Note: uses GFP_KERNEL for allocation.
 */
static inline char *dup_token(const char **buf, size_t *lenp)
{
	char *dup;
	size_t len;

	len = next_token(buf);
	dup = kmemdup(*buf, len + 1, GFP_KERNEL);
	if (!dup)
		return NULL;
	*(dup + len) = '\0';
	*buf += len;

	if (lenp)
		*lenp = len;

	return dup;
}
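
/*
 * A subtle point worth spelling out: kmemdup() above copies len + 1
 * bytes starting at the token, so the extra byte it grabs is whatever
 * character follows the token in *buf; the explicit '\0' store
 * immediately afterwards is what actually terminates the duplicate.
 */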

/*
 * Parse the options provided for an "rbd add" (i.e., rbd image
 * mapping) request.  These arrive via a write to /sys/bus/rbd/add,
 * and the data written is passed here via a NUL-terminated buffer.
 * Returns 0 if successful or an error code otherwise.
 *
 * The information extracted from these options is recorded in
 * the other parameters which return dynamically-allocated
 * structures:
 *  ceph_opts
 *      The address of a pointer that will refer to a ceph options
 *      structure.  Caller must release the returned pointer using
 *      ceph_destroy_options() when it is no longer needed.
 *  rbd_opts
 *	Address of an rbd options pointer.  Fully initialized by
 *	this function; caller must release with kfree().
 *  spec
 *	Address of an rbd image specification pointer.  Fully
 *	initialized by this function based on parsed options.
 *	Caller must release with rbd_spec_put().
 *
 * The options passed take this form:
 *  <mon_addrs> <options> <pool_name> <image_name> [<snap_name>]
 * where:
 *  <mon_addrs>
 *      A comma-separated list of one or more monitor addresses.
 *      A monitor address is an ip address, optionally followed
 *      by a port number (separated by a colon).
 *        I.e.:  ip1[:port1][,ip2[:port2]...]
 *  <options>
 *      A comma-separated list of ceph and/or rbd options.
 *  <pool_name>
 *      The name of the rados pool containing the rbd image.
 *  <image_name>
 *      The name of the image in that pool to map.
 *  <snap_name>
 *      An optional snapshot name.  If provided, the mapping will
 *      present data from the image at the time that snapshot was
 *      created.  The image head is used if no snapshot name is
 *      provided.  Snapshot mappings are always read-only.
 */
static int rbd_add_parse_args(const char *buf,
				struct ceph_options **ceph_opts,
				struct rbd_options **opts,
				struct rbd_spec **rbd_spec)
{
	size_t len;
	char *options;
	const char *mon_addrs;
	char *snap_name;
	size_t mon_addrs_size;
	struct rbd_spec *spec = NULL;
	struct rbd_options *rbd_opts = NULL;
	struct ceph_options *copts;
	int ret;

	/* The first four tokens are required */

	len = next_token(&buf);
	if (!len) {
		rbd_warn(NULL, "no monitor address(es) provided");
		return -EINVAL;
	}
	mon_addrs = buf;
	mon_addrs_size = len + 1;
	buf += len;

	ret = -EINVAL;
	options = dup_token(&buf, NULL);
	if (!options)
		return -ENOMEM;
	if (!*options) {
		rbd_warn(NULL, "no options provided");
		goto out_err;
	}

	spec = rbd_spec_alloc();
	if (!spec)
		goto out_mem;

	spec->pool_name = dup_token(&buf, NULL);
	if (!spec->pool_name)
		goto out_mem;
	if (!*spec->pool_name) {
		rbd_warn(NULL, "no pool name provided");
		goto out_err;
	}

	spec->image_name = dup_token(&buf, NULL);
	if (!spec->image_name)
		goto out_mem;
	if (!*spec->image_name) {
		rbd_warn(NULL, "no image name provided");
		goto out_err;
	}

	/*
	 * Snapshot name is optional; default is to use "-"
	 * (indicating the head/no snapshot).
	 */
	len = next_token(&buf);
	if (!len) {
		buf = RBD_SNAP_HEAD_NAME; /* No snapshot supplied */
		len = sizeof (RBD_SNAP_HEAD_NAME) - 1;
	} else if (len > RBD_MAX_SNAP_NAME_LEN) {
		ret = -ENAMETOOLONG;
		goto out_err;
	}
	snap_name = kmemdup(buf, len + 1, GFP_KERNEL);
	if (!snap_name)
		goto out_mem;
	*(snap_name + len) = '\0';
	spec->snap_name = snap_name;

	/* Initialize all rbd options to the defaults */

	rbd_opts = kzalloc(sizeof (*rbd_opts), GFP_KERNEL);
	if (!rbd_opts)
		goto out_mem;

	rbd_opts->read_only = RBD_READ_ONLY_DEFAULT;
	rbd_opts->queue_depth = RBD_QUEUE_DEPTH_DEFAULT;
	rbd_opts->lock_on_read = RBD_LOCK_ON_READ_DEFAULT;
	rbd_opts->exclusive = RBD_EXCLUSIVE_DEFAULT;

	copts = ceph_parse_options(options, mon_addrs,
					mon_addrs + mon_addrs_size - 1,
					parse_rbd_opts_token, rbd_opts);
	if (IS_ERR(copts)) {
		ret = PTR_ERR(copts);
		goto out_err;
	}
	kfree(options);

	*ceph_opts = copts;
	*opts = rbd_opts;
	*rbd_spec = spec;

	return 0;
out_mem:
	ret = -ENOMEM;
out_err:
	kfree(rbd_opts);
	rbd_spec_put(spec);
	kfree(options);

	return ret;
}
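
/*
 * Tying the grammar above together, a typical write to /sys/bus/rbd/add
 * might look like this (monitor address, key and names are made up for
 * the example):
 *
 *	# echo "1.2.3.4:6789 name=admin,secret=AQDvuN... rbd foo -" \
 *	    > /sys/bus/rbd/add
 *
 * i.e. one monitor, an options blob handed to ceph_parse_options(),
 * pool "rbd", image "foo", and "-" for "map the head, no snapshot".
 */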

/*
 * Return pool id (>= 0) or a negative error code.
 */
static int rbd_add_get_pool_id(struct rbd_client *rbdc, const char *pool_name)
{
	struct ceph_options *opts = rbdc->client->options;
	u64 newest_epoch;
	int tries = 0;
	int ret;

again:
	ret = ceph_pg_poolid_by_name(rbdc->client->osdc.osdmap, pool_name);
	if (ret == -ENOENT && tries++ < 1) {
		ret = ceph_monc_get_version(&rbdc->client->monc, "osdmap",
					    &newest_epoch);
		if (ret < 0)
			return ret;

		if (rbdc->client->osdc.osdmap->epoch < newest_epoch) {
			ceph_osdc_maybe_request_map(&rbdc->client->osdc);
			(void) ceph_monc_wait_osdmap(&rbdc->client->monc,
						     newest_epoch,
						     opts->mount_timeout);
			goto again;
		} else {
			/* the osdmap we have is new enough */
			return -ENOENT;
		}
	}

	return ret;
}

static void rbd_dev_image_unlock(struct rbd_device *rbd_dev)
{
	down_write(&rbd_dev->lock_rwsem);
	if (__rbd_is_lock_owner(rbd_dev))
		rbd_unlock(rbd_dev);
	up_write(&rbd_dev->lock_rwsem);
}

static int rbd_add_acquire_lock(struct rbd_device *rbd_dev)
{
	if (!(rbd_dev->header.features & RBD_FEATURE_EXCLUSIVE_LOCK)) {
		rbd_warn(rbd_dev, "exclusive-lock feature is not enabled");
		return -EINVAL;
	}

	/* FIXME: "rbd map --exclusive" should be interruptible */
	down_read(&rbd_dev->lock_rwsem);
	rbd_wait_state_locked(rbd_dev);
	up_read(&rbd_dev->lock_rwsem);
	if (test_bit(RBD_DEV_FLAG_BLACKLISTED, &rbd_dev->flags)) {
		rbd_warn(rbd_dev, "failed to acquire exclusive lock");
		return -EROFS;
	}

	return 0;
}

/*
 * An rbd format 2 image has a unique identifier, distinct from the
 * name given to it by the user.  Internally, that identifier is
 * what's used to specify the names of objects related to the image.
 *
 * A special "rbd id" object is used to map an rbd image name to its
 * id.  If that object doesn't exist, then there is no v2 rbd image
 * with the supplied name.
 *
 * This function will record the given rbd_dev's image_id field if
 * it can be determined, and in that case will return 0.  If any
 * errors occur a negative errno will be returned and the rbd_dev's
 * image_id field will be unchanged (and should be NULL).
 */
static int rbd_dev_image_id(struct rbd_device *rbd_dev)
{
	int ret;
	size_t size;
	CEPH_DEFINE_OID_ONSTACK(oid);
	void *response;
	char *image_id;

	/*
	 * When probing a parent image, the image id is already
	 * known (and the image name likely is not).  There's no
	 * need to fetch the image id again in this case.  We
	 * do still need to set the image format though.
	 */
	if (rbd_dev->spec->image_id) {
		rbd_dev->image_format = *rbd_dev->spec->image_id ? 2 : 1;

		return 0;
	}

	/*
	 * First, see if the format 2 image id file exists, and if
	 * so, get the image's persistent id from it.
	 */
	ret = ceph_oid_aprintf(&oid, GFP_KERNEL, "%s%s", RBD_ID_PREFIX,
			       rbd_dev->spec->image_name);
	if (ret)
		return ret;

	dout("rbd id object name is %s\n", oid.name);

	/* Response will be an encoded string, which includes a length */

	size = sizeof (__le32) + RBD_IMAGE_ID_LEN_MAX;
	response = kzalloc(size, GFP_NOIO);
	if (!response) {
		ret = -ENOMEM;
		goto out;
	}

	/* If it doesn't exist we'll assume it's a format 1 image */

	ret = rbd_obj_method_sync(rbd_dev, &oid, &rbd_dev->header_oloc,
				  "get_id", NULL, 0,
				  response, RBD_IMAGE_ID_LEN_MAX);
	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
	if (ret == -ENOENT) {
		image_id = kstrdup("", GFP_KERNEL);
		ret = image_id ? 0 : -ENOMEM;
		if (!ret)
			rbd_dev->image_format = 1;
	} else if (ret >= 0) {
		void *p = response;

		image_id = ceph_extract_encoded_string(&p, p + ret,
						NULL, GFP_NOIO);
		ret = PTR_ERR_OR_ZERO(image_id);
		if (!ret)
			rbd_dev->image_format = 2;
	}

	if (!ret) {
		rbd_dev->spec->image_id = image_id;
		dout("image_id is %s\n", image_id);
	}
out:
	kfree(response);
	ceph_oid_destroy(&oid);
	return ret;
}
5380
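/*
 * Hedged illustration, not used by the driver: the "get_id" reply
 * decoded above is a ceph-encoded string, i.e. a little-endian
 * 32-bit length followed by that many bytes.  The helper below
 * (made-up name) is a stripped-down stand-in for
 * ceph_extract_encoded_string(); the real helper also advances the
 * caller's cursor past the consumed bytes.
 */
static __maybe_unused char *rbd_example_extract_string(void *p, void *end,
						       gfp_t gfp)
{
	u32 len;

	if (p + sizeof(__le32) > end)
		return ERR_PTR(-ERANGE);
	len = le32_to_cpu(*(__le32 *)p);
	if (p + sizeof(__le32) + len > end)
		return ERR_PTR(-ERANGE);

	/* copy the payload and NUL-terminate it */
	return kmemdup_nul(p + sizeof(__le32), len, gfp) ?: ERR_PTR(-ENOMEM);
}
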
/*
 * Undo whatever state changes are made by a v1 or v2 header info
 * call.
 */
static void rbd_dev_unprobe(struct rbd_device *rbd_dev)
{
	struct rbd_image_header *header;

	rbd_dev_parent_put(rbd_dev);

	/* Free dynamic fields from the header, then zero it out */

	header = &rbd_dev->header;
	ceph_put_snap_context(header->snapc);
	kfree(header->snap_sizes);
	kfree(header->snap_names);
	kfree(header->object_prefix);
	memset(header, 0, sizeof (*header));
}

static int rbd_dev_v2_header_onetime(struct rbd_device *rbd_dev)
{
	int ret;

	ret = rbd_dev_v2_object_prefix(rbd_dev);
	if (ret)
		goto out_err;

	/*
	 * Get and check features for the image. Currently the
	 * features are assumed to never change.
	 */
	ret = rbd_dev_v2_features(rbd_dev);
	if (ret)
		goto out_err;

	/* If the image supports fancy striping, get its parameters */

	if (rbd_dev->header.features & RBD_FEATURE_STRIPINGV2) {
		ret = rbd_dev_v2_striping_info(rbd_dev);
		if (ret < 0)
			goto out_err;
	}

	if (rbd_dev->header.features & RBD_FEATURE_DATA_POOL) {
		ret = rbd_dev_v2_data_pool(rbd_dev);
		if (ret)
			goto out_err;
	}

	rbd_init_layout(rbd_dev);
	return 0;

out_err:
	rbd_dev->header.features = 0;
	kfree(rbd_dev->header.object_prefix);
	rbd_dev->header.object_prefix = NULL;
	return ret;
}

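/*
 * A worked example of the feature tests above, assuming the usual
 * bit assignments from this driver (layering in bit 0, striping v2
 * in bit 1, separate data pool in bit 7): an image with layering
 * plus a data pool reports a feature mask of 0x81, so
 *
 *	mask & RBD_FEATURE_STRIPINGV2	-> 0, striping info is skipped
 *	mask & RBD_FEATURE_DATA_POOL	-> nonzero, data pool id fetched
 *
 * Each test is a plain bitwise AND; no feature bit implies another.
 */
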
/*
 * @depth is rbd_dev_image_probe() -> rbd_dev_probe_parent() ->
 * rbd_dev_image_probe() recursion depth, which means it's also the
 * length of the already discovered part of the parent chain.
 */
static int rbd_dev_probe_parent(struct rbd_device *rbd_dev, int depth)
{
	struct rbd_device *parent = NULL;
	int ret;

	if (!rbd_dev->parent_spec)
		return 0;

	if (++depth > RBD_MAX_PARENT_CHAIN_LEN) {
		pr_info("parent chain is too long (%d)\n", depth);
		ret = -EINVAL;
		goto out_err;
	}

	parent = __rbd_dev_create(rbd_dev->rbd_client, rbd_dev->parent_spec);
	if (!parent) {
		ret = -ENOMEM;
		goto out_err;
	}

	/*
	 * Images related by parent/child relationships always share
	 * rbd_client and spec/parent_spec, so bump their refcounts.
	 */
	__rbd_get_client(rbd_dev->rbd_client);
	rbd_spec_get(rbd_dev->parent_spec);

	ret = rbd_dev_image_probe(parent, depth);
	if (ret < 0)
		goto out_err;

	rbd_dev->parent = parent;
	atomic_set(&rbd_dev->parent_ref, 1);
	return 0;

out_err:
	rbd_dev_unparent(rbd_dev);
	rbd_dev_destroy(parent);
	return ret;
}

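/*
 * Sketch of the recursion above for a clone of a clone (the depth
 * values are those passed to rbd_dev_image_probe()):
 *
 *	mapped image (depth 0)
 *	    -> parent (depth 1)
 *	        -> grandparent (depth 2)
 *
 * The walk stops at the first image without a parent_spec, or fails
 * the whole map with -EINVAL once depth would exceed
 * RBD_MAX_PARENT_CHAIN_LEN.
 */
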
static void rbd_dev_device_release(struct rbd_device *rbd_dev)
{
	clear_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags);
	rbd_dev_mapping_clear(rbd_dev);
	rbd_free_disk(rbd_dev);
	if (!single_major)
		unregister_blkdev(rbd_dev->major, rbd_dev->name);
}

/*
 * rbd_dev->header_rwsem must be locked for write and will be unlocked
 * upon return.
 */
static int rbd_dev_device_setup(struct rbd_device *rbd_dev)
{
	int ret;

	/* Record our major and minor device numbers. */

	if (!single_major) {
		ret = register_blkdev(0, rbd_dev->name);
		if (ret < 0)
			goto err_out_unlock;

		rbd_dev->major = ret;
		rbd_dev->minor = 0;
	} else {
		rbd_dev->major = rbd_major;
		rbd_dev->minor = rbd_dev_id_to_minor(rbd_dev->dev_id);
	}

	/* Set up the blkdev mapping. */

	ret = rbd_init_disk(rbd_dev);
	if (ret)
		goto err_out_blkdev;

	ret = rbd_dev_mapping_set(rbd_dev);
	if (ret)
		goto err_out_disk;

	set_capacity(rbd_dev->disk, rbd_dev->mapping.size / SECTOR_SIZE);
	set_disk_ro(rbd_dev->disk, rbd_dev->opts->read_only);

	ret = dev_set_name(&rbd_dev->dev, "%d", rbd_dev->dev_id);
	if (ret)
		goto err_out_mapping;

	set_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags);
	up_write(&rbd_dev->header_rwsem);
	return 0;

err_out_mapping:
	rbd_dev_mapping_clear(rbd_dev);
err_out_disk:
	rbd_free_disk(rbd_dev);
err_out_blkdev:
	if (!single_major)
		unregister_blkdev(rbd_dev->major, rbd_dev->name);
err_out_unlock:
	up_write(&rbd_dev->header_rwsem);
	return ret;
}

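/*
 * A note on the two numbering schemes above, with made-up numbers:
 * without single_major every mapping registers its own block major
 * and always uses minor 0, so the number of mappings is bounded by
 * the pool of free majors.  With single_major all mappings share
 * rbd_major and rbd_dev_id_to_minor() spreads dev_ids across the
 * minor space; assuming the 4 partition-shift bits used elsewhere in
 * this file, dev_id 3 would map to minor 48, leaving minors 49-63
 * for partitions of /dev/rbd3.
 */
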
static int rbd_dev_header_name(struct rbd_device *rbd_dev)
{
	struct rbd_spec *spec = rbd_dev->spec;
	int ret;

	/* Record the header object name for this rbd image. */

	rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
	if (rbd_dev->image_format == 1)
		ret = ceph_oid_aprintf(&rbd_dev->header_oid, GFP_KERNEL, "%s%s",
				       spec->image_name, RBD_SUFFIX);
	else
		ret = ceph_oid_aprintf(&rbd_dev->header_oid, GFP_KERNEL, "%s%s",
				       RBD_HEADER_PREFIX, spec->image_id);

	return ret;
}

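/*
 * Resulting header object names, for illustration (assuming the
 * RBD_SUFFIX and RBD_HEADER_PREFIX values from rbd_types.h, ".rbd"
 * and "rbd_header." respectively):
 *
 *	format 1, image name "foo":      "foo.rbd"
 *	format 2, image id "10abcdef1":  "rbd_header.10abcdef1"
 */
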
static void rbd_dev_image_release(struct rbd_device *rbd_dev)
{
	rbd_dev_unprobe(rbd_dev);
	if (rbd_dev->opts)
		rbd_unregister_watch(rbd_dev);
	rbd_dev->image_format = 0;
	kfree(rbd_dev->spec->image_id);
	rbd_dev->spec->image_id = NULL;
}

/*
 * Probe for the existence of the header object for the given rbd
 * device. If this image is the one being mapped (i.e., not a
 * parent), initiate a watch on its header object before using that
 * object to get detailed information about the rbd image.
 */
static int rbd_dev_image_probe(struct rbd_device *rbd_dev, int depth)
{
	int ret;

	/*
	 * Get the id from the image id object. Unless there's an
	 * error, rbd_dev->spec->image_id will be filled in with
	 * a dynamically-allocated string, and rbd_dev->image_format
	 * will be set to either 1 or 2.
	 */
	ret = rbd_dev_image_id(rbd_dev);
	if (ret)
		return ret;

	ret = rbd_dev_header_name(rbd_dev);
	if (ret)
		goto err_out_format;

	if (!depth) {
		ret = rbd_register_watch(rbd_dev);
		if (ret) {
			if (ret == -ENOENT)
				pr_info("image %s/%s does not exist\n",
					rbd_dev->spec->pool_name,
					rbd_dev->spec->image_name);
			goto err_out_format;
		}
	}

	ret = rbd_dev_header_info(rbd_dev);
	if (ret)
		goto err_out_watch;

	/*
	 * If this image is the one being mapped, we have pool name and
	 * id, image name and id, and snap name - need to fill snap id.
	 * Otherwise this is a parent image, identified by pool, image
	 * and snap ids - need to fill in names for those ids.
	 */
	if (!depth)
		ret = rbd_spec_fill_snap_id(rbd_dev);
	else
		ret = rbd_spec_fill_names(rbd_dev);
	if (ret) {
		if (ret == -ENOENT)
			pr_info("snap %s/%s@%s does not exist\n",
				rbd_dev->spec->pool_name,
				rbd_dev->spec->image_name,
				rbd_dev->spec->snap_name);
		goto err_out_probe;
	}

	if (rbd_dev->header.features & RBD_FEATURE_LAYERING) {
		ret = rbd_dev_v2_parent_info(rbd_dev);
		if (ret)
			goto err_out_probe;

		/*
		 * Need to warn users if this image is the one being
		 * mapped and has a parent.
		 */
		if (!depth && rbd_dev->parent_spec)
			rbd_warn(rbd_dev,
				 "WARNING: kernel layering is EXPERIMENTAL!");
	}

	ret = rbd_dev_probe_parent(rbd_dev, depth);
	if (ret)
		goto err_out_probe;

	dout("discovered format %u image, header name is %s\n",
	     rbd_dev->image_format, rbd_dev->header_oid.name);
	return 0;

err_out_probe:
	rbd_dev_unprobe(rbd_dev);
err_out_watch:
	if (!depth)
		rbd_unregister_watch(rbd_dev);
err_out_format:
	rbd_dev->image_format = 0;
	kfree(rbd_dev->spec->image_id);
	rbd_dev->spec->image_id = NULL;
	return ret;
}

static ssize_t do_rbd_add(struct bus_type *bus,
			  const char *buf,
			  size_t count)
{
	struct rbd_device *rbd_dev = NULL;
	struct ceph_options *ceph_opts = NULL;
	struct rbd_options *rbd_opts = NULL;
	struct rbd_spec *spec = NULL;
	struct rbd_client *rbdc;
	int rc;

	if (!try_module_get(THIS_MODULE))
		return -ENODEV;

	/* parse add command */
	rc = rbd_add_parse_args(buf, &ceph_opts, &rbd_opts, &spec);
	if (rc < 0)
		goto out;

	rbdc = rbd_get_client(ceph_opts);
	if (IS_ERR(rbdc)) {
		rc = PTR_ERR(rbdc);
		goto err_out_args;
	}

	/* pick the pool */
	rc = rbd_add_get_pool_id(rbdc, spec->pool_name);
	if (rc < 0) {
		if (rc == -ENOENT)
			pr_info("pool %s does not exist\n", spec->pool_name);
		goto err_out_client;
	}
	spec->pool_id = (u64)rc;

	rbd_dev = rbd_dev_create(rbdc, spec, rbd_opts);
	if (!rbd_dev) {
		rc = -ENOMEM;
		goto err_out_client;
	}
	rbdc = NULL;		/* rbd_dev now owns this */
	spec = NULL;		/* rbd_dev now owns this */
	rbd_opts = NULL;	/* rbd_dev now owns this */

	rbd_dev->config_info = kstrdup(buf, GFP_KERNEL);
	if (!rbd_dev->config_info) {
		rc = -ENOMEM;
		goto err_out_rbd_dev;
	}

	down_write(&rbd_dev->header_rwsem);
	rc = rbd_dev_image_probe(rbd_dev, 0);
	if (rc < 0) {
		up_write(&rbd_dev->header_rwsem);
		goto err_out_rbd_dev;
	}

	/* If we are mapping a snapshot it must be marked read-only */
	if (rbd_dev->spec->snap_id != CEPH_NOSNAP)
		rbd_dev->opts->read_only = true;

	rc = rbd_dev_device_setup(rbd_dev);
	if (rc)
		goto err_out_image_probe;

	if (rbd_dev->opts->exclusive) {
		rc = rbd_add_acquire_lock(rbd_dev);
		if (rc)
			goto err_out_device_setup;
	}

	/* Everything's ready. Announce the disk to the world. */

	rc = device_add(&rbd_dev->dev);
	if (rc)
		goto err_out_image_lock;

	add_disk(rbd_dev->disk);
	/* see rbd_init_disk() */
	blk_put_queue(rbd_dev->disk->queue);

	spin_lock(&rbd_dev_list_lock);
	list_add_tail(&rbd_dev->node, &rbd_dev_list);
	spin_unlock(&rbd_dev_list_lock);

	pr_info("%s: capacity %llu features 0x%llx\n", rbd_dev->disk->disk_name,
		(unsigned long long)get_capacity(rbd_dev->disk) << SECTOR_SHIFT,
		rbd_dev->header.features);
	rc = count;
out:
	module_put(THIS_MODULE);
	return rc;

err_out_image_lock:
	rbd_dev_image_unlock(rbd_dev);
err_out_device_setup:
	rbd_dev_device_release(rbd_dev);
err_out_image_probe:
	rbd_dev_image_release(rbd_dev);
err_out_rbd_dev:
	rbd_dev_destroy(rbd_dev);
err_out_client:
	rbd_put_client(rbdc);
err_out_args:
	rbd_spec_put(spec);
	kfree(rbd_opts);
	goto out;
}

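/*
 * For illustration, the "add command" parsed above is a single line
 * written to /sys/bus/rbd/add (or add_single_major): monitor
 * addresses, options, pool name, image name, and an optional
 * snapshot name.  With made-up addresses and names:
 *
 *	echo "1.2.3.4:6789 name=admin,secret=<key> rbd foo" \
 *		> /sys/bus/rbd/add
 *
 * A successful write returns count and a new /dev/rbd<N> appears;
 * any failure unwinds through the err_out_* labels above and the
 * negative errno is propagated to the caller's write().
 */
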
static ssize_t rbd_add(struct bus_type *bus,
		       const char *buf,
		       size_t count)
{
	if (single_major)
		return -EINVAL;

	return do_rbd_add(bus, buf, count);
}

static ssize_t rbd_add_single_major(struct bus_type *bus,
				    const char *buf,
				    size_t count)
{
	return do_rbd_add(bus, buf, count);
}

static void rbd_dev_remove_parent(struct rbd_device *rbd_dev)
{
	while (rbd_dev->parent) {
		struct rbd_device *first = rbd_dev;
		struct rbd_device *second = first->parent;
		struct rbd_device *third;

		/*
		 * Follow to the parent with no grandparent and
		 * remove it.
		 */
		while (second && (third = second->parent)) {
			first = second;
			second = third;
		}
		rbd_assert(second);
		rbd_dev_image_release(second);
		rbd_dev_destroy(second);
		first->parent = NULL;
		first->parent_overlap = 0;

		rbd_assert(first->parent_spec);
		rbd_spec_put(first->parent_spec);
		first->parent_spec = NULL;
	}
}

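/*
 * For a chain a -> b -> c above (c the eldest ancestor), the inner
 * walk finds c first, releases and destroys it, and clears
 * b->parent; the next outer pass then does the same for b, so one
 * ancestor goes away per pass, eldest first.
 */
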
static ssize_t do_rbd_remove(struct bus_type *bus,
			     const char *buf,
			     size_t count)
{
	struct rbd_device *rbd_dev = NULL;
	struct list_head *tmp;
	int dev_id;
	char opt_buf[6];
	bool already = false;
	bool force = false;
	int ret;

	dev_id = -1;
	opt_buf[0] = '\0';
	sscanf(buf, "%d %5s", &dev_id, opt_buf);
	if (dev_id < 0) {
		pr_err("dev_id out of range\n");
		return -EINVAL;
	}
	if (opt_buf[0] != '\0') {
		if (!strcmp(opt_buf, "force")) {
			force = true;
		} else {
			pr_err("bad remove option at '%s'\n", opt_buf);
			return -EINVAL;
		}
	}

	ret = -ENOENT;
	spin_lock(&rbd_dev_list_lock);
	list_for_each(tmp, &rbd_dev_list) {
		rbd_dev = list_entry(tmp, struct rbd_device, node);
		if (rbd_dev->dev_id == dev_id) {
			ret = 0;
			break;
		}
	}
	if (!ret) {
		spin_lock_irq(&rbd_dev->lock);
		if (rbd_dev->open_count && !force)
			ret = -EBUSY;
		else
			already = test_and_set_bit(RBD_DEV_FLAG_REMOVING,
						   &rbd_dev->flags);
		spin_unlock_irq(&rbd_dev->lock);
	}
	spin_unlock(&rbd_dev_list_lock);
	if (ret < 0 || already)
		return ret;

	if (force) {
		/*
		 * Prevent new IO from being queued and wait for existing
		 * IO to complete/fail.
		 */
		blk_mq_freeze_queue(rbd_dev->disk->queue);
		blk_set_queue_dying(rbd_dev->disk->queue);
	}

	del_gendisk(rbd_dev->disk);
	spin_lock(&rbd_dev_list_lock);
	list_del_init(&rbd_dev->node);
	spin_unlock(&rbd_dev_list_lock);
	device_del(&rbd_dev->dev);

	rbd_dev_image_unlock(rbd_dev);
	rbd_dev_device_release(rbd_dev);
	rbd_dev_image_release(rbd_dev);
	rbd_dev_destroy(rbd_dev);
	return count;
}

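/*
 * Correspondingly, removal is a write of "<dev-id> [force]" to
 * /sys/bus/rbd/remove (or remove_single_major), e.g.:
 *
 *	echo "0 force" > /sys/bus/rbd/remove
 *
 * Without "force" the write fails with -EBUSY while the device is
 * held open; with it, the request queue is frozen and marked dying
 * first so in-flight I/O errors out instead of blocking removal.
 */
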
static ssize_t rbd_remove(struct bus_type *bus,
			  const char *buf,
			  size_t count)
{
	if (single_major)
		return -EINVAL;

	return do_rbd_remove(bus, buf, count);
}

static ssize_t rbd_remove_single_major(struct bus_type *bus,
				       const char *buf,
				       size_t count)
{
	return do_rbd_remove(bus, buf, count);
}

/*
 * create control files in sysfs
 * /sys/bus/rbd/...
 */
static int rbd_sysfs_init(void)
{
	int ret;

	ret = device_register(&rbd_root_dev);
	if (ret < 0)
		return ret;

	ret = bus_register(&rbd_bus_type);
	if (ret < 0)
		device_unregister(&rbd_root_dev);

	return ret;
}

static void rbd_sysfs_cleanup(void)
{
	bus_unregister(&rbd_bus_type);
	device_unregister(&rbd_root_dev);
}

static int rbd_slab_init(void)
{
	rbd_assert(!rbd_img_request_cache);
	rbd_img_request_cache = KMEM_CACHE(rbd_img_request, 0);
	if (!rbd_img_request_cache)
		return -ENOMEM;

	rbd_assert(!rbd_obj_request_cache);
	rbd_obj_request_cache = KMEM_CACHE(rbd_obj_request, 0);
	if (!rbd_obj_request_cache)
		goto out_err;

	return 0;

out_err:
	kmem_cache_destroy(rbd_img_request_cache);
	rbd_img_request_cache = NULL;
	return -ENOMEM;
}

static void rbd_slab_exit(void)
{
	rbd_assert(rbd_obj_request_cache);
	kmem_cache_destroy(rbd_obj_request_cache);
	rbd_obj_request_cache = NULL;

	rbd_assert(rbd_img_request_cache);
	kmem_cache_destroy(rbd_img_request_cache);
	rbd_img_request_cache = NULL;
}

static int __init rbd_init(void)
{
	int rc;

	if (!libceph_compatible(NULL)) {
		rbd_warn(NULL, "libceph incompatibility (quitting)");
		return -EINVAL;
	}

	rc = rbd_slab_init();
	if (rc)
		return rc;

	/*
	 * The number of active work items is limited by the number of
	 * rbd devices * queue depth, so leave @max_active at default.
	 */
	rbd_wq = alloc_workqueue(RBD_DRV_NAME, WQ_MEM_RECLAIM, 0);
	if (!rbd_wq) {
		rc = -ENOMEM;
		goto err_out_slab;
	}

	if (single_major) {
		rbd_major = register_blkdev(0, RBD_DRV_NAME);
		if (rbd_major < 0) {
			rc = rbd_major;
			goto err_out_wq;
		}
	}

	rc = rbd_sysfs_init();
	if (rc)
		goto err_out_blkdev;

	if (single_major)
		pr_info("loaded (major %d)\n", rbd_major);
	else
		pr_info("loaded\n");

	return 0;

err_out_blkdev:
	if (single_major)
		unregister_blkdev(rbd_major, RBD_DRV_NAME);
err_out_wq:
	destroy_workqueue(rbd_wq);
err_out_slab:
	rbd_slab_exit();
	return rc;
}

static void __exit rbd_exit(void)
{
	ida_destroy(&rbd_dev_id_ida);
	rbd_sysfs_cleanup();
	if (single_major)
		unregister_blkdev(rbd_major, RBD_DRV_NAME);
	destroy_workqueue(rbd_wq);
	rbd_slab_exit();
}

module_init(rbd_init);
module_exit(rbd_exit);

MODULE_AUTHOR("Alex Elder <elder@inktank.com>");
MODULE_AUTHOR("Sage Weil <sage@newdream.net>");
MODULE_AUTHOR("Yehuda Sadeh <yehuda@hq.newdream.net>");
/* following authorship retained from original osdblk.c */
MODULE_AUTHOR("Jeff Garzik <jeff@garzik.org>");

MODULE_DESCRIPTION("RADOS Block Device (RBD) driver");
MODULE_LICENSE("GPL");