
/*
   rbd.c -- Export ceph rados objects as a Linux block device


   based on drivers/block/osdblk.c:

   Copyright 2009 Red Hat, Inc.

   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; see the file COPYING.  If not, write to
   the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.



   For usage instructions, please refer to:

                 Documentation/ABI/testing/sysfs-bus-rbd

 */
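
/*
 * A minimal usage sketch, paraphrased from that ABI document (monitor
 * address, credentials and names below are placeholders):
 *
 *   # map pool "rbd", image "foo":
 *   $ echo "1.2.3.4:6789 name=admin,secret=<key> rbd foo" > /sys/bus/rbd/add
 *
 *   # unmap the device that was assigned id 0:
 *   $ echo 0 > /sys/bus/rbd/remove
 */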

#include <linux/ceph/libceph.h>
#include <linux/ceph/osd_client.h>
#include <linux/ceph/mon_client.h>
#include <linux/ceph/cls_lock_client.h>
#include <linux/ceph/striper.h>
#include <linux/ceph/decode.h>
#include <linux/parser.h>
#include <linux/bsearch.h>

#include <linux/kernel.h>
#include <linux/device.h>
#include <linux/module.h>
#include <linux/blk-mq.h>
#include <linux/fs.h>
#include <linux/blkdev.h>
#include <linux/slab.h>
#include <linux/idr.h>
#include <linux/workqueue.h>

#include "rbd_types.h"

#define RBD_DEBUG	/* Activate rbd_assert() calls */

/*
 * Increment the given counter and return its updated value.
 * If the counter is already 0 it will not be incremented.
 * If the counter is already at its maximum value, -EINVAL is
 * returned without updating it.
 */
static int atomic_inc_return_safe(atomic_t *v)
{
	unsigned int counter;

	counter = (unsigned int)atomic_fetch_add_unless(v, 1, 0);
	if (counter <= (unsigned int)INT_MAX)
		return (int)counter;

	atomic_dec(v);

	return -EINVAL;
}

/* Decrement the counter.  Return the resulting value, or -EINVAL */
static int atomic_dec_return_safe(atomic_t *v)
{
	int counter;

	counter = atomic_dec_return(v);
	if (counter >= 0)
		return counter;

	atomic_inc(v);

	return -EINVAL;
}
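
/*
 * Note: these two helpers act as a saturating get/put pair.  A counter
 * pinned at 0 stays 0, so (judging by the callers later in this file,
 * e.g. rbd_dev->parent_ref) a flattened image that has dropped its
 * parent can never have that reference revived.
 */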

#define RBD_DRV_NAME "rbd"

#define RBD_MINORS_PER_MAJOR		256
#define RBD_SINGLE_MAJOR_PART_SHIFT	4

#define RBD_MAX_PARENT_CHAIN_LEN	16

#define RBD_SNAP_DEV_NAME_PREFIX	"snap_"
#define RBD_MAX_SNAP_NAME_LEN \
			(NAME_MAX - (sizeof (RBD_SNAP_DEV_NAME_PREFIX) - 1))
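
/* e.g. with NAME_MAX == 255 this allows snapshot names up to 250 bytes */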

#define RBD_MAX_SNAP_COUNT	510	/* allows max snapc to fit in 4KB */

#define RBD_SNAP_HEAD_NAME	"-"

#define	BAD_SNAP_INDEX	U32_MAX		/* invalid index into snap array */

/* This allows a single page to hold an image name sent by OSD */
#define RBD_IMAGE_NAME_LEN_MAX	(PAGE_SIZE - sizeof (__le32) - 1)
#define RBD_IMAGE_ID_LEN_MAX	64

#define RBD_OBJ_PREFIX_LEN_MAX	64

#define RBD_NOTIFY_TIMEOUT	5	/* seconds */
#define RBD_RETRY_DELAY		msecs_to_jiffies(1000)

/* Feature bits */

#define RBD_FEATURE_LAYERING		(1ULL<<0)
#define RBD_FEATURE_STRIPINGV2		(1ULL<<1)
#define RBD_FEATURE_EXCLUSIVE_LOCK	(1ULL<<2)
#define RBD_FEATURE_DEEP_FLATTEN	(1ULL<<5)
#define RBD_FEATURE_DATA_POOL		(1ULL<<7)
#define RBD_FEATURE_OPERATIONS		(1ULL<<8)

#define RBD_FEATURES_ALL	(RBD_FEATURE_LAYERING |		\
				 RBD_FEATURE_STRIPINGV2 |	\
				 RBD_FEATURE_EXCLUSIVE_LOCK |	\
				 RBD_FEATURE_DEEP_FLATTEN |	\
				 RBD_FEATURE_DATA_POOL |	\
				 RBD_FEATURE_OPERATIONS)

/* Features supported by this (client software) implementation. */

#define RBD_FEATURES_SUPPORTED	(RBD_FEATURES_ALL)

/*
 * An RBD device name will be "rbd#", where the "rbd" comes from
 * RBD_DRV_NAME above, and # is a unique integer identifier.
 */
#define DEV_NAME_LEN		32

/*
 * block device image metadata (in-memory version)
 */
struct rbd_image_header {
	/* These six fields never change for a given rbd image */
	char *object_prefix;
	__u8 obj_order;
	u64 stripe_unit;
	u64 stripe_count;
	s64 data_pool_id;
	u64 features;		/* Might be changeable someday? */

	/* The remaining fields need to be updated occasionally */
	u64 image_size;
	struct ceph_snap_context *snapc;
	char *snap_names;	/* format 1 only */
	u64 *snap_sizes;	/* format 1 only */
};

/*
 * An rbd image specification.
 *
 * The tuple (pool_id, image_id, snap_id) is sufficient to uniquely
 * identify an image.  Each rbd_dev structure includes a pointer to
 * an rbd_spec structure that encapsulates this identity.
 *
 * Each of the ids in an rbd_spec has an associated name.  For a
 * user-mapped image, the names are supplied and the ids associated
 * with them are looked up.  For a layered image, a parent image is
 * defined by the tuple, and the names are looked up.
 *
 * An rbd_dev structure contains a parent_spec pointer which is
 * non-null if the image it represents is a child in a layered
 * image.  This pointer will refer to the rbd_spec structure used
 * by the parent rbd_dev for its own identity (i.e., the structure
 * is shared between the parent and child).
 *
 * Since these structures are populated once, during the discovery
 * phase of image construction, they are effectively immutable so
 * we make no effort to synchronize access to them.
 *
 * Note that code herein does not assume the image name is known (it
 * could be a null pointer).
 */
struct rbd_spec {
	u64		pool_id;
	const char	*pool_name;
	const char	*pool_ns;	/* NULL if default, never "" */

	const char	*image_id;
	const char	*image_name;

	u64		snap_id;
	const char	*snap_name;

	struct kref	kref;
};

/*
 * an instance of the client.  multiple devices may share an rbd client.
 */
struct rbd_client {
	struct ceph_client	*client;
	struct kref		kref;
	struct list_head	node;
};

struct pending_result {
	int			result;		/* first nonzero result */
	int			num_pending;
};

struct rbd_img_request;

enum obj_request_type {
	OBJ_REQUEST_NODATA = 1,
	OBJ_REQUEST_BIO,	/* pointer into provided bio (list) */
	OBJ_REQUEST_BVECS,	/* pointer into provided bio_vec array */
	OBJ_REQUEST_OWN_BVECS,	/* private bio_vec array, doesn't own pages */
};

enum obj_operation_type {
	OBJ_OP_READ = 1,
	OBJ_OP_WRITE,
	OBJ_OP_DISCARD,
	OBJ_OP_ZEROOUT,
};

#define RBD_OBJ_FLAG_DELETION			(1U << 0)
#define RBD_OBJ_FLAG_COPYUP_ENABLED		(1U << 1)
#define RBD_OBJ_FLAG_COPYUP_ZEROS		(1U << 2)

enum rbd_obj_read_state {
	RBD_OBJ_READ_START = 1,
	RBD_OBJ_READ_OBJECT,
	RBD_OBJ_READ_PARENT,
};

/*
 * Writes go through the following state machine to deal with
 * layering:
 *
 *            . . . . . RBD_OBJ_WRITE_GUARD. . . . . . . . . . . . . .
 *            .                 |                                    .
 *            .                 v                                    .
 *            .    RBD_OBJ_WRITE_READ_FROM_PARENT. . .               .
 *            .                 |                    .               .
 *            .                 v                    v (deep-copyup  .
 *  (image    .   RBD_OBJ_WRITE_COPYUP_EMPTY_SNAPC   .  not needed)  .
 * flattened) v                 |                    .               .
 *            .                 v                    .               .
 *            . . . .RBD_OBJ_WRITE_COPYUP_OPS. . . . .      (copyup  .
 *                              |                        not needed) v
 *                              v                                    .
 *                            done . . . . . . . . . . . . . . . . . .
 *                              ^
 *                              |
 *                     RBD_OBJ_WRITE_FLAT
 *
 * Writes start in RBD_OBJ_WRITE_GUARD or _FLAT, depending on whether
 * assert_exists guard is needed or not (in some cases it's not needed
 * even if there is a parent).
 */
enum rbd_obj_write_state {
	RBD_OBJ_WRITE_START = 1,
	RBD_OBJ_WRITE_OBJECT,
	__RBD_OBJ_WRITE_COPYUP,
	RBD_OBJ_WRITE_COPYUP,
};

enum rbd_obj_copyup_state {
	RBD_OBJ_COPYUP_START = 1,
	RBD_OBJ_COPYUP_READ_PARENT,
	__RBD_OBJ_COPYUP_WRITE_OBJECT,
	RBD_OBJ_COPYUP_WRITE_OBJECT,
};

struct rbd_obj_request {
	struct ceph_object_extent ex;
	unsigned int		flags;	/* RBD_OBJ_FLAG_* */
	union {
		enum rbd_obj_read_state	 read_state;	/* for reads */
		enum rbd_obj_write_state write_state;	/* for writes */
	};

	struct rbd_img_request	*img_request;
	struct ceph_file_extent	*img_extents;
	u32			num_img_extents;

	union {
		struct ceph_bio_iter	bio_pos;
		struct {
			struct ceph_bvec_iter	bvec_pos;
			u32			bvec_count;
			u32			bvec_idx;
		};
	};

	enum rbd_obj_copyup_state copyup_state;
	struct bio_vec		*copyup_bvecs;
	u32			copyup_bvec_count;

	struct list_head	osd_reqs;	/* w/ r_private_item */

	struct mutex		state_mutex;
	struct pending_result	pending;
	struct kref		kref;
};

enum img_req_flags {
	IMG_REQ_CHILD,		/* initiator: block = 0, child image = 1 */
	IMG_REQ_LAYERED,	/* ENOENT handling: normal = 0, layered = 1 */
};

enum rbd_img_state {
	RBD_IMG_START = 1,
	__RBD_IMG_OBJECT_REQUESTS,
	RBD_IMG_OBJECT_REQUESTS,
};

struct rbd_img_request {
	struct rbd_device	*rbd_dev;
	enum obj_operation_type	op_type;
	enum obj_request_type	data_type;
	unsigned long		flags;
	enum rbd_img_state	state;
	union {
		u64			snap_id;	/* for reads */
		struct ceph_snap_context *snapc;	/* for writes */
	};
	union {
		struct request		*rq;		/* block request */
		struct rbd_obj_request	*obj_request;	/* obj req initiator */
	};

	struct list_head	lock_item;
	struct list_head	object_extents;	/* obj_req.ex structs */

	struct mutex		state_mutex;
	struct pending_result	pending;
	struct work_struct	work;
	int			work_result;
	struct kref		kref;
};

#define for_each_obj_request(ireq, oreq) \
	list_for_each_entry(oreq, &(ireq)->object_extents, ex.oe_item)
#define for_each_obj_request_safe(ireq, oreq, n) \
	list_for_each_entry_safe(oreq, n, &(ireq)->object_extents, ex.oe_item)

enum rbd_watch_state {
	RBD_WATCH_STATE_UNREGISTERED,
	RBD_WATCH_STATE_REGISTERED,
	RBD_WATCH_STATE_ERROR,
};

enum rbd_lock_state {
	RBD_LOCK_STATE_UNLOCKED,
	RBD_LOCK_STATE_LOCKED,
	RBD_LOCK_STATE_RELEASING,
};

/* WatchNotify::ClientId */
struct rbd_client_id {
	u64 gid;
	u64 handle;
};

struct rbd_mapping {
	u64                     size;
	u64                     features;
};

/*
 * a single device
 */
struct rbd_device {
	int			dev_id;		/* blkdev unique id */

	int			major;		/* blkdev assigned major */
	int			minor;
	struct gendisk		*disk;		/* blkdev's gendisk and rq */

	u32			image_format;	/* Either 1 or 2 */
	struct rbd_client	*rbd_client;

	char			name[DEV_NAME_LEN]; /* blkdev name, e.g. rbd3 */

	spinlock_t		lock;		/* queue, flags, open_count */

	struct rbd_image_header	header;
	unsigned long		flags;		/* possibly lock protected */
	struct rbd_spec		*spec;
	struct rbd_options	*opts;
	char			*config_info;	/* add{,_single_major} string */

	struct ceph_object_id	header_oid;
	struct ceph_object_locator header_oloc;

	struct ceph_file_layout	layout;		/* used for all rbd requests */

	struct mutex		watch_mutex;
	enum rbd_watch_state	watch_state;
	struct ceph_osd_linger_request *watch_handle;
	u64			watch_cookie;
	struct delayed_work	watch_dwork;

	struct rw_semaphore	lock_rwsem;
	enum rbd_lock_state	lock_state;
	char			lock_cookie[32];
	struct rbd_client_id	owner_cid;
	struct work_struct	acquired_lock_work;
	struct work_struct	released_lock_work;
	struct delayed_work	lock_dwork;
	struct work_struct	unlock_work;
	spinlock_t		lock_lists_lock;
	struct list_head	running_list;
	struct completion	releasing_wait;
	wait_queue_head_t	lock_waitq;

	struct workqueue_struct	*task_wq;

	struct rbd_spec		*parent_spec;
	u64			parent_overlap;
	atomic_t		parent_ref;
	struct rbd_device	*parent;

	/* Block layer tags. */
	struct blk_mq_tag_set	tag_set;

	/* protects updating the header */
	struct rw_semaphore     header_rwsem;

	struct rbd_mapping	mapping;

	struct list_head	node;

	/* sysfs related */
	struct device		dev;
	unsigned long		open_count;	/* protected by lock */
};

/*
 * Flag bits for rbd_dev->flags:
 * - REMOVING (which is coupled with rbd_dev->open_count) is protected
 *   by rbd_dev->lock
 * - BLACKLISTED is protected by rbd_dev->lock_rwsem
 */
enum rbd_dev_flags {
	RBD_DEV_FLAG_EXISTS,	/* mapped snapshot has not been deleted */
	RBD_DEV_FLAG_REMOVING,	/* this mapping is being removed */
	RBD_DEV_FLAG_BLACKLISTED, /* our ceph_client is blacklisted */
};

static DEFINE_MUTEX(client_mutex);	/* Serialize client creation */

static LIST_HEAD(rbd_dev_list);    /* devices */
static DEFINE_SPINLOCK(rbd_dev_list_lock);

static LIST_HEAD(rbd_client_list);		/* clients */
static DEFINE_SPINLOCK(rbd_client_list_lock);

/* Slab caches for frequently-allocated structures */

static struct kmem_cache	*rbd_img_request_cache;
static struct kmem_cache	*rbd_obj_request_cache;

static int rbd_major;
static DEFINE_IDA(rbd_dev_id_ida);

static struct workqueue_struct *rbd_wq;

static struct ceph_snap_context rbd_empty_snapc = {
	.nref = REFCOUNT_INIT(1),
};

/*
 * single-major requires >= 0.75 version of userspace rbd utility.
 */
static bool single_major = true;
module_param(single_major, bool, 0444);
MODULE_PARM_DESC(single_major, "Use a single major number for all rbd devices (default: true)");

static ssize_t add_store(struct bus_type *bus, const char *buf, size_t count);
static ssize_t remove_store(struct bus_type *bus, const char *buf,
			    size_t count);
static ssize_t add_single_major_store(struct bus_type *bus, const char *buf,
				      size_t count);
static ssize_t remove_single_major_store(struct bus_type *bus, const char *buf,
					 size_t count);
static int rbd_dev_image_probe(struct rbd_device *rbd_dev, int depth);

static int rbd_dev_id_to_minor(int dev_id)
{
	return dev_id << RBD_SINGLE_MAJOR_PART_SHIFT;
}

static int minor_to_rbd_dev_id(int minor)
{
	return minor >> RBD_SINGLE_MAJOR_PART_SHIFT;
}
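
/*
 * Worked example: RBD_SINGLE_MAJOR_PART_SHIFT == 4 gives each device
 * 2^4 = 16 minors, so dev_id 2 maps to minor 32, and minors 32..47
 * cover /dev/rbd2 and its partitions in single-major mode.
 */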

static bool __rbd_is_lock_owner(struct rbd_device *rbd_dev)
{
	return rbd_dev->lock_state == RBD_LOCK_STATE_LOCKED ||
	       rbd_dev->lock_state == RBD_LOCK_STATE_RELEASING;
}

static bool rbd_is_lock_owner(struct rbd_device *rbd_dev)
{
	bool is_lock_owner;

	down_read(&rbd_dev->lock_rwsem);
	is_lock_owner = __rbd_is_lock_owner(rbd_dev);
	up_read(&rbd_dev->lock_rwsem);
	return is_lock_owner;
}

static ssize_t supported_features_show(struct bus_type *bus, char *buf)
{
	return sprintf(buf, "0x%llx\n", RBD_FEATURES_SUPPORTED);
}

static BUS_ATTR_WO(add);
static BUS_ATTR_WO(remove);
static BUS_ATTR_WO(add_single_major);
static BUS_ATTR_WO(remove_single_major);
static BUS_ATTR_RO(supported_features);

static struct attribute *rbd_bus_attrs[] = {
	&bus_attr_add.attr,
	&bus_attr_remove.attr,
	&bus_attr_add_single_major.attr,
	&bus_attr_remove_single_major.attr,
	&bus_attr_supported_features.attr,
	NULL,
};

static umode_t rbd_bus_is_visible(struct kobject *kobj,
				  struct attribute *attr, int index)
{
	if (!single_major &&
	    (attr == &bus_attr_add_single_major.attr ||
	     attr == &bus_attr_remove_single_major.attr))
		return 0;

	return attr->mode;
}

static const struct attribute_group rbd_bus_group = {
	.attrs = rbd_bus_attrs,
	.is_visible = rbd_bus_is_visible,
};
__ATTRIBUTE_GROUPS(rbd_bus);

static struct bus_type rbd_bus_type = {
	.name		= "rbd",
	.bus_groups	= rbd_bus_groups,
};

static void rbd_root_dev_release(struct device *dev)
{
}

static struct device rbd_root_dev = {
	.init_name =    "rbd",
	.release =      rbd_root_dev_release,
};

static __printf(2, 3)
void rbd_warn(struct rbd_device *rbd_dev, const char *fmt, ...)
{
	struct va_format vaf;
	va_list args;

	va_start(args, fmt);
	vaf.fmt = fmt;
	vaf.va = &args;

	if (!rbd_dev)
		printk(KERN_WARNING "%s: %pV\n", RBD_DRV_NAME, &vaf);
	else if (rbd_dev->disk)
		printk(KERN_WARNING "%s: %s: %pV\n",
			RBD_DRV_NAME, rbd_dev->disk->disk_name, &vaf);
	else if (rbd_dev->spec && rbd_dev->spec->image_name)
		printk(KERN_WARNING "%s: image %s: %pV\n",
			RBD_DRV_NAME, rbd_dev->spec->image_name, &vaf);
	else if (rbd_dev->spec && rbd_dev->spec->image_id)
		printk(KERN_WARNING "%s: id %s: %pV\n",
			RBD_DRV_NAME, rbd_dev->spec->image_id, &vaf);
	else	/* punt */
		printk(KERN_WARNING "%s: rbd_dev %p: %pV\n",
			RBD_DRV_NAME, rbd_dev, &vaf);
	va_end(args);
}

#ifdef RBD_DEBUG
#define rbd_assert(expr)						\
		if (unlikely(!(expr))) {				\
			printk(KERN_ERR "\nAssertion failure in %s() "	\
						"at line %d:\n\n"	\
					"\trbd_assert(%s);\n\n",	\
					__func__, __LINE__, #expr);	\
			BUG();						\
		}
#else /* !RBD_DEBUG */
#  define rbd_assert(expr)	((void) 0)
#endif /* !RBD_DEBUG */

static void rbd_dev_remove_parent(struct rbd_device *rbd_dev);

static int rbd_dev_refresh(struct rbd_device *rbd_dev);
static int rbd_dev_v2_header_onetime(struct rbd_device *rbd_dev);
static int rbd_dev_header_info(struct rbd_device *rbd_dev);
static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev);
static const char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev,
					u64 snap_id);
static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id,
		u8 *order, u64 *snap_size);
static int _rbd_dev_v2_snap_features(struct rbd_device *rbd_dev, u64 snap_id,
		u64 *snap_features);

static void rbd_obj_handle_request(struct rbd_obj_request *obj_req, int result);
static void rbd_img_handle_request(struct rbd_img_request *img_req, int result);

/*
 * Return true if nothing else is pending.
 */
static bool pending_result_dec(struct pending_result *pending, int *result)
{
	rbd_assert(pending->num_pending > 0);

	if (*result && !pending->result)
		pending->result = *result;
	if (--pending->num_pending)
		return false;

	*result = pending->result;
	return true;
}
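
/*
 * Typical pattern (inferred from the state machines later in this
 * file): the issuer sets pending->num_pending to the number of child
 * requests it fires off; each completion calls pending_result_dec()
 * with its result, and only the last one to finish sees "true" and
 * carries on with the first nonzero result.
 */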

static int rbd_open(struct block_device *bdev, fmode_t mode)
{
	struct rbd_device *rbd_dev = bdev->bd_disk->private_data;
	bool removing = false;

	spin_lock_irq(&rbd_dev->lock);
	if (test_bit(RBD_DEV_FLAG_REMOVING, &rbd_dev->flags))
		removing = true;
	else
		rbd_dev->open_count++;
	spin_unlock_irq(&rbd_dev->lock);
	if (removing)
		return -ENOENT;

	(void) get_device(&rbd_dev->dev);

	return 0;
}

static void rbd_release(struct gendisk *disk, fmode_t mode)
{
	struct rbd_device *rbd_dev = disk->private_data;
	unsigned long open_count_before;

	spin_lock_irq(&rbd_dev->lock);
	open_count_before = rbd_dev->open_count--;
	spin_unlock_irq(&rbd_dev->lock);
	rbd_assert(open_count_before > 0);

	put_device(&rbd_dev->dev);
}

static int rbd_ioctl_set_ro(struct rbd_device *rbd_dev, unsigned long arg)
{
	int ro;

	if (get_user(ro, (int __user *)arg))
		return -EFAULT;

	/* Snapshots can't be marked read-write */
	if (rbd_dev->spec->snap_id != CEPH_NOSNAP && !ro)
		return -EROFS;

	/* Let blkdev_roset() handle it */
	return -ENOTTY;
}
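
/*
 * rbd_ioctl_set_ro() above is reached via the BLKROSET ioctl (e.g. a
 * "blockdev --setro /dev/rbd0" invocation); rbd only vetoes the
 * snapshot case and then returns -ENOTTY so that the generic
 * blkdev_roset() code applies the change.
 */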

static int rbd_ioctl(struct block_device *bdev, fmode_t mode,
			unsigned int cmd, unsigned long arg)
{
	struct rbd_device *rbd_dev = bdev->bd_disk->private_data;
	int ret;

	switch (cmd) {
	case BLKROSET:
		ret = rbd_ioctl_set_ro(rbd_dev, arg);
		break;
	default:
		ret = -ENOTTY;
	}

	return ret;
}

#ifdef CONFIG_COMPAT
static int rbd_compat_ioctl(struct block_device *bdev, fmode_t mode,
				unsigned int cmd, unsigned long arg)
{
	return rbd_ioctl(bdev, mode, cmd, arg);
}
#endif /* CONFIG_COMPAT */

static const struct block_device_operations rbd_bd_ops = {
	.owner			= THIS_MODULE,
	.open			= rbd_open,
	.release		= rbd_release,
	.ioctl			= rbd_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl		= rbd_compat_ioctl,
#endif
};

/*
 * Initialize an rbd client instance.  Success or not, this function
 * consumes ceph_opts.  Caller holds client_mutex.
 */
static struct rbd_client *rbd_client_create(struct ceph_options *ceph_opts)
{
	struct rbd_client *rbdc;
	int ret = -ENOMEM;

	dout("%s:\n", __func__);
	rbdc = kmalloc(sizeof(struct rbd_client), GFP_KERNEL);
	if (!rbdc)
		goto out_opt;

	kref_init(&rbdc->kref);
	INIT_LIST_HEAD(&rbdc->node);

	rbdc->client = ceph_create_client(ceph_opts, rbdc);
	if (IS_ERR(rbdc->client))
		goto out_rbdc;
	ceph_opts = NULL; /* Now rbdc->client is responsible for ceph_opts */

	ret = ceph_open_session(rbdc->client);
	if (ret < 0)
		goto out_client;

	spin_lock(&rbd_client_list_lock);
	list_add_tail(&rbdc->node, &rbd_client_list);
	spin_unlock(&rbd_client_list_lock);

	dout("%s: rbdc %p\n", __func__, rbdc);

	return rbdc;
out_client:
	ceph_destroy_client(rbdc->client);
out_rbdc:
	kfree(rbdc);
out_opt:
	if (ceph_opts)
		ceph_destroy_options(ceph_opts);
	dout("%s: error %d\n", __func__, ret);

	return ERR_PTR(ret);
}

static struct rbd_client *__rbd_get_client(struct rbd_client *rbdc)
{
	kref_get(&rbdc->kref);

	return rbdc;
}

/*
 * Find a ceph client with specific addr and configuration.  If
 * found, bump its reference count.
 */
static struct rbd_client *rbd_client_find(struct ceph_options *ceph_opts)
{
	struct rbd_client *client_node;
	bool found = false;

	if (ceph_opts->flags & CEPH_OPT_NOSHARE)
		return NULL;

	spin_lock(&rbd_client_list_lock);
	list_for_each_entry(client_node, &rbd_client_list, node) {
		if (!ceph_compare_options(ceph_opts, client_node->client)) {
			__rbd_get_client(client_node);

			found = true;
			break;
		}
	}
	spin_unlock(&rbd_client_list_lock);

	return found ? client_node : NULL;
}

/*
 * (Per device) rbd map options
 */
enum {
	Opt_queue_depth,
	Opt_alloc_size,
	Opt_lock_timeout,
	Opt_last_int,
	/* int args above */
	Opt_pool_ns,
	Opt_last_string,
	/* string args above */
	Opt_read_only,
	Opt_read_write,
	Opt_lock_on_read,
	Opt_exclusive,
	Opt_notrim,
	Opt_err
};

static match_table_t rbd_opts_tokens = {
	{Opt_queue_depth, "queue_depth=%d"},
	{Opt_alloc_size, "alloc_size=%d"},
	{Opt_lock_timeout, "lock_timeout=%d"},
	/* int args above */
	{Opt_pool_ns, "_pool_ns=%s"},
	/* string args above */
	{Opt_read_only, "read_only"},
	{Opt_read_only, "ro"},		/* Alternate spelling */
	{Opt_read_write, "read_write"},
	{Opt_read_write, "rw"},		/* Alternate spelling */
	{Opt_lock_on_read, "lock_on_read"},
	{Opt_exclusive, "exclusive"},
	{Opt_notrim, "notrim"},
	{Opt_err, NULL}
};

struct rbd_options {
	int	queue_depth;
	int	alloc_size;
	unsigned long	lock_timeout;
	bool	read_only;
	bool	lock_on_read;
	bool	exclusive;
	bool	trim;
};

#define RBD_QUEUE_DEPTH_DEFAULT	BLKDEV_MAX_RQ
#define RBD_ALLOC_SIZE_DEFAULT	(64 * 1024)
#define RBD_LOCK_TIMEOUT_DEFAULT 0  /* no timeout */
#define RBD_READ_ONLY_DEFAULT	false
#define RBD_LOCK_ON_READ_DEFAULT false
#define RBD_EXCLUSIVE_DEFAULT	false
#define RBD_TRIM_DEFAULT	true

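/*
 * Illustrative mapping with options (values are arbitrary); each
 * comma-separated token ends up in parse_rbd_opts_token() below:
 *
 *   $ rbd map foo -o queue_depth=128,alloc_size=65536,lock_on_read,notrim
 */
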
struct parse_rbd_opts_ctx {
	struct rbd_spec		*spec;
	struct rbd_options	*opts;
};

static int parse_rbd_opts_token(char *c, void *private)
{
	struct parse_rbd_opts_ctx *pctx = private;
	substring_t argstr[MAX_OPT_ARGS];
	int token, intval, ret;

	token = match_token(c, rbd_opts_tokens, argstr);
	if (token < Opt_last_int) {
		ret = match_int(&argstr[0], &intval);
		if (ret < 0) {
			pr_err("bad option arg (not int) at '%s'\n", c);
			return ret;
		}
		dout("got int token %d val %d\n", token, intval);
	} else if (token > Opt_last_int && token < Opt_last_string) {
		dout("got string token %d val %s\n", token, argstr[0].from);
	} else {
		dout("got token %d\n", token);
	}

	switch (token) {
	case Opt_queue_depth:
		if (intval < 1) {
			pr_err("queue_depth out of range\n");
			return -EINVAL;
		}
		pctx->opts->queue_depth = intval;
		break;
	case Opt_alloc_size:
		if (intval < SECTOR_SIZE) {
			pr_err("alloc_size out of range\n");
			return -EINVAL;
		}
		if (!is_power_of_2(intval)) {
			pr_err("alloc_size must be a power of 2\n");
			return -EINVAL;
		}
		pctx->opts->alloc_size = intval;
		break;
	case Opt_lock_timeout:
		/* 0 is "wait forever" (i.e. infinite timeout) */
		if (intval < 0 || intval > INT_MAX / 1000) {
			pr_err("lock_timeout out of range\n");
			return -EINVAL;
		}
		pctx->opts->lock_timeout = msecs_to_jiffies(intval * 1000);
		break;
	case Opt_pool_ns:
		kfree(pctx->spec->pool_ns);
		pctx->spec->pool_ns = match_strdup(argstr);
		if (!pctx->spec->pool_ns)
			return -ENOMEM;
		break;
	case Opt_read_only:
		pctx->opts->read_only = true;
		break;
	case Opt_read_write:
		pctx->opts->read_only = false;
		break;
	case Opt_lock_on_read:
		pctx->opts->lock_on_read = true;
		break;
	case Opt_exclusive:
		pctx->opts->exclusive = true;
		break;
	case Opt_notrim:
		pctx->opts->trim = false;
		break;
	default:
		/* libceph prints "bad option" msg */
		return -EINVAL;
	}

	return 0;
}

static char* obj_op_name(enum obj_operation_type op_type)
{
	switch (op_type) {
	case OBJ_OP_READ:
		return "read";
	case OBJ_OP_WRITE:
		return "write";
	case OBJ_OP_DISCARD:
		return "discard";
	case OBJ_OP_ZEROOUT:
		return "zeroout";
	default:
		return "???";
	}
}

/*
 * Destroy ceph client.  Called via kref_put() when the last reference
 * is dropped; takes rbd_client_list_lock itself to unlink the client.
 */
static void rbd_client_release(struct kref *kref)
{
	struct rbd_client *rbdc = container_of(kref, struct rbd_client, kref);

	dout("%s: rbdc %p\n", __func__, rbdc);
	spin_lock(&rbd_client_list_lock);
	list_del(&rbdc->node);
	spin_unlock(&rbd_client_list_lock);

	ceph_destroy_client(rbdc->client);
	kfree(rbdc);
}

/*
 * Drop reference to ceph client node. If it's not referenced anymore, release
 * it.
 */
static void rbd_put_client(struct rbd_client *rbdc)
{
	if (rbdc)
		kref_put(&rbdc->kref, rbd_client_release);
}

/*
 * Get a ceph client with specific addr and configuration, if one does
 * not exist create it.  Either way, ceph_opts is consumed by this
 * function.
 */
static struct rbd_client *rbd_get_client(struct ceph_options *ceph_opts)
{
	struct rbd_client *rbdc;
	int ret;

	mutex_lock(&client_mutex);
	rbdc = rbd_client_find(ceph_opts);
	if (rbdc) {
		ceph_destroy_options(ceph_opts);

		/*
		 * Using an existing client.  Make sure ->pg_pools is up to
		 * date before we look up the pool id in do_rbd_add().
		 */
		ret = ceph_wait_for_latest_osdmap(rbdc->client,
					rbdc->client->options->mount_timeout);
		if (ret) {
			rbd_warn(NULL, "failed to get latest osdmap: %d", ret);
			rbd_put_client(rbdc);
			rbdc = ERR_PTR(ret);
		}
	} else {
		rbdc = rbd_client_create(ceph_opts);
	}
	mutex_unlock(&client_mutex);

	return rbdc;
}

static bool rbd_image_format_valid(u32 image_format)
{
	return image_format == 1 || image_format == 2;
}

static bool rbd_dev_ondisk_valid(struct rbd_image_header_ondisk *ondisk)
{
	size_t size;
	u32 snap_count;

	/* The header has to start with the magic rbd header text */
	if (memcmp(&ondisk->text, RBD_HEADER_TEXT, sizeof (RBD_HEADER_TEXT)))
		return false;

	/* The bio layer requires at least sector-sized I/O */

	if (ondisk->options.order < SECTOR_SHIFT)
		return false;

	/* If we use u64 in a few spots we may be able to loosen this */

	if (ondisk->options.order > 8 * sizeof (int) - 1)
		return false;

	/*
	 * The size of a snapshot header has to fit in a size_t, and
	 * that limits the number of snapshots.
	 */
	snap_count = le32_to_cpu(ondisk->snap_count);
	size = SIZE_MAX - sizeof (struct ceph_snap_context);
	if (snap_count > size / sizeof (__le64))
		return false;

	/*
	 * Not only that, but the size of the entire snapshot
	 * header must also be representable in a size_t.
	 */
	size -= snap_count * sizeof (__le64);
	if ((u64) size < le64_to_cpu(ondisk->snap_names_len))
		return false;

	return true;
}

/*
 * returns the size of an object in the image
 */
static u32 rbd_obj_bytes(struct rbd_image_header *header)
{
	return 1U << header->obj_order;
}

static void rbd_init_layout(struct rbd_device *rbd_dev)
{
	if (rbd_dev->header.stripe_unit == 0 ||
	    rbd_dev->header.stripe_count == 0) {
		rbd_dev->header.stripe_unit = rbd_obj_bytes(&rbd_dev->header);
		rbd_dev->header.stripe_count = 1;
	}

	rbd_dev->layout.stripe_unit = rbd_dev->header.stripe_unit;
	rbd_dev->layout.stripe_count = rbd_dev->header.stripe_count;
	rbd_dev->layout.object_size = rbd_obj_bytes(&rbd_dev->header);
	rbd_dev->layout.pool_id = rbd_dev->header.data_pool_id == CEPH_NOPOOL ?
			  rbd_dev->spec->pool_id : rbd_dev->header.data_pool_id;
	RCU_INIT_POINTER(rbd_dev->layout.pool_ns, NULL);
}
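
/*
 * For a typical image (assuming the usual defaults: obj_order 22, i.e.
 * 4M objects, and no separate data pool), this ends up with
 * stripe_unit == object_size == 4M, stripe_count == 1 and pool_id
 * equal to the mapped pool's id.
 */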

/*
 * Fill an rbd image header with information from the given format 1
 * on-disk header.
 */
static int rbd_header_from_disk(struct rbd_device *rbd_dev,
				 struct rbd_image_header_ondisk *ondisk)
{
	struct rbd_image_header *header = &rbd_dev->header;
	bool first_time = header->object_prefix == NULL;
	struct ceph_snap_context *snapc;
	char *object_prefix = NULL;
	char *snap_names = NULL;
	u64 *snap_sizes = NULL;
	u32 snap_count;
	int ret = -ENOMEM;
	u32 i;

	/* Allocate this now to avoid having to handle failure below */

	if (first_time) {
		object_prefix = kstrndup(ondisk->object_prefix,
					 sizeof(ondisk->object_prefix),
					 GFP_KERNEL);
		if (!object_prefix)
			return -ENOMEM;
	}

	/* Allocate the snapshot context and fill it in */

	snap_count = le32_to_cpu(ondisk->snap_count);
	snapc = ceph_create_snap_context(snap_count, GFP_KERNEL);
	if (!snapc)
		goto out_err;
	snapc->seq = le64_to_cpu(ondisk->snap_seq);
	if (snap_count) {
		struct rbd_image_snap_ondisk *snaps;
		u64 snap_names_len = le64_to_cpu(ondisk->snap_names_len);

		/* We'll keep a copy of the snapshot names... */

		if (snap_names_len > (u64)SIZE_MAX)
			goto out_2big;
		snap_names = kmalloc(snap_names_len, GFP_KERNEL);
		if (!snap_names)
			goto out_err;

		/* ...as well as the array of their sizes. */
		snap_sizes = kmalloc_array(snap_count,
					   sizeof(*header->snap_sizes),
					   GFP_KERNEL);
		if (!snap_sizes)
			goto out_err;

		/*
		 * Copy the names, and fill in each snapshot's id
		 * and size.
		 *
		 * Note that rbd_dev_v1_header_info() guarantees the
		 * ondisk buffer we're working with has
		 * snap_names_len bytes beyond the end of the
		 * snapshot id array, this memcpy() is safe.
		 */
		memcpy(snap_names, &ondisk->snaps[snap_count], snap_names_len);
		snaps = ondisk->snaps;
		for (i = 0; i < snap_count; i++) {
			snapc->snaps[i] = le64_to_cpu(snaps[i].id);
			snap_sizes[i] = le64_to_cpu(snaps[i].image_size);
		}
	}

	/* We won't fail any more, fill in the header */

	if (first_time) {
		header->object_prefix = object_prefix;
		header->obj_order = ondisk->options.order;
		rbd_init_layout(rbd_dev);
	} else {
		ceph_put_snap_context(header->snapc);
		kfree(header->snap_names);
		kfree(header->snap_sizes);
	}

	/* The remaining fields always get updated (when we refresh) */

	header->image_size = le64_to_cpu(ondisk->image_size);
	header->snapc = snapc;
	header->snap_names = snap_names;
	header->snap_sizes = snap_sizes;

	return 0;
out_2big:
	ret = -EIO;
out_err:
	kfree(snap_sizes);
	kfree(snap_names);
	ceph_put_snap_context(snapc);
	kfree(object_prefix);

	return ret;
}

static const char *_rbd_dev_v1_snap_name(struct rbd_device *rbd_dev, u32 which)
{
	const char *snap_name;

	rbd_assert(which < rbd_dev->header.snapc->num_snaps);

	/* Skip over names until we find the one we are looking for */

	snap_name = rbd_dev->header.snap_names;
	while (which--)
		snap_name += strlen(snap_name) + 1;

	return kstrdup(snap_name, GFP_KERNEL);
}

Alex Elder30d1cff2013-05-01 12:43:03 -05001198/*
1199 * Snapshot id comparison function for use with qsort()/bsearch().
1200 * Note that result is for snapshots in *descending* order.
1201 */
1202static int snapid_compare_reverse(const void *s1, const void *s2)
1203{
1204 u64 snap_id1 = *(u64 *)s1;
1205 u64 snap_id2 = *(u64 *)s2;
1206
1207 if (snap_id1 < snap_id2)
1208 return 1;
1209 return snap_id1 == snap_id2 ? 0 : -1;
1210}
1211
1212/*
1213 * Search a snapshot context to see if the given snapshot id is
1214 * present.
1215 *
1216 * Returns the position of the snapshot id in the array if it's found,
1217 * or BAD_SNAP_INDEX otherwise.
1218 *
1219 * Note: The snapshot array is kept sorted (by the osd) in
1220 * reverse order, highest snapshot id first.
1221 */
Alex Elder9682fc62013-04-30 00:44:33 -05001222static u32 rbd_dev_snap_index(struct rbd_device *rbd_dev, u64 snap_id)
1223{
1224 struct ceph_snap_context *snapc = rbd_dev->header.snapc;
Alex Elder30d1cff2013-05-01 12:43:03 -05001225 u64 *found;
Alex Elder9682fc62013-04-30 00:44:33 -05001226
Alex Elder30d1cff2013-05-01 12:43:03 -05001227 found = bsearch(&snap_id, &snapc->snaps, snapc->num_snaps,
1228 sizeof(snap_id), snapid_compare_reverse);
Alex Elder9682fc62013-04-30 00:44:33 -05001229
Alex Elder30d1cff2013-05-01 12:43:03 -05001230 return found ? (u32)(found - &snapc->snaps[0]) : BAD_SNAP_INDEX;
Alex Elder9682fc62013-04-30 00:44:33 -05001231}
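/*
 * Illustrative user-space sketch (not rbd.c code): bsearch(3) handles a
 * descending array as long as the comparator is inverted, which is what
 * snapid_compare_reverse() does above. The snapshot ids are made up.
 */
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>

static int cmp_reverse(const void *s1, const void *s2)
{
	uint64_t a = *(const uint64_t *)s1;
	uint64_t b = *(const uint64_t *)s2;

	if (a < b)
		return 1;
	return a == b ? 0 : -1;
}

int main(void)
{
	uint64_t snaps[] = { 40, 30, 20, 10 };	/* highest id first */
	uint64_t key = 20;
	uint64_t *found = bsearch(&key, snaps, 4, sizeof(key), cmp_reverse);

	if (found)
		printf("index %td\n", found - snaps);	/* prints "index 2" */
	return 0;
}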
1232
Alex Elder2ad3d712013-04-30 00:44:33 -05001233static const char *rbd_dev_v1_snap_name(struct rbd_device *rbd_dev,
1234 u64 snap_id)
Alex Elder54cac612013-04-30 00:44:33 -05001235{
1236 u32 which;
Josh Durginda6a6b62013-09-04 17:57:31 -07001237 const char *snap_name;
Alex Elder54cac612013-04-30 00:44:33 -05001238
1239 which = rbd_dev_snap_index(rbd_dev, snap_id);
1240 if (which == BAD_SNAP_INDEX)
Josh Durginda6a6b62013-09-04 17:57:31 -07001241 return ERR_PTR(-ENOENT);
Alex Elder54cac612013-04-30 00:44:33 -05001242
Josh Durginda6a6b62013-09-04 17:57:31 -07001243 snap_name = _rbd_dev_v1_snap_name(rbd_dev, which);
1244 return snap_name ? snap_name : ERR_PTR(-ENOMEM);
Alex Elder54cac612013-04-30 00:44:33 -05001245}
1246
Alex Elder9e15b772012-10-30 19:40:33 -05001247static const char *rbd_snap_name(struct rbd_device *rbd_dev, u64 snap_id)
1248{
Alex Elder9e15b772012-10-30 19:40:33 -05001249 if (snap_id == CEPH_NOSNAP)
1250 return RBD_SNAP_HEAD_NAME;
1251
Alex Elder54cac612013-04-30 00:44:33 -05001252 rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
1253 if (rbd_dev->image_format == 1)
1254 return rbd_dev_v1_snap_name(rbd_dev, snap_id);
Alex Elder9e15b772012-10-30 19:40:33 -05001255
Alex Elder54cac612013-04-30 00:44:33 -05001256 return rbd_dev_v2_snap_name(rbd_dev, snap_id);
Alex Elder9e15b772012-10-30 19:40:33 -05001257}
1258
Alex Elder2ad3d712013-04-30 00:44:33 -05001259static int rbd_snap_size(struct rbd_device *rbd_dev, u64 snap_id,
1260 u64 *snap_size)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001261{
Alex Elder2ad3d712013-04-30 00:44:33 -05001262 rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
1263 if (snap_id == CEPH_NOSNAP) {
1264 *snap_size = rbd_dev->header.image_size;
1265 } else if (rbd_dev->image_format == 1) {
1266 u32 which;
Alex Elder00f1f362012-02-07 12:03:36 -06001267
Alex Elder2ad3d712013-04-30 00:44:33 -05001268 which = rbd_dev_snap_index(rbd_dev, snap_id);
1269 if (which == BAD_SNAP_INDEX)
1270 return -ENOENT;
Alex Elder00f1f362012-02-07 12:03:36 -06001271
Alex Elder2ad3d712013-04-30 00:44:33 -05001272 *snap_size = rbd_dev->header.snap_sizes[which];
1273 } else {
1274 u64 size = 0;
1275 int ret;
1276
1277 ret = _rbd_dev_v2_snap_size(rbd_dev, snap_id, NULL, &size);
1278 if (ret)
1279 return ret;
1280
1281 *snap_size = size;
1282 }
1283 return 0;
1284}
1285
1286static int rbd_snap_features(struct rbd_device *rbd_dev, u64 snap_id,
1287 u64 *snap_features)
1288{
1289 rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
1290 if (snap_id == CEPH_NOSNAP) {
1291 *snap_features = rbd_dev->header.features;
1292 } else if (rbd_dev->image_format == 1) {
1293 *snap_features = 0; /* No features for format 1 */
1294 } else {
1295 u64 features = 0;
1296 int ret;
1297
1298 ret = _rbd_dev_v2_snap_features(rbd_dev, snap_id, &features);
1299 if (ret)
1300 return ret;
1301
1302 *snap_features = features;
1303 }
1304 return 0;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001305}
1306
Alex Elderd1cf5782013-04-27 09:59:30 -05001307static int rbd_dev_mapping_set(struct rbd_device *rbd_dev)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001308{
Alex Elder8f4b7d92013-05-06 07:40:30 -05001309 u64 snap_id = rbd_dev->spec->snap_id;
Alex Elder2ad3d712013-04-30 00:44:33 -05001310 u64 size = 0;
1311 u64 features = 0;
1312 int ret;
Alex Elder8b0241f2013-04-25 23:15:08 -05001313
Alex Elder2ad3d712013-04-30 00:44:33 -05001314 ret = rbd_snap_size(rbd_dev, snap_id, &size);
1315 if (ret)
1316 return ret;
1317 ret = rbd_snap_features(rbd_dev, snap_id, &features);
1318 if (ret)
1319 return ret;
1320
1321 rbd_dev->mapping.size = size;
1322 rbd_dev->mapping.features = features;
1323
Alex Elder8b0241f2013-04-25 23:15:08 -05001324 return 0;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001325}
1326
Alex Elderd1cf5782013-04-27 09:59:30 -05001327static void rbd_dev_mapping_clear(struct rbd_device *rbd_dev)
1328{
1329 rbd_dev->mapping.size = 0;
1330 rbd_dev->mapping.features = 0;
Alex Elder200a6a82013-04-28 23:32:34 -05001331}
1332
Ilya Dryomov5359a172018-01-20 10:30:10 +01001333static void zero_bvec(struct bio_vec *bv)
Alex Elder65ccfe22012-08-09 10:33:26 -07001334{
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001335 void *buf;
Ilya Dryomov5359a172018-01-20 10:30:10 +01001336 unsigned long flags;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001337
Ilya Dryomov5359a172018-01-20 10:30:10 +01001338 buf = bvec_kmap_irq(bv, &flags);
1339 memset(buf, 0, bv->bv_len);
1340 flush_dcache_page(bv->bv_page);
1341 bvec_kunmap_irq(buf, &flags);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001342}
1343
Ilya Dryomov5359a172018-01-20 10:30:10 +01001344static void zero_bios(struct ceph_bio_iter *bio_pos, u32 off, u32 bytes)
Alex Elderb9434c52013-04-19 15:34:50 -05001345{
Ilya Dryomov5359a172018-01-20 10:30:10 +01001346 struct ceph_bio_iter it = *bio_pos;
Alex Elderb9434c52013-04-19 15:34:50 -05001347
Ilya Dryomov5359a172018-01-20 10:30:10 +01001348 ceph_bio_iter_advance(&it, off);
1349 ceph_bio_iter_advance_step(&it, bytes, ({
1350 zero_bvec(&bv);
1351 }));
Alex Elderb9434c52013-04-19 15:34:50 -05001352}
1353
Ilya Dryomov7e07efb2018-01-20 10:30:11 +01001354static void zero_bvecs(struct ceph_bvec_iter *bvec_pos, u32 off, u32 bytes)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001355{
Ilya Dryomov7e07efb2018-01-20 10:30:11 +01001356 struct ceph_bvec_iter it = *bvec_pos;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001357
Ilya Dryomov7e07efb2018-01-20 10:30:11 +01001358 ceph_bvec_iter_advance(&it, off);
1359 ceph_bvec_iter_advance_step(&it, bytes, ({
1360 zero_bvec(&bv);
1361 }));
Alex Elderf7760da2012-10-20 22:17:27 -05001362}
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001363
Alex Elderf7760da2012-10-20 22:17:27 -05001364/*
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001365 * Zero a range in @obj_req data buffer defined by a bio (list) or
Ilya Dryomovafb97882018-02-06 19:26:35 +01001366 * (private) bio_vec array.
Alex Elderf7760da2012-10-20 22:17:27 -05001367 *
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001368 * @off is relative to the start of the data buffer.
Alex Elderf7760da2012-10-20 22:17:27 -05001369 */
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001370static void rbd_obj_zero_range(struct rbd_obj_request *obj_req, u32 off,
1371 u32 bytes)
Alex Elderf7760da2012-10-20 22:17:27 -05001372{
Ilya Dryomov54ab3b22019-05-11 16:21:49 +02001373 dout("%s %p data buf %u~%u\n", __func__, obj_req, off, bytes);
1374
Ilya Dryomovecc633c2018-02-01 11:50:47 +01001375 switch (obj_req->img_request->data_type) {
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001376 case OBJ_REQUEST_BIO:
1377 zero_bios(&obj_req->bio_pos, off, bytes);
1378 break;
1379 case OBJ_REQUEST_BVECS:
Ilya Dryomovafb97882018-02-06 19:26:35 +01001380 case OBJ_REQUEST_OWN_BVECS:
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001381 zero_bvecs(&obj_req->bvec_pos, off, bytes);
1382 break;
1383 default:
Arnd Bergmann16809372019-03-22 17:53:56 +01001384 BUG();
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001385 }
Alex Elderbf0d5f502012-11-22 00:00:08 -06001386}
1387
1388static void rbd_obj_request_destroy(struct kref *kref);
1389static void rbd_obj_request_put(struct rbd_obj_request *obj_request)
1390{
1391 rbd_assert(obj_request != NULL);
Alex Elder37206ee2013-02-20 17:32:08 -06001392 dout("%s: obj %p (was %d)\n", __func__, obj_request,
Peter Zijlstra2c935bc2016-11-14 17:29:48 +01001393 kref_read(&obj_request->kref));
Alex Elderbf0d5f502012-11-22 00:00:08 -06001394 kref_put(&obj_request->kref, rbd_obj_request_destroy);
1395}
1396
Alex Elderbf0d5f502012-11-22 00:00:08 -06001397static void rbd_img_request_destroy(struct kref *kref);
1398static void rbd_img_request_put(struct rbd_img_request *img_request)
1399{
1400 rbd_assert(img_request != NULL);
Alex Elder37206ee2013-02-20 17:32:08 -06001401 dout("%s: img %p (was %d)\n", __func__, img_request,
Peter Zijlstra2c935bc2016-11-14 17:29:48 +01001402 kref_read(&img_request->kref));
Ilya Dryomove93aca02018-02-06 19:26:35 +01001403 kref_put(&img_request->kref, rbd_img_request_destroy);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001404}
1405
1406static inline void rbd_img_obj_request_add(struct rbd_img_request *img_request,
1407 struct rbd_obj_request *obj_request)
1408{
Alex Elder25dcf952013-01-25 17:08:55 -06001409 rbd_assert(obj_request->img_request == NULL);
1410
Alex Elderb155e862013-04-15 14:50:37 -05001411 /* Image request now owns object's original reference */
Alex Elderbf0d5f502012-11-22 00:00:08 -06001412 obj_request->img_request = img_request;
Ilya Dryomov15961b42018-02-01 11:50:47 +01001413 dout("%s: img %p obj %p\n", __func__, img_request, obj_request);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001414}
1415
1416static inline void rbd_img_obj_request_del(struct rbd_img_request *img_request,
1417 struct rbd_obj_request *obj_request)
1418{
Ilya Dryomov15961b42018-02-01 11:50:47 +01001419 dout("%s: img %p obj %p\n", __func__, img_request, obj_request);
Ilya Dryomov43df3d32018-02-02 15:23:22 +01001420 list_del(&obj_request->ex.oe_item);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001421 rbd_assert(obj_request->img_request == img_request);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001422 rbd_obj_request_put(obj_request);
1423}
1424
Ilya Dryomova086a1b2019-06-12 18:33:31 +02001425static void rbd_osd_submit(struct ceph_osd_request *osd_req)
Alex Elderbf0d5f502012-11-22 00:00:08 -06001426{
Ilya Dryomova086a1b2019-06-12 18:33:31 +02001427 struct rbd_obj_request *obj_req = osd_req->r_priv;
Ilya Dryomov980917f2016-09-12 18:59:42 +02001428
Ilya Dryomova086a1b2019-06-12 18:33:31 +02001429 dout("%s osd_req %p for obj_req %p objno %llu %llu~%llu\n",
1430 __func__, osd_req, obj_req, obj_req->ex.oe_objno,
1431 obj_req->ex.oe_off, obj_req->ex.oe_len);
Ilya Dryomov980917f2016-09-12 18:59:42 +02001432 ceph_osdc_start_request(osd_req->r_osdc, osd_req, false);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001433}
1434
Alex Elder0c425242013-02-08 09:55:49 -06001435/*
1436 * The default/initial value for all image request flags is 0. Each
1437 * is conditionally set to 1 at image request initialization time
1438 * and currently never changes thereafter.
1439 */
Alex Elderd0b2e942013-01-24 16:13:36 -06001440static void img_request_layered_set(struct rbd_img_request *img_request)
1441{
1442 set_bit(IMG_REQ_LAYERED, &img_request->flags);
1443 smp_mb();
1444}
1445
Alex Eldera2acd002013-05-08 22:50:04 -05001446static void img_request_layered_clear(struct rbd_img_request *img_request)
1447{
1448 clear_bit(IMG_REQ_LAYERED, &img_request->flags);
1449 smp_mb();
1450}
1451
Alex Elderd0b2e942013-01-24 16:13:36 -06001452static bool img_request_layered_test(struct rbd_img_request *img_request)
1453{
1454 smp_mb();
1455 return test_bit(IMG_REQ_LAYERED, &img_request->flags) != 0;
1456}
1457
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001458static bool rbd_obj_is_entire(struct rbd_obj_request *obj_req)
Josh Durgin3b434a2a2014-04-04 17:32:15 -07001459{
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001460 struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
1461
Ilya Dryomov43df3d32018-02-02 15:23:22 +01001462 return !obj_req->ex.oe_off &&
1463 obj_req->ex.oe_len == rbd_dev->layout.object_size;
Josh Durgin3b434a2a2014-04-04 17:32:15 -07001464}
1465
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001466static bool rbd_obj_is_tail(struct rbd_obj_request *obj_req)
Alex Elder6e2a4502013-03-27 09:16:30 -05001467{
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001468 struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
Alex Elderb9434c52013-04-19 15:34:50 -05001469
Ilya Dryomov43df3d32018-02-02 15:23:22 +01001470 return obj_req->ex.oe_off + obj_req->ex.oe_len ==
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001471 rbd_dev->layout.object_size;
1472}
1473
Ilya Dryomov13488d52019-02-25 12:37:50 +01001474/*
1475 * Must be called after rbd_obj_calc_img_extents().
1476 */
1477static bool rbd_obj_copyup_enabled(struct rbd_obj_request *obj_req)
1478{
1479 if (!obj_req->num_img_extents ||
Ilya Dryomov9b17eb22019-02-28 15:51:39 +01001480 (rbd_obj_is_entire(obj_req) &&
1481 !obj_req->img_request->snapc->num_snaps))
Ilya Dryomov13488d52019-02-25 12:37:50 +01001482 return false;
1483
1484 return true;
1485}
1486
Ilya Dryomov86bd7992018-02-06 19:26:33 +01001487static u64 rbd_obj_img_extents_bytes(struct rbd_obj_request *obj_req)
1488{
1489 return ceph_file_extents_bytes(obj_req->img_extents,
1490 obj_req->num_img_extents);
1491}
1492
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001493static bool rbd_img_is_write(struct rbd_img_request *img_req)
1494{
Ilya Dryomov9bb02482018-01-30 17:52:10 +01001495 switch (img_req->op_type) {
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001496 case OBJ_OP_READ:
1497 return false;
1498 case OBJ_OP_WRITE:
1499 case OBJ_OP_DISCARD:
Ilya Dryomov6484cbe2019-01-29 12:46:25 +01001500 case OBJ_OP_ZEROOUT:
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001501 return true;
1502 default:
Arnd Bergmannc6244b32018-04-04 14:53:39 +02001503 BUG();
Alex Elder6e2a4502013-03-27 09:16:30 -05001504 }
Alex Elder6e2a4502013-03-27 09:16:30 -05001505}
1506
Ilya Dryomov85e084f2016-04-28 16:07:24 +02001507static void rbd_osd_req_callback(struct ceph_osd_request *osd_req)
Alex Elderbf0d5f502012-11-22 00:00:08 -06001508{
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001509 struct rbd_obj_request *obj_req = osd_req->r_priv;
Ilya Dryomov54ab3b22019-05-11 16:21:49 +02001510 int result;
Alex Elderbf0d5f502012-11-22 00:00:08 -06001511
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001512 dout("%s osd_req %p result %d for obj_req %p\n", __func__, osd_req,
1513 osd_req->r_result, obj_req);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001514
Ilya Dryomov54ab3b22019-05-11 16:21:49 +02001515 /*
1516 * Writes aren't allowed to return a data payload. In some
1517 * guarded write cases (e.g. stat + zero on an empty object)
1518 * a stat response makes it through, but we don't care.
1519 */
1520 if (osd_req->r_result > 0 && rbd_img_is_write(obj_req->img_request))
1521 result = 0;
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001522 else
Ilya Dryomov54ab3b22019-05-11 16:21:49 +02001523 result = osd_req->r_result;
Alex Elderbf0d5f502012-11-22 00:00:08 -06001524
Ilya Dryomov54ab3b22019-05-11 16:21:49 +02001525 rbd_obj_handle_request(obj_req, result);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001526}
1527
Ilya Dryomovbcbab1d2019-05-27 11:41:36 +02001528static void rbd_osd_format_read(struct ceph_osd_request *osd_req)
Alex Elder430c28c2013-04-03 21:32:51 -05001529{
Ilya Dryomovbcbab1d2019-05-27 11:41:36 +02001530 struct rbd_obj_request *obj_request = osd_req->r_priv;
Alex Elder430c28c2013-04-03 21:32:51 -05001531
Ilya Dryomova162b302018-01-30 17:52:10 +01001532 osd_req->r_flags = CEPH_OSD_FLAG_READ;
Ilya Dryomov7c848832016-09-15 17:56:39 +02001533 osd_req->r_snapid = obj_request->img_request->snap_id;
Alex Elder9d4df012013-04-19 15:34:50 -05001534}
1535
Ilya Dryomovbcbab1d2019-05-27 11:41:36 +02001536static void rbd_osd_format_write(struct ceph_osd_request *osd_req)
Alex Elder9d4df012013-04-19 15:34:50 -05001537{
Ilya Dryomovbcbab1d2019-05-27 11:41:36 +02001538 struct rbd_obj_request *obj_request = osd_req->r_priv;
Alex Elder9d4df012013-04-19 15:34:50 -05001539
Ilya Dryomova162b302018-01-30 17:52:10 +01001540 osd_req->r_flags = CEPH_OSD_FLAG_WRITE;
Arnd Bergmannfac02dd2018-07-13 22:18:37 +02001541 ktime_get_real_ts64(&osd_req->r_mtime);
Ilya Dryomov43df3d32018-02-02 15:23:22 +01001542 osd_req->r_data_offset = obj_request->ex.oe_off;
Alex Elder430c28c2013-04-03 21:32:51 -05001543}
1544
Ilya Dryomovbc812072017-01-25 18:16:23 +01001545static struct ceph_osd_request *
Ilya Dryomovbcbab1d2019-05-27 11:41:36 +02001546__rbd_obj_add_osd_request(struct rbd_obj_request *obj_req,
1547 struct ceph_snap_context *snapc, int num_ops)
Ilya Dryomovbc812072017-01-25 18:16:23 +01001548{
Ilya Dryomove28eded2019-02-25 11:42:26 +01001549 struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
Ilya Dryomovbc812072017-01-25 18:16:23 +01001550 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
1551 struct ceph_osd_request *req;
Ilya Dryomova90bb0c2017-01-25 18:16:23 +01001552 const char *name_format = rbd_dev->image_format == 1 ?
1553 RBD_V1_DATA_FORMAT : RBD_V2_DATA_FORMAT;
Ilya Dryomovbcbab1d2019-05-27 11:41:36 +02001554 int ret;
Ilya Dryomovbc812072017-01-25 18:16:23 +01001555
Ilya Dryomove28eded2019-02-25 11:42:26 +01001556 req = ceph_osdc_alloc_request(osdc, snapc, num_ops, false, GFP_NOIO);
Ilya Dryomovbc812072017-01-25 18:16:23 +01001557 if (!req)
Ilya Dryomovbcbab1d2019-05-27 11:41:36 +02001558 return ERR_PTR(-ENOMEM);
Ilya Dryomovbc812072017-01-25 18:16:23 +01001559
Ilya Dryomovbcbab1d2019-05-27 11:41:36 +02001560 list_add_tail(&req->r_private_item, &obj_req->osd_reqs);
Ilya Dryomovbc812072017-01-25 18:16:23 +01001561 req->r_callback = rbd_osd_req_callback;
Ilya Dryomova162b302018-01-30 17:52:10 +01001562 req->r_priv = obj_req;
Ilya Dryomovbc812072017-01-25 18:16:23 +01001563
Ilya Dryomovb26c0472018-07-03 15:28:43 +02001564 /*
1565 * Data objects may be stored in a separate pool, but always in
1566 * the same namespace in that pool as the header in its pool.
1567 */
1568 ceph_oloc_copy(&req->r_base_oloc, &rbd_dev->header_oloc);
Ilya Dryomovbc812072017-01-25 18:16:23 +01001569 req->r_base_oloc.pool = rbd_dev->layout.pool_id;
Ilya Dryomovb26c0472018-07-03 15:28:43 +02001570
Ilya Dryomovbcbab1d2019-05-27 11:41:36 +02001571 ret = ceph_oid_aprintf(&req->r_base_oid, GFP_NOIO, name_format,
1572 rbd_dev->header.object_prefix,
1573 obj_req->ex.oe_objno);
1574 if (ret)
1575 return ERR_PTR(ret);
Ilya Dryomovbc812072017-01-25 18:16:23 +01001576
Ilya Dryomovbc812072017-01-25 18:16:23 +01001577 return req;
Ilya Dryomovbc812072017-01-25 18:16:23 +01001578}
1579
Ilya Dryomove28eded2019-02-25 11:42:26 +01001580static struct ceph_osd_request *
Ilya Dryomovbcbab1d2019-05-27 11:41:36 +02001581rbd_obj_add_osd_request(struct rbd_obj_request *obj_req, int num_ops)
Ilya Dryomove28eded2019-02-25 11:42:26 +01001582{
Ilya Dryomovbcbab1d2019-05-27 11:41:36 +02001583 return __rbd_obj_add_osd_request(obj_req, obj_req->img_request->snapc,
1584 num_ops);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001585}
1586
Ilya Dryomovecc633c2018-02-01 11:50:47 +01001587static struct rbd_obj_request *rbd_obj_request_create(void)
Alex Elderbf0d5f502012-11-22 00:00:08 -06001588{
1589 struct rbd_obj_request *obj_request;
Alex Elderbf0d5f502012-11-22 00:00:08 -06001590
Ilya Dryomov5a60e872015-06-24 17:24:33 +03001591 obj_request = kmem_cache_zalloc(rbd_obj_request_cache, GFP_NOIO);
Ilya Dryomov6c696d82017-01-25 18:16:23 +01001592 if (!obj_request)
Alex Elderf907ad52013-05-01 12:43:03 -05001593 return NULL;
Alex Elderf907ad52013-05-01 12:43:03 -05001594
Ilya Dryomov43df3d32018-02-02 15:23:22 +01001595 ceph_object_extent_init(&obj_request->ex);
Ilya Dryomovbcbab1d2019-05-27 11:41:36 +02001596 INIT_LIST_HEAD(&obj_request->osd_reqs);
Ilya Dryomov85b5e6d2019-05-14 21:06:07 +02001597 mutex_init(&obj_request->state_mutex);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001598 kref_init(&obj_request->kref);
1599
Ilya Dryomov67e2b652017-01-25 18:16:22 +01001600 dout("%s %p\n", __func__, obj_request);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001601 return obj_request;
1602}
1603
1604static void rbd_obj_request_destroy(struct kref *kref)
1605{
1606 struct rbd_obj_request *obj_request;
Ilya Dryomovbcbab1d2019-05-27 11:41:36 +02001607 struct ceph_osd_request *osd_req;
Ilya Dryomov7e07efb2018-01-20 10:30:11 +01001608 u32 i;
Alex Elderbf0d5f502012-11-22 00:00:08 -06001609
1610 obj_request = container_of(kref, struct rbd_obj_request, kref);
1611
Alex Elder37206ee2013-02-20 17:32:08 -06001612 dout("%s: obj %p\n", __func__, obj_request);
1613
Ilya Dryomovbcbab1d2019-05-27 11:41:36 +02001614 while (!list_empty(&obj_request->osd_reqs)) {
1615 osd_req = list_first_entry(&obj_request->osd_reqs,
1616 struct ceph_osd_request, r_private_item);
1617 list_del_init(&osd_req->r_private_item);
1618 ceph_osdc_put_request(osd_req);
1619 }
Alex Elderbf0d5f502012-11-22 00:00:08 -06001620
Ilya Dryomovecc633c2018-02-01 11:50:47 +01001621 switch (obj_request->img_request->data_type) {
Alex Elder9969ebc2013-01-18 12:31:10 -06001622 case OBJ_REQUEST_NODATA:
Alex Elderbf0d5f502012-11-22 00:00:08 -06001623 case OBJ_REQUEST_BIO:
Ilya Dryomov7e07efb2018-01-20 10:30:11 +01001624 case OBJ_REQUEST_BVECS:
Ilya Dryomov5359a172018-01-20 10:30:10 +01001625 break; /* Nothing to do */
Ilya Dryomovafb97882018-02-06 19:26:35 +01001626 case OBJ_REQUEST_OWN_BVECS:
1627 kfree(obj_request->bvec_pos.bvecs);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001628 break;
Ilya Dryomov7e07efb2018-01-20 10:30:11 +01001629 default:
Arnd Bergmann16809372019-03-22 17:53:56 +01001630 BUG();
Alex Elderbf0d5f502012-11-22 00:00:08 -06001631 }
1632
Ilya Dryomov86bd7992018-02-06 19:26:33 +01001633 kfree(obj_request->img_extents);
Ilya Dryomov7e07efb2018-01-20 10:30:11 +01001634 if (obj_request->copyup_bvecs) {
1635 for (i = 0; i < obj_request->copyup_bvec_count; i++) {
1636 if (obj_request->copyup_bvecs[i].bv_page)
1637 __free_page(obj_request->copyup_bvecs[i].bv_page);
1638 }
1639 kfree(obj_request->copyup_bvecs);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001640 }
1641
Alex Elder868311b2013-05-01 12:43:03 -05001642 kmem_cache_free(rbd_obj_request_cache, obj_request);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001643}
1644
Alex Elderfb65d2282013-05-08 22:50:04 -05001645/* It's OK to call this for a device with no parent */
1646
1647static void rbd_spec_put(struct rbd_spec *spec);
1648static void rbd_dev_unparent(struct rbd_device *rbd_dev)
1649{
1650 rbd_dev_remove_parent(rbd_dev);
1651 rbd_spec_put(rbd_dev->parent_spec);
1652 rbd_dev->parent_spec = NULL;
1653 rbd_dev->parent_overlap = 0;
1654}
1655
Alex Elderbf0d5f502012-11-22 00:00:08 -06001656/*
Alex Eldera2acd002013-05-08 22:50:04 -05001657 * Parent image reference counting is used to determine when an
1658 * image's parent fields can be safely torn down--after there are no
1659 * more in-flight requests to the parent image. When the last
1660 * reference is dropped, cleaning them up is safe.
1661 */
1662static void rbd_dev_parent_put(struct rbd_device *rbd_dev)
1663{
1664 int counter;
1665
1666 if (!rbd_dev->parent_spec)
1667 return;
1668
1669 counter = atomic_dec_return_safe(&rbd_dev->parent_ref);
1670 if (counter > 0)
1671 return;
1672
1673 /* Last reference; clean up parent data structures */
1674
1675 if (!counter)
1676 rbd_dev_unparent(rbd_dev);
1677 else
Ilya Dryomov9584d502014-07-11 12:11:20 +04001678 rbd_warn(rbd_dev, "parent reference underflow");
Alex Eldera2acd002013-05-08 22:50:04 -05001679}
1680
1681/*
1682 * If an image has a non-zero parent overlap, get a reference to its
1683 * parent.
1684 *
1685 * Returns true if the rbd device has a parent with a non-zero
1686 * overlap and a reference for it was successfully taken, or
1687 * false otherwise.
1688 */
1689static bool rbd_dev_parent_get(struct rbd_device *rbd_dev)
1690{
Ilya Dryomovae43e9d2015-01-19 18:13:43 +03001691 int counter = 0;
Alex Eldera2acd002013-05-08 22:50:04 -05001692
1693 if (!rbd_dev->parent_spec)
1694 return false;
1695
Ilya Dryomovae43e9d2015-01-19 18:13:43 +03001696 down_read(&rbd_dev->header_rwsem);
1697 if (rbd_dev->parent_overlap)
1698 counter = atomic_inc_return_safe(&rbd_dev->parent_ref);
1699 up_read(&rbd_dev->header_rwsem);
Alex Eldera2acd002013-05-08 22:50:04 -05001700
1701 if (counter < 0)
Ilya Dryomov9584d502014-07-11 12:11:20 +04001702 rbd_warn(rbd_dev, "parent reference overflow");
Alex Eldera2acd002013-05-08 22:50:04 -05001703
Ilya Dryomovae43e9d2015-01-19 18:13:43 +03001704 return counter > 0;
Alex Eldera2acd002013-05-08 22:50:04 -05001705}
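/*
 * Typical pairing, as rbd_img_request_create()/rbd_img_request_destroy()
 * below use it: take a parent reference only while a non-zero overlap
 * is in place, and drop it once the last request touching the parent
 * completes. Sketch only; issue_requests_to_parent() is hypothetical.
 */
static void parent_ref_usage_sketch(struct rbd_device *rbd_dev)
{
	if (rbd_dev_parent_get(rbd_dev)) {
		/* parent_spec/parent_overlap can't be torn down here */
		issue_requests_to_parent(rbd_dev);	/* hypothetical */
		rbd_dev_parent_put(rbd_dev);	/* may drop the last ref */
	}
}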
1706
Alex Elderbf0d5f502012-11-22 00:00:08 -06001707/*
1708 * Caller is responsible for filling in the list of object requests
1709 * that comprises the image request, and the Linux request pointer
1710 * (if there is one).
1711 */
Alex Eldercc344fa2013-02-19 12:25:56 -06001712static struct rbd_img_request *rbd_img_request_create(
1713 struct rbd_device *rbd_dev,
Guangliang Zhao6d2940c2014-03-13 11:21:35 +08001714 enum obj_operation_type op_type,
Josh Durgin4e752f02014-04-08 11:12:11 -07001715 struct ceph_snap_context *snapc)
Alex Elderbf0d5f502012-11-22 00:00:08 -06001716{
1717 struct rbd_img_request *img_request;
Alex Elderbf0d5f502012-11-22 00:00:08 -06001718
Ilya Dryomova0c58952018-01-22 16:03:06 +01001719 img_request = kmem_cache_zalloc(rbd_img_request_cache, GFP_NOIO);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001720 if (!img_request)
1721 return NULL;
1722
Alex Elderbf0d5f502012-11-22 00:00:08 -06001723 img_request->rbd_dev = rbd_dev;
Ilya Dryomov9bb02482018-01-30 17:52:10 +01001724 img_request->op_type = op_type;
Ilya Dryomov9bb02482018-01-30 17:52:10 +01001725 if (!rbd_img_is_write(img_request))
Alex Elderbf0d5f502012-11-22 00:00:08 -06001726 img_request->snap_id = rbd_dev->spec->snap_id;
Ilya Dryomov9bb02482018-01-30 17:52:10 +01001727 else
1728 img_request->snapc = snapc;
1729
Alex Eldera2acd002013-05-08 22:50:04 -05001730 if (rbd_dev_parent_get(rbd_dev))
Alex Elderd0b2e942013-01-24 16:13:36 -06001731 img_request_layered_set(img_request);
Ilya Dryomova0c58952018-01-22 16:03:06 +01001732
Ilya Dryomove1fddc82019-05-30 16:07:48 +02001733 INIT_LIST_HEAD(&img_request->lock_item);
Ilya Dryomov43df3d32018-02-02 15:23:22 +01001734 INIT_LIST_HEAD(&img_request->object_extents);
Ilya Dryomov0192ce22019-05-16 15:06:56 +02001735 mutex_init(&img_request->state_mutex);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001736 kref_init(&img_request->kref);
1737
Ilya Dryomovdfd98752018-02-06 19:26:35 +01001738 dout("%s: rbd_dev %p %s -> img %p\n", __func__, rbd_dev,
1739 obj_op_name(op_type), img_request);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001740 return img_request;
1741}
1742
1743static void rbd_img_request_destroy(struct kref *kref)
1744{
1745 struct rbd_img_request *img_request;
1746 struct rbd_obj_request *obj_request;
1747 struct rbd_obj_request *next_obj_request;
1748
1749 img_request = container_of(kref, struct rbd_img_request, kref);
1750
Alex Elder37206ee2013-02-20 17:32:08 -06001751 dout("%s: img %p\n", __func__, img_request);
1752
Ilya Dryomove1fddc82019-05-30 16:07:48 +02001753 WARN_ON(!list_empty(&img_request->lock_item));
Alex Elderbf0d5f502012-11-22 00:00:08 -06001754 for_each_obj_request_safe(img_request, obj_request, next_obj_request)
1755 rbd_img_obj_request_del(img_request, obj_request);
1756
Alex Eldera2acd002013-05-08 22:50:04 -05001757 if (img_request_layered_test(img_request)) {
1758 img_request_layered_clear(img_request);
1759 rbd_dev_parent_put(img_request->rbd_dev);
1760 }
1761
Ilya Dryomov9bb02482018-01-30 17:52:10 +01001762 if (rbd_img_is_write(img_request))
Alex Elder812164f82013-04-30 00:44:32 -05001763 ceph_put_snap_context(img_request->snapc);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001764
Alex Elder1c2a9df2013-05-01 12:43:03 -05001765 kmem_cache_free(rbd_img_request_cache, img_request);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001766}
1767
Ilya Dryomov86bd7992018-02-06 19:26:33 +01001768static void prune_extents(struct ceph_file_extent *img_extents,
1769 u32 *num_img_extents, u64 overlap)
Alex Eldere93f3152013-05-08 22:50:04 -05001770{
Ilya Dryomov86bd7992018-02-06 19:26:33 +01001771 u32 cnt = *num_img_extents;
Alex Eldere93f3152013-05-08 22:50:04 -05001772
Ilya Dryomov86bd7992018-02-06 19:26:33 +01001773 /* drop extents completely beyond the overlap */
1774 while (cnt && img_extents[cnt - 1].fe_off >= overlap)
1775 cnt--;
Alex Eldere93f3152013-05-08 22:50:04 -05001776
Ilya Dryomov86bd7992018-02-06 19:26:33 +01001777 if (cnt) {
1778 struct ceph_file_extent *ex = &img_extents[cnt - 1];
Alex Eldere93f3152013-05-08 22:50:04 -05001779
Ilya Dryomov86bd7992018-02-06 19:26:33 +01001780 /* trim final overlapping extent */
1781 if (ex->fe_off + ex->fe_len > overlap)
1782 ex->fe_len = overlap - ex->fe_off;
Alex Elder12178572013-02-08 09:55:49 -06001783 }
1784
Ilya Dryomov86bd7992018-02-06 19:26:33 +01001785 *num_img_extents = cnt;
Alex Elder21692382013-04-05 01:27:12 -05001786}
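/*
 * Illustrative user-space sketch (not rbd.c code) of the two pruning
 * rules above: extents starting at or beyond the overlap are dropped,
 * and the final extent is trimmed to end at the overlap. With
 * overlap = 100, { {0,50}, {90,20}, {120,30} } becomes { {0,50}, {90,10} }.
 */
#include <stdio.h>
#include <stdint.h>

struct file_extent {
	uint64_t fe_off;
	uint64_t fe_len;
};

static void prune(struct file_extent *ex, uint32_t *cnt, uint64_t overlap)
{
	while (*cnt && ex[*cnt - 1].fe_off >= overlap)
		(*cnt)--;			/* completely beyond overlap */

	if (*cnt) {
		struct file_extent *last = &ex[*cnt - 1];

		if (last->fe_off + last->fe_len > overlap)
			last->fe_len = overlap - last->fe_off;	/* trim */
	}
}

int main(void)
{
	struct file_extent ex[] = { { 0, 50 }, { 90, 20 }, { 120, 30 } };
	uint32_t cnt = 3;

	prune(ex, &cnt, 100);
	printf("%u extents, last %llu~%llu\n", cnt,
	       (unsigned long long)ex[cnt - 1].fe_off,
	       (unsigned long long)ex[cnt - 1].fe_len);	/* 2, last 90~10 */
	return 0;
}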
1787
Alex Elderf1a47392013-04-19 15:34:50 -05001788/*
Ilya Dryomov86bd7992018-02-06 19:26:33 +01001789 * Determine the byte range(s) covered by either just the object extent
1790 * or the entire object in the parent image.
Josh Durgin3b434a2a2014-04-04 17:32:15 -07001791 */
Ilya Dryomov86bd7992018-02-06 19:26:33 +01001792static int rbd_obj_calc_img_extents(struct rbd_obj_request *obj_req,
1793 bool entire)
Josh Durgin3b434a2a2014-04-04 17:32:15 -07001794{
Ilya Dryomov86bd7992018-02-06 19:26:33 +01001795 struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
Alex Elderc5b5ef62013-02-11 12:33:24 -06001796 int ret;
1797
Ilya Dryomov86bd7992018-02-06 19:26:33 +01001798 if (!rbd_dev->parent_overlap)
1799 return 0;
1800
1801 ret = ceph_extent_to_file(&rbd_dev->layout, obj_req->ex.oe_objno,
1802 entire ? 0 : obj_req->ex.oe_off,
1803 entire ? rbd_dev->layout.object_size :
1804 obj_req->ex.oe_len,
1805 &obj_req->img_extents,
1806 &obj_req->num_img_extents);
1807 if (ret)
1808 return ret;
1809
1810 prune_extents(obj_req->img_extents, &obj_req->num_img_extents,
1811 rbd_dev->parent_overlap);
1812 return 0;
1813}
1814
Ilya Dryomovbcbab1d2019-05-27 11:41:36 +02001815static void rbd_osd_setup_data(struct ceph_osd_request *osd_req, int which)
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001816{
Ilya Dryomovbcbab1d2019-05-27 11:41:36 +02001817 struct rbd_obj_request *obj_req = osd_req->r_priv;
1818
Ilya Dryomovecc633c2018-02-01 11:50:47 +01001819 switch (obj_req->img_request->data_type) {
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001820 case OBJ_REQUEST_BIO:
Ilya Dryomovbcbab1d2019-05-27 11:41:36 +02001821 osd_req_op_extent_osd_data_bio(osd_req, which,
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001822 &obj_req->bio_pos,
Ilya Dryomov43df3d32018-02-02 15:23:22 +01001823 obj_req->ex.oe_len);
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001824 break;
1825 case OBJ_REQUEST_BVECS:
Ilya Dryomovafb97882018-02-06 19:26:35 +01001826 case OBJ_REQUEST_OWN_BVECS:
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001827 rbd_assert(obj_req->bvec_pos.iter.bi_size ==
Ilya Dryomov43df3d32018-02-02 15:23:22 +01001828 obj_req->ex.oe_len);
Ilya Dryomovafb97882018-02-06 19:26:35 +01001829 rbd_assert(obj_req->bvec_idx == obj_req->bvec_count);
Ilya Dryomovbcbab1d2019-05-27 11:41:36 +02001830 osd_req_op_extent_osd_data_bvec_pos(osd_req, which,
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001831 &obj_req->bvec_pos);
1832 break;
1833 default:
Arnd Bergmann16809372019-03-22 17:53:56 +01001834 BUG();
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001835 }
1836}
1837
Ilya Dryomovbcbab1d2019-05-27 11:41:36 +02001838static int rbd_osd_setup_stat(struct ceph_osd_request *osd_req, int which)
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001839{
1840 struct page **pages;
Ilya Dryomov710214e2016-09-15 17:53:32 +02001841
Alex Elderc5b5ef62013-02-11 12:33:24 -06001842 /*
1843 * The response data for a STAT call consists of:
1844 * le64 length;
1845 * struct {
1846 * le32 tv_sec;
1847 * le32 tv_nsec;
1848 * } mtime;
1849 */
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001850 pages = ceph_alloc_page_vector(1, GFP_NOIO);
1851 if (IS_ERR(pages))
1852 return PTR_ERR(pages);
Alex Elderc5b5ef62013-02-11 12:33:24 -06001853
Ilya Dryomovbcbab1d2019-05-27 11:41:36 +02001854 osd_req_op_init(osd_req, which, CEPH_OSD_OP_STAT, 0);
1855 osd_req_op_raw_data_in_pages(osd_req, which, pages,
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001856 8 + sizeof(struct ceph_timespec),
1857 0, false, true);
Ilya Dryomov980917f2016-09-12 18:59:42 +02001858 return 0;
Alex Elderc5b5ef62013-02-11 12:33:24 -06001859}
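/*
 * Illustrative user-space sketch (an assumption, not rbd.c code) of
 * decoding the 16-byte STAT reply described above from a little-endian
 * buffer. The kernel reads such fields with le64_to_cpu()/le32_to_cpu();
 * the plain memcpy() below assumes a little-endian host.
 */
#include <stdio.h>
#include <stdint.h>
#include <string.h>

struct stat_reply {
	uint64_t length;	/* object size in bytes */
	uint32_t tv_sec;	/* mtime */
	uint32_t tv_nsec;
};

static struct stat_reply decode_stat_reply(const unsigned char buf[16])
{
	struct stat_reply r;

	memcpy(&r.length, buf, 8);
	memcpy(&r.tv_sec, buf + 8, 4);
	memcpy(&r.tv_nsec, buf + 12, 4);
	return r;
}

int main(void)
{
	unsigned char buf[16] = { 0 };

	buf[2] = 0x40;		/* length = 0x400000 = 4M, little-endian */
	printf("%llu\n",
	       (unsigned long long)decode_stat_reply(buf).length);
	return 0;
}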
1860
Ilya Dryomovb5ae8cb2019-05-29 16:53:14 +02001861static int rbd_osd_setup_copyup(struct ceph_osd_request *osd_req, int which,
1862 u32 bytes)
1863{
1864 struct rbd_obj_request *obj_req = osd_req->r_priv;
1865 int ret;
1866
1867 ret = osd_req_op_cls_init(osd_req, which, "rbd", "copyup");
1868 if (ret)
1869 return ret;
1870
1871 osd_req_op_cls_request_data_bvecs(osd_req, which, obj_req->copyup_bvecs,
1872 obj_req->copyup_bvec_count, bytes);
1873 return 0;
1874}
1875
Ilya Dryomovea9b7432019-05-31 15:11:26 +02001876static int rbd_obj_init_read(struct rbd_obj_request *obj_req)
1877{
1878 obj_req->read_state = RBD_OBJ_READ_START;
1879 return 0;
1880}
1881
Ilya Dryomovbcbab1d2019-05-27 11:41:36 +02001882static void __rbd_osd_setup_write_ops(struct ceph_osd_request *osd_req,
1883 int which)
Alex Elderb454e362013-04-19 15:34:50 -05001884{
Ilya Dryomovbcbab1d2019-05-27 11:41:36 +02001885 struct rbd_obj_request *obj_req = osd_req->r_priv;
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001886 struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
1887 u16 opcode;
Alex Elderb454e362013-04-19 15:34:50 -05001888
Ilya Dryomovbcbab1d2019-05-27 11:41:36 +02001889 osd_req_op_alloc_hint_init(osd_req, which++,
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001890 rbd_dev->layout.object_size,
1891 rbd_dev->layout.object_size);
Alex Elderb454e362013-04-19 15:34:50 -05001892
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001893 if (rbd_obj_is_entire(obj_req))
1894 opcode = CEPH_OSD_OP_WRITEFULL;
1895 else
1896 opcode = CEPH_OSD_OP_WRITE;
Ilya Dryomov70d045f2014-09-12 16:02:01 +04001897
Ilya Dryomovbcbab1d2019-05-27 11:41:36 +02001898 osd_req_op_extent_init(osd_req, which, opcode,
Ilya Dryomov43df3d32018-02-02 15:23:22 +01001899 obj_req->ex.oe_off, obj_req->ex.oe_len, 0, 0);
Ilya Dryomovbcbab1d2019-05-27 11:41:36 +02001900 rbd_osd_setup_data(osd_req, which);
Ilya Dryomov70d045f2014-09-12 16:02:01 +04001901}
1902
Ilya Dryomovea9b7432019-05-31 15:11:26 +02001903static int rbd_obj_init_write(struct rbd_obj_request *obj_req)
Ilya Dryomov70d045f2014-09-12 16:02:01 +04001904{
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001905 int ret;
Ilya Dryomov058aa992016-09-12 14:44:45 +02001906
Ilya Dryomov86bd7992018-02-06 19:26:33 +01001907 /* reverse map the entire object onto the parent */
1908 ret = rbd_obj_calc_img_extents(obj_req, true);
1909 if (ret)
1910 return ret;
1911
Ilya Dryomov0ad5d952019-05-14 20:45:38 +02001912 if (rbd_obj_copyup_enabled(obj_req))
1913 obj_req->flags |= RBD_OBJ_FLAG_COPYUP_ENABLED;
1914
Ilya Dryomov85b5e6d2019-05-14 21:06:07 +02001915 obj_req->write_state = RBD_OBJ_WRITE_START;
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001916 return 0;
1917}
1918
Ilya Dryomov6484cbe2019-01-29 12:46:25 +01001919static u16 truncate_or_zero_opcode(struct rbd_obj_request *obj_req)
1920{
1921 return rbd_obj_is_tail(obj_req) ? CEPH_OSD_OP_TRUNCATE :
1922 CEPH_OSD_OP_ZERO;
1923}
1924
Ilya Dryomov27bbd912019-05-29 17:31:37 +02001925static void __rbd_osd_setup_discard_ops(struct ceph_osd_request *osd_req,
1926 int which)
1927{
1928 struct rbd_obj_request *obj_req = osd_req->r_priv;
1929
1930 if (rbd_obj_is_entire(obj_req) && !obj_req->num_img_extents) {
1931 rbd_assert(obj_req->flags & RBD_OBJ_FLAG_DELETION);
1932 osd_req_op_init(osd_req, which, CEPH_OSD_OP_DELETE, 0);
1933 } else {
1934 osd_req_op_extent_init(osd_req, which,
1935 truncate_or_zero_opcode(obj_req),
1936 obj_req->ex.oe_off, obj_req->ex.oe_len,
1937 0, 0);
1938 }
1939}
1940
Ilya Dryomovea9b7432019-05-31 15:11:26 +02001941static int rbd_obj_init_discard(struct rbd_obj_request *obj_req)
Ilya Dryomov6484cbe2019-01-29 12:46:25 +01001942{
Ilya Dryomov0c93e1b2019-01-30 15:14:48 +01001943 struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
Ilya Dryomov27bbd912019-05-29 17:31:37 +02001944 u64 off, next_off;
Ilya Dryomov6484cbe2019-01-29 12:46:25 +01001945 int ret;
1946
Ilya Dryomov0c93e1b2019-01-30 15:14:48 +01001947 /*
1948 * Align the range to alloc_size boundary and punt on discards
1949 * that are too small to free up any space.
1950 *
1951 * alloc_size == object_size && is_tail() is a special case for
1952 * filestore with filestore_punch_hole = false, needed to allow
1953 * truncate (in addition to delete).
1954 */
1955 if (rbd_dev->opts->alloc_size != rbd_dev->layout.object_size ||
1956 !rbd_obj_is_tail(obj_req)) {
Ilya Dryomov27bbd912019-05-29 17:31:37 +02001957 off = round_up(obj_req->ex.oe_off, rbd_dev->opts->alloc_size);
1958 next_off = round_down(obj_req->ex.oe_off + obj_req->ex.oe_len,
1959 rbd_dev->opts->alloc_size);
Ilya Dryomov0c93e1b2019-01-30 15:14:48 +01001960 if (off >= next_off)
1961 return 1;
Ilya Dryomov27bbd912019-05-29 17:31:37 +02001962
1963 dout("%s %p %llu~%llu -> %llu~%llu\n", __func__,
1964 obj_req, obj_req->ex.oe_off, obj_req->ex.oe_len,
1965 off, next_off - off);
1966 obj_req->ex.oe_off = off;
1967 obj_req->ex.oe_len = next_off - off;
Ilya Dryomov0c93e1b2019-01-30 15:14:48 +01001968 }
1969
Ilya Dryomov6484cbe2019-01-29 12:46:25 +01001970 /* reverse map the entire object onto the parent */
1971 ret = rbd_obj_calc_img_extents(obj_req, true);
1972 if (ret)
1973 return ret;
1974
Ilya Dryomov0ad5d952019-05-14 20:45:38 +02001975 if (rbd_obj_is_entire(obj_req) && !obj_req->num_img_extents)
1976 obj_req->flags |= RBD_OBJ_FLAG_DELETION;
1977
Ilya Dryomov85b5e6d2019-05-14 21:06:07 +02001978 obj_req->write_state = RBD_OBJ_WRITE_START;
Ilya Dryomov6484cbe2019-01-29 12:46:25 +01001979 return 0;
1980}
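/*
 * Illustrative user-space sketch (not rbd.c code) of the alignment math
 * above. The kernel's round_up()/round_down() require a power-of-two
 * alignment; the plain-division versions below work for any alignment.
 * With alloc_size = 64K, a 4M+4K~192K discard shrinks to 4M+64K~128K,
 * while a discard smaller than one unit yields off >= next_off: punt.
 */
#include <stdio.h>
#include <stdint.h>

#define ROUND_UP(x, a) ((((x) + (a) - 1) / (a)) * (a))
#define ROUND_DOWN(x, a) (((x) / (a)) * (a))

int main(void)
{
	uint64_t alloc_size = 64 * 1024;
	uint64_t oe_off = 4 * 1024 * 1024 + 4096;
	uint64_t oe_len = 192 * 1024;
	uint64_t off = ROUND_UP(oe_off, alloc_size);
	uint64_t next_off = ROUND_DOWN(oe_off + oe_len, alloc_size);

	if (off >= next_off)
		printf("punt\n");
	else
		printf("%llu~%llu\n", (unsigned long long)off,
		       (unsigned long long)(next_off - off));
	/* prints "4259840~131072", i.e. 4M+64K~128K */
	return 0;
}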
1981
Ilya Dryomovbcbab1d2019-05-27 11:41:36 +02001982static void __rbd_osd_setup_zeroout_ops(struct ceph_osd_request *osd_req,
1983 int which)
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001984{
Ilya Dryomovbcbab1d2019-05-27 11:41:36 +02001985 struct rbd_obj_request *obj_req = osd_req->r_priv;
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001986 u16 opcode;
1987
1988 if (rbd_obj_is_entire(obj_req)) {
Ilya Dryomov86bd7992018-02-06 19:26:33 +01001989 if (obj_req->num_img_extents) {
Ilya Dryomov0ad5d952019-05-14 20:45:38 +02001990 if (!(obj_req->flags & RBD_OBJ_FLAG_COPYUP_ENABLED))
Ilya Dryomovbcbab1d2019-05-27 11:41:36 +02001991 osd_req_op_init(osd_req, which++,
Ilya Dryomov9b17eb22019-02-28 15:51:39 +01001992 CEPH_OSD_OP_CREATE, 0);
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001993 opcode = CEPH_OSD_OP_TRUNCATE;
1994 } else {
Ilya Dryomov0ad5d952019-05-14 20:45:38 +02001995 rbd_assert(obj_req->flags & RBD_OBJ_FLAG_DELETION);
Ilya Dryomovbcbab1d2019-05-27 11:41:36 +02001996 osd_req_op_init(osd_req, which++,
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001997 CEPH_OSD_OP_DELETE, 0);
1998 opcode = 0;
1999 }
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002000 } else {
Ilya Dryomov6484cbe2019-01-29 12:46:25 +01002001 opcode = truncate_or_zero_opcode(obj_req);
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002002 }
2003
2004 if (opcode)
Ilya Dryomovbcbab1d2019-05-27 11:41:36 +02002005 osd_req_op_extent_init(osd_req, which, opcode,
Ilya Dryomov43df3d32018-02-02 15:23:22 +01002006 obj_req->ex.oe_off, obj_req->ex.oe_len,
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002007 0, 0);
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002008}
2009
Ilya Dryomovea9b7432019-05-31 15:11:26 +02002010static int rbd_obj_init_zeroout(struct rbd_obj_request *obj_req)
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002011{
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002012 int ret;
2013
Ilya Dryomov86bd7992018-02-06 19:26:33 +01002014 /* reverse map the entire object onto the parent */
2015 ret = rbd_obj_calc_img_extents(obj_req, true);
2016 if (ret)
2017 return ret;
2018
Ilya Dryomov0ad5d952019-05-14 20:45:38 +02002019 if (rbd_obj_copyup_enabled(obj_req))
2020 obj_req->flags |= RBD_OBJ_FLAG_COPYUP_ENABLED;
2021 if (!obj_req->num_img_extents) {
2022 if (rbd_obj_is_entire(obj_req))
2023 obj_req->flags |= RBD_OBJ_FLAG_DELETION;
2024 }
2025
Ilya Dryomov85b5e6d2019-05-14 21:06:07 +02002026 obj_req->write_state = RBD_OBJ_WRITE_START;
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002027 return 0;
2028}
2029
Ilya Dryomova086a1b2019-06-12 18:33:31 +02002030static int count_write_ops(struct rbd_obj_request *obj_req)
2031{
2032 switch (obj_req->img_request->op_type) {
2033 case OBJ_OP_WRITE:
2034 return 2; /* setallochint + write/writefull */
2035 case OBJ_OP_DISCARD:
2036 return 1; /* delete/truncate/zero */
2037 case OBJ_OP_ZEROOUT:
2038 if (rbd_obj_is_entire(obj_req) && obj_req->num_img_extents &&
2039 !(obj_req->flags & RBD_OBJ_FLAG_COPYUP_ENABLED))
2040 return 2; /* create + truncate */
2041
2042 return 1; /* delete/truncate/zero */
2043 default:
2044 BUG();
2045 }
2046}
2047
2048static void rbd_osd_setup_write_ops(struct ceph_osd_request *osd_req,
2049 int which)
2050{
2051 struct rbd_obj_request *obj_req = osd_req->r_priv;
2052
2053 switch (obj_req->img_request->op_type) {
2054 case OBJ_OP_WRITE:
2055 __rbd_osd_setup_write_ops(osd_req, which);
2056 break;
2057 case OBJ_OP_DISCARD:
2058 __rbd_osd_setup_discard_ops(osd_req, which);
2059 break;
2060 case OBJ_OP_ZEROOUT:
2061 __rbd_osd_setup_zeroout_ops(osd_req, which);
2062 break;
2063 default:
2064 BUG();
2065 }
2066}
2067
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002068/*
Ilya Dryomova086a1b2019-06-12 18:33:31 +02002069 * Prune the list of object requests (adjust offset and/or length, drop
2070 * redundant requests). Prepare object request state machines and image
2071 * request state machine for execution.
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002072 */
2073static int __rbd_img_fill_request(struct rbd_img_request *img_req)
2074{
Ilya Dryomov0c93e1b2019-01-30 15:14:48 +01002075 struct rbd_obj_request *obj_req, *next_obj_req;
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002076 int ret;
2077
Ilya Dryomov0c93e1b2019-01-30 15:14:48 +01002078 for_each_obj_request_safe(img_req, obj_req, next_obj_req) {
Ilya Dryomov9bb02482018-01-30 17:52:10 +01002079 switch (img_req->op_type) {
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002080 case OBJ_OP_READ:
Ilya Dryomovea9b7432019-05-31 15:11:26 +02002081 ret = rbd_obj_init_read(obj_req);
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002082 break;
2083 case OBJ_OP_WRITE:
Ilya Dryomovea9b7432019-05-31 15:11:26 +02002084 ret = rbd_obj_init_write(obj_req);
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002085 break;
2086 case OBJ_OP_DISCARD:
Ilya Dryomovea9b7432019-05-31 15:11:26 +02002087 ret = rbd_obj_init_discard(obj_req);
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002088 break;
Ilya Dryomov6484cbe2019-01-29 12:46:25 +01002089 case OBJ_OP_ZEROOUT:
Ilya Dryomovea9b7432019-05-31 15:11:26 +02002090 ret = rbd_obj_init_zeroout(obj_req);
Ilya Dryomov6484cbe2019-01-29 12:46:25 +01002091 break;
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002092 default:
Arnd Bergmann16809372019-03-22 17:53:56 +01002093 BUG();
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002094 }
Ilya Dryomov0c93e1b2019-01-30 15:14:48 +01002095 if (ret < 0)
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002096 return ret;
Ilya Dryomov0c93e1b2019-01-30 15:14:48 +01002097 if (ret > 0) {
Ilya Dryomov0c93e1b2019-01-30 15:14:48 +01002098 rbd_img_obj_request_del(img_req, obj_req);
2099 continue;
2100 }
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002101 }
2102
Ilya Dryomov0192ce22019-05-16 15:06:56 +02002103 img_req->state = RBD_IMG_START;
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002104 return 0;
2105}
2106
Ilya Dryomov5a237812018-02-06 19:26:34 +01002107union rbd_img_fill_iter {
2108 struct ceph_bio_iter bio_iter;
2109 struct ceph_bvec_iter bvec_iter;
2110};
2111
2112struct rbd_img_fill_ctx {
2113 enum obj_request_type pos_type;
2114 union rbd_img_fill_iter *pos;
2115 union rbd_img_fill_iter iter;
2116 ceph_object_extent_fn_t set_pos_fn;
Ilya Dryomovafb97882018-02-06 19:26:35 +01002117 ceph_object_extent_fn_t count_fn;
2118 ceph_object_extent_fn_t copy_fn;
Ilya Dryomov5a237812018-02-06 19:26:34 +01002119};
2120
2121static struct ceph_object_extent *alloc_object_extent(void *arg)
2122{
2123 struct rbd_img_request *img_req = arg;
2124 struct rbd_obj_request *obj_req;
2125
2126 obj_req = rbd_obj_request_create();
2127 if (!obj_req)
2128 return NULL;
2129
2130 rbd_img_obj_request_add(img_req, obj_req);
2131 return &obj_req->ex;
2132}
2133
2134/*
Ilya Dryomovafb97882018-02-06 19:26:35 +01002135 * While su != os && sc == 1 is technically not fancy (it's the same
2136 * layout as su == os && sc == 1), we can't use the nocopy path for it
2137 * because ->set_pos_fn() should be called only once per object.
2138 * ceph_file_to_extents() invokes action_fn once per stripe unit, so
2139 * treat su != os && sc == 1 as fancy.
Ilya Dryomov5a237812018-02-06 19:26:34 +01002140 */
Ilya Dryomovafb97882018-02-06 19:26:35 +01002141static bool rbd_layout_is_fancy(struct ceph_file_layout *l)
2142{
2143 return l->stripe_unit != l->object_size;
2144}
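/*
 * Illustrative user-space sketch (not rbd.c code) of why su != os with
 * sc == 1 must be treated as fancy: a walker that advances one stripe
 * unit at a time fires its callback several times for one object even
 * though the data is laid out sequentially, and ->set_pos_fn() may only
 * run once per object. Values are made up (su = 64K, 256K extent).
 */
#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint64_t su = 64 * 1024;	/* stripe unit */
	uint64_t off = 0, len = 256 * 1024, calls = 0;

	while (len) {
		uint64_t step = su - off % su;	/* to next su boundary */

		if (step > len)
			step = len;
		calls++;			/* one action_fn invocation */
		off += step;
		len -= step;
	}
	printf("%llu calls\n", (unsigned long long)calls);	/* 4 */
	return 0;
}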
2145
2146static int rbd_img_fill_request_nocopy(struct rbd_img_request *img_req,
2147 struct ceph_file_extent *img_extents,
2148 u32 num_img_extents,
2149 struct rbd_img_fill_ctx *fctx)
Ilya Dryomov5a237812018-02-06 19:26:34 +01002150{
2151 u32 i;
2152 int ret;
2153
2154 img_req->data_type = fctx->pos_type;
2155
2156 /*
2157 * Create object requests and set each object request's starting
2158 * position in the provided bio (list) or bio_vec array.
2159 */
2160 fctx->iter = *fctx->pos;
2161 for (i = 0; i < num_img_extents; i++) {
2162 ret = ceph_file_to_extents(&img_req->rbd_dev->layout,
2163 img_extents[i].fe_off,
2164 img_extents[i].fe_len,
2165 &img_req->object_extents,
2166 alloc_object_extent, img_req,
2167 fctx->set_pos_fn, &fctx->iter);
2168 if (ret)
2169 return ret;
2170 }
2171
2172 return __rbd_img_fill_request(img_req);
2173}
2174
Ilya Dryomovafb97882018-02-06 19:26:35 +01002175/*
2176 * Map a list of image extents to a list of object extents, create the
2177 * corresponding object requests (normally each to a different object,
2178 * but not always) and add them to @img_req. For each object request,
2179 * set up its data descriptor to point to the corresponding chunk(s) of
2180 * @fctx->pos data buffer.
2181 *
2182 * Because ceph_file_to_extents() will merge adjacent object extents
2183 * together, each object request's data descriptor may point to multiple
2184 * different chunks of @fctx->pos data buffer.
2185 *
2186 * @fctx->pos data buffer is assumed to be large enough.
2187 */
2188static int rbd_img_fill_request(struct rbd_img_request *img_req,
2189 struct ceph_file_extent *img_extents,
2190 u32 num_img_extents,
2191 struct rbd_img_fill_ctx *fctx)
2192{
2193 struct rbd_device *rbd_dev = img_req->rbd_dev;
2194 struct rbd_obj_request *obj_req;
2195 u32 i;
2196 int ret;
2197
2198 if (fctx->pos_type == OBJ_REQUEST_NODATA ||
2199 !rbd_layout_is_fancy(&rbd_dev->layout))
2200 return rbd_img_fill_request_nocopy(img_req, img_extents,
2201 num_img_extents, fctx);
2202
2203 img_req->data_type = OBJ_REQUEST_OWN_BVECS;
2204
2205 /*
2206 * Create object requests and determine ->bvec_count for each object
2207 * request. Note that ->bvec_count sum over all object requests may
2208 * be greater than the number of bio_vecs in the provided bio (list)
2209 * or bio_vec array because when mapped, those bio_vecs can straddle
2210 * stripe unit boundaries.
2211 */
2212 fctx->iter = *fctx->pos;
2213 for (i = 0; i < num_img_extents; i++) {
2214 ret = ceph_file_to_extents(&rbd_dev->layout,
2215 img_extents[i].fe_off,
2216 img_extents[i].fe_len,
2217 &img_req->object_extents,
2218 alloc_object_extent, img_req,
2219 fctx->count_fn, &fctx->iter);
2220 if (ret)
2221 return ret;
2222 }
2223
2224 for_each_obj_request(img_req, obj_req) {
2225 obj_req->bvec_pos.bvecs = kmalloc_array(obj_req->bvec_count,
2226 sizeof(*obj_req->bvec_pos.bvecs),
2227 GFP_NOIO);
2228 if (!obj_req->bvec_pos.bvecs)
2229 return -ENOMEM;
Alex Elderb454e362013-04-19 15:34:50 -05002230 }
2231
2232 /*
Ilya Dryomovafb97882018-02-06 19:26:35 +01002233 * Fill in each object request's private bio_vec array, splitting and
2234 * rearranging the provided bio_vecs in stripe unit chunks as needed.
Alex Elderb454e362013-04-19 15:34:50 -05002235 */
Ilya Dryomovafb97882018-02-06 19:26:35 +01002236 fctx->iter = *fctx->pos;
2237 for (i = 0; i < num_img_extents; i++) {
2238 ret = ceph_iterate_extents(&rbd_dev->layout,
2239 img_extents[i].fe_off,
2240 img_extents[i].fe_len,
2241 &img_req->object_extents,
2242 fctx->copy_fn, &fctx->iter);
2243 if (ret)
2244 return ret;
2245 }
Alex Elder3d7efd12013-04-19 15:34:50 -05002246
Ilya Dryomovafb97882018-02-06 19:26:35 +01002247 return __rbd_img_fill_request(img_req);
Alex Elderb454e362013-04-19 15:34:50 -05002248}
2249
Ilya Dryomov5a237812018-02-06 19:26:34 +01002250static int rbd_img_fill_nodata(struct rbd_img_request *img_req,
2251 u64 off, u64 len)
2252{
2253 struct ceph_file_extent ex = { off, len };
2254 union rbd_img_fill_iter dummy;
2255 struct rbd_img_fill_ctx fctx = {
2256 .pos_type = OBJ_REQUEST_NODATA,
2257 .pos = &dummy,
2258 };
2259
2260 return rbd_img_fill_request(img_req, &ex, 1, &fctx);
2261}
2262
2263static void set_bio_pos(struct ceph_object_extent *ex, u32 bytes, void *arg)
2264{
2265 struct rbd_obj_request *obj_req =
2266 container_of(ex, struct rbd_obj_request, ex);
2267 struct ceph_bio_iter *it = arg;
2268
2269 dout("%s objno %llu bytes %u\n", __func__, ex->oe_objno, bytes);
2270 obj_req->bio_pos = *it;
2271 ceph_bio_iter_advance(it, bytes);
2272}
2273
Ilya Dryomovafb97882018-02-06 19:26:35 +01002274static void count_bio_bvecs(struct ceph_object_extent *ex, u32 bytes, void *arg)
2275{
2276 struct rbd_obj_request *obj_req =
2277 container_of(ex, struct rbd_obj_request, ex);
2278 struct ceph_bio_iter *it = arg;
2279
2280 dout("%s objno %llu bytes %u\n", __func__, ex->oe_objno, bytes);
2281 ceph_bio_iter_advance_step(it, bytes, ({
2282 obj_req->bvec_count++;
2283 }));
2285}
2286
2287static void copy_bio_bvecs(struct ceph_object_extent *ex, u32 bytes, void *arg)
2288{
2289 struct rbd_obj_request *obj_req =
2290 container_of(ex, struct rbd_obj_request, ex);
2291 struct ceph_bio_iter *it = arg;
2292
2293 dout("%s objno %llu bytes %u\n", __func__, ex->oe_objno, bytes);
2294 ceph_bio_iter_advance_step(it, bytes, ({
2295 obj_req->bvec_pos.bvecs[obj_req->bvec_idx++] = bv;
2296 obj_req->bvec_pos.iter.bi_size += bv.bv_len;
2297 }));
2298}
2299
Ilya Dryomov5a237812018-02-06 19:26:34 +01002300static int __rbd_img_fill_from_bio(struct rbd_img_request *img_req,
2301 struct ceph_file_extent *img_extents,
2302 u32 num_img_extents,
2303 struct ceph_bio_iter *bio_pos)
2304{
2305 struct rbd_img_fill_ctx fctx = {
2306 .pos_type = OBJ_REQUEST_BIO,
2307 .pos = (union rbd_img_fill_iter *)bio_pos,
2308 .set_pos_fn = set_bio_pos,
Ilya Dryomovafb97882018-02-06 19:26:35 +01002309 .count_fn = count_bio_bvecs,
2310 .copy_fn = copy_bio_bvecs,
Ilya Dryomov5a237812018-02-06 19:26:34 +01002311 };
2312
2313 return rbd_img_fill_request(img_req, img_extents, num_img_extents,
2314 &fctx);
2315}
2316
2317static int rbd_img_fill_from_bio(struct rbd_img_request *img_req,
2318 u64 off, u64 len, struct bio *bio)
2319{
2320 struct ceph_file_extent ex = { off, len };
2321 struct ceph_bio_iter it = { .bio = bio, .iter = bio->bi_iter };
2322
2323 return __rbd_img_fill_from_bio(img_req, &ex, 1, &it);
2324}
2325
2326static void set_bvec_pos(struct ceph_object_extent *ex, u32 bytes, void *arg)
2327{
2328 struct rbd_obj_request *obj_req =
2329 container_of(ex, struct rbd_obj_request, ex);
2330 struct ceph_bvec_iter *it = arg;
2331
2332 obj_req->bvec_pos = *it;
2333 ceph_bvec_iter_shorten(&obj_req->bvec_pos, bytes);
2334 ceph_bvec_iter_advance(it, bytes);
2335}
2336
Ilya Dryomovafb97882018-02-06 19:26:35 +01002337static void count_bvecs(struct ceph_object_extent *ex, u32 bytes, void *arg)
2338{
2339 struct rbd_obj_request *obj_req =
2340 container_of(ex, struct rbd_obj_request, ex);
2341 struct ceph_bvec_iter *it = arg;
2342
2343 ceph_bvec_iter_advance_step(it, bytes, ({
2344 obj_req->bvec_count++;
2345 }));
2346}
2347
2348static void copy_bvecs(struct ceph_object_extent *ex, u32 bytes, void *arg)
2349{
2350 struct rbd_obj_request *obj_req =
2351 container_of(ex, struct rbd_obj_request, ex);
2352 struct ceph_bvec_iter *it = arg;
2353
2354 ceph_bvec_iter_advance_step(it, bytes, ({
2355 obj_req->bvec_pos.bvecs[obj_req->bvec_idx++] = bv;
2356 obj_req->bvec_pos.iter.bi_size += bv.bv_len;
2357 }));
2358}
2359
Ilya Dryomov5a237812018-02-06 19:26:34 +01002360static int __rbd_img_fill_from_bvecs(struct rbd_img_request *img_req,
2361 struct ceph_file_extent *img_extents,
2362 u32 num_img_extents,
2363 struct ceph_bvec_iter *bvec_pos)
2364{
2365 struct rbd_img_fill_ctx fctx = {
2366 .pos_type = OBJ_REQUEST_BVECS,
2367 .pos = (union rbd_img_fill_iter *)bvec_pos,
2368 .set_pos_fn = set_bvec_pos,
Ilya Dryomovafb97882018-02-06 19:26:35 +01002369 .count_fn = count_bvecs,
2370 .copy_fn = copy_bvecs,
Ilya Dryomov5a237812018-02-06 19:26:34 +01002371 };
2372
2373 return rbd_img_fill_request(img_req, img_extents, num_img_extents,
2374 &fctx);
2375}
2376
2377static int rbd_img_fill_from_bvecs(struct rbd_img_request *img_req,
2378 struct ceph_file_extent *img_extents,
2379 u32 num_img_extents,
2380 struct bio_vec *bvecs)
2381{
2382 struct ceph_bvec_iter it = {
2383 .bvecs = bvecs,
2384 .iter = { .bi_size = ceph_file_extents_bytes(img_extents,
2385 num_img_extents) },
2386 };
2387
2388 return __rbd_img_fill_from_bvecs(img_req, img_extents, num_img_extents,
2389 &it);
2390}
2391
Ilya Dryomov0192ce22019-05-16 15:06:56 +02002392static void rbd_img_handle_request_work(struct work_struct *work)
Alex Elderbf0d5f502012-11-22 00:00:08 -06002393{
Ilya Dryomov0192ce22019-05-16 15:06:56 +02002394 struct rbd_img_request *img_req =
2395 container_of(work, struct rbd_img_request, work);
Alex Elderbf0d5f502012-11-22 00:00:08 -06002396
Ilya Dryomov0192ce22019-05-16 15:06:56 +02002397 rbd_img_handle_request(img_req, img_req->work_result);
2398}
Alex Elderbf0d5f502012-11-22 00:00:08 -06002399
Ilya Dryomov0192ce22019-05-16 15:06:56 +02002400static void rbd_img_schedule(struct rbd_img_request *img_req, int result)
2401{
2402 INIT_WORK(&img_req->work, rbd_img_handle_request_work);
2403 img_req->work_result = result;
2404 queue_work(rbd_wq, &img_req->work);
Alex Elderbf0d5f502012-11-22 00:00:08 -06002405}
2406
Ilya Dryomov85b5e6d2019-05-14 21:06:07 +02002407static int rbd_obj_read_object(struct rbd_obj_request *obj_req)
2408{
Ilya Dryomova086a1b2019-06-12 18:33:31 +02002409 struct ceph_osd_request *osd_req;
2410 int ret;
2411
2412 osd_req = __rbd_obj_add_osd_request(obj_req, NULL, 1);
2413 if (IS_ERR(osd_req))
2414 return PTR_ERR(osd_req);
2415
2416 osd_req_op_extent_init(osd_req, 0, CEPH_OSD_OP_READ,
2417 obj_req->ex.oe_off, obj_req->ex.oe_len, 0, 0);
2418 rbd_osd_setup_data(osd_req, 0);
2419 rbd_osd_format_read(osd_req);
2420
2421 ret = ceph_osdc_alloc_messages(osd_req, GFP_NOIO);
2422 if (ret)
2423 return ret;
2424
2425 rbd_osd_submit(osd_req);
Ilya Dryomov85b5e6d2019-05-14 21:06:07 +02002426 return 0;
2427}
2428
Ilya Dryomov86bd7992018-02-06 19:26:33 +01002429static int rbd_obj_read_from_parent(struct rbd_obj_request *obj_req)
Alex Elder8b3e1a52013-01-24 16:13:36 -06002430{
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002431 struct rbd_img_request *img_req = obj_req->img_request;
2432 struct rbd_img_request *child_img_req;
2433 int ret;
Alex Elder8b3e1a52013-01-24 16:13:36 -06002434
Ilya Dryomove93aca02018-02-06 19:26:35 +01002435 child_img_req = rbd_img_request_create(img_req->rbd_dev->parent,
2436 OBJ_OP_READ, NULL);
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002437 if (!child_img_req)
2438 return -ENOMEM;
Alex Elder8b3e1a52013-01-24 16:13:36 -06002439
Ilya Dryomove93aca02018-02-06 19:26:35 +01002440 __set_bit(IMG_REQ_CHILD, &child_img_req->flags);
2441 child_img_req->obj_request = obj_req;
Alex Elder02c74fb2013-05-06 17:40:33 -05002442
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002443 if (!rbd_img_is_write(img_req)) {
Ilya Dryomovecc633c2018-02-01 11:50:47 +01002444 switch (img_req->data_type) {
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002445 case OBJ_REQUEST_BIO:
Ilya Dryomov5a237812018-02-06 19:26:34 +01002446 ret = __rbd_img_fill_from_bio(child_img_req,
2447 obj_req->img_extents,
2448 obj_req->num_img_extents,
2449 &obj_req->bio_pos);
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002450 break;
2451 case OBJ_REQUEST_BVECS:
Ilya Dryomovafb97882018-02-06 19:26:35 +01002452 case OBJ_REQUEST_OWN_BVECS:
Ilya Dryomov5a237812018-02-06 19:26:34 +01002453 ret = __rbd_img_fill_from_bvecs(child_img_req,
2454 obj_req->img_extents,
2455 obj_req->num_img_extents,
2456 &obj_req->bvec_pos);
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002457 break;
2458 default:
Arnd Bergmannd342a152019-03-22 15:36:37 +01002459 BUG();
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002460 }
2461 } else {
Ilya Dryomov5a237812018-02-06 19:26:34 +01002462 ret = rbd_img_fill_from_bvecs(child_img_req,
2463 obj_req->img_extents,
2464 obj_req->num_img_extents,
2465 obj_req->copyup_bvecs);
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002466 }
2467 if (ret) {
2468 rbd_img_request_put(child_img_req);
2469 return ret;
2470 }
2471
Ilya Dryomov0192ce22019-05-16 15:06:56 +02002472 /* avoid parent chain recursion */
2473 rbd_img_schedule(child_img_req, 0);
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002474 return 0;
2475}
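
/*
 * Why rbd_img_schedule() above (informational): completing the child
 * request in-line would re-enter the state machine once per ancestor,
 * so an N-level parent chain could nest N frames deep on one stack.
 * Bouncing through rbd_wq starts each level from a fresh work item.
 */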
2476
Ilya Dryomov85b5e6d2019-05-14 21:06:07 +02002477static bool rbd_obj_advance_read(struct rbd_obj_request *obj_req, int *result)
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002478{
2479 struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
2480 int ret;
2481
Ilya Dryomova9b67e62019-05-08 13:35:57 +02002482 switch (obj_req->read_state) {
Ilya Dryomov85b5e6d2019-05-14 21:06:07 +02002483 case RBD_OBJ_READ_START:
2484 rbd_assert(!*result);
2485
2486 ret = rbd_obj_read_object(obj_req);
2487 if (ret) {
2488 *result = ret;
2489 return true;
2490 }
2491 obj_req->read_state = RBD_OBJ_READ_OBJECT;
2492 return false;
Ilya Dryomova9b67e62019-05-08 13:35:57 +02002493 case RBD_OBJ_READ_OBJECT:
2494 if (*result == -ENOENT && rbd_dev->parent_overlap) {
2495 /* reverse map this object extent onto the parent */
2496 ret = rbd_obj_calc_img_extents(obj_req, false);
Ilya Dryomov86bd7992018-02-06 19:26:33 +01002497 if (ret) {
Ilya Dryomov54ab3b22019-05-11 16:21:49 +02002498 *result = ret;
Ilya Dryomov86bd7992018-02-06 19:26:33 +01002499 return true;
2500 }
Ilya Dryomova9b67e62019-05-08 13:35:57 +02002501 if (obj_req->num_img_extents) {
2502 ret = rbd_obj_read_from_parent(obj_req);
2503 if (ret) {
2504 *result = ret;
2505 return true;
2506 }
2507 obj_req->read_state = RBD_OBJ_READ_PARENT;
2508 return false;
2509 }
Ilya Dryomov86bd7992018-02-06 19:26:33 +01002510 }
Alex Elder02c74fb2013-05-06 17:40:33 -05002511
Ilya Dryomova9b67e62019-05-08 13:35:57 +02002512 /*
2513 * -ENOENT means a hole in the image -- zero-fill the entire
2514 * length of the request. A short read also implies zero-fill
2515 * to the end of the request.
2516 */
2517 if (*result == -ENOENT) {
2518 rbd_obj_zero_range(obj_req, 0, obj_req->ex.oe_len);
2519 *result = 0;
2520 } else if (*result >= 0) {
2521 if (*result < obj_req->ex.oe_len)
2522 rbd_obj_zero_range(obj_req, *result,
2523 obj_req->ex.oe_len - *result);
2524 else
2525 rbd_assert(*result == obj_req->ex.oe_len);
2526 *result = 0;
2527 }
2528 return true;
2529 case RBD_OBJ_READ_PARENT:
2530 return true;
2531 default:
2532 BUG();
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002533 }
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002534}
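
/*
 * Read state machine, summarized (informational; mirrors the switch
 * above):
 *
 *   RBD_OBJ_READ_START
 *        | rbd_obj_read_object()
 *        v
 *   RBD_OBJ_READ_OBJECT
 *        | -ENOENT within the parent overlap:
 *        | rbd_obj_read_from_parent()
 *        v
 *   RBD_OBJ_READ_PARENT
 *
 * Any other outcome completes from RBD_OBJ_READ_OBJECT, zero-filling
 * holes (-ENOENT) and the tail of short reads.
 */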
2535
Ilya Dryomov85b5e6d2019-05-14 21:06:07 +02002536static int rbd_obj_write_object(struct rbd_obj_request *obj_req)
2537{
Ilya Dryomova086a1b2019-06-12 18:33:31 +02002538 struct ceph_osd_request *osd_req;
2539 int num_ops = count_write_ops(obj_req);
2540 int which = 0;
2541 int ret;
2542
2543 if (obj_req->flags & RBD_OBJ_FLAG_COPYUP_ENABLED)
2544 num_ops++; /* stat */
2545
2546 osd_req = rbd_obj_add_osd_request(obj_req, num_ops);
2547 if (IS_ERR(osd_req))
2548 return PTR_ERR(osd_req);
2549
2550 if (obj_req->flags & RBD_OBJ_FLAG_COPYUP_ENABLED) {
2551 ret = rbd_osd_setup_stat(osd_req, which++);
2552 if (ret)
2553 return ret;
2554 }
2555
2556 rbd_osd_setup_write_ops(osd_req, which);
2557 rbd_osd_format_write(osd_req);
2558
2559 ret = ceph_osdc_alloc_messages(osd_req, GFP_NOIO);
2560 if (ret)
2561 return ret;
2562
2563 rbd_osd_submit(osd_req);
Ilya Dryomov85b5e6d2019-05-14 21:06:07 +02002564 return 0;
2565}
2566
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002567/*
2568 * copyup_bvecs pages are never highmem pages
2569 */
2570static bool is_zero_bvecs(struct bio_vec *bvecs, u32 bytes)
2571{
2572 struct ceph_bvec_iter it = {
2573 .bvecs = bvecs,
2574 .iter = { .bi_size = bytes },
2575 };
2576
2577 ceph_bvec_iter_advance_step(&it, bytes, ({
2578 if (memchr_inv(page_address(bv.bv_page) + bv.bv_offset, 0,
2579 bv.bv_len))
2580 return false;
2581 }));
2582 return true;
2583}
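
/*
 * A minimal sketch (hypothetical helper, not used by the driver) of
 * the memchr_inv() idiom above on a plain linear buffer:
 * memchr_inv(p, 0, len) returns NULL iff all len bytes are zero.
 */
#if 0
static bool example_is_zero_buf(const void *buf, size_t len)
{
	return !memchr_inv(buf, 0, len);
}
#endif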
2584
Ilya Dryomov3a482502019-02-28 10:49:12 +01002585#define MODS_ONLY U32_MAX
2586
Ilya Dryomov793333a302019-06-13 17:44:08 +02002587static int rbd_obj_copyup_empty_snapc(struct rbd_obj_request *obj_req,
2588 u32 bytes)
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002589{
Ilya Dryomovbcbab1d2019-05-27 11:41:36 +02002590 struct ceph_osd_request *osd_req;
Chengguang Xufe943d52018-04-12 12:04:55 +08002591 int ret;
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002592
2593 dout("%s obj_req %p bytes %u\n", __func__, obj_req, bytes);
Ilya Dryomov89a59c12019-02-28 14:20:28 +01002594 rbd_assert(bytes > 0 && bytes != MODS_ONLY);
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002595
Ilya Dryomovbcbab1d2019-05-27 11:41:36 +02002596 osd_req = __rbd_obj_add_osd_request(obj_req, &rbd_empty_snapc, 1);
2597 if (IS_ERR(osd_req))
2598 return PTR_ERR(osd_req);
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002599
Ilya Dryomovb5ae8cb2019-05-29 16:53:14 +02002600 ret = rbd_osd_setup_copyup(osd_req, 0, bytes);
Chengguang Xufe943d52018-04-12 12:04:55 +08002601 if (ret)
2602 return ret;
2603
Ilya Dryomovbcbab1d2019-05-27 11:41:36 +02002604 rbd_osd_format_write(osd_req);
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002605
Ilya Dryomovbcbab1d2019-05-27 11:41:36 +02002606 ret = ceph_osdc_alloc_messages(osd_req, GFP_NOIO);
Ilya Dryomov89a59c12019-02-28 14:20:28 +01002607 if (ret)
2608 return ret;
2609
Ilya Dryomova086a1b2019-06-12 18:33:31 +02002610 rbd_osd_submit(osd_req);
Ilya Dryomov89a59c12019-02-28 14:20:28 +01002611 return 0;
2612}
2613
Ilya Dryomov793333a302019-06-13 17:44:08 +02002614static int rbd_obj_copyup_current_snapc(struct rbd_obj_request *obj_req,
2615 u32 bytes)
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002616{
Ilya Dryomovbcbab1d2019-05-27 11:41:36 +02002617 struct ceph_osd_request *osd_req;
Ilya Dryomova086a1b2019-06-12 18:33:31 +02002618 int num_ops = count_write_ops(obj_req);
2619 int which = 0;
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002620 int ret;
2621
2622 dout("%s obj_req %p bytes %u\n", __func__, obj_req, bytes);
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002623
Ilya Dryomova086a1b2019-06-12 18:33:31 +02002624 if (bytes != MODS_ONLY)
2625 num_ops++; /* copyup */
Ilya Dryomov13488d52019-02-25 12:37:50 +01002626
Ilya Dryomova086a1b2019-06-12 18:33:31 +02002627 osd_req = rbd_obj_add_osd_request(obj_req, num_ops);
Ilya Dryomovbcbab1d2019-05-27 11:41:36 +02002628 if (IS_ERR(osd_req))
2629 return PTR_ERR(osd_req);
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002630
Ilya Dryomov3a482502019-02-28 10:49:12 +01002631 if (bytes != MODS_ONLY) {
Ilya Dryomovb5ae8cb2019-05-29 16:53:14 +02002632 ret = rbd_osd_setup_copyup(osd_req, which++, bytes);
Ilya Dryomov3a482502019-02-28 10:49:12 +01002633 if (ret)
2634 return ret;
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002635 }
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002636
Ilya Dryomova086a1b2019-06-12 18:33:31 +02002637 rbd_osd_setup_write_ops(osd_req, which);
2638 rbd_osd_format_write(osd_req);
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002639
Ilya Dryomovbcbab1d2019-05-27 11:41:36 +02002640 ret = ceph_osdc_alloc_messages(osd_req, GFP_NOIO);
Ilya Dryomov26f887e2018-10-15 16:11:37 +02002641 if (ret)
2642 return ret;
2643
Ilya Dryomova086a1b2019-06-12 18:33:31 +02002644 rbd_osd_submit(osd_req);
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002645 return 0;
2646}
2647
Ilya Dryomov7e07efb2018-01-20 10:30:11 +01002648static int setup_copyup_bvecs(struct rbd_obj_request *obj_req, u64 obj_overlap)
2649{
2650 u32 i;
2651
2652 rbd_assert(!obj_req->copyup_bvecs);
2653 obj_req->copyup_bvec_count = calc_pages_for(0, obj_overlap);
2654 obj_req->copyup_bvecs = kcalloc(obj_req->copyup_bvec_count,
2655 sizeof(*obj_req->copyup_bvecs),
2656 GFP_NOIO);
2657 if (!obj_req->copyup_bvecs)
2658 return -ENOMEM;
2659
2660 for (i = 0; i < obj_req->copyup_bvec_count; i++) {
2661 unsigned int len = min(obj_overlap, (u64)PAGE_SIZE);
2662
2663 obj_req->copyup_bvecs[i].bv_page = alloc_page(GFP_NOIO);
2664 if (!obj_req->copyup_bvecs[i].bv_page)
2665 return -ENOMEM;
2666
2667 obj_req->copyup_bvecs[i].bv_offset = 0;
2668 obj_req->copyup_bvecs[i].bv_len = len;
2669 obj_overlap -= len;
2670 }
2671
2672 rbd_assert(!obj_overlap);
2673 return 0;
2674}
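
/*
 * calc_pages_for(off, len) is the number of pages needed to cover a
 * len-byte extent starting at offset off.  For the off == 0 case used
 * above (illustrative values):
 *
 *   calc_pages_for(0, 1)                 == 1
 *   calc_pages_for(0, PAGE_SIZE)         == 1
 *   calc_pages_for(0, 3 * PAGE_SIZE + 1) == 4
 */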
2675
Ilya Dryomov0ad5d952019-05-14 20:45:38 +02002676/*
2677 * The target object doesn't exist. Read the data for the entire
2678 * target object up to the overlap point (if any) from the parent,
2679 * so we can use it for a copyup.
2680 */
Ilya Dryomov793333a302019-06-13 17:44:08 +02002681static int rbd_obj_copyup_read_parent(struct rbd_obj_request *obj_req)
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002682{
2683 struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002684 int ret;
2685
Ilya Dryomov86bd7992018-02-06 19:26:33 +01002686 rbd_assert(obj_req->num_img_extents);
2687 prune_extents(obj_req->img_extents, &obj_req->num_img_extents,
2688 rbd_dev->parent_overlap);
2689 if (!obj_req->num_img_extents) {
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002690 /*
2691 * The overlap has become 0 (most likely because the
Ilya Dryomov3a482502019-02-28 10:49:12 +01002692 * image has been flattened). Re-submit the original write
2693 * request -- pass MODS_ONLY since the copyup isn't needed
2694 * anymore.
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002695 */
Ilya Dryomov793333a302019-06-13 17:44:08 +02002696 return rbd_obj_copyup_current_snapc(obj_req, MODS_ONLY);
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002697 }
2698
Ilya Dryomov86bd7992018-02-06 19:26:33 +01002699 ret = setup_copyup_bvecs(obj_req, rbd_obj_img_extents_bytes(obj_req));
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002700 if (ret)
2701 return ret;
2702
Ilya Dryomov86bd7992018-02-06 19:26:33 +01002703 return rbd_obj_read_from_parent(obj_req);
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002704}
2705
Ilya Dryomov793333a302019-06-13 17:44:08 +02002706static void rbd_obj_copyup_write_object(struct rbd_obj_request *obj_req)
2707{
2708 u32 bytes = rbd_obj_img_extents_bytes(obj_req);
2709 int ret;
2710
2711 rbd_assert(!obj_req->pending.result && !obj_req->pending.num_pending);
2712
2713 /*
2714 * Only send non-zero copyup data to save some I/O and network
2715 * bandwidth -- zero copyup data is equivalent to the object not
2716 * existing.
2717 */
2718 if (obj_req->flags & RBD_OBJ_FLAG_COPYUP_ZEROS)
2719 bytes = 0;
2720
2721 if (obj_req->img_request->snapc->num_snaps && bytes > 0) {
2722 /*
2723 * Send a copyup request with an empty snapshot context to
2724 * deep-copyup the object through all existing snapshots.
2725 * A second request with the current snapshot context will be
2726 * sent for the actual modification.
2727 */
2728 ret = rbd_obj_copyup_empty_snapc(obj_req, bytes);
2729 if (ret) {
2730 obj_req->pending.result = ret;
2731 return;
2732 }
2733
2734 obj_req->pending.num_pending++;
2735 bytes = MODS_ONLY;
2736 }
2737
2738 ret = rbd_obj_copyup_current_snapc(obj_req, bytes);
2739 if (ret) {
2740 obj_req->pending.result = ret;
2741 return;
2742 }
2743
2744 obj_req->pending.num_pending++;
2745}
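
/*
 * Resulting copyup request flow (informational):
 *
 *   snapshots exist and the copyup data is non-zero:
 *     1) rbd_obj_copyup_empty_snapc() -- deep-copyup through all
 *        existing snapshots
 *     2) rbd_obj_copyup_current_snapc(MODS_ONLY) -- the write ops only
 *
 *   otherwise:
 *     a single rbd_obj_copyup_current_snapc() request carrying the
 *     copyup (possibly empty, if the parent data was all zeros) plus
 *     the write ops
 */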
2746
2747static bool rbd_obj_advance_copyup(struct rbd_obj_request *obj_req, int *result)
2748{
2749 int ret;
2750
2751again:
2752 switch (obj_req->copyup_state) {
2753 case RBD_OBJ_COPYUP_START:
2754 rbd_assert(!*result);
2755
2756 ret = rbd_obj_copyup_read_parent(obj_req);
2757 if (ret) {
2758 *result = ret;
2759 return true;
2760 }
2761 if (obj_req->num_img_extents)
2762 obj_req->copyup_state = RBD_OBJ_COPYUP_READ_PARENT;
2763 else
2764 obj_req->copyup_state = RBD_OBJ_COPYUP_WRITE_OBJECT;
2765 return false;
2766 case RBD_OBJ_COPYUP_READ_PARENT:
2767 if (*result)
2768 return true;
2769
2770 if (is_zero_bvecs(obj_req->copyup_bvecs,
2771 rbd_obj_img_extents_bytes(obj_req))) {
2772 dout("%s %p detected zeros\n", __func__, obj_req);
2773 obj_req->flags |= RBD_OBJ_FLAG_COPYUP_ZEROS;
2774 }
2775
2776 rbd_obj_copyup_write_object(obj_req);
2777 if (!obj_req->pending.num_pending) {
2778 *result = obj_req->pending.result;
2779 obj_req->copyup_state = RBD_OBJ_COPYUP_WRITE_OBJECT;
2780 goto again;
2781 }
2782 obj_req->copyup_state = __RBD_OBJ_COPYUP_WRITE_OBJECT;
2783 return false;
2784 case __RBD_OBJ_COPYUP_WRITE_OBJECT:
2785 if (!pending_result_dec(&obj_req->pending, result))
2786 return false;
2787 /* fall through */
2788 case RBD_OBJ_COPYUP_WRITE_OBJECT:
2789 return true;
2790 default:
2791 BUG();
2792 }
2793}
2794
Ilya Dryomov85b5e6d2019-05-14 21:06:07 +02002795static bool rbd_obj_advance_write(struct rbd_obj_request *obj_req, int *result)
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002796{
Ilya Dryomov793333a302019-06-13 17:44:08 +02002797 struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002798 int ret;
2799
Ilya Dryomov793333a302019-06-13 17:44:08 +02002800again:
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002801 switch (obj_req->write_state) {
Ilya Dryomov85b5e6d2019-05-14 21:06:07 +02002802 case RBD_OBJ_WRITE_START:
2803 rbd_assert(!*result);
2804
2805 ret = rbd_obj_write_object(obj_req);
2806 if (ret) {
2807 *result = ret;
2808 return true;
2809 }
2810 obj_req->write_state = RBD_OBJ_WRITE_OBJECT;
2811 return false;
Ilya Dryomov0ad5d952019-05-14 20:45:38 +02002812 case RBD_OBJ_WRITE_OBJECT:
Ilya Dryomov54ab3b22019-05-11 16:21:49 +02002813 if (*result == -ENOENT) {
Ilya Dryomov0ad5d952019-05-14 20:45:38 +02002814 if (obj_req->flags & RBD_OBJ_FLAG_COPYUP_ENABLED) {
Ilya Dryomov793333a302019-06-13 17:44:08 +02002815 *result = 0;
2816 obj_req->copyup_state = RBD_OBJ_COPYUP_START;
2817 obj_req->write_state = __RBD_OBJ_WRITE_COPYUP;
2818 goto again;
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002819 }
Ilya Dryomov0ad5d952019-05-14 20:45:38 +02002820 /*
2821 * On a non-existent object:
2822 * delete - -ENOENT, truncate/zero - 0
2823 */
2824 if (obj_req->flags & RBD_OBJ_FLAG_DELETION)
2825 *result = 0;
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002826 }
Ilya Dryomov793333a302019-06-13 17:44:08 +02002827 if (*result)
2828 return true;
2829
2830 obj_req->write_state = RBD_OBJ_WRITE_COPYUP;
2831 goto again;
2832 case __RBD_OBJ_WRITE_COPYUP:
2833 if (!rbd_obj_advance_copyup(obj_req, result))
2834 return false;
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002835 /* fall through */
Ilya Dryomov793333a302019-06-13 17:44:08 +02002836 case RBD_OBJ_WRITE_COPYUP:
2837 if (*result)
2838 rbd_warn(rbd_dev, "copyup failed: %d", *result);
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002839 return true;
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002840 default:
Arnd Bergmannc6244b32018-04-04 14:53:39 +02002841 BUG();
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002842 }
2843}
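
/*
 * Write state machine, summarized (informational):
 *
 *   RBD_OBJ_WRITE_START
 *        | rbd_obj_write_object()
 *        v
 *   RBD_OBJ_WRITE_OBJECT ---(-ENOENT, copyup enabled)---.
 *        |                                              v
 *        |                               __RBD_OBJ_WRITE_COPYUP
 *        |                               (rbd_obj_advance_copyup())
 *        v                                              |
 *   RBD_OBJ_WRITE_COPYUP <------------------------------'
 *        |
 *        v
 *      done (warns if the copyup failed)
 */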
2844
2845/*
Ilya Dryomov0ad5d952019-05-14 20:45:38 +02002846 * Return true if @obj_req is completed.
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002847 */
Ilya Dryomov54ab3b22019-05-11 16:21:49 +02002848static bool __rbd_obj_handle_request(struct rbd_obj_request *obj_req,
2849 int *result)
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002850{
Ilya Dryomov0ad5d952019-05-14 20:45:38 +02002851 struct rbd_img_request *img_req = obj_req->img_request;
Ilya Dryomov0192ce22019-05-16 15:06:56 +02002852 struct rbd_device *rbd_dev = img_req->rbd_dev;
Ilya Dryomov0ad5d952019-05-14 20:45:38 +02002853 bool done;
2854
Ilya Dryomov85b5e6d2019-05-14 21:06:07 +02002855 mutex_lock(&obj_req->state_mutex);
Ilya Dryomov0ad5d952019-05-14 20:45:38 +02002856 if (!rbd_img_is_write(img_req))
Ilya Dryomov85b5e6d2019-05-14 21:06:07 +02002857 done = rbd_obj_advance_read(obj_req, result);
Ilya Dryomov0ad5d952019-05-14 20:45:38 +02002858 else
Ilya Dryomov85b5e6d2019-05-14 21:06:07 +02002859 done = rbd_obj_advance_write(obj_req, result);
2860 mutex_unlock(&obj_req->state_mutex);
Ilya Dryomov0ad5d952019-05-14 20:45:38 +02002861
Ilya Dryomov0192ce22019-05-16 15:06:56 +02002862 if (done && *result) {
2863 rbd_assert(*result < 0);
2864 rbd_warn(rbd_dev, "%s at objno %llu %llu~%llu result %d",
2865 obj_op_name(img_req->op_type), obj_req->ex.oe_objno,
2866 obj_req->ex.oe_off, obj_req->ex.oe_len, *result);
2867 }
Ilya Dryomov0ad5d952019-05-14 20:45:38 +02002868 return done;
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002869}
2870
Ilya Dryomov0192ce22019-05-16 15:06:56 +02002871/*
2872 * This is open-coded in rbd_img_handle_request() to avoid parent chain
2873 * recursion.
2874 */
Ilya Dryomov54ab3b22019-05-11 16:21:49 +02002875static void rbd_obj_handle_request(struct rbd_obj_request *obj_req, int result)
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002876{
Ilya Dryomov0192ce22019-05-16 15:06:56 +02002877 if (__rbd_obj_handle_request(obj_req, &result))
2878 rbd_img_handle_request(obj_req->img_request, result);
2879}
Ilya Dryomov7114eda2018-02-01 11:50:47 +01002880
Ilya Dryomove1fddc82019-05-30 16:07:48 +02002881static bool need_exclusive_lock(struct rbd_img_request *img_req)
2882{
2883 struct rbd_device *rbd_dev = img_req->rbd_dev;
2884
2885 if (!(rbd_dev->header.features & RBD_FEATURE_EXCLUSIVE_LOCK))
2886 return false;
2887
2888 if (rbd_dev->spec->snap_id != CEPH_NOSNAP)
2889 return false;
2890
2891 rbd_assert(!test_bit(IMG_REQ_CHILD, &img_req->flags));
2892 if (rbd_dev->opts->lock_on_read)
2893 return true;
2894
2895 return rbd_img_is_write(img_req);
2896}
2897
2898static void rbd_lock_add_request(struct rbd_img_request *img_req)
2899{
2900 struct rbd_device *rbd_dev = img_req->rbd_dev;
2901
2902 lockdep_assert_held(&rbd_dev->lock_rwsem);
2903 spin_lock(&rbd_dev->lock_lists_lock);
2904 rbd_assert(list_empty(&img_req->lock_item));
2905 list_add_tail(&img_req->lock_item, &rbd_dev->running_list);
2906 spin_unlock(&rbd_dev->lock_lists_lock);
2907}
2908
2909static void rbd_lock_del_request(struct rbd_img_request *img_req)
2910{
2911 struct rbd_device *rbd_dev = img_req->rbd_dev;
2912 bool need_wakeup;
2913
2914 lockdep_assert_held(&rbd_dev->lock_rwsem);
2915 spin_lock(&rbd_dev->lock_lists_lock);
2916 rbd_assert(!list_empty(&img_req->lock_item));
2917 list_del_init(&img_req->lock_item);
2918 need_wakeup = (rbd_dev->lock_state == RBD_LOCK_STATE_RELEASING &&
2919 list_empty(&rbd_dev->running_list));
2920 spin_unlock(&rbd_dev->lock_lists_lock);
2921 if (need_wakeup)
2922 complete(&rbd_dev->releasing_wait);
2923}
2924
Ilya Dryomov0192ce22019-05-16 15:06:56 +02002925static void rbd_img_object_requests(struct rbd_img_request *img_req)
2926{
2927 struct rbd_obj_request *obj_req;
2928
2929 rbd_assert(!img_req->pending.result && !img_req->pending.num_pending);
2930
2931 for_each_obj_request(img_req, obj_req) {
2932 int result = 0;
2933
2934 if (__rbd_obj_handle_request(obj_req, &result)) {
2935 if (result) {
2936 img_req->pending.result = result;
2937 return;
2938 }
2939 } else {
2940 img_req->pending.num_pending++;
2941 }
2942 }
2943}
2944
2945static bool rbd_img_advance(struct rbd_img_request *img_req, int *result)
2946{
Ilya Dryomov7114eda2018-02-01 11:50:47 +01002947again:
Ilya Dryomov0192ce22019-05-16 15:06:56 +02002948 switch (img_req->state) {
2949 case RBD_IMG_START:
2950 rbd_assert(!*result);
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002951
Ilya Dryomov0192ce22019-05-16 15:06:56 +02002952 rbd_img_object_requests(img_req);
2953 if (!img_req->pending.num_pending) {
2954 *result = img_req->pending.result;
2955 img_req->state = RBD_IMG_OBJECT_REQUESTS;
2956 goto again;
2957 }
2958 img_req->state = __RBD_IMG_OBJECT_REQUESTS;
2959 return false;
2960 case __RBD_IMG_OBJECT_REQUESTS:
2961 if (!pending_result_dec(&img_req->pending, result))
2962 return false;
2963 /* fall through */
2964 case RBD_IMG_OBJECT_REQUESTS:
2965 return true;
2966 default:
2967 BUG();
Ilya Dryomov7114eda2018-02-01 11:50:47 +01002968 }
Ilya Dryomov0192ce22019-05-16 15:06:56 +02002969}
Ilya Dryomov7114eda2018-02-01 11:50:47 +01002970
Ilya Dryomov0192ce22019-05-16 15:06:56 +02002971/*
2972 * Return true if @img_req is completed.
2973 */
2974static bool __rbd_img_handle_request(struct rbd_img_request *img_req,
2975 int *result)
2976{
2977 struct rbd_device *rbd_dev = img_req->rbd_dev;
2978 bool done;
2979
Ilya Dryomove1fddc82019-05-30 16:07:48 +02002980 if (need_exclusive_lock(img_req)) {
2981 down_read(&rbd_dev->lock_rwsem);
2982 mutex_lock(&img_req->state_mutex);
2983 done = rbd_img_advance(img_req, result);
2984 if (done)
2985 rbd_lock_del_request(img_req);
2986 mutex_unlock(&img_req->state_mutex);
2987 up_read(&rbd_dev->lock_rwsem);
2988 } else {
2989 mutex_lock(&img_req->state_mutex);
2990 done = rbd_img_advance(img_req, result);
2991 mutex_unlock(&img_req->state_mutex);
2992 }
Ilya Dryomov0192ce22019-05-16 15:06:56 +02002993
2994 if (done && *result) {
2995 rbd_assert(*result < 0);
2996 rbd_warn(rbd_dev, "%s%s result %d",
2997 test_bit(IMG_REQ_CHILD, &img_req->flags) ? "child " : "",
2998 obj_op_name(img_req->op_type), *result);
2999 }
3000 return done;
3001}
3002
3003static void rbd_img_handle_request(struct rbd_img_request *img_req, int result)
3004{
3005again:
3006 if (!__rbd_img_handle_request(img_req, &result))
3007 return;
3008
Ilya Dryomov7114eda2018-02-01 11:50:47 +01003009 if (test_bit(IMG_REQ_CHILD, &img_req->flags)) {
Ilya Dryomov0192ce22019-05-16 15:06:56 +02003010 struct rbd_obj_request *obj_req = img_req->obj_request;
3011
Ilya Dryomov54ab3b22019-05-11 16:21:49 +02003012 rbd_img_request_put(img_req);
Ilya Dryomov0192ce22019-05-16 15:06:56 +02003013 if (__rbd_obj_handle_request(obj_req, &result)) {
3014 img_req = obj_req->img_request;
3015 goto again;
3016 }
3017 } else {
3018 struct request *rq = img_req->rq;
3019
3020 rbd_img_request_put(img_req);
3021 blk_mq_end_request(rq, errno_to_blk_status(result));
Ilya Dryomov7114eda2018-02-01 11:50:47 +01003022 }
Alex Elder8b3e1a52013-01-24 16:13:36 -06003023}
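
/*
 * Completion propagation (informational): a completed child (parent
 * read) image request feeds its result into the originating object
 * request, which may in turn complete the image request one level up;
 * the "goto again" loop above walks that chain iteratively rather
 * than recursing.
 */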
3024
Ilya Dryomoved95b212016-08-12 16:40:02 +02003025static const struct rbd_client_id rbd_empty_cid;
3026
3027static bool rbd_cid_equal(const struct rbd_client_id *lhs,
3028 const struct rbd_client_id *rhs)
3029{
3030 return lhs->gid == rhs->gid && lhs->handle == rhs->handle;
3031}
3032
3033static struct rbd_client_id rbd_get_cid(struct rbd_device *rbd_dev)
3034{
3035 struct rbd_client_id cid;
3036
3037 mutex_lock(&rbd_dev->watch_mutex);
3038 cid.gid = ceph_client_gid(rbd_dev->rbd_client->client);
3039 cid.handle = rbd_dev->watch_cookie;
3040 mutex_unlock(&rbd_dev->watch_mutex);
3041 return cid;
3042}
3043
3044/*
3045 * lock_rwsem must be held for write
3046 */
3047static void rbd_set_owner_cid(struct rbd_device *rbd_dev,
3048 const struct rbd_client_id *cid)
3049{
3050 dout("%s rbd_dev %p %llu-%llu -> %llu-%llu\n", __func__, rbd_dev,
3051 rbd_dev->owner_cid.gid, rbd_dev->owner_cid.handle,
3052 cid->gid, cid->handle);
3053 rbd_dev->owner_cid = *cid; /* struct */
3054}
3055
3056static void format_lock_cookie(struct rbd_device *rbd_dev, char *buf)
3057{
3058 mutex_lock(&rbd_dev->watch_mutex);
3059 sprintf(buf, "%s %llu", RBD_LOCK_COOKIE_PREFIX, rbd_dev->watch_cookie);
3060 mutex_unlock(&rbd_dev->watch_mutex);
3061}
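
/*
 * The cookie produced above is "<RBD_LOCK_COOKIE_PREFIX> <watch_cookie>",
 * e.g. "auto 94769128455680" (illustrative value, assuming the usual
 * "auto" prefix); find_watcher() parses it back with sscanf() further
 * down.
 */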
3062
Florian Margaineedd8ca82017-12-13 16:43:59 +01003063static void __rbd_lock(struct rbd_device *rbd_dev, const char *cookie)
3064{
3065 struct rbd_client_id cid = rbd_get_cid(rbd_dev);
3066
Ilya Dryomova2b1da02019-05-30 11:15:23 +02003067 rbd_dev->lock_state = RBD_LOCK_STATE_LOCKED;
Florian Margaineedd8ca82017-12-13 16:43:59 +01003068 strcpy(rbd_dev->lock_cookie, cookie);
3069 rbd_set_owner_cid(rbd_dev, &cid);
3070 queue_work(rbd_dev->task_wq, &rbd_dev->acquired_lock_work);
3071}
3072
Ilya Dryomoved95b212016-08-12 16:40:02 +02003073/*
3074 * lock_rwsem must be held for write
3075 */
3076static int rbd_lock(struct rbd_device *rbd_dev)
3077{
3078 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
Ilya Dryomoved95b212016-08-12 16:40:02 +02003079 char cookie[32];
3080 int ret;
3081
Ilya Dryomovcbbfb0f2017-04-13 12:17:38 +02003082 WARN_ON(__rbd_is_lock_owner(rbd_dev) ||
3083 rbd_dev->lock_cookie[0] != '\0');
Ilya Dryomoved95b212016-08-12 16:40:02 +02003084
3085 format_lock_cookie(rbd_dev, cookie);
3086 ret = ceph_cls_lock(osdc, &rbd_dev->header_oid, &rbd_dev->header_oloc,
3087 RBD_LOCK_NAME, CEPH_CLS_LOCK_EXCLUSIVE, cookie,
3088 RBD_LOCK_TAG, "", 0);
3089 if (ret)
3090 return ret;
3091
Florian Margaineedd8ca82017-12-13 16:43:59 +01003092 __rbd_lock(rbd_dev, cookie);
Ilya Dryomoved95b212016-08-12 16:40:02 +02003093 return 0;
3094}
3095
3096/*
3097 * lock_rwsem must be held for write
3098 */
Ilya Dryomovbbead742017-04-13 12:17:38 +02003099static void rbd_unlock(struct rbd_device *rbd_dev)
Ilya Dryomoved95b212016-08-12 16:40:02 +02003100{
3101 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
Ilya Dryomoved95b212016-08-12 16:40:02 +02003102 int ret;
3103
Ilya Dryomovcbbfb0f2017-04-13 12:17:38 +02003104 WARN_ON(!__rbd_is_lock_owner(rbd_dev) ||
3105 rbd_dev->lock_cookie[0] == '\0');
Ilya Dryomoved95b212016-08-12 16:40:02 +02003106
Ilya Dryomoved95b212016-08-12 16:40:02 +02003107 ret = ceph_cls_unlock(osdc, &rbd_dev->header_oid, &rbd_dev->header_oloc,
Ilya Dryomovcbbfb0f2017-04-13 12:17:38 +02003108 RBD_LOCK_NAME, rbd_dev->lock_cookie);
Ilya Dryomovbbead742017-04-13 12:17:38 +02003109 if (ret && ret != -ENOENT)
3110 rbd_warn(rbd_dev, "failed to unlock: %d", ret);
Ilya Dryomoved95b212016-08-12 16:40:02 +02003111
Ilya Dryomovbbead742017-04-13 12:17:38 +02003112	/* treat any error as if the image were unlocked */
3113 rbd_dev->lock_state = RBD_LOCK_STATE_UNLOCKED;
Ilya Dryomovcbbfb0f2017-04-13 12:17:38 +02003114 rbd_dev->lock_cookie[0] = '\0';
Ilya Dryomoved95b212016-08-12 16:40:02 +02003115 rbd_set_owner_cid(rbd_dev, &rbd_empty_cid);
3116 queue_work(rbd_dev->task_wq, &rbd_dev->released_lock_work);
Ilya Dryomoved95b212016-08-12 16:40:02 +02003117}
3118
3119static int __rbd_notify_op_lock(struct rbd_device *rbd_dev,
3120 enum rbd_notify_op notify_op,
3121 struct page ***preply_pages,
3122 size_t *preply_len)
3123{
3124 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
3125 struct rbd_client_id cid = rbd_get_cid(rbd_dev);
Kyle Spiers08a79102018-03-17 09:44:01 -07003126 char buf[4 + 8 + 8 + CEPH_ENCODING_START_BLK_LEN];
3127 int buf_size = sizeof(buf);
Ilya Dryomoved95b212016-08-12 16:40:02 +02003128 void *p = buf;
3129
3130 dout("%s rbd_dev %p notify_op %d\n", __func__, rbd_dev, notify_op);
3131
3132 /* encode *LockPayload NotifyMessage (op + ClientId) */
3133 ceph_start_encoding(&p, 2, 1, buf_size - CEPH_ENCODING_START_BLK_LEN);
3134 ceph_encode_32(&p, notify_op);
3135 ceph_encode_64(&p, cid.gid);
3136 ceph_encode_64(&p, cid.handle);
3137
3138 return ceph_osdc_notify(osdc, &rbd_dev->header_oid,
3139 &rbd_dev->header_oloc, buf, buf_size,
3140 RBD_NOTIFY_TIMEOUT, preply_pages, preply_len);
3141}
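
/*
 * Wire format of the NotifyMessage payload built above (informational):
 *
 *   ceph_start_encoding() header   CEPH_ENCODING_START_BLK_LEN bytes
 *   notify_op                      __le32 (4 bytes)
 *   cid.gid                        __le64 (8 bytes)
 *   cid.handle                     __le64 (8 bytes)
 *
 * hence buf being sized 4 + 8 + 8 + CEPH_ENCODING_START_BLK_LEN.
 */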
3142
3143static void rbd_notify_op_lock(struct rbd_device *rbd_dev,
3144 enum rbd_notify_op notify_op)
3145{
3146 struct page **reply_pages;
3147 size_t reply_len;
3148
3149 __rbd_notify_op_lock(rbd_dev, notify_op, &reply_pages, &reply_len);
3150 ceph_release_page_vector(reply_pages, calc_pages_for(0, reply_len));
3151}
3152
3153static void rbd_notify_acquired_lock(struct work_struct *work)
3154{
3155 struct rbd_device *rbd_dev = container_of(work, struct rbd_device,
3156 acquired_lock_work);
3157
3158 rbd_notify_op_lock(rbd_dev, RBD_NOTIFY_OP_ACQUIRED_LOCK);
3159}
3160
3161static void rbd_notify_released_lock(struct work_struct *work)
3162{
3163 struct rbd_device *rbd_dev = container_of(work, struct rbd_device,
3164 released_lock_work);
3165
3166 rbd_notify_op_lock(rbd_dev, RBD_NOTIFY_OP_RELEASED_LOCK);
3167}
3168
3169static int rbd_request_lock(struct rbd_device *rbd_dev)
3170{
3171 struct page **reply_pages;
3172 size_t reply_len;
3173 bool lock_owner_responded = false;
3174 int ret;
3175
3176 dout("%s rbd_dev %p\n", __func__, rbd_dev);
3177
3178 ret = __rbd_notify_op_lock(rbd_dev, RBD_NOTIFY_OP_REQUEST_LOCK,
3179 &reply_pages, &reply_len);
3180 if (ret && ret != -ETIMEDOUT) {
3181 rbd_warn(rbd_dev, "failed to request lock: %d", ret);
3182 goto out;
3183 }
3184
3185 if (reply_len > 0 && reply_len <= PAGE_SIZE) {
3186 void *p = page_address(reply_pages[0]);
3187 void *const end = p + reply_len;
3188 u32 n;
3189
3190 ceph_decode_32_safe(&p, end, n, e_inval); /* num_acks */
3191 while (n--) {
3192 u8 struct_v;
3193 u32 len;
3194
3195 ceph_decode_need(&p, end, 8 + 8, e_inval);
3196 p += 8 + 8; /* skip gid and cookie */
3197
3198 ceph_decode_32_safe(&p, end, len, e_inval);
3199 if (!len)
3200 continue;
3201
3202 if (lock_owner_responded) {
3203 rbd_warn(rbd_dev,
3204 "duplicate lock owners detected");
3205 ret = -EIO;
3206 goto out;
3207 }
3208
3209 lock_owner_responded = true;
3210 ret = ceph_start_decoding(&p, end, 1, "ResponseMessage",
3211 &struct_v, &len);
3212 if (ret) {
3213 rbd_warn(rbd_dev,
3214 "failed to decode ResponseMessage: %d",
3215 ret);
3216 goto e_inval;
3217 }
3218
3219 ret = ceph_decode_32(&p);
3220 }
3221 }
3222
3223 if (!lock_owner_responded) {
3224 rbd_warn(rbd_dev, "no lock owners detected");
3225 ret = -ETIMEDOUT;
3226 }
3227
3228out:
3229 ceph_release_page_vector(reply_pages, calc_pages_for(0, reply_len));
3230 return ret;
3231
3232e_inval:
3233 ret = -EINVAL;
3234 goto out;
3235}
3236
3237static void wake_requests(struct rbd_device *rbd_dev, bool wake_all)
3238{
3239 dout("%s rbd_dev %p wake_all %d\n", __func__, rbd_dev, wake_all);
3240
3241 cancel_delayed_work(&rbd_dev->lock_dwork);
3242 if (wake_all)
3243 wake_up_all(&rbd_dev->lock_waitq);
3244 else
3245 wake_up(&rbd_dev->lock_waitq);
3246}
3247
3248static int get_lock_owner_info(struct rbd_device *rbd_dev,
3249 struct ceph_locker **lockers, u32 *num_lockers)
3250{
3251 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
3252 u8 lock_type;
3253 char *lock_tag;
3254 int ret;
3255
3256 dout("%s rbd_dev %p\n", __func__, rbd_dev);
3257
3258 ret = ceph_cls_lock_info(osdc, &rbd_dev->header_oid,
3259 &rbd_dev->header_oloc, RBD_LOCK_NAME,
3260 &lock_type, &lock_tag, lockers, num_lockers);
3261 if (ret)
3262 return ret;
3263
3264 if (*num_lockers == 0) {
3265 dout("%s rbd_dev %p no lockers detected\n", __func__, rbd_dev);
3266 goto out;
3267 }
3268
3269 if (strcmp(lock_tag, RBD_LOCK_TAG)) {
3270 rbd_warn(rbd_dev, "locked by external mechanism, tag %s",
3271 lock_tag);
3272 ret = -EBUSY;
3273 goto out;
3274 }
3275
3276 if (lock_type == CEPH_CLS_LOCK_SHARED) {
3277 rbd_warn(rbd_dev, "shared lock type detected");
3278 ret = -EBUSY;
3279 goto out;
3280 }
3281
3282 if (strncmp((*lockers)[0].id.cookie, RBD_LOCK_COOKIE_PREFIX,
3283 strlen(RBD_LOCK_COOKIE_PREFIX))) {
3284 rbd_warn(rbd_dev, "locked by external mechanism, cookie %s",
3285 (*lockers)[0].id.cookie);
3286 ret = -EBUSY;
3287 goto out;
3288 }
3289
3290out:
3291 kfree(lock_tag);
3292 return ret;
3293}
3294
3295static int find_watcher(struct rbd_device *rbd_dev,
3296 const struct ceph_locker *locker)
3297{
3298 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
3299 struct ceph_watch_item *watchers;
3300 u32 num_watchers;
3301 u64 cookie;
3302 int i;
3303 int ret;
3304
3305 ret = ceph_osdc_list_watchers(osdc, &rbd_dev->header_oid,
3306 &rbd_dev->header_oloc, &watchers,
3307 &num_watchers);
3308 if (ret)
3309 return ret;
3310
3311 sscanf(locker->id.cookie, RBD_LOCK_COOKIE_PREFIX " %llu", &cookie);
3312 for (i = 0; i < num_watchers; i++) {
3313 if (!memcmp(&watchers[i].addr, &locker->info.addr,
3314 sizeof(locker->info.addr)) &&
3315 watchers[i].cookie == cookie) {
3316 struct rbd_client_id cid = {
3317 .gid = le64_to_cpu(watchers[i].name.num),
3318 .handle = cookie,
3319 };
3320
3321 dout("%s rbd_dev %p found cid %llu-%llu\n", __func__,
3322 rbd_dev, cid.gid, cid.handle);
3323 rbd_set_owner_cid(rbd_dev, &cid);
3324 ret = 1;
3325 goto out;
3326 }
3327 }
3328
3329 dout("%s rbd_dev %p no watchers\n", __func__, rbd_dev);
3330 ret = 0;
3331out:
3332 kfree(watchers);
3333 return ret;
3334}
3335
3336/*
3337 * lock_rwsem must be held for write
3338 */
3339static int rbd_try_lock(struct rbd_device *rbd_dev)
3340{
3341 struct ceph_client *client = rbd_dev->rbd_client->client;
3342 struct ceph_locker *lockers;
3343 u32 num_lockers;
3344 int ret;
3345
3346 for (;;) {
3347 ret = rbd_lock(rbd_dev);
3348 if (ret != -EBUSY)
3349 return ret;
3350
3351 /* determine if the current lock holder is still alive */
3352 ret = get_lock_owner_info(rbd_dev, &lockers, &num_lockers);
3353 if (ret)
3354 return ret;
3355
3356 if (num_lockers == 0)
3357 goto again;
3358
3359 ret = find_watcher(rbd_dev, lockers);
3360 if (ret) {
3361 if (ret > 0)
3362 ret = 0; /* have to request lock */
3363 goto out;
3364 }
3365
3366 rbd_warn(rbd_dev, "%s%llu seems dead, breaking lock",
3367 ENTITY_NAME(lockers[0].id.name));
3368
3369 ret = ceph_monc_blacklist_add(&client->monc,
3370 &lockers[0].info.addr);
3371 if (ret) {
3372 rbd_warn(rbd_dev, "blacklist of %s%llu failed: %d",
3373 ENTITY_NAME(lockers[0].id.name), ret);
3374 goto out;
3375 }
3376
3377 ret = ceph_cls_break_lock(&client->osdc, &rbd_dev->header_oid,
3378 &rbd_dev->header_oloc, RBD_LOCK_NAME,
3379 lockers[0].id.cookie,
3380 &lockers[0].id.name);
3381 if (ret && ret != -ENOENT)
3382 goto out;
3383
3384again:
3385 ceph_free_lockers(lockers, num_lockers);
3386 }
3387
3388out:
3389 ceph_free_lockers(lockers, num_lockers);
3390 return ret;
3391}
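
/*
 * Acquisition loop above (informational): on -EBUSY, look up the
 * current holder.  If it still has a watch established (find_watcher()
 * returned 1), fall back to requesting the lock from it; if it has no
 * watch (find_watcher() returned 0), assume it died without unlocking,
 * blacklist it and break its lock, then retry rbd_lock().
 */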
3392
3393/*
3394 * ret is set only if lock_state is RBD_LOCK_STATE_UNLOCKED
3395 */
3396static enum rbd_lock_state rbd_try_acquire_lock(struct rbd_device *rbd_dev,
3397 int *pret)
3398{
3399 enum rbd_lock_state lock_state;
3400
3401 down_read(&rbd_dev->lock_rwsem);
3402 dout("%s rbd_dev %p read lock_state %d\n", __func__, rbd_dev,
3403 rbd_dev->lock_state);
3404 if (__rbd_is_lock_owner(rbd_dev)) {
3405 lock_state = rbd_dev->lock_state;
3406 up_read(&rbd_dev->lock_rwsem);
3407 return lock_state;
3408 }
3409
3410 up_read(&rbd_dev->lock_rwsem);
3411 down_write(&rbd_dev->lock_rwsem);
3412 dout("%s rbd_dev %p write lock_state %d\n", __func__, rbd_dev,
3413 rbd_dev->lock_state);
3414 if (!__rbd_is_lock_owner(rbd_dev)) {
3415 *pret = rbd_try_lock(rbd_dev);
3416 if (*pret)
3417 rbd_warn(rbd_dev, "failed to acquire lock: %d", *pret);
3418 }
3419
3420 lock_state = rbd_dev->lock_state;
3421 up_write(&rbd_dev->lock_rwsem);
3422 return lock_state;
3423}
3424
3425static void rbd_acquire_lock(struct work_struct *work)
3426{
3427 struct rbd_device *rbd_dev = container_of(to_delayed_work(work),
3428 struct rbd_device, lock_dwork);
3429 enum rbd_lock_state lock_state;
Kefeng Wang37f13252017-07-13 15:46:35 +08003430 int ret = 0;
Ilya Dryomoved95b212016-08-12 16:40:02 +02003431
3432 dout("%s rbd_dev %p\n", __func__, rbd_dev);
3433again:
3434 lock_state = rbd_try_acquire_lock(rbd_dev, &ret);
3435 if (lock_state != RBD_LOCK_STATE_UNLOCKED || ret == -EBLACKLISTED) {
3436 if (lock_state == RBD_LOCK_STATE_LOCKED)
3437 wake_requests(rbd_dev, true);
3438 dout("%s rbd_dev %p lock_state %d ret %d - done\n", __func__,
3439 rbd_dev, lock_state, ret);
3440 return;
3441 }
3442
3443 ret = rbd_request_lock(rbd_dev);
3444 if (ret == -ETIMEDOUT) {
3445 goto again; /* treat this as a dead client */
Ilya Dryomove010dd02017-04-13 12:17:39 +02003446 } else if (ret == -EROFS) {
3447 rbd_warn(rbd_dev, "peer will not release lock");
3448 /*
3449 * If this is rbd_add_acquire_lock(), we want to fail
3450 * immediately -- reuse BLACKLISTED flag. Otherwise we
3451 * want to block.
3452 */
3453 if (!(rbd_dev->disk->flags & GENHD_FL_UP)) {
3454 set_bit(RBD_DEV_FLAG_BLACKLISTED, &rbd_dev->flags);
3455 /* wake "rbd map --exclusive" process */
3456 wake_requests(rbd_dev, false);
3457 }
Ilya Dryomoved95b212016-08-12 16:40:02 +02003458 } else if (ret < 0) {
3459 rbd_warn(rbd_dev, "error requesting lock: %d", ret);
3460 mod_delayed_work(rbd_dev->task_wq, &rbd_dev->lock_dwork,
3461 RBD_RETRY_DELAY);
3462 } else {
3463 /*
3464 * lock owner acked, but resend if we don't see them
3465 * release the lock
3466 */
3467 dout("%s rbd_dev %p requeueing lock_dwork\n", __func__,
3468 rbd_dev);
3469 mod_delayed_work(rbd_dev->task_wq, &rbd_dev->lock_dwork,
3470 msecs_to_jiffies(2 * RBD_NOTIFY_TIMEOUT * MSEC_PER_SEC));
3471 }
3472}
3473
Ilya Dryomova2b1da02019-05-30 11:15:23 +02003474static bool rbd_quiesce_lock(struct rbd_device *rbd_dev)
Ilya Dryomoved95b212016-08-12 16:40:02 +02003475{
Ilya Dryomove1fddc82019-05-30 16:07:48 +02003476 bool need_wait;
3477
Ilya Dryomova2b1da02019-05-30 11:15:23 +02003478 dout("%s rbd_dev %p\n", __func__, rbd_dev);
3479 lockdep_assert_held_exclusive(&rbd_dev->lock_rwsem);
3480
Ilya Dryomoved95b212016-08-12 16:40:02 +02003481 if (rbd_dev->lock_state != RBD_LOCK_STATE_LOCKED)
3482 return false;
3483
Ilya Dryomoved95b212016-08-12 16:40:02 +02003484 /*
3485 * Ensure that all in-flight IO is flushed.
Ilya Dryomoved95b212016-08-12 16:40:02 +02003486 */
Ilya Dryomove1fddc82019-05-30 16:07:48 +02003487 rbd_dev->lock_state = RBD_LOCK_STATE_RELEASING;
3488 rbd_assert(!completion_done(&rbd_dev->releasing_wait));
3489 need_wait = !list_empty(&rbd_dev->running_list);
3490 downgrade_write(&rbd_dev->lock_rwsem);
3491 if (need_wait)
3492 wait_for_completion(&rbd_dev->releasing_wait);
Ilya Dryomoved95b212016-08-12 16:40:02 +02003493 up_read(&rbd_dev->lock_rwsem);
3494
3495 down_write(&rbd_dev->lock_rwsem);
Ilya Dryomoved95b212016-08-12 16:40:02 +02003496 if (rbd_dev->lock_state != RBD_LOCK_STATE_RELEASING)
3497 return false;
3498
Ilya Dryomove1fddc82019-05-30 16:07:48 +02003499 rbd_assert(list_empty(&rbd_dev->running_list));
Ilya Dryomova2b1da02019-05-30 11:15:23 +02003500 return true;
3501}
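
/*
 * Quiesce handshake (informational): rbd_quiesce_lock() flips
 * lock_state to RBD_LOCK_STATE_RELEASING and waits on releasing_wait
 * while running_list is non-empty; the rbd_lock_del_request() call
 * that empties the list does the matching complete().
 */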
3502
Ilya Dryomove1fddc82019-05-30 16:07:48 +02003503static void __rbd_release_lock(struct rbd_device *rbd_dev)
3504{
3505 rbd_assert(list_empty(&rbd_dev->running_list));
3506
3507 rbd_unlock(rbd_dev);
3508}
3509
Ilya Dryomova2b1da02019-05-30 11:15:23 +02003510/*
3511 * lock_rwsem must be held for write
3512 */
3513static void rbd_release_lock(struct rbd_device *rbd_dev)
3514{
3515 if (!rbd_quiesce_lock(rbd_dev))
3516 return;
3517
Ilya Dryomove1fddc82019-05-30 16:07:48 +02003518 __rbd_release_lock(rbd_dev);
Ilya Dryomova2b1da02019-05-30 11:15:23 +02003519
Ilya Dryomovbbead742017-04-13 12:17:38 +02003520 /*
3521 * Give others a chance to grab the lock - we would re-acquire
3522 * almost immediately if we got new IO during ceph_osdc_sync()
3523 * otherwise. We need to ack our own notifications, so this
3524 * lock_dwork will be requeued from rbd_wait_state_locked()
3525 * after wake_requests() in rbd_handle_released_lock().
3526 */
3527 cancel_delayed_work(&rbd_dev->lock_dwork);
Ilya Dryomoved95b212016-08-12 16:40:02 +02003528}
3529
3530static void rbd_release_lock_work(struct work_struct *work)
3531{
3532 struct rbd_device *rbd_dev = container_of(work, struct rbd_device,
3533 unlock_work);
3534
3535 down_write(&rbd_dev->lock_rwsem);
3536 rbd_release_lock(rbd_dev);
3537 up_write(&rbd_dev->lock_rwsem);
3538}
3539
3540static void rbd_handle_acquired_lock(struct rbd_device *rbd_dev, u8 struct_v,
3541 void **p)
3542{
3543 struct rbd_client_id cid = { 0 };
3544
3545 if (struct_v >= 2) {
3546 cid.gid = ceph_decode_64(p);
3547 cid.handle = ceph_decode_64(p);
3548 }
3549
3550 dout("%s rbd_dev %p cid %llu-%llu\n", __func__, rbd_dev, cid.gid,
3551 cid.handle);
3552 if (!rbd_cid_equal(&cid, &rbd_empty_cid)) {
3553 down_write(&rbd_dev->lock_rwsem);
3554 if (rbd_cid_equal(&cid, &rbd_dev->owner_cid)) {
3555 /*
3556 * we already know that the remote client is
3557 * the owner
3558 */
3559 up_write(&rbd_dev->lock_rwsem);
3560 return;
3561 }
3562
3563 rbd_set_owner_cid(rbd_dev, &cid);
3564 downgrade_write(&rbd_dev->lock_rwsem);
3565 } else {
3566 down_read(&rbd_dev->lock_rwsem);
3567 }
3568
3569 if (!__rbd_is_lock_owner(rbd_dev))
3570 wake_requests(rbd_dev, false);
3571 up_read(&rbd_dev->lock_rwsem);
3572}
3573
3574static void rbd_handle_released_lock(struct rbd_device *rbd_dev, u8 struct_v,
3575 void **p)
3576{
3577 struct rbd_client_id cid = { 0 };
3578
3579 if (struct_v >= 2) {
3580 cid.gid = ceph_decode_64(p);
3581 cid.handle = ceph_decode_64(p);
3582 }
3583
3584 dout("%s rbd_dev %p cid %llu-%llu\n", __func__, rbd_dev, cid.gid,
3585 cid.handle);
3586 if (!rbd_cid_equal(&cid, &rbd_empty_cid)) {
3587 down_write(&rbd_dev->lock_rwsem);
3588 if (!rbd_cid_equal(&cid, &rbd_dev->owner_cid)) {
3589 dout("%s rbd_dev %p unexpected owner, cid %llu-%llu != owner_cid %llu-%llu\n",
3590 __func__, rbd_dev, cid.gid, cid.handle,
3591 rbd_dev->owner_cid.gid, rbd_dev->owner_cid.handle);
3592 up_write(&rbd_dev->lock_rwsem);
3593 return;
3594 }
3595
3596 rbd_set_owner_cid(rbd_dev, &rbd_empty_cid);
3597 downgrade_write(&rbd_dev->lock_rwsem);
3598 } else {
3599 down_read(&rbd_dev->lock_rwsem);
3600 }
3601
3602 if (!__rbd_is_lock_owner(rbd_dev))
3603 wake_requests(rbd_dev, false);
3604 up_read(&rbd_dev->lock_rwsem);
3605}
3606
Ilya Dryomov3b77faa2017-04-13 12:17:39 +02003607/*
3608 * Returns result for ResponseMessage to be encoded (<= 0), or 1 if no
3609 * ResponseMessage is needed.
3610 */
3611static int rbd_handle_request_lock(struct rbd_device *rbd_dev, u8 struct_v,
3612 void **p)
Ilya Dryomoved95b212016-08-12 16:40:02 +02003613{
3614 struct rbd_client_id my_cid = rbd_get_cid(rbd_dev);
3615 struct rbd_client_id cid = { 0 };
Ilya Dryomov3b77faa2017-04-13 12:17:39 +02003616 int result = 1;
Ilya Dryomoved95b212016-08-12 16:40:02 +02003617
3618 if (struct_v >= 2) {
3619 cid.gid = ceph_decode_64(p);
3620 cid.handle = ceph_decode_64(p);
3621 }
3622
3623 dout("%s rbd_dev %p cid %llu-%llu\n", __func__, rbd_dev, cid.gid,
3624 cid.handle);
3625 if (rbd_cid_equal(&cid, &my_cid))
Ilya Dryomov3b77faa2017-04-13 12:17:39 +02003626 return result;
Ilya Dryomoved95b212016-08-12 16:40:02 +02003627
3628 down_read(&rbd_dev->lock_rwsem);
Ilya Dryomov3b77faa2017-04-13 12:17:39 +02003629 if (__rbd_is_lock_owner(rbd_dev)) {
3630 if (rbd_dev->lock_state == RBD_LOCK_STATE_LOCKED &&
3631 rbd_cid_equal(&rbd_dev->owner_cid, &rbd_empty_cid))
3632 goto out_unlock;
3633
3634 /*
3635 * encode ResponseMessage(0) so the peer can detect
3636 * a missing owner
3637 */
3638 result = 0;
3639
3640 if (rbd_dev->lock_state == RBD_LOCK_STATE_LOCKED) {
Ilya Dryomove010dd02017-04-13 12:17:39 +02003641 if (!rbd_dev->opts->exclusive) {
3642 dout("%s rbd_dev %p queueing unlock_work\n",
3643 __func__, rbd_dev);
3644 queue_work(rbd_dev->task_wq,
3645 &rbd_dev->unlock_work);
3646 } else {
3647 /* refuse to release the lock */
3648 result = -EROFS;
3649 }
Ilya Dryomoved95b212016-08-12 16:40:02 +02003650 }
3651 }
Ilya Dryomov3b77faa2017-04-13 12:17:39 +02003652
3653out_unlock:
Ilya Dryomoved95b212016-08-12 16:40:02 +02003654 up_read(&rbd_dev->lock_rwsem);
Ilya Dryomov3b77faa2017-04-13 12:17:39 +02003655 return result;
Ilya Dryomoved95b212016-08-12 16:40:02 +02003656}
3657
3658static void __rbd_acknowledge_notify(struct rbd_device *rbd_dev,
3659 u64 notify_id, u64 cookie, s32 *result)
3660{
3661 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
Kyle Spiers08a79102018-03-17 09:44:01 -07003662 char buf[4 + CEPH_ENCODING_START_BLK_LEN];
3663 int buf_size = sizeof(buf);
Ilya Dryomoved95b212016-08-12 16:40:02 +02003664 int ret;
3665
3666 if (result) {
3667 void *p = buf;
3668
3669 /* encode ResponseMessage */
3670 ceph_start_encoding(&p, 1, 1,
3671 buf_size - CEPH_ENCODING_START_BLK_LEN);
3672 ceph_encode_32(&p, *result);
3673 } else {
3674 buf_size = 0;
3675 }
3676
3677 ret = ceph_osdc_notify_ack(osdc, &rbd_dev->header_oid,
3678 &rbd_dev->header_oloc, notify_id, cookie,
3679 buf, buf_size);
3680 if (ret)
3681 rbd_warn(rbd_dev, "acknowledge_notify failed: %d", ret);
3682}
3683
3684static void rbd_acknowledge_notify(struct rbd_device *rbd_dev, u64 notify_id,
3685 u64 cookie)
3686{
3687 dout("%s rbd_dev %p\n", __func__, rbd_dev);
3688 __rbd_acknowledge_notify(rbd_dev, notify_id, cookie, NULL);
3689}
3690
3691static void rbd_acknowledge_notify_result(struct rbd_device *rbd_dev,
3692 u64 notify_id, u64 cookie, s32 result)
3693{
3694 dout("%s rbd_dev %p result %d\n", __func__, rbd_dev, result);
3695 __rbd_acknowledge_notify(rbd_dev, notify_id, cookie, &result);
3696}
Ilya Dryomov922dab62016-05-26 01:15:02 +02003697
3698static void rbd_watch_cb(void *arg, u64 notify_id, u64 cookie,
3699 u64 notifier_id, void *data, size_t data_len)
Alex Elderb8d70032012-11-30 17:53:04 -06003700{
Ilya Dryomov922dab62016-05-26 01:15:02 +02003701 struct rbd_device *rbd_dev = arg;
Ilya Dryomoved95b212016-08-12 16:40:02 +02003702 void *p = data;
3703 void *const end = p + data_len;
Ilya Dryomovd4c22692016-09-06 11:15:48 +02003704 u8 struct_v = 0;
Ilya Dryomoved95b212016-08-12 16:40:02 +02003705 u32 len;
3706 u32 notify_op;
Alex Elderb8d70032012-11-30 17:53:04 -06003707 int ret;
3708
Ilya Dryomoved95b212016-08-12 16:40:02 +02003709 dout("%s rbd_dev %p cookie %llu notify_id %llu data_len %zu\n",
3710 __func__, rbd_dev, cookie, notify_id, data_len);
3711 if (data_len) {
3712 ret = ceph_start_decoding(&p, end, 1, "NotifyMessage",
3713 &struct_v, &len);
3714 if (ret) {
3715 rbd_warn(rbd_dev, "failed to decode NotifyMessage: %d",
3716 ret);
3717 return;
3718 }
Ilya Dryomov52bb1f92014-07-23 17:11:20 +04003719
Ilya Dryomoved95b212016-08-12 16:40:02 +02003720 notify_op = ceph_decode_32(&p);
3721 } else {
3722 /* legacy notification for header updates */
3723 notify_op = RBD_NOTIFY_OP_HEADER_UPDATE;
3724 len = 0;
3725 }
Alex Elderb8d70032012-11-30 17:53:04 -06003726
Ilya Dryomoved95b212016-08-12 16:40:02 +02003727 dout("%s rbd_dev %p notify_op %u\n", __func__, rbd_dev, notify_op);
3728 switch (notify_op) {
3729 case RBD_NOTIFY_OP_ACQUIRED_LOCK:
3730 rbd_handle_acquired_lock(rbd_dev, struct_v, &p);
3731 rbd_acknowledge_notify(rbd_dev, notify_id, cookie);
3732 break;
3733 case RBD_NOTIFY_OP_RELEASED_LOCK:
3734 rbd_handle_released_lock(rbd_dev, struct_v, &p);
3735 rbd_acknowledge_notify(rbd_dev, notify_id, cookie);
3736 break;
3737 case RBD_NOTIFY_OP_REQUEST_LOCK:
Ilya Dryomov3b77faa2017-04-13 12:17:39 +02003738 ret = rbd_handle_request_lock(rbd_dev, struct_v, &p);
3739 if (ret <= 0)
Ilya Dryomoved95b212016-08-12 16:40:02 +02003740 rbd_acknowledge_notify_result(rbd_dev, notify_id,
Ilya Dryomov3b77faa2017-04-13 12:17:39 +02003741 cookie, ret);
Ilya Dryomoved95b212016-08-12 16:40:02 +02003742 else
3743 rbd_acknowledge_notify(rbd_dev, notify_id, cookie);
3744 break;
3745 case RBD_NOTIFY_OP_HEADER_UPDATE:
3746 ret = rbd_dev_refresh(rbd_dev);
3747 if (ret)
3748 rbd_warn(rbd_dev, "refresh failed: %d", ret);
3749
3750 rbd_acknowledge_notify(rbd_dev, notify_id, cookie);
3751 break;
3752 default:
3753 if (rbd_is_lock_owner(rbd_dev))
3754 rbd_acknowledge_notify_result(rbd_dev, notify_id,
3755 cookie, -EOPNOTSUPP);
3756 else
3757 rbd_acknowledge_notify(rbd_dev, notify_id, cookie);
3758 break;
3759 }
Alex Elderb8d70032012-11-30 17:53:04 -06003760}
3761
Ilya Dryomov99d16942016-08-12 16:11:41 +02003762static void __rbd_unregister_watch(struct rbd_device *rbd_dev);
3763
Ilya Dryomov922dab62016-05-26 01:15:02 +02003764static void rbd_watch_errcb(void *arg, u64 cookie, int err)
Ilya Dryomovbb040aa2014-06-19 11:38:14 +04003765{
Ilya Dryomov922dab62016-05-26 01:15:02 +02003766 struct rbd_device *rbd_dev = arg;
Ilya Dryomovbb040aa2014-06-19 11:38:14 +04003767
Ilya Dryomov922dab62016-05-26 01:15:02 +02003768 rbd_warn(rbd_dev, "encountered watch error: %d", err);
Ilya Dryomovbb040aa2014-06-19 11:38:14 +04003769
Ilya Dryomoved95b212016-08-12 16:40:02 +02003770 down_write(&rbd_dev->lock_rwsem);
3771 rbd_set_owner_cid(rbd_dev, &rbd_empty_cid);
3772 up_write(&rbd_dev->lock_rwsem);
Ilya Dryomovbb040aa2014-06-19 11:38:14 +04003773
Ilya Dryomov99d16942016-08-12 16:11:41 +02003774 mutex_lock(&rbd_dev->watch_mutex);
3775 if (rbd_dev->watch_state == RBD_WATCH_STATE_REGISTERED) {
3776 __rbd_unregister_watch(rbd_dev);
3777 rbd_dev->watch_state = RBD_WATCH_STATE_ERROR;
Ilya Dryomovbb040aa2014-06-19 11:38:14 +04003778
Ilya Dryomov99d16942016-08-12 16:11:41 +02003779 queue_delayed_work(rbd_dev->task_wq, &rbd_dev->watch_dwork, 0);
Ilya Dryomovbb040aa2014-06-19 11:38:14 +04003780 }
Ilya Dryomov99d16942016-08-12 16:11:41 +02003781 mutex_unlock(&rbd_dev->watch_mutex);
Ilya Dryomovbb040aa2014-06-19 11:38:14 +04003782}
3783
3784/*
Ilya Dryomov99d16942016-08-12 16:11:41 +02003785 * watch_mutex must be locked
Alex Elder9969ebc2013-01-18 12:31:10 -06003786 */
Ilya Dryomov99d16942016-08-12 16:11:41 +02003787static int __rbd_register_watch(struct rbd_device *rbd_dev)
Alex Elder9969ebc2013-01-18 12:31:10 -06003788{
3789 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
Ilya Dryomov922dab62016-05-26 01:15:02 +02003790 struct ceph_osd_linger_request *handle;
Alex Elder9969ebc2013-01-18 12:31:10 -06003791
Ilya Dryomov922dab62016-05-26 01:15:02 +02003792 rbd_assert(!rbd_dev->watch_handle);
Ilya Dryomov99d16942016-08-12 16:11:41 +02003793 dout("%s rbd_dev %p\n", __func__, rbd_dev);
Alex Elder9969ebc2013-01-18 12:31:10 -06003794
Ilya Dryomov922dab62016-05-26 01:15:02 +02003795 handle = ceph_osdc_watch(osdc, &rbd_dev->header_oid,
3796 &rbd_dev->header_oloc, rbd_watch_cb,
3797 rbd_watch_errcb, rbd_dev);
3798 if (IS_ERR(handle))
3799 return PTR_ERR(handle);
Alex Elder9969ebc2013-01-18 12:31:10 -06003800
Ilya Dryomov922dab62016-05-26 01:15:02 +02003801 rbd_dev->watch_handle = handle;
Ilya Dryomovb30a01f2014-05-22 19:28:52 +04003802 return 0;
Alex Elder9969ebc2013-01-18 12:31:10 -06003803}
3804
Ilya Dryomov99d16942016-08-12 16:11:41 +02003805/*
3806 * watch_mutex must be locked
3807 */
3808static void __rbd_unregister_watch(struct rbd_device *rbd_dev)
Ilya Dryomovfca27062013-12-16 18:02:40 +02003809{
Ilya Dryomov922dab62016-05-26 01:15:02 +02003810 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
3811 int ret;
Ilya Dryomovb30a01f2014-05-22 19:28:52 +04003812
Ilya Dryomov99d16942016-08-12 16:11:41 +02003813 rbd_assert(rbd_dev->watch_handle);
3814 dout("%s rbd_dev %p\n", __func__, rbd_dev);
Ilya Dryomovb30a01f2014-05-22 19:28:52 +04003815
Ilya Dryomov922dab62016-05-26 01:15:02 +02003816 ret = ceph_osdc_unwatch(osdc, rbd_dev->watch_handle);
3817 if (ret)
3818 rbd_warn(rbd_dev, "failed to unwatch: %d", ret);
Ilya Dryomovb30a01f2014-05-22 19:28:52 +04003819
Ilya Dryomov922dab62016-05-26 01:15:02 +02003820 rbd_dev->watch_handle = NULL;
Ilya Dryomovc525f032016-04-28 16:07:26 +02003821}
3822
Ilya Dryomov99d16942016-08-12 16:11:41 +02003823static int rbd_register_watch(struct rbd_device *rbd_dev)
Ilya Dryomovc525f032016-04-28 16:07:26 +02003824{
Ilya Dryomov99d16942016-08-12 16:11:41 +02003825 int ret;
Ilya Dryomov811c6682016-04-15 16:22:16 +02003826
Ilya Dryomov99d16942016-08-12 16:11:41 +02003827 mutex_lock(&rbd_dev->watch_mutex);
3828 rbd_assert(rbd_dev->watch_state == RBD_WATCH_STATE_UNREGISTERED);
3829 ret = __rbd_register_watch(rbd_dev);
3830 if (ret)
3831 goto out;
3832
3833 rbd_dev->watch_state = RBD_WATCH_STATE_REGISTERED;
3834 rbd_dev->watch_cookie = rbd_dev->watch_handle->linger_id;
3835
3836out:
3837 mutex_unlock(&rbd_dev->watch_mutex);
3838 return ret;
3839}
3840
3841static void cancel_tasks_sync(struct rbd_device *rbd_dev)
3842{
3843 dout("%s rbd_dev %p\n", __func__, rbd_dev);
3844
Ilya Dryomoved95b212016-08-12 16:40:02 +02003845 cancel_work_sync(&rbd_dev->acquired_lock_work);
3846 cancel_work_sync(&rbd_dev->released_lock_work);
3847 cancel_delayed_work_sync(&rbd_dev->lock_dwork);
3848 cancel_work_sync(&rbd_dev->unlock_work);
Ilya Dryomov99d16942016-08-12 16:11:41 +02003849}
3850
3851static void rbd_unregister_watch(struct rbd_device *rbd_dev)
3852{
Ilya Dryomoved95b212016-08-12 16:40:02 +02003853 WARN_ON(waitqueue_active(&rbd_dev->lock_waitq));
Ilya Dryomov99d16942016-08-12 16:11:41 +02003854 cancel_tasks_sync(rbd_dev);
3855
3856 mutex_lock(&rbd_dev->watch_mutex);
3857 if (rbd_dev->watch_state == RBD_WATCH_STATE_REGISTERED)
3858 __rbd_unregister_watch(rbd_dev);
3859 rbd_dev->watch_state = RBD_WATCH_STATE_UNREGISTERED;
3860 mutex_unlock(&rbd_dev->watch_mutex);
3861
Dongsheng Yang23edca82018-06-04 06:24:37 -04003862 cancel_delayed_work_sync(&rbd_dev->watch_dwork);
Ilya Dryomov811c6682016-04-15 16:22:16 +02003863 ceph_osdc_flush_notifies(&rbd_dev->rbd_client->client->osdc);
Ilya Dryomovfca27062013-12-16 18:02:40 +02003864}
3865
Ilya Dryomov14bb2112017-04-13 12:17:38 +02003866/*
3867 * lock_rwsem must be held for write
3868 */
3869static void rbd_reacquire_lock(struct rbd_device *rbd_dev)
3870{
3871 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
3872 char cookie[32];
3873 int ret;
3874
Ilya Dryomova2b1da02019-05-30 11:15:23 +02003875 if (!rbd_quiesce_lock(rbd_dev))
3876 return;
Ilya Dryomov14bb2112017-04-13 12:17:38 +02003877
3878 format_lock_cookie(rbd_dev, cookie);
3879 ret = ceph_cls_set_cookie(osdc, &rbd_dev->header_oid,
3880 &rbd_dev->header_oloc, RBD_LOCK_NAME,
3881 CEPH_CLS_LOCK_EXCLUSIVE, rbd_dev->lock_cookie,
3882 RBD_LOCK_TAG, cookie);
3883 if (ret) {
3884 if (ret != -EOPNOTSUPP)
3885 rbd_warn(rbd_dev, "failed to update lock cookie: %d",
3886 ret);
3887
3888 /*
3889 * Lock cookie cannot be updated on older OSDs, so do
3890 * a manual release and queue an acquire.
3891 */
Ilya Dryomove1fddc82019-05-30 16:07:48 +02003892 __rbd_release_lock(rbd_dev);
Ilya Dryomova2b1da02019-05-30 11:15:23 +02003893 queue_delayed_work(rbd_dev->task_wq, &rbd_dev->lock_dwork, 0);
Ilya Dryomov14bb2112017-04-13 12:17:38 +02003894 } else {
Florian Margaineedd8ca82017-12-13 16:43:59 +01003895 __rbd_lock(rbd_dev, cookie);
Ilya Dryomov14bb2112017-04-13 12:17:38 +02003896 }
3897}
3898
Ilya Dryomov99d16942016-08-12 16:11:41 +02003899static void rbd_reregister_watch(struct work_struct *work)
3900{
3901 struct rbd_device *rbd_dev = container_of(to_delayed_work(work),
3902 struct rbd_device, watch_dwork);
3903 int ret;
3904
3905 dout("%s rbd_dev %p\n", __func__, rbd_dev);
3906
3907 mutex_lock(&rbd_dev->watch_mutex);
Ilya Dryomov87c0fde2016-09-29 13:41:05 +02003908 if (rbd_dev->watch_state != RBD_WATCH_STATE_ERROR) {
3909 mutex_unlock(&rbd_dev->watch_mutex);
Ilya Dryomov14bb2112017-04-13 12:17:38 +02003910 return;
Ilya Dryomov87c0fde2016-09-29 13:41:05 +02003911 }
Ilya Dryomov99d16942016-08-12 16:11:41 +02003912
3913 ret = __rbd_register_watch(rbd_dev);
3914 if (ret) {
3915 rbd_warn(rbd_dev, "failed to reregister watch: %d", ret);
Ilya Dryomov4d736442016-09-29 14:23:12 +02003916 if (ret == -EBLACKLISTED || ret == -ENOENT) {
Ilya Dryomov87c0fde2016-09-29 13:41:05 +02003917 set_bit(RBD_DEV_FLAG_BLACKLISTED, &rbd_dev->flags);
Ilya Dryomov14bb2112017-04-13 12:17:38 +02003918 wake_requests(rbd_dev, true);
Ilya Dryomov87c0fde2016-09-29 13:41:05 +02003919 } else {
Ilya Dryomov99d16942016-08-12 16:11:41 +02003920 queue_delayed_work(rbd_dev->task_wq,
3921 &rbd_dev->watch_dwork,
3922 RBD_RETRY_DELAY);
Ilya Dryomov87c0fde2016-09-29 13:41:05 +02003923 }
3924 mutex_unlock(&rbd_dev->watch_mutex);
Ilya Dryomov14bb2112017-04-13 12:17:38 +02003925 return;
Ilya Dryomov99d16942016-08-12 16:11:41 +02003926 }
3927
3928 rbd_dev->watch_state = RBD_WATCH_STATE_REGISTERED;
3929 rbd_dev->watch_cookie = rbd_dev->watch_handle->linger_id;
3930 mutex_unlock(&rbd_dev->watch_mutex);
3931
Ilya Dryomov14bb2112017-04-13 12:17:38 +02003932 down_write(&rbd_dev->lock_rwsem);
3933 if (rbd_dev->lock_state == RBD_LOCK_STATE_LOCKED)
3934 rbd_reacquire_lock(rbd_dev);
3935 up_write(&rbd_dev->lock_rwsem);
3936
Ilya Dryomov99d16942016-08-12 16:11:41 +02003937 ret = rbd_dev_refresh(rbd_dev);
3938 if (ret)
Colin Ian Kingf6870cc2018-03-19 13:33:10 +00003939 rbd_warn(rbd_dev, "reregistration refresh failed: %d", ret);
Ilya Dryomov99d16942016-08-12 16:11:41 +02003940}
3941
Alex Elder36be9a72013-01-19 00:30:28 -06003942/*
Alex Elderf40eb342013-04-25 15:09:42 -05003943 * Synchronous osd object method call. Returns the number of bytes
3944 * returned in the inbound (reply) buffer, or a negative error code.
Alex Elder36be9a72013-01-19 00:30:28 -06003945 */
3946static int rbd_obj_method_sync(struct rbd_device *rbd_dev,
Ilya Dryomovecd4a682017-01-25 18:16:21 +01003947 struct ceph_object_id *oid,
3948 struct ceph_object_locator *oloc,
Alex Elder36be9a72013-01-19 00:30:28 -06003949 const char *method_name,
Alex Elder41579762013-04-21 12:14:45 -05003950 const void *outbound,
Alex Elder36be9a72013-01-19 00:30:28 -06003951 size_t outbound_size,
Alex Elder41579762013-04-21 12:14:45 -05003952 void *inbound,
Alex Eldere2a58ee2013-04-30 00:44:33 -05003953 size_t inbound_size)
Alex Elder36be9a72013-01-19 00:30:28 -06003954{
Ilya Dryomovecd4a682017-01-25 18:16:21 +01003955 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
3956 struct page *req_page = NULL;
3957 struct page *reply_page;
Alex Elder36be9a72013-01-19 00:30:28 -06003958 int ret;
3959
3960 /*
Alex Elder6010a452013-04-05 01:27:11 -05003961 * Method calls are ultimately read operations. The result
3962	 * should be placed into the inbound buffer provided. They
3963 * also supply outbound data--parameters for the object
3964 * method. Currently if this is present it will be a
3965 * snapshot id.
Alex Elder36be9a72013-01-19 00:30:28 -06003966 */
	if (outbound) {
		if (outbound_size > PAGE_SIZE)
			return -E2BIG;

		req_page = alloc_page(GFP_KERNEL);
		if (!req_page)
			return -ENOMEM;

		memcpy(page_address(req_page), outbound, outbound_size);
	}

	reply_page = alloc_page(GFP_KERNEL);
	if (!reply_page) {
		if (req_page)
			__free_page(req_page);
		return -ENOMEM;
	}

	ret = ceph_osdc_call(osdc, oid, oloc, RBD_DRV_NAME, method_name,
			     CEPH_OSD_FLAG_READ, req_page, outbound_size,
			     reply_page, &inbound_size);
	if (!ret) {
		memcpy(inbound, page_address(reply_page), inbound_size);
		ret = inbound_size;
	}

	if (req_page)
		__free_page(req_page);
	__free_page(reply_page);
	return ret;
}

/*
 * lock_rwsem must be held for read.  It is temporarily released while
 * waiting for the lock to be acquired: on a 0 return the image is
 * locked, otherwise -EBLACKLISTED, -EROFS or -ETIMEDOUT is returned.
 */
static int rbd_wait_state_locked(struct rbd_device *rbd_dev, bool may_acquire)
{
	DEFINE_WAIT(wait);
	unsigned long timeout;
	int ret = 0;

	if (test_bit(RBD_DEV_FLAG_BLACKLISTED, &rbd_dev->flags))
		return -EBLACKLISTED;

	if (rbd_dev->lock_state == RBD_LOCK_STATE_LOCKED)
		return 0;

	if (!may_acquire) {
		rbd_warn(rbd_dev, "exclusive lock required");
		return -EROFS;
	}

	do {
		/*
		 * Note the use of mod_delayed_work() in rbd_acquire_lock()
		 * and cancel_delayed_work() in wake_requests().
		 */
		dout("%s rbd_dev %p queueing lock_dwork\n", __func__, rbd_dev);
		queue_delayed_work(rbd_dev->task_wq, &rbd_dev->lock_dwork, 0);
		prepare_to_wait_exclusive(&rbd_dev->lock_waitq, &wait,
					  TASK_UNINTERRUPTIBLE);
		up_read(&rbd_dev->lock_rwsem);
		timeout = schedule_timeout(ceph_timeout_jiffies(
						rbd_dev->opts->lock_timeout));
		down_read(&rbd_dev->lock_rwsem);
		if (test_bit(RBD_DEV_FLAG_BLACKLISTED, &rbd_dev->flags)) {
			ret = -EBLACKLISTED;
			break;
		}
		if (!timeout) {
			rbd_warn(rbd_dev, "timed out waiting for lock");
			ret = -ETIMEDOUT;
			break;
		}
	} while (rbd_dev->lock_state != RBD_LOCK_STATE_LOCKED);

	finish_wait(&rbd_dev->lock_waitq, &wait);
	return ret;
}

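/*
 * Runs in workqueue context, once per block layer request.  Translates
 * the request into an image request: validates the range against the
 * mapped size, takes the exclusive lock if the image requires it, and
 * kicks off the object requests.
 */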
static void rbd_queue_workfn(struct work_struct *work)
{
	struct request *rq = blk_mq_rq_from_pdu(work);
	struct rbd_device *rbd_dev = rq->q->queuedata;
	struct rbd_img_request *img_request;
	struct ceph_snap_context *snapc = NULL;
	u64 offset = (u64)blk_rq_pos(rq) << SECTOR_SHIFT;
	u64 length = blk_rq_bytes(rq);
	enum obj_operation_type op_type;
	u64 mapping_size;
	bool must_be_locked;
	int result;

	switch (req_op(rq)) {
	case REQ_OP_DISCARD:
		op_type = OBJ_OP_DISCARD;
		break;
	case REQ_OP_WRITE_ZEROES:
		op_type = OBJ_OP_ZEROOUT;
		break;
	case REQ_OP_WRITE:
		op_type = OBJ_OP_WRITE;
		break;
	case REQ_OP_READ:
		op_type = OBJ_OP_READ;
		break;
	default:
		dout("%s: non-fs request type %d\n", __func__, req_op(rq));
		result = -EIO;
		goto err;
	}

	/* Ignore/skip any zero-length requests */

	if (!length) {
		dout("%s: zero-length request\n", __func__);
		result = 0;
		goto err_rq;
	}

	if (op_type != OBJ_OP_READ && rbd_dev->spec->snap_id != CEPH_NOSNAP) {
		rbd_warn(rbd_dev, "%s on read-only snapshot",
			 obj_op_name(op_type));
		result = -EIO;
		goto err;
	}

	/*
	 * Quit early if the mapped snapshot no longer exists.  It's
	 * still possible the snapshot will have disappeared by the
	 * time our request arrives at the osd, but there's no sense in
	 * sending it if we already know.
	 */
	if (!test_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags)) {
		dout("request for non-existent snapshot");
		rbd_assert(rbd_dev->spec->snap_id != CEPH_NOSNAP);
		result = -ENXIO;
		goto err_rq;
	}

	if (offset && length > U64_MAX - offset + 1) {
		rbd_warn(rbd_dev, "bad request range (%llu~%llu)", offset,
			 length);
		result = -EINVAL;
		goto err_rq;	/* Shouldn't happen */
	}

	blk_mq_start_request(rq);

	down_read(&rbd_dev->header_rwsem);
	mapping_size = rbd_dev->mapping.size;
	if (op_type != OBJ_OP_READ) {
		snapc = rbd_dev->header.snapc;
		ceph_get_snap_context(snapc);
	}
	up_read(&rbd_dev->header_rwsem);

	if (offset + length > mapping_size) {
		rbd_warn(rbd_dev, "beyond EOD (%llu~%llu > %llu)", offset,
			 length, mapping_size);
		result = -EIO;
		goto err_rq;
	}

	must_be_locked =
	    (rbd_dev->header.features & RBD_FEATURE_EXCLUSIVE_LOCK) &&
	    (op_type != OBJ_OP_READ || rbd_dev->opts->lock_on_read);
	if (must_be_locked) {
		down_read(&rbd_dev->lock_rwsem);
		result = rbd_wait_state_locked(rbd_dev,
					       !rbd_dev->opts->exclusive);
		if (result)
			goto err_unlock;
	}

	img_request = rbd_img_request_create(rbd_dev, op_type, snapc);
	if (!img_request) {
		result = -ENOMEM;
		goto err_unlock;
	}
	img_request->rq = rq;
	snapc = NULL; /* img_request consumes a ref */

	if (op_type == OBJ_OP_DISCARD || op_type == OBJ_OP_ZEROOUT)
		result = rbd_img_fill_nodata(img_request, offset, length);
	else
		result = rbd_img_fill_from_bio(img_request, offset, length,
					       rq->bio);
	if (result)
		goto err_img_request;

	if (must_be_locked) {
		rbd_lock_add_request(img_request);
		up_read(&rbd_dev->lock_rwsem);
	}

	rbd_img_handle_request(img_request, 0);
	return;

err_img_request:
	rbd_img_request_put(img_request);
err_unlock:
	if (must_be_locked)
		up_read(&rbd_dev->lock_rwsem);
err_rq:
	if (result)
		rbd_warn(rbd_dev, "%s %llx at %llx result %d",
			 obj_op_name(op_type), length, offset, result);
	ceph_put_snap_context(snapc);
err:
	blk_mq_end_request(rq, errno_to_blk_status(result));
}

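/*
 * blk-mq ->queue_rq() handler.  The real work sleeps (semaphores,
 * synchronous OSD calls), so the request is punted to the rbd
 * workqueue via the work_struct embedded in its PDU.
 */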
static blk_status_t rbd_queue_rq(struct blk_mq_hw_ctx *hctx,
		const struct blk_mq_queue_data *bd)
{
	struct request *rq = bd->rq;
	struct work_struct *work = blk_mq_rq_to_pdu(rq);

	queue_work(rbd_wq, work);
	return BLK_STS_OK;
}

static void rbd_free_disk(struct rbd_device *rbd_dev)
{
	blk_cleanup_queue(rbd_dev->disk->queue);
	blk_mq_free_tag_set(&rbd_dev->tag_set);
	put_disk(rbd_dev->disk);
	rbd_dev->disk = NULL;
}

static int rbd_obj_read_sync(struct rbd_device *rbd_dev,
			     struct ceph_object_id *oid,
			     struct ceph_object_locator *oloc,
			     void *buf, int buf_len)
{
	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
	struct ceph_osd_request *req;
	struct page **pages;
	int num_pages = calc_pages_for(0, buf_len);
	int ret;

	req = ceph_osdc_alloc_request(osdc, NULL, 1, false, GFP_KERNEL);
	if (!req)
		return -ENOMEM;

	ceph_oid_copy(&req->r_base_oid, oid);
	ceph_oloc_copy(&req->r_base_oloc, oloc);
	req->r_flags = CEPH_OSD_FLAG_READ;

	pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL);
	if (IS_ERR(pages)) {
		ret = PTR_ERR(pages);
		goto out_req;
	}

	osd_req_op_extent_init(req, 0, CEPH_OSD_OP_READ, 0, buf_len, 0, 0);
	osd_req_op_extent_osd_data_pages(req, 0, pages, buf_len, 0, false,
					 true);

	ret = ceph_osdc_alloc_messages(req, GFP_KERNEL);
	if (ret)
		goto out_req;

	ceph_osdc_start_request(osdc, req, false);
	ret = ceph_osdc_wait_request(osdc, req);
	if (ret >= 0)
		ceph_copy_from_page_vector(pages, buf, 0, ret);

out_req:
	ceph_osdc_put_request(req);
	return ret;
}

/*
 * Read the complete header for the given rbd device.  On successful
 * return, the rbd_dev->header field will contain up-to-date
 * information about the image.
 */
static int rbd_dev_v1_header_info(struct rbd_device *rbd_dev)
{
	struct rbd_image_header_ondisk *ondisk = NULL;
	u32 snap_count = 0;
	u64 names_size = 0;
	u32 want_count;
	int ret;

	/*
	 * The complete header will include an array of its 64-bit
	 * snapshot ids, followed by the names of those snapshots as
	 * a contiguous block of NUL-terminated strings.  Note that
	 * the number of snapshots could change by the time we read
	 * it in, in which case we re-read it.
	 */
	do {
		size_t size;

		kfree(ondisk);

		size = sizeof (*ondisk);
		size += snap_count * sizeof (struct rbd_image_snap_ondisk);
		size += names_size;
		ondisk = kmalloc(size, GFP_KERNEL);
		if (!ondisk)
			return -ENOMEM;

		ret = rbd_obj_read_sync(rbd_dev, &rbd_dev->header_oid,
					&rbd_dev->header_oloc, ondisk, size);
		if (ret < 0)
			goto out;
		if ((size_t)ret < size) {
			ret = -ENXIO;
			rbd_warn(rbd_dev, "short header read (want %zd got %d)",
				 size, ret);
			goto out;
		}
		if (!rbd_dev_ondisk_valid(ondisk)) {
			ret = -ENXIO;
			rbd_warn(rbd_dev, "invalid header");
			goto out;
		}

		names_size = le64_to_cpu(ondisk->snap_names_len);
		want_count = snap_count;
		snap_count = le32_to_cpu(ondisk->snap_count);
	} while (snap_count != want_count);

	ret = rbd_header_from_disk(rbd_dev, ondisk);
out:
	kfree(ondisk);

	return ret;
}

/*
 * Clear the rbd device's EXISTS flag if the snapshot it's mapped to
 * has disappeared from the (just updated) snapshot context.
 */
static void rbd_exists_validate(struct rbd_device *rbd_dev)
{
	u64 snap_id;

	if (!test_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags))
		return;

	snap_id = rbd_dev->spec->snap_id;
	if (snap_id == CEPH_NOSNAP)
		return;

	if (rbd_dev_snap_index(rbd_dev, snap_id) == BAD_SNAP_INDEX)
		clear_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags);
}

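/*
 * Propagate the current mapping size to the block layer by updating
 * the gendisk capacity and revalidating the disk.
 */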
static void rbd_dev_update_size(struct rbd_device *rbd_dev)
{
	sector_t size;

	/*
	 * If EXISTS is not set, rbd_dev->disk may be NULL, so don't
	 * try to update its size.  If REMOVING is set, updating size
	 * is just useless work since the device can't be opened.
	 */
	if (test_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags) &&
	    !test_bit(RBD_DEV_FLAG_REMOVING, &rbd_dev->flags)) {
		size = (sector_t)rbd_dev->mapping.size / SECTOR_SIZE;
		dout("setting size to %llu sectors", (unsigned long long)size);
		set_capacity(rbd_dev->disk, size);
		revalidate_disk(rbd_dev->disk);
	}
}

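/*
 * Re-read the image header and, for a layered image, the parent info.
 * Called after watch reregistration and on explicit refresh via the
 * sysfs "refresh" attribute; updates the disk size if it changed.
 */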
static int rbd_dev_refresh(struct rbd_device *rbd_dev)
{
	u64 mapping_size;
	int ret;

	down_write(&rbd_dev->header_rwsem);
	mapping_size = rbd_dev->mapping.size;

	ret = rbd_dev_header_info(rbd_dev);
	if (ret)
		goto out;

	/*
	 * If there is a parent, see if it has disappeared due to the
	 * mapped image getting flattened.
	 */
	if (rbd_dev->parent) {
		ret = rbd_dev_v2_parent_info(rbd_dev);
		if (ret)
			goto out;
	}

	if (rbd_dev->spec->snap_id == CEPH_NOSNAP) {
		rbd_dev->mapping.size = rbd_dev->header.image_size;
	} else {
		/* validate mapped snapshot's EXISTS flag */
		rbd_exists_validate(rbd_dev);
	}

out:
	up_write(&rbd_dev->header_rwsem);
	if (!ret && mapping_size != rbd_dev->mapping.size)
		rbd_dev_update_size(rbd_dev);

	return ret;
}

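/*
 * blk-mq ->init_request() hook: initialize the per-request work item
 * that rbd_queue_rq() hands off to the workqueue.
 */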
static int rbd_init_request(struct blk_mq_tag_set *set, struct request *rq,
		unsigned int hctx_idx, unsigned int numa_node)
{
	struct work_struct *work = blk_mq_rq_to_pdu(rq);

	INIT_WORK(work, rbd_queue_workfn);
	return 0;
}

static const struct blk_mq_ops rbd_mq_ops = {
	.queue_rq	= rbd_queue_rq,
	.init_request	= rbd_init_request,
};

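/*
 * Set up the gendisk and the blk-mq queue.  Request size limits are
 * derived from the object set size (object size * stripe count);
 * io_min/io_opt and the discard granularity come from the mapping's
 * alloc_size option.
 */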
static int rbd_init_disk(struct rbd_device *rbd_dev)
{
	struct gendisk *disk;
	struct request_queue *q;
	unsigned int objset_bytes =
	    rbd_dev->layout.object_size * rbd_dev->layout.stripe_count;
	int err;

	/* create gendisk info */
	disk = alloc_disk(single_major ?
			  (1 << RBD_SINGLE_MAJOR_PART_SHIFT) :
			  RBD_MINORS_PER_MAJOR);
	if (!disk)
		return -ENOMEM;

	snprintf(disk->disk_name, sizeof(disk->disk_name), RBD_DRV_NAME "%d",
		 rbd_dev->dev_id);
	disk->major = rbd_dev->major;
	disk->first_minor = rbd_dev->minor;
	if (single_major)
		disk->flags |= GENHD_FL_EXT_DEVT;
	disk->fops = &rbd_bd_ops;
	disk->private_data = rbd_dev;

	memset(&rbd_dev->tag_set, 0, sizeof(rbd_dev->tag_set));
	rbd_dev->tag_set.ops = &rbd_mq_ops;
	rbd_dev->tag_set.queue_depth = rbd_dev->opts->queue_depth;
	rbd_dev->tag_set.numa_node = NUMA_NO_NODE;
	rbd_dev->tag_set.flags = BLK_MQ_F_SHOULD_MERGE;
	rbd_dev->tag_set.nr_hw_queues = 1;
	rbd_dev->tag_set.cmd_size = sizeof(struct work_struct);

	err = blk_mq_alloc_tag_set(&rbd_dev->tag_set);
	if (err)
		goto out_disk;

	q = blk_mq_init_queue(&rbd_dev->tag_set);
	if (IS_ERR(q)) {
		err = PTR_ERR(q);
		goto out_tag_set;
	}

	blk_queue_flag_set(QUEUE_FLAG_NONROT, q);
	/* QUEUE_FLAG_ADD_RANDOM is off by default for blk-mq */

	blk_queue_max_hw_sectors(q, objset_bytes >> SECTOR_SHIFT);
	q->limits.max_sectors = queue_max_hw_sectors(q);
	blk_queue_max_segments(q, USHRT_MAX);
	blk_queue_max_segment_size(q, UINT_MAX);
	blk_queue_io_min(q, rbd_dev->opts->alloc_size);
	blk_queue_io_opt(q, rbd_dev->opts->alloc_size);

	if (rbd_dev->opts->trim) {
		blk_queue_flag_set(QUEUE_FLAG_DISCARD, q);
		q->limits.discard_granularity = rbd_dev->opts->alloc_size;
		blk_queue_max_discard_sectors(q, objset_bytes >> SECTOR_SHIFT);
		blk_queue_max_write_zeroes_sectors(q, objset_bytes >> SECTOR_SHIFT);
	}

	if (!ceph_test_opt(rbd_dev->rbd_client->client, NOCRC))
		q->backing_dev_info->capabilities |= BDI_CAP_STABLE_WRITES;

	/*
	 * disk_release() expects a queue ref from add_disk() and will
	 * put it.  Hold an extra ref until add_disk() is called.
	 */
	WARN_ON(!blk_get_queue(q));
	disk->queue = q;
	q->queuedata = rbd_dev;

	rbd_dev->disk = disk;

	return 0;
out_tag_set:
	blk_mq_free_tag_set(&rbd_dev->tag_set);
out_disk:
	put_disk(disk);
	return err;
}

/*
  sysfs
*/

static struct rbd_device *dev_to_rbd_dev(struct device *dev)
{
	return container_of(dev, struct rbd_device, dev);
}

static ssize_t rbd_size_show(struct device *dev,
			     struct device_attribute *attr, char *buf)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);

	return sprintf(buf, "%llu\n",
		       (unsigned long long)rbd_dev->mapping.size);
}

/*
 * Note this shows the features for whatever's mapped, which is not
 * necessarily the base image.
 */
static ssize_t rbd_features_show(struct device *dev,
				 struct device_attribute *attr, char *buf)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);

	return sprintf(buf, "0x%016llx\n",
		       (unsigned long long)rbd_dev->mapping.features);
}

static ssize_t rbd_major_show(struct device *dev,
			      struct device_attribute *attr, char *buf)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);

	if (rbd_dev->major)
		return sprintf(buf, "%d\n", rbd_dev->major);

	return sprintf(buf, "(none)\n");
}

static ssize_t rbd_minor_show(struct device *dev,
			      struct device_attribute *attr, char *buf)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);

	return sprintf(buf, "%d\n", rbd_dev->minor);
}

static ssize_t rbd_client_addr_show(struct device *dev,
				    struct device_attribute *attr, char *buf)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
	struct ceph_entity_addr *client_addr =
	    ceph_client_addr(rbd_dev->rbd_client->client);

	return sprintf(buf, "%pISpc/%u\n", &client_addr->in_addr,
		       le32_to_cpu(client_addr->nonce));
}

static ssize_t rbd_client_id_show(struct device *dev,
				  struct device_attribute *attr, char *buf)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);

	return sprintf(buf, "client%lld\n",
		       ceph_client_gid(rbd_dev->rbd_client->client));
}

static ssize_t rbd_cluster_fsid_show(struct device *dev,
				     struct device_attribute *attr, char *buf)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);

	return sprintf(buf, "%pU\n", &rbd_dev->rbd_client->client->fsid);
}

static ssize_t rbd_config_info_show(struct device *dev,
				    struct device_attribute *attr, char *buf)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);

	return sprintf(buf, "%s\n", rbd_dev->config_info);
}

static ssize_t rbd_pool_show(struct device *dev,
			     struct device_attribute *attr, char *buf)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);

	return sprintf(buf, "%s\n", rbd_dev->spec->pool_name);
}

static ssize_t rbd_pool_id_show(struct device *dev,
				struct device_attribute *attr, char *buf)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);

	return sprintf(buf, "%llu\n",
		       (unsigned long long) rbd_dev->spec->pool_id);
}

static ssize_t rbd_pool_ns_show(struct device *dev,
				struct device_attribute *attr, char *buf)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);

	return sprintf(buf, "%s\n", rbd_dev->spec->pool_ns ?: "");
}

static ssize_t rbd_name_show(struct device *dev,
			     struct device_attribute *attr, char *buf)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);

	if (rbd_dev->spec->image_name)
		return sprintf(buf, "%s\n", rbd_dev->spec->image_name);

	return sprintf(buf, "(unknown)\n");
}

static ssize_t rbd_image_id_show(struct device *dev,
				 struct device_attribute *attr, char *buf)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);

	return sprintf(buf, "%s\n", rbd_dev->spec->image_id);
}

/*
 * Shows the name of the currently-mapped snapshot (or
 * RBD_SNAP_HEAD_NAME for the base image).
 */
static ssize_t rbd_snap_show(struct device *dev,
			     struct device_attribute *attr,
			     char *buf)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);

	return sprintf(buf, "%s\n", rbd_dev->spec->snap_name);
}

static ssize_t rbd_snap_id_show(struct device *dev,
				struct device_attribute *attr, char *buf)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);

	return sprintf(buf, "%llu\n", rbd_dev->spec->snap_id);
}

/*
 * For a v2 image, shows the chain of parent images, separated by empty
 * lines.  For v1 images or if there is no parent, shows "(no parent
 * image)".
 */
static ssize_t rbd_parent_show(struct device *dev,
			       struct device_attribute *attr,
			       char *buf)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
	ssize_t count = 0;

	if (!rbd_dev->parent)
		return sprintf(buf, "(no parent image)\n");

	for ( ; rbd_dev->parent; rbd_dev = rbd_dev->parent) {
		struct rbd_spec *spec = rbd_dev->parent_spec;

		count += sprintf(&buf[count], "%s"
			    "pool_id %llu\npool_name %s\n"
			    "pool_ns %s\n"
			    "image_id %s\nimage_name %s\n"
			    "snap_id %llu\nsnap_name %s\n"
			    "overlap %llu\n",
			    !count ? "" : "\n", /* first? */
			    spec->pool_id, spec->pool_name,
			    spec->pool_ns ?: "",
			    spec->image_id, spec->image_name ?: "(unknown)",
			    spec->snap_id, spec->snap_name,
			    rbd_dev->parent_overlap);
	}

	return count;
}

static ssize_t rbd_image_refresh(struct device *dev,
				 struct device_attribute *attr,
				 const char *buf,
				 size_t size)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
	int ret;

	ret = rbd_dev_refresh(rbd_dev);
	if (ret)
		return ret;

	return size;
}

static DEVICE_ATTR(size, 0444, rbd_size_show, NULL);
static DEVICE_ATTR(features, 0444, rbd_features_show, NULL);
static DEVICE_ATTR(major, 0444, rbd_major_show, NULL);
static DEVICE_ATTR(minor, 0444, rbd_minor_show, NULL);
static DEVICE_ATTR(client_addr, 0444, rbd_client_addr_show, NULL);
static DEVICE_ATTR(client_id, 0444, rbd_client_id_show, NULL);
static DEVICE_ATTR(cluster_fsid, 0444, rbd_cluster_fsid_show, NULL);
static DEVICE_ATTR(config_info, 0400, rbd_config_info_show, NULL);
static DEVICE_ATTR(pool, 0444, rbd_pool_show, NULL);
static DEVICE_ATTR(pool_id, 0444, rbd_pool_id_show, NULL);
static DEVICE_ATTR(pool_ns, 0444, rbd_pool_ns_show, NULL);
static DEVICE_ATTR(name, 0444, rbd_name_show, NULL);
static DEVICE_ATTR(image_id, 0444, rbd_image_id_show, NULL);
static DEVICE_ATTR(refresh, 0200, NULL, rbd_image_refresh);
static DEVICE_ATTR(current_snap, 0444, rbd_snap_show, NULL);
static DEVICE_ATTR(snap_id, 0444, rbd_snap_id_show, NULL);
static DEVICE_ATTR(parent, 0444, rbd_parent_show, NULL);

static struct attribute *rbd_attrs[] = {
	&dev_attr_size.attr,
	&dev_attr_features.attr,
	&dev_attr_major.attr,
	&dev_attr_minor.attr,
	&dev_attr_client_addr.attr,
	&dev_attr_client_id.attr,
	&dev_attr_cluster_fsid.attr,
	&dev_attr_config_info.attr,
	&dev_attr_pool.attr,
	&dev_attr_pool_id.attr,
	&dev_attr_pool_ns.attr,
	&dev_attr_name.attr,
	&dev_attr_image_id.attr,
	&dev_attr_current_snap.attr,
	&dev_attr_snap_id.attr,
	&dev_attr_parent.attr,
	&dev_attr_refresh.attr,
	NULL
};

static struct attribute_group rbd_attr_group = {
	.attrs = rbd_attrs,
};

static const struct attribute_group *rbd_attr_groups[] = {
	&rbd_attr_group,
	NULL
};

static void rbd_dev_release(struct device *dev);

static const struct device_type rbd_device_type = {
	.name		= "rbd",
	.groups		= rbd_attr_groups,
	.release	= rbd_dev_release,
};

static struct rbd_spec *rbd_spec_get(struct rbd_spec *spec)
{
	kref_get(&spec->kref);

	return spec;
}

static void rbd_spec_free(struct kref *kref);
static void rbd_spec_put(struct rbd_spec *spec)
{
	if (spec)
		kref_put(&spec->kref, rbd_spec_free);
}

static struct rbd_spec *rbd_spec_alloc(void)
{
	struct rbd_spec *spec;

	spec = kzalloc(sizeof (*spec), GFP_KERNEL);
	if (!spec)
		return NULL;

	spec->pool_id = CEPH_NOPOOL;
	spec->snap_id = CEPH_NOSNAP;
	kref_init(&spec->kref);

	return spec;
}

static void rbd_spec_free(struct kref *kref)
{
	struct rbd_spec *spec = container_of(kref, struct rbd_spec, kref);

	kfree(spec->pool_name);
	kfree(spec->pool_ns);
	kfree(spec->image_id);
	kfree(spec->image_name);
	kfree(spec->snap_name);
	kfree(spec);
}

static void rbd_dev_free(struct rbd_device *rbd_dev)
{
	WARN_ON(rbd_dev->watch_state != RBD_WATCH_STATE_UNREGISTERED);
	WARN_ON(rbd_dev->lock_state != RBD_LOCK_STATE_UNLOCKED);

	ceph_oid_destroy(&rbd_dev->header_oid);
	ceph_oloc_destroy(&rbd_dev->header_oloc);
	kfree(rbd_dev->config_info);

	rbd_put_client(rbd_dev->rbd_client);
	rbd_spec_put(rbd_dev->spec);
	kfree(rbd_dev->opts);
	kfree(rbd_dev);
}

static void rbd_dev_release(struct device *dev)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
	bool need_put = !!rbd_dev->opts;

	if (need_put) {
		destroy_workqueue(rbd_dev->task_wq);
		ida_simple_remove(&rbd_dev_id_ida, rbd_dev->dev_id);
	}

	rbd_dev_free(rbd_dev);

	/*
	 * This is racy, but way better than putting the module_put()
	 * outside of the release callback.  The race window is pretty
	 * small, so doing something similar to dm (dm-builtin.c) is
	 * overkill.
	 */
	if (need_put)
		module_put(THIS_MODULE);
}

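/*
 * Allocate an rbd_device and initialize the parts that don't depend on
 * the device id: locks, work items, the header object locator and the
 * embedded struct device.  rbd_dev_create() below adds the id, name
 * and per-device task workqueue.
 */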
static struct rbd_device *__rbd_dev_create(struct rbd_client *rbdc,
					   struct rbd_spec *spec)
{
	struct rbd_device *rbd_dev;

	rbd_dev = kzalloc(sizeof(*rbd_dev), GFP_KERNEL);
	if (!rbd_dev)
		return NULL;

	spin_lock_init(&rbd_dev->lock);
	INIT_LIST_HEAD(&rbd_dev->node);
	init_rwsem(&rbd_dev->header_rwsem);

	rbd_dev->header.data_pool_id = CEPH_NOPOOL;
	ceph_oid_init(&rbd_dev->header_oid);
	rbd_dev->header_oloc.pool = spec->pool_id;
	if (spec->pool_ns) {
		WARN_ON(!*spec->pool_ns);
		rbd_dev->header_oloc.pool_ns =
		    ceph_find_or_create_string(spec->pool_ns,
					       strlen(spec->pool_ns));
	}

	mutex_init(&rbd_dev->watch_mutex);
	rbd_dev->watch_state = RBD_WATCH_STATE_UNREGISTERED;
	INIT_DELAYED_WORK(&rbd_dev->watch_dwork, rbd_reregister_watch);

	init_rwsem(&rbd_dev->lock_rwsem);
	rbd_dev->lock_state = RBD_LOCK_STATE_UNLOCKED;
	INIT_WORK(&rbd_dev->acquired_lock_work, rbd_notify_acquired_lock);
	INIT_WORK(&rbd_dev->released_lock_work, rbd_notify_released_lock);
	INIT_DELAYED_WORK(&rbd_dev->lock_dwork, rbd_acquire_lock);
	INIT_WORK(&rbd_dev->unlock_work, rbd_release_lock_work);
	spin_lock_init(&rbd_dev->lock_lists_lock);
	INIT_LIST_HEAD(&rbd_dev->running_list);
	init_completion(&rbd_dev->releasing_wait);
	init_waitqueue_head(&rbd_dev->lock_waitq);

	rbd_dev->dev.bus = &rbd_bus_type;
	rbd_dev->dev.type = &rbd_device_type;
	rbd_dev->dev.parent = &rbd_root_dev;
	device_initialize(&rbd_dev->dev);

	rbd_dev->rbd_client = rbdc;
	rbd_dev->spec = spec;

	return rbd_dev;
}

/*
 * Create a mapping rbd_dev.
 */
static struct rbd_device *rbd_dev_create(struct rbd_client *rbdc,
					 struct rbd_spec *spec,
					 struct rbd_options *opts)
{
	struct rbd_device *rbd_dev;

	rbd_dev = __rbd_dev_create(rbdc, spec);
	if (!rbd_dev)
		return NULL;

	rbd_dev->opts = opts;

	/* get an id and fill in device name */
	rbd_dev->dev_id = ida_simple_get(&rbd_dev_id_ida, 0,
					 minor_to_rbd_dev_id(1 << MINORBITS),
					 GFP_KERNEL);
	if (rbd_dev->dev_id < 0)
		goto fail_rbd_dev;

	sprintf(rbd_dev->name, RBD_DRV_NAME "%d", rbd_dev->dev_id);
	rbd_dev->task_wq = alloc_ordered_workqueue("%s-tasks", WQ_MEM_RECLAIM,
						   rbd_dev->name);
	if (!rbd_dev->task_wq)
		goto fail_dev_id;

	/* we have a ref from do_rbd_add() */
	__module_get(THIS_MODULE);

	dout("%s rbd_dev %p dev_id %d\n", __func__, rbd_dev, rbd_dev->dev_id);
	return rbd_dev;

fail_dev_id:
	ida_simple_remove(&rbd_dev_id_ida, rbd_dev->dev_id);
fail_rbd_dev:
	rbd_dev_free(rbd_dev);
	return NULL;
}

static void rbd_dev_destroy(struct rbd_device *rbd_dev)
{
	if (rbd_dev)
		put_device(&rbd_dev->dev);
}

/*
 * Get the size and object order for an image snapshot, or if
 * snap_id is CEPH_NOSNAP, gets this information for the base
 * image.
 */
static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id,
				 u8 *order, u64 *snap_size)
{
	__le64 snapid = cpu_to_le64(snap_id);
	int ret;
	struct {
		u8 order;
		__le64 size;
	} __attribute__ ((packed)) size_buf = { 0 };

	ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
				  &rbd_dev->header_oloc, "get_size",
				  &snapid, sizeof(snapid),
				  &size_buf, sizeof(size_buf));
	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
	if (ret < 0)
		return ret;
	if (ret < sizeof (size_buf))
		return -ERANGE;

	if (order) {
		*order = size_buf.order;
		dout(" order %u", (unsigned int)*order);
	}
	*snap_size = le64_to_cpu(size_buf.size);

	dout(" snap_id 0x%016llx snap_size = %llu\n",
	     (unsigned long long)snap_id,
	     (unsigned long long)*snap_size);

	return 0;
}

static int rbd_dev_v2_image_size(struct rbd_device *rbd_dev)
{
	return _rbd_dev_v2_snap_size(rbd_dev, CEPH_NOSNAP,
				     &rbd_dev->header.obj_order,
				     &rbd_dev->header.image_size);
}

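/*
 * Fetch the name prefix shared by the image's data objects via the
 * "get_object_prefix" class method.
 */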
static int rbd_dev_v2_object_prefix(struct rbd_device *rbd_dev)
{
	void *reply_buf;
	int ret;
	void *p;

	reply_buf = kzalloc(RBD_OBJ_PREFIX_LEN_MAX, GFP_KERNEL);
	if (!reply_buf)
		return -ENOMEM;

	ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
				  &rbd_dev->header_oloc, "get_object_prefix",
				  NULL, 0, reply_buf, RBD_OBJ_PREFIX_LEN_MAX);
	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
	if (ret < 0)
		goto out;

	p = reply_buf;
	rbd_dev->header.object_prefix = ceph_extract_encoded_string(&p,
						p + ret, NULL, GFP_NOIO);
	ret = 0;

	if (IS_ERR(rbd_dev->header.object_prefix)) {
		ret = PTR_ERR(rbd_dev->header.object_prefix);
		rbd_dev->header.object_prefix = NULL;
	} else {
		dout(" object_prefix = %s\n", rbd_dev->header.object_prefix);
	}
out:
	kfree(reply_buf);

	return ret;
}

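/*
 * Query the feature bits for an image snapshot (or for the base image
 * if snap_id is CEPH_NOSNAP).  Fails with -ENXIO if the image uses
 * incompat features this driver does not support.
 */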
static int _rbd_dev_v2_snap_features(struct rbd_device *rbd_dev, u64 snap_id,
				     u64 *snap_features)
{
	__le64 snapid = cpu_to_le64(snap_id);
	struct {
		__le64 features;
		__le64 incompat;
	} __attribute__ ((packed)) features_buf = { 0 };
	u64 unsup;
	int ret;

	ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
				  &rbd_dev->header_oloc, "get_features",
				  &snapid, sizeof(snapid),
				  &features_buf, sizeof(features_buf));
	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
	if (ret < 0)
		return ret;
	if (ret < sizeof (features_buf))
		return -ERANGE;

	unsup = le64_to_cpu(features_buf.incompat) & ~RBD_FEATURES_SUPPORTED;
	if (unsup) {
		rbd_warn(rbd_dev, "image uses unsupported features: 0x%llx",
			 unsup);
		return -ENXIO;
	}

	*snap_features = le64_to_cpu(features_buf.features);

	dout(" snap_id 0x%016llx features = 0x%016llx incompat = 0x%016llx\n",
	     (unsigned long long)snap_id,
	     (unsigned long long)*snap_features,
	     (unsigned long long)le64_to_cpu(features_buf.incompat));

	return 0;
}

static int rbd_dev_v2_features(struct rbd_device *rbd_dev)
{
	return _rbd_dev_v2_snap_features(rbd_dev, CEPH_NOSNAP,
					 &rbd_dev->header.features);
}

struct parent_image_info {
	u64		pool_id;
	const char	*pool_ns;
	const char	*image_id;
	u64		snap_id;

	bool		has_overlap;
	u64		overlap;
};

/*
 * The caller is responsible for @pii.
 */
static int decode_parent_image_spec(void **p, void *end,
				    struct parent_image_info *pii)
{
	u8 struct_v;
	u32 struct_len;
	int ret;

	ret = ceph_start_decoding(p, end, 1, "ParentImageSpec",
				  &struct_v, &struct_len);
	if (ret)
		return ret;

	ceph_decode_64_safe(p, end, pii->pool_id, e_inval);
	pii->pool_ns = ceph_extract_encoded_string(p, end, NULL, GFP_KERNEL);
	if (IS_ERR(pii->pool_ns)) {
		ret = PTR_ERR(pii->pool_ns);
		pii->pool_ns = NULL;
		return ret;
	}
	pii->image_id = ceph_extract_encoded_string(p, end, NULL, GFP_KERNEL);
	if (IS_ERR(pii->image_id)) {
		ret = PTR_ERR(pii->image_id);
		pii->image_id = NULL;
		return ret;
	}
	ceph_decode_64_safe(p, end, pii->snap_id, e_inval);
	return 0;

e_inval:
	return -EINVAL;
}

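/*
 * Fetch the parent spec and overlap using the "parent_get" and
 * "parent_overlap_get" class methods.  Returns 1 if the OSD doesn't
 * support them, letting the caller fall back to the legacy
 * "get_parent" method.
 */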
static int __get_parent_info(struct rbd_device *rbd_dev,
			     struct page *req_page,
			     struct page *reply_page,
			     struct parent_image_info *pii)
{
	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
	size_t reply_len = PAGE_SIZE;
	void *p, *end;
	int ret;

	ret = ceph_osdc_call(osdc, &rbd_dev->header_oid, &rbd_dev->header_oloc,
			     "rbd", "parent_get", CEPH_OSD_FLAG_READ,
			     req_page, sizeof(u64), reply_page, &reply_len);
	if (ret)
		return ret == -EOPNOTSUPP ? 1 : ret;

	p = page_address(reply_page);
	end = p + reply_len;
	ret = decode_parent_image_spec(&p, end, pii);
	if (ret)
		return ret;

	ret = ceph_osdc_call(osdc, &rbd_dev->header_oid, &rbd_dev->header_oloc,
			     "rbd", "parent_overlap_get", CEPH_OSD_FLAG_READ,
			     req_page, sizeof(u64), reply_page, &reply_len);
	if (ret)
		return ret;

	p = page_address(reply_page);
	end = p + reply_len;
	ceph_decode_8_safe(&p, end, pii->has_overlap, e_inval);
	if (pii->has_overlap)
		ceph_decode_64_safe(&p, end, pii->overlap, e_inval);

	return 0;

e_inval:
	return -EINVAL;
}

/*
 * The caller is responsible for @pii.
 */
static int __get_parent_info_legacy(struct rbd_device *rbd_dev,
				    struct page *req_page,
				    struct page *reply_page,
				    struct parent_image_info *pii)
{
	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
	size_t reply_len = PAGE_SIZE;
	void *p, *end;
	int ret;

	ret = ceph_osdc_call(osdc, &rbd_dev->header_oid, &rbd_dev->header_oloc,
			     "rbd", "get_parent", CEPH_OSD_FLAG_READ,
			     req_page, sizeof(u64), reply_page, &reply_len);
	if (ret)
		return ret;

	p = page_address(reply_page);
	end = p + reply_len;
	ceph_decode_64_safe(&p, end, pii->pool_id, e_inval);
	pii->image_id = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL);
	if (IS_ERR(pii->image_id)) {
		ret = PTR_ERR(pii->image_id);
		pii->image_id = NULL;
		return ret;
	}
	ceph_decode_64_safe(&p, end, pii->snap_id, e_inval);
	pii->has_overlap = true;
	ceph_decode_64_safe(&p, end, pii->overlap, e_inval);

	return 0;

e_inval:
	return -EINVAL;
}

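/*
 * Encode the mapped snapshot id as the sole request argument, then try
 * the newer parent_get interface and fall back to get_parent if the
 * OSD is too old.
 */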
static int get_parent_info(struct rbd_device *rbd_dev,
			   struct parent_image_info *pii)
{
	struct page *req_page, *reply_page;
	void *p;
	int ret;

	req_page = alloc_page(GFP_KERNEL);
	if (!req_page)
		return -ENOMEM;

	reply_page = alloc_page(GFP_KERNEL);
	if (!reply_page) {
		__free_page(req_page);
		return -ENOMEM;
	}

	p = page_address(req_page);
	ceph_encode_64(&p, rbd_dev->spec->snap_id);
	ret = __get_parent_info(rbd_dev, req_page, reply_page, pii);
	if (ret > 0)
		ret = __get_parent_info_legacy(rbd_dev, req_page, reply_page,
					       pii);

	__free_page(req_page);
	__free_page(reply_page);
	return ret;
}

static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev)
{
	struct rbd_spec *parent_spec;
	struct parent_image_info pii = { 0 };
	int ret;

	parent_spec = rbd_spec_alloc();
	if (!parent_spec)
		return -ENOMEM;

	ret = get_parent_info(rbd_dev, &pii);
	if (ret)
		goto out_err;

	dout("%s pool_id %llu pool_ns %s image_id %s snap_id %llu has_overlap %d overlap %llu\n",
	     __func__, pii.pool_id, pii.pool_ns, pii.image_id, pii.snap_id,
	     pii.has_overlap, pii.overlap);

	if (pii.pool_id == CEPH_NOPOOL || !pii.has_overlap) {
		/*
		 * Either the parent never existed, or we have a
		 * record of it but the image got flattened so it no
		 * longer has a parent.  When the parent of a
		 * layered image disappears we immediately set the
		 * overlap to 0.  The effect of this is that all new
		 * requests will be treated as if the image had no
		 * parent.
		 *
		 * If !pii.has_overlap, the parent image spec is not
		 * applicable.  It's there to avoid duplication in each
		 * snapshot record.
		 */
		if (rbd_dev->parent_overlap) {
			rbd_dev->parent_overlap = 0;
			rbd_dev_parent_put(rbd_dev);
			pr_info("%s: clone image has been flattened\n",
				rbd_dev->disk->disk_name);
		}

		goto out;	/* No parent?  No problem. */
	}

	/* The ceph file layout needs to fit pool id in 32 bits */

	ret = -EIO;
	if (pii.pool_id > (u64)U32_MAX) {
		rbd_warn(NULL, "parent pool id too large (%llu > %u)",
			 (unsigned long long)pii.pool_id, U32_MAX);
		goto out_err;
	}

	/*
	 * The parent won't change (except when the clone is
	 * flattened, already handled that).  So we only need to
	 * record the parent spec if we have not already done so.
	 */
5231 if (!rbd_dev->parent_spec) {
Ilya Dryomoveb3b2d62018-08-22 17:11:27 +02005232 parent_spec->pool_id = pii.pool_id;
Ilya Dryomove92c0ea2018-08-22 17:26:10 +02005233 if (pii.pool_ns && *pii.pool_ns) {
5234 parent_spec->pool_ns = pii.pool_ns;
5235 pii.pool_ns = NULL;
5236 }
Ilya Dryomoveb3b2d62018-08-22 17:11:27 +02005237 parent_spec->image_id = pii.image_id;
5238 pii.image_id = NULL;
5239 parent_spec->snap_id = pii.snap_id;
Ilya Dryomovb26c0472018-07-03 15:28:43 +02005240
Alex Elder70cf49c2013-05-06 17:40:33 -05005241 rbd_dev->parent_spec = parent_spec;
5242 parent_spec = NULL; /* rbd_dev now owns this */
Alex Elder3b5cf2a2013-05-29 11:18:59 -05005243 }
5244
5245 /*
Ilya Dryomovcf32bd92015-01-19 22:57:39 +03005246 * We always update the parent overlap. If it's zero we issue
5247 * a warning, as we will proceed as if there was no parent.
Alex Elder3b5cf2a2013-05-29 11:18:59 -05005248 */
Ilya Dryomoveb3b2d62018-08-22 17:11:27 +02005249 if (!pii.overlap) {
Alex Elder3b5cf2a2013-05-29 11:18:59 -05005250 if (parent_spec) {
Ilya Dryomovcf32bd92015-01-19 22:57:39 +03005251 /* refresh, careful to warn just once */
5252 if (rbd_dev->parent_overlap)
5253 rbd_warn(rbd_dev,
5254 "clone now standalone (overlap became 0)");
Alex Elder3b5cf2a2013-05-29 11:18:59 -05005255 } else {
Ilya Dryomovcf32bd92015-01-19 22:57:39 +03005256 /* initial probe */
5257 rbd_warn(rbd_dev, "clone is standalone (overlap 0)");
Alex Elder3b5cf2a2013-05-29 11:18:59 -05005258 }
Alex Elder70cf49c2013-05-06 17:40:33 -05005259 }
Ilya Dryomoveb3b2d62018-08-22 17:11:27 +02005260 rbd_dev->parent_overlap = pii.overlap;
Ilya Dryomovcf32bd92015-01-19 22:57:39 +03005261
Alex Elder86b00e02012-10-25 23:34:42 -05005262out:
5263 ret = 0;
5264out_err:
Ilya Dryomove92c0ea2018-08-22 17:26:10 +02005265 kfree(pii.pool_ns);
Ilya Dryomoveb3b2d62018-08-22 17:11:27 +02005266 kfree(pii.image_id);
Alex Elder86b00e02012-10-25 23:34:42 -05005267 rbd_spec_put(parent_spec);
Alex Elder86b00e02012-10-25 23:34:42 -05005268 return ret;
5269}

static int rbd_dev_v2_striping_info(struct rbd_device *rbd_dev)
{
	struct {
		__le64 stripe_unit;
		__le64 stripe_count;
	} __attribute__ ((packed)) striping_info_buf = { 0 };
	size_t size = sizeof (striping_info_buf);
	void *p;
	int ret;

	ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
				&rbd_dev->header_oloc, "get_stripe_unit_count",
				NULL, 0, &striping_info_buf, size);
	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
	if (ret < 0)
		return ret;
	if (ret < size)
		return -ERANGE;

	p = &striping_info_buf;
	rbd_dev->header.stripe_unit = ceph_decode_64(&p);
	rbd_dev->header.stripe_count = ceph_decode_64(&p);
	return 0;
}

static int rbd_dev_v2_data_pool(struct rbd_device *rbd_dev)
{
	__le64 data_pool_id;
	int ret;

	ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
				  &rbd_dev->header_oloc, "get_data_pool",
				  NULL, 0, &data_pool_id, sizeof(data_pool_id));
	if (ret < 0)
		return ret;
	if (ret < sizeof(data_pool_id))
		return -EBADMSG;

	rbd_dev->header.data_pool_id = le64_to_cpu(data_pool_id);
	WARN_ON(rbd_dev->header.data_pool_id == CEPH_NOPOOL);
	return 0;
}

static char *rbd_dev_image_name(struct rbd_device *rbd_dev)
{
	CEPH_DEFINE_OID_ONSTACK(oid);
	size_t image_id_size;
	char *image_id;
	void *p;
	void *end;
	size_t size;
	void *reply_buf = NULL;
	size_t len = 0;
	char *image_name = NULL;
	int ret;

	rbd_assert(!rbd_dev->spec->image_name);

	len = strlen(rbd_dev->spec->image_id);
	image_id_size = sizeof (__le32) + len;
	image_id = kmalloc(image_id_size, GFP_KERNEL);
	if (!image_id)
		return NULL;

	p = image_id;
	end = image_id + image_id_size;
	ceph_encode_string(&p, end, rbd_dev->spec->image_id, (u32)len);

	size = sizeof (__le32) + RBD_IMAGE_NAME_LEN_MAX;
	reply_buf = kmalloc(size, GFP_KERNEL);
	if (!reply_buf)
		goto out;

	ceph_oid_printf(&oid, "%s", RBD_DIRECTORY);
	ret = rbd_obj_method_sync(rbd_dev, &oid, &rbd_dev->header_oloc,
				  "dir_get_name", image_id, image_id_size,
				  reply_buf, size);
	if (ret < 0)
		goto out;
	p = reply_buf;
	end = reply_buf + ret;

	image_name = ceph_extract_encoded_string(&p, end, &len, GFP_KERNEL);
	if (IS_ERR(image_name))
		image_name = NULL;
	else
		dout("%s: name is %s len is %zd\n", __func__, image_name, len);
out:
	kfree(reply_buf);
	kfree(image_id);

	return image_name;
}

static u64 rbd_v1_snap_id_by_name(struct rbd_device *rbd_dev, const char *name)
{
	struct ceph_snap_context *snapc = rbd_dev->header.snapc;
	const char *snap_name;
	u32 which = 0;

	/* Skip over names until we find the one we are looking for */

	snap_name = rbd_dev->header.snap_names;
	while (which < snapc->num_snaps) {
		if (!strcmp(name, snap_name))
			return snapc->snaps[which];
		snap_name += strlen(snap_name) + 1;
		which++;
	}
	return CEPH_NOSNAP;
}
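/*
 * Illustrative note (names hypothetical, not taken from any real
 * image): for a v1 image, header.snap_names is a flat buffer of
 * NUL-terminated names, e.g. "snap1\0snap2\0", walked in step with
 * snapc->snaps[]; a lookup of "snap2" above would therefore return
 * snapc->snaps[1].
 */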

static u64 rbd_v2_snap_id_by_name(struct rbd_device *rbd_dev, const char *name)
{
	struct ceph_snap_context *snapc = rbd_dev->header.snapc;
	u32 which;
	bool found = false;
	u64 snap_id;

	for (which = 0; !found && which < snapc->num_snaps; which++) {
		const char *snap_name;

		snap_id = snapc->snaps[which];
		snap_name = rbd_dev_v2_snap_name(rbd_dev, snap_id);
		if (IS_ERR(snap_name)) {
			/* ignore no-longer existing snapshots */
			if (PTR_ERR(snap_name) == -ENOENT)
				continue;
			else
				break;
		}
		found = !strcmp(name, snap_name);
		kfree(snap_name);
	}
	return found ? snap_id : CEPH_NOSNAP;
}

/*
 * Assumes name is never RBD_SNAP_HEAD_NAME; returns CEPH_NOSNAP if
 * no snapshot by that name is found, or if an error occurs.
 */
static u64 rbd_snap_id_by_name(struct rbd_device *rbd_dev, const char *name)
{
	if (rbd_dev->image_format == 1)
		return rbd_v1_snap_id_by_name(rbd_dev, name);

	return rbd_v2_snap_id_by_name(rbd_dev, name);
}

/*
 * An image being mapped will have everything but the snap id.
 */
static int rbd_spec_fill_snap_id(struct rbd_device *rbd_dev)
{
	struct rbd_spec *spec = rbd_dev->spec;

	rbd_assert(spec->pool_id != CEPH_NOPOOL && spec->pool_name);
	rbd_assert(spec->image_id && spec->image_name);
	rbd_assert(spec->snap_name);

	if (strcmp(spec->snap_name, RBD_SNAP_HEAD_NAME)) {
		u64 snap_id;

		snap_id = rbd_snap_id_by_name(rbd_dev, spec->snap_name);
		if (snap_id == CEPH_NOSNAP)
			return -ENOENT;

		spec->snap_id = snap_id;
	} else {
		spec->snap_id = CEPH_NOSNAP;
	}

	return 0;
}

/*
 * A parent image will have all ids but none of the names.
 *
 * All names in an rbd spec are dynamically allocated.  It's OK if we
 * can't figure out the name for an image id.
 */
static int rbd_spec_fill_names(struct rbd_device *rbd_dev)
{
	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
	struct rbd_spec *spec = rbd_dev->spec;
	const char *pool_name;
	const char *image_name;
	const char *snap_name;
	int ret;

	rbd_assert(spec->pool_id != CEPH_NOPOOL);
	rbd_assert(spec->image_id);
	rbd_assert(spec->snap_id != CEPH_NOSNAP);

	/* Get the pool name; we have to make our own copy of this */

	pool_name = ceph_pg_pool_name_by_id(osdc->osdmap, spec->pool_id);
	if (!pool_name) {
		rbd_warn(rbd_dev, "no pool with id %llu", spec->pool_id);
		return -EIO;
	}
	pool_name = kstrdup(pool_name, GFP_KERNEL);
	if (!pool_name)
		return -ENOMEM;

	/* Fetch the image name; tolerate failure here */

	image_name = rbd_dev_image_name(rbd_dev);
	if (!image_name)
		rbd_warn(rbd_dev, "unable to get image name");

	/* Fetch the snapshot name */

	snap_name = rbd_snap_name(rbd_dev, spec->snap_id);
	if (IS_ERR(snap_name)) {
		ret = PTR_ERR(snap_name);
		goto out_err;
	}

	spec->pool_name = pool_name;
	spec->image_name = image_name;
	spec->snap_name = snap_name;

	return 0;

out_err:
	kfree(image_name);
	kfree(pool_name);
	return ret;
}

static int rbd_dev_v2_snap_context(struct rbd_device *rbd_dev)
{
	size_t size;
	int ret;
	void *reply_buf;
	void *p;
	void *end;
	u64 seq;
	u32 snap_count;
	struct ceph_snap_context *snapc;
	u32 i;

	/*
	 * We'll need room for the seq value (maximum snapshot id),
	 * snapshot count, and array of that many snapshot ids.
	 * For now we have a fixed upper limit on the number we're
	 * prepared to receive.
	 */
	size = sizeof (__le64) + sizeof (__le32) +
			RBD_MAX_SNAP_COUNT * sizeof (__le64);
	reply_buf = kzalloc(size, GFP_KERNEL);
	if (!reply_buf)
		return -ENOMEM;

	ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
				  &rbd_dev->header_oloc, "get_snapcontext",
				  NULL, 0, reply_buf, size);
	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
	if (ret < 0)
		goto out;

	p = reply_buf;
	end = reply_buf + ret;
	ret = -ERANGE;
	ceph_decode_64_safe(&p, end, seq, out);
	ceph_decode_32_safe(&p, end, snap_count, out);

	/*
	 * Make sure the reported number of snapshot ids wouldn't go
	 * beyond the end of our buffer.  But before checking that,
	 * make sure the computed size of the snapshot context we
	 * allocate is representable in a size_t.
	 */
	if (snap_count > (SIZE_MAX - sizeof (struct ceph_snap_context))
				 / sizeof (u64)) {
		ret = -EINVAL;
		goto out;
	}
	if (!ceph_has_room(&p, end, snap_count * sizeof (__le64)))
		goto out;
	ret = 0;

	snapc = ceph_create_snap_context(snap_count, GFP_KERNEL);
	if (!snapc) {
		ret = -ENOMEM;
		goto out;
	}
	snapc->seq = seq;
	for (i = 0; i < snap_count; i++)
		snapc->snaps[i] = ceph_decode_64(&p);

	ceph_put_snap_context(rbd_dev->header.snapc);
	rbd_dev->header.snapc = snapc;

	dout("  snap context seq = %llu, snap_count = %u\n",
		(unsigned long long)seq, (unsigned int)snap_count);
out:
	kfree(reply_buf);

	return ret;
}

static const char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev,
					u64 snap_id)
{
	size_t size;
	void *reply_buf;
	__le64 snapid;
	int ret;
	void *p;
	void *end;
	char *snap_name;

	size = sizeof (__le32) + RBD_MAX_SNAP_NAME_LEN;
	reply_buf = kmalloc(size, GFP_KERNEL);
	if (!reply_buf)
		return ERR_PTR(-ENOMEM);

	snapid = cpu_to_le64(snap_id);
	ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
				  &rbd_dev->header_oloc, "get_snapshot_name",
				  &snapid, sizeof(snapid), reply_buf, size);
	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
	if (ret < 0) {
		snap_name = ERR_PTR(ret);
		goto out;
	}

	p = reply_buf;
	end = reply_buf + ret;
	snap_name = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL);
	if (IS_ERR(snap_name))
		goto out;

	dout("  snap_id 0x%016llx snap_name = %s\n",
		(unsigned long long)snap_id, snap_name);
out:
	kfree(reply_buf);

	return snap_name;
}

static int rbd_dev_v2_header_info(struct rbd_device *rbd_dev)
{
	bool first_time = rbd_dev->header.object_prefix == NULL;
	int ret;

	ret = rbd_dev_v2_image_size(rbd_dev);
	if (ret)
		return ret;

	if (first_time) {
		ret = rbd_dev_v2_header_onetime(rbd_dev);
		if (ret)
			return ret;
	}

	ret = rbd_dev_v2_snap_context(rbd_dev);
	if (ret && first_time) {
		kfree(rbd_dev->header.object_prefix);
		rbd_dev->header.object_prefix = NULL;
	}

	return ret;
}

static int rbd_dev_header_info(struct rbd_device *rbd_dev)
{
	rbd_assert(rbd_image_format_valid(rbd_dev->image_format));

	if (rbd_dev->image_format == 1)
		return rbd_dev_v1_header_info(rbd_dev);

	return rbd_dev_v2_header_info(rbd_dev);
}

/*
 * Skips over white space at *buf, and updates *buf to point to the
 * first found non-space character (if any).  Returns the length of
 * the token (string of non-white space characters) found.  Note
 * that *buf must be terminated with '\0'.
 */
static inline size_t next_token(const char **buf)
{
	/*
	 * These are the characters that produce nonzero for
	 * isspace() in the "C" and "POSIX" locales.
	 */
	const char *spaces = " \f\n\r\t\v";

	*buf += strspn(*buf, spaces);	/* Find start of token */

	return strcspn(*buf, spaces);   /* Return token length */
}

/*
 * Finds the next token in *buf, dynamically allocates a buffer big
 * enough to hold a copy of it, and copies the token into the new
 * buffer.  The copy is guaranteed to be terminated with '\0'.  Note
 * that a duplicate buffer is created even for a zero-length token.
 *
 * Returns a pointer to the newly-allocated duplicate, or a null
 * pointer if memory for the duplicate was not available.  If
 * the lenp argument is a non-null pointer, the length of the token
 * (not including the '\0') is returned in *lenp.
 *
 * If successful, the *buf pointer will be updated to point beyond
 * the end of the found token.
 *
 * Note: uses GFP_KERNEL for allocation.
 */
static inline char *dup_token(const char **buf, size_t *lenp)
{
	char *dup;
	size_t len;

	len = next_token(buf);
	dup = kmemdup(*buf, len + 1, GFP_KERNEL);
	if (!dup)
		return NULL;
	*(dup + len) = '\0';
	*buf += len;

	if (lenp)
		*lenp = len;

	return dup;
}
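/*
 * Illustrative note (input hypothetical): given buf = "  rbd foo",
 * next_token() advances buf past the leading spaces and returns 3
 * (the length of "rbd"); dup_token() additionally returns the
 * allocated copy "rbd" and leaves buf pointing at " foo", ready for
 * the next call.
 */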

/*
 * Parse the options provided for an "rbd add" (i.e., rbd image
 * mapping) request.  These arrive via a write to /sys/bus/rbd/add,
 * and the data written is passed here via a NUL-terminated buffer.
 * Returns 0 if successful or an error code otherwise.
 *
 * The information extracted from these options is recorded in
 * the other parameters which return dynamically-allocated
 * structures:
 *  ceph_opts
 *      The address of a pointer that will refer to a ceph options
 *      structure.  Caller must release the returned pointer using
 *      ceph_destroy_options() when it is no longer needed.
 *  rbd_opts
 *	Address of an rbd options pointer.  Fully initialized by
 *	this function; caller must release with kfree().
 *  spec
 *	Address of an rbd image specification pointer.  Fully
 *	initialized by this function based on parsed options.
 *	Caller must release with rbd_spec_put().
 *
 * The options passed take this form:
 *  <mon_addrs> <options> <pool_name> <image_name> [<snap_id>]
 * where:
 *  <mon_addrs>
 *      A comma-separated list of one or more monitor addresses.
 *      A monitor address is an ip address, optionally followed
 *      by a port number (separated by a colon).
 *        I.e.:  ip1[:port1][,ip2[:port2]...]
 *  <options>
 *      A comma-separated list of ceph and/or rbd options.
 *  <pool_name>
 *      The name of the rados pool containing the rbd image.
 *  <image_name>
 *      The name of the image in that pool to map.
 *  <snap_id>
 *      An optional snapshot id.  If provided, the mapping will
 *      present data from the image at the time that snapshot was
 *      created.  The image head is used if no snapshot id is
 *      provided.  Snapshot mappings are always read-only.
 */
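/*
 * For illustration only (monitor address, credentials, pool and image
 * names below are hypothetical, not taken from this file): given the
 * format above, a map request for the head of image "foo" in pool
 * "rbd" could look like
 *
 *   $ echo "1.2.3.4:6789 name=admin,secret=<key> rbd foo -" \
 *         > /sys/bus/rbd/add
 *
 * where "-" selects the image head (no snapshot).
 */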
static int rbd_add_parse_args(const char *buf,
				struct ceph_options **ceph_opts,
				struct rbd_options **opts,
				struct rbd_spec **rbd_spec)
{
	size_t len;
	char *options;
	const char *mon_addrs;
	char *snap_name;
	size_t mon_addrs_size;
	struct parse_rbd_opts_ctx pctx = { 0 };
	struct ceph_options *copts;
	int ret;

	/* The first four tokens are required */

	len = next_token(&buf);
	if (!len) {
		rbd_warn(NULL, "no monitor address(es) provided");
		return -EINVAL;
	}
	mon_addrs = buf;
	mon_addrs_size = len + 1;
	buf += len;

	ret = -EINVAL;
	options = dup_token(&buf, NULL);
	if (!options)
		return -ENOMEM;
	if (!*options) {
		rbd_warn(NULL, "no options provided");
		goto out_err;
	}

	pctx.spec = rbd_spec_alloc();
	if (!pctx.spec)
		goto out_mem;

	pctx.spec->pool_name = dup_token(&buf, NULL);
	if (!pctx.spec->pool_name)
		goto out_mem;
	if (!*pctx.spec->pool_name) {
		rbd_warn(NULL, "no pool name provided");
		goto out_err;
	}

	pctx.spec->image_name = dup_token(&buf, NULL);
	if (!pctx.spec->image_name)
		goto out_mem;
	if (!*pctx.spec->image_name) {
		rbd_warn(NULL, "no image name provided");
		goto out_err;
	}

	/*
	 * Snapshot name is optional; default is to use "-"
	 * (indicating the head/no snapshot).
	 */
	len = next_token(&buf);
	if (!len) {
		buf = RBD_SNAP_HEAD_NAME; /* No snapshot supplied */
		len = sizeof (RBD_SNAP_HEAD_NAME) - 1;
	} else if (len > RBD_MAX_SNAP_NAME_LEN) {
		ret = -ENAMETOOLONG;
		goto out_err;
	}
	snap_name = kmemdup(buf, len + 1, GFP_KERNEL);
	if (!snap_name)
		goto out_mem;
	*(snap_name + len) = '\0';
	pctx.spec->snap_name = snap_name;

	/* Initialize all rbd options to the defaults */

	pctx.opts = kzalloc(sizeof(*pctx.opts), GFP_KERNEL);
	if (!pctx.opts)
		goto out_mem;

	pctx.opts->read_only = RBD_READ_ONLY_DEFAULT;
	pctx.opts->queue_depth = RBD_QUEUE_DEPTH_DEFAULT;
	pctx.opts->alloc_size = RBD_ALLOC_SIZE_DEFAULT;
	pctx.opts->lock_timeout = RBD_LOCK_TIMEOUT_DEFAULT;
	pctx.opts->lock_on_read = RBD_LOCK_ON_READ_DEFAULT;
	pctx.opts->exclusive = RBD_EXCLUSIVE_DEFAULT;
	pctx.opts->trim = RBD_TRIM_DEFAULT;

	copts = ceph_parse_options(options, mon_addrs,
				   mon_addrs + mon_addrs_size - 1,
				   parse_rbd_opts_token, &pctx);
	if (IS_ERR(copts)) {
		ret = PTR_ERR(copts);
		goto out_err;
	}
	kfree(options);

	*ceph_opts = copts;
	*opts = pctx.opts;
	*rbd_spec = pctx.spec;

	return 0;
out_mem:
	ret = -ENOMEM;
out_err:
	kfree(pctx.opts);
	rbd_spec_put(pctx.spec);
	kfree(options);

	return ret;
}

static void rbd_dev_image_unlock(struct rbd_device *rbd_dev)
{
	down_write(&rbd_dev->lock_rwsem);
	if (__rbd_is_lock_owner(rbd_dev))
		__rbd_release_lock(rbd_dev);
	up_write(&rbd_dev->lock_rwsem);
}

static int rbd_add_acquire_lock(struct rbd_device *rbd_dev)
{
	int ret;

	if (!(rbd_dev->header.features & RBD_FEATURE_EXCLUSIVE_LOCK)) {
		rbd_warn(rbd_dev, "exclusive-lock feature is not enabled");
		return -EINVAL;
	}

	/* FIXME: "rbd map --exclusive" should be interruptible */
	down_read(&rbd_dev->lock_rwsem);
	ret = rbd_wait_state_locked(rbd_dev, true);
	up_read(&rbd_dev->lock_rwsem);
	if (ret) {
		rbd_warn(rbd_dev, "failed to acquire exclusive lock");
		return -EROFS;
	}

	return 0;
}

/*
 * An rbd format 2 image has a unique identifier, distinct from the
 * name given to it by the user.  Internally, that identifier is
 * what's used to specify the names of objects related to the image.
 *
 * A special "rbd id" object is used to map an rbd image name to its
 * id.  If that object doesn't exist, then there is no v2 rbd image
 * with the supplied name.
 *
 * This function will record the given rbd_dev's image_id field if
 * it can be determined, and in that case will return 0.  If any
 * errors occur a negative errno will be returned and the rbd_dev's
 * image_id field will be unchanged (and should be NULL).
 */
static int rbd_dev_image_id(struct rbd_device *rbd_dev)
{
	int ret;
	size_t size;
	CEPH_DEFINE_OID_ONSTACK(oid);
	void *response;
	char *image_id;

	/*
	 * When probing a parent image, the image id is already
	 * known (and the image name likely is not).  There's no
	 * need to fetch the image id again in this case.  We
	 * do still need to set the image format though.
	 */
	if (rbd_dev->spec->image_id) {
		rbd_dev->image_format = *rbd_dev->spec->image_id ? 2 : 1;

		return 0;
	}

	/*
	 * First, see if the format 2 image id file exists, and if
	 * so, get the image's persistent id from it.
	 */
	ret = ceph_oid_aprintf(&oid, GFP_KERNEL, "%s%s", RBD_ID_PREFIX,
			       rbd_dev->spec->image_name);
	if (ret)
		return ret;

	dout("rbd id object name is %s\n", oid.name);

	/* Response will be an encoded string, which includes a length */

	size = sizeof (__le32) + RBD_IMAGE_ID_LEN_MAX;
	response = kzalloc(size, GFP_NOIO);
	if (!response) {
		ret = -ENOMEM;
		goto out;
	}

	/* If it doesn't exist we'll assume it's a format 1 image */

	ret = rbd_obj_method_sync(rbd_dev, &oid, &rbd_dev->header_oloc,
				  "get_id", NULL, 0,
				  response, RBD_IMAGE_ID_LEN_MAX);
	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
	if (ret == -ENOENT) {
		image_id = kstrdup("", GFP_KERNEL);
		ret = image_id ? 0 : -ENOMEM;
		if (!ret)
			rbd_dev->image_format = 1;
	} else if (ret >= 0) {
		void *p = response;

		image_id = ceph_extract_encoded_string(&p, p + ret,
						NULL, GFP_NOIO);
		ret = PTR_ERR_OR_ZERO(image_id);
		if (!ret)
			rbd_dev->image_format = 2;
	}

	if (!ret) {
		rbd_dev->spec->image_id = image_id;
		dout("image_id is %s\n", image_id);
	}
out:
	kfree(response);
	ceph_oid_destroy(&oid);
	return ret;
}
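/*
 * Illustrative note (image name hypothetical; prefix as defined in
 * rbd_types.h, where RBD_ID_PREFIX is "rbd_id."): probing image "foo"
 * reads the id object "rbd_id.foo", and the string it contains
 * becomes spec->image_id.
 */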

/*
 * Undo whatever state changes are made by v1 or v2 header info
 * call.
 */
static void rbd_dev_unprobe(struct rbd_device *rbd_dev)
{
	struct rbd_image_header *header;

	rbd_dev_parent_put(rbd_dev);

	/* Free dynamic fields from the header, then zero it out */

	header = &rbd_dev->header;
	ceph_put_snap_context(header->snapc);
	kfree(header->snap_sizes);
	kfree(header->snap_names);
	kfree(header->object_prefix);
	memset(header, 0, sizeof (*header));
}

static int rbd_dev_v2_header_onetime(struct rbd_device *rbd_dev)
{
	int ret;

	ret = rbd_dev_v2_object_prefix(rbd_dev);
	if (ret)
		goto out_err;

	/*
	 * Get and check the features for the image.  Currently the
	 * features are assumed to never change.
	 */
	ret = rbd_dev_v2_features(rbd_dev);
	if (ret)
		goto out_err;

	/* If the image supports fancy striping, get its parameters */

	if (rbd_dev->header.features & RBD_FEATURE_STRIPINGV2) {
		ret = rbd_dev_v2_striping_info(rbd_dev);
		if (ret < 0)
			goto out_err;
	}

	if (rbd_dev->header.features & RBD_FEATURE_DATA_POOL) {
		ret = rbd_dev_v2_data_pool(rbd_dev);
		if (ret)
			goto out_err;
	}

	rbd_init_layout(rbd_dev);
	return 0;

out_err:
	rbd_dev->header.features = 0;
	kfree(rbd_dev->header.object_prefix);
	rbd_dev->header.object_prefix = NULL;
	return ret;
}

/*
 * @depth is rbd_dev_image_probe() -> rbd_dev_probe_parent() ->
 * rbd_dev_image_probe() recursion depth, which means it's also the
 * length of the already discovered part of the parent chain.
 */
static int rbd_dev_probe_parent(struct rbd_device *rbd_dev, int depth)
{
	struct rbd_device *parent = NULL;
	int ret;

	if (!rbd_dev->parent_spec)
		return 0;

	if (++depth > RBD_MAX_PARENT_CHAIN_LEN) {
		pr_info("parent chain is too long (%d)\n", depth);
		ret = -EINVAL;
		goto out_err;
	}

	parent = __rbd_dev_create(rbd_dev->rbd_client, rbd_dev->parent_spec);
	if (!parent) {
		ret = -ENOMEM;
		goto out_err;
	}

	/*
	 * Images related by parent/child relationships always share
	 * rbd_client and spec/parent_spec, so bump their refcounts.
	 */
	__rbd_get_client(rbd_dev->rbd_client);
	rbd_spec_get(rbd_dev->parent_spec);

	ret = rbd_dev_image_probe(parent, depth);
	if (ret < 0)
		goto out_err;

	rbd_dev->parent = parent;
	atomic_set(&rbd_dev->parent_ref, 1);
	return 0;

out_err:
	rbd_dev_unparent(rbd_dev);
	rbd_dev_destroy(parent);
	return ret;
}

static void rbd_dev_device_release(struct rbd_device *rbd_dev)
{
	clear_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags);
	rbd_dev_mapping_clear(rbd_dev);
	rbd_free_disk(rbd_dev);
	if (!single_major)
		unregister_blkdev(rbd_dev->major, rbd_dev->name);
}

/*
 * rbd_dev->header_rwsem must be locked for write and will be unlocked
 * upon return.
 */
static int rbd_dev_device_setup(struct rbd_device *rbd_dev)
{
	int ret;

	/* Record our major and minor device numbers. */

	if (!single_major) {
		ret = register_blkdev(0, rbd_dev->name);
		if (ret < 0)
			goto err_out_unlock;

		rbd_dev->major = ret;
		rbd_dev->minor = 0;
	} else {
		rbd_dev->major = rbd_major;
		rbd_dev->minor = rbd_dev_id_to_minor(rbd_dev->dev_id);
	}

	/* Set up the blkdev mapping. */

	ret = rbd_init_disk(rbd_dev);
	if (ret)
		goto err_out_blkdev;

	ret = rbd_dev_mapping_set(rbd_dev);
	if (ret)
		goto err_out_disk;

	set_capacity(rbd_dev->disk, rbd_dev->mapping.size / SECTOR_SIZE);
	set_disk_ro(rbd_dev->disk, rbd_dev->opts->read_only);

	ret = dev_set_name(&rbd_dev->dev, "%d", rbd_dev->dev_id);
	if (ret)
		goto err_out_mapping;

	set_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags);
	up_write(&rbd_dev->header_rwsem);
	return 0;

err_out_mapping:
	rbd_dev_mapping_clear(rbd_dev);
err_out_disk:
	rbd_free_disk(rbd_dev);
err_out_blkdev:
	if (!single_major)
		unregister_blkdev(rbd_dev->major, rbd_dev->name);
err_out_unlock:
	up_write(&rbd_dev->header_rwsem);
	return ret;
}

static int rbd_dev_header_name(struct rbd_device *rbd_dev)
{
	struct rbd_spec *spec = rbd_dev->spec;
	int ret;

	/* Record the header object name for this rbd image. */

	rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
	if (rbd_dev->image_format == 1)
		ret = ceph_oid_aprintf(&rbd_dev->header_oid, GFP_KERNEL, "%s%s",
				       spec->image_name, RBD_SUFFIX);
	else
		ret = ceph_oid_aprintf(&rbd_dev->header_oid, GFP_KERNEL, "%s%s",
				       RBD_HEADER_PREFIX, spec->image_id);

	return ret;
}
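/*
 * Illustrative note (names hypothetical; constants as defined in
 * rbd_types.h, where RBD_SUFFIX is ".rbd" and RBD_HEADER_PREFIX is
 * "rbd_header."): a format 1 image named "foo" gets header object
 * "foo.rbd", while a format 2 image with id "1234" gets
 * "rbd_header.1234".
 */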

static void rbd_dev_image_release(struct rbd_device *rbd_dev)
{
	rbd_dev_unprobe(rbd_dev);
	if (rbd_dev->opts)
		rbd_unregister_watch(rbd_dev);
	rbd_dev->image_format = 0;
	kfree(rbd_dev->spec->image_id);
	rbd_dev->spec->image_id = NULL;
}

/*
 * Probe for the existence of the header object for the given rbd
 * device.  If this image is the one being mapped (i.e., not a
 * parent), initiate a watch on its header object before using that
 * object to get detailed information about the rbd image.
 */
static int rbd_dev_image_probe(struct rbd_device *rbd_dev, int depth)
{
	int ret;

	/*
	 * Get the id from the image id object.  Unless there's an
	 * error, rbd_dev->spec->image_id will be filled in with
	 * a dynamically-allocated string, and rbd_dev->image_format
	 * will be set to either 1 or 2.
	 */
	ret = rbd_dev_image_id(rbd_dev);
	if (ret)
		return ret;

	ret = rbd_dev_header_name(rbd_dev);
	if (ret)
		goto err_out_format;

	if (!depth) {
		ret = rbd_register_watch(rbd_dev);
		if (ret) {
			if (ret == -ENOENT)
				pr_info("image %s/%s%s%s does not exist\n",
					rbd_dev->spec->pool_name,
					rbd_dev->spec->pool_ns ?: "",
					rbd_dev->spec->pool_ns ? "/" : "",
					rbd_dev->spec->image_name);
			goto err_out_format;
		}
	}

	ret = rbd_dev_header_info(rbd_dev);
	if (ret)
		goto err_out_watch;

	/*
	 * If this image is the one being mapped, we have pool name and
	 * id, image name and id, and snap name - need to fill snap id.
	 * Otherwise this is a parent image, identified by pool, image
	 * and snap ids - need to fill in names for those ids.
	 */
	if (!depth)
		ret = rbd_spec_fill_snap_id(rbd_dev);
	else
		ret = rbd_spec_fill_names(rbd_dev);
	if (ret) {
		if (ret == -ENOENT)
			pr_info("snap %s/%s%s%s@%s does not exist\n",
				rbd_dev->spec->pool_name,
				rbd_dev->spec->pool_ns ?: "",
				rbd_dev->spec->pool_ns ? "/" : "",
				rbd_dev->spec->image_name,
				rbd_dev->spec->snap_name);
		goto err_out_probe;
	}

	if (rbd_dev->header.features & RBD_FEATURE_LAYERING) {
		ret = rbd_dev_v2_parent_info(rbd_dev);
		if (ret)
			goto err_out_probe;
	}

	ret = rbd_dev_probe_parent(rbd_dev, depth);
	if (ret)
		goto err_out_probe;

	dout("discovered format %u image, header name is %s\n",
		rbd_dev->image_format, rbd_dev->header_oid.name);
	return 0;

err_out_probe:
	rbd_dev_unprobe(rbd_dev);
err_out_watch:
	if (!depth)
		rbd_unregister_watch(rbd_dev);
err_out_format:
	rbd_dev->image_format = 0;
	kfree(rbd_dev->spec->image_id);
	rbd_dev->spec->image_id = NULL;
	return ret;
}

static ssize_t do_rbd_add(struct bus_type *bus,
			  const char *buf,
			  size_t count)
{
	struct rbd_device *rbd_dev = NULL;
	struct ceph_options *ceph_opts = NULL;
	struct rbd_options *rbd_opts = NULL;
	struct rbd_spec *spec = NULL;
	struct rbd_client *rbdc;
	int rc;

	if (!try_module_get(THIS_MODULE))
		return -ENODEV;

	/* parse add command */
	rc = rbd_add_parse_args(buf, &ceph_opts, &rbd_opts, &spec);
	if (rc < 0)
		goto out;

	rbdc = rbd_get_client(ceph_opts);
	if (IS_ERR(rbdc)) {
		rc = PTR_ERR(rbdc);
		goto err_out_args;
	}

	/* pick the pool */
	rc = ceph_pg_poolid_by_name(rbdc->client->osdc.osdmap, spec->pool_name);
	if (rc < 0) {
		if (rc == -ENOENT)
			pr_info("pool %s does not exist\n", spec->pool_name);
		goto err_out_client;
	}
	spec->pool_id = (u64)rc;

	rbd_dev = rbd_dev_create(rbdc, spec, rbd_opts);
	if (!rbd_dev) {
		rc = -ENOMEM;
		goto err_out_client;
	}
	rbdc = NULL;		/* rbd_dev now owns this */
	spec = NULL;		/* rbd_dev now owns this */
	rbd_opts = NULL;	/* rbd_dev now owns this */

	rbd_dev->config_info = kstrdup(buf, GFP_KERNEL);
	if (!rbd_dev->config_info) {
		rc = -ENOMEM;
		goto err_out_rbd_dev;
	}

	down_write(&rbd_dev->header_rwsem);
	rc = rbd_dev_image_probe(rbd_dev, 0);
	if (rc < 0) {
		up_write(&rbd_dev->header_rwsem);
		goto err_out_rbd_dev;
	}

	/* If we are mapping a snapshot it must be marked read-only */
	if (rbd_dev->spec->snap_id != CEPH_NOSNAP)
		rbd_dev->opts->read_only = true;

	if (rbd_dev->opts->alloc_size > rbd_dev->layout.object_size) {
		rbd_warn(rbd_dev, "alloc_size adjusted to %u",
			 rbd_dev->layout.object_size);
		rbd_dev->opts->alloc_size = rbd_dev->layout.object_size;
	}

	rc = rbd_dev_device_setup(rbd_dev);
	if (rc)
		goto err_out_image_probe;

	if (rbd_dev->opts->exclusive) {
		rc = rbd_add_acquire_lock(rbd_dev);
		if (rc)
			goto err_out_device_setup;
	}

	/* Everything's ready.  Announce the disk to the world. */

	rc = device_add(&rbd_dev->dev);
	if (rc)
		goto err_out_image_lock;

	add_disk(rbd_dev->disk);
	/* see rbd_init_disk() */
	blk_put_queue(rbd_dev->disk->queue);

	spin_lock(&rbd_dev_list_lock);
	list_add_tail(&rbd_dev->node, &rbd_dev_list);
	spin_unlock(&rbd_dev_list_lock);

	pr_info("%s: capacity %llu features 0x%llx\n", rbd_dev->disk->disk_name,
		(unsigned long long)get_capacity(rbd_dev->disk) << SECTOR_SHIFT,
		rbd_dev->header.features);
	rc = count;
out:
	module_put(THIS_MODULE);
	return rc;

err_out_image_lock:
	rbd_dev_image_unlock(rbd_dev);
err_out_device_setup:
	rbd_dev_device_release(rbd_dev);
err_out_image_probe:
	rbd_dev_image_release(rbd_dev);
err_out_rbd_dev:
	rbd_dev_destroy(rbd_dev);
err_out_client:
	rbd_put_client(rbdc);
err_out_args:
	rbd_spec_put(spec);
	kfree(rbd_opts);
	goto out;
}

static ssize_t add_store(struct bus_type *bus, const char *buf, size_t count)
{
	if (single_major)
		return -EINVAL;

	return do_rbd_add(bus, buf, count);
}

static ssize_t add_single_major_store(struct bus_type *bus, const char *buf,
				      size_t count)
{
	return do_rbd_add(bus, buf, count);
}

static void rbd_dev_remove_parent(struct rbd_device *rbd_dev)
{
	while (rbd_dev->parent) {
		struct rbd_device *first = rbd_dev;
		struct rbd_device *second = first->parent;
		struct rbd_device *third;

		/*
		 * Follow to the parent with no grandparent and
		 * remove it.
		 */
		while (second && (third = second->parent)) {
			first = second;
			second = third;
		}
		rbd_assert(second);
		rbd_dev_image_release(second);
		rbd_dev_destroy(second);
		first->parent = NULL;
		first->parent_overlap = 0;

		rbd_assert(first->parent_spec);
		rbd_spec_put(first->parent_spec);
		first->parent_spec = NULL;
	}
}
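/*
 * Illustrative note (chain hypothetical): for a mapped image M with
 * parent chain M -> P1 -> P2, the loop above releases P2 first (the
 * ancestor with no parent of its own), then P1, walking the chain
 * from its far end back toward M on each outer iteration.
 */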

static ssize_t do_rbd_remove(struct bus_type *bus,
			     const char *buf,
			     size_t count)
{
	struct rbd_device *rbd_dev = NULL;
	struct list_head *tmp;
	int dev_id;
	char opt_buf[6];
	bool force = false;
	int ret;

	dev_id = -1;
	opt_buf[0] = '\0';
	sscanf(buf, "%d %5s", &dev_id, opt_buf);
	if (dev_id < 0) {
		pr_err("dev_id out of range\n");
		return -EINVAL;
	}
	if (opt_buf[0] != '\0') {
		if (!strcmp(opt_buf, "force")) {
			force = true;
		} else {
			pr_err("bad remove option at '%s'\n", opt_buf);
			return -EINVAL;
		}
	}

	ret = -ENOENT;
	spin_lock(&rbd_dev_list_lock);
	list_for_each(tmp, &rbd_dev_list) {
		rbd_dev = list_entry(tmp, struct rbd_device, node);
		if (rbd_dev->dev_id == dev_id) {
			ret = 0;
			break;
		}
	}
	if (!ret) {
		spin_lock_irq(&rbd_dev->lock);
		if (rbd_dev->open_count && !force)
			ret = -EBUSY;
		else if (test_and_set_bit(RBD_DEV_FLAG_REMOVING,
					  &rbd_dev->flags))
			ret = -EINPROGRESS;
		spin_unlock_irq(&rbd_dev->lock);
	}
	spin_unlock(&rbd_dev_list_lock);
	if (ret)
		return ret;

	if (force) {
		/*
		 * Prevent new IO from being queued and wait for existing
		 * IO to complete/fail.
		 */
		blk_mq_freeze_queue(rbd_dev->disk->queue);
		blk_set_queue_dying(rbd_dev->disk->queue);
	}

	del_gendisk(rbd_dev->disk);
	spin_lock(&rbd_dev_list_lock);
	list_del_init(&rbd_dev->node);
	spin_unlock(&rbd_dev_list_lock);
	device_del(&rbd_dev->dev);

	rbd_dev_image_unlock(rbd_dev);
	rbd_dev_device_release(rbd_dev);
	rbd_dev_image_release(rbd_dev);
	rbd_dev_destroy(rbd_dev);
	return count;
}
6477
Greg Kroah-Hartman7e9586b2018-12-21 08:54:38 +01006478static ssize_t remove_store(struct bus_type *bus, const char *buf, size_t count)
Ilya Dryomov9b60e702013-12-13 15:28:57 +02006479{
6480 if (single_major)
6481 return -EINVAL;
6482
6483 return do_rbd_remove(bus, buf, count);
6484}
6485
Greg Kroah-Hartman7e9586b2018-12-21 08:54:38 +01006486static ssize_t remove_single_major_store(struct bus_type *bus, const char *buf,
6487 size_t count)
Ilya Dryomov9b60e702013-12-13 15:28:57 +02006488{
6489 return do_rbd_remove(bus, buf, count);
6490}
6491
Yehuda Sadeh602adf42010-08-12 16:11:25 -07006492/*
6493 * create control files in sysfs
Yehuda Sadehdfc56062010-11-19 14:51:04 -08006494 * /sys/bus/rbd/...
Yehuda Sadeh602adf42010-08-12 16:11:25 -07006495 */
Chengguang Xu7d8dc532018-08-12 23:06:54 +08006496static int __init rbd_sysfs_init(void)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07006497{
Yehuda Sadehdfc56062010-11-19 14:51:04 -08006498 int ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07006499
Alex Elderfed4c142012-02-07 12:03:36 -06006500 ret = device_register(&rbd_root_dev);
Alex Elder21079782012-01-24 10:08:36 -06006501 if (ret < 0)
Yehuda Sadehdfc56062010-11-19 14:51:04 -08006502 return ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07006503
Alex Elderfed4c142012-02-07 12:03:36 -06006504 ret = bus_register(&rbd_bus_type);
6505 if (ret < 0)
6506 device_unregister(&rbd_root_dev);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07006507
Yehuda Sadeh602adf42010-08-12 16:11:25 -07006508 return ret;
6509}
6510
Chengguang Xu7d8dc532018-08-12 23:06:54 +08006511static void __exit rbd_sysfs_cleanup(void)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07006512{
Yehuda Sadehdfc56062010-11-19 14:51:04 -08006513 bus_unregister(&rbd_bus_type);
Alex Elderfed4c142012-02-07 12:03:36 -06006514 device_unregister(&rbd_root_dev);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07006515}
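/*
 * Registration order matters here: rbd_root_dev is the parent under
 * which mapped rbd devices appear, so it is registered before the bus
 * in rbd_sysfs_init() and unregistered after it in rbd_sysfs_cleanup().
 */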
6516
Chengguang Xu7d8dc532018-08-12 23:06:54 +08006517static int __init rbd_slab_init(void)
Alex Elder1c2a9df2013-05-01 12:43:03 -05006518{
6519 rbd_assert(!rbd_img_request_cache);
Geliang Tang03d94402016-03-13 15:17:32 +08006520 rbd_img_request_cache = KMEM_CACHE(rbd_img_request, 0);
Alex Elder868311b2013-05-01 12:43:03 -05006521 if (!rbd_img_request_cache)
6522 return -ENOMEM;
6523
6524 rbd_assert(!rbd_obj_request_cache);
Geliang Tang03d94402016-03-13 15:17:32 +08006525 rbd_obj_request_cache = KMEM_CACHE(rbd_obj_request, 0);
Alex Elder78c2a442013-05-01 12:43:04 -05006526 if (!rbd_obj_request_cache)
6527 goto out_err;
6528
Ilya Dryomov6c696d82017-01-25 18:16:23 +01006529 return 0;
Alex Elder1c2a9df2013-05-01 12:43:03 -05006530
Ilya Dryomov6c696d82017-01-25 18:16:23 +01006531out_err:
Alex Elder868311b2013-05-01 12:43:03 -05006532 kmem_cache_destroy(rbd_img_request_cache);
6533 rbd_img_request_cache = NULL;
Alex Elder1c2a9df2013-05-01 12:43:03 -05006534 return -ENOMEM;
6535}
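/*
 * For reference, KMEM_CACHE(rbd_img_request, 0) above expands to
 * roughly:
 *
 *   kmem_cache_create("rbd_img_request",
 *			sizeof(struct rbd_img_request),
 *			__alignof__(struct rbd_img_request), 0, NULL);
 *
 * so the cache's name, size and alignment always track the struct
 * definition.
 */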
6536
6537static void rbd_slab_exit(void)
6538{
Alex Elder868311b2013-05-01 12:43:03 -05006539 rbd_assert(rbd_obj_request_cache);
6540 kmem_cache_destroy(rbd_obj_request_cache);
6541 rbd_obj_request_cache = NULL;
6542
Alex Elder1c2a9df2013-05-01 12:43:03 -05006543 rbd_assert(rbd_img_request_cache);
6544 kmem_cache_destroy(rbd_img_request_cache);
6545 rbd_img_request_cache = NULL;
6546}
6547
Alex Eldercc344fa2013-02-19 12:25:56 -06006548static int __init rbd_init(void)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07006549{
6550 int rc;
6551
Alex Elder1e32d342013-01-30 11:13:33 -06006552 if (!libceph_compatible(NULL)) {
6553 rbd_warn(NULL, "libceph incompatibility (quitting)");
Alex Elder1e32d342013-01-30 11:13:33 -06006554 return -EINVAL;
6555 }
Ilya Dryomove1b4d962013-12-13 15:28:57 +02006556
Alex Elder1c2a9df2013-05-01 12:43:03 -05006557 rc = rbd_slab_init();
Yehuda Sadeh602adf42010-08-12 16:11:25 -07006558 if (rc)
6559 return rc;
Ilya Dryomove1b4d962013-12-13 15:28:57 +02006560
Ilya Dryomovf5ee37b2014-10-09 17:06:01 +04006561 /*
6562 * The number of active work items is limited by the number of
Ilya Dryomovf77303b2015-04-22 18:28:13 +03006563 * rbd devices * queue depth, so leave @max_active at default.
Ilya Dryomovf5ee37b2014-10-09 17:06:01 +04006564 */
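	/*
	 * WQ_MEM_RECLAIM gives the workqueue a rescuer thread, so it
	 * can keep making forward progress under memory pressure; rbd
	 * work items sit in the block I/O path, where that guarantee
	 * matters.
	 */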
6565 rbd_wq = alloc_workqueue(RBD_DRV_NAME, WQ_MEM_RECLAIM, 0);
6566 if (!rbd_wq) {
6567 rc = -ENOMEM;
6568 goto err_out_slab;
6569 }
6570
Ilya Dryomov9b60e702013-12-13 15:28:57 +02006571 if (single_major) {
6572 rbd_major = register_blkdev(0, RBD_DRV_NAME);
6573 if (rbd_major < 0) {
6574 rc = rbd_major;
Ilya Dryomovf5ee37b2014-10-09 17:06:01 +04006575 goto err_out_wq;
Ilya Dryomov9b60e702013-12-13 15:28:57 +02006576 }
6577 }
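	/*
	 * register_blkdev(0, ...) allocates a dynamic major number.  In
	 * single_major mode every mapped rbd device shares this major,
	 * each taking a slice of its minor space; otherwise each device
	 * registers a major of its own elsewhere in the driver.
	 */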
6578
Alex Elder1c2a9df2013-05-01 12:43:03 -05006579 rc = rbd_sysfs_init();
6580 if (rc)
Ilya Dryomov9b60e702013-12-13 15:28:57 +02006581 goto err_out_blkdev;
Alex Elder1c2a9df2013-05-01 12:43:03 -05006582
Ilya Dryomov9b60e702013-12-13 15:28:57 +02006583 if (single_major)
6584 pr_info("loaded (major %d)\n", rbd_major);
6585 else
6586 pr_info("loaded\n");
6587
Ilya Dryomove1b4d962013-12-13 15:28:57 +02006588 return 0;
6589
Ilya Dryomov9b60e702013-12-13 15:28:57 +02006590err_out_blkdev:
6591 if (single_major)
6592 unregister_blkdev(rbd_major, RBD_DRV_NAME);
Ilya Dryomovf5ee37b2014-10-09 17:06:01 +04006593err_out_wq:
6594 destroy_workqueue(rbd_wq);
Ilya Dryomove1b4d962013-12-13 15:28:57 +02006595err_out_slab:
6596 rbd_slab_exit();
Alex Elder1c2a9df2013-05-01 12:43:03 -05006597 return rc;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07006598}
6599
Alex Eldercc344fa2013-02-19 12:25:56 -06006600static void __exit rbd_exit(void)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07006601{
Ilya Dryomovffe312c2014-05-20 15:46:04 +04006602 ida_destroy(&rbd_dev_id_ida);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07006603 rbd_sysfs_cleanup();
Ilya Dryomov9b60e702013-12-13 15:28:57 +02006604 if (single_major)
6605 unregister_blkdev(rbd_major, RBD_DRV_NAME);
Ilya Dryomovf5ee37b2014-10-09 17:06:01 +04006606 destroy_workqueue(rbd_wq);
Alex Elder1c2a9df2013-05-01 12:43:03 -05006607 rbd_slab_exit();
Yehuda Sadeh602adf42010-08-12 16:11:25 -07006608}
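/*
 * Module teardown mirrors rbd_init() in reverse; ida_destroy() also
 * frees the memory backing the allocator used to hand out dev_id
 * values.
 */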
6609
6610module_init(rbd_init);
6611module_exit(rbd_exit);
6612
Alex Elderd552c612013-05-31 20:13:09 -05006613MODULE_AUTHOR("Alex Elder <elder@inktank.com>");
Yehuda Sadeh602adf42010-08-12 16:11:25 -07006614MODULE_AUTHOR("Sage Weil <sage@newdream.net>");
6615MODULE_AUTHOR("Yehuda Sadeh <yehuda@hq.newdream.net>");
Yehuda Sadeh602adf42010-08-12 16:11:25 -07006616/* following authorship retained from original osdblk.c */
6617MODULE_AUTHOR("Jeff Garzik <jeff@garzik.org>");
6618
Ilya Dryomov90da2582013-12-13 15:28:56 +02006619MODULE_DESCRIPTION("RADOS Block Device (RBD) driver");
Yehuda Sadeh602adf42010-08-12 16:11:25 -07006620MODULE_LICENSE("GPL");