
/*
   rbd.c -- Export ceph rados objects as a Linux block device


   based on drivers/block/osdblk.c:

   Copyright 2009 Red Hat, Inc.

   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; see the file COPYING.  If not, write to
   the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.



   For usage instructions, please refer to:

		 Documentation/ABI/testing/sysfs-bus-rbd

 */

#include <linux/ceph/libceph.h>
#include <linux/ceph/osd_client.h>
#include <linux/ceph/mon_client.h>
#include <linux/ceph/cls_lock_client.h>
#include <linux/ceph/striper.h>
#include <linux/ceph/decode.h>
#include <linux/fs_parser.h>
#include <linux/bsearch.h>

#include <linux/kernel.h>
#include <linux/device.h>
#include <linux/module.h>
#include <linux/blk-mq.h>
#include <linux/fs.h>
#include <linux/blkdev.h>
#include <linux/slab.h>
#include <linux/idr.h>
#include <linux/workqueue.h>

#include "rbd_types.h"

#define RBD_DEBUG	/* Activate rbd_assert() calls */

/*
 * Increment the given counter and return its updated value.
 * If the counter is already 0 it will not be incremented.
 * If the counter is already at its maximum value, -EINVAL is
 * returned without updating it.
 */
static int atomic_inc_return_safe(atomic_t *v)
{
	unsigned int counter;

	counter = (unsigned int)atomic_fetch_add_unless(v, 1, 0);
	if (counter <= (unsigned int)INT_MAX)
		return (int)counter;

	atomic_dec(v);

	return -EINVAL;
}

/* Decrement the counter.  Return the resulting value, or -EINVAL */
static int atomic_dec_return_safe(atomic_t *v)
{
	int counter;

	counter = atomic_dec_return(v);
	if (counter >= 0)
		return counter;

	atomic_inc(v);

	return -EINVAL;
}

#define RBD_DRV_NAME "rbd"

#define RBD_MINORS_PER_MAJOR		256
#define RBD_SINGLE_MAJOR_PART_SHIFT	4

#define RBD_MAX_PARENT_CHAIN_LEN	16

#define RBD_SNAP_DEV_NAME_PREFIX	"snap_"
#define RBD_MAX_SNAP_NAME_LEN	\
			(NAME_MAX - (sizeof (RBD_SNAP_DEV_NAME_PREFIX) - 1))

#define RBD_MAX_SNAP_COUNT	510	/* allows max snapc to fit in 4KB */

#define RBD_SNAP_HEAD_NAME	"-"

#define	BAD_SNAP_INDEX	U32_MAX		/* invalid index into snap array */

/* This allows a single page to hold an image name sent by OSD */
#define RBD_IMAGE_NAME_LEN_MAX	(PAGE_SIZE - sizeof (__le32) - 1)
#define RBD_IMAGE_ID_LEN_MAX	64

#define RBD_OBJ_PREFIX_LEN_MAX	64

#define RBD_NOTIFY_TIMEOUT	5	/* seconds */
#define RBD_RETRY_DELAY		msecs_to_jiffies(1000)

/* Feature bits */

#define RBD_FEATURE_LAYERING		(1ULL<<0)
#define RBD_FEATURE_STRIPINGV2		(1ULL<<1)
#define RBD_FEATURE_EXCLUSIVE_LOCK	(1ULL<<2)
#define RBD_FEATURE_OBJECT_MAP		(1ULL<<3)
#define RBD_FEATURE_FAST_DIFF		(1ULL<<4)
#define RBD_FEATURE_DEEP_FLATTEN	(1ULL<<5)
#define RBD_FEATURE_DATA_POOL		(1ULL<<7)
#define RBD_FEATURE_OPERATIONS		(1ULL<<8)

#define RBD_FEATURES_ALL	(RBD_FEATURE_LAYERING |		\
				 RBD_FEATURE_STRIPINGV2 |	\
				 RBD_FEATURE_EXCLUSIVE_LOCK |	\
				 RBD_FEATURE_OBJECT_MAP |	\
				 RBD_FEATURE_FAST_DIFF |	\
				 RBD_FEATURE_DEEP_FLATTEN |	\
				 RBD_FEATURE_DATA_POOL |	\
				 RBD_FEATURE_OPERATIONS)

/* Features supported by this (client software) implementation. */

#define RBD_FEATURES_SUPPORTED	(RBD_FEATURES_ALL)

/*
 * An RBD device name will be "rbd#", where the "rbd" comes from
 * RBD_DRV_NAME above, and # is a unique integer identifier.
 */
#define DEV_NAME_LEN		32

/*
 * block device image metadata (in-memory version)
 */
struct rbd_image_header {
	/* These six fields never change for a given rbd image */
	char *object_prefix;
	__u8 obj_order;
	u64 stripe_unit;
	u64 stripe_count;
	s64 data_pool_id;
	u64 features;		/* Might be changeable someday? */

	/* The remaining fields need to be updated occasionally */
	u64 image_size;
	struct ceph_snap_context *snapc;
	char *snap_names;	/* format 1 only */
	u64 *snap_sizes;	/* format 1 only */
};

/*
 * An rbd image specification.
 *
 * The tuple (pool_id, image_id, snap_id) is sufficient to uniquely
 * identify an image.  Each rbd_dev structure includes a pointer to
 * an rbd_spec structure that encapsulates this identity.
 *
 * Each of the id's in an rbd_spec has an associated name.  For a
 * user-mapped image, the names are supplied and the id's associated
 * with them are looked up.  For a layered image, a parent image is
 * defined by the tuple, and the names are looked up.
 *
 * An rbd_dev structure contains a parent_spec pointer which is
 * non-null if the image it represents is a child in a layered
 * image.  This pointer will refer to the rbd_spec structure used
 * by the parent rbd_dev for its own identity (i.e., the structure
 * is shared between the parent and child).
 *
 * Since these structures are populated once, during the discovery
 * phase of image construction, they are effectively immutable so
 * we make no effort to synchronize access to them.
 *
 * Note that code herein does not assume the image name is known (it
 * could be a null pointer).
 */
struct rbd_spec {
	u64		pool_id;
	const char	*pool_name;
	const char	*pool_ns;	/* NULL if default, never "" */

	const char	*image_id;
	const char	*image_name;

	u64		snap_id;
	const char	*snap_name;

	struct kref	kref;
};

/*
 * an instance of the client.  multiple devices may share an rbd client.
 */
struct rbd_client {
	struct ceph_client	*client;
	struct kref		kref;
	struct list_head	node;
};

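/*
 * Completion bookkeeping shared by the image and object request state
 * machines: the first non-zero result is latched and num_pending counts
 * sub-requests that have not completed yet (see pending_result_dec()).
 */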
struct pending_result {
	int			result;		/* first nonzero result */
	int			num_pending;
};

struct rbd_img_request;

enum obj_request_type {
	OBJ_REQUEST_NODATA = 1,
	OBJ_REQUEST_BIO,	/* pointer into provided bio (list) */
	OBJ_REQUEST_BVECS,	/* pointer into provided bio_vec array */
	OBJ_REQUEST_OWN_BVECS,	/* private bio_vec array, doesn't own pages */
};

enum obj_operation_type {
	OBJ_OP_READ = 1,
	OBJ_OP_WRITE,
	OBJ_OP_DISCARD,
	OBJ_OP_ZEROOUT,
};

#define RBD_OBJ_FLAG_DELETION			(1U << 0)
#define RBD_OBJ_FLAG_COPYUP_ENABLED		(1U << 1)
#define RBD_OBJ_FLAG_COPYUP_ZEROS		(1U << 2)
#define RBD_OBJ_FLAG_MAY_EXIST			(1U << 3)
#define RBD_OBJ_FLAG_NOOP_FOR_NONEXISTENT	(1U << 4)

enum rbd_obj_read_state {
	RBD_OBJ_READ_START = 1,
	RBD_OBJ_READ_OBJECT,
	RBD_OBJ_READ_PARENT,
};

/*
 * Writes go through the following state machine to deal with
 * layering:
 *
 *            . . . . . RBD_OBJ_WRITE_GUARD. . . . . . . . . . . . . .
 *            .                 |                                    .
 *            .                 v                                    .
 *            .    RBD_OBJ_WRITE_READ_FROM_PARENT. . .               .
 *            .                 |                    .               .
 *            .                 v                    v (deep-copyup  .
 *    (image  .   RBD_OBJ_WRITE_COPYUP_EMPTY_SNAPC   .  not needed)  .
 * flattened) v                 |                    .               .
 *            .                 v                    .               .
 *            . . . .RBD_OBJ_WRITE_COPYUP_OPS. . . . .      (copyup  .
 *                              |                        not needed) v
 *                              v                                    .
 *                            done . . . . . . . . . . . . . . . . . .
 *                              ^
 *                              |
 *                     RBD_OBJ_WRITE_FLAT
 *
 * Writes start in RBD_OBJ_WRITE_GUARD or _FLAT, depending on whether
 * assert_exists guard is needed or not (in some cases it's not needed
 * even if there is a parent).
 */
enum rbd_obj_write_state {
	RBD_OBJ_WRITE_START = 1,
	RBD_OBJ_WRITE_PRE_OBJECT_MAP,
	RBD_OBJ_WRITE_OBJECT,
	__RBD_OBJ_WRITE_COPYUP,
	RBD_OBJ_WRITE_COPYUP,
	RBD_OBJ_WRITE_POST_OBJECT_MAP,
};

enum rbd_obj_copyup_state {
	RBD_OBJ_COPYUP_START = 1,
	RBD_OBJ_COPYUP_READ_PARENT,
	__RBD_OBJ_COPYUP_OBJECT_MAPS,
	RBD_OBJ_COPYUP_OBJECT_MAPS,
	__RBD_OBJ_COPYUP_WRITE_OBJECT,
	RBD_OBJ_COPYUP_WRITE_OBJECT,
};

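/*
 * A request for a single object (or a portion of one) within an image
 * request.  Carries the data position, copyup state and the list of
 * OSD requests issued on its behalf.
 */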
struct rbd_obj_request {
	struct ceph_object_extent ex;
	unsigned int		flags;	/* RBD_OBJ_FLAG_* */
	union {
		enum rbd_obj_read_state	 read_state;	/* for reads */
		enum rbd_obj_write_state write_state;	/* for writes */
	};

	struct rbd_img_request	*img_request;
	struct ceph_file_extent	*img_extents;
	u32			num_img_extents;

	union {
		struct ceph_bio_iter	bio_pos;
		struct {
			struct ceph_bvec_iter	bvec_pos;
			u32			bvec_count;
			u32			bvec_idx;
		};
	};

	enum rbd_obj_copyup_state copyup_state;
	struct bio_vec		*copyup_bvecs;
	u32			copyup_bvec_count;

	struct list_head	osd_reqs;	/* w/ r_private_item */

	struct mutex		state_mutex;
	struct pending_result	pending;
	struct kref		kref;
};

enum img_req_flags {
	IMG_REQ_CHILD,		/* initiator: block = 0, child image = 1 */
	IMG_REQ_LAYERED,	/* ENOENT handling: normal = 0, layered = 1 */
};

enum rbd_img_state {
	RBD_IMG_START = 1,
	RBD_IMG_EXCLUSIVE_LOCK,
	__RBD_IMG_OBJECT_REQUESTS,
	RBD_IMG_OBJECT_REQUESTS,
};

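/*
 * A request aimed at a whole image (or one of its snapshots), split into
 * one rbd_obj_request per object extent it touches.
 */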
struct rbd_img_request {
	struct rbd_device	*rbd_dev;
	enum obj_operation_type	op_type;
	enum obj_request_type	data_type;
	unsigned long		flags;
	enum rbd_img_state	state;
	union {
		u64			snap_id;	/* for reads */
		struct ceph_snap_context *snapc;	/* for writes */
	};
	struct rbd_obj_request	*obj_request;	/* obj req initiator */

	struct list_head	lock_item;
	struct list_head	object_extents;	/* obj_req.ex structs */

	struct mutex		state_mutex;
	struct pending_result	pending;
	struct work_struct	work;
	int			work_result;
};

#define for_each_obj_request(ireq, oreq) \
	list_for_each_entry(oreq, &(ireq)->object_extents, ex.oe_item)
#define for_each_obj_request_safe(ireq, oreq, n) \
	list_for_each_entry_safe(oreq, n, &(ireq)->object_extents, ex.oe_item)

enum rbd_watch_state {
	RBD_WATCH_STATE_UNREGISTERED,
	RBD_WATCH_STATE_REGISTERED,
	RBD_WATCH_STATE_ERROR,
};

enum rbd_lock_state {
	RBD_LOCK_STATE_UNLOCKED,
	RBD_LOCK_STATE_LOCKED,
	RBD_LOCK_STATE_RELEASING,
};

/* WatchNotify::ClientId */
struct rbd_client_id {
	u64 gid;
	u64 handle;
};

struct rbd_mapping {
	u64 size;
};

/*
 * a single device
 */
struct rbd_device {
	int			dev_id;		/* blkdev unique id */

	int			major;		/* blkdev assigned major */
	int			minor;
	struct gendisk		*disk;		/* blkdev's gendisk and rq */

	u32			image_format;	/* Either 1 or 2 */
	struct rbd_client	*rbd_client;

	char			name[DEV_NAME_LEN]; /* blkdev name, e.g. rbd3 */

	spinlock_t		lock;		/* queue, flags, open_count */

	struct rbd_image_header	header;
	unsigned long		flags;		/* possibly lock protected */
	struct rbd_spec		*spec;
	struct rbd_options	*opts;
	char			*config_info;	/* add{,_single_major} string */

	struct ceph_object_id	header_oid;
	struct ceph_object_locator header_oloc;

	struct ceph_file_layout	layout;		/* used for all rbd requests */

	struct mutex		watch_mutex;
	enum rbd_watch_state	watch_state;
	struct ceph_osd_linger_request *watch_handle;
	u64			watch_cookie;
	struct delayed_work	watch_dwork;

	struct rw_semaphore	lock_rwsem;
	enum rbd_lock_state	lock_state;
	char			lock_cookie[32];
	struct rbd_client_id	owner_cid;
	struct work_struct	acquired_lock_work;
	struct work_struct	released_lock_work;
	struct delayed_work	lock_dwork;
	struct work_struct	unlock_work;
	spinlock_t		lock_lists_lock;
	struct list_head	acquiring_list;
	struct list_head	running_list;
	struct completion	acquire_wait;
	int			acquire_err;
	struct completion	releasing_wait;

	spinlock_t		object_map_lock;
	u8			*object_map;
	u64			object_map_size;	/* in objects */
	u64			object_map_flags;

	struct workqueue_struct	*task_wq;

	struct rbd_spec		*parent_spec;
	u64			parent_overlap;
	atomic_t		parent_ref;
	struct rbd_device	*parent;

	/* Block layer tags. */
	struct blk_mq_tag_set	tag_set;

	/* protects updating the header */
	struct rw_semaphore	header_rwsem;

	struct rbd_mapping	mapping;

	struct list_head	node;

	/* sysfs related */
	struct device		dev;
	unsigned long		open_count;	/* protected by lock */
};

/*
 * Flag bits for rbd_dev->flags:
 * - REMOVING (which is coupled with rbd_dev->open_count) is protected
 *   by rbd_dev->lock
 */
enum rbd_dev_flags {
	RBD_DEV_FLAG_EXISTS,	/* rbd_dev_device_setup() ran */
	RBD_DEV_FLAG_REMOVING,	/* this mapping is being removed */
	RBD_DEV_FLAG_READONLY,	/* -o ro or snapshot */
};

static DEFINE_MUTEX(client_mutex);	/* Serialize client creation */

static LIST_HEAD(rbd_dev_list);    /* devices */
static DEFINE_SPINLOCK(rbd_dev_list_lock);

static LIST_HEAD(rbd_client_list);		/* clients */
static DEFINE_SPINLOCK(rbd_client_list_lock);

/* Slab caches for frequently-allocated structures */

static struct kmem_cache	*rbd_img_request_cache;
static struct kmem_cache	*rbd_obj_request_cache;

static int rbd_major;
static DEFINE_IDA(rbd_dev_id_ida);

static struct workqueue_struct *rbd_wq;

static struct ceph_snap_context rbd_empty_snapc = {
	.nref = REFCOUNT_INIT(1),
};

/*
 * single-major requires >= 0.75 version of userspace rbd utility.
 */
static bool single_major = true;
module_param(single_major, bool, 0444);
MODULE_PARM_DESC(single_major, "Use a single major number for all rbd devices (default: true)");

static ssize_t add_store(struct bus_type *bus, const char *buf, size_t count);
static ssize_t remove_store(struct bus_type *bus, const char *buf,
			    size_t count);
static ssize_t add_single_major_store(struct bus_type *bus, const char *buf,
				      size_t count);
static ssize_t remove_single_major_store(struct bus_type *bus, const char *buf,
					 size_t count);
static int rbd_dev_image_probe(struct rbd_device *rbd_dev, int depth);

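/*
 * With single_major, all images share one major number and each mapping
 * gets a block of 1 << RBD_SINGLE_MAJOR_PART_SHIFT minors (the device
 * plus its partitions), hence the shifts in both directions below.
 */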
static int rbd_dev_id_to_minor(int dev_id)
{
	return dev_id << RBD_SINGLE_MAJOR_PART_SHIFT;
}

static int minor_to_rbd_dev_id(int minor)
{
	return minor >> RBD_SINGLE_MAJOR_PART_SHIFT;
}

static bool rbd_is_ro(struct rbd_device *rbd_dev)
{
	return test_bit(RBD_DEV_FLAG_READONLY, &rbd_dev->flags);
}

static bool rbd_is_snap(struct rbd_device *rbd_dev)
{
	return rbd_dev->spec->snap_id != CEPH_NOSNAP;
}

static bool __rbd_is_lock_owner(struct rbd_device *rbd_dev)
{
	lockdep_assert_held(&rbd_dev->lock_rwsem);

	return rbd_dev->lock_state == RBD_LOCK_STATE_LOCKED ||
	       rbd_dev->lock_state == RBD_LOCK_STATE_RELEASING;
}

static bool rbd_is_lock_owner(struct rbd_device *rbd_dev)
{
	bool is_lock_owner;

	down_read(&rbd_dev->lock_rwsem);
	is_lock_owner = __rbd_is_lock_owner(rbd_dev);
	up_read(&rbd_dev->lock_rwsem);
	return is_lock_owner;
}

static ssize_t supported_features_show(struct bus_type *bus, char *buf)
{
	return sprintf(buf, "0x%llx\n", RBD_FEATURES_SUPPORTED);
}

static BUS_ATTR_WO(add);
static BUS_ATTR_WO(remove);
static BUS_ATTR_WO(add_single_major);
static BUS_ATTR_WO(remove_single_major);
static BUS_ATTR_RO(supported_features);

static struct attribute *rbd_bus_attrs[] = {
	&bus_attr_add.attr,
	&bus_attr_remove.attr,
	&bus_attr_add_single_major.attr,
	&bus_attr_remove_single_major.attr,
	&bus_attr_supported_features.attr,
	NULL,
};

static umode_t rbd_bus_is_visible(struct kobject *kobj,
				  struct attribute *attr, int index)
{
	if (!single_major &&
	    (attr == &bus_attr_add_single_major.attr ||
	     attr == &bus_attr_remove_single_major.attr))
		return 0;

	return attr->mode;
}

static const struct attribute_group rbd_bus_group = {
	.attrs = rbd_bus_attrs,
	.is_visible = rbd_bus_is_visible,
};
__ATTRIBUTE_GROUPS(rbd_bus);

static struct bus_type rbd_bus_type = {
	.name		= "rbd",
	.bus_groups	= rbd_bus_groups,
};

static void rbd_root_dev_release(struct device *dev)
{
}

static struct device rbd_root_dev = {
	.init_name =    "rbd",
	.release =      rbd_root_dev_release,
};

static __printf(2, 3)
void rbd_warn(struct rbd_device *rbd_dev, const char *fmt, ...)
{
	struct va_format vaf;
	va_list args;

	va_start(args, fmt);
	vaf.fmt = fmt;
	vaf.va = &args;

	if (!rbd_dev)
		printk(KERN_WARNING "%s: %pV\n", RBD_DRV_NAME, &vaf);
	else if (rbd_dev->disk)
		printk(KERN_WARNING "%s: %s: %pV\n",
			RBD_DRV_NAME, rbd_dev->disk->disk_name, &vaf);
	else if (rbd_dev->spec && rbd_dev->spec->image_name)
		printk(KERN_WARNING "%s: image %s: %pV\n",
			RBD_DRV_NAME, rbd_dev->spec->image_name, &vaf);
	else if (rbd_dev->spec && rbd_dev->spec->image_id)
		printk(KERN_WARNING "%s: id %s: %pV\n",
			RBD_DRV_NAME, rbd_dev->spec->image_id, &vaf);
	else	/* punt */
		printk(KERN_WARNING "%s: rbd_dev %p: %pV\n",
			RBD_DRV_NAME, rbd_dev, &vaf);
	va_end(args);
}

#ifdef RBD_DEBUG
#define rbd_assert(expr)						\
		if (unlikely(!(expr))) {				\
			printk(KERN_ERR "\nAssertion failure in %s() "	\
						"at line %d:\n\n"	\
					"\trbd_assert(%s);\n\n",	\
					__func__, __LINE__, #expr);	\
			BUG();						\
		}
#else /* !RBD_DEBUG */
#  define rbd_assert(expr)	((void) 0)
#endif /* !RBD_DEBUG */

static void rbd_dev_remove_parent(struct rbd_device *rbd_dev);

static int rbd_dev_refresh(struct rbd_device *rbd_dev);
static int rbd_dev_v2_header_onetime(struct rbd_device *rbd_dev);
static int rbd_dev_header_info(struct rbd_device *rbd_dev);
static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev);
static const char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev,
					u64 snap_id);
static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id,
				u8 *order, u64 *snap_size);
static int rbd_dev_v2_get_flags(struct rbd_device *rbd_dev);

static void rbd_obj_handle_request(struct rbd_obj_request *obj_req, int result);
static void rbd_img_handle_request(struct rbd_img_request *img_req, int result);

/*
 * Return true if nothing else is pending.
 */
static bool pending_result_dec(struct pending_result *pending, int *result)
{
	rbd_assert(pending->num_pending > 0);

	if (*result && !pending->result)
		pending->result = *result;
	if (--pending->num_pending)
		return false;

	*result = pending->result;
	return true;
}

static int rbd_open(struct block_device *bdev, fmode_t mode)
{
	struct rbd_device *rbd_dev = bdev->bd_disk->private_data;
	bool removing = false;

	spin_lock_irq(&rbd_dev->lock);
	if (test_bit(RBD_DEV_FLAG_REMOVING, &rbd_dev->flags))
		removing = true;
	else
		rbd_dev->open_count++;
	spin_unlock_irq(&rbd_dev->lock);
	if (removing)
		return -ENOENT;

	(void) get_device(&rbd_dev->dev);

	return 0;
}

static void rbd_release(struct gendisk *disk, fmode_t mode)
{
	struct rbd_device *rbd_dev = disk->private_data;
	unsigned long open_count_before;

	spin_lock_irq(&rbd_dev->lock);
	open_count_before = rbd_dev->open_count--;
	spin_unlock_irq(&rbd_dev->lock);
	rbd_assert(open_count_before > 0);

	put_device(&rbd_dev->dev);
}

static int rbd_ioctl_set_ro(struct rbd_device *rbd_dev, unsigned long arg)
{
	int ro;

	if (get_user(ro, (int __user *)arg))
		return -EFAULT;

	/*
	 * Both images mapped read-only and snapshots can't be marked
	 * read-write.
	 */
	if (!ro) {
		if (rbd_is_ro(rbd_dev))
			return -EROFS;

		rbd_assert(!rbd_is_snap(rbd_dev));
	}

	/* Let blkdev_roset() handle it */
	return -ENOTTY;
}

static int rbd_ioctl(struct block_device *bdev, fmode_t mode,
			unsigned int cmd, unsigned long arg)
{
	struct rbd_device *rbd_dev = bdev->bd_disk->private_data;
	int ret;

	switch (cmd) {
	case BLKROSET:
		ret = rbd_ioctl_set_ro(rbd_dev, arg);
		break;
	default:
		ret = -ENOTTY;
	}

	return ret;
}

#ifdef CONFIG_COMPAT
static int rbd_compat_ioctl(struct block_device *bdev, fmode_t mode,
				unsigned int cmd, unsigned long arg)
{
	return rbd_ioctl(bdev, mode, cmd, arg);
}
#endif /* CONFIG_COMPAT */

static const struct block_device_operations rbd_bd_ops = {
	.owner			= THIS_MODULE,
	.open			= rbd_open,
	.release		= rbd_release,
	.ioctl			= rbd_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl		= rbd_compat_ioctl,
#endif
};

/*
 * Initialize an rbd client instance.  Success or not, this function
 * consumes ceph_opts.  Caller holds client_mutex.
 */
static struct rbd_client *rbd_client_create(struct ceph_options *ceph_opts)
{
	struct rbd_client *rbdc;
	int ret = -ENOMEM;

	dout("%s:\n", __func__);
	rbdc = kmalloc(sizeof(struct rbd_client), GFP_KERNEL);
	if (!rbdc)
		goto out_opt;

	kref_init(&rbdc->kref);
	INIT_LIST_HEAD(&rbdc->node);

	rbdc->client = ceph_create_client(ceph_opts, rbdc);
	if (IS_ERR(rbdc->client))
		goto out_rbdc;
	ceph_opts = NULL; /* Now rbdc->client is responsible for ceph_opts */

	ret = ceph_open_session(rbdc->client);
	if (ret < 0)
		goto out_client;

	spin_lock(&rbd_client_list_lock);
	list_add_tail(&rbdc->node, &rbd_client_list);
	spin_unlock(&rbd_client_list_lock);

	dout("%s: rbdc %p\n", __func__, rbdc);

	return rbdc;
out_client:
	ceph_destroy_client(rbdc->client);
out_rbdc:
	kfree(rbdc);
out_opt:
	if (ceph_opts)
		ceph_destroy_options(ceph_opts);
	dout("%s: error %d\n", __func__, ret);

	return ERR_PTR(ret);
}

static struct rbd_client *__rbd_get_client(struct rbd_client *rbdc)
{
	kref_get(&rbdc->kref);

	return rbdc;
}

/*
 * Find a ceph client with specific addr and configuration.  If
 * found, bump its reference count.
 */
static struct rbd_client *rbd_client_find(struct ceph_options *ceph_opts)
{
	struct rbd_client *client_node;
	bool found = false;

	if (ceph_opts->flags & CEPH_OPT_NOSHARE)
		return NULL;

	spin_lock(&rbd_client_list_lock);
	list_for_each_entry(client_node, &rbd_client_list, node) {
		if (!ceph_compare_options(ceph_opts, client_node->client)) {
			__rbd_get_client(client_node);

			found = true;
			break;
		}
	}
	spin_unlock(&rbd_client_list_lock);

	return found ? client_node : NULL;
}

/*
 * (Per device) rbd map options
 */
enum {
	Opt_queue_depth,
	Opt_alloc_size,
	Opt_lock_timeout,
	/* int args above */
	Opt_pool_ns,
	Opt_compression_hint,
	/* string args above */
	Opt_read_only,
	Opt_read_write,
	Opt_lock_on_read,
	Opt_exclusive,
	Opt_notrim,
};

enum {
	Opt_compression_hint_none,
	Opt_compression_hint_compressible,
	Opt_compression_hint_incompressible,
};

static const struct constant_table rbd_param_compression_hint[] = {
	{"none",		Opt_compression_hint_none},
	{"compressible",	Opt_compression_hint_compressible},
	{"incompressible",	Opt_compression_hint_incompressible},
	{}
};

static const struct fs_parameter_spec rbd_parameters[] = {
	fsparam_u32	("alloc_size",			Opt_alloc_size),
	fsparam_enum	("compression_hint",		Opt_compression_hint,
			 rbd_param_compression_hint),
	fsparam_flag	("exclusive",			Opt_exclusive),
	fsparam_flag	("lock_on_read",		Opt_lock_on_read),
	fsparam_u32	("lock_timeout",		Opt_lock_timeout),
	fsparam_flag	("notrim",			Opt_notrim),
	fsparam_string	("_pool_ns",			Opt_pool_ns),
	fsparam_u32	("queue_depth",			Opt_queue_depth),
	fsparam_flag	("read_only",			Opt_read_only),
	fsparam_flag	("read_write",			Opt_read_write),
	fsparam_flag	("ro",				Opt_read_only),
	fsparam_flag	("rw",				Opt_read_write),
	{}
};

struct rbd_options {
	int	queue_depth;
	int	alloc_size;
	unsigned long	lock_timeout;
	bool	read_only;
	bool	lock_on_read;
	bool	exclusive;
	bool	trim;

	u32 alloc_hint_flags;  /* CEPH_OSD_OP_ALLOC_HINT_FLAG_* */
};

#define RBD_QUEUE_DEPTH_DEFAULT	BLKDEV_MAX_RQ
#define RBD_ALLOC_SIZE_DEFAULT	(64 * 1024)
#define RBD_LOCK_TIMEOUT_DEFAULT 0  /* no timeout */
#define RBD_READ_ONLY_DEFAULT	false
#define RBD_LOCK_ON_READ_DEFAULT false
#define RBD_EXCLUSIVE_DEFAULT	false
#define RBD_TRIM_DEFAULT	true

struct rbd_parse_opts_ctx {
	struct rbd_spec		*spec;
	struct ceph_options	*copts;
	struct rbd_options	*opts;
};

static char* obj_op_name(enum obj_operation_type op_type)
{
	switch (op_type) {
	case OBJ_OP_READ:
		return "read";
	case OBJ_OP_WRITE:
		return "write";
	case OBJ_OP_DISCARD:
		return "discard";
	case OBJ_OP_ZEROOUT:
		return "zeroout";
	default:
		return "???";
	}
}

/*
 * Destroy ceph client
 *
 * Caller must hold rbd_client_list_lock.
 */
static void rbd_client_release(struct kref *kref)
{
	struct rbd_client *rbdc = container_of(kref, struct rbd_client, kref);

	dout("%s: rbdc %p\n", __func__, rbdc);
	spin_lock(&rbd_client_list_lock);
	list_del(&rbdc->node);
	spin_unlock(&rbd_client_list_lock);

	ceph_destroy_client(rbdc->client);
	kfree(rbdc);
}

/*
 * Drop reference to ceph client node. If it's not referenced anymore, release
 * it.
 */
static void rbd_put_client(struct rbd_client *rbdc)
{
	if (rbdc)
		kref_put(&rbdc->kref, rbd_client_release);
}

/*
 * Get a ceph client with specific addr and configuration, if one does
 * not exist create it.  Either way, ceph_opts is consumed by this
 * function.
 */
static struct rbd_client *rbd_get_client(struct ceph_options *ceph_opts)
{
	struct rbd_client *rbdc;
	int ret;

	mutex_lock(&client_mutex);
	rbdc = rbd_client_find(ceph_opts);
	if (rbdc) {
		ceph_destroy_options(ceph_opts);

		/*
		 * Using an existing client.  Make sure ->pg_pools is up to
		 * date before we look up the pool id in do_rbd_add().
		 */
		ret = ceph_wait_for_latest_osdmap(rbdc->client,
					rbdc->client->options->mount_timeout);
		if (ret) {
			rbd_warn(NULL, "failed to get latest osdmap: %d", ret);
			rbd_put_client(rbdc);
			rbdc = ERR_PTR(ret);
		}
	} else {
		rbdc = rbd_client_create(ceph_opts);
	}
	mutex_unlock(&client_mutex);

	return rbdc;
}

static bool rbd_image_format_valid(u32 image_format)
{
	return image_format == 1 || image_format == 2;
}

static bool rbd_dev_ondisk_valid(struct rbd_image_header_ondisk *ondisk)
{
	size_t size;
	u32 snap_count;

	/* The header has to start with the magic rbd header text */
	if (memcmp(&ondisk->text, RBD_HEADER_TEXT, sizeof (RBD_HEADER_TEXT)))
		return false;

	/* The bio layer requires at least sector-sized I/O */

	if (ondisk->options.order < SECTOR_SHIFT)
		return false;

	/* If we use u64 in a few spots we may be able to loosen this */

	if (ondisk->options.order > 8 * sizeof (int) - 1)
		return false;

	/*
	 * The size of a snapshot header has to fit in a size_t, and
	 * that limits the number of snapshots.
	 */
	snap_count = le32_to_cpu(ondisk->snap_count);
	size = SIZE_MAX - sizeof (struct ceph_snap_context);
	if (snap_count > size / sizeof (__le64))
		return false;

	/*
	 * Not only that, but the size of the entire snapshot
	 * header must also be representable in a size_t.
	 */
	size -= snap_count * sizeof (__le64);
	if ((u64) size < le64_to_cpu(ondisk->snap_names_len))
		return false;

	return true;
}

/*
 * returns the size of an object in the image
 */
static u32 rbd_obj_bytes(struct rbd_image_header *header)
{
	return 1U << header->obj_order;
}

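/*
 * Derive the ceph_file_layout used for all requests from the image
 * header: fall back to one object per stripe when fancy striping is not
 * set, and to the mapped pool when there is no separate data pool.
 */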
static void rbd_init_layout(struct rbd_device *rbd_dev)
{
	if (rbd_dev->header.stripe_unit == 0 ||
	    rbd_dev->header.stripe_count == 0) {
		rbd_dev->header.stripe_unit = rbd_obj_bytes(&rbd_dev->header);
		rbd_dev->header.stripe_count = 1;
	}

	rbd_dev->layout.stripe_unit = rbd_dev->header.stripe_unit;
	rbd_dev->layout.stripe_count = rbd_dev->header.stripe_count;
	rbd_dev->layout.object_size = rbd_obj_bytes(&rbd_dev->header);
	rbd_dev->layout.pool_id = rbd_dev->header.data_pool_id == CEPH_NOPOOL ?
			rbd_dev->spec->pool_id : rbd_dev->header.data_pool_id;
	RCU_INIT_POINTER(rbd_dev->layout.pool_ns, NULL);
}

/*
 * Fill an rbd image header with information from the given format 1
 * on-disk header.
 */
static int rbd_header_from_disk(struct rbd_device *rbd_dev,
				 struct rbd_image_header_ondisk *ondisk)
{
	struct rbd_image_header *header = &rbd_dev->header;
	bool first_time = header->object_prefix == NULL;
	struct ceph_snap_context *snapc;
	char *object_prefix = NULL;
	char *snap_names = NULL;
	u64 *snap_sizes = NULL;
	u32 snap_count;
	int ret = -ENOMEM;
	u32 i;

	/* Allocate this now to avoid having to handle failure below */

	if (first_time) {
		object_prefix = kstrndup(ondisk->object_prefix,
					 sizeof(ondisk->object_prefix),
					 GFP_KERNEL);
		if (!object_prefix)
			return -ENOMEM;
	}

	/* Allocate the snapshot context and fill it in */

	snap_count = le32_to_cpu(ondisk->snap_count);
	snapc = ceph_create_snap_context(snap_count, GFP_KERNEL);
	if (!snapc)
		goto out_err;
	snapc->seq = le64_to_cpu(ondisk->snap_seq);
	if (snap_count) {
		struct rbd_image_snap_ondisk *snaps;
		u64 snap_names_len = le64_to_cpu(ondisk->snap_names_len);

		/* We'll keep a copy of the snapshot names... */

		if (snap_names_len > (u64)SIZE_MAX)
			goto out_2big;
		snap_names = kmalloc(snap_names_len, GFP_KERNEL);
		if (!snap_names)
			goto out_err;

		/* ...as well as the array of their sizes. */
		snap_sizes = kmalloc_array(snap_count,
					   sizeof(*header->snap_sizes),
					   GFP_KERNEL);
		if (!snap_sizes)
			goto out_err;

		/*
		 * Copy the names, and fill in each snapshot's id
		 * and size.
		 *
		 * Note that rbd_dev_v1_header_info() guarantees the
		 * ondisk buffer we're working with has
		 * snap_names_len bytes beyond the end of the
		 * snapshot id array, so this memcpy() is safe.
		 */
		memcpy(snap_names, &ondisk->snaps[snap_count], snap_names_len);
		snaps = ondisk->snaps;
		for (i = 0; i < snap_count; i++) {
			snapc->snaps[i] = le64_to_cpu(snaps[i].id);
			snap_sizes[i] = le64_to_cpu(snaps[i].image_size);
		}
	}

	/* We won't fail any more, fill in the header */

	if (first_time) {
		header->object_prefix = object_prefix;
		header->obj_order = ondisk->options.order;
		rbd_init_layout(rbd_dev);
	} else {
		ceph_put_snap_context(header->snapc);
		kfree(header->snap_names);
		kfree(header->snap_sizes);
	}

	/* The remaining fields always get updated (when we refresh) */

	header->image_size = le64_to_cpu(ondisk->image_size);
	header->snapc = snapc;
	header->snap_names = snap_names;
	header->snap_sizes = snap_sizes;

	return 0;
out_2big:
	ret = -EIO;
out_err:
	kfree(snap_sizes);
	kfree(snap_names);
	ceph_put_snap_context(snapc);
	kfree(object_prefix);

	return ret;
}

static const char *_rbd_dev_v1_snap_name(struct rbd_device *rbd_dev, u32 which)
{
	const char *snap_name;

	rbd_assert(which < rbd_dev->header.snapc->num_snaps);

	/* Skip over names until we find the one we are looking for */

	snap_name = rbd_dev->header.snap_names;
	while (which--)
		snap_name += strlen(snap_name) + 1;

	return kstrdup(snap_name, GFP_KERNEL);
}

/*
 * Snapshot id comparison function for use with qsort()/bsearch().
 * Note that result is for snapshots in *descending* order.
 */
static int snapid_compare_reverse(const void *s1, const void *s2)
{
	u64 snap_id1 = *(u64 *)s1;
	u64 snap_id2 = *(u64 *)s2;

	if (snap_id1 < snap_id2)
		return 1;
	return snap_id1 == snap_id2 ? 0 : -1;
}

/*
 * Search a snapshot context to see if the given snapshot id is
 * present.
 *
 * Returns the position of the snapshot id in the array if it's found,
 * or BAD_SNAP_INDEX otherwise.
 *
 * Note: The snapshot array is kept sorted (by the osd) in
 * reverse order, highest snapshot id first.
 */
static u32 rbd_dev_snap_index(struct rbd_device *rbd_dev, u64 snap_id)
{
	struct ceph_snap_context *snapc = rbd_dev->header.snapc;
	u64 *found;

	found = bsearch(&snap_id, &snapc->snaps, snapc->num_snaps,
				sizeof (snap_id), snapid_compare_reverse);

	return found ? (u32)(found - &snapc->snaps[0]) : BAD_SNAP_INDEX;
}

static const char *rbd_dev_v1_snap_name(struct rbd_device *rbd_dev,
					u64 snap_id)
{
	u32 which;
	const char *snap_name;

	which = rbd_dev_snap_index(rbd_dev, snap_id);
	if (which == BAD_SNAP_INDEX)
		return ERR_PTR(-ENOENT);

	snap_name = _rbd_dev_v1_snap_name(rbd_dev, which);
	return snap_name ? snap_name : ERR_PTR(-ENOMEM);
}

static const char *rbd_snap_name(struct rbd_device *rbd_dev, u64 snap_id)
{
	if (snap_id == CEPH_NOSNAP)
		return RBD_SNAP_HEAD_NAME;

	rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
	if (rbd_dev->image_format == 1)
		return rbd_dev_v1_snap_name(rbd_dev, snap_id);

	return rbd_dev_v2_snap_name(rbd_dev, snap_id);
}

static int rbd_snap_size(struct rbd_device *rbd_dev, u64 snap_id,
				u64 *snap_size)
{
	rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
	if (snap_id == CEPH_NOSNAP) {
		*snap_size = rbd_dev->header.image_size;
	} else if (rbd_dev->image_format == 1) {
		u32 which;

		which = rbd_dev_snap_index(rbd_dev, snap_id);
		if (which == BAD_SNAP_INDEX)
			return -ENOENT;

		*snap_size = rbd_dev->header.snap_sizes[which];
	} else {
		u64 size = 0;
		int ret;

		ret = _rbd_dev_v2_snap_size(rbd_dev, snap_id, NULL, &size);
		if (ret)
			return ret;

		*snap_size = size;
	}
	return 0;
}

static int rbd_dev_mapping_set(struct rbd_device *rbd_dev)
{
	u64 snap_id = rbd_dev->spec->snap_id;
	u64 size = 0;
	int ret;

	ret = rbd_snap_size(rbd_dev, snap_id, &size);
	if (ret)
		return ret;

	rbd_dev->mapping.size = size;
	return 0;
}

static void rbd_dev_mapping_clear(struct rbd_device *rbd_dev)
{
	rbd_dev->mapping.size = 0;
}

Ilya Dryomov5359a172018-01-20 10:30:10 +01001273static void zero_bvec(struct bio_vec *bv)
Alex Elder65ccfe22012-08-09 10:33:26 -07001274{
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001275 void *buf;
Ilya Dryomov5359a172018-01-20 10:30:10 +01001276 unsigned long flags;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001277
Ilya Dryomov5359a172018-01-20 10:30:10 +01001278 buf = bvec_kmap_irq(bv, &flags);
1279 memset(buf, 0, bv->bv_len);
1280 flush_dcache_page(bv->bv_page);
1281 bvec_kunmap_irq(buf, &flags);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001282}
1283
Ilya Dryomov5359a172018-01-20 10:30:10 +01001284static void zero_bios(struct ceph_bio_iter *bio_pos, u32 off, u32 bytes)
Alex Elderb9434c52013-04-19 15:34:50 -05001285{
Ilya Dryomov5359a172018-01-20 10:30:10 +01001286 struct ceph_bio_iter it = *bio_pos;
Alex Elderb9434c52013-04-19 15:34:50 -05001287
Ilya Dryomov5359a172018-01-20 10:30:10 +01001288 ceph_bio_iter_advance(&it, off);
1289 ceph_bio_iter_advance_step(&it, bytes, ({
1290 zero_bvec(&bv);
1291 }));
Alex Elderb9434c52013-04-19 15:34:50 -05001292}
1293
Ilya Dryomov7e07efb2018-01-20 10:30:11 +01001294static void zero_bvecs(struct ceph_bvec_iter *bvec_pos, u32 off, u32 bytes)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001295{
Ilya Dryomov7e07efb2018-01-20 10:30:11 +01001296 struct ceph_bvec_iter it = *bvec_pos;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001297
Ilya Dryomov7e07efb2018-01-20 10:30:11 +01001298 ceph_bvec_iter_advance(&it, off);
1299 ceph_bvec_iter_advance_step(&it, bytes, ({
1300 zero_bvec(&bv);
1301 }));
Alex Elderf7760da2012-10-20 22:17:27 -05001302}
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001303
Alex Elderf7760da2012-10-20 22:17:27 -05001304/*
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001305 * Zero a range in @obj_req data buffer defined by a bio (list) or
Ilya Dryomovafb97882018-02-06 19:26:35 +01001306 * (private) bio_vec array.
Alex Elderf7760da2012-10-20 22:17:27 -05001307 *
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001308 * @off is relative to the start of the data buffer.
Alex Elderf7760da2012-10-20 22:17:27 -05001309 */
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001310static void rbd_obj_zero_range(struct rbd_obj_request *obj_req, u32 off,
1311 u32 bytes)
Alex Elderf7760da2012-10-20 22:17:27 -05001312{
Ilya Dryomov54ab3b22019-05-11 16:21:49 +02001313 dout("%s %p data buf %u~%u\n", __func__, obj_req, off, bytes);
1314
Ilya Dryomovecc633c2018-02-01 11:50:47 +01001315 switch (obj_req->img_request->data_type) {
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001316 case OBJ_REQUEST_BIO:
1317 zero_bios(&obj_req->bio_pos, off, bytes);
1318 break;
1319 case OBJ_REQUEST_BVECS:
Ilya Dryomovafb97882018-02-06 19:26:35 +01001320 case OBJ_REQUEST_OWN_BVECS:
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001321 zero_bvecs(&obj_req->bvec_pos, off, bytes);
1322 break;
1323 default:
Arnd Bergmann16809372019-03-22 17:53:56 +01001324 BUG();
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001325 }
Alex Elderbf0d5f502012-11-22 00:00:08 -06001326}
1327
1328static void rbd_obj_request_destroy(struct kref *kref);
1329static void rbd_obj_request_put(struct rbd_obj_request *obj_request)
1330{
1331 rbd_assert(obj_request != NULL);
Alex Elder37206ee2013-02-20 17:32:08 -06001332 dout("%s: obj %p (was %d)\n", __func__, obj_request,
Peter Zijlstra2c935bc2016-11-14 17:29:48 +01001333 kref_read(&obj_request->kref));
Alex Elderbf0d5f502012-11-22 00:00:08 -06001334 kref_put(&obj_request->kref, rbd_obj_request_destroy);
1335}
1336
Alex Elderbf0d5f502012-11-22 00:00:08 -06001337static inline void rbd_img_obj_request_add(struct rbd_img_request *img_request,
1338 struct rbd_obj_request *obj_request)
1339{
Alex Elder25dcf952013-01-25 17:08:55 -06001340 rbd_assert(obj_request->img_request == NULL);
1341
Alex Elderb155e862013-04-15 14:50:37 -05001342 /* Image request now owns object's original reference */
Alex Elderbf0d5f502012-11-22 00:00:08 -06001343 obj_request->img_request = img_request;
Ilya Dryomov15961b42018-02-01 11:50:47 +01001344 dout("%s: img %p obj %p\n", __func__, img_request, obj_request);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001345}
1346
1347static inline void rbd_img_obj_request_del(struct rbd_img_request *img_request,
1348 struct rbd_obj_request *obj_request)
1349{
Ilya Dryomov15961b42018-02-01 11:50:47 +01001350 dout("%s: img %p obj %p\n", __func__, img_request, obj_request);
Ilya Dryomov43df3d32018-02-02 15:23:22 +01001351 list_del(&obj_request->ex.oe_item);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001352 rbd_assert(obj_request->img_request == img_request);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001353 rbd_obj_request_put(obj_request);
1354}
1355
Ilya Dryomova086a1b2019-06-12 18:33:31 +02001356static void rbd_osd_submit(struct ceph_osd_request *osd_req)
Alex Elderbf0d5f502012-11-22 00:00:08 -06001357{
Ilya Dryomova086a1b2019-06-12 18:33:31 +02001358 struct rbd_obj_request *obj_req = osd_req->r_priv;
Ilya Dryomov980917f2016-09-12 18:59:42 +02001359
Ilya Dryomova086a1b2019-06-12 18:33:31 +02001360 dout("%s osd_req %p for obj_req %p objno %llu %llu~%llu\n",
1361 __func__, osd_req, obj_req, obj_req->ex.oe_objno,
1362 obj_req->ex.oe_off, obj_req->ex.oe_len);
Ilya Dryomov980917f2016-09-12 18:59:42 +02001363 ceph_osdc_start_request(osd_req->r_osdc, osd_req, false);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001364}
1365
Alex Elder0c425242013-02-08 09:55:49 -06001366/*
1367 * The default/initial value for all image request flags is 0. Each
1368 * is conditionally set to 1 at image request initialization time
 1369 * and currently never changes thereafter.
1370 */
Alex Elderd0b2e942013-01-24 16:13:36 -06001371static void img_request_layered_set(struct rbd_img_request *img_request)
1372{
1373 set_bit(IMG_REQ_LAYERED, &img_request->flags);
Alex Elderd0b2e942013-01-24 16:13:36 -06001374}
1375
1376static bool img_request_layered_test(struct rbd_img_request *img_request)
1377{
Alex Elderd0b2e942013-01-24 16:13:36 -06001378 return test_bit(IMG_REQ_LAYERED, &img_request->flags) != 0;
1379}
1380
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001381static bool rbd_obj_is_entire(struct rbd_obj_request *obj_req)
Josh Durgin3b434a2a2014-04-04 17:32:15 -07001382{
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001383 struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
1384
Ilya Dryomov43df3d32018-02-02 15:23:22 +01001385 return !obj_req->ex.oe_off &&
1386 obj_req->ex.oe_len == rbd_dev->layout.object_size;
Josh Durgin3b434a2a2014-04-04 17:32:15 -07001387}
1388
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001389static bool rbd_obj_is_tail(struct rbd_obj_request *obj_req)
Alex Elder6e2a4502013-03-27 09:16:30 -05001390{
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001391 struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
Alex Elderb9434c52013-04-19 15:34:50 -05001392
Ilya Dryomov43df3d32018-02-02 15:23:22 +01001393 return obj_req->ex.oe_off + obj_req->ex.oe_len ==
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001394 rbd_dev->layout.object_size;
1395}
1396
Ilya Dryomov13488d52019-02-25 12:37:50 +01001397/*
1398 * Must be called after rbd_obj_calc_img_extents().
1399 */
1400static bool rbd_obj_copyup_enabled(struct rbd_obj_request *obj_req)
1401{
1402 if (!obj_req->num_img_extents ||
Ilya Dryomov9b17eb22019-02-28 15:51:39 +01001403 (rbd_obj_is_entire(obj_req) &&
1404 !obj_req->img_request->snapc->num_snaps))
Ilya Dryomov13488d52019-02-25 12:37:50 +01001405 return false;
1406
1407 return true;
1408}
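/*
 * Put differently, copyup is pointless when the object has no parent
 * data backing it (no image extents), or when the entire object is
 * being overwritten and there are no snapshots that could still need
 * the parent's data.
 */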
1409
Ilya Dryomov86bd7992018-02-06 19:26:33 +01001410static u64 rbd_obj_img_extents_bytes(struct rbd_obj_request *obj_req)
1411{
1412 return ceph_file_extents_bytes(obj_req->img_extents,
1413 obj_req->num_img_extents);
1414}
1415
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001416static bool rbd_img_is_write(struct rbd_img_request *img_req)
1417{
Ilya Dryomov9bb02482018-01-30 17:52:10 +01001418 switch (img_req->op_type) {
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001419 case OBJ_OP_READ:
1420 return false;
1421 case OBJ_OP_WRITE:
1422 case OBJ_OP_DISCARD:
Ilya Dryomov6484cbe2019-01-29 12:46:25 +01001423 case OBJ_OP_ZEROOUT:
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001424 return true;
1425 default:
Arnd Bergmannc6244b32018-04-04 14:53:39 +02001426 BUG();
Alex Elder6e2a4502013-03-27 09:16:30 -05001427 }
Alex Elder6e2a4502013-03-27 09:16:30 -05001428}
1429
Ilya Dryomov85e084f2016-04-28 16:07:24 +02001430static void rbd_osd_req_callback(struct ceph_osd_request *osd_req)
Alex Elderbf0d5f502012-11-22 00:00:08 -06001431{
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001432 struct rbd_obj_request *obj_req = osd_req->r_priv;
Ilya Dryomov54ab3b22019-05-11 16:21:49 +02001433 int result;
Alex Elderbf0d5f502012-11-22 00:00:08 -06001434
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001435 dout("%s osd_req %p result %d for obj_req %p\n", __func__, osd_req,
1436 osd_req->r_result, obj_req);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001437
Ilya Dryomov54ab3b22019-05-11 16:21:49 +02001438 /*
1439 * Writes aren't allowed to return a data payload. In some
1440 * guarded write cases (e.g. stat + zero on an empty object)
1441 * a stat response makes it through, but we don't care.
1442 */
1443 if (osd_req->r_result > 0 && rbd_img_is_write(obj_req->img_request))
1444 result = 0;
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001445 else
Ilya Dryomov54ab3b22019-05-11 16:21:49 +02001446 result = osd_req->r_result;
Alex Elderbf0d5f502012-11-22 00:00:08 -06001447
Ilya Dryomov54ab3b22019-05-11 16:21:49 +02001448 rbd_obj_handle_request(obj_req, result);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001449}
1450
Ilya Dryomovbcbab1d2019-05-27 11:41:36 +02001451static void rbd_osd_format_read(struct ceph_osd_request *osd_req)
Alex Elder430c28c2013-04-03 21:32:51 -05001452{
Ilya Dryomovbcbab1d2019-05-27 11:41:36 +02001453 struct rbd_obj_request *obj_request = osd_req->r_priv;
Ilya Dryomov22d2cfd2020-06-04 11:12:34 +02001454 struct rbd_device *rbd_dev = obj_request->img_request->rbd_dev;
1455 struct ceph_options *opt = rbd_dev->rbd_client->client->options;
Alex Elder430c28c2013-04-03 21:32:51 -05001456
Ilya Dryomov22d2cfd2020-06-04 11:12:34 +02001457 osd_req->r_flags = CEPH_OSD_FLAG_READ | opt->read_from_replica;
Ilya Dryomov7c848832016-09-15 17:56:39 +02001458 osd_req->r_snapid = obj_request->img_request->snap_id;
Alex Elder9d4df012013-04-19 15:34:50 -05001459}
1460
Ilya Dryomovbcbab1d2019-05-27 11:41:36 +02001461static void rbd_osd_format_write(struct ceph_osd_request *osd_req)
Alex Elder9d4df012013-04-19 15:34:50 -05001462{
Ilya Dryomovbcbab1d2019-05-27 11:41:36 +02001463 struct rbd_obj_request *obj_request = osd_req->r_priv;
Alex Elder9d4df012013-04-19 15:34:50 -05001464
Ilya Dryomova162b302018-01-30 17:52:10 +01001465 osd_req->r_flags = CEPH_OSD_FLAG_WRITE;
Arnd Bergmannfac02dd2018-07-13 22:18:37 +02001466 ktime_get_real_ts64(&osd_req->r_mtime);
Ilya Dryomov43df3d32018-02-02 15:23:22 +01001467 osd_req->r_data_offset = obj_request->ex.oe_off;
Alex Elder430c28c2013-04-03 21:32:51 -05001468}
1469
Ilya Dryomovbc812072017-01-25 18:16:23 +01001470static struct ceph_osd_request *
Ilya Dryomovbcbab1d2019-05-27 11:41:36 +02001471__rbd_obj_add_osd_request(struct rbd_obj_request *obj_req,
1472 struct ceph_snap_context *snapc, int num_ops)
Ilya Dryomovbc812072017-01-25 18:16:23 +01001473{
Ilya Dryomove28eded2019-02-25 11:42:26 +01001474 struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
Ilya Dryomovbc812072017-01-25 18:16:23 +01001475 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
1476 struct ceph_osd_request *req;
Ilya Dryomova90bb0c2017-01-25 18:16:23 +01001477 const char *name_format = rbd_dev->image_format == 1 ?
1478 RBD_V1_DATA_FORMAT : RBD_V2_DATA_FORMAT;
Ilya Dryomovbcbab1d2019-05-27 11:41:36 +02001479 int ret;
Ilya Dryomovbc812072017-01-25 18:16:23 +01001480
Ilya Dryomove28eded2019-02-25 11:42:26 +01001481 req = ceph_osdc_alloc_request(osdc, snapc, num_ops, false, GFP_NOIO);
Ilya Dryomovbc812072017-01-25 18:16:23 +01001482 if (!req)
Ilya Dryomovbcbab1d2019-05-27 11:41:36 +02001483 return ERR_PTR(-ENOMEM);
Ilya Dryomovbc812072017-01-25 18:16:23 +01001484
Ilya Dryomovbcbab1d2019-05-27 11:41:36 +02001485 list_add_tail(&req->r_private_item, &obj_req->osd_reqs);
Ilya Dryomovbc812072017-01-25 18:16:23 +01001486 req->r_callback = rbd_osd_req_callback;
Ilya Dryomova162b302018-01-30 17:52:10 +01001487 req->r_priv = obj_req;
Ilya Dryomovbc812072017-01-25 18:16:23 +01001488
Ilya Dryomovb26c0472018-07-03 15:28:43 +02001489 /*
 1490	 * Data objects may be stored in a separate pool, but they always use
 1491	 * the same namespace in that pool as the header does in its own pool.
1492 */
1493 ceph_oloc_copy(&req->r_base_oloc, &rbd_dev->header_oloc);
Ilya Dryomovbc812072017-01-25 18:16:23 +01001494 req->r_base_oloc.pool = rbd_dev->layout.pool_id;
Ilya Dryomovb26c0472018-07-03 15:28:43 +02001495
Ilya Dryomovbcbab1d2019-05-27 11:41:36 +02001496 ret = ceph_oid_aprintf(&req->r_base_oid, GFP_NOIO, name_format,
1497 rbd_dev->header.object_prefix,
1498 obj_req->ex.oe_objno);
1499 if (ret)
1500 return ERR_PTR(ret);
Ilya Dryomovbc812072017-01-25 18:16:23 +01001501
Ilya Dryomovbc812072017-01-25 18:16:23 +01001502 return req;
Ilya Dryomovbc812072017-01-25 18:16:23 +01001503}
1504
Ilya Dryomove28eded2019-02-25 11:42:26 +01001505static struct ceph_osd_request *
Ilya Dryomovbcbab1d2019-05-27 11:41:36 +02001506rbd_obj_add_osd_request(struct rbd_obj_request *obj_req, int num_ops)
Ilya Dryomove28eded2019-02-25 11:42:26 +01001507{
Ilya Dryomovbcbab1d2019-05-27 11:41:36 +02001508 return __rbd_obj_add_osd_request(obj_req, obj_req->img_request->snapc,
1509 num_ops);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001510}
1511
Ilya Dryomovecc633c2018-02-01 11:50:47 +01001512static struct rbd_obj_request *rbd_obj_request_create(void)
Alex Elderbf0d5f502012-11-22 00:00:08 -06001513{
1514 struct rbd_obj_request *obj_request;
Alex Elderbf0d5f502012-11-22 00:00:08 -06001515
Ilya Dryomov5a60e872015-06-24 17:24:33 +03001516 obj_request = kmem_cache_zalloc(rbd_obj_request_cache, GFP_NOIO);
Ilya Dryomov6c696d82017-01-25 18:16:23 +01001517 if (!obj_request)
Alex Elderf907ad52013-05-01 12:43:03 -05001518 return NULL;
Alex Elderf907ad52013-05-01 12:43:03 -05001519
Ilya Dryomov43df3d32018-02-02 15:23:22 +01001520 ceph_object_extent_init(&obj_request->ex);
Ilya Dryomovbcbab1d2019-05-27 11:41:36 +02001521 INIT_LIST_HEAD(&obj_request->osd_reqs);
Ilya Dryomov85b5e6d2019-05-14 21:06:07 +02001522 mutex_init(&obj_request->state_mutex);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001523 kref_init(&obj_request->kref);
1524
Ilya Dryomov67e2b652017-01-25 18:16:22 +01001525 dout("%s %p\n", __func__, obj_request);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001526 return obj_request;
1527}
1528
1529static void rbd_obj_request_destroy(struct kref *kref)
1530{
1531 struct rbd_obj_request *obj_request;
Ilya Dryomovbcbab1d2019-05-27 11:41:36 +02001532 struct ceph_osd_request *osd_req;
Ilya Dryomov7e07efb2018-01-20 10:30:11 +01001533 u32 i;
Alex Elderbf0d5f502012-11-22 00:00:08 -06001534
1535 obj_request = container_of(kref, struct rbd_obj_request, kref);
1536
Alex Elder37206ee2013-02-20 17:32:08 -06001537 dout("%s: obj %p\n", __func__, obj_request);
1538
Ilya Dryomovbcbab1d2019-05-27 11:41:36 +02001539 while (!list_empty(&obj_request->osd_reqs)) {
1540 osd_req = list_first_entry(&obj_request->osd_reqs,
1541 struct ceph_osd_request, r_private_item);
1542 list_del_init(&osd_req->r_private_item);
1543 ceph_osdc_put_request(osd_req);
1544 }
Alex Elderbf0d5f502012-11-22 00:00:08 -06001545
Ilya Dryomovecc633c2018-02-01 11:50:47 +01001546 switch (obj_request->img_request->data_type) {
Alex Elder9969ebc2013-01-18 12:31:10 -06001547 case OBJ_REQUEST_NODATA:
Alex Elderbf0d5f502012-11-22 00:00:08 -06001548 case OBJ_REQUEST_BIO:
Ilya Dryomov7e07efb2018-01-20 10:30:11 +01001549 case OBJ_REQUEST_BVECS:
Ilya Dryomov5359a172018-01-20 10:30:10 +01001550 break; /* Nothing to do */
Ilya Dryomovafb97882018-02-06 19:26:35 +01001551 case OBJ_REQUEST_OWN_BVECS:
1552 kfree(obj_request->bvec_pos.bvecs);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001553 break;
Ilya Dryomov7e07efb2018-01-20 10:30:11 +01001554 default:
Arnd Bergmann16809372019-03-22 17:53:56 +01001555 BUG();
Alex Elderbf0d5f502012-11-22 00:00:08 -06001556 }
1557
Ilya Dryomov86bd7992018-02-06 19:26:33 +01001558 kfree(obj_request->img_extents);
Ilya Dryomov7e07efb2018-01-20 10:30:11 +01001559 if (obj_request->copyup_bvecs) {
1560 for (i = 0; i < obj_request->copyup_bvec_count; i++) {
1561 if (obj_request->copyup_bvecs[i].bv_page)
1562 __free_page(obj_request->copyup_bvecs[i].bv_page);
1563 }
1564 kfree(obj_request->copyup_bvecs);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001565 }
1566
Alex Elder868311b2013-05-01 12:43:03 -05001567 kmem_cache_free(rbd_obj_request_cache, obj_request);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001568}
1569
Alex Elderfb65d2282013-05-08 22:50:04 -05001570/* It's OK to call this for a device with no parent */
1571
1572static void rbd_spec_put(struct rbd_spec *spec);
1573static void rbd_dev_unparent(struct rbd_device *rbd_dev)
1574{
1575 rbd_dev_remove_parent(rbd_dev);
1576 rbd_spec_put(rbd_dev->parent_spec);
1577 rbd_dev->parent_spec = NULL;
1578 rbd_dev->parent_overlap = 0;
1579}
1580
Alex Elderbf0d5f502012-11-22 00:00:08 -06001581/*
Alex Eldera2acd002013-05-08 22:50:04 -05001582 * Parent image reference counting is used to determine when an
1583 * image's parent fields can be safely torn down--after there are no
1584 * more in-flight requests to the parent image. When the last
1585 * reference is dropped, cleaning them up is safe.
1586 */
1587static void rbd_dev_parent_put(struct rbd_device *rbd_dev)
1588{
1589 int counter;
1590
1591 if (!rbd_dev->parent_spec)
1592 return;
1593
1594 counter = atomic_dec_return_safe(&rbd_dev->parent_ref);
1595 if (counter > 0)
1596 return;
1597
1598 /* Last reference; clean up parent data structures */
1599
1600 if (!counter)
1601 rbd_dev_unparent(rbd_dev);
1602 else
Ilya Dryomov9584d502014-07-11 12:11:20 +04001603 rbd_warn(rbd_dev, "parent reference underflow");
Alex Eldera2acd002013-05-08 22:50:04 -05001604}
1605
1606/*
1607 * If an image has a non-zero parent overlap, get a reference to its
1608 * parent.
1609 *
1610 * Returns true if the rbd device has a parent with a non-zero
1611 * overlap and a reference for it was successfully taken, or
1612 * false otherwise.
1613 */
1614static bool rbd_dev_parent_get(struct rbd_device *rbd_dev)
1615{
Ilya Dryomovae43e9d2015-01-19 18:13:43 +03001616 int counter = 0;
Alex Eldera2acd002013-05-08 22:50:04 -05001617
1618 if (!rbd_dev->parent_spec)
1619 return false;
1620
Ilya Dryomovae43e9d2015-01-19 18:13:43 +03001621 if (rbd_dev->parent_overlap)
1622 counter = atomic_inc_return_safe(&rbd_dev->parent_ref);
Alex Eldera2acd002013-05-08 22:50:04 -05001623
1624 if (counter < 0)
Ilya Dryomov9584d502014-07-11 12:11:20 +04001625 rbd_warn(rbd_dev, "parent reference overflow");
Alex Eldera2acd002013-05-08 22:50:04 -05001626
Ilya Dryomovae43e9d2015-01-19 18:13:43 +03001627 return counter > 0;
Alex Eldera2acd002013-05-08 22:50:04 -05001628}
1629
Ilya Dryomov59e542c2020-02-12 15:23:58 +01001630static void rbd_img_request_init(struct rbd_img_request *img_request,
1631 struct rbd_device *rbd_dev,
1632 enum obj_operation_type op_type)
Alex Elderbf0d5f502012-11-22 00:00:08 -06001633{
Ilya Dryomov59e542c2020-02-12 15:23:58 +01001634 memset(img_request, 0, sizeof(*img_request));
Alex Elderbf0d5f502012-11-22 00:00:08 -06001635
Alex Elderbf0d5f502012-11-22 00:00:08 -06001636 img_request->rbd_dev = rbd_dev;
Ilya Dryomov9bb02482018-01-30 17:52:10 +01001637 img_request->op_type = op_type;
Ilya Dryomova0c58952018-01-22 16:03:06 +01001638
Ilya Dryomove1fddc82019-05-30 16:07:48 +02001639 INIT_LIST_HEAD(&img_request->lock_item);
Ilya Dryomov43df3d32018-02-02 15:23:22 +01001640 INIT_LIST_HEAD(&img_request->object_extents);
Ilya Dryomov0192ce22019-05-16 15:06:56 +02001641 mutex_init(&img_request->state_mutex);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001642}
1643
Ilya Dryomova52cc682020-02-12 15:08:39 +01001644static void rbd_img_capture_header(struct rbd_img_request *img_req)
1645{
1646 struct rbd_device *rbd_dev = img_req->rbd_dev;
1647
1648 lockdep_assert_held(&rbd_dev->header_rwsem);
1649
1650 if (rbd_img_is_write(img_req))
1651 img_req->snapc = ceph_get_snap_context(rbd_dev->header.snapc);
1652 else
1653 img_req->snap_id = rbd_dev->spec->snap_id;
1654
1655 if (rbd_dev_parent_get(rbd_dev))
1656 img_request_layered_set(img_req);
1657}
1658
Hannes Reinecke679a97d2020-01-31 11:37:36 +01001659static void rbd_img_request_destroy(struct rbd_img_request *img_request)
Alex Elderbf0d5f502012-11-22 00:00:08 -06001660{
Alex Elderbf0d5f502012-11-22 00:00:08 -06001661 struct rbd_obj_request *obj_request;
1662 struct rbd_obj_request *next_obj_request;
1663
Alex Elder37206ee2013-02-20 17:32:08 -06001664 dout("%s: img %p\n", __func__, img_request);
1665
Ilya Dryomove1fddc82019-05-30 16:07:48 +02001666 WARN_ON(!list_empty(&img_request->lock_item));
Alex Elderbf0d5f502012-11-22 00:00:08 -06001667 for_each_obj_request_safe(img_request, obj_request, next_obj_request)
1668 rbd_img_obj_request_del(img_request, obj_request);
1669
Ilya Dryomov78b42a82020-02-12 14:34:03 +01001670 if (img_request_layered_test(img_request))
Alex Eldera2acd002013-05-08 22:50:04 -05001671 rbd_dev_parent_put(img_request->rbd_dev);
Alex Eldera2acd002013-05-08 22:50:04 -05001672
Ilya Dryomov9bb02482018-01-30 17:52:10 +01001673 if (rbd_img_is_write(img_request))
Alex Elder812164f82013-04-30 00:44:32 -05001674 ceph_put_snap_context(img_request->snapc);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001675
Ilya Dryomov59e542c2020-02-12 15:23:58 +01001676 if (test_bit(IMG_REQ_CHILD, &img_request->flags))
1677 kmem_cache_free(rbd_img_request_cache, img_request);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001678}
1679
Ilya Dryomov22e8bd52019-06-05 19:25:11 +02001680#define BITS_PER_OBJ 2
1681#define OBJS_PER_BYTE (BITS_PER_BYTE / BITS_PER_OBJ)
1682#define OBJ_MASK ((1 << BITS_PER_OBJ) - 1)
1683
1684static void __rbd_object_map_index(struct rbd_device *rbd_dev, u64 objno,
1685 u64 *index, u8 *shift)
1686{
1687 u32 off;
1688
1689 rbd_assert(objno < rbd_dev->object_map_size);
1690 *index = div_u64_rem(objno, OBJS_PER_BYTE, &off);
1691 *shift = (OBJS_PER_BYTE - off - 1) * BITS_PER_OBJ;
1692}
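/*
 * With BITS_PER_OBJ == 2, each object map byte packs the state of four
 * objects, most significant bits first.  For example:
 *
 *	__rbd_object_map_index(rbd_dev, 5, &index, &shift);
 *	// index == 5 / 4 == 1, off == 5 % 4 == 1
 *	// shift == (4 - 1 - 1) * 2 == 4, i.e. bits 5:4 of object_map[1]
 */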
1693
1694static u8 __rbd_object_map_get(struct rbd_device *rbd_dev, u64 objno)
1695{
1696 u64 index;
1697 u8 shift;
1698
1699 lockdep_assert_held(&rbd_dev->object_map_lock);
1700 __rbd_object_map_index(rbd_dev, objno, &index, &shift);
1701 return (rbd_dev->object_map[index] >> shift) & OBJ_MASK;
1702}
1703
1704static void __rbd_object_map_set(struct rbd_device *rbd_dev, u64 objno, u8 val)
1705{
1706 u64 index;
1707 u8 shift;
1708 u8 *p;
1709
1710 lockdep_assert_held(&rbd_dev->object_map_lock);
1711 rbd_assert(!(val & ~OBJ_MASK));
1712
1713 __rbd_object_map_index(rbd_dev, objno, &index, &shift);
1714 p = &rbd_dev->object_map[index];
1715 *p = (*p & ~(OBJ_MASK << shift)) | (val << shift);
1716}
1717
1718static u8 rbd_object_map_get(struct rbd_device *rbd_dev, u64 objno)
1719{
1720 u8 state;
1721
1722 spin_lock(&rbd_dev->object_map_lock);
1723 state = __rbd_object_map_get(rbd_dev, objno);
1724 spin_unlock(&rbd_dev->object_map_lock);
1725 return state;
1726}
1727
1728static bool use_object_map(struct rbd_device *rbd_dev)
1729{
Ilya Dryomov3fe69922019-11-12 19:41:48 +01001730 /*
1731 * An image mapped read-only can't use the object map -- it isn't
1732 * loaded because the header lock isn't acquired. Someone else can
1733 * write to the image and update the object map behind our back.
1734 *
1735 * A snapshot can't be written to, so using the object map is always
1736 * safe.
1737 */
1738 if (!rbd_is_snap(rbd_dev) && rbd_is_ro(rbd_dev))
1739 return false;
1740
Ilya Dryomov22e8bd52019-06-05 19:25:11 +02001741 return ((rbd_dev->header.features & RBD_FEATURE_OBJECT_MAP) &&
1742 !(rbd_dev->object_map_flags & RBD_FLAG_OBJECT_MAP_INVALID));
1743}
1744
1745static bool rbd_object_map_may_exist(struct rbd_device *rbd_dev, u64 objno)
1746{
1747 u8 state;
1748
1749 /* fall back to default logic if object map is disabled or invalid */
1750 if (!use_object_map(rbd_dev))
1751 return true;
1752
1753 state = rbd_object_map_get(rbd_dev, objno);
1754 return state != OBJECT_NONEXISTENT;
1755}
1756
1757static void rbd_object_map_name(struct rbd_device *rbd_dev, u64 snap_id,
1758 struct ceph_object_id *oid)
1759{
1760 if (snap_id == CEPH_NOSNAP)
1761 ceph_oid_printf(oid, "%s%s", RBD_OBJECT_MAP_PREFIX,
1762 rbd_dev->spec->image_id);
1763 else
1764 ceph_oid_printf(oid, "%s%s.%016llx", RBD_OBJECT_MAP_PREFIX,
1765 rbd_dev->spec->image_id, snap_id);
1766}
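/*
 * E.g. the HEAD object map is named "<RBD_OBJECT_MAP_PREFIX><image id>",
 * while a snapshot's map gets a ".<snap id>" suffix with the snapshot id
 * formatted as a 16-digit hex number.
 */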
1767
1768static int rbd_object_map_lock(struct rbd_device *rbd_dev)
1769{
1770 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
1771 CEPH_DEFINE_OID_ONSTACK(oid);
1772 u8 lock_type;
1773 char *lock_tag;
1774 struct ceph_locker *lockers;
1775 u32 num_lockers;
1776 bool broke_lock = false;
1777 int ret;
1778
1779 rbd_object_map_name(rbd_dev, CEPH_NOSNAP, &oid);
1780
1781again:
1782 ret = ceph_cls_lock(osdc, &oid, &rbd_dev->header_oloc, RBD_LOCK_NAME,
1783 CEPH_CLS_LOCK_EXCLUSIVE, "", "", "", 0);
1784 if (ret != -EBUSY || broke_lock) {
1785 if (ret == -EEXIST)
1786 ret = 0; /* already locked by myself */
1787 if (ret)
1788 rbd_warn(rbd_dev, "failed to lock object map: %d", ret);
1789 return ret;
1790 }
1791
1792 ret = ceph_cls_lock_info(osdc, &oid, &rbd_dev->header_oloc,
1793 RBD_LOCK_NAME, &lock_type, &lock_tag,
1794 &lockers, &num_lockers);
1795 if (ret) {
1796 if (ret == -ENOENT)
1797 goto again;
1798
1799 rbd_warn(rbd_dev, "failed to get object map lockers: %d", ret);
1800 return ret;
1801 }
1802
1803 kfree(lock_tag);
1804 if (num_lockers == 0)
1805 goto again;
1806
1807 rbd_warn(rbd_dev, "breaking object map lock owned by %s%llu",
1808 ENTITY_NAME(lockers[0].id.name));
1809
1810 ret = ceph_cls_break_lock(osdc, &oid, &rbd_dev->header_oloc,
1811 RBD_LOCK_NAME, lockers[0].id.cookie,
1812 &lockers[0].id.name);
1813 ceph_free_lockers(lockers, num_lockers);
1814 if (ret) {
1815 if (ret == -ENOENT)
1816 goto again;
1817
1818 rbd_warn(rbd_dev, "failed to break object map lock: %d", ret);
1819 return ret;
1820 }
1821
1822 broke_lock = true;
1823 goto again;
1824}
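/*
 * In short, this is a try-lock loop: if someone else holds the exclusive
 * lock (-EBUSY), look up the current locker, break its lock and retry.
 * The lock is broken at most once before giving up.  -ENOENT from the
 * lock-info or break calls just means the locker went away on its own,
 * so the lock attempt is retried immediately.
 */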
1825
1826static void rbd_object_map_unlock(struct rbd_device *rbd_dev)
1827{
1828 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
1829 CEPH_DEFINE_OID_ONSTACK(oid);
1830 int ret;
1831
1832 rbd_object_map_name(rbd_dev, CEPH_NOSNAP, &oid);
1833
1834 ret = ceph_cls_unlock(osdc, &oid, &rbd_dev->header_oloc, RBD_LOCK_NAME,
1835 "");
1836 if (ret && ret != -ENOENT)
1837 rbd_warn(rbd_dev, "failed to unlock object map: %d", ret);
1838}
1839
1840static int decode_object_map_header(void **p, void *end, u64 *object_map_size)
1841{
1842 u8 struct_v;
1843 u32 struct_len;
1844 u32 header_len;
1845 void *header_end;
1846 int ret;
1847
1848 ceph_decode_32_safe(p, end, header_len, e_inval);
1849 header_end = *p + header_len;
1850
1851 ret = ceph_start_decoding(p, end, 1, "BitVector header", &struct_v,
1852 &struct_len);
1853 if (ret)
1854 return ret;
1855
1856 ceph_decode_64_safe(p, end, *object_map_size, e_inval);
1857
1858 *p = header_end;
1859 return 0;
1860
1861e_inval:
1862 return -EINVAL;
1863}
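/*
 * The reply starts with a le32 length of the BitVector header, followed
 * by the versioned header itself (see ceph_start_decoding()).  The only
 * field consumed here is the le64 element count; the rest of the header
 * is skipped by jumping to header_end.
 */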
1864
1865static int __rbd_object_map_load(struct rbd_device *rbd_dev)
1866{
1867 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
1868 CEPH_DEFINE_OID_ONSTACK(oid);
1869 struct page **pages;
1870 void *p, *end;
1871 size_t reply_len;
1872 u64 num_objects;
1873 u64 object_map_bytes;
1874 u64 object_map_size;
1875 int num_pages;
1876 int ret;
1877
1878 rbd_assert(!rbd_dev->object_map && !rbd_dev->object_map_size);
1879
1880 num_objects = ceph_get_num_objects(&rbd_dev->layout,
1881 rbd_dev->mapping.size);
1882 object_map_bytes = DIV_ROUND_UP_ULL(num_objects * BITS_PER_OBJ,
1883 BITS_PER_BYTE);
1884 num_pages = calc_pages_for(0, object_map_bytes) + 1;
1885 pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL);
1886 if (IS_ERR(pages))
1887 return PTR_ERR(pages);
1888
1889 reply_len = num_pages * PAGE_SIZE;
1890 rbd_object_map_name(rbd_dev, rbd_dev->spec->snap_id, &oid);
1891 ret = ceph_osdc_call(osdc, &oid, &rbd_dev->header_oloc,
1892 "rbd", "object_map_load", CEPH_OSD_FLAG_READ,
1893 NULL, 0, pages, &reply_len);
1894 if (ret)
1895 goto out;
1896
1897 p = page_address(pages[0]);
1898 end = p + min(reply_len, (size_t)PAGE_SIZE);
1899 ret = decode_object_map_header(&p, end, &object_map_size);
1900 if (ret)
1901 goto out;
1902
1903 if (object_map_size != num_objects) {
1904 rbd_warn(rbd_dev, "object map size mismatch: %llu vs %llu",
1905 object_map_size, num_objects);
1906 ret = -EINVAL;
1907 goto out;
1908 }
1909
1910 if (offset_in_page(p) + object_map_bytes > reply_len) {
1911 ret = -EINVAL;
1912 goto out;
1913 }
1914
1915 rbd_dev->object_map = kvmalloc(object_map_bytes, GFP_KERNEL);
1916 if (!rbd_dev->object_map) {
1917 ret = -ENOMEM;
1918 goto out;
1919 }
1920
1921 rbd_dev->object_map_size = object_map_size;
1922 ceph_copy_from_page_vector(pages, rbd_dev->object_map,
1923 offset_in_page(p), object_map_bytes);
1924
1925out:
1926 ceph_release_page_vector(pages, num_pages);
1927 return ret;
1928}
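/*
 * Sizing example: a 1 GiB mapping with 4 MiB objects has 256 objects,
 * so object_map_bytes = DIV_ROUND_UP(256 * 2, 8) = 64 and num_pages =
 * 1 + 1 = 2 (the extra page leaves room for the reply header).
 */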
1929
1930static void rbd_object_map_free(struct rbd_device *rbd_dev)
1931{
1932 kvfree(rbd_dev->object_map);
1933 rbd_dev->object_map = NULL;
1934 rbd_dev->object_map_size = 0;
1935}
1936
1937static int rbd_object_map_load(struct rbd_device *rbd_dev)
1938{
1939 int ret;
1940
1941 ret = __rbd_object_map_load(rbd_dev);
1942 if (ret)
1943 return ret;
1944
1945 ret = rbd_dev_v2_get_flags(rbd_dev);
1946 if (ret) {
1947 rbd_object_map_free(rbd_dev);
1948 return ret;
1949 }
1950
1951 if (rbd_dev->object_map_flags & RBD_FLAG_OBJECT_MAP_INVALID)
1952 rbd_warn(rbd_dev, "object map is invalid");
1953
1954 return 0;
1955}
1956
1957static int rbd_object_map_open(struct rbd_device *rbd_dev)
1958{
1959 int ret;
1960
1961 ret = rbd_object_map_lock(rbd_dev);
1962 if (ret)
1963 return ret;
1964
1965 ret = rbd_object_map_load(rbd_dev);
1966 if (ret) {
1967 rbd_object_map_unlock(rbd_dev);
1968 return ret;
1969 }
1970
1971 return 0;
1972}
1973
1974static void rbd_object_map_close(struct rbd_device *rbd_dev)
1975{
1976 rbd_object_map_free(rbd_dev);
1977 rbd_object_map_unlock(rbd_dev);
1978}
1979
1980/*
1981 * This function needs snap_id (or more precisely just something to
1982 * distinguish between HEAD and snapshot object maps), new_state and
1983 * current_state that were passed to rbd_object_map_update().
1984 *
1985 * To avoid allocating and stashing a context we piggyback on the OSD
 1986 * request.  A HEAD update has two ops, the extra one being assert_locked.
 1987 * For new_state and current_state we decode our own object_map_update op,
 1988 * encoded in rbd_cls_object_map_update().
1989 */
1990static int rbd_object_map_update_finish(struct rbd_obj_request *obj_req,
1991 struct ceph_osd_request *osd_req)
1992{
1993 struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
1994 struct ceph_osd_data *osd_data;
1995 u64 objno;
Kees Cook3f649ab2020-06-03 13:09:38 -07001996 u8 state, new_state, current_state;
Ilya Dryomov22e8bd52019-06-05 19:25:11 +02001997 bool has_current_state;
1998 void *p;
1999
2000 if (osd_req->r_result)
2001 return osd_req->r_result;
2002
2003 /*
2004 * Nothing to do for a snapshot object map.
2005 */
2006 if (osd_req->r_num_ops == 1)
2007 return 0;
2008
2009 /*
2010 * Update in-memory HEAD object map.
2011 */
2012 rbd_assert(osd_req->r_num_ops == 2);
2013 osd_data = osd_req_op_data(osd_req, 1, cls, request_data);
2014 rbd_assert(osd_data->type == CEPH_OSD_DATA_TYPE_PAGES);
2015
2016 p = page_address(osd_data->pages[0]);
2017 objno = ceph_decode_64(&p);
2018 rbd_assert(objno == obj_req->ex.oe_objno);
2019 rbd_assert(ceph_decode_64(&p) == objno + 1);
2020 new_state = ceph_decode_8(&p);
2021 has_current_state = ceph_decode_8(&p);
2022 if (has_current_state)
2023 current_state = ceph_decode_8(&p);
2024
2025 spin_lock(&rbd_dev->object_map_lock);
2026 state = __rbd_object_map_get(rbd_dev, objno);
2027 if (!has_current_state || current_state == state ||
2028 (current_state == OBJECT_EXISTS && state == OBJECT_EXISTS_CLEAN))
2029 __rbd_object_map_set(rbd_dev, objno, new_state);
2030 spin_unlock(&rbd_dev->object_map_lock);
2031
2032 return 0;
2033}
2034
2035static void rbd_object_map_callback(struct ceph_osd_request *osd_req)
2036{
2037 struct rbd_obj_request *obj_req = osd_req->r_priv;
2038 int result;
2039
2040 dout("%s osd_req %p result %d for obj_req %p\n", __func__, osd_req,
2041 osd_req->r_result, obj_req);
2042
2043 result = rbd_object_map_update_finish(obj_req, osd_req);
2044 rbd_obj_handle_request(obj_req, result);
2045}
2046
2047static bool update_needed(struct rbd_device *rbd_dev, u64 objno, u8 new_state)
2048{
2049 u8 state = rbd_object_map_get(rbd_dev, objno);
2050
2051 if (state == new_state ||
2052 (new_state == OBJECT_PENDING && state == OBJECT_NONEXISTENT) ||
2053 (new_state == OBJECT_NONEXISTENT && state != OBJECT_PENDING))
2054 return false;
2055
2056 return true;
2057}
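/*
 * In other words, skip the update when it would not change anything:
 * the object is already in new_state, a nonexistent object never needs
 * to be marked OBJECT_PENDING, and only an OBJECT_PENDING object ever
 * transitions to OBJECT_NONEXISTENT.
 */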
2058
2059static int rbd_cls_object_map_update(struct ceph_osd_request *req,
2060 int which, u64 objno, u8 new_state,
2061 const u8 *current_state)
2062{
2063 struct page **pages;
2064 void *p, *start;
2065 int ret;
2066
2067 ret = osd_req_op_cls_init(req, which, "rbd", "object_map_update");
2068 if (ret)
2069 return ret;
2070
2071 pages = ceph_alloc_page_vector(1, GFP_NOIO);
2072 if (IS_ERR(pages))
2073 return PTR_ERR(pages);
2074
2075 p = start = page_address(pages[0]);
2076 ceph_encode_64(&p, objno);
2077 ceph_encode_64(&p, objno + 1);
2078 ceph_encode_8(&p, new_state);
2079 if (current_state) {
2080 ceph_encode_8(&p, 1);
2081 ceph_encode_8(&p, *current_state);
2082 } else {
2083 ceph_encode_8(&p, 0);
2084 }
2085
2086 osd_req_op_cls_request_data_pages(req, which, pages, p - start, 0,
2087 false, true);
2088 return 0;
2089}
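/*
 * The request data page thus carries, in order: le64 start objno,
 * le64 end objno (exclusive, always start + 1 here), u8 new_state,
 * u8 "has current_state" flag and, if that flag is set, u8 current_state.
 * This is the same layout rbd_object_map_update_finish() decodes.
 */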
2090
2091/*
2092 * Return:
2093 * 0 - object map update sent
2094 * 1 - object map update isn't needed
2095 * <0 - error
2096 */
2097static int rbd_object_map_update(struct rbd_obj_request *obj_req, u64 snap_id,
2098 u8 new_state, const u8 *current_state)
2099{
2100 struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
2101 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
2102 struct ceph_osd_request *req;
2103 int num_ops = 1;
2104 int which = 0;
2105 int ret;
2106
2107 if (snap_id == CEPH_NOSNAP) {
2108 if (!update_needed(rbd_dev, obj_req->ex.oe_objno, new_state))
2109 return 1;
2110
2111 num_ops++; /* assert_locked */
2112 }
2113
2114 req = ceph_osdc_alloc_request(osdc, NULL, num_ops, false, GFP_NOIO);
2115 if (!req)
2116 return -ENOMEM;
2117
2118 list_add_tail(&req->r_private_item, &obj_req->osd_reqs);
2119 req->r_callback = rbd_object_map_callback;
2120 req->r_priv = obj_req;
2121
2122 rbd_object_map_name(rbd_dev, snap_id, &req->r_base_oid);
2123 ceph_oloc_copy(&req->r_base_oloc, &rbd_dev->header_oloc);
2124 req->r_flags = CEPH_OSD_FLAG_WRITE;
2125 ktime_get_real_ts64(&req->r_mtime);
2126
2127 if (snap_id == CEPH_NOSNAP) {
2128 /*
2129 * Protect against possible race conditions during lock
2130 * ownership transitions.
2131 */
2132 ret = ceph_cls_assert_locked(req, which++, RBD_LOCK_NAME,
2133 CEPH_CLS_LOCK_EXCLUSIVE, "", "");
2134 if (ret)
2135 return ret;
2136 }
2137
2138 ret = rbd_cls_object_map_update(req, which, obj_req->ex.oe_objno,
2139 new_state, current_state);
2140 if (ret)
2141 return ret;
2142
2143 ret = ceph_osdc_alloc_messages(req, GFP_NOIO);
2144 if (ret)
2145 return ret;
2146
2147 ceph_osdc_start_request(osdc, req, false);
2148 return 0;
2149}
2150
Ilya Dryomov86bd7992018-02-06 19:26:33 +01002151static void prune_extents(struct ceph_file_extent *img_extents,
2152 u32 *num_img_extents, u64 overlap)
Alex Eldere93f3152013-05-08 22:50:04 -05002153{
Ilya Dryomov86bd7992018-02-06 19:26:33 +01002154 u32 cnt = *num_img_extents;
Alex Eldere93f3152013-05-08 22:50:04 -05002155
Ilya Dryomov86bd7992018-02-06 19:26:33 +01002156 /* drop extents completely beyond the overlap */
2157 while (cnt && img_extents[cnt - 1].fe_off >= overlap)
2158 cnt--;
Alex Eldere93f3152013-05-08 22:50:04 -05002159
Ilya Dryomov86bd7992018-02-06 19:26:33 +01002160 if (cnt) {
2161 struct ceph_file_extent *ex = &img_extents[cnt - 1];
Alex Eldere93f3152013-05-08 22:50:04 -05002162
Ilya Dryomov86bd7992018-02-06 19:26:33 +01002163 /* trim final overlapping extent */
2164 if (ex->fe_off + ex->fe_len > overlap)
2165 ex->fe_len = overlap - ex->fe_off;
Alex Elder12178572013-02-08 09:55:49 -06002166 }
2167
Ilya Dryomov86bd7992018-02-06 19:26:33 +01002168 *num_img_extents = cnt;
Alex Elder21692382013-04-05 01:27:12 -05002169}
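/*
 * E.g. with a parent overlap of 8M, image extents { 0~4M, 6M~4M, 12M~4M }
 * are pruned to { 0~4M, 6M~2M }: the last extent starts beyond the
 * overlap and is dropped, and the middle one is trimmed to end at 8M.
 */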
2170
Alex Elderf1a47392013-04-19 15:34:50 -05002171/*
Ilya Dryomov86bd7992018-02-06 19:26:33 +01002172 * Determine the byte range(s) covered by either just the object extent
2173 * or the entire object in the parent image.
Josh Durgin3b434a2a2014-04-04 17:32:15 -07002174 */
Ilya Dryomov86bd7992018-02-06 19:26:33 +01002175static int rbd_obj_calc_img_extents(struct rbd_obj_request *obj_req,
2176 bool entire)
Josh Durgin3b434a2a2014-04-04 17:32:15 -07002177{
Ilya Dryomov86bd7992018-02-06 19:26:33 +01002178 struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
Alex Elderc5b5ef62013-02-11 12:33:24 -06002179 int ret;
2180
Ilya Dryomov86bd7992018-02-06 19:26:33 +01002181 if (!rbd_dev->parent_overlap)
2182 return 0;
2183
2184 ret = ceph_extent_to_file(&rbd_dev->layout, obj_req->ex.oe_objno,
2185 entire ? 0 : obj_req->ex.oe_off,
2186 entire ? rbd_dev->layout.object_size :
2187 obj_req->ex.oe_len,
2188 &obj_req->img_extents,
2189 &obj_req->num_img_extents);
2190 if (ret)
2191 return ret;
2192
2193 prune_extents(obj_req->img_extents, &obj_req->num_img_extents,
2194 rbd_dev->parent_overlap);
2195 return 0;
2196}
2197
Ilya Dryomovbcbab1d2019-05-27 11:41:36 +02002198static void rbd_osd_setup_data(struct ceph_osd_request *osd_req, int which)
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002199{
Ilya Dryomovbcbab1d2019-05-27 11:41:36 +02002200 struct rbd_obj_request *obj_req = osd_req->r_priv;
2201
Ilya Dryomovecc633c2018-02-01 11:50:47 +01002202 switch (obj_req->img_request->data_type) {
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002203 case OBJ_REQUEST_BIO:
Ilya Dryomovbcbab1d2019-05-27 11:41:36 +02002204 osd_req_op_extent_osd_data_bio(osd_req, which,
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002205 &obj_req->bio_pos,
Ilya Dryomov43df3d32018-02-02 15:23:22 +01002206 obj_req->ex.oe_len);
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002207 break;
2208 case OBJ_REQUEST_BVECS:
Ilya Dryomovafb97882018-02-06 19:26:35 +01002209 case OBJ_REQUEST_OWN_BVECS:
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002210 rbd_assert(obj_req->bvec_pos.iter.bi_size ==
Ilya Dryomov43df3d32018-02-02 15:23:22 +01002211 obj_req->ex.oe_len);
Ilya Dryomovafb97882018-02-06 19:26:35 +01002212 rbd_assert(obj_req->bvec_idx == obj_req->bvec_count);
Ilya Dryomovbcbab1d2019-05-27 11:41:36 +02002213 osd_req_op_extent_osd_data_bvec_pos(osd_req, which,
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002214 &obj_req->bvec_pos);
2215 break;
2216 default:
Arnd Bergmann16809372019-03-22 17:53:56 +01002217 BUG();
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002218 }
2219}
2220
Ilya Dryomovbcbab1d2019-05-27 11:41:36 +02002221static int rbd_osd_setup_stat(struct ceph_osd_request *osd_req, int which)
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002222{
2223 struct page **pages;
Ilya Dryomov710214e2016-09-15 17:53:32 +02002224
Alex Elderc5b5ef62013-02-11 12:33:24 -06002225 /*
2226 * The response data for a STAT call consists of:
2227 * le64 length;
2228 * struct {
2229 * le32 tv_sec;
2230 * le32 tv_nsec;
2231 * } mtime;
2232 */
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002233 pages = ceph_alloc_page_vector(1, GFP_NOIO);
2234 if (IS_ERR(pages))
2235 return PTR_ERR(pages);
Alex Elderc5b5ef62013-02-11 12:33:24 -06002236
Ilya Dryomovbcbab1d2019-05-27 11:41:36 +02002237 osd_req_op_init(osd_req, which, CEPH_OSD_OP_STAT, 0);
2238 osd_req_op_raw_data_in_pages(osd_req, which, pages,
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002239 8 + sizeof(struct ceph_timespec),
2240 0, false, true);
Ilya Dryomov980917f2016-09-12 18:59:42 +02002241 return 0;
Alex Elderc5b5ef62013-02-11 12:33:24 -06002242}
2243
Ilya Dryomovb5ae8cb2019-05-29 16:53:14 +02002244static int rbd_osd_setup_copyup(struct ceph_osd_request *osd_req, int which,
2245 u32 bytes)
Ilya Dryomov13488d52019-02-25 12:37:50 +01002246{
Ilya Dryomovb5ae8cb2019-05-29 16:53:14 +02002247 struct rbd_obj_request *obj_req = osd_req->r_priv;
2248 int ret;
2249
2250 ret = osd_req_op_cls_init(osd_req, which, "rbd", "copyup");
2251 if (ret)
2252 return ret;
2253
2254 osd_req_op_cls_request_data_bvecs(osd_req, which, obj_req->copyup_bvecs,
2255 obj_req->copyup_bvec_count, bytes);
2256 return 0;
Ilya Dryomov13488d52019-02-25 12:37:50 +01002257}
2258
Ilya Dryomovea9b7432019-05-31 15:11:26 +02002259static int rbd_obj_init_read(struct rbd_obj_request *obj_req)
Alex Elderb454e362013-04-19 15:34:50 -05002260{
Ilya Dryomovea9b7432019-05-31 15:11:26 +02002261 obj_req->read_state = RBD_OBJ_READ_START;
2262 return 0;
2263}
2264
Ilya Dryomovbcbab1d2019-05-27 11:41:36 +02002265static void __rbd_osd_setup_write_ops(struct ceph_osd_request *osd_req,
2266 int which)
Alex Elderb454e362013-04-19 15:34:50 -05002267{
Ilya Dryomovbcbab1d2019-05-27 11:41:36 +02002268 struct rbd_obj_request *obj_req = osd_req->r_priv;
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002269 struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
2270 u16 opcode;
Alex Elderb454e362013-04-19 15:34:50 -05002271
Ilya Dryomov8b5bec52019-06-19 15:45:27 +02002272 if (!use_object_map(rbd_dev) ||
2273 !(obj_req->flags & RBD_OBJ_FLAG_MAY_EXIST)) {
2274 osd_req_op_alloc_hint_init(osd_req, which++,
2275 rbd_dev->layout.object_size,
Ilya Dryomovd3798ac2020-05-29 20:31:37 +02002276 rbd_dev->layout.object_size,
Ilya Dryomovdc1dad82020-05-29 20:51:23 +02002277 rbd_dev->opts->alloc_hint_flags);
Ilya Dryomov8b5bec52019-06-19 15:45:27 +02002278 }
Alex Elderb454e362013-04-19 15:34:50 -05002279
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002280 if (rbd_obj_is_entire(obj_req))
2281 opcode = CEPH_OSD_OP_WRITEFULL;
2282 else
2283 opcode = CEPH_OSD_OP_WRITE;
Ilya Dryomov70d045f2014-09-12 16:02:01 +04002284
Ilya Dryomovbcbab1d2019-05-27 11:41:36 +02002285 osd_req_op_extent_init(osd_req, which, opcode,
Ilya Dryomov43df3d32018-02-02 15:23:22 +01002286 obj_req->ex.oe_off, obj_req->ex.oe_len, 0, 0);
Ilya Dryomovbcbab1d2019-05-27 11:41:36 +02002287 rbd_osd_setup_data(osd_req, which);
Ilya Dryomov70d045f2014-09-12 16:02:01 +04002288}
2289
Ilya Dryomovea9b7432019-05-31 15:11:26 +02002290static int rbd_obj_init_write(struct rbd_obj_request *obj_req)
Ilya Dryomov70d045f2014-09-12 16:02:01 +04002291{
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002292 int ret;
Ilya Dryomov058aa992016-09-12 14:44:45 +02002293
Ilya Dryomov86bd7992018-02-06 19:26:33 +01002294 /* reverse map the entire object onto the parent */
2295 ret = rbd_obj_calc_img_extents(obj_req, true);
2296 if (ret)
2297 return ret;
2298
Ilya Dryomov0ad5d952019-05-14 20:45:38 +02002299 if (rbd_obj_copyup_enabled(obj_req))
2300 obj_req->flags |= RBD_OBJ_FLAG_COPYUP_ENABLED;
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002301
Ilya Dryomov85b5e6d2019-05-14 21:06:07 +02002302 obj_req->write_state = RBD_OBJ_WRITE_START;
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002303 return 0;
2304}
2305
Ilya Dryomov6484cbe2019-01-29 12:46:25 +01002306static u16 truncate_or_zero_opcode(struct rbd_obj_request *obj_req)
2307{
2308 return rbd_obj_is_tail(obj_req) ? CEPH_OSD_OP_TRUNCATE :
2309 CEPH_OSD_OP_ZERO;
2310}
2311
Ilya Dryomov27bbd912019-05-29 17:31:37 +02002312static void __rbd_osd_setup_discard_ops(struct ceph_osd_request *osd_req,
2313 int which)
2314{
2315 struct rbd_obj_request *obj_req = osd_req->r_priv;
2316
2317 if (rbd_obj_is_entire(obj_req) && !obj_req->num_img_extents) {
2318 rbd_assert(obj_req->flags & RBD_OBJ_FLAG_DELETION);
2319 osd_req_op_init(osd_req, which, CEPH_OSD_OP_DELETE, 0);
2320 } else {
2321 osd_req_op_extent_init(osd_req, which,
2322 truncate_or_zero_opcode(obj_req),
2323 obj_req->ex.oe_off, obj_req->ex.oe_len,
2324 0, 0);
2325 }
2326}
2327
Ilya Dryomovea9b7432019-05-31 15:11:26 +02002328static int rbd_obj_init_discard(struct rbd_obj_request *obj_req)
Ilya Dryomov6484cbe2019-01-29 12:46:25 +01002329{
Ilya Dryomov0c93e1b2019-01-30 15:14:48 +01002330 struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
Ilya Dryomov27bbd912019-05-29 17:31:37 +02002331 u64 off, next_off;
Ilya Dryomov6484cbe2019-01-29 12:46:25 +01002332 int ret;
2333
Ilya Dryomov0c93e1b2019-01-30 15:14:48 +01002334 /*
2335 * Align the range to alloc_size boundary and punt on discards
2336 * that are too small to free up any space.
2337 *
2338 * alloc_size == object_size && is_tail() is a special case for
2339 * filestore with filestore_punch_hole = false, needed to allow
2340 * truncate (in addition to delete).
2341 */
2342 if (rbd_dev->opts->alloc_size != rbd_dev->layout.object_size ||
2343 !rbd_obj_is_tail(obj_req)) {
Ilya Dryomov27bbd912019-05-29 17:31:37 +02002344 off = round_up(obj_req->ex.oe_off, rbd_dev->opts->alloc_size);
2345 next_off = round_down(obj_req->ex.oe_off + obj_req->ex.oe_len,
2346 rbd_dev->opts->alloc_size);
Ilya Dryomov0c93e1b2019-01-30 15:14:48 +01002347 if (off >= next_off)
2348 return 1;
Ilya Dryomov27bbd912019-05-29 17:31:37 +02002349
2350 dout("%s %p %llu~%llu -> %llu~%llu\n", __func__,
2351 obj_req, obj_req->ex.oe_off, obj_req->ex.oe_len,
2352 off, next_off - off);
2353 obj_req->ex.oe_off = off;
2354 obj_req->ex.oe_len = next_off - off;
Ilya Dryomov0c93e1b2019-01-30 15:14:48 +01002355 }
2356
Ilya Dryomov6484cbe2019-01-29 12:46:25 +01002357 /* reverse map the entire object onto the parent */
2358 ret = rbd_obj_calc_img_extents(obj_req, true);
2359 if (ret)
2360 return ret;
2361
Ilya Dryomov22e8bd52019-06-05 19:25:11 +02002362 obj_req->flags |= RBD_OBJ_FLAG_NOOP_FOR_NONEXISTENT;
Ilya Dryomov0ad5d952019-05-14 20:45:38 +02002363 if (rbd_obj_is_entire(obj_req) && !obj_req->num_img_extents)
2364 obj_req->flags |= RBD_OBJ_FLAG_DELETION;
Ilya Dryomov6484cbe2019-01-29 12:46:25 +01002365
Ilya Dryomov85b5e6d2019-05-14 21:06:07 +02002366 obj_req->write_state = RBD_OBJ_WRITE_START;
Ilya Dryomov6484cbe2019-01-29 12:46:25 +01002367 return 0;
2368}
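/*
 * Alignment example: with a 64K alloc_size and a 4M object_size, a
 * discard of 4K~124K within an object is shrunk to 64K~64K, while a
 * discard of 4K~56K cannot free any space and is dropped (ret == 1).
 */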
2369
Ilya Dryomovbcbab1d2019-05-27 11:41:36 +02002370static void __rbd_osd_setup_zeroout_ops(struct ceph_osd_request *osd_req,
2371 int which)
Ilya Dryomov13488d52019-02-25 12:37:50 +01002372{
Ilya Dryomovbcbab1d2019-05-27 11:41:36 +02002373 struct rbd_obj_request *obj_req = osd_req->r_priv;
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002374 u16 opcode;
2375
2376 if (rbd_obj_is_entire(obj_req)) {
Ilya Dryomov86bd7992018-02-06 19:26:33 +01002377 if (obj_req->num_img_extents) {
Ilya Dryomov0ad5d952019-05-14 20:45:38 +02002378 if (!(obj_req->flags & RBD_OBJ_FLAG_COPYUP_ENABLED))
Ilya Dryomovbcbab1d2019-05-27 11:41:36 +02002379 osd_req_op_init(osd_req, which++,
Ilya Dryomov9b17eb22019-02-28 15:51:39 +01002380 CEPH_OSD_OP_CREATE, 0);
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002381 opcode = CEPH_OSD_OP_TRUNCATE;
2382 } else {
Ilya Dryomov0ad5d952019-05-14 20:45:38 +02002383 rbd_assert(obj_req->flags & RBD_OBJ_FLAG_DELETION);
Ilya Dryomovbcbab1d2019-05-27 11:41:36 +02002384 osd_req_op_init(osd_req, which++,
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002385 CEPH_OSD_OP_DELETE, 0);
2386 opcode = 0;
2387 }
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002388 } else {
Ilya Dryomov6484cbe2019-01-29 12:46:25 +01002389 opcode = truncate_or_zero_opcode(obj_req);
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002390 }
2391
2392 if (opcode)
Ilya Dryomovbcbab1d2019-05-27 11:41:36 +02002393 osd_req_op_extent_init(osd_req, which, opcode,
Ilya Dryomov43df3d32018-02-02 15:23:22 +01002394 obj_req->ex.oe_off, obj_req->ex.oe_len,
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002395 0, 0);
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002396}
2397
Ilya Dryomovea9b7432019-05-31 15:11:26 +02002398static int rbd_obj_init_zeroout(struct rbd_obj_request *obj_req)
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002399{
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002400 int ret;
2401
Ilya Dryomov86bd7992018-02-06 19:26:33 +01002402 /* reverse map the entire object onto the parent */
2403 ret = rbd_obj_calc_img_extents(obj_req, true);
2404 if (ret)
2405 return ret;
2406
Ilya Dryomov0ad5d952019-05-14 20:45:38 +02002407 if (rbd_obj_copyup_enabled(obj_req))
2408 obj_req->flags |= RBD_OBJ_FLAG_COPYUP_ENABLED;
2409 if (!obj_req->num_img_extents) {
Ilya Dryomov22e8bd52019-06-05 19:25:11 +02002410 obj_req->flags |= RBD_OBJ_FLAG_NOOP_FOR_NONEXISTENT;
Ilya Dryomov0ad5d952019-05-14 20:45:38 +02002411 if (rbd_obj_is_entire(obj_req))
2412 obj_req->flags |= RBD_OBJ_FLAG_DELETION;
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002413 }
2414
Ilya Dryomov85b5e6d2019-05-14 21:06:07 +02002415 obj_req->write_state = RBD_OBJ_WRITE_START;
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002416 return 0;
2417}
2418
Ilya Dryomova086a1b2019-06-12 18:33:31 +02002419static int count_write_ops(struct rbd_obj_request *obj_req)
2420{
Ilya Dryomov8b5bec52019-06-19 15:45:27 +02002421 struct rbd_img_request *img_req = obj_req->img_request;
2422
2423 switch (img_req->op_type) {
Ilya Dryomova086a1b2019-06-12 18:33:31 +02002424 case OBJ_OP_WRITE:
Ilya Dryomov8b5bec52019-06-19 15:45:27 +02002425 if (!use_object_map(img_req->rbd_dev) ||
2426 !(obj_req->flags & RBD_OBJ_FLAG_MAY_EXIST))
2427 return 2; /* setallochint + write/writefull */
2428
2429 return 1; /* write/writefull */
Ilya Dryomova086a1b2019-06-12 18:33:31 +02002430 case OBJ_OP_DISCARD:
2431 return 1; /* delete/truncate/zero */
2432 case OBJ_OP_ZEROOUT:
2433 if (rbd_obj_is_entire(obj_req) && obj_req->num_img_extents &&
2434 !(obj_req->flags & RBD_OBJ_FLAG_COPYUP_ENABLED))
2435 return 2; /* create + truncate */
2436
2437 return 1; /* delete/truncate/zero */
2438 default:
2439 BUG();
2440 }
2441}
2442
2443static void rbd_osd_setup_write_ops(struct ceph_osd_request *osd_req,
2444 int which)
2445{
2446 struct rbd_obj_request *obj_req = osd_req->r_priv;
2447
2448 switch (obj_req->img_request->op_type) {
2449 case OBJ_OP_WRITE:
2450 __rbd_osd_setup_write_ops(osd_req, which);
2451 break;
2452 case OBJ_OP_DISCARD:
2453 __rbd_osd_setup_discard_ops(osd_req, which);
2454 break;
2455 case OBJ_OP_ZEROOUT:
2456 __rbd_osd_setup_zeroout_ops(osd_req, which);
2457 break;
2458 default:
2459 BUG();
2460 }
2461}
2462
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002463/*
Ilya Dryomova086a1b2019-06-12 18:33:31 +02002464 * Prune the list of object requests (adjust offset and/or length, drop
2465 * redundant requests). Prepare object request state machines and image
2466 * request state machine for execution.
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002467 */
2468static int __rbd_img_fill_request(struct rbd_img_request *img_req)
2469{
Ilya Dryomov0c93e1b2019-01-30 15:14:48 +01002470 struct rbd_obj_request *obj_req, *next_obj_req;
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002471 int ret;
2472
Ilya Dryomov0c93e1b2019-01-30 15:14:48 +01002473 for_each_obj_request_safe(img_req, obj_req, next_obj_req) {
Ilya Dryomov9bb02482018-01-30 17:52:10 +01002474 switch (img_req->op_type) {
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002475 case OBJ_OP_READ:
Ilya Dryomovea9b7432019-05-31 15:11:26 +02002476 ret = rbd_obj_init_read(obj_req);
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002477 break;
2478 case OBJ_OP_WRITE:
Ilya Dryomovea9b7432019-05-31 15:11:26 +02002479 ret = rbd_obj_init_write(obj_req);
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002480 break;
2481 case OBJ_OP_DISCARD:
Ilya Dryomovea9b7432019-05-31 15:11:26 +02002482 ret = rbd_obj_init_discard(obj_req);
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002483 break;
Ilya Dryomov6484cbe2019-01-29 12:46:25 +01002484 case OBJ_OP_ZEROOUT:
Ilya Dryomovea9b7432019-05-31 15:11:26 +02002485 ret = rbd_obj_init_zeroout(obj_req);
Ilya Dryomov6484cbe2019-01-29 12:46:25 +01002486 break;
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002487 default:
Arnd Bergmann16809372019-03-22 17:53:56 +01002488 BUG();
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002489 }
Ilya Dryomov0c93e1b2019-01-30 15:14:48 +01002490 if (ret < 0)
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002491 return ret;
Ilya Dryomov0c93e1b2019-01-30 15:14:48 +01002492 if (ret > 0) {
Ilya Dryomov0c93e1b2019-01-30 15:14:48 +01002493 rbd_img_obj_request_del(img_req, obj_req);
2494 continue;
2495 }
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002496 }
2497
Ilya Dryomov0192ce22019-05-16 15:06:56 +02002498 img_req->state = RBD_IMG_START;
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002499 return 0;
2500}
2501
Ilya Dryomov5a237812018-02-06 19:26:34 +01002502union rbd_img_fill_iter {
2503 struct ceph_bio_iter bio_iter;
2504 struct ceph_bvec_iter bvec_iter;
2505};
2506
2507struct rbd_img_fill_ctx {
2508 enum obj_request_type pos_type;
2509 union rbd_img_fill_iter *pos;
2510 union rbd_img_fill_iter iter;
2511 ceph_object_extent_fn_t set_pos_fn;
Ilya Dryomovafb97882018-02-06 19:26:35 +01002512 ceph_object_extent_fn_t count_fn;
2513 ceph_object_extent_fn_t copy_fn;
Ilya Dryomov5a237812018-02-06 19:26:34 +01002514};
2515
2516static struct ceph_object_extent *alloc_object_extent(void *arg)
2517{
2518 struct rbd_img_request *img_req = arg;
2519 struct rbd_obj_request *obj_req;
2520
2521 obj_req = rbd_obj_request_create();
2522 if (!obj_req)
2523 return NULL;
2524
2525 rbd_img_obj_request_add(img_req, obj_req);
2526 return &obj_req->ex;
2527}
2528
2529/*
Ilya Dryomovafb97882018-02-06 19:26:35 +01002530 * While su != os && sc == 1 (stripe_unit, object_size, stripe_count)
 2531 * is technically not fancy (it's the same layout as su == os && sc == 1),
 2532 * we can't use the nocopy path for it because ->set_pos_fn() should be
 2533 * called only once per object.  ceph_file_to_extents() invokes action_fn
 2534 * once per stripe unit, so treat su != os && sc == 1 as fancy.
Ilya Dryomov5a237812018-02-06 19:26:34 +01002535 */
Ilya Dryomovafb97882018-02-06 19:26:35 +01002536static bool rbd_layout_is_fancy(struct ceph_file_layout *l)
2537{
2538 return l->stripe_unit != l->object_size;
2539}
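/*
 * The default layout (stripe_unit == object_size, stripe_count == 1) is
 * not fancy; an image created with a custom stripe unit different from
 * its object size (the STRIPINGV2 feature) is treated as fancy here.
 */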
2540
2541static int rbd_img_fill_request_nocopy(struct rbd_img_request *img_req,
2542 struct ceph_file_extent *img_extents,
2543 u32 num_img_extents,
2544 struct rbd_img_fill_ctx *fctx)
Ilya Dryomov5a237812018-02-06 19:26:34 +01002545{
2546 u32 i;
2547 int ret;
2548
2549 img_req->data_type = fctx->pos_type;
2550
2551 /*
2552 * Create object requests and set each object request's starting
2553 * position in the provided bio (list) or bio_vec array.
2554 */
2555 fctx->iter = *fctx->pos;
2556 for (i = 0; i < num_img_extents; i++) {
2557 ret = ceph_file_to_extents(&img_req->rbd_dev->layout,
2558 img_extents[i].fe_off,
2559 img_extents[i].fe_len,
2560 &img_req->object_extents,
2561 alloc_object_extent, img_req,
2562 fctx->set_pos_fn, &fctx->iter);
2563 if (ret)
2564 return ret;
2565 }
2566
2567 return __rbd_img_fill_request(img_req);
2568}
2569
Ilya Dryomovafb97882018-02-06 19:26:35 +01002570/*
2571 * Map a list of image extents to a list of object extents, create the
2572 * corresponding object requests (normally each to a different object,
2573 * but not always) and add them to @img_req. For each object request,
2574 * set up its data descriptor to point to the corresponding chunk(s) of
2575 * @fctx->pos data buffer.
2576 *
2577 * Because ceph_file_to_extents() will merge adjacent object extents
2578 * together, each object request's data descriptor may point to multiple
2579 * different chunks of @fctx->pos data buffer.
2580 *
2581 * @fctx->pos data buffer is assumed to be large enough.
2582 */
2583static int rbd_img_fill_request(struct rbd_img_request *img_req,
2584 struct ceph_file_extent *img_extents,
2585 u32 num_img_extents,
2586 struct rbd_img_fill_ctx *fctx)
2587{
2588 struct rbd_device *rbd_dev = img_req->rbd_dev;
2589 struct rbd_obj_request *obj_req;
2590 u32 i;
2591 int ret;
2592
2593 if (fctx->pos_type == OBJ_REQUEST_NODATA ||
2594 !rbd_layout_is_fancy(&rbd_dev->layout))
2595 return rbd_img_fill_request_nocopy(img_req, img_extents,
2596 num_img_extents, fctx);
2597
2598 img_req->data_type = OBJ_REQUEST_OWN_BVECS;
2599
2600 /*
2601 * Create object requests and determine ->bvec_count for each object
2602 * request. Note that ->bvec_count sum over all object requests may
2603 * be greater than the number of bio_vecs in the provided bio (list)
2604 * or bio_vec array because when mapped, those bio_vecs can straddle
2605 * stripe unit boundaries.
2606 */
2607 fctx->iter = *fctx->pos;
2608 for (i = 0; i < num_img_extents; i++) {
2609 ret = ceph_file_to_extents(&rbd_dev->layout,
2610 img_extents[i].fe_off,
2611 img_extents[i].fe_len,
2612 &img_req->object_extents,
2613 alloc_object_extent, img_req,
2614 fctx->count_fn, &fctx->iter);
2615 if (ret)
2616 return ret;
2617 }
2618
2619 for_each_obj_request(img_req, obj_req) {
2620 obj_req->bvec_pos.bvecs = kmalloc_array(obj_req->bvec_count,
2621 sizeof(*obj_req->bvec_pos.bvecs),
2622 GFP_NOIO);
2623 if (!obj_req->bvec_pos.bvecs)
2624 return -ENOMEM;
Alex Elderb454e362013-04-19 15:34:50 -05002625 }
2626
2627 /*
Ilya Dryomovafb97882018-02-06 19:26:35 +01002628 * Fill in each object request's private bio_vec array, splitting and
2629 * rearranging the provided bio_vecs in stripe unit chunks as needed.
Alex Elderb454e362013-04-19 15:34:50 -05002630 */
Ilya Dryomovafb97882018-02-06 19:26:35 +01002631 fctx->iter = *fctx->pos;
2632 for (i = 0; i < num_img_extents; i++) {
2633 ret = ceph_iterate_extents(&rbd_dev->layout,
2634 img_extents[i].fe_off,
2635 img_extents[i].fe_len,
2636 &img_req->object_extents,
2637 fctx->copy_fn, &fctx->iter);
2638 if (ret)
2639 return ret;
2640 }
Alex Elder3d7efd12013-04-19 15:34:50 -05002641
Ilya Dryomovafb97882018-02-06 19:26:35 +01002642 return __rbd_img_fill_request(img_req);
Alex Elderb454e362013-04-19 15:34:50 -05002643}
2644
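/*
 * Fill @img_req for a request that carries no data payload (e.g.
 * discard or zeroout).
 */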
Ilya Dryomov5a237812018-02-06 19:26:34 +01002645static int rbd_img_fill_nodata(struct rbd_img_request *img_req,
2646 u64 off, u64 len)
2647{
2648 struct ceph_file_extent ex = { off, len };
Arnd Bergmanna55e6012020-01-07 22:01:04 +01002649 union rbd_img_fill_iter dummy = {};
Ilya Dryomov5a237812018-02-06 19:26:34 +01002650 struct rbd_img_fill_ctx fctx = {
2651 .pos_type = OBJ_REQUEST_NODATA,
2652 .pos = &dummy,
2653 };
2654
2655 return rbd_img_fill_request(img_req, &ex, 1, &fctx);
2656}
2657
2658static void set_bio_pos(struct ceph_object_extent *ex, u32 bytes, void *arg)
2659{
2660 struct rbd_obj_request *obj_req =
2661 container_of(ex, struct rbd_obj_request, ex);
2662 struct ceph_bio_iter *it = arg;
2663
2664 dout("%s objno %llu bytes %u\n", __func__, ex->oe_objno, bytes);
2665 obj_req->bio_pos = *it;
2666 ceph_bio_iter_advance(it, bytes);
2667}
2668
Ilya Dryomovafb97882018-02-06 19:26:35 +01002669static void count_bio_bvecs(struct ceph_object_extent *ex, u32 bytes, void *arg)
2670{
2671 struct rbd_obj_request *obj_req =
2672 container_of(ex, struct rbd_obj_request, ex);
2673 struct ceph_bio_iter *it = arg;
2674
2675 dout("%s objno %llu bytes %u\n", __func__, ex->oe_objno, bytes);
2676 ceph_bio_iter_advance_step(it, bytes, ({
2677 obj_req->bvec_count++;
2678 }));
2680}
2681
2682static void copy_bio_bvecs(struct ceph_object_extent *ex, u32 bytes, void *arg)
2683{
2684 struct rbd_obj_request *obj_req =
2685 container_of(ex, struct rbd_obj_request, ex);
2686 struct ceph_bio_iter *it = arg;
2687
2688 dout("%s objno %llu bytes %u\n", __func__, ex->oe_objno, bytes);
2689 ceph_bio_iter_advance_step(it, bytes, ({
2690 obj_req->bvec_pos.bvecs[obj_req->bvec_idx++] = bv;
2691 obj_req->bvec_pos.iter.bi_size += bv.bv_len;
2692 }));
2693}
2694
Ilya Dryomov5a237812018-02-06 19:26:34 +01002695static int __rbd_img_fill_from_bio(struct rbd_img_request *img_req,
2696 struct ceph_file_extent *img_extents,
2697 u32 num_img_extents,
2698 struct ceph_bio_iter *bio_pos)
2699{
2700 struct rbd_img_fill_ctx fctx = {
2701 .pos_type = OBJ_REQUEST_BIO,
2702 .pos = (union rbd_img_fill_iter *)bio_pos,
2703 .set_pos_fn = set_bio_pos,
Ilya Dryomovafb97882018-02-06 19:26:35 +01002704 .count_fn = count_bio_bvecs,
2705 .copy_fn = copy_bio_bvecs,
Ilya Dryomov5a237812018-02-06 19:26:34 +01002706 };
2707
2708 return rbd_img_fill_request(img_req, img_extents, num_img_extents,
2709 &fctx);
2710}
2711
2712static int rbd_img_fill_from_bio(struct rbd_img_request *img_req,
2713 u64 off, u64 len, struct bio *bio)
2714{
2715 struct ceph_file_extent ex = { off, len };
2716 struct ceph_bio_iter it = { .bio = bio, .iter = bio->bi_iter };
2717
2718 return __rbd_img_fill_from_bio(img_req, &ex, 1, &it);
2719}
2720
2721static void set_bvec_pos(struct ceph_object_extent *ex, u32 bytes, void *arg)
2722{
2723 struct rbd_obj_request *obj_req =
2724 container_of(ex, struct rbd_obj_request, ex);
2725 struct ceph_bvec_iter *it = arg;
2726
2727 obj_req->bvec_pos = *it;
2728 ceph_bvec_iter_shorten(&obj_req->bvec_pos, bytes);
2729 ceph_bvec_iter_advance(it, bytes);
2730}
2731
Ilya Dryomovafb97882018-02-06 19:26:35 +01002732static void count_bvecs(struct ceph_object_extent *ex, u32 bytes, void *arg)
2733{
2734 struct rbd_obj_request *obj_req =
2735 container_of(ex, struct rbd_obj_request, ex);
2736 struct ceph_bvec_iter *it = arg;
2737
2738 ceph_bvec_iter_advance_step(it, bytes, ({
2739 obj_req->bvec_count++;
2740 }));
2741}
2742
2743static void copy_bvecs(struct ceph_object_extent *ex, u32 bytes, void *arg)
2744{
2745 struct rbd_obj_request *obj_req =
2746 container_of(ex, struct rbd_obj_request, ex);
2747 struct ceph_bvec_iter *it = arg;
2748
2749 ceph_bvec_iter_advance_step(it, bytes, ({
2750 obj_req->bvec_pos.bvecs[obj_req->bvec_idx++] = bv;
2751 obj_req->bvec_pos.iter.bi_size += bv.bv_len;
2752 }));
2753}
2754
Ilya Dryomov5a237812018-02-06 19:26:34 +01002755static int __rbd_img_fill_from_bvecs(struct rbd_img_request *img_req,
2756 struct ceph_file_extent *img_extents,
2757 u32 num_img_extents,
2758 struct ceph_bvec_iter *bvec_pos)
2759{
2760 struct rbd_img_fill_ctx fctx = {
2761 .pos_type = OBJ_REQUEST_BVECS,
2762 .pos = (union rbd_img_fill_iter *)bvec_pos,
2763 .set_pos_fn = set_bvec_pos,
Ilya Dryomovafb97882018-02-06 19:26:35 +01002764 .count_fn = count_bvecs,
2765 .copy_fn = copy_bvecs,
Ilya Dryomov5a237812018-02-06 19:26:34 +01002766 };
2767
2768 return rbd_img_fill_request(img_req, img_extents, num_img_extents,
2769 &fctx);
2770}
2771
2772static int rbd_img_fill_from_bvecs(struct rbd_img_request *img_req,
2773 struct ceph_file_extent *img_extents,
2774 u32 num_img_extents,
2775 struct bio_vec *bvecs)
2776{
2777 struct ceph_bvec_iter it = {
2778 .bvecs = bvecs,
2779 .iter = { .bi_size = ceph_file_extents_bytes(img_extents,
2780 num_img_extents) },
2781 };
2782
2783 return __rbd_img_fill_from_bvecs(img_req, img_extents, num_img_extents,
2784 &it);
2785}
2786
Ilya Dryomov0192ce22019-05-16 15:06:56 +02002787static void rbd_img_handle_request_work(struct work_struct *work)
Alex Elderbf0d5f502012-11-22 00:00:08 -06002788{
Ilya Dryomov0192ce22019-05-16 15:06:56 +02002789 struct rbd_img_request *img_req =
2790 container_of(work, struct rbd_img_request, work);
Alex Elderbf0d5f502012-11-22 00:00:08 -06002791
Ilya Dryomov0192ce22019-05-16 15:06:56 +02002792 rbd_img_handle_request(img_req, img_req->work_result);
2793}
Alex Elderbf0d5f502012-11-22 00:00:08 -06002794
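/*
 * Resume the image request state machine from workqueue context with
 * the given result.  Used to avoid recursing through the parent chain
 * and to kick requests that were waiting for the exclusive lock.
 */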
Ilya Dryomov0192ce22019-05-16 15:06:56 +02002795static void rbd_img_schedule(struct rbd_img_request *img_req, int result)
2796{
2797 INIT_WORK(&img_req->work, rbd_img_handle_request_work);
2798 img_req->work_result = result;
2799 queue_work(rbd_wq, &img_req->work);
Alex Elderbf0d5f502012-11-22 00:00:08 -06002800}
2801
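/*
 * Consult the object map (if enabled) to decide whether the object may
 * exist.  The answer is recorded in RBD_OBJ_FLAG_MAY_EXIST so later
 * steps don't have to repeat the lookup.
 */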
Ilya Dryomov22e8bd52019-06-05 19:25:11 +02002802static bool rbd_obj_may_exist(struct rbd_obj_request *obj_req)
2803{
2804 struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
2805
2806 if (rbd_object_map_may_exist(rbd_dev, obj_req->ex.oe_objno)) {
2807 obj_req->flags |= RBD_OBJ_FLAG_MAY_EXIST;
2808 return true;
2809 }
2810
2811 dout("%s %p objno %llu assuming dne\n", __func__, obj_req,
2812 obj_req->ex.oe_objno);
2813 return false;
2814}
2815
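/*
 * Submit a single READ op covering the object extent.  The reply is
 * processed by rbd_obj_advance_read().
 */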
Ilya Dryomov85b5e6d2019-05-14 21:06:07 +02002816static int rbd_obj_read_object(struct rbd_obj_request *obj_req)
2817{
Ilya Dryomova086a1b2019-06-12 18:33:31 +02002818 struct ceph_osd_request *osd_req;
2819 int ret;
2820
2821 osd_req = __rbd_obj_add_osd_request(obj_req, NULL, 1);
2822 if (IS_ERR(osd_req))
2823 return PTR_ERR(osd_req);
2824
2825 osd_req_op_extent_init(osd_req, 0, CEPH_OSD_OP_READ,
2826 obj_req->ex.oe_off, obj_req->ex.oe_len, 0, 0);
2827 rbd_osd_setup_data(osd_req, 0);
2828 rbd_osd_format_read(osd_req);
2829
2830 ret = ceph_osdc_alloc_messages(osd_req, GFP_NOIO);
2831 if (ret)
2832 return ret;
2833
2834 rbd_osd_submit(osd_req);
Ilya Dryomov85b5e6d2019-05-14 21:06:07 +02002835 return 0;
Alex Elderbf0d5f502012-11-22 00:00:08 -06002836}
2837
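/*
 * Read this object's image extents from the parent image by submitting
 * a child image request against the parent device.  The child request
 * fills directly into @obj_req's data buffer (or its copyup bvecs on
 * the write path).
 */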
Ilya Dryomov86bd7992018-02-06 19:26:33 +01002838static int rbd_obj_read_from_parent(struct rbd_obj_request *obj_req)
Alex Elder8b3e1a52013-01-24 16:13:36 -06002839{
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002840 struct rbd_img_request *img_req = obj_req->img_request;
Ilya Dryomova52cc682020-02-12 15:08:39 +01002841 struct rbd_device *parent = img_req->rbd_dev->parent;
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002842 struct rbd_img_request *child_img_req;
2843 int ret;
Alex Elder8b3e1a52013-01-24 16:13:36 -06002844
Ilya Dryomov59e542c2020-02-12 15:23:58 +01002845 child_img_req = kmem_cache_alloc(rbd_img_request_cache, GFP_NOIO);
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002846 if (!child_img_req)
2847 return -ENOMEM;
Alex Elder8b3e1a52013-01-24 16:13:36 -06002848
Ilya Dryomov59e542c2020-02-12 15:23:58 +01002849 rbd_img_request_init(child_img_req, parent, OBJ_OP_READ);
Ilya Dryomove93aca02018-02-06 19:26:35 +01002850 __set_bit(IMG_REQ_CHILD, &child_img_req->flags);
2851 child_img_req->obj_request = obj_req;
Alex Elder02c74fb2013-05-06 17:40:33 -05002852
Ilya Dryomova52cc682020-02-12 15:08:39 +01002853 down_read(&parent->header_rwsem);
2854 rbd_img_capture_header(child_img_req);
2855 up_read(&parent->header_rwsem);
2856
Ilya Dryomov21ed05a2019-08-30 17:31:06 +02002857 dout("%s child_img_req %p for obj_req %p\n", __func__, child_img_req,
2858 obj_req);
2859
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002860 if (!rbd_img_is_write(img_req)) {
Ilya Dryomovecc633c2018-02-01 11:50:47 +01002861 switch (img_req->data_type) {
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002862 case OBJ_REQUEST_BIO:
Ilya Dryomov5a237812018-02-06 19:26:34 +01002863 ret = __rbd_img_fill_from_bio(child_img_req,
2864 obj_req->img_extents,
2865 obj_req->num_img_extents,
2866 &obj_req->bio_pos);
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002867 break;
2868 case OBJ_REQUEST_BVECS:
Ilya Dryomovafb97882018-02-06 19:26:35 +01002869 case OBJ_REQUEST_OWN_BVECS:
Ilya Dryomov5a237812018-02-06 19:26:34 +01002870 ret = __rbd_img_fill_from_bvecs(child_img_req,
2871 obj_req->img_extents,
2872 obj_req->num_img_extents,
2873 &obj_req->bvec_pos);
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002874 break;
2875 default:
Arnd Bergmannd342a152019-03-22 15:36:37 +01002876 BUG();
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002877 }
2878 } else {
Ilya Dryomov5a237812018-02-06 19:26:34 +01002879 ret = rbd_img_fill_from_bvecs(child_img_req,
2880 obj_req->img_extents,
2881 obj_req->num_img_extents,
2882 obj_req->copyup_bvecs);
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002883 }
2884 if (ret) {
Hannes Reinecke679a97d2020-01-31 11:37:36 +01002885 rbd_img_request_destroy(child_img_req);
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002886 return ret;
2887 }
2888
Ilya Dryomov0192ce22019-05-16 15:06:56 +02002889 /* avoid parent chain recursion */
2890 rbd_img_schedule(child_img_req, 0);
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002891 return 0;
2892}
2893
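/*
 * Object read state machine: RBD_OBJ_READ_START -> RBD_OBJ_READ_OBJECT
 * [-> RBD_OBJ_READ_PARENT].  Returns true when the object request has
 * completed, with the outcome in *result.
 */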
Ilya Dryomov85b5e6d2019-05-14 21:06:07 +02002894static bool rbd_obj_advance_read(struct rbd_obj_request *obj_req, int *result)
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002895{
2896 struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
2897 int ret;
2898
Ilya Dryomov22e8bd52019-06-05 19:25:11 +02002899again:
Ilya Dryomova9b67e62019-05-08 13:35:57 +02002900 switch (obj_req->read_state) {
Ilya Dryomov85b5e6d2019-05-14 21:06:07 +02002901 case RBD_OBJ_READ_START:
2902 rbd_assert(!*result);
2903
Ilya Dryomov22e8bd52019-06-05 19:25:11 +02002904 if (!rbd_obj_may_exist(obj_req)) {
2905 *result = -ENOENT;
2906 obj_req->read_state = RBD_OBJ_READ_OBJECT;
2907 goto again;
2908 }
2909
Ilya Dryomov85b5e6d2019-05-14 21:06:07 +02002910 ret = rbd_obj_read_object(obj_req);
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002911 if (ret) {
Ilya Dryomov85b5e6d2019-05-14 21:06:07 +02002912 *result = ret;
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002913 return true;
2914 }
Ilya Dryomov85b5e6d2019-05-14 21:06:07 +02002915 obj_req->read_state = RBD_OBJ_READ_OBJECT;
2916 return false;
Ilya Dryomova9b67e62019-05-08 13:35:57 +02002917 case RBD_OBJ_READ_OBJECT:
2918 if (*result == -ENOENT && rbd_dev->parent_overlap) {
2919 /* reverse map this object extent onto the parent */
2920 ret = rbd_obj_calc_img_extents(obj_req, false);
Ilya Dryomov86bd7992018-02-06 19:26:33 +01002921 if (ret) {
Ilya Dryomov54ab3b22019-05-11 16:21:49 +02002922 *result = ret;
Ilya Dryomov86bd7992018-02-06 19:26:33 +01002923 return true;
2924 }
Ilya Dryomova9b67e62019-05-08 13:35:57 +02002925 if (obj_req->num_img_extents) {
2926 ret = rbd_obj_read_from_parent(obj_req);
2927 if (ret) {
2928 *result = ret;
2929 return true;
2930 }
2931 obj_req->read_state = RBD_OBJ_READ_PARENT;
2932 return false;
2933 }
Ilya Dryomov86bd7992018-02-06 19:26:33 +01002934 }
Alex Elder02c74fb2013-05-06 17:40:33 -05002935
Ilya Dryomova9b67e62019-05-08 13:35:57 +02002936 /*
2937 * -ENOENT means a hole in the image -- zero-fill the entire
2938 * length of the request. A short read also implies zero-fill
2939 * to the end of the request.
2940 */
2941 if (*result == -ENOENT) {
2942 rbd_obj_zero_range(obj_req, 0, obj_req->ex.oe_len);
2943 *result = 0;
2944 } else if (*result >= 0) {
2945 if (*result < obj_req->ex.oe_len)
2946 rbd_obj_zero_range(obj_req, *result,
2947 obj_req->ex.oe_len - *result);
2948 else
2949 rbd_assert(*result == obj_req->ex.oe_len);
2950 *result = 0;
2951 }
2952 return true;
2953 case RBD_OBJ_READ_PARENT:
Ilya Dryomovd435c9a2019-08-27 16:45:10 +02002954 /*
2955 * The parent image is read only up to the overlap -- zero-fill
2956 * from the overlap to the end of the request.
2957 */
2958 if (!*result) {
2959 u32 obj_overlap = rbd_obj_img_extents_bytes(obj_req);
2960
2961 if (obj_overlap < obj_req->ex.oe_len)
2962 rbd_obj_zero_range(obj_req, obj_overlap,
2963 obj_req->ex.oe_len - obj_overlap);
2964 }
Ilya Dryomova9b67e62019-05-08 13:35:57 +02002965 return true;
2966 default:
2967 BUG();
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002968 }
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002969}
2970
Ilya Dryomov22e8bd52019-06-05 19:25:11 +02002971static bool rbd_obj_write_is_noop(struct rbd_obj_request *obj_req)
2972{
2973 struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
2974
2975 if (rbd_object_map_may_exist(rbd_dev, obj_req->ex.oe_objno))
2976 obj_req->flags |= RBD_OBJ_FLAG_MAY_EXIST;
2977
2978 if (!(obj_req->flags & RBD_OBJ_FLAG_MAY_EXIST) &&
2979 (obj_req->flags & RBD_OBJ_FLAG_NOOP_FOR_NONEXISTENT)) {
2980 dout("%s %p noop for nonexistent\n", __func__, obj_req);
2981 return true;
Alex Elder02c74fb2013-05-06 17:40:33 -05002982 }
2983
Ilya Dryomov22e8bd52019-06-05 19:25:11 +02002984 return false;
2985}
2986
2987/*
2988 * Return:
2989 * 0 - object map update sent
2990 * 1 - object map update isn't needed
2991 * <0 - error
2992 */
2993static int rbd_obj_write_pre_object_map(struct rbd_obj_request *obj_req)
2994{
2995 struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
2996 u8 new_state;
2997
2998 if (!(rbd_dev->header.features & RBD_FEATURE_OBJECT_MAP))
2999 return 1;
3000
3001 if (obj_req->flags & RBD_OBJ_FLAG_DELETION)
3002 new_state = OBJECT_PENDING;
3003 else
3004 new_state = OBJECT_EXISTS;
3005
3006 return rbd_object_map_update(obj_req, CEPH_NOSNAP, new_state, NULL);
3007}
3008
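/*
 * Submit the write (or discard/zeroout) ops for the object.  When
 * copyup is enabled, a leading stat op makes the request fail with
 * -ENOENT if the object doesn't exist, handing control over to the
 * copyup state machine.
 */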
Ilya Dryomov85b5e6d2019-05-14 21:06:07 +02003009static int rbd_obj_write_object(struct rbd_obj_request *obj_req)
3010{
Ilya Dryomova086a1b2019-06-12 18:33:31 +02003011 struct ceph_osd_request *osd_req;
3012 int num_ops = count_write_ops(obj_req);
3013 int which = 0;
3014 int ret;
3015
3016 if (obj_req->flags & RBD_OBJ_FLAG_COPYUP_ENABLED)
3017 num_ops++; /* stat */
3018
3019 osd_req = rbd_obj_add_osd_request(obj_req, num_ops);
3020 if (IS_ERR(osd_req))
3021 return PTR_ERR(osd_req);
3022
3023 if (obj_req->flags & RBD_OBJ_FLAG_COPYUP_ENABLED) {
3024 ret = rbd_osd_setup_stat(osd_req, which++);
3025 if (ret)
3026 return ret;
Ilya Dryomov3da691b2018-01-29 14:04:08 +01003027 }
3028
Ilya Dryomova086a1b2019-06-12 18:33:31 +02003029 rbd_osd_setup_write_ops(osd_req, which);
3030 rbd_osd_format_write(osd_req);
3031
3032 ret = ceph_osdc_alloc_messages(osd_req, GFP_NOIO);
3033 if (ret)
3034 return ret;
3035
3036 rbd_osd_submit(osd_req);
Ilya Dryomov85b5e6d2019-05-14 21:06:07 +02003037 return 0;
Ilya Dryomov3da691b2018-01-29 14:04:08 +01003038}
3039
3040/*
3041 * copyup_bvecs pages are never highmem pages
3042 */
3043static bool is_zero_bvecs(struct bio_vec *bvecs, u32 bytes)
3044{
3045 struct ceph_bvec_iter it = {
3046 .bvecs = bvecs,
3047 .iter = { .bi_size = bytes },
3048 };
3049
3050 ceph_bvec_iter_advance_step(&it, bytes, ({
3051 if (memchr_inv(page_address(bv.bv_page) + bv.bv_offset, 0,
3052 bv.bv_len))
3053 return false;
3054 }));
3055 return true;
3056}
3057
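/*
 * Pass MODS_ONLY as the byte count to send only the modification ops,
 * without a copyup op.
 */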
Ilya Dryomov3a482502019-02-28 10:49:12 +01003058#define MODS_ONLY U32_MAX
3059
Ilya Dryomov793333a302019-06-13 17:44:08 +02003060static int rbd_obj_copyup_empty_snapc(struct rbd_obj_request *obj_req,
3061 u32 bytes)
Ilya Dryomov3da691b2018-01-29 14:04:08 +01003062{
Ilya Dryomovbcbab1d2019-05-27 11:41:36 +02003063 struct ceph_osd_request *osd_req;
Chengguang Xufe943d52018-04-12 12:04:55 +08003064 int ret;
Ilya Dryomov3da691b2018-01-29 14:04:08 +01003065
3066 dout("%s obj_req %p bytes %u\n", __func__, obj_req, bytes);
Ilya Dryomov89a59c12019-02-28 14:20:28 +01003067 rbd_assert(bytes > 0 && bytes != MODS_ONLY);
Ilya Dryomov3da691b2018-01-29 14:04:08 +01003068
Ilya Dryomovbcbab1d2019-05-27 11:41:36 +02003069 osd_req = __rbd_obj_add_osd_request(obj_req, &rbd_empty_snapc, 1);
3070 if (IS_ERR(osd_req))
3071 return PTR_ERR(osd_req);
Ilya Dryomov3da691b2018-01-29 14:04:08 +01003072
Ilya Dryomovb5ae8cb2019-05-29 16:53:14 +02003073 ret = rbd_osd_setup_copyup(osd_req, 0, bytes);
Chengguang Xufe943d52018-04-12 12:04:55 +08003074 if (ret)
3075 return ret;
3076
Ilya Dryomovbcbab1d2019-05-27 11:41:36 +02003077 rbd_osd_format_write(osd_req);
Ilya Dryomov3da691b2018-01-29 14:04:08 +01003078
Ilya Dryomovbcbab1d2019-05-27 11:41:36 +02003079 ret = ceph_osdc_alloc_messages(osd_req, GFP_NOIO);
Ilya Dryomov89a59c12019-02-28 14:20:28 +01003080 if (ret)
3081 return ret;
3082
Ilya Dryomova086a1b2019-06-12 18:33:31 +02003083 rbd_osd_submit(osd_req);
Ilya Dryomov89a59c12019-02-28 14:20:28 +01003084 return 0;
3085}
3086
Ilya Dryomov793333a302019-06-13 17:44:08 +02003087static int rbd_obj_copyup_current_snapc(struct rbd_obj_request *obj_req,
3088 u32 bytes)
Ilya Dryomov3da691b2018-01-29 14:04:08 +01003089{
Ilya Dryomovbcbab1d2019-05-27 11:41:36 +02003090 struct ceph_osd_request *osd_req;
Ilya Dryomova086a1b2019-06-12 18:33:31 +02003091 int num_ops = count_write_ops(obj_req);
3092 int which = 0;
Ilya Dryomov3da691b2018-01-29 14:04:08 +01003093 int ret;
3094
3095 dout("%s obj_req %p bytes %u\n", __func__, obj_req, bytes);
Ilya Dryomov3da691b2018-01-29 14:04:08 +01003096
Ilya Dryomova086a1b2019-06-12 18:33:31 +02003097 if (bytes != MODS_ONLY)
3098 num_ops++; /* copyup */
Ilya Dryomov13488d52019-02-25 12:37:50 +01003099
Ilya Dryomova086a1b2019-06-12 18:33:31 +02003100 osd_req = rbd_obj_add_osd_request(obj_req, num_ops);
Ilya Dryomovbcbab1d2019-05-27 11:41:36 +02003101 if (IS_ERR(osd_req))
3102 return PTR_ERR(osd_req);
Ilya Dryomov3da691b2018-01-29 14:04:08 +01003103
Ilya Dryomov3a482502019-02-28 10:49:12 +01003104 if (bytes != MODS_ONLY) {
Ilya Dryomovb5ae8cb2019-05-29 16:53:14 +02003105 ret = rbd_osd_setup_copyup(osd_req, which++, bytes);
Ilya Dryomov3a482502019-02-28 10:49:12 +01003106 if (ret)
3107 return ret;
Ilya Dryomov3da691b2018-01-29 14:04:08 +01003108 }
Ilya Dryomov3da691b2018-01-29 14:04:08 +01003109
Ilya Dryomova086a1b2019-06-12 18:33:31 +02003110 rbd_osd_setup_write_ops(osd_req, which);
3111 rbd_osd_format_write(osd_req);
Ilya Dryomov3da691b2018-01-29 14:04:08 +01003112
Ilya Dryomovbcbab1d2019-05-27 11:41:36 +02003113 ret = ceph_osdc_alloc_messages(osd_req, GFP_NOIO);
Ilya Dryomov26f887e2018-10-15 16:11:37 +02003114 if (ret)
3115 return ret;
3116
Ilya Dryomova086a1b2019-06-12 18:33:31 +02003117 rbd_osd_submit(osd_req);
Ilya Dryomov3da691b2018-01-29 14:04:08 +01003118 return 0;
3119}
3120
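/*
 * Allocate a page-backed bio_vec array large enough to hold
 * @obj_overlap bytes of parent data for the copyup.
 */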
Ilya Dryomov7e07efb2018-01-20 10:30:11 +01003121static int setup_copyup_bvecs(struct rbd_obj_request *obj_req, u64 obj_overlap)
3122{
3123 u32 i;
3124
3125 rbd_assert(!obj_req->copyup_bvecs);
3126 obj_req->copyup_bvec_count = calc_pages_for(0, obj_overlap);
3127 obj_req->copyup_bvecs = kcalloc(obj_req->copyup_bvec_count,
3128 sizeof(*obj_req->copyup_bvecs),
3129 GFP_NOIO);
3130 if (!obj_req->copyup_bvecs)
3131 return -ENOMEM;
3132
3133 for (i = 0; i < obj_req->copyup_bvec_count; i++) {
3134 unsigned int len = min(obj_overlap, (u64)PAGE_SIZE);
3135
3136 obj_req->copyup_bvecs[i].bv_page = alloc_page(GFP_NOIO);
3137 if (!obj_req->copyup_bvecs[i].bv_page)
3138 return -ENOMEM;
3139
3140 obj_req->copyup_bvecs[i].bv_offset = 0;
3141 obj_req->copyup_bvecs[i].bv_len = len;
3142 obj_overlap -= len;
3143 }
3144
3145 rbd_assert(!obj_overlap);
3146 return 0;
3147}
3148
Ilya Dryomov0ad5d952019-05-14 20:45:38 +02003149/*
3150 * The target object doesn't exist. Read the data for the entire
3151 * target object up to the overlap point (if any) from the parent,
3152 * so we can use it for a copyup.
3153 */
Ilya Dryomov793333a302019-06-13 17:44:08 +02003154static int rbd_obj_copyup_read_parent(struct rbd_obj_request *obj_req)
Ilya Dryomov3da691b2018-01-29 14:04:08 +01003155{
3156 struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
Ilya Dryomov3da691b2018-01-29 14:04:08 +01003157 int ret;
3158
Ilya Dryomov86bd7992018-02-06 19:26:33 +01003159 rbd_assert(obj_req->num_img_extents);
3160 prune_extents(obj_req->img_extents, &obj_req->num_img_extents,
3161 rbd_dev->parent_overlap);
3162 if (!obj_req->num_img_extents) {
Ilya Dryomov3da691b2018-01-29 14:04:08 +01003163 /*
3164 * The overlap has become 0 (most likely because the
Ilya Dryomov3a482502019-02-28 10:49:12 +01003165 * image has been flattened). Re-submit the original write
3166 * request -- pass MODS_ONLY since the copyup isn't needed
3167 * anymore.
Ilya Dryomov3da691b2018-01-29 14:04:08 +01003168 */
Ilya Dryomov793333a302019-06-13 17:44:08 +02003169 return rbd_obj_copyup_current_snapc(obj_req, MODS_ONLY);
Ilya Dryomov3da691b2018-01-29 14:04:08 +01003170 }
3171
Ilya Dryomov86bd7992018-02-06 19:26:33 +01003172 ret = setup_copyup_bvecs(obj_req, rbd_obj_img_extents_bytes(obj_req));
Ilya Dryomov3da691b2018-01-29 14:04:08 +01003173 if (ret)
3174 return ret;
3175
Ilya Dryomov86bd7992018-02-06 19:26:33 +01003176 return rbd_obj_read_from_parent(obj_req);
Ilya Dryomov3da691b2018-01-29 14:04:08 +01003177}
3178
Ilya Dryomov22e8bd52019-06-05 19:25:11 +02003179static void rbd_obj_copyup_object_maps(struct rbd_obj_request *obj_req)
Ilya Dryomov3da691b2018-01-29 14:04:08 +01003180{
Ilya Dryomov22e8bd52019-06-05 19:25:11 +02003181 struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
3182 struct ceph_snap_context *snapc = obj_req->img_request->snapc;
3183 u8 new_state;
3184 u32 i;
Ilya Dryomov3da691b2018-01-29 14:04:08 +01003185 int ret;
3186
Ilya Dryomov22e8bd52019-06-05 19:25:11 +02003187 rbd_assert(!obj_req->pending.result && !obj_req->pending.num_pending);
3188
3189 if (!(rbd_dev->header.features & RBD_FEATURE_OBJECT_MAP))
3190 return;
3191
3192 if (obj_req->flags & RBD_OBJ_FLAG_COPYUP_ZEROS)
3193 return;
3194
3195 for (i = 0; i < snapc->num_snaps; i++) {
3196 if ((rbd_dev->header.features & RBD_FEATURE_FAST_DIFF) &&
3197 i + 1 < snapc->num_snaps)
3198 new_state = OBJECT_EXISTS_CLEAN;
3199 else
3200 new_state = OBJECT_EXISTS;
3201
3202 ret = rbd_object_map_update(obj_req, snapc->snaps[i],
3203 new_state, NULL);
3204 if (ret < 0) {
3205 obj_req->pending.result = ret;
3206 return;
3207 }
3208
3209 rbd_assert(!ret);
3210 obj_req->pending.num_pending++;
3211 }
3212}
3213
Ilya Dryomov793333a302019-06-13 17:44:08 +02003214static void rbd_obj_copyup_write_object(struct rbd_obj_request *obj_req)
3215{
3216 u32 bytes = rbd_obj_img_extents_bytes(obj_req);
3217 int ret;
3218
3219 rbd_assert(!obj_req->pending.result && !obj_req->pending.num_pending);
3220
3221 /*
3222 * Only send non-zero copyup data to save some I/O and network
3223 * bandwidth -- zero copyup data is equivalent to the object not
3224 * existing.
3225 */
3226 if (obj_req->flags & RBD_OBJ_FLAG_COPYUP_ZEROS)
3227 bytes = 0;
3228
3229 if (obj_req->img_request->snapc->num_snaps && bytes > 0) {
3230 /*
3231 * Send a copyup request with an empty snapshot context to
3232 * deep-copyup the object through all existing snapshots.
3233 * A second request with the current snapshot context will be
3234 * sent for the actual modification.
3235 */
3236 ret = rbd_obj_copyup_empty_snapc(obj_req, bytes);
3237 if (ret) {
3238 obj_req->pending.result = ret;
3239 return;
3240 }
3241
3242 obj_req->pending.num_pending++;
3243 bytes = MODS_ONLY;
3244 }
3245
3246 ret = rbd_obj_copyup_current_snapc(obj_req, bytes);
3247 if (ret) {
3248 obj_req->pending.result = ret;
3249 return;
3250 }
3251
3252 obj_req->pending.num_pending++;
3253}
3254
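/*
 * Copyup state machine: read the missing object from the parent, update
 * the object map for existing snapshots (if any), then write out the
 * copied-up data together with the original modification.
 */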
3255static bool rbd_obj_advance_copyup(struct rbd_obj_request *obj_req, int *result)
3256{
Ilya Dryomov22e8bd52019-06-05 19:25:11 +02003257 struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
Ilya Dryomov793333a302019-06-13 17:44:08 +02003258 int ret;
3259
3260again:
3261 switch (obj_req->copyup_state) {
3262 case RBD_OBJ_COPYUP_START:
3263 rbd_assert(!*result);
3264
3265 ret = rbd_obj_copyup_read_parent(obj_req);
3266 if (ret) {
3267 *result = ret;
3268 return true;
3269 }
3270 if (obj_req->num_img_extents)
3271 obj_req->copyup_state = RBD_OBJ_COPYUP_READ_PARENT;
3272 else
3273 obj_req->copyup_state = RBD_OBJ_COPYUP_WRITE_OBJECT;
3274 return false;
3275 case RBD_OBJ_COPYUP_READ_PARENT:
3276 if (*result)
3277 return true;
3278
3279 if (is_zero_bvecs(obj_req->copyup_bvecs,
3280 rbd_obj_img_extents_bytes(obj_req))) {
3281 dout("%s %p detected zeros\n", __func__, obj_req);
3282 obj_req->flags |= RBD_OBJ_FLAG_COPYUP_ZEROS;
3283 }
3284
Ilya Dryomov22e8bd52019-06-05 19:25:11 +02003285 rbd_obj_copyup_object_maps(obj_req);
3286 if (!obj_req->pending.num_pending) {
3287 *result = obj_req->pending.result;
3288 obj_req->copyup_state = RBD_OBJ_COPYUP_OBJECT_MAPS;
3289 goto again;
3290 }
3291 obj_req->copyup_state = __RBD_OBJ_COPYUP_OBJECT_MAPS;
3292 return false;
3293 case __RBD_OBJ_COPYUP_OBJECT_MAPS:
3294 if (!pending_result_dec(&obj_req->pending, result))
Ilya Dryomov3da691b2018-01-29 14:04:08 +01003295 return false;
Gustavo A. R. Silvadf561f662020-08-23 17:36:59 -05003296 fallthrough;
Ilya Dryomov22e8bd52019-06-05 19:25:11 +02003297 case RBD_OBJ_COPYUP_OBJECT_MAPS:
3298 if (*result) {
3299 rbd_warn(rbd_dev, "snap object map update failed: %d",
3300 *result);
3301 return true;
3302 }
3303
Ilya Dryomov793333a302019-06-13 17:44:08 +02003304 rbd_obj_copyup_write_object(obj_req);
3305 if (!obj_req->pending.num_pending) {
3306 *result = obj_req->pending.result;
3307 obj_req->copyup_state = RBD_OBJ_COPYUP_WRITE_OBJECT;
3308 goto again;
3309 }
3310 obj_req->copyup_state = __RBD_OBJ_COPYUP_WRITE_OBJECT;
3311 return false;
3312 case __RBD_OBJ_COPYUP_WRITE_OBJECT:
3313 if (!pending_result_dec(&obj_req->pending, result))
3314 return false;
Gustavo A. R. Silvadf561f662020-08-23 17:36:59 -05003315 fallthrough;
Ilya Dryomov793333a302019-06-13 17:44:08 +02003316 case RBD_OBJ_COPYUP_WRITE_OBJECT:
Ilya Dryomov3da691b2018-01-29 14:04:08 +01003317 return true;
Ilya Dryomov3da691b2018-01-29 14:04:08 +01003318 default:
Arnd Bergmannc6244b32018-04-04 14:53:39 +02003319 BUG();
Ilya Dryomov3da691b2018-01-29 14:04:08 +01003320 }
3321}
3322
3323/*
Ilya Dryomov22e8bd52019-06-05 19:25:11 +02003324 * Return:
3325 * 0 - object map update sent
3326 * 1 - object map update isn't needed
3327 * <0 - error
Ilya Dryomov3da691b2018-01-29 14:04:08 +01003328 */
Ilya Dryomov22e8bd52019-06-05 19:25:11 +02003329static int rbd_obj_write_post_object_map(struct rbd_obj_request *obj_req)
Ilya Dryomov3da691b2018-01-29 14:04:08 +01003330{
Ilya Dryomov22e8bd52019-06-05 19:25:11 +02003331 struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
3332 u8 current_state = OBJECT_PENDING;
3333
3334 if (!(rbd_dev->header.features & RBD_FEATURE_OBJECT_MAP))
3335 return 1;
3336
3337 if (!(obj_req->flags & RBD_OBJ_FLAG_DELETION))
3338 return 1;
3339
3340 return rbd_object_map_update(obj_req, CEPH_NOSNAP, OBJECT_NONEXISTENT,
3341 &current_state);
3342}
3343
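/*
 * Object write state machine.  Covers the pre/post object map updates
 * and hands off to the copyup state machine when the target object
 * turns out not to exist.
 */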
Ilya Dryomov85b5e6d2019-05-14 21:06:07 +02003344static bool rbd_obj_advance_write(struct rbd_obj_request *obj_req, int *result)
Ilya Dryomov3da691b2018-01-29 14:04:08 +01003345{
Ilya Dryomov793333a302019-06-13 17:44:08 +02003346 struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
Ilya Dryomov3da691b2018-01-29 14:04:08 +01003347 int ret;
3348
Ilya Dryomov793333a302019-06-13 17:44:08 +02003349again:
Ilya Dryomov3da691b2018-01-29 14:04:08 +01003350 switch (obj_req->write_state) {
Ilya Dryomov85b5e6d2019-05-14 21:06:07 +02003351 case RBD_OBJ_WRITE_START:
3352 rbd_assert(!*result);
3353
Ilya Dryomov22e8bd52019-06-05 19:25:11 +02003354 if (rbd_obj_write_is_noop(obj_req))
3355 return true;
3356
3357 ret = rbd_obj_write_pre_object_map(obj_req);
3358 if (ret < 0) {
3359 *result = ret;
Ilya Dryomov3da691b2018-01-29 14:04:08 +01003360 return true;
3361 }
Ilya Dryomov22e8bd52019-06-05 19:25:11 +02003362 obj_req->write_state = RBD_OBJ_WRITE_PRE_OBJECT_MAP;
3363 if (ret > 0)
3364 goto again;
Ilya Dryomov3da691b2018-01-29 14:04:08 +01003365 return false;
Ilya Dryomov22e8bd52019-06-05 19:25:11 +02003366 case RBD_OBJ_WRITE_PRE_OBJECT_MAP:
3367 if (*result) {
3368 rbd_warn(rbd_dev, "pre object map update failed: %d",
3369 *result);
3370 return true;
3371 }
Ilya Dryomov85b5e6d2019-05-14 21:06:07 +02003372 ret = rbd_obj_write_object(obj_req);
3373 if (ret) {
3374 *result = ret;
3375 return true;
3376 }
3377 obj_req->write_state = RBD_OBJ_WRITE_OBJECT;
3378 return false;
Ilya Dryomov0ad5d952019-05-14 20:45:38 +02003379 case RBD_OBJ_WRITE_OBJECT:
Ilya Dryomov54ab3b22019-05-11 16:21:49 +02003380 if (*result == -ENOENT) {
Ilya Dryomov0ad5d952019-05-14 20:45:38 +02003381 if (obj_req->flags & RBD_OBJ_FLAG_COPYUP_ENABLED) {
Ilya Dryomov793333a302019-06-13 17:44:08 +02003382 *result = 0;
3383 obj_req->copyup_state = RBD_OBJ_COPYUP_START;
3384 obj_req->write_state = __RBD_OBJ_WRITE_COPYUP;
3385 goto again;
Ilya Dryomov3da691b2018-01-29 14:04:08 +01003386 }
Ilya Dryomov0ad5d952019-05-14 20:45:38 +02003387 /*
3388 * On a non-existent object:
 3389			 * delete returns -ENOENT, truncate/zero returns 0
3390 */
3391 if (obj_req->flags & RBD_OBJ_FLAG_DELETION)
3392 *result = 0;
Ilya Dryomov3da691b2018-01-29 14:04:08 +01003393 }
Ilya Dryomov793333a302019-06-13 17:44:08 +02003394 if (*result)
3395 return true;
3396
3397 obj_req->write_state = RBD_OBJ_WRITE_COPYUP;
3398 goto again;
3399 case __RBD_OBJ_WRITE_COPYUP:
3400 if (!rbd_obj_advance_copyup(obj_req, result))
3401 return false;
Gustavo A. R. Silvadf561f662020-08-23 17:36:59 -05003402 fallthrough;
Ilya Dryomov793333a302019-06-13 17:44:08 +02003403 case RBD_OBJ_WRITE_COPYUP:
Ilya Dryomov22e8bd52019-06-05 19:25:11 +02003404 if (*result) {
Ilya Dryomov793333a302019-06-13 17:44:08 +02003405 rbd_warn(rbd_dev, "copyup failed: %d", *result);
Ilya Dryomov22e8bd52019-06-05 19:25:11 +02003406 return true;
3407 }
3408 ret = rbd_obj_write_post_object_map(obj_req);
3409 if (ret < 0) {
3410 *result = ret;
3411 return true;
3412 }
3413 obj_req->write_state = RBD_OBJ_WRITE_POST_OBJECT_MAP;
3414 if (ret > 0)
3415 goto again;
3416 return false;
3417 case RBD_OBJ_WRITE_POST_OBJECT_MAP:
3418 if (*result)
3419 rbd_warn(rbd_dev, "post object map update failed: %d",
3420 *result);
Ilya Dryomov3da691b2018-01-29 14:04:08 +01003421 return true;
Ilya Dryomov3da691b2018-01-29 14:04:08 +01003422 default:
Arnd Bergmannc6244b32018-04-04 14:53:39 +02003423 BUG();
Ilya Dryomov3da691b2018-01-29 14:04:08 +01003424 }
3425}
3426
Ilya Dryomov3da691b2018-01-29 14:04:08 +01003427/*
Ilya Dryomov0ad5d952019-05-14 20:45:38 +02003428 * Return true if @obj_req is completed.
Ilya Dryomov3da691b2018-01-29 14:04:08 +01003429 */
Ilya Dryomov54ab3b22019-05-11 16:21:49 +02003430static bool __rbd_obj_handle_request(struct rbd_obj_request *obj_req,
3431 int *result)
Ilya Dryomov7114eda2018-02-01 11:50:47 +01003432{
3433 struct rbd_img_request *img_req = obj_req->img_request;
Ilya Dryomov0192ce22019-05-16 15:06:56 +02003434 struct rbd_device *rbd_dev = img_req->rbd_dev;
Ilya Dryomov0ad5d952019-05-14 20:45:38 +02003435 bool done;
Ilya Dryomov7114eda2018-02-01 11:50:47 +01003436
Ilya Dryomov85b5e6d2019-05-14 21:06:07 +02003437 mutex_lock(&obj_req->state_mutex);
Ilya Dryomov0ad5d952019-05-14 20:45:38 +02003438 if (!rbd_img_is_write(img_req))
Ilya Dryomov85b5e6d2019-05-14 21:06:07 +02003439 done = rbd_obj_advance_read(obj_req, result);
Ilya Dryomov0ad5d952019-05-14 20:45:38 +02003440 else
Ilya Dryomov85b5e6d2019-05-14 21:06:07 +02003441 done = rbd_obj_advance_write(obj_req, result);
3442 mutex_unlock(&obj_req->state_mutex);
Alex Elder02c74fb2013-05-06 17:40:33 -05003443
Ilya Dryomov0192ce22019-05-16 15:06:56 +02003444 if (done && *result) {
3445 rbd_assert(*result < 0);
3446 rbd_warn(rbd_dev, "%s at objno %llu %llu~%llu result %d",
3447 obj_op_name(img_req->op_type), obj_req->ex.oe_objno,
3448 obj_req->ex.oe_off, obj_req->ex.oe_len, *result);
Alex Eldera9e8ba2c2013-04-21 00:32:07 -05003449 }
Ilya Dryomov0ad5d952019-05-14 20:45:38 +02003450 return done;
Alex Elder8b3e1a52013-01-24 16:13:36 -06003451}
3452
Ilya Dryomov0192ce22019-05-16 15:06:56 +02003453/*
3454 * This is open-coded in rbd_img_handle_request() to avoid parent chain
3455 * recursion.
3456 */
Ilya Dryomov54ab3b22019-05-11 16:21:49 +02003457static void rbd_obj_handle_request(struct rbd_obj_request *obj_req, int result)
Alex Elder8b3e1a52013-01-24 16:13:36 -06003458{
Ilya Dryomov0192ce22019-05-16 15:06:56 +02003459 if (__rbd_obj_handle_request(obj_req, &result))
3460 rbd_img_handle_request(obj_req->img_request, result);
Ilya Dryomov7114eda2018-02-01 11:50:47 +01003461}
Alex Elder8b3e1a52013-01-24 16:13:36 -06003462
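/*
 * Determine whether this image request needs the exclusive lock: on a
 * writable mapping with the feature enabled, writes always do, and
 * reads do only with lock_on_read or the object map feature.
 */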
Ilya Dryomove1fddc82019-05-30 16:07:48 +02003463static bool need_exclusive_lock(struct rbd_img_request *img_req)
Ilya Dryomov7114eda2018-02-01 11:50:47 +01003464{
Ilya Dryomove1fddc82019-05-30 16:07:48 +02003465 struct rbd_device *rbd_dev = img_req->rbd_dev;
3466
3467 if (!(rbd_dev->header.features & RBD_FEATURE_EXCLUSIVE_LOCK))
3468 return false;
3469
Ilya Dryomov3fe69922019-11-12 19:41:48 +01003470 if (rbd_is_ro(rbd_dev))
Ilya Dryomove1fddc82019-05-30 16:07:48 +02003471 return false;
3472
Ilya Dryomov7114eda2018-02-01 11:50:47 +01003473 rbd_assert(!test_bit(IMG_REQ_CHILD, &img_req->flags));
Ilya Dryomov22e8bd52019-06-05 19:25:11 +02003474 if (rbd_dev->opts->lock_on_read ||
3475 (rbd_dev->header.features & RBD_FEATURE_OBJECT_MAP))
Ilya Dryomove1fddc82019-05-30 16:07:48 +02003476 return true;
Alex Elder8b3e1a52013-01-24 16:13:36 -06003477
Ilya Dryomove1fddc82019-05-30 16:07:48 +02003478 return rbd_img_is_write(img_req);
Ilya Dryomov3da691b2018-01-29 14:04:08 +01003479}
Alex Elder8b3e1a52013-01-24 16:13:36 -06003480
Ilya Dryomov637cd062019-06-06 17:14:49 +02003481static bool rbd_lock_add_request(struct rbd_img_request *img_req)
Ilya Dryomov3da691b2018-01-29 14:04:08 +01003482{
Ilya Dryomove1fddc82019-05-30 16:07:48 +02003483 struct rbd_device *rbd_dev = img_req->rbd_dev;
Ilya Dryomov637cd062019-06-06 17:14:49 +02003484 bool locked;
Ilya Dryomove1fddc82019-05-30 16:07:48 +02003485
3486 lockdep_assert_held(&rbd_dev->lock_rwsem);
Ilya Dryomov637cd062019-06-06 17:14:49 +02003487 locked = rbd_dev->lock_state == RBD_LOCK_STATE_LOCKED;
Ilya Dryomove1fddc82019-05-30 16:07:48 +02003488 spin_lock(&rbd_dev->lock_lists_lock);
3489 rbd_assert(list_empty(&img_req->lock_item));
Ilya Dryomov637cd062019-06-06 17:14:49 +02003490 if (!locked)
3491 list_add_tail(&img_req->lock_item, &rbd_dev->acquiring_list);
3492 else
3493 list_add_tail(&img_req->lock_item, &rbd_dev->running_list);
Ilya Dryomove1fddc82019-05-30 16:07:48 +02003494 spin_unlock(&rbd_dev->lock_lists_lock);
Ilya Dryomov637cd062019-06-06 17:14:49 +02003495 return locked;
Ilya Dryomove1fddc82019-05-30 16:07:48 +02003496}
3497
3498static void rbd_lock_del_request(struct rbd_img_request *img_req)
3499{
3500 struct rbd_device *rbd_dev = img_req->rbd_dev;
3501 bool need_wakeup;
3502
3503 lockdep_assert_held(&rbd_dev->lock_rwsem);
3504 spin_lock(&rbd_dev->lock_lists_lock);
3505 rbd_assert(!list_empty(&img_req->lock_item));
3506 list_del_init(&img_req->lock_item);
3507 need_wakeup = (rbd_dev->lock_state == RBD_LOCK_STATE_RELEASING &&
3508 list_empty(&rbd_dev->running_list));
3509 spin_unlock(&rbd_dev->lock_lists_lock);
3510 if (need_wakeup)
3511 complete(&rbd_dev->releasing_wait);
3512}
3513
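/*
 * Return:
 *   0 - waiting for the lock (the request has been queued and will be
 *       kicked by wake_lock_waiters())
 *   1 - lock isn't needed or is already held
 *  <0 - error
 */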
Ilya Dryomov637cd062019-06-06 17:14:49 +02003514static int rbd_img_exclusive_lock(struct rbd_img_request *img_req)
3515{
3516 struct rbd_device *rbd_dev = img_req->rbd_dev;
3517
3518 if (!need_exclusive_lock(img_req))
3519 return 1;
3520
3521 if (rbd_lock_add_request(img_req))
3522 return 1;
3523
3524 if (rbd_dev->opts->exclusive) {
3525 WARN_ON(1); /* lock got released? */
3526 return -EROFS;
3527 }
3528
3529 /*
3530 * Note the use of mod_delayed_work() in rbd_acquire_lock()
3531 * and cancel_delayed_work() in wake_lock_waiters().
3532 */
3533 dout("%s rbd_dev %p queueing lock_dwork\n", __func__, rbd_dev);
3534 queue_delayed_work(rbd_dev->task_wq, &rbd_dev->lock_dwork, 0);
3535 return 0;
3536}
3537
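/*
 * Start the state machine for every object request.  Requests that
 * don't complete right away are accounted for in ->pending so that
 * rbd_img_advance() knows when they have all finished.
 */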
Ilya Dryomov0192ce22019-05-16 15:06:56 +02003538static void rbd_img_object_requests(struct rbd_img_request *img_req)
3539{
3540 struct rbd_obj_request *obj_req;
3541
3542 rbd_assert(!img_req->pending.result && !img_req->pending.num_pending);
3543
3544 for_each_obj_request(img_req, obj_req) {
3545 int result = 0;
3546
3547 if (__rbd_obj_handle_request(obj_req, &result)) {
3548 if (result) {
3549 img_req->pending.result = result;
3550 return;
3551 }
3552 } else {
3553 img_req->pending.num_pending++;
3554 }
3555 }
3556}
3557
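/*
 * Image request state machine: take the exclusive lock if needed, then
 * run the per-object state machines and wait for them to finish.
 */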
3558static bool rbd_img_advance(struct rbd_img_request *img_req, int *result)
3559{
Ilya Dryomov637cd062019-06-06 17:14:49 +02003560 struct rbd_device *rbd_dev = img_req->rbd_dev;
3561 int ret;
Ilya Dryomov7114eda2018-02-01 11:50:47 +01003562
3563again:
Ilya Dryomov0192ce22019-05-16 15:06:56 +02003564 switch (img_req->state) {
3565 case RBD_IMG_START:
3566 rbd_assert(!*result);
Ilya Dryomov3da691b2018-01-29 14:04:08 +01003567
Ilya Dryomov637cd062019-06-06 17:14:49 +02003568 ret = rbd_img_exclusive_lock(img_req);
3569 if (ret < 0) {
3570 *result = ret;
3571 return true;
3572 }
3573 img_req->state = RBD_IMG_EXCLUSIVE_LOCK;
3574 if (ret > 0)
3575 goto again;
3576 return false;
3577 case RBD_IMG_EXCLUSIVE_LOCK:
3578 if (*result)
3579 return true;
3580
3581 rbd_assert(!need_exclusive_lock(img_req) ||
3582 __rbd_is_lock_owner(rbd_dev));
3583
Ilya Dryomov0192ce22019-05-16 15:06:56 +02003584 rbd_img_object_requests(img_req);
3585 if (!img_req->pending.num_pending) {
3586 *result = img_req->pending.result;
3587 img_req->state = RBD_IMG_OBJECT_REQUESTS;
3588 goto again;
3589 }
3590 img_req->state = __RBD_IMG_OBJECT_REQUESTS;
3591 return false;
3592 case __RBD_IMG_OBJECT_REQUESTS:
3593 if (!pending_result_dec(&img_req->pending, result))
3594 return false;
Gustavo A. R. Silvadf561f662020-08-23 17:36:59 -05003595 fallthrough;
Ilya Dryomov0192ce22019-05-16 15:06:56 +02003596 case RBD_IMG_OBJECT_REQUESTS:
3597 return true;
3598 default:
3599 BUG();
Ilya Dryomov7114eda2018-02-01 11:50:47 +01003600 }
Ilya Dryomov0192ce22019-05-16 15:06:56 +02003601}
Ilya Dryomov7114eda2018-02-01 11:50:47 +01003602
Ilya Dryomov0192ce22019-05-16 15:06:56 +02003603/*
3604 * Return true if @img_req is completed.
3605 */
3606static bool __rbd_img_handle_request(struct rbd_img_request *img_req,
3607 int *result)
3608{
3609 struct rbd_device *rbd_dev = img_req->rbd_dev;
3610 bool done;
3611
Ilya Dryomove1fddc82019-05-30 16:07:48 +02003612 if (need_exclusive_lock(img_req)) {
3613 down_read(&rbd_dev->lock_rwsem);
3614 mutex_lock(&img_req->state_mutex);
3615 done = rbd_img_advance(img_req, result);
3616 if (done)
3617 rbd_lock_del_request(img_req);
3618 mutex_unlock(&img_req->state_mutex);
3619 up_read(&rbd_dev->lock_rwsem);
3620 } else {
3621 mutex_lock(&img_req->state_mutex);
3622 done = rbd_img_advance(img_req, result);
3623 mutex_unlock(&img_req->state_mutex);
Ilya Dryomov7114eda2018-02-01 11:50:47 +01003624 }
3625
Ilya Dryomov0192ce22019-05-16 15:06:56 +02003626 if (done && *result) {
3627 rbd_assert(*result < 0);
3628 rbd_warn(rbd_dev, "%s%s result %d",
3629 test_bit(IMG_REQ_CHILD, &img_req->flags) ? "child " : "",
3630 obj_op_name(img_req->op_type), *result);
3631 }
3632 return done;
3633}
3634
3635static void rbd_img_handle_request(struct rbd_img_request *img_req, int result)
3636{
3637again:
3638 if (!__rbd_img_handle_request(img_req, &result))
3639 return;
3640
Ilya Dryomov7114eda2018-02-01 11:50:47 +01003641 if (test_bit(IMG_REQ_CHILD, &img_req->flags)) {
Ilya Dryomov0192ce22019-05-16 15:06:56 +02003642 struct rbd_obj_request *obj_req = img_req->obj_request;
3643
Hannes Reinecke679a97d2020-01-31 11:37:36 +01003644 rbd_img_request_destroy(img_req);
Ilya Dryomov0192ce22019-05-16 15:06:56 +02003645 if (__rbd_obj_handle_request(obj_req, &result)) {
3646 img_req = obj_req->img_request;
3647 goto again;
3648 }
3649 } else {
Ilya Dryomov59e542c2020-02-12 15:23:58 +01003650 struct request *rq = blk_mq_rq_from_pdu(img_req);
Ilya Dryomov0192ce22019-05-16 15:06:56 +02003651
Hannes Reinecke679a97d2020-01-31 11:37:36 +01003652 rbd_img_request_destroy(img_req);
Ilya Dryomov0192ce22019-05-16 15:06:56 +02003653 blk_mq_end_request(rq, errno_to_blk_status(result));
Ilya Dryomov7114eda2018-02-01 11:50:47 +01003654 }
Alex Elder8b3e1a52013-01-24 16:13:36 -06003655}
3656
Ilya Dryomoved95b212016-08-12 16:40:02 +02003657static const struct rbd_client_id rbd_empty_cid;
3658
3659static bool rbd_cid_equal(const struct rbd_client_id *lhs,
3660 const struct rbd_client_id *rhs)
3661{
3662 return lhs->gid == rhs->gid && lhs->handle == rhs->handle;
3663}
3664
3665static struct rbd_client_id rbd_get_cid(struct rbd_device *rbd_dev)
3666{
3667 struct rbd_client_id cid;
3668
3669 mutex_lock(&rbd_dev->watch_mutex);
3670 cid.gid = ceph_client_gid(rbd_dev->rbd_client->client);
3671 cid.handle = rbd_dev->watch_cookie;
3672 mutex_unlock(&rbd_dev->watch_mutex);
3673 return cid;
3674}
3675
3676/*
3677 * lock_rwsem must be held for write
3678 */
3679static void rbd_set_owner_cid(struct rbd_device *rbd_dev,
3680 const struct rbd_client_id *cid)
3681{
3682 dout("%s rbd_dev %p %llu-%llu -> %llu-%llu\n", __func__, rbd_dev,
3683 rbd_dev->owner_cid.gid, rbd_dev->owner_cid.handle,
3684 cid->gid, cid->handle);
3685 rbd_dev->owner_cid = *cid; /* struct */
3686}
3687
3688static void format_lock_cookie(struct rbd_device *rbd_dev, char *buf)
3689{
3690 mutex_lock(&rbd_dev->watch_mutex);
3691 sprintf(buf, "%s %llu", RBD_LOCK_COOKIE_PREFIX, rbd_dev->watch_cookie);
3692 mutex_unlock(&rbd_dev->watch_mutex);
3693}
3694
Florian Margaineedd8ca82017-12-13 16:43:59 +01003695static void __rbd_lock(struct rbd_device *rbd_dev, const char *cookie)
3696{
3697 struct rbd_client_id cid = rbd_get_cid(rbd_dev);
3698
Ilya Dryomova2b1da02019-05-30 11:15:23 +02003699 rbd_dev->lock_state = RBD_LOCK_STATE_LOCKED;
Florian Margaineedd8ca82017-12-13 16:43:59 +01003700 strcpy(rbd_dev->lock_cookie, cookie);
3701 rbd_set_owner_cid(rbd_dev, &cid);
3702 queue_work(rbd_dev->task_wq, &rbd_dev->acquired_lock_work);
3703}
3704
Ilya Dryomoved95b212016-08-12 16:40:02 +02003705/*
3706 * lock_rwsem must be held for write
3707 */
3708static int rbd_lock(struct rbd_device *rbd_dev)
3709{
3710 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
Ilya Dryomoved95b212016-08-12 16:40:02 +02003711 char cookie[32];
3712 int ret;
3713
Ilya Dryomovcbbfb0f2017-04-13 12:17:38 +02003714 WARN_ON(__rbd_is_lock_owner(rbd_dev) ||
3715 rbd_dev->lock_cookie[0] != '\0');
Ilya Dryomoved95b212016-08-12 16:40:02 +02003716
3717 format_lock_cookie(rbd_dev, cookie);
3718 ret = ceph_cls_lock(osdc, &rbd_dev->header_oid, &rbd_dev->header_oloc,
3719 RBD_LOCK_NAME, CEPH_CLS_LOCK_EXCLUSIVE, cookie,
3720 RBD_LOCK_TAG, "", 0);
3721 if (ret)
3722 return ret;
3723
Florian Margaineedd8ca82017-12-13 16:43:59 +01003724 __rbd_lock(rbd_dev, cookie);
Ilya Dryomoved95b212016-08-12 16:40:02 +02003725 return 0;
3726}
3727
3728/*
3729 * lock_rwsem must be held for write
3730 */
Ilya Dryomovbbead742017-04-13 12:17:38 +02003731static void rbd_unlock(struct rbd_device *rbd_dev)
Ilya Dryomoved95b212016-08-12 16:40:02 +02003732{
3733 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
Ilya Dryomoved95b212016-08-12 16:40:02 +02003734 int ret;
3735
Ilya Dryomovcbbfb0f2017-04-13 12:17:38 +02003736 WARN_ON(!__rbd_is_lock_owner(rbd_dev) ||
3737 rbd_dev->lock_cookie[0] == '\0');
Ilya Dryomoved95b212016-08-12 16:40:02 +02003738
Ilya Dryomoved95b212016-08-12 16:40:02 +02003739 ret = ceph_cls_unlock(osdc, &rbd_dev->header_oid, &rbd_dev->header_oloc,
Ilya Dryomovcbbfb0f2017-04-13 12:17:38 +02003740 RBD_LOCK_NAME, rbd_dev->lock_cookie);
Ilya Dryomovbbead742017-04-13 12:17:38 +02003741 if (ret && ret != -ENOENT)
Ilya Dryomov637cd062019-06-06 17:14:49 +02003742 rbd_warn(rbd_dev, "failed to unlock header: %d", ret);
Ilya Dryomoved95b212016-08-12 16:40:02 +02003743
Ilya Dryomovbbead742017-04-13 12:17:38 +02003744	/* treat errors as if the image is unlocked */
3745 rbd_dev->lock_state = RBD_LOCK_STATE_UNLOCKED;
Ilya Dryomovcbbfb0f2017-04-13 12:17:38 +02003746 rbd_dev->lock_cookie[0] = '\0';
Ilya Dryomoved95b212016-08-12 16:40:02 +02003747 rbd_set_owner_cid(rbd_dev, &rbd_empty_cid);
3748 queue_work(rbd_dev->task_wq, &rbd_dev->released_lock_work);
Ilya Dryomoved95b212016-08-12 16:40:02 +02003749}
3750
3751static int __rbd_notify_op_lock(struct rbd_device *rbd_dev,
3752 enum rbd_notify_op notify_op,
3753 struct page ***preply_pages,
3754 size_t *preply_len)
3755{
3756 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
3757 struct rbd_client_id cid = rbd_get_cid(rbd_dev);
Kyle Spiers08a79102018-03-17 09:44:01 -07003758 char buf[4 + 8 + 8 + CEPH_ENCODING_START_BLK_LEN];
3759 int buf_size = sizeof(buf);
Ilya Dryomoved95b212016-08-12 16:40:02 +02003760 void *p = buf;
3761
3762 dout("%s rbd_dev %p notify_op %d\n", __func__, rbd_dev, notify_op);
3763
3764 /* encode *LockPayload NotifyMessage (op + ClientId) */
3765 ceph_start_encoding(&p, 2, 1, buf_size - CEPH_ENCODING_START_BLK_LEN);
3766 ceph_encode_32(&p, notify_op);
3767 ceph_encode_64(&p, cid.gid);
3768 ceph_encode_64(&p, cid.handle);
3769
3770 return ceph_osdc_notify(osdc, &rbd_dev->header_oid,
3771 &rbd_dev->header_oloc, buf, buf_size,
3772 RBD_NOTIFY_TIMEOUT, preply_pages, preply_len);
3773}
3774
3775static void rbd_notify_op_lock(struct rbd_device *rbd_dev,
3776 enum rbd_notify_op notify_op)
3777{
Ilya Dryomov8ae02992020-03-17 15:18:48 +01003778 __rbd_notify_op_lock(rbd_dev, notify_op, NULL, NULL);
Ilya Dryomoved95b212016-08-12 16:40:02 +02003779}
3780
3781static void rbd_notify_acquired_lock(struct work_struct *work)
3782{
3783 struct rbd_device *rbd_dev = container_of(work, struct rbd_device,
3784 acquired_lock_work);
3785
3786 rbd_notify_op_lock(rbd_dev, RBD_NOTIFY_OP_ACQUIRED_LOCK);
3787}
3788
3789static void rbd_notify_released_lock(struct work_struct *work)
3790{
3791 struct rbd_device *rbd_dev = container_of(work, struct rbd_device,
3792 released_lock_work);
3793
3794 rbd_notify_op_lock(rbd_dev, RBD_NOTIFY_OP_RELEASED_LOCK);
3795}
3796
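/*
 * Ask the current lock owner to release the lock: send a REQUEST_LOCK
 * notify and decode the owner's ResponseMessage.  The owner's reply
 * code (e.g. -EROFS if it refuses to release) is returned.
 */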
3797static int rbd_request_lock(struct rbd_device *rbd_dev)
3798{
3799 struct page **reply_pages;
3800 size_t reply_len;
3801 bool lock_owner_responded = false;
3802 int ret;
3803
3804 dout("%s rbd_dev %p\n", __func__, rbd_dev);
3805
3806 ret = __rbd_notify_op_lock(rbd_dev, RBD_NOTIFY_OP_REQUEST_LOCK,
3807 &reply_pages, &reply_len);
3808 if (ret && ret != -ETIMEDOUT) {
3809 rbd_warn(rbd_dev, "failed to request lock: %d", ret);
3810 goto out;
3811 }
3812
3813 if (reply_len > 0 && reply_len <= PAGE_SIZE) {
3814 void *p = page_address(reply_pages[0]);
3815 void *const end = p + reply_len;
3816 u32 n;
3817
3818 ceph_decode_32_safe(&p, end, n, e_inval); /* num_acks */
3819 while (n--) {
3820 u8 struct_v;
3821 u32 len;
3822
3823 ceph_decode_need(&p, end, 8 + 8, e_inval);
3824 p += 8 + 8; /* skip gid and cookie */
3825
3826 ceph_decode_32_safe(&p, end, len, e_inval);
3827 if (!len)
3828 continue;
3829
3830 if (lock_owner_responded) {
3831 rbd_warn(rbd_dev,
3832 "duplicate lock owners detected");
3833 ret = -EIO;
3834 goto out;
3835 }
3836
3837 lock_owner_responded = true;
3838 ret = ceph_start_decoding(&p, end, 1, "ResponseMessage",
3839 &struct_v, &len);
3840 if (ret) {
3841 rbd_warn(rbd_dev,
3842 "failed to decode ResponseMessage: %d",
3843 ret);
3844 goto e_inval;
3845 }
3846
3847 ret = ceph_decode_32(&p);
3848 }
3849 }
3850
3851 if (!lock_owner_responded) {
3852 rbd_warn(rbd_dev, "no lock owners detected");
3853 ret = -ETIMEDOUT;
3854 }
3855
3856out:
3857 ceph_release_page_vector(reply_pages, calc_pages_for(0, reply_len));
3858 return ret;
3859
3860e_inval:
3861 ret = -EINVAL;
3862 goto out;
3863}
3864
Ilya Dryomov637cd062019-06-06 17:14:49 +02003865/*
 3866 * Wake up lock waiters: either blocked image request state machine(s)
 3867 * or rbd_add_acquire_lock() (i.e. "rbd map").
3868 */
3869static void wake_lock_waiters(struct rbd_device *rbd_dev, int result)
Ilya Dryomoved95b212016-08-12 16:40:02 +02003870{
Ilya Dryomov637cd062019-06-06 17:14:49 +02003871 struct rbd_img_request *img_req;
3872
3873 dout("%s rbd_dev %p result %d\n", __func__, rbd_dev, result);
Linus Torvaldsd9b9c892019-07-18 11:05:25 -07003874 lockdep_assert_held_write(&rbd_dev->lock_rwsem);
Ilya Dryomoved95b212016-08-12 16:40:02 +02003875
3876 cancel_delayed_work(&rbd_dev->lock_dwork);
Ilya Dryomov637cd062019-06-06 17:14:49 +02003877 if (!completion_done(&rbd_dev->acquire_wait)) {
3878 rbd_assert(list_empty(&rbd_dev->acquiring_list) &&
3879 list_empty(&rbd_dev->running_list));
3880 rbd_dev->acquire_err = result;
3881 complete_all(&rbd_dev->acquire_wait);
3882 return;
3883 }
3884
3885 list_for_each_entry(img_req, &rbd_dev->acquiring_list, lock_item) {
3886 mutex_lock(&img_req->state_mutex);
3887 rbd_assert(img_req->state == RBD_IMG_EXCLUSIVE_LOCK);
3888 rbd_img_schedule(img_req, result);
3889 mutex_unlock(&img_req->state_mutex);
3890 }
3891
3892 list_splice_tail_init(&rbd_dev->acquiring_list, &rbd_dev->running_list);
Ilya Dryomoved95b212016-08-12 16:40:02 +02003893}
3894
3895static int get_lock_owner_info(struct rbd_device *rbd_dev,
3896 struct ceph_locker **lockers, u32 *num_lockers)
3897{
3898 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
3899 u8 lock_type;
3900 char *lock_tag;
3901 int ret;
3902
3903 dout("%s rbd_dev %p\n", __func__, rbd_dev);
3904
3905 ret = ceph_cls_lock_info(osdc, &rbd_dev->header_oid,
3906 &rbd_dev->header_oloc, RBD_LOCK_NAME,
3907 &lock_type, &lock_tag, lockers, num_lockers);
3908 if (ret)
3909 return ret;
3910
3911 if (*num_lockers == 0) {
3912 dout("%s rbd_dev %p no lockers detected\n", __func__, rbd_dev);
3913 goto out;
3914 }
3915
3916 if (strcmp(lock_tag, RBD_LOCK_TAG)) {
3917 rbd_warn(rbd_dev, "locked by external mechanism, tag %s",
3918 lock_tag);
3919 ret = -EBUSY;
3920 goto out;
3921 }
3922
3923 if (lock_type == CEPH_CLS_LOCK_SHARED) {
3924 rbd_warn(rbd_dev, "shared lock type detected");
3925 ret = -EBUSY;
3926 goto out;
3927 }
3928
3929 if (strncmp((*lockers)[0].id.cookie, RBD_LOCK_COOKIE_PREFIX,
3930 strlen(RBD_LOCK_COOKIE_PREFIX))) {
3931 rbd_warn(rbd_dev, "locked by external mechanism, cookie %s",
3932 (*lockers)[0].id.cookie);
3933 ret = -EBUSY;
3934 goto out;
3935 }
3936
3937out:
3938 kfree(lock_tag);
3939 return ret;
3940}
3941
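/*
 * Return 1 if the locker still has a watch established on the header
 * object (i.e. is presumed alive), 0 if not, <0 on error.
 */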
3942static int find_watcher(struct rbd_device *rbd_dev,
3943 const struct ceph_locker *locker)
3944{
3945 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
3946 struct ceph_watch_item *watchers;
3947 u32 num_watchers;
3948 u64 cookie;
3949 int i;
3950 int ret;
3951
3952 ret = ceph_osdc_list_watchers(osdc, &rbd_dev->header_oid,
3953 &rbd_dev->header_oloc, &watchers,
3954 &num_watchers);
3955 if (ret)
3956 return ret;
3957
3958 sscanf(locker->id.cookie, RBD_LOCK_COOKIE_PREFIX " %llu", &cookie);
3959 for (i = 0; i < num_watchers; i++) {
3960 if (!memcmp(&watchers[i].addr, &locker->info.addr,
3961 sizeof(locker->info.addr)) &&
3962 watchers[i].cookie == cookie) {
3963 struct rbd_client_id cid = {
3964 .gid = le64_to_cpu(watchers[i].name.num),
3965 .handle = cookie,
3966 };
3967
3968 dout("%s rbd_dev %p found cid %llu-%llu\n", __func__,
3969 rbd_dev, cid.gid, cid.handle);
3970 rbd_set_owner_cid(rbd_dev, &cid);
3971 ret = 1;
3972 goto out;
3973 }
3974 }
3975
3976 dout("%s rbd_dev %p no watchers\n", __func__, rbd_dev);
3977 ret = 0;
3978out:
3979 kfree(watchers);
3980 return ret;
3981}
3982
3983/*
3984 * lock_rwsem must be held for write
3985 */
3986static int rbd_try_lock(struct rbd_device *rbd_dev)
3987{
3988 struct ceph_client *client = rbd_dev->rbd_client->client;
3989 struct ceph_locker *lockers;
3990 u32 num_lockers;
3991 int ret;
3992
3993 for (;;) {
3994 ret = rbd_lock(rbd_dev);
3995 if (ret != -EBUSY)
3996 return ret;
3997
3998 /* determine if the current lock holder is still alive */
3999 ret = get_lock_owner_info(rbd_dev, &lockers, &num_lockers);
4000 if (ret)
4001 return ret;
4002
4003 if (num_lockers == 0)
4004 goto again;
4005
4006 ret = find_watcher(rbd_dev, lockers);
Ilya Dryomov637cd062019-06-06 17:14:49 +02004007 if (ret)
4008 goto out; /* request lock or error */
Ilya Dryomoved95b212016-08-12 16:40:02 +02004009
Ilya Dryomov22e8bd52019-06-05 19:25:11 +02004010 rbd_warn(rbd_dev, "breaking header lock owned by %s%llu",
Ilya Dryomoved95b212016-08-12 16:40:02 +02004011 ENTITY_NAME(lockers[0].id.name));
4012
4013 ret = ceph_monc_blacklist_add(&client->monc,
4014 &lockers[0].info.addr);
4015 if (ret) {
4016 rbd_warn(rbd_dev, "blacklist of %s%llu failed: %d",
4017 ENTITY_NAME(lockers[0].id.name), ret);
4018 goto out;
4019 }
4020
4021 ret = ceph_cls_break_lock(&client->osdc, &rbd_dev->header_oid,
4022 &rbd_dev->header_oloc, RBD_LOCK_NAME,
4023 lockers[0].id.cookie,
4024 &lockers[0].id.name);
4025 if (ret && ret != -ENOENT)
4026 goto out;
4027
4028again:
4029 ceph_free_lockers(lockers, num_lockers);
4030 }
4031
4032out:
4033 ceph_free_lockers(lockers, num_lockers);
4034 return ret;
4035}
4036
Ilya Dryomov22e8bd52019-06-05 19:25:11 +02004037static int rbd_post_acquire_action(struct rbd_device *rbd_dev)
Ilya Dryomoved95b212016-08-12 16:40:02 +02004038{
Ilya Dryomov22e8bd52019-06-05 19:25:11 +02004039 int ret;
4040
4041 if (rbd_dev->header.features & RBD_FEATURE_OBJECT_MAP) {
4042 ret = rbd_object_map_open(rbd_dev);
4043 if (ret)
4044 return ret;
4045 }
4046
4047 return 0;
4048}
4049
Ilya Dryomoved95b212016-08-12 16:40:02 +02004050/*
Ilya Dryomov637cd062019-06-06 17:14:49 +02004051 * Return:
4052 * 0 - lock acquired
4053 * 1 - caller should call rbd_request_lock()
4054 * <0 - error
Ilya Dryomoved95b212016-08-12 16:40:02 +02004055 */
Ilya Dryomov637cd062019-06-06 17:14:49 +02004056static int rbd_try_acquire_lock(struct rbd_device *rbd_dev)
Ilya Dryomoved95b212016-08-12 16:40:02 +02004057{
Ilya Dryomov637cd062019-06-06 17:14:49 +02004058 int ret;
Ilya Dryomoved95b212016-08-12 16:40:02 +02004059
4060 down_read(&rbd_dev->lock_rwsem);
4061 dout("%s rbd_dev %p read lock_state %d\n", __func__, rbd_dev,
4062 rbd_dev->lock_state);
4063 if (__rbd_is_lock_owner(rbd_dev)) {
Ilya Dryomoved95b212016-08-12 16:40:02 +02004064 up_read(&rbd_dev->lock_rwsem);
Ilya Dryomov637cd062019-06-06 17:14:49 +02004065 return 0;
Ilya Dryomoved95b212016-08-12 16:40:02 +02004066 }
4067
4068 up_read(&rbd_dev->lock_rwsem);
4069 down_write(&rbd_dev->lock_rwsem);
4070 dout("%s rbd_dev %p write lock_state %d\n", __func__, rbd_dev,
4071 rbd_dev->lock_state);
Ilya Dryomov637cd062019-06-06 17:14:49 +02004072 if (__rbd_is_lock_owner(rbd_dev)) {
4073 up_write(&rbd_dev->lock_rwsem);
4074 return 0;
Ilya Dryomoved95b212016-08-12 16:40:02 +02004075 }
4076
Ilya Dryomov637cd062019-06-06 17:14:49 +02004077 ret = rbd_try_lock(rbd_dev);
4078 if (ret < 0) {
4079 rbd_warn(rbd_dev, "failed to lock header: %d", ret);
4080 if (ret == -EBLACKLISTED)
4081 goto out;
4082
4083 ret = 1; /* request lock anyway */
4084 }
4085 if (ret > 0) {
4086 up_write(&rbd_dev->lock_rwsem);
4087 return ret;
4088 }
4089
4090 rbd_assert(rbd_dev->lock_state == RBD_LOCK_STATE_LOCKED);
4091 rbd_assert(list_empty(&rbd_dev->running_list));
4092
Ilya Dryomov22e8bd52019-06-05 19:25:11 +02004093 ret = rbd_post_acquire_action(rbd_dev);
4094 if (ret) {
4095 rbd_warn(rbd_dev, "post-acquire action failed: %d", ret);
4096 /*
4097 * Can't stay in RBD_LOCK_STATE_LOCKED because
4098 * rbd_lock_add_request() would let the request through,
4099 * assuming that e.g. object map is locked and loaded.
4100 */
4101 rbd_unlock(rbd_dev);
4102 }
4103
Ilya Dryomov637cd062019-06-06 17:14:49 +02004104out:
4105 wake_lock_waiters(rbd_dev, ret);
Ilya Dryomoved95b212016-08-12 16:40:02 +02004106 up_write(&rbd_dev->lock_rwsem);
Ilya Dryomov637cd062019-06-06 17:14:49 +02004107 return ret;
Ilya Dryomoved95b212016-08-12 16:40:02 +02004108}
4109
4110static void rbd_acquire_lock(struct work_struct *work)
4111{
4112 struct rbd_device *rbd_dev = container_of(to_delayed_work(work),
4113 struct rbd_device, lock_dwork);
Ilya Dryomov637cd062019-06-06 17:14:49 +02004114 int ret;
Ilya Dryomoved95b212016-08-12 16:40:02 +02004115
4116 dout("%s rbd_dev %p\n", __func__, rbd_dev);
4117again:
Ilya Dryomov637cd062019-06-06 17:14:49 +02004118 ret = rbd_try_acquire_lock(rbd_dev);
4119 if (ret <= 0) {
4120 dout("%s rbd_dev %p ret %d - done\n", __func__, rbd_dev, ret);
Ilya Dryomoved95b212016-08-12 16:40:02 +02004121 return;
4122 }
4123
4124 ret = rbd_request_lock(rbd_dev);
4125 if (ret == -ETIMEDOUT) {
4126 goto again; /* treat this as a dead client */
Ilya Dryomove010dd02017-04-13 12:17:39 +02004127 } else if (ret == -EROFS) {
4128 rbd_warn(rbd_dev, "peer will not release lock");
Ilya Dryomov637cd062019-06-06 17:14:49 +02004129 down_write(&rbd_dev->lock_rwsem);
4130 wake_lock_waiters(rbd_dev, ret);
4131 up_write(&rbd_dev->lock_rwsem);
Ilya Dryomoved95b212016-08-12 16:40:02 +02004132 } else if (ret < 0) {
4133 rbd_warn(rbd_dev, "error requesting lock: %d", ret);
4134 mod_delayed_work(rbd_dev->task_wq, &rbd_dev->lock_dwork,
4135 RBD_RETRY_DELAY);
4136 } else {
4137 /*
4138 * lock owner acked, but resend if we don't see them
4139 * release the lock
4140 */
Colin Ian King6b0a8772019-11-07 22:36:46 +00004141 dout("%s rbd_dev %p requeuing lock_dwork\n", __func__,
Ilya Dryomoved95b212016-08-12 16:40:02 +02004142 rbd_dev);
4143 mod_delayed_work(rbd_dev->task_wq, &rbd_dev->lock_dwork,
4144 msecs_to_jiffies(2 * RBD_NOTIFY_TIMEOUT * MSEC_PER_SEC));
4145 }
4146}
4147
Ilya Dryomova2b1da02019-05-30 11:15:23 +02004148static bool rbd_quiesce_lock(struct rbd_device *rbd_dev)
Ilya Dryomoved95b212016-08-12 16:40:02 +02004149{
Ilya Dryomove1fddc82019-05-30 16:07:48 +02004150 bool need_wait;
4151
Ilya Dryomova2b1da02019-05-30 11:15:23 +02004152 dout("%s rbd_dev %p\n", __func__, rbd_dev);
Linus Torvaldsd9b9c892019-07-18 11:05:25 -07004153 lockdep_assert_held_write(&rbd_dev->lock_rwsem);
Ilya Dryomova2b1da02019-05-30 11:15:23 +02004154
Ilya Dryomoved95b212016-08-12 16:40:02 +02004155 if (rbd_dev->lock_state != RBD_LOCK_STATE_LOCKED)
4156 return false;
4157
Ilya Dryomoved95b212016-08-12 16:40:02 +02004158 /*
4159 * Ensure that all in-flight IO is flushed.
Ilya Dryomoved95b212016-08-12 16:40:02 +02004160 */
Ilya Dryomove1fddc82019-05-30 16:07:48 +02004161 rbd_dev->lock_state = RBD_LOCK_STATE_RELEASING;
4162 rbd_assert(!completion_done(&rbd_dev->releasing_wait));
4163 need_wait = !list_empty(&rbd_dev->running_list);
4164 downgrade_write(&rbd_dev->lock_rwsem);
4165 if (need_wait)
4166 wait_for_completion(&rbd_dev->releasing_wait);
Ilya Dryomoved95b212016-08-12 16:40:02 +02004167 up_read(&rbd_dev->lock_rwsem);
4168
4169 down_write(&rbd_dev->lock_rwsem);
Ilya Dryomoved95b212016-08-12 16:40:02 +02004170 if (rbd_dev->lock_state != RBD_LOCK_STATE_RELEASING)
4171 return false;
4172
Ilya Dryomove1fddc82019-05-30 16:07:48 +02004173 rbd_assert(list_empty(&rbd_dev->running_list));
Ilya Dryomova2b1da02019-05-30 11:15:23 +02004174 return true;
4175}
4176
Ilya Dryomov22e8bd52019-06-05 19:25:11 +02004177static void rbd_pre_release_action(struct rbd_device *rbd_dev)
4178{
4179 if (rbd_dev->header.features & RBD_FEATURE_OBJECT_MAP)
4180 rbd_object_map_close(rbd_dev);
4181}
4182
Ilya Dryomove1fddc82019-05-30 16:07:48 +02004183static void __rbd_release_lock(struct rbd_device *rbd_dev)
4184{
4185 rbd_assert(list_empty(&rbd_dev->running_list));
4186
Ilya Dryomov22e8bd52019-06-05 19:25:11 +02004187 rbd_pre_release_action(rbd_dev);
Ilya Dryomovbbead742017-04-13 12:17:38 +02004188 rbd_unlock(rbd_dev);
Ilya Dryomove1fddc82019-05-30 16:07:48 +02004189}
4190
Ilya Dryomova2b1da02019-05-30 11:15:23 +02004191/*
4192 * lock_rwsem must be held for write
4193 */
4194static void rbd_release_lock(struct rbd_device *rbd_dev)
4195{
4196 if (!rbd_quiesce_lock(rbd_dev))
4197 return;
4198
Ilya Dryomove1fddc82019-05-30 16:07:48 +02004199 __rbd_release_lock(rbd_dev);
Ilya Dryomova2b1da02019-05-30 11:15:23 +02004200
Ilya Dryomovbbead742017-04-13 12:17:38 +02004201 /*
 4202	 * Give others a chance to grab the lock - otherwise we would
Ilya Dryomov637cd062019-06-06 17:14:49 +02004203	 * re-acquire it almost immediately if we got new IO while draining
 4204	 * the running list. We need to ack our own notifications, so this
4205 * lock_dwork will be requeued from rbd_handle_released_lock() by
4206 * way of maybe_kick_acquire().
Ilya Dryomovbbead742017-04-13 12:17:38 +02004207 */
4208 cancel_delayed_work(&rbd_dev->lock_dwork);
Ilya Dryomoved95b212016-08-12 16:40:02 +02004209}
4210
4211static void rbd_release_lock_work(struct work_struct *work)
4212{
4213 struct rbd_device *rbd_dev = container_of(work, struct rbd_device,
4214 unlock_work);
4215
4216 down_write(&rbd_dev->lock_rwsem);
4217 rbd_release_lock(rbd_dev);
4218 up_write(&rbd_dev->lock_rwsem);
4219}
4220
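/*
 * If we are not the lock owner and there are requests waiting to
 * acquire the lock (or an acquire attempt is already pending), kick
 * lock_dwork so the lock is (re)acquired promptly.
 */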
Ilya Dryomov637cd062019-06-06 17:14:49 +02004221static void maybe_kick_acquire(struct rbd_device *rbd_dev)
4222{
4223 bool have_requests;
4224
4225 dout("%s rbd_dev %p\n", __func__, rbd_dev);
4226 if (__rbd_is_lock_owner(rbd_dev))
4227 return;
4228
4229 spin_lock(&rbd_dev->lock_lists_lock);
4230 have_requests = !list_empty(&rbd_dev->acquiring_list);
4231 spin_unlock(&rbd_dev->lock_lists_lock);
4232 if (have_requests || delayed_work_pending(&rbd_dev->lock_dwork)) {
4233 dout("%s rbd_dev %p kicking lock_dwork\n", __func__, rbd_dev);
4234 mod_delayed_work(rbd_dev->task_wq, &rbd_dev->lock_dwork, 0);
4235 }
4236}
4237
Ilya Dryomoved95b212016-08-12 16:40:02 +02004238static void rbd_handle_acquired_lock(struct rbd_device *rbd_dev, u8 struct_v,
4239 void **p)
4240{
4241 struct rbd_client_id cid = { 0 };
4242
4243 if (struct_v >= 2) {
4244 cid.gid = ceph_decode_64(p);
4245 cid.handle = ceph_decode_64(p);
4246 }
4247
4248 dout("%s rbd_dev %p cid %llu-%llu\n", __func__, rbd_dev, cid.gid,
4249 cid.handle);
4250 if (!rbd_cid_equal(&cid, &rbd_empty_cid)) {
4251 down_write(&rbd_dev->lock_rwsem);
4252 if (rbd_cid_equal(&cid, &rbd_dev->owner_cid)) {
4253 /*
4254 * we already know that the remote client is
4255 * the owner
4256 */
4257 up_write(&rbd_dev->lock_rwsem);
4258 return;
4259 }
4260
4261 rbd_set_owner_cid(rbd_dev, &cid);
4262 downgrade_write(&rbd_dev->lock_rwsem);
4263 } else {
4264 down_read(&rbd_dev->lock_rwsem);
4265 }
4266
Ilya Dryomov637cd062019-06-06 17:14:49 +02004267 maybe_kick_acquire(rbd_dev);
Ilya Dryomoved95b212016-08-12 16:40:02 +02004268 up_read(&rbd_dev->lock_rwsem);
4269}
4270
4271static void rbd_handle_released_lock(struct rbd_device *rbd_dev, u8 struct_v,
4272 void **p)
4273{
4274 struct rbd_client_id cid = { 0 };
4275
4276 if (struct_v >= 2) {
4277 cid.gid = ceph_decode_64(p);
4278 cid.handle = ceph_decode_64(p);
4279 }
4280
4281 dout("%s rbd_dev %p cid %llu-%llu\n", __func__, rbd_dev, cid.gid,
4282 cid.handle);
4283 if (!rbd_cid_equal(&cid, &rbd_empty_cid)) {
4284 down_write(&rbd_dev->lock_rwsem);
4285 if (!rbd_cid_equal(&cid, &rbd_dev->owner_cid)) {
4286 dout("%s rbd_dev %p unexpected owner, cid %llu-%llu != owner_cid %llu-%llu\n",
4287 __func__, rbd_dev, cid.gid, cid.handle,
4288 rbd_dev->owner_cid.gid, rbd_dev->owner_cid.handle);
4289 up_write(&rbd_dev->lock_rwsem);
4290 return;
4291 }
4292
4293 rbd_set_owner_cid(rbd_dev, &rbd_empty_cid);
4294 downgrade_write(&rbd_dev->lock_rwsem);
4295 } else {
4296 down_read(&rbd_dev->lock_rwsem);
4297 }
4298
Ilya Dryomov637cd062019-06-06 17:14:49 +02004299 maybe_kick_acquire(rbd_dev);
Ilya Dryomoved95b212016-08-12 16:40:02 +02004300 up_read(&rbd_dev->lock_rwsem);
4301}
4302
Ilya Dryomov3b77faa2017-04-13 12:17:39 +02004303/*
4304 * Returns result for ResponseMessage to be encoded (<= 0), or 1 if no
4305 * ResponseMessage is needed.
4306 */
4307static int rbd_handle_request_lock(struct rbd_device *rbd_dev, u8 struct_v,
4308 void **p)
Ilya Dryomoved95b212016-08-12 16:40:02 +02004309{
4310 struct rbd_client_id my_cid = rbd_get_cid(rbd_dev);
4311 struct rbd_client_id cid = { 0 };
Ilya Dryomov3b77faa2017-04-13 12:17:39 +02004312 int result = 1;
Ilya Dryomoved95b212016-08-12 16:40:02 +02004313
4314 if (struct_v >= 2) {
4315 cid.gid = ceph_decode_64(p);
4316 cid.handle = ceph_decode_64(p);
4317 }
4318
4319 dout("%s rbd_dev %p cid %llu-%llu\n", __func__, rbd_dev, cid.gid,
4320 cid.handle);
4321 if (rbd_cid_equal(&cid, &my_cid))
Ilya Dryomov3b77faa2017-04-13 12:17:39 +02004322 return result;
Ilya Dryomoved95b212016-08-12 16:40:02 +02004323
4324 down_read(&rbd_dev->lock_rwsem);
Ilya Dryomov3b77faa2017-04-13 12:17:39 +02004325 if (__rbd_is_lock_owner(rbd_dev)) {
4326 if (rbd_dev->lock_state == RBD_LOCK_STATE_LOCKED &&
4327 rbd_cid_equal(&rbd_dev->owner_cid, &rbd_empty_cid))
4328 goto out_unlock;
4329
4330 /*
4331 * encode ResponseMessage(0) so the peer can detect
4332 * a missing owner
4333 */
4334 result = 0;
4335
4336 if (rbd_dev->lock_state == RBD_LOCK_STATE_LOCKED) {
Ilya Dryomove010dd02017-04-13 12:17:39 +02004337 if (!rbd_dev->opts->exclusive) {
4338 dout("%s rbd_dev %p queueing unlock_work\n",
4339 __func__, rbd_dev);
4340 queue_work(rbd_dev->task_wq,
4341 &rbd_dev->unlock_work);
4342 } else {
4343 /* refuse to release the lock */
4344 result = -EROFS;
4345 }
Ilya Dryomoved95b212016-08-12 16:40:02 +02004346 }
4347 }
Ilya Dryomov3b77faa2017-04-13 12:17:39 +02004348
4349out_unlock:
Ilya Dryomoved95b212016-08-12 16:40:02 +02004350 up_read(&rbd_dev->lock_rwsem);
Ilya Dryomov3b77faa2017-04-13 12:17:39 +02004351 return result;
Ilya Dryomoved95b212016-08-12 16:40:02 +02004352}
4353
4354static void __rbd_acknowledge_notify(struct rbd_device *rbd_dev,
4355 u64 notify_id, u64 cookie, s32 *result)
4356{
4357 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
Kyle Spiers08a79102018-03-17 09:44:01 -07004358 char buf[4 + CEPH_ENCODING_START_BLK_LEN];
4359 int buf_size = sizeof(buf);
Ilya Dryomoved95b212016-08-12 16:40:02 +02004360 int ret;
4361
4362 if (result) {
4363 void *p = buf;
4364
4365 /* encode ResponseMessage */
4366 ceph_start_encoding(&p, 1, 1,
4367 buf_size - CEPH_ENCODING_START_BLK_LEN);
4368 ceph_encode_32(&p, *result);
4369 } else {
4370 buf_size = 0;
4371 }
4372
4373 ret = ceph_osdc_notify_ack(osdc, &rbd_dev->header_oid,
4374 &rbd_dev->header_oloc, notify_id, cookie,
4375 buf, buf_size);
4376 if (ret)
4377 rbd_warn(rbd_dev, "acknowledge_notify failed: %d", ret);
4378}
4379
4380static void rbd_acknowledge_notify(struct rbd_device *rbd_dev, u64 notify_id,
4381 u64 cookie)
4382{
4383 dout("%s rbd_dev %p\n", __func__, rbd_dev);
4384 __rbd_acknowledge_notify(rbd_dev, notify_id, cookie, NULL);
4385}
4386
4387static void rbd_acknowledge_notify_result(struct rbd_device *rbd_dev,
4388 u64 notify_id, u64 cookie, s32 result)
4389{
4390 dout("%s rbd_dev %p result %d\n", __func__, rbd_dev, result);
4391 __rbd_acknowledge_notify(rbd_dev, notify_id, cookie, &result);
4392}
Ilya Dryomov922dab62016-05-26 01:15:02 +02004393
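/*
 * Watch callback for the header object.  Decodes the NotifyMessage (if
 * any) and dispatches on the notify op: acquired/released/request lock
 * notifications and header updates are handled here.
 */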
4394static void rbd_watch_cb(void *arg, u64 notify_id, u64 cookie,
4395 u64 notifier_id, void *data, size_t data_len)
Alex Elderb8d70032012-11-30 17:53:04 -06004396{
Ilya Dryomov922dab62016-05-26 01:15:02 +02004397 struct rbd_device *rbd_dev = arg;
Ilya Dryomoved95b212016-08-12 16:40:02 +02004398 void *p = data;
4399 void *const end = p + data_len;
Ilya Dryomovd4c22692016-09-06 11:15:48 +02004400 u8 struct_v = 0;
Ilya Dryomoved95b212016-08-12 16:40:02 +02004401 u32 len;
4402 u32 notify_op;
Alex Elderb8d70032012-11-30 17:53:04 -06004403 int ret;
4404
Ilya Dryomoved95b212016-08-12 16:40:02 +02004405 dout("%s rbd_dev %p cookie %llu notify_id %llu data_len %zu\n",
4406 __func__, rbd_dev, cookie, notify_id, data_len);
4407 if (data_len) {
4408 ret = ceph_start_decoding(&p, end, 1, "NotifyMessage",
4409 &struct_v, &len);
4410 if (ret) {
4411 rbd_warn(rbd_dev, "failed to decode NotifyMessage: %d",
4412 ret);
4413 return;
4414 }
Ilya Dryomov52bb1f92014-07-23 17:11:20 +04004415
Ilya Dryomoved95b212016-08-12 16:40:02 +02004416 notify_op = ceph_decode_32(&p);
4417 } else {
4418 /* legacy notification for header updates */
4419 notify_op = RBD_NOTIFY_OP_HEADER_UPDATE;
4420 len = 0;
4421 }
Alex Elderb8d70032012-11-30 17:53:04 -06004422
Ilya Dryomoved95b212016-08-12 16:40:02 +02004423 dout("%s rbd_dev %p notify_op %u\n", __func__, rbd_dev, notify_op);
4424 switch (notify_op) {
4425 case RBD_NOTIFY_OP_ACQUIRED_LOCK:
4426 rbd_handle_acquired_lock(rbd_dev, struct_v, &p);
4427 rbd_acknowledge_notify(rbd_dev, notify_id, cookie);
4428 break;
4429 case RBD_NOTIFY_OP_RELEASED_LOCK:
4430 rbd_handle_released_lock(rbd_dev, struct_v, &p);
4431 rbd_acknowledge_notify(rbd_dev, notify_id, cookie);
4432 break;
4433 case RBD_NOTIFY_OP_REQUEST_LOCK:
Ilya Dryomov3b77faa2017-04-13 12:17:39 +02004434 ret = rbd_handle_request_lock(rbd_dev, struct_v, &p);
4435 if (ret <= 0)
Ilya Dryomoved95b212016-08-12 16:40:02 +02004436 rbd_acknowledge_notify_result(rbd_dev, notify_id,
Ilya Dryomov3b77faa2017-04-13 12:17:39 +02004437 cookie, ret);
Ilya Dryomoved95b212016-08-12 16:40:02 +02004438 else
4439 rbd_acknowledge_notify(rbd_dev, notify_id, cookie);
4440 break;
4441 case RBD_NOTIFY_OP_HEADER_UPDATE:
4442 ret = rbd_dev_refresh(rbd_dev);
4443 if (ret)
4444 rbd_warn(rbd_dev, "refresh failed: %d", ret);
4445
4446 rbd_acknowledge_notify(rbd_dev, notify_id, cookie);
4447 break;
4448 default:
4449 if (rbd_is_lock_owner(rbd_dev))
4450 rbd_acknowledge_notify_result(rbd_dev, notify_id,
4451 cookie, -EOPNOTSUPP);
4452 else
4453 rbd_acknowledge_notify(rbd_dev, notify_id, cookie);
4454 break;
4455 }
Alex Elderb8d70032012-11-30 17:53:04 -06004456}
4457
Ilya Dryomov99d16942016-08-12 16:11:41 +02004458static void __rbd_unregister_watch(struct rbd_device *rbd_dev);
4459
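/*
 * Watch error callback: the watch on the header object was lost.  Clear
 * the owner cid and, if the watch was registered, tear it down and
 * schedule rbd_reregister_watch() to re-establish it.
 */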
Ilya Dryomov922dab62016-05-26 01:15:02 +02004460static void rbd_watch_errcb(void *arg, u64 cookie, int err)
Ilya Dryomovbb040aa2014-06-19 11:38:14 +04004461{
Ilya Dryomov922dab62016-05-26 01:15:02 +02004462 struct rbd_device *rbd_dev = arg;
Ilya Dryomovbb040aa2014-06-19 11:38:14 +04004463
Ilya Dryomov922dab62016-05-26 01:15:02 +02004464 rbd_warn(rbd_dev, "encountered watch error: %d", err);
Ilya Dryomovbb040aa2014-06-19 11:38:14 +04004465
Ilya Dryomoved95b212016-08-12 16:40:02 +02004466 down_write(&rbd_dev->lock_rwsem);
4467 rbd_set_owner_cid(rbd_dev, &rbd_empty_cid);
4468 up_write(&rbd_dev->lock_rwsem);
Ilya Dryomovbb040aa2014-06-19 11:38:14 +04004469
Ilya Dryomov99d16942016-08-12 16:11:41 +02004470 mutex_lock(&rbd_dev->watch_mutex);
4471 if (rbd_dev->watch_state == RBD_WATCH_STATE_REGISTERED) {
4472 __rbd_unregister_watch(rbd_dev);
4473 rbd_dev->watch_state = RBD_WATCH_STATE_ERROR;
Ilya Dryomovbb040aa2014-06-19 11:38:14 +04004474
Ilya Dryomov99d16942016-08-12 16:11:41 +02004475 queue_delayed_work(rbd_dev->task_wq, &rbd_dev->watch_dwork, 0);
Ilya Dryomovbb040aa2014-06-19 11:38:14 +04004476 }
Ilya Dryomov99d16942016-08-12 16:11:41 +02004477 mutex_unlock(&rbd_dev->watch_mutex);
Ilya Dryomovbb040aa2014-06-19 11:38:14 +04004478}
4479
4480/*
Ilya Dryomov99d16942016-08-12 16:11:41 +02004481 * watch_mutex must be locked
Alex Elder9969ebc2013-01-18 12:31:10 -06004482 */
Ilya Dryomov99d16942016-08-12 16:11:41 +02004483static int __rbd_register_watch(struct rbd_device *rbd_dev)
Alex Elder9969ebc2013-01-18 12:31:10 -06004484{
4485 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
Ilya Dryomov922dab62016-05-26 01:15:02 +02004486 struct ceph_osd_linger_request *handle;
Alex Elder9969ebc2013-01-18 12:31:10 -06004487
Ilya Dryomov922dab62016-05-26 01:15:02 +02004488 rbd_assert(!rbd_dev->watch_handle);
Ilya Dryomov99d16942016-08-12 16:11:41 +02004489 dout("%s rbd_dev %p\n", __func__, rbd_dev);
Alex Elder9969ebc2013-01-18 12:31:10 -06004490
Ilya Dryomov922dab62016-05-26 01:15:02 +02004491 handle = ceph_osdc_watch(osdc, &rbd_dev->header_oid,
4492 &rbd_dev->header_oloc, rbd_watch_cb,
4493 rbd_watch_errcb, rbd_dev);
4494 if (IS_ERR(handle))
4495 return PTR_ERR(handle);
Alex Elder9969ebc2013-01-18 12:31:10 -06004496
Ilya Dryomov922dab62016-05-26 01:15:02 +02004497 rbd_dev->watch_handle = handle;
Ilya Dryomovb30a01f2014-05-22 19:28:52 +04004498 return 0;
Alex Elder9969ebc2013-01-18 12:31:10 -06004499}
4500
Ilya Dryomov99d16942016-08-12 16:11:41 +02004501/*
4502 * watch_mutex must be locked
4503 */
4504static void __rbd_unregister_watch(struct rbd_device *rbd_dev)
Ilya Dryomovfca27062013-12-16 18:02:40 +02004505{
Ilya Dryomov922dab62016-05-26 01:15:02 +02004506 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
4507 int ret;
Ilya Dryomovb30a01f2014-05-22 19:28:52 +04004508
Ilya Dryomov99d16942016-08-12 16:11:41 +02004509 rbd_assert(rbd_dev->watch_handle);
4510 dout("%s rbd_dev %p\n", __func__, rbd_dev);
Ilya Dryomovb30a01f2014-05-22 19:28:52 +04004511
Ilya Dryomov922dab62016-05-26 01:15:02 +02004512 ret = ceph_osdc_unwatch(osdc, rbd_dev->watch_handle);
4513 if (ret)
4514 rbd_warn(rbd_dev, "failed to unwatch: %d", ret);
Ilya Dryomovb30a01f2014-05-22 19:28:52 +04004515
Ilya Dryomov922dab62016-05-26 01:15:02 +02004516 rbd_dev->watch_handle = NULL;
Ilya Dryomovc525f032016-04-28 16:07:26 +02004517}
4518
Ilya Dryomov99d16942016-08-12 16:11:41 +02004519static int rbd_register_watch(struct rbd_device *rbd_dev)
Ilya Dryomovc525f032016-04-28 16:07:26 +02004520{
Ilya Dryomov99d16942016-08-12 16:11:41 +02004521 int ret;
Ilya Dryomov811c6682016-04-15 16:22:16 +02004522
Ilya Dryomov99d16942016-08-12 16:11:41 +02004523 mutex_lock(&rbd_dev->watch_mutex);
4524 rbd_assert(rbd_dev->watch_state == RBD_WATCH_STATE_UNREGISTERED);
4525 ret = __rbd_register_watch(rbd_dev);
4526 if (ret)
4527 goto out;
4528
4529 rbd_dev->watch_state = RBD_WATCH_STATE_REGISTERED;
4530 rbd_dev->watch_cookie = rbd_dev->watch_handle->linger_id;
4531
4532out:
4533 mutex_unlock(&rbd_dev->watch_mutex);
4534 return ret;
4535}
4536
4537static void cancel_tasks_sync(struct rbd_device *rbd_dev)
4538{
4539 dout("%s rbd_dev %p\n", __func__, rbd_dev);
4540
Ilya Dryomoved95b212016-08-12 16:40:02 +02004541 cancel_work_sync(&rbd_dev->acquired_lock_work);
4542 cancel_work_sync(&rbd_dev->released_lock_work);
4543 cancel_delayed_work_sync(&rbd_dev->lock_dwork);
4544 cancel_work_sync(&rbd_dev->unlock_work);
Ilya Dryomov99d16942016-08-12 16:11:41 +02004545}
4546
Ilya Dryomov0e4e1de52020-03-13 11:20:51 +01004547/*
4548 * header_rwsem must not be held to avoid a deadlock with
4549 * rbd_dev_refresh() when flushing notifies.
4550 */
Ilya Dryomov99d16942016-08-12 16:11:41 +02004551static void rbd_unregister_watch(struct rbd_device *rbd_dev)
4552{
4553 cancel_tasks_sync(rbd_dev);
4554
4555 mutex_lock(&rbd_dev->watch_mutex);
4556 if (rbd_dev->watch_state == RBD_WATCH_STATE_REGISTERED)
4557 __rbd_unregister_watch(rbd_dev);
4558 rbd_dev->watch_state = RBD_WATCH_STATE_UNREGISTERED;
4559 mutex_unlock(&rbd_dev->watch_mutex);
4560
Dongsheng Yang23edca82018-06-04 06:24:37 -04004561 cancel_delayed_work_sync(&rbd_dev->watch_dwork);
Ilya Dryomov811c6682016-04-15 16:22:16 +02004562 ceph_osdc_flush_notifies(&rbd_dev->rbd_client->client->osdc);
Ilya Dryomovfca27062013-12-16 18:02:40 +02004563}
4564
Ilya Dryomov14bb2112017-04-13 12:17:38 +02004565/*
4566 * lock_rwsem must be held for write
4567 */
4568static void rbd_reacquire_lock(struct rbd_device *rbd_dev)
4569{
4570 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
4571 char cookie[32];
4572 int ret;
4573
Ilya Dryomova2b1da02019-05-30 11:15:23 +02004574 if (!rbd_quiesce_lock(rbd_dev))
4575 return;
Ilya Dryomov14bb2112017-04-13 12:17:38 +02004576
4577 format_lock_cookie(rbd_dev, cookie);
4578 ret = ceph_cls_set_cookie(osdc, &rbd_dev->header_oid,
4579 &rbd_dev->header_oloc, RBD_LOCK_NAME,
4580 CEPH_CLS_LOCK_EXCLUSIVE, rbd_dev->lock_cookie,
4581 RBD_LOCK_TAG, cookie);
4582 if (ret) {
4583 if (ret != -EOPNOTSUPP)
4584 rbd_warn(rbd_dev, "failed to update lock cookie: %d",
4585 ret);
4586
4587 /*
4588 * Lock cookie cannot be updated on older OSDs, so do
4589 * a manual release and queue an acquire.
4590 */
Ilya Dryomove1fddc82019-05-30 16:07:48 +02004591 __rbd_release_lock(rbd_dev);
Ilya Dryomova2b1da02019-05-30 11:15:23 +02004592 queue_delayed_work(rbd_dev->task_wq, &rbd_dev->lock_dwork, 0);
Ilya Dryomov14bb2112017-04-13 12:17:38 +02004593 } else {
Florian Margaineedd8ca82017-12-13 16:43:59 +01004594 __rbd_lock(rbd_dev, cookie);
Ilya Dryomov637cd062019-06-06 17:14:49 +02004595 wake_lock_waiters(rbd_dev, 0);
Ilya Dryomov14bb2112017-04-13 12:17:38 +02004596 }
4597}
4598
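/*
 * Delayed work that re-establishes the watch after an error.  On
 * success, reacquire the exclusive lock if we were holding it and
 * refresh the header; on a permanent error, wake lock waiters with the
 * error.
 */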
Ilya Dryomov99d16942016-08-12 16:11:41 +02004599static void rbd_reregister_watch(struct work_struct *work)
4600{
4601 struct rbd_device *rbd_dev = container_of(to_delayed_work(work),
4602 struct rbd_device, watch_dwork);
4603 int ret;
4604
4605 dout("%s rbd_dev %p\n", __func__, rbd_dev);
4606
4607 mutex_lock(&rbd_dev->watch_mutex);
Ilya Dryomov87c0fde2016-09-29 13:41:05 +02004608 if (rbd_dev->watch_state != RBD_WATCH_STATE_ERROR) {
4609 mutex_unlock(&rbd_dev->watch_mutex);
Ilya Dryomov14bb2112017-04-13 12:17:38 +02004610 return;
Ilya Dryomov87c0fde2016-09-29 13:41:05 +02004611 }
Ilya Dryomov99d16942016-08-12 16:11:41 +02004612
4613 ret = __rbd_register_watch(rbd_dev);
4614 if (ret) {
4615 rbd_warn(rbd_dev, "failed to reregister watch: %d", ret);
Ilya Dryomov637cd062019-06-06 17:14:49 +02004616 if (ret != -EBLACKLISTED && ret != -ENOENT) {
Ilya Dryomov99d16942016-08-12 16:11:41 +02004617 queue_delayed_work(rbd_dev->task_wq,
4618 &rbd_dev->watch_dwork,
4619 RBD_RETRY_DELAY);
Ilya Dryomov637cd062019-06-06 17:14:49 +02004620 mutex_unlock(&rbd_dev->watch_mutex);
4621 return;
Ilya Dryomov87c0fde2016-09-29 13:41:05 +02004622 }
Ilya Dryomov637cd062019-06-06 17:14:49 +02004623
Ilya Dryomov87c0fde2016-09-29 13:41:05 +02004624 mutex_unlock(&rbd_dev->watch_mutex);
Ilya Dryomov637cd062019-06-06 17:14:49 +02004625 down_write(&rbd_dev->lock_rwsem);
4626 wake_lock_waiters(rbd_dev, ret);
4627 up_write(&rbd_dev->lock_rwsem);
Ilya Dryomov14bb2112017-04-13 12:17:38 +02004628 return;
Ilya Dryomov99d16942016-08-12 16:11:41 +02004629 }
4630
4631 rbd_dev->watch_state = RBD_WATCH_STATE_REGISTERED;
4632 rbd_dev->watch_cookie = rbd_dev->watch_handle->linger_id;
4633 mutex_unlock(&rbd_dev->watch_mutex);
4634
Ilya Dryomov14bb2112017-04-13 12:17:38 +02004635 down_write(&rbd_dev->lock_rwsem);
4636 if (rbd_dev->lock_state == RBD_LOCK_STATE_LOCKED)
4637 rbd_reacquire_lock(rbd_dev);
4638 up_write(&rbd_dev->lock_rwsem);
4639
Ilya Dryomov99d16942016-08-12 16:11:41 +02004640 ret = rbd_dev_refresh(rbd_dev);
4641 if (ret)
Colin Ian Kingf6870cc2018-03-19 13:33:10 +00004642 rbd_warn(rbd_dev, "reregistration refresh failed: %d", ret);
Ilya Dryomov99d16942016-08-12 16:11:41 +02004643}
4644
Alex Elder36be9a72013-01-19 00:30:28 -06004645/*
Alex Elderf40eb342013-04-25 15:09:42 -05004646 * Synchronous osd object method call. Returns the number of bytes
 4647 * returned in the inbound buffer, or a negative error code.
Alex Elder36be9a72013-01-19 00:30:28 -06004648 */
4649static int rbd_obj_method_sync(struct rbd_device *rbd_dev,
Ilya Dryomovecd4a682017-01-25 18:16:21 +01004650 struct ceph_object_id *oid,
4651 struct ceph_object_locator *oloc,
Alex Elder36be9a72013-01-19 00:30:28 -06004652 const char *method_name,
Alex Elder41579762013-04-21 12:14:45 -05004653 const void *outbound,
Alex Elder36be9a72013-01-19 00:30:28 -06004654 size_t outbound_size,
Alex Elder41579762013-04-21 12:14:45 -05004655 void *inbound,
Alex Eldere2a58ee2013-04-30 00:44:33 -05004656 size_t inbound_size)
Alex Elder36be9a72013-01-19 00:30:28 -06004657{
Ilya Dryomovecd4a682017-01-25 18:16:21 +01004658 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
4659 struct page *req_page = NULL;
4660 struct page *reply_page;
Alex Elder36be9a72013-01-19 00:30:28 -06004661 int ret;
4662
4663 /*
Alex Elder6010a452013-04-05 01:27:11 -05004664 * Method calls are ultimately read operations. The result
 4665 * should be placed into the inbound buffer provided. They
4666 * also supply outbound data--parameters for the object
4667 * method. Currently if this is present it will be a
4668 * snapshot id.
Alex Elder36be9a72013-01-19 00:30:28 -06004669 */
Ilya Dryomovecd4a682017-01-25 18:16:21 +01004670 if (outbound) {
4671 if (outbound_size > PAGE_SIZE)
4672 return -E2BIG;
Alex Elder36be9a72013-01-19 00:30:28 -06004673
Ilya Dryomovecd4a682017-01-25 18:16:21 +01004674 req_page = alloc_page(GFP_KERNEL);
4675 if (!req_page)
4676 return -ENOMEM;
Alex Elder36be9a72013-01-19 00:30:28 -06004677
Ilya Dryomovecd4a682017-01-25 18:16:21 +01004678 memcpy(page_address(req_page), outbound, outbound_size);
Alex Elder04017e22013-04-05 14:46:02 -05004679 }
Alex Elder430c28c2013-04-03 21:32:51 -05004680
Ilya Dryomovecd4a682017-01-25 18:16:21 +01004681 reply_page = alloc_page(GFP_KERNEL);
4682 if (!reply_page) {
4683 if (req_page)
4684 __free_page(req_page);
4685 return -ENOMEM;
4686 }
Alex Elder36be9a72013-01-19 00:30:28 -06004687
Ilya Dryomovecd4a682017-01-25 18:16:21 +01004688 ret = ceph_osdc_call(osdc, oid, oloc, RBD_DRV_NAME, method_name,
4689 CEPH_OSD_FLAG_READ, req_page, outbound_size,
Ilya Dryomov68ada912019-06-14 18:16:51 +02004690 &reply_page, &inbound_size);
Ilya Dryomovecd4a682017-01-25 18:16:21 +01004691 if (!ret) {
4692 memcpy(inbound, page_address(reply_page), inbound_size);
4693 ret = inbound_size;
4694 }
Alex Elder57385b52013-04-21 12:14:45 -05004695
Ilya Dryomovecd4a682017-01-25 18:16:21 +01004696 if (req_page)
4697 __free_page(req_page);
4698 __free_page(reply_page);
Alex Elder36be9a72013-01-19 00:30:28 -06004699 return ret;
4700}
4701
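/*
 * Worker for a single block layer request: capture the header, check
 * that the request fits within the mapped size, fill the image request
 * from the bio (or mark it as no-data for discard/zeroout) and submit.
 */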
Christoph Hellwig7ad18af2015-01-13 17:20:04 +01004702static void rbd_queue_workfn(struct work_struct *work)
Ilya Dryomovbc1ecc62014-08-04 18:04:39 +04004703{
Ilya Dryomov59e542c2020-02-12 15:23:58 +01004704 struct rbd_img_request *img_request =
4705 container_of(work, struct rbd_img_request, work);
4706 struct rbd_device *rbd_dev = img_request->rbd_dev;
4707 enum obj_operation_type op_type = img_request->op_type;
4708 struct request *rq = blk_mq_rq_from_pdu(img_request);
Ilya Dryomovbc1ecc62014-08-04 18:04:39 +04004709 u64 offset = (u64)blk_rq_pos(rq) << SECTOR_SHIFT;
4710 u64 length = blk_rq_bytes(rq);
Josh Durgin4e752f02014-04-08 11:12:11 -07004711 u64 mapping_size;
Ilya Dryomovbc1ecc62014-08-04 18:04:39 +04004712 int result;
4713
4714 /* Ignore/skip any zero-length requests */
Ilya Dryomovbc1ecc62014-08-04 18:04:39 +04004715 if (!length) {
4716 dout("%s: zero-length request\n", __func__);
4717 result = 0;
Ilya Dryomov59e542c2020-02-12 15:23:58 +01004718 goto err_img_request;
Ilya Dryomovbc1ecc62014-08-04 18:04:39 +04004719 }
4720
Christoph Hellwig7ad18af2015-01-13 17:20:04 +01004721 blk_mq_start_request(rq);
4722
Josh Durgin4e752f02014-04-08 11:12:11 -07004723 down_read(&rbd_dev->header_rwsem);
4724 mapping_size = rbd_dev->mapping.size;
Ilya Dryomova52cc682020-02-12 15:08:39 +01004725 rbd_img_capture_header(img_request);
Josh Durgin4e752f02014-04-08 11:12:11 -07004726 up_read(&rbd_dev->header_rwsem);
4727
4728 if (offset + length > mapping_size) {
Ilya Dryomovbc1ecc62014-08-04 18:04:39 +04004729 rbd_warn(rbd_dev, "beyond EOD (%llu~%llu > %llu)", offset,
Josh Durgin4e752f02014-04-08 11:12:11 -07004730 length, mapping_size);
Ilya Dryomovbc1ecc62014-08-04 18:04:39 +04004731 result = -EIO;
Ilya Dryomova52cc682020-02-12 15:08:39 +01004732 goto err_img_request;
Ilya Dryomovbc1ecc62014-08-04 18:04:39 +04004733 }
4734
Ilya Dryomov21ed05a2019-08-30 17:31:06 +02004735 dout("%s rbd_dev %p img_req %p %s %llu~%llu\n", __func__, rbd_dev,
4736 img_request, obj_op_name(op_type), offset, length);
4737
Ilya Dryomov6484cbe2019-01-29 12:46:25 +01004738 if (op_type == OBJ_OP_DISCARD || op_type == OBJ_OP_ZEROOUT)
Ilya Dryomov5a237812018-02-06 19:26:34 +01004739 result = rbd_img_fill_nodata(img_request, offset, length);
Guangliang Zhao90e98c52014-04-01 22:22:16 +08004740 else
Ilya Dryomov5a237812018-02-06 19:26:34 +01004741 result = rbd_img_fill_from_bio(img_request, offset, length,
4742 rq->bio);
Ilya Dryomov0192ce22019-05-16 15:06:56 +02004743 if (result)
Ilya Dryomovbc1ecc62014-08-04 18:04:39 +04004744 goto err_img_request;
4745
Ilya Dryomove1fddc82019-05-30 16:07:48 +02004746 rbd_img_handle_request(img_request, 0);
Ilya Dryomovbc1ecc62014-08-04 18:04:39 +04004747 return;
4748
4749err_img_request:
Hannes Reinecke679a97d2020-01-31 11:37:36 +01004750 rbd_img_request_destroy(img_request);
Ilya Dryomovbc1ecc62014-08-04 18:04:39 +04004751 if (result)
4752 rbd_warn(rbd_dev, "%s %llx at %llx result %d",
Guangliang Zhao6d2940c2014-03-13 11:21:35 +08004753 obj_op_name(op_type), length, offset, result);
Christoph Hellwig2a842ac2017-06-03 09:38:04 +02004754 blk_mq_end_request(rq, errno_to_blk_status(result));
Ilya Dryomovbc1ecc62014-08-04 18:04:39 +04004755}
4756
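/*
 * blk-mq ->queue_rq(): map the request operation to an rbd object
 * operation type, reject writes on read-only mappings and hand the
 * request off to rbd_wq for processing in rbd_queue_workfn().
 */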
Christoph Hellwigfc17b652017-06-03 09:38:05 +02004757static blk_status_t rbd_queue_rq(struct blk_mq_hw_ctx *hctx,
Christoph Hellwig7ad18af2015-01-13 17:20:04 +01004758 const struct blk_mq_queue_data *bd)
Ilya Dryomovbc1ecc62014-08-04 18:04:39 +04004759{
Ilya Dryomov59e542c2020-02-12 15:23:58 +01004760 struct rbd_device *rbd_dev = hctx->queue->queuedata;
4761 struct rbd_img_request *img_req = blk_mq_rq_to_pdu(bd->rq);
4762 enum obj_operation_type op_type;
Ilya Dryomovbc1ecc62014-08-04 18:04:39 +04004763
Ilya Dryomov59e542c2020-02-12 15:23:58 +01004764 switch (req_op(bd->rq)) {
4765 case REQ_OP_DISCARD:
4766 op_type = OBJ_OP_DISCARD;
4767 break;
4768 case REQ_OP_WRITE_ZEROES:
4769 op_type = OBJ_OP_ZEROOUT;
4770 break;
4771 case REQ_OP_WRITE:
4772 op_type = OBJ_OP_WRITE;
4773 break;
4774 case REQ_OP_READ:
4775 op_type = OBJ_OP_READ;
4776 break;
4777 default:
4778 rbd_warn(rbd_dev, "unknown req_op %d", req_op(bd->rq));
4779 return BLK_STS_IOERR;
4780 }
4781
4782 rbd_img_request_init(img_req, rbd_dev, op_type);
4783
4784 if (rbd_img_is_write(img_req)) {
4785 if (rbd_is_ro(rbd_dev)) {
4786 rbd_warn(rbd_dev, "%s on read-only mapping",
4787 obj_op_name(img_req->op_type));
4788 return BLK_STS_IOERR;
4789 }
4790 rbd_assert(!rbd_is_snap(rbd_dev));
4791 }
4792
4793 INIT_WORK(&img_req->work, rbd_queue_workfn);
4794 queue_work(rbd_wq, &img_req->work);
Christoph Hellwigfc17b652017-06-03 09:38:05 +02004795 return BLK_STS_OK;
Alex Elderbf0d5f502012-11-22 00:00:08 -06004796}
4797
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004798static void rbd_free_disk(struct rbd_device *rbd_dev)
4799{
Ilya Dryomov5769ed02017-04-13 12:17:38 +02004800 blk_cleanup_queue(rbd_dev->disk->queue);
4801 blk_mq_free_tag_set(&rbd_dev->tag_set);
4802 put_disk(rbd_dev->disk);
Alex Eldera0cab922013-04-25 23:15:08 -05004803 rbd_dev->disk = NULL;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004804}
4805
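/*
 * Synchronously read up to buf_len bytes from the given object into
 * buf.  Returns the number of bytes read, or a negative error code.
 */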
Alex Elder788e2df2013-01-17 12:25:27 -06004806static int rbd_obj_read_sync(struct rbd_device *rbd_dev,
Ilya Dryomovfe5478e2017-01-25 18:16:21 +01004807 struct ceph_object_id *oid,
4808 struct ceph_object_locator *oloc,
4809 void *buf, int buf_len)
Alex Elder788e2df2013-01-17 12:25:27 -06004810
4811{
Ilya Dryomovfe5478e2017-01-25 18:16:21 +01004812 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
4813 struct ceph_osd_request *req;
4814 struct page **pages;
4815 int num_pages = calc_pages_for(0, buf_len);
Alex Elder788e2df2013-01-17 12:25:27 -06004816 int ret;
4817
Ilya Dryomovfe5478e2017-01-25 18:16:21 +01004818 req = ceph_osdc_alloc_request(osdc, NULL, 1, false, GFP_KERNEL);
4819 if (!req)
4820 return -ENOMEM;
Alex Elder788e2df2013-01-17 12:25:27 -06004821
Ilya Dryomovfe5478e2017-01-25 18:16:21 +01004822 ceph_oid_copy(&req->r_base_oid, oid);
4823 ceph_oloc_copy(&req->r_base_oloc, oloc);
4824 req->r_flags = CEPH_OSD_FLAG_READ;
Alex Elder788e2df2013-01-17 12:25:27 -06004825
Ilya Dryomovfe5478e2017-01-25 18:16:21 +01004826 pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL);
4827 if (IS_ERR(pages)) {
4828 ret = PTR_ERR(pages);
4829 goto out_req;
4830 }
Alex Elder1ceae7e2013-02-06 13:11:38 -06004831
Ilya Dryomovfe5478e2017-01-25 18:16:21 +01004832 osd_req_op_extent_init(req, 0, CEPH_OSD_OP_READ, 0, buf_len, 0, 0);
4833 osd_req_op_extent_osd_data_pages(req, 0, pages, buf_len, 0, false,
4834 true);
Alex Elder788e2df2013-01-17 12:25:27 -06004835
Ilya Dryomov26f887e2018-10-15 16:11:37 +02004836 ret = ceph_osdc_alloc_messages(req, GFP_KERNEL);
4837 if (ret)
4838 goto out_req;
4839
Ilya Dryomovfe5478e2017-01-25 18:16:21 +01004840 ceph_osdc_start_request(osdc, req, false);
4841 ret = ceph_osdc_wait_request(osdc, req);
4842 if (ret >= 0)
4843 ceph_copy_from_page_vector(pages, buf, 0, ret);
4844
4845out_req:
4846 ceph_osdc_put_request(req);
Alex Elder788e2df2013-01-17 12:25:27 -06004847 return ret;
4848}
4849
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004850/*
Alex Elder662518b2013-05-06 09:51:29 -05004851 * Read the complete header for the given rbd device. On successful
4852 * return, the rbd_dev->header field will contain up-to-date
4853 * information about the image.
Alex Elder4156d992012-08-02 11:29:46 -05004854 */
Alex Elder99a41eb2013-05-06 09:51:30 -05004855static int rbd_dev_v1_header_info(struct rbd_device *rbd_dev)
Alex Elder4156d992012-08-02 11:29:46 -05004856{
4857 struct rbd_image_header_ondisk *ondisk = NULL;
4858 u32 snap_count = 0;
4859 u64 names_size = 0;
4860 u32 want_count;
4861 int ret;
4862
4863 /*
4864 * The complete header will include an array of its 64-bit
4865 * snapshot ids, followed by the names of those snapshots as
4866 * a contiguous block of NUL-terminated strings. Note that
4867 * the number of snapshots could change by the time we read
4868 * it in, in which case we re-read it.
4869 */
4870 do {
4871 size_t size;
4872
4873 kfree(ondisk);
4874
4875 size = sizeof (*ondisk);
4876 size += snap_count * sizeof (struct rbd_image_snap_ondisk);
4877 size += names_size;
4878 ondisk = kmalloc(size, GFP_KERNEL);
4879 if (!ondisk)
Alex Elder662518b2013-05-06 09:51:29 -05004880 return -ENOMEM;
Alex Elder4156d992012-08-02 11:29:46 -05004881
Ilya Dryomovfe5478e2017-01-25 18:16:21 +01004882 ret = rbd_obj_read_sync(rbd_dev, &rbd_dev->header_oid,
4883 &rbd_dev->header_oloc, ondisk, size);
Alex Elder4156d992012-08-02 11:29:46 -05004884 if (ret < 0)
Alex Elder662518b2013-05-06 09:51:29 -05004885 goto out;
Alex Elderc0cd10db2013-04-26 09:43:47 -05004886 if ((size_t)ret < size) {
Alex Elder4156d992012-08-02 11:29:46 -05004887 ret = -ENXIO;
Alex Elder06ecc6c2012-11-01 10:17:15 -05004888 rbd_warn(rbd_dev, "short header read (want %zd got %d)",
4889 size, ret);
Alex Elder662518b2013-05-06 09:51:29 -05004890 goto out;
Alex Elder4156d992012-08-02 11:29:46 -05004891 }
4892 if (!rbd_dev_ondisk_valid(ondisk)) {
4893 ret = -ENXIO;
Alex Elder06ecc6c2012-11-01 10:17:15 -05004894 rbd_warn(rbd_dev, "invalid header");
Alex Elder662518b2013-05-06 09:51:29 -05004895 goto out;
Alex Elder4156d992012-08-02 11:29:46 -05004896 }
4897
4898 names_size = le64_to_cpu(ondisk->snap_names_len);
4899 want_count = snap_count;
4900 snap_count = le32_to_cpu(ondisk->snap_count);
4901 } while (snap_count != want_count);
4902
Alex Elder662518b2013-05-06 09:51:29 -05004903 ret = rbd_header_from_disk(rbd_dev, ondisk);
4904out:
Alex Elder4156d992012-08-02 11:29:46 -05004905 kfree(ondisk);
4906
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004907 return ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004908}
4909
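/*
 * Propagate the current mapping size to the block device capacity.
 */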
Josh Durgin98752012013-08-29 17:26:31 -07004910static void rbd_dev_update_size(struct rbd_device *rbd_dev)
4911{
4912 sector_t size;
Josh Durgin98752012013-08-29 17:26:31 -07004913
4914 /*
Ilya Dryomov811c6682016-04-15 16:22:16 +02004915 * If EXISTS is not set, rbd_dev->disk may be NULL, so don't
4916 * try to update its size. If REMOVING is set, updating size
4917 * is just useless work since the device can't be opened.
Josh Durgin98752012013-08-29 17:26:31 -07004918 */
Ilya Dryomov811c6682016-04-15 16:22:16 +02004919 if (test_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags) &&
4920 !test_bit(RBD_DEV_FLAG_REMOVING, &rbd_dev->flags)) {
Josh Durgin98752012013-08-29 17:26:31 -07004921 size = (sector_t)rbd_dev->mapping.size / SECTOR_SIZE;
4922 dout("setting size to %llu sectors", (unsigned long long)size);
4923 set_capacity(rbd_dev->disk, size);
4924 revalidate_disk(rbd_dev->disk);
4925 }
4926}
4927
Alex Eldercc4a38bd2013-04-30 00:44:33 -05004928static int rbd_dev_refresh(struct rbd_device *rbd_dev)
Alex Elder1fe5e992012-07-25 09:32:41 -05004929{
Alex Eldere627db02013-05-06 07:40:30 -05004930 u64 mapping_size;
Alex Elder1fe5e992012-07-25 09:32:41 -05004931 int ret;
4932
Alex Eldercfbf6372013-05-31 17:40:45 -05004933 down_write(&rbd_dev->header_rwsem);
Alex Elder3b5cf2a2013-05-29 11:18:59 -05004934 mapping_size = rbd_dev->mapping.size;
Ilya Dryomova720ae02014-07-23 17:11:19 +04004935
4936 ret = rbd_dev_header_info(rbd_dev);
Ilya Dryomov52bb1f92014-07-23 17:11:20 +04004937 if (ret)
Ilya Dryomov73e39e42015-01-08 20:18:22 +03004938 goto out;
Alex Elder15228ed2013-05-01 12:43:03 -05004939
Ilya Dryomove8f59b52014-07-24 10:42:13 +04004940 /*
4941 * If there is a parent, see if it has disappeared due to the
4942 * mapped image getting flattened.
4943 */
4944 if (rbd_dev->parent) {
4945 ret = rbd_dev_v2_parent_info(rbd_dev);
4946 if (ret)
Ilya Dryomov73e39e42015-01-08 20:18:22 +03004947 goto out;
Ilya Dryomove8f59b52014-07-24 10:42:13 +04004948 }
4949
Ilya Dryomov686238b2019-11-18 12:51:02 +01004950 rbd_assert(!rbd_is_snap(rbd_dev));
4951 rbd_dev->mapping.size = rbd_dev->header.image_size;
Alex Elder15228ed2013-05-01 12:43:03 -05004952
Ilya Dryomov73e39e42015-01-08 20:18:22 +03004953out:
Alex Eldercfbf6372013-05-31 17:40:45 -05004954 up_write(&rbd_dev->header_rwsem);
Ilya Dryomov73e39e42015-01-08 20:18:22 +03004955 if (!ret && mapping_size != rbd_dev->mapping.size)
Josh Durgin98752012013-08-29 17:26:31 -07004956 rbd_dev_update_size(rbd_dev);
Alex Elder1fe5e992012-07-25 09:32:41 -05004957
Ilya Dryomov73e39e42015-01-08 20:18:22 +03004958 return ret;
Alex Elder1fe5e992012-07-25 09:32:41 -05004959}
4960
Eric Biggersf363b082017-03-30 13:39:16 -07004961static const struct blk_mq_ops rbd_mq_ops = {
Christoph Hellwig7ad18af2015-01-13 17:20:04 +01004962 .queue_rq = rbd_queue_rq,
Christoph Hellwig7ad18af2015-01-13 17:20:04 +01004963};
4964
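/*
 * Set up the gendisk and blk-mq queue for the mapped image.  Queue
 * limits are derived from the object set size and the alloc_size, trim
 * and queue_depth mapping options.
 */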
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004965static int rbd_init_disk(struct rbd_device *rbd_dev)
4966{
4967 struct gendisk *disk;
4968 struct request_queue *q;
Ilya Dryomov420efbd2018-04-16 09:32:18 +02004969 unsigned int objset_bytes =
4970 rbd_dev->layout.object_size * rbd_dev->layout.stripe_count;
Christoph Hellwig7ad18af2015-01-13 17:20:04 +01004971 int err;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004972
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004973 /* create gendisk info */
Ilya Dryomov7e513d42013-12-16 19:26:32 +02004974 disk = alloc_disk(single_major ?
4975 (1 << RBD_SINGLE_MAJOR_PART_SHIFT) :
4976 RBD_MINORS_PER_MAJOR);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004977 if (!disk)
Alex Elder1fcdb8a2012-08-29 17:11:06 -05004978 return -ENOMEM;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004979
Alex Elderf0f8cef2012-01-29 13:57:44 -06004980 snprintf(disk->disk_name, sizeof(disk->disk_name), RBD_DRV_NAME "%d",
Alex Elderde71a292012-07-03 16:01:19 -05004981 rbd_dev->dev_id);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004982 disk->major = rbd_dev->major;
Ilya Dryomovdd82fff2013-12-13 15:28:57 +02004983 disk->first_minor = rbd_dev->minor;
Ilya Dryomov7e513d42013-12-16 19:26:32 +02004984 if (single_major)
4985 disk->flags |= GENHD_FL_EXT_DEVT;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004986 disk->fops = &rbd_bd_ops;
4987 disk->private_data = rbd_dev;
4988
Christoph Hellwig7ad18af2015-01-13 17:20:04 +01004989 memset(&rbd_dev->tag_set, 0, sizeof(rbd_dev->tag_set));
4990 rbd_dev->tag_set.ops = &rbd_mq_ops;
Ilya Dryomovb5584182015-06-23 16:21:19 +03004991 rbd_dev->tag_set.queue_depth = rbd_dev->opts->queue_depth;
Christoph Hellwig7ad18af2015-01-13 17:20:04 +01004992 rbd_dev->tag_set.numa_node = NUMA_NO_NODE;
Ming Lei56d18f62019-02-15 19:13:24 +08004993 rbd_dev->tag_set.flags = BLK_MQ_F_SHOULD_MERGE;
Hannes Reineckef9b6b982020-01-31 11:37:39 +01004994 rbd_dev->tag_set.nr_hw_queues = num_present_cpus();
Ilya Dryomov59e542c2020-02-12 15:23:58 +01004995 rbd_dev->tag_set.cmd_size = sizeof(struct rbd_img_request);
Christoph Hellwig7ad18af2015-01-13 17:20:04 +01004996
4997 err = blk_mq_alloc_tag_set(&rbd_dev->tag_set);
4998 if (err)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004999 goto out_disk;
Josh Durgin029bcbd2011-07-22 11:35:23 -07005000
Christoph Hellwig7ad18af2015-01-13 17:20:04 +01005001 q = blk_mq_init_queue(&rbd_dev->tag_set);
5002 if (IS_ERR(q)) {
5003 err = PTR_ERR(q);
5004 goto out_tag_set;
5005 }
5006
Bart Van Assche8b904b52018-03-07 17:10:10 -08005007 blk_queue_flag_set(QUEUE_FLAG_NONROT, q);
Ilya Dryomovd8a2c892015-03-24 16:15:17 +03005008 /* QUEUE_FLAG_ADD_RANDOM is off by default for blk-mq */
Alex Elder593a9e72012-02-07 12:03:37 -06005009
Ilya Dryomov420efbd2018-04-16 09:32:18 +02005010 blk_queue_max_hw_sectors(q, objset_bytes >> SECTOR_SHIFT);
Ilya Dryomov0d9fde42015-10-07 16:09:35 +02005011 q->limits.max_sectors = queue_max_hw_sectors(q);
Ilya Dryomov21acdf42017-12-21 15:35:11 +01005012 blk_queue_max_segments(q, USHRT_MAX);
Ilya Dryomov24f1df62018-01-12 17:22:10 +01005013 blk_queue_max_segment_size(q, UINT_MAX);
Ilya Dryomov16d80c52019-03-15 14:50:04 +01005014 blk_queue_io_min(q, rbd_dev->opts->alloc_size);
5015 blk_queue_io_opt(q, rbd_dev->opts->alloc_size);
Josh Durgin029bcbd2011-07-22 11:35:23 -07005016
Ilya Dryomovd9360542018-03-23 06:14:47 +01005017 if (rbd_dev->opts->trim) {
5018 blk_queue_flag_set(QUEUE_FLAG_DISCARD, q);
Ilya Dryomov16d80c52019-03-15 14:50:04 +01005019 q->limits.discard_granularity = rbd_dev->opts->alloc_size;
Ilya Dryomovd9360542018-03-23 06:14:47 +01005020 blk_queue_max_discard_sectors(q, objset_bytes >> SECTOR_SHIFT);
5021 blk_queue_max_write_zeroes_sectors(q, objset_bytes >> SECTOR_SHIFT);
5022 }
Guangliang Zhao90e98c52014-04-01 22:22:16 +08005023
Ronny Hegewaldbae818e2015-10-15 18:50:46 +00005024 if (!ceph_test_opt(rbd_dev->rbd_client->client, NOCRC))
Jan Karadc3b17c2017-02-02 15:56:50 +01005025 q->backing_dev_info->capabilities |= BDI_CAP_STABLE_WRITES;
Ronny Hegewaldbae818e2015-10-15 18:50:46 +00005026
Ilya Dryomov5769ed02017-04-13 12:17:38 +02005027 /*
5028 * disk_release() expects a queue ref from add_disk() and will
5029 * put it. Hold an extra ref until add_disk() is called.
5030 */
5031 WARN_ON(!blk_get_queue(q));
Yehuda Sadeh602adf42010-08-12 16:11:25 -07005032 disk->queue = q;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07005033 q->queuedata = rbd_dev;
5034
5035 rbd_dev->disk = disk;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07005036
Yehuda Sadeh602adf42010-08-12 16:11:25 -07005037 return 0;
Christoph Hellwig7ad18af2015-01-13 17:20:04 +01005038out_tag_set:
5039 blk_mq_free_tag_set(&rbd_dev->tag_set);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07005040out_disk:
5041 put_disk(disk);
Christoph Hellwig7ad18af2015-01-13 17:20:04 +01005042 return err;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07005043}
5044
Yehuda Sadehdfc56062010-11-19 14:51:04 -08005045/*
5046 sysfs
5047*/
Yehuda Sadeh602adf42010-08-12 16:11:25 -07005048
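/*
 * The attributes below are typically exposed under
 * /sys/bus/rbd/devices/<dev-id>/ (size, features, pool, parent, ...).
 */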
Alex Elder593a9e72012-02-07 12:03:37 -06005049static struct rbd_device *dev_to_rbd_dev(struct device *dev)
5050{
5051 return container_of(dev, struct rbd_device, dev);
5052}
5053
Yehuda Sadehdfc56062010-11-19 14:51:04 -08005054static ssize_t rbd_size_show(struct device *dev,
5055 struct device_attribute *attr, char *buf)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07005056{
Alex Elder593a9e72012-02-07 12:03:37 -06005057 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08005058
Alex Elderfc71d832013-04-26 15:44:36 -05005059 return sprintf(buf, "%llu\n",
5060 (unsigned long long)rbd_dev->mapping.size);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07005061}
5062
Alex Elder34b13182012-07-13 20:35:12 -05005063static ssize_t rbd_features_show(struct device *dev,
5064 struct device_attribute *attr, char *buf)
5065{
5066 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
5067
Ilya Dryomovfa58bca2019-11-05 13:16:52 +01005068 return sprintf(buf, "0x%016llx\n", rbd_dev->header.features);
Alex Elder34b13182012-07-13 20:35:12 -05005069}
5070
Yehuda Sadehdfc56062010-11-19 14:51:04 -08005071static ssize_t rbd_major_show(struct device *dev,
5072 struct device_attribute *attr, char *buf)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07005073{
Alex Elder593a9e72012-02-07 12:03:37 -06005074 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08005075
Alex Elderfc71d832013-04-26 15:44:36 -05005076 if (rbd_dev->major)
5077 return sprintf(buf, "%d\n", rbd_dev->major);
5078
5079 return sprintf(buf, "(none)\n");
Ilya Dryomovdd82fff2013-12-13 15:28:57 +02005080}
Alex Elderfc71d832013-04-26 15:44:36 -05005081
Ilya Dryomovdd82fff2013-12-13 15:28:57 +02005082static ssize_t rbd_minor_show(struct device *dev,
5083 struct device_attribute *attr, char *buf)
5084{
5085 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
5086
5087 return sprintf(buf, "%d\n", rbd_dev->minor);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08005088}
5089
Ilya Dryomov005a07bf2016-08-18 18:38:43 +02005090static ssize_t rbd_client_addr_show(struct device *dev,
5091 struct device_attribute *attr, char *buf)
5092{
5093 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
5094 struct ceph_entity_addr *client_addr =
5095 ceph_client_addr(rbd_dev->rbd_client->client);
5096
5097 return sprintf(buf, "%pISpc/%u\n", &client_addr->in_addr,
5098 le32_to_cpu(client_addr->nonce));
5099}
5100
Yehuda Sadehdfc56062010-11-19 14:51:04 -08005101static ssize_t rbd_client_id_show(struct device *dev,
5102 struct device_attribute *attr, char *buf)
5103{
Alex Elder593a9e72012-02-07 12:03:37 -06005104 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08005105
Alex Elder1dbb4392012-01-24 10:08:37 -06005106 return sprintf(buf, "client%lld\n",
Ilya Dryomov033268a2016-08-12 14:59:58 +02005107 ceph_client_gid(rbd_dev->rbd_client->client));
Yehuda Sadehdfc56062010-11-19 14:51:04 -08005108}
5109
Mike Christie267fb902016-08-18 18:38:43 +02005110static ssize_t rbd_cluster_fsid_show(struct device *dev,
5111 struct device_attribute *attr, char *buf)
5112{
5113 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
5114
5115 return sprintf(buf, "%pU\n", &rbd_dev->rbd_client->client->fsid);
5116}
5117
Mike Christie0d6d1e9c2016-08-18 18:38:45 +02005118static ssize_t rbd_config_info_show(struct device *dev,
5119 struct device_attribute *attr, char *buf)
5120{
5121 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
5122
5123 return sprintf(buf, "%s\n", rbd_dev->config_info);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08005124}
5125
5126static ssize_t rbd_pool_show(struct device *dev,
5127 struct device_attribute *attr, char *buf)
5128{
Alex Elder593a9e72012-02-07 12:03:37 -06005129 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08005130
Alex Elder0d7dbfc2012-10-25 23:34:41 -05005131 return sprintf(buf, "%s\n", rbd_dev->spec->pool_name);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08005132}
5133
Alex Elder9bb2f332012-07-12 10:46:35 -05005134static ssize_t rbd_pool_id_show(struct device *dev,
5135 struct device_attribute *attr, char *buf)
5136{
5137 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
5138
Alex Elder0d7dbfc2012-10-25 23:34:41 -05005139 return sprintf(buf, "%llu\n",
Alex Elderfc71d832013-04-26 15:44:36 -05005140 (unsigned long long) rbd_dev->spec->pool_id);
Alex Elder9bb2f332012-07-12 10:46:35 -05005141}
5142
Ilya Dryomovb26c0472018-07-03 15:28:43 +02005143static ssize_t rbd_pool_ns_show(struct device *dev,
5144 struct device_attribute *attr, char *buf)
5145{
5146 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
5147
5148 return sprintf(buf, "%s\n", rbd_dev->spec->pool_ns ?: "");
5149}
5150
Yehuda Sadehdfc56062010-11-19 14:51:04 -08005151static ssize_t rbd_name_show(struct device *dev,
5152 struct device_attribute *attr, char *buf)
5153{
Alex Elder593a9e72012-02-07 12:03:37 -06005154 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08005155
Alex Eldera92ffdf2012-10-30 19:40:33 -05005156 if (rbd_dev->spec->image_name)
5157 return sprintf(buf, "%s\n", rbd_dev->spec->image_name);
5158
5159 return sprintf(buf, "(unknown)\n");
Yehuda Sadehdfc56062010-11-19 14:51:04 -08005160}
5161
Alex Elder589d30e2012-07-10 20:30:11 -05005162static ssize_t rbd_image_id_show(struct device *dev,
5163 struct device_attribute *attr, char *buf)
5164{
5165 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
5166
Alex Elder0d7dbfc2012-10-25 23:34:41 -05005167 return sprintf(buf, "%s\n", rbd_dev->spec->image_id);
Alex Elder589d30e2012-07-10 20:30:11 -05005168}
5169
Alex Elder34b13182012-07-13 20:35:12 -05005170/*
5171 * Shows the name of the currently-mapped snapshot (or
5172 * RBD_SNAP_HEAD_NAME for the base image).
5173 */
Yehuda Sadehdfc56062010-11-19 14:51:04 -08005174static ssize_t rbd_snap_show(struct device *dev,
5175 struct device_attribute *attr,
5176 char *buf)
5177{
Alex Elder593a9e72012-02-07 12:03:37 -06005178 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08005179
Alex Elder0d7dbfc2012-10-25 23:34:41 -05005180 return sprintf(buf, "%s\n", rbd_dev->spec->snap_name);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08005181}
5182
Mike Christie92a58672016-08-18 18:38:44 +02005183static ssize_t rbd_snap_id_show(struct device *dev,
5184 struct device_attribute *attr, char *buf)
5185{
5186 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
5187
5188 return sprintf(buf, "%llu\n", rbd_dev->spec->snap_id);
5189}
5190
Alex Elder86b00e02012-10-25 23:34:42 -05005191/*
Ilya Dryomovff961282014-07-22 21:53:07 +04005192 * For a v2 image, shows the chain of parent images, separated by empty
5193 * lines. For v1 images or if there is no parent, shows "(no parent
5194 * image)".
Alex Elder86b00e02012-10-25 23:34:42 -05005195 */
5196static ssize_t rbd_parent_show(struct device *dev,
Ilya Dryomovff961282014-07-22 21:53:07 +04005197 struct device_attribute *attr,
5198 char *buf)
Alex Elder86b00e02012-10-25 23:34:42 -05005199{
5200 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Ilya Dryomovff961282014-07-22 21:53:07 +04005201 ssize_t count = 0;
Alex Elder86b00e02012-10-25 23:34:42 -05005202
Ilya Dryomovff961282014-07-22 21:53:07 +04005203 if (!rbd_dev->parent)
Alex Elder86b00e02012-10-25 23:34:42 -05005204 return sprintf(buf, "(no parent image)\n");
5205
Ilya Dryomovff961282014-07-22 21:53:07 +04005206 for ( ; rbd_dev->parent; rbd_dev = rbd_dev->parent) {
5207 struct rbd_spec *spec = rbd_dev->parent_spec;
Alex Elder86b00e02012-10-25 23:34:42 -05005208
Ilya Dryomovff961282014-07-22 21:53:07 +04005209 count += sprintf(&buf[count], "%s"
5210 "pool_id %llu\npool_name %s\n"
Ilya Dryomove92c0ea2018-08-22 17:26:10 +02005211 "pool_ns %s\n"
Ilya Dryomovff961282014-07-22 21:53:07 +04005212 "image_id %s\nimage_name %s\n"
5213 "snap_id %llu\nsnap_name %s\n"
5214 "overlap %llu\n",
5215 !count ? "" : "\n", /* first? */
5216 spec->pool_id, spec->pool_name,
Ilya Dryomove92c0ea2018-08-22 17:26:10 +02005217 spec->pool_ns ?: "",
Ilya Dryomovff961282014-07-22 21:53:07 +04005218 spec->image_id, spec->image_name ?: "(unknown)",
5219 spec->snap_id, spec->snap_name,
5220 rbd_dev->parent_overlap);
5221 }
Alex Elder86b00e02012-10-25 23:34:42 -05005222
Ilya Dryomovff961282014-07-22 21:53:07 +04005223 return count;
Alex Elder86b00e02012-10-25 23:34:42 -05005224}
5225
Yehuda Sadehdfc56062010-11-19 14:51:04 -08005226static ssize_t rbd_image_refresh(struct device *dev,
5227 struct device_attribute *attr,
5228 const char *buf,
5229 size_t size)
5230{
Alex Elder593a9e72012-02-07 12:03:37 -06005231 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Alex Elderb8136232012-07-25 09:32:41 -05005232 int ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07005233
Alex Eldercc4a38bd2013-04-30 00:44:33 -05005234 ret = rbd_dev_refresh(rbd_dev);
Alex Eldere627db02013-05-06 07:40:30 -05005235 if (ret)
Ilya Dryomov52bb1f92014-07-23 17:11:20 +04005236 return ret;
Alex Elderb8136232012-07-25 09:32:41 -05005237
Ilya Dryomov52bb1f92014-07-23 17:11:20 +04005238 return size;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08005239}
Yehuda Sadeh602adf42010-08-12 16:11:25 -07005240
Joe Perches5657a812018-05-24 13:38:59 -06005241static DEVICE_ATTR(size, 0444, rbd_size_show, NULL);
5242static DEVICE_ATTR(features, 0444, rbd_features_show, NULL);
5243static DEVICE_ATTR(major, 0444, rbd_major_show, NULL);
5244static DEVICE_ATTR(minor, 0444, rbd_minor_show, NULL);
5245static DEVICE_ATTR(client_addr, 0444, rbd_client_addr_show, NULL);
5246static DEVICE_ATTR(client_id, 0444, rbd_client_id_show, NULL);
5247static DEVICE_ATTR(cluster_fsid, 0444, rbd_cluster_fsid_show, NULL);
5248static DEVICE_ATTR(config_info, 0400, rbd_config_info_show, NULL);
5249static DEVICE_ATTR(pool, 0444, rbd_pool_show, NULL);
5250static DEVICE_ATTR(pool_id, 0444, rbd_pool_id_show, NULL);
Ilya Dryomovb26c0472018-07-03 15:28:43 +02005251static DEVICE_ATTR(pool_ns, 0444, rbd_pool_ns_show, NULL);
Joe Perches5657a812018-05-24 13:38:59 -06005252static DEVICE_ATTR(name, 0444, rbd_name_show, NULL);
5253static DEVICE_ATTR(image_id, 0444, rbd_image_id_show, NULL);
5254static DEVICE_ATTR(refresh, 0200, NULL, rbd_image_refresh);
5255static DEVICE_ATTR(current_snap, 0444, rbd_snap_show, NULL);
5256static DEVICE_ATTR(snap_id, 0444, rbd_snap_id_show, NULL);
5257static DEVICE_ATTR(parent, 0444, rbd_parent_show, NULL);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08005258
5259static struct attribute *rbd_attrs[] = {
5260 &dev_attr_size.attr,
Alex Elder34b13182012-07-13 20:35:12 -05005261 &dev_attr_features.attr,
Yehuda Sadehdfc56062010-11-19 14:51:04 -08005262 &dev_attr_major.attr,
Ilya Dryomovdd82fff2013-12-13 15:28:57 +02005263 &dev_attr_minor.attr,
Ilya Dryomov005a07bf2016-08-18 18:38:43 +02005264 &dev_attr_client_addr.attr,
Yehuda Sadehdfc56062010-11-19 14:51:04 -08005265 &dev_attr_client_id.attr,
Mike Christie267fb902016-08-18 18:38:43 +02005266 &dev_attr_cluster_fsid.attr,
Mike Christie0d6d1e9c2016-08-18 18:38:45 +02005267 &dev_attr_config_info.attr,
Yehuda Sadehdfc56062010-11-19 14:51:04 -08005268 &dev_attr_pool.attr,
Alex Elder9bb2f332012-07-12 10:46:35 -05005269 &dev_attr_pool_id.attr,
Ilya Dryomovb26c0472018-07-03 15:28:43 +02005270 &dev_attr_pool_ns.attr,
Yehuda Sadehdfc56062010-11-19 14:51:04 -08005271 &dev_attr_name.attr,
Alex Elder589d30e2012-07-10 20:30:11 -05005272 &dev_attr_image_id.attr,
Yehuda Sadehdfc56062010-11-19 14:51:04 -08005273 &dev_attr_current_snap.attr,
Mike Christie92a58672016-08-18 18:38:44 +02005274 &dev_attr_snap_id.attr,
Alex Elder86b00e02012-10-25 23:34:42 -05005275 &dev_attr_parent.attr,
Yehuda Sadehdfc56062010-11-19 14:51:04 -08005276 &dev_attr_refresh.attr,
Yehuda Sadehdfc56062010-11-19 14:51:04 -08005277 NULL
5278};
5279
5280static struct attribute_group rbd_attr_group = {
5281 .attrs = rbd_attrs,
5282};
5283
5284static const struct attribute_group *rbd_attr_groups[] = {
5285 &rbd_attr_group,
5286 NULL
5287};
5288
Ilya Dryomov6cac4692015-10-16 20:11:25 +02005289static void rbd_dev_release(struct device *dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08005290
Bhumika Goyalb9942bc2017-02-11 12:14:38 +05305291static const struct device_type rbd_device_type = {
Yehuda Sadehdfc56062010-11-19 14:51:04 -08005292 .name = "rbd",
5293 .groups = rbd_attr_groups,
Ilya Dryomov6cac4692015-10-16 20:11:25 +02005294 .release = rbd_dev_release,
Yehuda Sadehdfc56062010-11-19 14:51:04 -08005295};
5296
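/*
 * rbd_spec structures are reference counted: rbd_spec_get() and
 * rbd_spec_put() take and drop references, and rbd_spec_free() runs
 * when the last reference is dropped.
 */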
Alex Elder8b8fb992012-10-26 17:25:24 -05005297static struct rbd_spec *rbd_spec_get(struct rbd_spec *spec)
5298{
5299 kref_get(&spec->kref);
5300
5301 return spec;
5302}
5303
5304static void rbd_spec_free(struct kref *kref);
5305static void rbd_spec_put(struct rbd_spec *spec)
5306{
5307 if (spec)
5308 kref_put(&spec->kref, rbd_spec_free);
5309}
5310
5311static struct rbd_spec *rbd_spec_alloc(void)
5312{
5313 struct rbd_spec *spec;
5314
5315 spec = kzalloc(sizeof (*spec), GFP_KERNEL);
5316 if (!spec)
5317 return NULL;
Ilya Dryomov04077592014-07-23 17:11:20 +04005318
5319 spec->pool_id = CEPH_NOPOOL;
5320 spec->snap_id = CEPH_NOSNAP;
Alex Elder8b8fb992012-10-26 17:25:24 -05005321 kref_init(&spec->kref);
5322
Alex Elder8b8fb992012-10-26 17:25:24 -05005323 return spec;
5324}
5325
5326static void rbd_spec_free(struct kref *kref)
5327{
5328 struct rbd_spec *spec = container_of(kref, struct rbd_spec, kref);
5329
5330 kfree(spec->pool_name);
Ilya Dryomovb26c0472018-07-03 15:28:43 +02005331 kfree(spec->pool_ns);
Alex Elder8b8fb992012-10-26 17:25:24 -05005332 kfree(spec->image_id);
5333 kfree(spec->image_name);
5334 kfree(spec->snap_name);
5335 kfree(spec);
5336}
5337
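/*
 * Final teardown of an rbd_device.  By this point the watch must be
 * unregistered and the exclusive lock released.
 */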
Ilya Dryomov1643dfa2016-08-12 15:45:52 +02005338static void rbd_dev_free(struct rbd_device *rbd_dev)
Ilya Dryomovdd5ac322015-10-16 17:09:24 +02005339{
Ilya Dryomov99d16942016-08-12 16:11:41 +02005340 WARN_ON(rbd_dev->watch_state != RBD_WATCH_STATE_UNREGISTERED);
Ilya Dryomoved95b212016-08-12 16:40:02 +02005341 WARN_ON(rbd_dev->lock_state != RBD_LOCK_STATE_UNLOCKED);
Ilya Dryomovdd5ac322015-10-16 17:09:24 +02005342
Ilya Dryomovc41d13a2016-04-29 20:01:25 +02005343 ceph_oid_destroy(&rbd_dev->header_oid);
Ilya Dryomov6b6dddb2016-08-05 16:15:38 +02005344 ceph_oloc_destroy(&rbd_dev->header_oloc);
Mike Christie0d6d1e9c2016-08-18 18:38:45 +02005345 kfree(rbd_dev->config_info);
Ilya Dryomovc41d13a2016-04-29 20:01:25 +02005346
Ilya Dryomovdd5ac322015-10-16 17:09:24 +02005347 rbd_put_client(rbd_dev->rbd_client);
5348 rbd_spec_put(rbd_dev->spec);
5349 kfree(rbd_dev->opts);
5350 kfree(rbd_dev);
Ilya Dryomov1643dfa2016-08-12 15:45:52 +02005351}
5352
5353static void rbd_dev_release(struct device *dev)
5354{
5355 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
5356 bool need_put = !!rbd_dev->opts;
5357
5358 if (need_put) {
5359 destroy_workqueue(rbd_dev->task_wq);
5360 ida_simple_remove(&rbd_dev_id_ida, rbd_dev->dev_id);
5361 }
5362
5363 rbd_dev_free(rbd_dev);
Ilya Dryomovdd5ac322015-10-16 17:09:24 +02005364
5365 /*
	 * This is racy, but way better than dropping the module
	 * reference (module_put()) outside of the release callback.
	 * The race window is pretty small, so doing something similar
	 * to dm (dm-builtin.c) is overkill.
5369 */
5370 if (need_put)
5371 module_put(THIS_MODULE);
5372}
5373
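/*
 * Allocate an rbd_device and initialize the parts that do not depend
 * on the mapping options: locks, work items, the header object
 * id/locator and the embedded struct device.
 */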
Ilya Dryomov1643dfa2016-08-12 15:45:52 +02005374static struct rbd_device *__rbd_dev_create(struct rbd_client *rbdc,
5375 struct rbd_spec *spec)
Alex Elderc53d5892012-10-25 23:34:42 -05005376{
5377 struct rbd_device *rbd_dev;
5378
Ilya Dryomov1643dfa2016-08-12 15:45:52 +02005379 rbd_dev = kzalloc(sizeof(*rbd_dev), GFP_KERNEL);
Alex Elderc53d5892012-10-25 23:34:42 -05005380 if (!rbd_dev)
5381 return NULL;
5382
5383 spin_lock_init(&rbd_dev->lock);
5384 INIT_LIST_HEAD(&rbd_dev->node);
Alex Elderc53d5892012-10-25 23:34:42 -05005385 init_rwsem(&rbd_dev->header_rwsem);
5386
Ilya Dryomov7e973322017-01-25 18:16:22 +01005387 rbd_dev->header.data_pool_id = CEPH_NOPOOL;
Ilya Dryomovc41d13a2016-04-29 20:01:25 +02005388 ceph_oid_init(&rbd_dev->header_oid);
Ilya Dryomov431a02c2017-01-25 18:16:21 +01005389 rbd_dev->header_oloc.pool = spec->pool_id;
Ilya Dryomovb26c0472018-07-03 15:28:43 +02005390 if (spec->pool_ns) {
5391 WARN_ON(!*spec->pool_ns);
5392 rbd_dev->header_oloc.pool_ns =
5393 ceph_find_or_create_string(spec->pool_ns,
5394 strlen(spec->pool_ns));
5395 }
Ilya Dryomovc41d13a2016-04-29 20:01:25 +02005396
Ilya Dryomov99d16942016-08-12 16:11:41 +02005397 mutex_init(&rbd_dev->watch_mutex);
5398 rbd_dev->watch_state = RBD_WATCH_STATE_UNREGISTERED;
5399 INIT_DELAYED_WORK(&rbd_dev->watch_dwork, rbd_reregister_watch);
5400
Ilya Dryomoved95b212016-08-12 16:40:02 +02005401 init_rwsem(&rbd_dev->lock_rwsem);
5402 rbd_dev->lock_state = RBD_LOCK_STATE_UNLOCKED;
5403 INIT_WORK(&rbd_dev->acquired_lock_work, rbd_notify_acquired_lock);
5404 INIT_WORK(&rbd_dev->released_lock_work, rbd_notify_released_lock);
5405 INIT_DELAYED_WORK(&rbd_dev->lock_dwork, rbd_acquire_lock);
5406 INIT_WORK(&rbd_dev->unlock_work, rbd_release_lock_work);
Ilya Dryomove1fddc82019-05-30 16:07:48 +02005407 spin_lock_init(&rbd_dev->lock_lists_lock);
Ilya Dryomov637cd062019-06-06 17:14:49 +02005408 INIT_LIST_HEAD(&rbd_dev->acquiring_list);
Ilya Dryomove1fddc82019-05-30 16:07:48 +02005409 INIT_LIST_HEAD(&rbd_dev->running_list);
Ilya Dryomov637cd062019-06-06 17:14:49 +02005410 init_completion(&rbd_dev->acquire_wait);
Ilya Dryomove1fddc82019-05-30 16:07:48 +02005411 init_completion(&rbd_dev->releasing_wait);
Ilya Dryomoved95b212016-08-12 16:40:02 +02005412
Ilya Dryomov22e8bd52019-06-05 19:25:11 +02005413 spin_lock_init(&rbd_dev->object_map_lock);
Alex Elderc53d5892012-10-25 23:34:42 -05005414
Ilya Dryomovdd5ac322015-10-16 17:09:24 +02005415 rbd_dev->dev.bus = &rbd_bus_type;
5416 rbd_dev->dev.type = &rbd_device_type;
5417 rbd_dev->dev.parent = &rbd_root_dev;
Ilya Dryomovdd5ac322015-10-16 17:09:24 +02005418 device_initialize(&rbd_dev->dev);
5419
Alex Elderc53d5892012-10-25 23:34:42 -05005420 rbd_dev->rbd_client = rbdc;
Ilya Dryomovd1475432015-06-22 13:24:48 +03005421 rbd_dev->spec = spec;
Alex Elder0903e872012-11-14 12:25:19 -06005422
Alex Elderc53d5892012-10-25 23:34:42 -05005423 return rbd_dev;
5424}
5425
Ilya Dryomov1643dfa2016-08-12 15:45:52 +02005426/*
 * Create an rbd_dev for a mapping: on top of __rbd_dev_create(), set
 * up the options, device id, name and task workqueue.
5428 */
5429static struct rbd_device *rbd_dev_create(struct rbd_client *rbdc,
5430 struct rbd_spec *spec,
5431 struct rbd_options *opts)
5432{
5433 struct rbd_device *rbd_dev;
5434
5435 rbd_dev = __rbd_dev_create(rbdc, spec);
5436 if (!rbd_dev)
5437 return NULL;
5438
5439 rbd_dev->opts = opts;
5440
5441 /* get an id and fill in device name */
5442 rbd_dev->dev_id = ida_simple_get(&rbd_dev_id_ida, 0,
5443 minor_to_rbd_dev_id(1 << MINORBITS),
5444 GFP_KERNEL);
5445 if (rbd_dev->dev_id < 0)
5446 goto fail_rbd_dev;
5447
5448 sprintf(rbd_dev->name, RBD_DRV_NAME "%d", rbd_dev->dev_id);
5449 rbd_dev->task_wq = alloc_ordered_workqueue("%s-tasks", WQ_MEM_RECLAIM,
5450 rbd_dev->name);
5451 if (!rbd_dev->task_wq)
5452 goto fail_dev_id;
5453
5454 /* we have a ref from do_rbd_add() */
5455 __module_get(THIS_MODULE);
5456
5457 dout("%s rbd_dev %p dev_id %d\n", __func__, rbd_dev, rbd_dev->dev_id);
5458 return rbd_dev;
5459
5460fail_dev_id:
5461 ida_simple_remove(&rbd_dev_id_ida, rbd_dev->dev_id);
5462fail_rbd_dev:
5463 rbd_dev_free(rbd_dev);
5464 return NULL;
5465}
5466
Alex Elderc53d5892012-10-25 23:34:42 -05005467static void rbd_dev_destroy(struct rbd_device *rbd_dev)
5468{
Ilya Dryomovdd5ac322015-10-16 17:09:24 +02005469 if (rbd_dev)
5470 put_device(&rbd_dev->dev);
Alex Elderc53d5892012-10-25 23:34:42 -05005471}
5472
Yehuda Sadehdfc56062010-11-19 14:51:04 -08005473/*
 * Get the size and object order for an image snapshot, or, if
 * snap_id is CEPH_NOSNAP, for the base image.
5477 */
5478static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id,
5479 u8 *order, u64 *snap_size)
5480{
5481 __le64 snapid = cpu_to_le64(snap_id);
5482 int ret;
5483 struct {
5484 u8 order;
5485 __le64 size;
5486 } __attribute__ ((packed)) size_buf = { 0 };
5487
Ilya Dryomovecd4a682017-01-25 18:16:21 +01005488 ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
5489 &rbd_dev->header_oloc, "get_size",
5490 &snapid, sizeof(snapid),
5491 &size_buf, sizeof(size_buf));
Alex Elder36be9a72013-01-19 00:30:28 -06005492 dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
Alex Elder9d475de2012-07-03 16:01:19 -05005493 if (ret < 0)
5494 return ret;
Alex Elder57385b52013-04-21 12:14:45 -05005495 if (ret < sizeof (size_buf))
5496 return -ERANGE;
Alex Elder9d475de2012-07-03 16:01:19 -05005497
Josh Durginc3545572013-08-28 17:08:10 -07005498 if (order) {
Alex Elderc86f86e2013-04-25 15:09:41 -05005499 *order = size_buf.order;
Josh Durginc3545572013-08-28 17:08:10 -07005500 dout(" order %u", (unsigned int)*order);
5501 }
Alex Elder9d475de2012-07-03 16:01:19 -05005502 *snap_size = le64_to_cpu(size_buf.size);
5503
Josh Durginc3545572013-08-28 17:08:10 -07005504 dout(" snap_id 0x%016llx snap_size = %llu\n",
5505 (unsigned long long)snap_id,
Alex Elder57385b52013-04-21 12:14:45 -05005506 (unsigned long long)*snap_size);
Alex Elder9d475de2012-07-03 16:01:19 -05005507
5508 return 0;
5509}
5510
5511static int rbd_dev_v2_image_size(struct rbd_device *rbd_dev)
5512{
5513 return _rbd_dev_v2_snap_size(rbd_dev, CEPH_NOSNAP,
5514 &rbd_dev->header.obj_order,
5515 &rbd_dev->header.image_size);
5516}
5517
Alex Elder1e130192012-07-03 16:01:19 -05005518static int rbd_dev_v2_object_prefix(struct rbd_device *rbd_dev)
5519{
Dongsheng Yang5435d2062019-08-09 07:05:27 +00005520 size_t size;
Alex Elder1e130192012-07-03 16:01:19 -05005521 void *reply_buf;
5522 int ret;
5523 void *p;
5524
Dongsheng Yang5435d2062019-08-09 07:05:27 +00005525 /* Response will be an encoded string, which includes a length */
5526 size = sizeof(__le32) + RBD_OBJ_PREFIX_LEN_MAX;
5527 reply_buf = kzalloc(size, GFP_KERNEL);
Alex Elder1e130192012-07-03 16:01:19 -05005528 if (!reply_buf)
5529 return -ENOMEM;
5530
Ilya Dryomovecd4a682017-01-25 18:16:21 +01005531 ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
5532 &rbd_dev->header_oloc, "get_object_prefix",
Dongsheng Yang5435d2062019-08-09 07:05:27 +00005533 NULL, 0, reply_buf, size);
Alex Elder36be9a72013-01-19 00:30:28 -06005534 dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
Alex Elder1e130192012-07-03 16:01:19 -05005535 if (ret < 0)
5536 goto out;
5537
5538 p = reply_buf;
5539 rbd_dev->header.object_prefix = ceph_extract_encoded_string(&p,
Alex Elder57385b52013-04-21 12:14:45 -05005540 p + ret, NULL, GFP_NOIO);
5541 ret = 0;
Alex Elder1e130192012-07-03 16:01:19 -05005542
5543 if (IS_ERR(rbd_dev->header.object_prefix)) {
5544 ret = PTR_ERR(rbd_dev->header.object_prefix);
5545 rbd_dev->header.object_prefix = NULL;
5546 } else {
5547 dout(" object_prefix = %s\n", rbd_dev->header.object_prefix);
5548 }
Alex Elder1e130192012-07-03 16:01:19 -05005549out:
5550 kfree(reply_buf);
5551
5552 return ret;
5553}
5554
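/*
 * Fetch the feature bits for the given snapshot (or for the base image
 * if snap_id is CEPH_NOSNAP).  Fails with -ENXIO if the image uses
 * incompatible features that this driver does not support.
 */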
Alex Elderb1b54022012-07-03 16:01:19 -05005555static int _rbd_dev_v2_snap_features(struct rbd_device *rbd_dev, u64 snap_id,
Ilya Dryomov196e2d62019-11-05 15:38:46 +01005556 bool read_only, u64 *snap_features)
Alex Elderb1b54022012-07-03 16:01:19 -05005557{
Ilya Dryomov196e2d62019-11-05 15:38:46 +01005558 struct {
5559 __le64 snap_id;
5560 u8 read_only;
5561 } features_in;
Alex Elderb1b54022012-07-03 16:01:19 -05005562 struct {
5563 __le64 features;
5564 __le64 incompat;
Alex Elder41579762013-04-21 12:14:45 -05005565 } __attribute__ ((packed)) features_buf = { 0 };
Ilya Dryomovd3767f02016-04-13 14:15:50 +02005566 u64 unsup;
Alex Elderb1b54022012-07-03 16:01:19 -05005567 int ret;
5568
Ilya Dryomov196e2d62019-11-05 15:38:46 +01005569 features_in.snap_id = cpu_to_le64(snap_id);
5570 features_in.read_only = read_only;
5571
Ilya Dryomovecd4a682017-01-25 18:16:21 +01005572 ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
5573 &rbd_dev->header_oloc, "get_features",
Ilya Dryomov196e2d62019-11-05 15:38:46 +01005574 &features_in, sizeof(features_in),
Ilya Dryomovecd4a682017-01-25 18:16:21 +01005575 &features_buf, sizeof(features_buf));
Alex Elder36be9a72013-01-19 00:30:28 -06005576 dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
Alex Elderb1b54022012-07-03 16:01:19 -05005577 if (ret < 0)
5578 return ret;
Alex Elder57385b52013-04-21 12:14:45 -05005579 if (ret < sizeof (features_buf))
5580 return -ERANGE;
Alex Elderd8891402012-10-09 13:50:17 -07005581
Ilya Dryomovd3767f02016-04-13 14:15:50 +02005582 unsup = le64_to_cpu(features_buf.incompat) & ~RBD_FEATURES_SUPPORTED;
5583 if (unsup) {
5584 rbd_warn(rbd_dev, "image uses unsupported features: 0x%llx",
5585 unsup);
Alex Elderb8f5c6e2012-11-01 08:39:26 -05005586 return -ENXIO;
Ilya Dryomovd3767f02016-04-13 14:15:50 +02005587 }
Alex Elderd8891402012-10-09 13:50:17 -07005588
Alex Elderb1b54022012-07-03 16:01:19 -05005589 *snap_features = le64_to_cpu(features_buf.features);
5590
5591 dout(" snap_id 0x%016llx features = 0x%016llx incompat = 0x%016llx\n",
Alex Elder57385b52013-04-21 12:14:45 -05005592 (unsigned long long)snap_id,
5593 (unsigned long long)*snap_features,
5594 (unsigned long long)le64_to_cpu(features_buf.incompat));
Alex Elderb1b54022012-07-03 16:01:19 -05005595
5596 return 0;
5597}
5598
5599static int rbd_dev_v2_features(struct rbd_device *rbd_dev)
5600{
5601 return _rbd_dev_v2_snap_features(rbd_dev, CEPH_NOSNAP,
Ilya Dryomov196e2d62019-11-05 15:38:46 +01005602 rbd_is_ro(rbd_dev),
5603 &rbd_dev->header.features);
Alex Elderb1b54022012-07-03 16:01:19 -05005604}
5605
Ilya Dryomov22e8bd52019-06-05 19:25:11 +02005606/*
 * These are generic image flags, but since they are used only for the
 * object map, store them in rbd_dev->object_map_flags.
5609 *
5610 * For the same reason, this function is called only on object map
5611 * (re)load and not on header refresh.
5612 */
5613static int rbd_dev_v2_get_flags(struct rbd_device *rbd_dev)
5614{
5615 __le64 snapid = cpu_to_le64(rbd_dev->spec->snap_id);
5616 __le64 flags;
5617 int ret;
5618
5619 ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
5620 &rbd_dev->header_oloc, "get_flags",
5621 &snapid, sizeof(snapid),
5622 &flags, sizeof(flags));
5623 if (ret < 0)
5624 return ret;
5625 if (ret < sizeof(flags))
5626 return -EBADMSG;
5627
5628 rbd_dev->object_map_flags = le64_to_cpu(flags);
5629 return 0;
5630}
5631
Ilya Dryomoveb3b2d62018-08-22 17:11:27 +02005632struct parent_image_info {
5633 u64 pool_id;
Ilya Dryomove92c0ea2018-08-22 17:26:10 +02005634 const char *pool_ns;
Ilya Dryomoveb3b2d62018-08-22 17:11:27 +02005635 const char *image_id;
5636 u64 snap_id;
5637
Ilya Dryomove92c0ea2018-08-22 17:26:10 +02005638 bool has_overlap;
Ilya Dryomoveb3b2d62018-08-22 17:11:27 +02005639 u64 overlap;
5640};
5641
5642/*
5643 * The caller is responsible for @pii.
5644 */
Ilya Dryomove92c0ea2018-08-22 17:26:10 +02005645static int decode_parent_image_spec(void **p, void *end,
5646 struct parent_image_info *pii)
5647{
5648 u8 struct_v;
5649 u32 struct_len;
5650 int ret;
5651
5652 ret = ceph_start_decoding(p, end, 1, "ParentImageSpec",
5653 &struct_v, &struct_len);
5654 if (ret)
5655 return ret;
5656
5657 ceph_decode_64_safe(p, end, pii->pool_id, e_inval);
5658 pii->pool_ns = ceph_extract_encoded_string(p, end, NULL, GFP_KERNEL);
5659 if (IS_ERR(pii->pool_ns)) {
5660 ret = PTR_ERR(pii->pool_ns);
5661 pii->pool_ns = NULL;
5662 return ret;
5663 }
5664 pii->image_id = ceph_extract_encoded_string(p, end, NULL, GFP_KERNEL);
5665 if (IS_ERR(pii->image_id)) {
5666 ret = PTR_ERR(pii->image_id);
5667 pii->image_id = NULL;
5668 return ret;
5669 }
5670 ceph_decode_64_safe(p, end, pii->snap_id, e_inval);
5671 return 0;
5672
5673e_inval:
5674 return -EINVAL;
5675}
5676
5677static int __get_parent_info(struct rbd_device *rbd_dev,
5678 struct page *req_page,
5679 struct page *reply_page,
5680 struct parent_image_info *pii)
5681{
5682 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
5683 size_t reply_len = PAGE_SIZE;
5684 void *p, *end;
5685 int ret;
5686
5687 ret = ceph_osdc_call(osdc, &rbd_dev->header_oid, &rbd_dev->header_oloc,
5688 "rbd", "parent_get", CEPH_OSD_FLAG_READ,
Ilya Dryomov68ada912019-06-14 18:16:51 +02005689 req_page, sizeof(u64), &reply_page, &reply_len);
Ilya Dryomove92c0ea2018-08-22 17:26:10 +02005690 if (ret)
5691 return ret == -EOPNOTSUPP ? 1 : ret;
5692
5693 p = page_address(reply_page);
5694 end = p + reply_len;
5695 ret = decode_parent_image_spec(&p, end, pii);
5696 if (ret)
5697 return ret;
5698
5699 ret = ceph_osdc_call(osdc, &rbd_dev->header_oid, &rbd_dev->header_oloc,
5700 "rbd", "parent_overlap_get", CEPH_OSD_FLAG_READ,
Ilya Dryomov68ada912019-06-14 18:16:51 +02005701 req_page, sizeof(u64), &reply_page, &reply_len);
Ilya Dryomove92c0ea2018-08-22 17:26:10 +02005702 if (ret)
5703 return ret;
5704
5705 p = page_address(reply_page);
5706 end = p + reply_len;
5707 ceph_decode_8_safe(&p, end, pii->has_overlap, e_inval);
5708 if (pii->has_overlap)
5709 ceph_decode_64_safe(&p, end, pii->overlap, e_inval);
5710
5711 return 0;
5712
5713e_inval:
5714 return -EINVAL;
5715}
5716
5717/*
5718 * The caller is responsible for @pii.
5719 */
Ilya Dryomoveb3b2d62018-08-22 17:11:27 +02005720static int __get_parent_info_legacy(struct rbd_device *rbd_dev,
5721 struct page *req_page,
5722 struct page *reply_page,
5723 struct parent_image_info *pii)
5724{
5725 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
5726 size_t reply_len = PAGE_SIZE;
5727 void *p, *end;
5728 int ret;
5729
5730 ret = ceph_osdc_call(osdc, &rbd_dev->header_oid, &rbd_dev->header_oloc,
5731 "rbd", "get_parent", CEPH_OSD_FLAG_READ,
Ilya Dryomov68ada912019-06-14 18:16:51 +02005732 req_page, sizeof(u64), &reply_page, &reply_len);
Ilya Dryomoveb3b2d62018-08-22 17:11:27 +02005733 if (ret)
5734 return ret;
5735
5736 p = page_address(reply_page);
5737 end = p + reply_len;
5738 ceph_decode_64_safe(&p, end, pii->pool_id, e_inval);
5739 pii->image_id = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL);
5740 if (IS_ERR(pii->image_id)) {
5741 ret = PTR_ERR(pii->image_id);
5742 pii->image_id = NULL;
5743 return ret;
5744 }
5745 ceph_decode_64_safe(&p, end, pii->snap_id, e_inval);
Ilya Dryomove92c0ea2018-08-22 17:26:10 +02005746 pii->has_overlap = true;
Ilya Dryomoveb3b2d62018-08-22 17:11:27 +02005747 ceph_decode_64_safe(&p, end, pii->overlap, e_inval);
5748
5749 return 0;
5750
5751e_inval:
5752 return -EINVAL;
5753}
5754
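/*
 * Fetch parent image information, preferring the "parent_get" and
 * "parent_overlap_get" class methods and falling back to the legacy
 * "get_parent" method if the OSDs do not support them.
 */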
5755static int get_parent_info(struct rbd_device *rbd_dev,
5756 struct parent_image_info *pii)
5757{
5758 struct page *req_page, *reply_page;
5759 void *p;
5760 int ret;
5761
5762 req_page = alloc_page(GFP_KERNEL);
5763 if (!req_page)
5764 return -ENOMEM;
5765
5766 reply_page = alloc_page(GFP_KERNEL);
5767 if (!reply_page) {
5768 __free_page(req_page);
5769 return -ENOMEM;
5770 }
5771
5772 p = page_address(req_page);
5773 ceph_encode_64(&p, rbd_dev->spec->snap_id);
Ilya Dryomove92c0ea2018-08-22 17:26:10 +02005774 ret = __get_parent_info(rbd_dev, req_page, reply_page, pii);
5775 if (ret > 0)
5776 ret = __get_parent_info_legacy(rbd_dev, req_page, reply_page,
5777 pii);
Ilya Dryomoveb3b2d62018-08-22 17:11:27 +02005778
5779 __free_page(req_page);
5780 __free_page(reply_page);
5781 return ret;
5782}
5783
Alex Elder86b00e02012-10-25 23:34:42 -05005784static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev)
5785{
5786 struct rbd_spec *parent_spec;
Ilya Dryomoveb3b2d62018-08-22 17:11:27 +02005787 struct parent_image_info pii = { 0 };
Alex Elder86b00e02012-10-25 23:34:42 -05005788 int ret;
5789
5790 parent_spec = rbd_spec_alloc();
5791 if (!parent_spec)
5792 return -ENOMEM;
5793
Ilya Dryomoveb3b2d62018-08-22 17:11:27 +02005794 ret = get_parent_info(rbd_dev, &pii);
5795 if (ret)
Alex Elder86b00e02012-10-25 23:34:42 -05005796 goto out_err;
5797
Ilya Dryomove92c0ea2018-08-22 17:26:10 +02005798 dout("%s pool_id %llu pool_ns %s image_id %s snap_id %llu has_overlap %d overlap %llu\n",
5799 __func__, pii.pool_id, pii.pool_ns, pii.image_id, pii.snap_id,
5800 pii.has_overlap, pii.overlap);
Ilya Dryomoveb3b2d62018-08-22 17:11:27 +02005801
Ilya Dryomove92c0ea2018-08-22 17:26:10 +02005802 if (pii.pool_id == CEPH_NOPOOL || !pii.has_overlap) {
Alex Elder392a9da2013-05-06 17:40:33 -05005803 /*
		 * Either the parent never existed, or we have a
		 * record of it but the image got flattened so it no
5806 * longer has a parent. When the parent of a
5807 * layered image disappears we immediately set the
5808 * overlap to 0. The effect of this is that all new
5809 * requests will be treated as if the image had no
5810 * parent.
Ilya Dryomove92c0ea2018-08-22 17:26:10 +02005811 *
5812 * If !pii.has_overlap, the parent image spec is not
5813 * applicable. It's there to avoid duplication in each
5814 * snapshot record.
Alex Elder392a9da2013-05-06 17:40:33 -05005815 */
5816 if (rbd_dev->parent_overlap) {
5817 rbd_dev->parent_overlap = 0;
Alex Elder392a9da2013-05-06 17:40:33 -05005818 rbd_dev_parent_put(rbd_dev);
5819 pr_info("%s: clone image has been flattened\n",
5820 rbd_dev->disk->disk_name);
5821 }
5822
Alex Elder86b00e02012-10-25 23:34:42 -05005823 goto out; /* No parent? No problem. */
Alex Elder392a9da2013-05-06 17:40:33 -05005824 }
Alex Elder86b00e02012-10-25 23:34:42 -05005825
Alex Elder0903e872012-11-14 12:25:19 -06005826 /* The ceph file layout needs to fit pool id in 32 bits */
5827
5828 ret = -EIO;
Ilya Dryomoveb3b2d62018-08-22 17:11:27 +02005829 if (pii.pool_id > (u64)U32_MAX) {
Ilya Dryomov9584d502014-07-11 12:11:20 +04005830 rbd_warn(NULL, "parent pool id too large (%llu > %u)",
Ilya Dryomoveb3b2d62018-08-22 17:11:27 +02005831 (unsigned long long)pii.pool_id, U32_MAX);
Alex Elder57385b52013-04-21 12:14:45 -05005832 goto out_err;
Alex Elderc0cd10db2013-04-26 09:43:47 -05005833 }
Alex Elder0903e872012-11-14 12:25:19 -06005834
Alex Elder3b5cf2a2013-05-29 11:18:59 -05005835 /*
	 * The parent won't change (except when the clone is
	 * flattened, which was handled above).  So we only need to
	 * record the parent spec if we have not already done so.
5839 */
5840 if (!rbd_dev->parent_spec) {
Ilya Dryomoveb3b2d62018-08-22 17:11:27 +02005841 parent_spec->pool_id = pii.pool_id;
Ilya Dryomove92c0ea2018-08-22 17:26:10 +02005842 if (pii.pool_ns && *pii.pool_ns) {
5843 parent_spec->pool_ns = pii.pool_ns;
5844 pii.pool_ns = NULL;
5845 }
Ilya Dryomoveb3b2d62018-08-22 17:11:27 +02005846 parent_spec->image_id = pii.image_id;
5847 pii.image_id = NULL;
5848 parent_spec->snap_id = pii.snap_id;
Ilya Dryomovb26c0472018-07-03 15:28:43 +02005849
Alex Elder70cf49c2013-05-06 17:40:33 -05005850 rbd_dev->parent_spec = parent_spec;
5851 parent_spec = NULL; /* rbd_dev now owns this */
Alex Elder3b5cf2a2013-05-29 11:18:59 -05005852 }
5853
5854 /*
Ilya Dryomovcf32bd92015-01-19 22:57:39 +03005855 * We always update the parent overlap. If it's zero we issue
5856 * a warning, as we will proceed as if there was no parent.
Alex Elder3b5cf2a2013-05-29 11:18:59 -05005857 */
Ilya Dryomoveb3b2d62018-08-22 17:11:27 +02005858 if (!pii.overlap) {
Alex Elder3b5cf2a2013-05-29 11:18:59 -05005859 if (parent_spec) {
Ilya Dryomovcf32bd92015-01-19 22:57:39 +03005860 /* refresh, careful to warn just once */
5861 if (rbd_dev->parent_overlap)
5862 rbd_warn(rbd_dev,
5863 "clone now standalone (overlap became 0)");
Alex Elder3b5cf2a2013-05-29 11:18:59 -05005864 } else {
Ilya Dryomovcf32bd92015-01-19 22:57:39 +03005865 /* initial probe */
5866 rbd_warn(rbd_dev, "clone is standalone (overlap 0)");
Alex Elder3b5cf2a2013-05-29 11:18:59 -05005867 }
Alex Elder70cf49c2013-05-06 17:40:33 -05005868 }
Ilya Dryomoveb3b2d62018-08-22 17:11:27 +02005869 rbd_dev->parent_overlap = pii.overlap;
Ilya Dryomovcf32bd92015-01-19 22:57:39 +03005870
Alex Elder86b00e02012-10-25 23:34:42 -05005871out:
5872 ret = 0;
5873out_err:
Ilya Dryomove92c0ea2018-08-22 17:26:10 +02005874 kfree(pii.pool_ns);
Ilya Dryomoveb3b2d62018-08-22 17:11:27 +02005875 kfree(pii.image_id);
Alex Elder86b00e02012-10-25 23:34:42 -05005876 rbd_spec_put(parent_spec);
Alex Elder86b00e02012-10-25 23:34:42 -05005877 return ret;
5878}
5879
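/* Fetch the image's stripe unit and stripe count. */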
Alex Eldercc070d52013-04-21 12:14:45 -05005880static int rbd_dev_v2_striping_info(struct rbd_device *rbd_dev)
5881{
5882 struct {
5883 __le64 stripe_unit;
5884 __le64 stripe_count;
5885 } __attribute__ ((packed)) striping_info_buf = { 0 };
5886 size_t size = sizeof (striping_info_buf);
5887 void *p;
Alex Eldercc070d52013-04-21 12:14:45 -05005888 int ret;
5889
Ilya Dryomovecd4a682017-01-25 18:16:21 +01005890 ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
5891 &rbd_dev->header_oloc, "get_stripe_unit_count",
5892 NULL, 0, &striping_info_buf, size);
Alex Eldercc070d52013-04-21 12:14:45 -05005893 dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
5894 if (ret < 0)
5895 return ret;
5896 if (ret < size)
5897 return -ERANGE;
5898
Alex Eldercc070d52013-04-21 12:14:45 -05005899 p = &striping_info_buf;
Ilya Dryomovb1331852018-02-07 12:09:12 +01005900 rbd_dev->header.stripe_unit = ceph_decode_64(&p);
5901 rbd_dev->header.stripe_count = ceph_decode_64(&p);
Alex Eldercc070d52013-04-21 12:14:45 -05005902 return 0;
5903}
5904
Ilya Dryomov7e973322017-01-25 18:16:22 +01005905static int rbd_dev_v2_data_pool(struct rbd_device *rbd_dev)
5906{
5907 __le64 data_pool_id;
5908 int ret;
5909
5910 ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
5911 &rbd_dev->header_oloc, "get_data_pool",
5912 NULL, 0, &data_pool_id, sizeof(data_pool_id));
5913 if (ret < 0)
5914 return ret;
5915 if (ret < sizeof(data_pool_id))
5916 return -EBADMSG;
5917
5918 rbd_dev->header.data_pool_id = le64_to_cpu(data_pool_id);
5919 WARN_ON(rbd_dev->header.data_pool_id == CEPH_NOPOOL);
5920 return 0;
5921}
5922
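/*
 * Look up the image name for rbd_dev's image id in the pool's rbd
 * directory object.  Returns a dynamically allocated name, or NULL if
 * it could not be found (callers tolerate that).
 */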
Alex Elder9e15b772012-10-30 19:40:33 -05005923static char *rbd_dev_image_name(struct rbd_device *rbd_dev)
5924{
Ilya Dryomovecd4a682017-01-25 18:16:21 +01005925 CEPH_DEFINE_OID_ONSTACK(oid);
Alex Elder9e15b772012-10-30 19:40:33 -05005926 size_t image_id_size;
5927 char *image_id;
5928 void *p;
5929 void *end;
5930 size_t size;
5931 void *reply_buf = NULL;
5932 size_t len = 0;
5933 char *image_name = NULL;
5934 int ret;
5935
5936 rbd_assert(!rbd_dev->spec->image_name);
5937
Alex Elder69e7a022012-11-01 08:39:26 -05005938 len = strlen(rbd_dev->spec->image_id);
5939 image_id_size = sizeof (__le32) + len;
Alex Elder9e15b772012-10-30 19:40:33 -05005940 image_id = kmalloc(image_id_size, GFP_KERNEL);
5941 if (!image_id)
5942 return NULL;
5943
5944 p = image_id;
Alex Elder41579762013-04-21 12:14:45 -05005945 end = image_id + image_id_size;
Alex Elder57385b52013-04-21 12:14:45 -05005946 ceph_encode_string(&p, end, rbd_dev->spec->image_id, (u32)len);
Alex Elder9e15b772012-10-30 19:40:33 -05005947
5948 size = sizeof (__le32) + RBD_IMAGE_NAME_LEN_MAX;
5949 reply_buf = kmalloc(size, GFP_KERNEL);
5950 if (!reply_buf)
5951 goto out;
5952
Ilya Dryomovecd4a682017-01-25 18:16:21 +01005953 ceph_oid_printf(&oid, "%s", RBD_DIRECTORY);
5954 ret = rbd_obj_method_sync(rbd_dev, &oid, &rbd_dev->header_oloc,
5955 "dir_get_name", image_id, image_id_size,
5956 reply_buf, size);
Alex Elder9e15b772012-10-30 19:40:33 -05005957 if (ret < 0)
5958 goto out;
5959 p = reply_buf;
Alex Elderf40eb342013-04-25 15:09:42 -05005960 end = reply_buf + ret;
5961
Alex Elder9e15b772012-10-30 19:40:33 -05005962 image_name = ceph_extract_encoded_string(&p, end, &len, GFP_KERNEL);
5963 if (IS_ERR(image_name))
5964 image_name = NULL;
5965 else
5966 dout("%s: name is %s len is %zd\n", __func__, image_name, len);
5967out:
5968 kfree(reply_buf);
5969 kfree(image_id);
5970
5971 return image_name;
5972}
5973
Alex Elder2ad3d712013-04-30 00:44:33 -05005974static u64 rbd_v1_snap_id_by_name(struct rbd_device *rbd_dev, const char *name)
5975{
5976 struct ceph_snap_context *snapc = rbd_dev->header.snapc;
5977 const char *snap_name;
5978 u32 which = 0;
5979
5980 /* Skip over names until we find the one we are looking for */
5981
5982 snap_name = rbd_dev->header.snap_names;
5983 while (which < snapc->num_snaps) {
5984 if (!strcmp(name, snap_name))
5985 return snapc->snaps[which];
5986 snap_name += strlen(snap_name) + 1;
5987 which++;
5988 }
5989 return CEPH_NOSNAP;
5990}
5991
5992static u64 rbd_v2_snap_id_by_name(struct rbd_device *rbd_dev, const char *name)
5993{
5994 struct ceph_snap_context *snapc = rbd_dev->header.snapc;
5995 u32 which;
5996 bool found = false;
5997 u64 snap_id;
5998
5999 for (which = 0; !found && which < snapc->num_snaps; which++) {
6000 const char *snap_name;
6001
6002 snap_id = snapc->snaps[which];
6003 snap_name = rbd_dev_v2_snap_name(rbd_dev, snap_id);
Josh Durginefadc982013-08-29 19:16:42 -07006004 if (IS_ERR(snap_name)) {
6005 /* ignore no-longer existing snapshots */
6006 if (PTR_ERR(snap_name) == -ENOENT)
6007 continue;
6008 else
6009 break;
6010 }
Alex Elder2ad3d712013-04-30 00:44:33 -05006011 found = !strcmp(name, snap_name);
6012 kfree(snap_name);
6013 }
6014 return found ? snap_id : CEPH_NOSNAP;
6015}
6016
6017/*
6018 * Assumes name is never RBD_SNAP_HEAD_NAME; returns CEPH_NOSNAP if
6019 * no snapshot by that name is found, or if an error occurs.
6020 */
6021static u64 rbd_snap_id_by_name(struct rbd_device *rbd_dev, const char *name)
6022{
6023 if (rbd_dev->image_format == 1)
6024 return rbd_v1_snap_id_by_name(rbd_dev, name);
6025
6026 return rbd_v2_snap_id_by_name(rbd_dev, name);
6027}
6028
Alex Elder9e15b772012-10-30 19:40:33 -05006029/*
Ilya Dryomov04077592014-07-23 17:11:20 +04006030 * An image being mapped will have everything but the snap id.
Alex Elder9e15b772012-10-30 19:40:33 -05006031 */
Ilya Dryomov04077592014-07-23 17:11:20 +04006032static int rbd_spec_fill_snap_id(struct rbd_device *rbd_dev)
6033{
6034 struct rbd_spec *spec = rbd_dev->spec;
6035
6036 rbd_assert(spec->pool_id != CEPH_NOPOOL && spec->pool_name);
6037 rbd_assert(spec->image_id && spec->image_name);
6038 rbd_assert(spec->snap_name);
6039
6040 if (strcmp(spec->snap_name, RBD_SNAP_HEAD_NAME)) {
6041 u64 snap_id;
6042
6043 snap_id = rbd_snap_id_by_name(rbd_dev, spec->snap_name);
6044 if (snap_id == CEPH_NOSNAP)
6045 return -ENOENT;
6046
6047 spec->snap_id = snap_id;
6048 } else {
6049 spec->snap_id = CEPH_NOSNAP;
6050 }
6051
6052 return 0;
6053}
6054
6055/*
6056 * A parent image will have all ids but none of the names.
6057 *
6058 * All names in an rbd spec are dynamically allocated. It's OK if we
6059 * can't figure out the name for an image id.
6060 */
6061static int rbd_spec_fill_names(struct rbd_device *rbd_dev)
Alex Elder9e15b772012-10-30 19:40:33 -05006062{
Alex Elder2e9f7f12013-04-26 09:43:48 -05006063 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
6064 struct rbd_spec *spec = rbd_dev->spec;
6065 const char *pool_name;
6066 const char *image_name;
6067 const char *snap_name;
Alex Elder9e15b772012-10-30 19:40:33 -05006068 int ret;
6069
Ilya Dryomov04077592014-07-23 17:11:20 +04006070 rbd_assert(spec->pool_id != CEPH_NOPOOL);
6071 rbd_assert(spec->image_id);
6072 rbd_assert(spec->snap_id != CEPH_NOSNAP);
Alex Elder9e15b772012-10-30 19:40:33 -05006073
Alex Elder2e9f7f12013-04-26 09:43:48 -05006074 /* Get the pool name; we have to make our own copy of this */
Alex Elder9e15b772012-10-30 19:40:33 -05006075
Alex Elder2e9f7f12013-04-26 09:43:48 -05006076 pool_name = ceph_pg_pool_name_by_id(osdc->osdmap, spec->pool_id);
6077 if (!pool_name) {
6078 rbd_warn(rbd_dev, "no pool with id %llu", spec->pool_id);
Alex Elder935dc892012-11-01 10:17:15 -05006079 return -EIO;
6080 }
Alex Elder2e9f7f12013-04-26 09:43:48 -05006081 pool_name = kstrdup(pool_name, GFP_KERNEL);
6082 if (!pool_name)
Alex Elder9e15b772012-10-30 19:40:33 -05006083 return -ENOMEM;
6084
6085 /* Fetch the image name; tolerate failure here */
6086
Alex Elder2e9f7f12013-04-26 09:43:48 -05006087 image_name = rbd_dev_image_name(rbd_dev);
6088 if (!image_name)
Alex Elder06ecc6c2012-11-01 10:17:15 -05006089 rbd_warn(rbd_dev, "unable to get image name");
Alex Elder9e15b772012-10-30 19:40:33 -05006090
Ilya Dryomov04077592014-07-23 17:11:20 +04006091 /* Fetch the snapshot name */
Alex Elder9e15b772012-10-30 19:40:33 -05006092
Alex Elder2e9f7f12013-04-26 09:43:48 -05006093 snap_name = rbd_snap_name(rbd_dev, spec->snap_id);
Josh Durginda6a6b62013-09-04 17:57:31 -07006094 if (IS_ERR(snap_name)) {
6095 ret = PTR_ERR(snap_name);
Alex Elder9e15b772012-10-30 19:40:33 -05006096 goto out_err;
Alex Elder2e9f7f12013-04-26 09:43:48 -05006097 }
6098
6099 spec->pool_name = pool_name;
6100 spec->image_name = image_name;
6101 spec->snap_name = snap_name;
Alex Elder9e15b772012-10-30 19:40:33 -05006102
6103 return 0;
Ilya Dryomov04077592014-07-23 17:11:20 +04006104
Alex Elder9e15b772012-10-30 19:40:33 -05006105out_err:
Alex Elder2e9f7f12013-04-26 09:43:48 -05006106 kfree(image_name);
6107 kfree(pool_name);
Alex Elder9e15b772012-10-30 19:40:33 -05006108 return ret;
6109}
6110
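/*
 * Fetch the image's snapshot context (the seq value and the array of
 * snapshot ids) and install it in place of the current one.
 */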
Alex Eldercc4a38bd2013-04-30 00:44:33 -05006111static int rbd_dev_v2_snap_context(struct rbd_device *rbd_dev)
Alex Elder35d489f2012-07-03 16:01:19 -05006112{
6113 size_t size;
6114 int ret;
6115 void *reply_buf;
6116 void *p;
6117 void *end;
6118 u64 seq;
6119 u32 snap_count;
6120 struct ceph_snap_context *snapc;
6121 u32 i;
6122
6123 /*
6124 * We'll need room for the seq value (maximum snapshot id),
6125 * snapshot count, and array of that many snapshot ids.
6126 * For now we have a fixed upper limit on the number we're
6127 * prepared to receive.
6128 */
6129 size = sizeof (__le64) + sizeof (__le32) +
6130 RBD_MAX_SNAP_COUNT * sizeof (__le64);
6131 reply_buf = kzalloc(size, GFP_KERNEL);
6132 if (!reply_buf)
6133 return -ENOMEM;
6134
Ilya Dryomovecd4a682017-01-25 18:16:21 +01006135 ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
6136 &rbd_dev->header_oloc, "get_snapcontext",
6137 NULL, 0, reply_buf, size);
Alex Elder36be9a72013-01-19 00:30:28 -06006138 dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
Alex Elder35d489f2012-07-03 16:01:19 -05006139 if (ret < 0)
6140 goto out;
6141
Alex Elder35d489f2012-07-03 16:01:19 -05006142 p = reply_buf;
Alex Elder57385b52013-04-21 12:14:45 -05006143 end = reply_buf + ret;
6144 ret = -ERANGE;
Alex Elder35d489f2012-07-03 16:01:19 -05006145 ceph_decode_64_safe(&p, end, seq, out);
6146 ceph_decode_32_safe(&p, end, snap_count, out);
6147
6148 /*
6149 * Make sure the reported number of snapshot ids wouldn't go
6150 * beyond the end of our buffer. But before checking that,
6151 * make sure the computed size of the snapshot context we
6152 * allocate is representable in a size_t.
6153 */
6154 if (snap_count > (SIZE_MAX - sizeof (struct ceph_snap_context))
6155 / sizeof (u64)) {
6156 ret = -EINVAL;
6157 goto out;
6158 }
6159 if (!ceph_has_room(&p, end, snap_count * sizeof (__le64)))
6160 goto out;
Alex Elder468521c2013-04-26 09:43:47 -05006161 ret = 0;
Alex Elder35d489f2012-07-03 16:01:19 -05006162
Alex Elder812164f82013-04-30 00:44:32 -05006163 snapc = ceph_create_snap_context(snap_count, GFP_KERNEL);
Alex Elder35d489f2012-07-03 16:01:19 -05006164 if (!snapc) {
6165 ret = -ENOMEM;
6166 goto out;
6167 }
Alex Elder35d489f2012-07-03 16:01:19 -05006168 snapc->seq = seq;
Alex Elder35d489f2012-07-03 16:01:19 -05006169 for (i = 0; i < snap_count; i++)
6170 snapc->snaps[i] = ceph_decode_64(&p);
6171
Alex Elder49ece552013-05-06 08:37:00 -05006172 ceph_put_snap_context(rbd_dev->header.snapc);
Alex Elder35d489f2012-07-03 16:01:19 -05006173 rbd_dev->header.snapc = snapc;
6174
6175 dout(" snap context seq = %llu, snap_count = %u\n",
Alex Elder57385b52013-04-21 12:14:45 -05006176 (unsigned long long)seq, (unsigned int)snap_count);
Alex Elder35d489f2012-07-03 16:01:19 -05006177out:
6178 kfree(reply_buf);
6179
Alex Elder57385b52013-04-21 12:14:45 -05006180 return ret;
Alex Elder35d489f2012-07-03 16:01:19 -05006181}
6182
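/*
 * Fetch the name of the snapshot with the given id.  Returns a
 * dynamically allocated string, or an ERR_PTR() on failure.
 */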
Alex Elder54cac612013-04-30 00:44:33 -05006183static const char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev,
6184 u64 snap_id)
Alex Elderb8b1e2d2012-07-03 16:01:19 -05006185{
6186 size_t size;
6187 void *reply_buf;
Alex Elder54cac612013-04-30 00:44:33 -05006188 __le64 snapid;
Alex Elderb8b1e2d2012-07-03 16:01:19 -05006189 int ret;
6190 void *p;
6191 void *end;
Alex Elderb8b1e2d2012-07-03 16:01:19 -05006192 char *snap_name;
6193
6194 size = sizeof (__le32) + RBD_MAX_SNAP_NAME_LEN;
6195 reply_buf = kmalloc(size, GFP_KERNEL);
6196 if (!reply_buf)
6197 return ERR_PTR(-ENOMEM);
6198
Alex Elder54cac612013-04-30 00:44:33 -05006199 snapid = cpu_to_le64(snap_id);
Ilya Dryomovecd4a682017-01-25 18:16:21 +01006200 ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
6201 &rbd_dev->header_oloc, "get_snapshot_name",
6202 &snapid, sizeof(snapid), reply_buf, size);
Alex Elder36be9a72013-01-19 00:30:28 -06006203 dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
Alex Elderf40eb342013-04-25 15:09:42 -05006204 if (ret < 0) {
6205 snap_name = ERR_PTR(ret);
Alex Elderb8b1e2d2012-07-03 16:01:19 -05006206 goto out;
Alex Elderf40eb342013-04-25 15:09:42 -05006207 }
Alex Elderb8b1e2d2012-07-03 16:01:19 -05006208
6209 p = reply_buf;
Alex Elderf40eb342013-04-25 15:09:42 -05006210 end = reply_buf + ret;
Alex Eldere5c35532012-10-25 23:34:41 -05006211 snap_name = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL);
Alex Elderf40eb342013-04-25 15:09:42 -05006212 if (IS_ERR(snap_name))
Alex Elderb8b1e2d2012-07-03 16:01:19 -05006213 goto out;
Alex Elderb8b1e2d2012-07-03 16:01:19 -05006214
Alex Elderf40eb342013-04-25 15:09:42 -05006215 dout(" snap_id 0x%016llx snap_name = %s\n",
Alex Elder54cac612013-04-30 00:44:33 -05006216 (unsigned long long)snap_id, snap_name);
Alex Elderb8b1e2d2012-07-03 16:01:19 -05006217out:
6218 kfree(reply_buf);
6219
Alex Elderf40eb342013-04-25 15:09:42 -05006220 return snap_name;
Alex Elderb8b1e2d2012-07-03 16:01:19 -05006221}
6222
Alex Elder2df3fac2013-05-06 09:51:30 -05006223static int rbd_dev_v2_header_info(struct rbd_device *rbd_dev)
Alex Elder117973f2012-08-31 17:29:55 -05006224{
Alex Elder2df3fac2013-05-06 09:51:30 -05006225 bool first_time = rbd_dev->header.object_prefix == NULL;
Alex Elder117973f2012-08-31 17:29:55 -05006226 int ret;
Alex Elder117973f2012-08-31 17:29:55 -05006227
Josh Durgin1617e402013-06-12 14:43:10 -07006228 ret = rbd_dev_v2_image_size(rbd_dev);
6229 if (ret)
Alex Eldercfbf6372013-05-31 17:40:45 -05006230 return ret;
Josh Durgin1617e402013-06-12 14:43:10 -07006231
Alex Elder2df3fac2013-05-06 09:51:30 -05006232 if (first_time) {
6233 ret = rbd_dev_v2_header_onetime(rbd_dev);
6234 if (ret)
Alex Eldercfbf6372013-05-31 17:40:45 -05006235 return ret;
Alex Elder2df3fac2013-05-06 09:51:30 -05006236 }
6237
Alex Eldercc4a38bd2013-04-30 00:44:33 -05006238 ret = rbd_dev_v2_snap_context(rbd_dev);
Ilya Dryomovd194cd12015-08-31 18:22:10 +03006239 if (ret && first_time) {
6240 kfree(rbd_dev->header.object_prefix);
6241 rbd_dev->header.object_prefix = NULL;
6242 }
Alex Elder117973f2012-08-31 17:29:55 -05006243
6244 return ret;
6245}
6246
Ilya Dryomova720ae02014-07-23 17:11:19 +04006247static int rbd_dev_header_info(struct rbd_device *rbd_dev)
6248{
6249 rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
6250
6251 if (rbd_dev->image_format == 1)
6252 return rbd_dev_v1_header_info(rbd_dev);
6253
6254 return rbd_dev_v2_header_info(rbd_dev);
6255}
6256
Alex Elder1ddbe942012-01-29 13:57:44 -06006257/*
Alex Eldere28fff262012-02-02 08:13:30 -06006258 * Skips over white space at *buf, and updates *buf to point to the
6259 * first found non-space character (if any). Returns the length of
Alex Elder593a9e72012-02-07 12:03:37 -06006260 * the token (string of non-white space characters) found. Note
6261 * that *buf must be terminated with '\0'.
Alex Eldere28fff262012-02-02 08:13:30 -06006262 */
6263static inline size_t next_token(const char **buf)
6264{
6265 /*
6266 * These are the characters that produce nonzero for
6267 * isspace() in the "C" and "POSIX" locales.
6268 */
6269 const char *spaces = " \f\n\r\t\v";
6270
6271 *buf += strspn(*buf, spaces); /* Find start of token */
6272
6273 return strcspn(*buf, spaces); /* Return token length */
6274}
6275
6276/*
Alex Elderea3352f2012-07-09 21:04:23 -05006277 * Finds the next token in *buf, dynamically allocates a buffer big
6278 * enough to hold a copy of it, and copies the token into the new
6279 * buffer. The copy is guaranteed to be terminated with '\0'. Note
6280 * that a duplicate buffer is created even for a zero-length token.
6281 *
6282 * Returns a pointer to the newly-allocated duplicate, or a null
6283 * pointer if memory for the duplicate was not available. If
6284 * the lenp argument is a non-null pointer, the length of the token
6285 * (not including the '\0') is returned in *lenp.
6286 *
6287 * If successful, the *buf pointer will be updated to point beyond
6288 * the end of the found token.
6289 *
6290 * Note: uses GFP_KERNEL for allocation.
6291 */
6292static inline char *dup_token(const char **buf, size_t *lenp)
6293{
6294 char *dup;
6295 size_t len;
6296
6297 len = next_token(buf);
Alex Elder4caf35f2012-11-01 08:39:27 -05006298 dup = kmemdup(*buf, len + 1, GFP_KERNEL);
Alex Elderea3352f2012-07-09 21:04:23 -05006299 if (!dup)
6300 return NULL;
Alex Elderea3352f2012-07-09 21:04:23 -05006301 *(dup + len) = '\0';
6302 *buf += len;
6303
6304 if (lenp)
6305 *lenp = len;
6306
6307 return dup;
6308}
6309
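/*
 * Parse a single map option.  Options that libceph does not recognize
 * (ceph_parse_param() returns -ENOPARAM for those) are matched against
 * the rbd-specific parameter table, e.g. "queue_depth=128" or
 * "lock_on_read".
 */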
David Howells82995cc2019-03-25 16:38:32 +00006310static int rbd_parse_param(struct fs_parameter *param,
6311 struct rbd_parse_opts_ctx *pctx)
6312{
6313 struct rbd_options *opt = pctx->opts;
6314 struct fs_parse_result result;
Al Viro3fbb8d52019-12-20 23:43:32 -05006315 struct p_log log = {.prefix = "rbd"};
David Howells82995cc2019-03-25 16:38:32 +00006316 int token, ret;
6317
6318 ret = ceph_parse_param(param, pctx->copts, NULL);
6319 if (ret != -ENOPARAM)
6320 return ret;
6321
Al Virod7167b12019-09-07 07:23:15 -04006322 token = __fs_parse(&log, rbd_parameters, param, &result);
David Howells82995cc2019-03-25 16:38:32 +00006323 dout("%s fs_parse '%s' token %d\n", __func__, param->key, token);
6324 if (token < 0) {
Al Viro2c3f3dc2019-12-20 23:43:32 -05006325 if (token == -ENOPARAM)
6326 return inval_plog(&log, "Unknown parameter '%s'",
6327 param->key);
David Howells82995cc2019-03-25 16:38:32 +00006328 return token;
6329 }
6330
6331 switch (token) {
6332 case Opt_queue_depth:
6333 if (result.uint_32 < 1)
6334 goto out_of_range;
6335 opt->queue_depth = result.uint_32;
6336 break;
6337 case Opt_alloc_size:
6338 if (result.uint_32 < SECTOR_SIZE)
6339 goto out_of_range;
Al Viro2c3f3dc2019-12-20 23:43:32 -05006340 if (!is_power_of_2(result.uint_32))
6341 return inval_plog(&log, "alloc_size must be a power of 2");
David Howells82995cc2019-03-25 16:38:32 +00006342 opt->alloc_size = result.uint_32;
6343 break;
6344 case Opt_lock_timeout:
6345 /* 0 is "wait forever" (i.e. infinite timeout) */
6346 if (result.uint_32 > INT_MAX / 1000)
6347 goto out_of_range;
6348 opt->lock_timeout = msecs_to_jiffies(result.uint_32 * 1000);
6349 break;
6350 case Opt_pool_ns:
6351 kfree(pctx->spec->pool_ns);
6352 pctx->spec->pool_ns = param->string;
6353 param->string = NULL;
6354 break;
Ilya Dryomovdc1dad82020-05-29 20:51:23 +02006355 case Opt_compression_hint:
6356 switch (result.uint_32) {
6357 case Opt_compression_hint_none:
6358 opt->alloc_hint_flags &=
6359 ~(CEPH_OSD_ALLOC_HINT_FLAG_COMPRESSIBLE |
6360 CEPH_OSD_ALLOC_HINT_FLAG_INCOMPRESSIBLE);
6361 break;
6362 case Opt_compression_hint_compressible:
6363 opt->alloc_hint_flags |=
6364 CEPH_OSD_ALLOC_HINT_FLAG_COMPRESSIBLE;
6365 opt->alloc_hint_flags &=
6366 ~CEPH_OSD_ALLOC_HINT_FLAG_INCOMPRESSIBLE;
6367 break;
6368 case Opt_compression_hint_incompressible:
6369 opt->alloc_hint_flags |=
6370 CEPH_OSD_ALLOC_HINT_FLAG_INCOMPRESSIBLE;
6371 opt->alloc_hint_flags &=
6372 ~CEPH_OSD_ALLOC_HINT_FLAG_COMPRESSIBLE;
6373 break;
6374 default:
6375 BUG();
6376 }
6377 break;
David Howells82995cc2019-03-25 16:38:32 +00006378 case Opt_read_only:
6379 opt->read_only = true;
6380 break;
6381 case Opt_read_write:
6382 opt->read_only = false;
6383 break;
6384 case Opt_lock_on_read:
6385 opt->lock_on_read = true;
6386 break;
6387 case Opt_exclusive:
6388 opt->exclusive = true;
6389 break;
6390 case Opt_notrim:
6391 opt->trim = false;
6392 break;
6393 default:
6394 BUG();
6395 }
6396
6397 return 0;
6398
6399out_of_range:
Al Viro2c3f3dc2019-12-20 23:43:32 -05006400 return inval_plog(&log, "%s out of range", param->key);
David Howells82995cc2019-03-25 16:38:32 +00006401}
6402
6403/*
6404 * This duplicates most of generic_parse_monolithic(), untying it from
6405 * fs_context and skipping standard superblock and security options.
6406 */
6407static int rbd_parse_options(char *options, struct rbd_parse_opts_ctx *pctx)
6408{
6409 char *key;
6410 int ret = 0;
6411
6412 dout("%s '%s'\n", __func__, options);
6413 while ((key = strsep(&options, ",")) != NULL) {
6414 if (*key) {
6415 struct fs_parameter param = {
6416 .key = key,
Al Viro0f895892019-12-17 14:15:04 -05006417 .type = fs_value_is_flag,
David Howells82995cc2019-03-25 16:38:32 +00006418 };
6419 char *value = strchr(key, '=');
6420 size_t v_len = 0;
6421
6422 if (value) {
6423 if (value == key)
6424 continue;
6425 *value++ = 0;
6426 v_len = strlen(value);
David Howells82995cc2019-03-25 16:38:32 +00006427 param.string = kmemdup_nul(value, v_len,
6428 GFP_KERNEL);
6429 if (!param.string)
6430 return -ENOMEM;
Al Viro0f895892019-12-17 14:15:04 -05006431 param.type = fs_value_is_string;
David Howells82995cc2019-03-25 16:38:32 +00006432 }
6433 param.size = v_len;
6434
6435 ret = rbd_parse_param(&param, pctx);
6436 kfree(param.string);
6437 if (ret)
6438 break;
6439 }
6440 }
6441
6442 return ret;
6443}
6444
Alex Elderea3352f2012-07-09 21:04:23 -05006445/*
Alex Elder859c31d2012-10-25 23:34:42 -05006446 * Parse the options provided for an "rbd add" (i.e., rbd image
6447 * mapping) request. These arrive via a write to /sys/bus/rbd/add,
6448 * and the data written is passed here via a NUL-terminated buffer.
6449 * Returns 0 if successful or an error code otherwise.
Alex Elderd22f76e2012-07-12 10:46:35 -05006450 *
Alex Elder859c31d2012-10-25 23:34:42 -05006451 * The information extracted from these options is recorded in
6452 * the other parameters which return dynamically-allocated
6453 * structures:
6454 * ceph_opts
6455 * The address of a pointer that will refer to a ceph options
6456 * structure. Caller must release the returned pointer using
6457 * ceph_destroy_options() when it is no longer needed.
6458 * rbd_opts
6459 * Address of an rbd options pointer. Fully initialized by
6460 * this function; caller must release with kfree().
6461 * spec
6462 * Address of an rbd image specification pointer. Fully
6463 * initialized by this function based on parsed options.
6464 * Caller must release with rbd_spec_put().
6465 *
6466 * The options passed take this form:
 *  <mon_addrs> <options> <pool_name> <image_name> [<snap_name>]
6468 * where:
6469 * <mon_addrs>
6470 * A comma-separated list of one or more monitor addresses.
6471 * A monitor address is an ip address, optionally followed
6472 * by a port number (separated by a colon).
6473 * I.e.: ip1[:port1][,ip2[:port2]...]
6474 * <options>
6475 * A comma-separated list of ceph and/or rbd options.
6476 * <pool_name>
6477 * The name of the rados pool containing the rbd image.
6478 * <image_name>
6479 * The name of the image in that pool to map.
 *  <snap_name>
 *	An optional snapshot name.  If provided, the mapping will
 *	present data from the image at the time that snapshot was
 *	created.  The image head is used if no snapshot name is
 *	provided.  Snapshot mappings are always read-only.
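 *
 * For example, a hypothetical mapping of image "foo" in pool "rbd" at
 * its head might be requested with:
 *	1.2.3.4:6789,1.2.3.5:6789 name=admin,queue_depth=128 rbd foo -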
Alex Eldera725f65e2012-02-02 08:13:30 -06006485 */
Alex Elder859c31d2012-10-25 23:34:42 -05006486static int rbd_add_parse_args(const char *buf,
Alex Elderdc79b112012-10-25 23:34:41 -05006487 struct ceph_options **ceph_opts,
Alex Elder859c31d2012-10-25 23:34:42 -05006488 struct rbd_options **opts,
6489 struct rbd_spec **rbd_spec)
Alex Eldera725f65e2012-02-02 08:13:30 -06006490{
Alex Elderd22f76e2012-07-12 10:46:35 -05006491 size_t len;
Alex Elder859c31d2012-10-25 23:34:42 -05006492 char *options;
Alex Elder0ddebc02012-10-25 23:34:41 -05006493 const char *mon_addrs;
Alex Elderecb4dc22013-04-26 09:43:47 -05006494 char *snap_name;
Alex Elder0ddebc02012-10-25 23:34:41 -05006495 size_t mon_addrs_size;
David Howells82995cc2019-03-25 16:38:32 +00006496 struct rbd_parse_opts_ctx pctx = { 0 };
Alex Elderdc79b112012-10-25 23:34:41 -05006497 int ret;
Alex Eldere28fff262012-02-02 08:13:30 -06006498
6499 /* The first four tokens are required */
6500
Alex Elder7ef32142012-02-02 08:13:30 -06006501 len = next_token(&buf);
Alex Elder4fb5d6712012-11-01 10:17:15 -05006502 if (!len) {
6503 rbd_warn(NULL, "no monitor address(es) provided");
6504 return -EINVAL;
6505 }
Alex Elder0ddebc02012-10-25 23:34:41 -05006506 mon_addrs = buf;
David Howells82995cc2019-03-25 16:38:32 +00006507 mon_addrs_size = len;
Alex Elder7ef32142012-02-02 08:13:30 -06006508 buf += len;
Alex Eldera725f65e2012-02-02 08:13:30 -06006509
Alex Elderdc79b112012-10-25 23:34:41 -05006510 ret = -EINVAL;
Alex Elderf28e5652012-10-25 23:34:41 -05006511 options = dup_token(&buf, NULL);
6512 if (!options)
Alex Elderdc79b112012-10-25 23:34:41 -05006513 return -ENOMEM;
Alex Elder4fb5d6712012-11-01 10:17:15 -05006514 if (!*options) {
6515 rbd_warn(NULL, "no options provided");
6516 goto out_err;
6517 }
Alex Eldera725f65e2012-02-02 08:13:30 -06006518
Ilya Dryomovc3001562018-07-03 15:28:43 +02006519 pctx.spec = rbd_spec_alloc();
6520 if (!pctx.spec)
Alex Elderf28e5652012-10-25 23:34:41 -05006521 goto out_mem;
Alex Elder859c31d2012-10-25 23:34:42 -05006522
Ilya Dryomovc3001562018-07-03 15:28:43 +02006523 pctx.spec->pool_name = dup_token(&buf, NULL);
6524 if (!pctx.spec->pool_name)
Alex Elder859c31d2012-10-25 23:34:42 -05006525 goto out_mem;
Ilya Dryomovc3001562018-07-03 15:28:43 +02006526 if (!*pctx.spec->pool_name) {
Alex Elder4fb5d6712012-11-01 10:17:15 -05006527 rbd_warn(NULL, "no pool name provided");
6528 goto out_err;
6529 }
Alex Eldere28fff262012-02-02 08:13:30 -06006530
Ilya Dryomovc3001562018-07-03 15:28:43 +02006531 pctx.spec->image_name = dup_token(&buf, NULL);
6532 if (!pctx.spec->image_name)
Alex Elderf28e5652012-10-25 23:34:41 -05006533 goto out_mem;
Ilya Dryomovc3001562018-07-03 15:28:43 +02006534 if (!*pctx.spec->image_name) {
Alex Elder4fb5d6712012-11-01 10:17:15 -05006535 rbd_warn(NULL, "no image name provided");
6536 goto out_err;
6537 }
Alex Eldere28fff262012-02-02 08:13:30 -06006538
Alex Elderf28e5652012-10-25 23:34:41 -05006539 /*
6540 * Snapshot name is optional; default is to use "-"
6541 * (indicating the head/no snapshot).
6542 */
Alex Elder3feeb8942012-08-31 17:29:52 -05006543 len = next_token(&buf);
Alex Elder820a5f32012-07-09 21:04:24 -05006544 if (!len) {
Alex Elder3feeb8942012-08-31 17:29:52 -05006545 buf = RBD_SNAP_HEAD_NAME; /* No snapshot supplied */
6546 len = sizeof (RBD_SNAP_HEAD_NAME) - 1;
Alex Elderf28e5652012-10-25 23:34:41 -05006547 } else if (len > RBD_MAX_SNAP_NAME_LEN) {
Alex Elderdc79b112012-10-25 23:34:41 -05006548 ret = -ENAMETOOLONG;
Alex Elderf28e5652012-10-25 23:34:41 -05006549 goto out_err;
Alex Elder849b4262012-07-09 21:04:24 -05006550 }
Alex Elderecb4dc22013-04-26 09:43:47 -05006551 snap_name = kmemdup(buf, len + 1, GFP_KERNEL);
6552 if (!snap_name)
Alex Elderf28e5652012-10-25 23:34:41 -05006553 goto out_mem;
Alex Elderecb4dc22013-04-26 09:43:47 -05006554 *(snap_name + len) = '\0';
Ilya Dryomovc3001562018-07-03 15:28:43 +02006555 pctx.spec->snap_name = snap_name;
Alex Eldere5c35532012-10-25 23:34:41 -05006556
David Howells82995cc2019-03-25 16:38:32 +00006557 pctx.copts = ceph_alloc_options();
6558 if (!pctx.copts)
6559 goto out_mem;
6560
Alex Elder0ddebc02012-10-25 23:34:41 -05006561 /* Initialize all rbd options to the defaults */
Alex Eldere28fff262012-02-02 08:13:30 -06006562
Ilya Dryomovc3001562018-07-03 15:28:43 +02006563 pctx.opts = kzalloc(sizeof(*pctx.opts), GFP_KERNEL);
6564 if (!pctx.opts)
Alex Elder4e9afeb2012-10-25 23:34:41 -05006565 goto out_mem;
6566
Ilya Dryomovc3001562018-07-03 15:28:43 +02006567 pctx.opts->read_only = RBD_READ_ONLY_DEFAULT;
6568 pctx.opts->queue_depth = RBD_QUEUE_DEPTH_DEFAULT;
Ilya Dryomov0c93e1b2019-01-30 15:14:48 +01006569 pctx.opts->alloc_size = RBD_ALLOC_SIZE_DEFAULT;
Ilya Dryomovc3001562018-07-03 15:28:43 +02006570 pctx.opts->lock_timeout = RBD_LOCK_TIMEOUT_DEFAULT;
6571 pctx.opts->lock_on_read = RBD_LOCK_ON_READ_DEFAULT;
6572 pctx.opts->exclusive = RBD_EXCLUSIVE_DEFAULT;
6573 pctx.opts->trim = RBD_TRIM_DEFAULT;
Alex Elderd22f76e2012-07-12 10:46:35 -05006574
David Howells82995cc2019-03-25 16:38:32 +00006575 ret = ceph_parse_mon_ips(mon_addrs, mon_addrs_size, pctx.copts, NULL);
6576 if (ret)
Alex Elderdc79b112012-10-25 23:34:41 -05006577 goto out_err;
Alex Elder859c31d2012-10-25 23:34:42 -05006578
David Howells82995cc2019-03-25 16:38:32 +00006579 ret = rbd_parse_options(options, &pctx);
6580 if (ret)
6581 goto out_err;
6582
6583 *ceph_opts = pctx.copts;
Ilya Dryomovc3001562018-07-03 15:28:43 +02006584 *opts = pctx.opts;
6585 *rbd_spec = pctx.spec;
David Howells82995cc2019-03-25 16:38:32 +00006586 kfree(options);
Alex Elderdc79b112012-10-25 23:34:41 -05006587 return 0;
David Howells82995cc2019-03-25 16:38:32 +00006588
Alex Elderf28e5652012-10-25 23:34:41 -05006589out_mem:
Alex Elderdc79b112012-10-25 23:34:41 -05006590 ret = -ENOMEM;
Alex Elderd22f76e2012-07-12 10:46:35 -05006591out_err:
Ilya Dryomovc3001562018-07-03 15:28:43 +02006592 kfree(pctx.opts);
David Howells82995cc2019-03-25 16:38:32 +00006593 ceph_destroy_options(pctx.copts);
Ilya Dryomovc3001562018-07-03 15:28:43 +02006594 rbd_spec_put(pctx.spec);
Alex Elderf28e5652012-10-25 23:34:41 -05006595 kfree(options);
Alex Elderdc79b112012-10-25 23:34:41 -05006596 return ret;
Alex Eldera725f65e2012-02-02 08:13:30 -06006597}
6598
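/* Release the exclusive lock if this client currently owns it. */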
Ilya Dryomove010dd02017-04-13 12:17:39 +02006599static void rbd_dev_image_unlock(struct rbd_device *rbd_dev)
6600{
6601 down_write(&rbd_dev->lock_rwsem);
6602 if (__rbd_is_lock_owner(rbd_dev))
Ilya Dryomove1fddc82019-05-30 16:07:48 +02006603 __rbd_release_lock(rbd_dev);
Ilya Dryomove010dd02017-04-13 12:17:39 +02006604 up_write(&rbd_dev->lock_rwsem);
6605}
6606
Ilya Dryomov637cd062019-06-06 17:14:49 +02006607/*
6608 * If the wait is interrupted, an error is returned even if the lock
6609 * was successfully acquired. rbd_dev_image_unlock() will release it
6610 * if needed.
6611 */
Ilya Dryomove010dd02017-04-13 12:17:39 +02006612static int rbd_add_acquire_lock(struct rbd_device *rbd_dev)
6613{
Ilya Dryomov637cd062019-06-06 17:14:49 +02006614 long ret;
Ilya Dryomov2f18d462018-04-04 10:15:38 +02006615
Ilya Dryomove010dd02017-04-13 12:17:39 +02006616 if (!(rbd_dev->header.features & RBD_FEATURE_EXCLUSIVE_LOCK)) {
Ilya Dryomov637cd062019-06-06 17:14:49 +02006617 if (!rbd_dev->opts->exclusive && !rbd_dev->opts->lock_on_read)
6618 return 0;
6619
Ilya Dryomove010dd02017-04-13 12:17:39 +02006620 rbd_warn(rbd_dev, "exclusive-lock feature is not enabled");
6621 return -EINVAL;
6622 }
6623
Ilya Dryomov3fe69922019-11-12 19:41:48 +01006624 if (rbd_is_ro(rbd_dev))
Ilya Dryomov637cd062019-06-06 17:14:49 +02006625 return 0;
6626
6627 rbd_assert(!rbd_is_lock_owner(rbd_dev));
6628 queue_delayed_work(rbd_dev->task_wq, &rbd_dev->lock_dwork, 0);
6629 ret = wait_for_completion_killable_timeout(&rbd_dev->acquire_wait,
6630 ceph_timeout_jiffies(rbd_dev->opts->lock_timeout));
Dongsheng Yang25e6be22019-09-27 15:33:22 +00006631 if (ret > 0) {
Ilya Dryomov637cd062019-06-06 17:14:49 +02006632 ret = rbd_dev->acquire_err;
Dongsheng Yang25e6be22019-09-27 15:33:22 +00006633 } else {
6634 cancel_delayed_work_sync(&rbd_dev->lock_dwork);
6635 if (!ret)
6636 ret = -ETIMEDOUT;
6637 }
Ilya Dryomov637cd062019-06-06 17:14:49 +02006638
Ilya Dryomov2f18d462018-04-04 10:15:38 +02006639 if (ret) {
Ilya Dryomov637cd062019-06-06 17:14:49 +02006640 rbd_warn(rbd_dev, "failed to acquire exclusive lock: %ld", ret);
6641 return ret;
Ilya Dryomove010dd02017-04-13 12:17:39 +02006642 }
6643
Ilya Dryomov637cd062019-06-06 17:14:49 +02006644 /*
6645 * The lock may have been released by now, unless automatic lock
6646 * transitions are disabled.
6647 */
6648 rbd_assert(!rbd_dev->opts->exclusive || rbd_is_lock_owner(rbd_dev));
Ilya Dryomove010dd02017-04-13 12:17:39 +02006649 return 0;
6650}
6651
Ilya Dryomov30ba1f02014-05-13 11:19:27 +04006652/*
Alex Elder589d30e2012-07-10 20:30:11 -05006653 * An rbd format 2 image has a unique identifier, distinct from the
6654 * name given to it by the user. Internally, that identifier is
6655 * what's used to specify the names of objects related to the image.
6656 *
6657 * A special "rbd id" object is used to map an rbd image name to its
6658 * id. If that object doesn't exist, then there is no v2 rbd image
6659 * with the supplied name.
6660 *
6661 * This function will record the given rbd_dev's image_id field if
6662 * it can be determined, and in that case will return 0. If any
6663 * errors occur a negative errno will be returned and the rbd_dev's
6664 * image_id field will be unchanged (and should be NULL).
6665 */
6666static int rbd_dev_image_id(struct rbd_device *rbd_dev)
6667{
6668 int ret;
6669 size_t size;
Ilya Dryomovecd4a682017-01-25 18:16:21 +01006670 CEPH_DEFINE_OID_ONSTACK(oid);
Alex Elder589d30e2012-07-10 20:30:11 -05006671 void *response;
Alex Elderc0fba362013-04-25 23:15:08 -05006672 char *image_id;
Alex Elder2f82ee52012-10-30 19:40:33 -05006673
Alex Elder589d30e2012-07-10 20:30:11 -05006674 /*
Alex Elder2c0d0a12012-10-30 19:40:33 -05006675 * When probing a parent image, the image id is already
6676 * known (and the image name likely is not). There's no
Alex Elderc0fba362013-04-25 23:15:08 -05006677 * need to fetch the image id again in this case. We
6678 * do still need to set the image format though.
Alex Elder2c0d0a12012-10-30 19:40:33 -05006679 */
Alex Elderc0fba362013-04-25 23:15:08 -05006680 if (rbd_dev->spec->image_id) {
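		/*
		 * An empty image_id string marks a format 1 image, which
		 * has no id; any other value is a format 2 image id.
		 */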
6681 rbd_dev->image_format = *rbd_dev->spec->image_id ? 2 : 1;
6682
Alex Elder2c0d0a12012-10-30 19:40:33 -05006683 return 0;
Alex Elderc0fba362013-04-25 23:15:08 -05006684 }
Alex Elder2c0d0a12012-10-30 19:40:33 -05006685
6686 /*
Alex Elder589d30e2012-07-10 20:30:11 -05006687	 * First, see if the format 2 image id object exists, and if
6688 * so, get the image's persistent id from it.
6689 */
Ilya Dryomovecd4a682017-01-25 18:16:21 +01006690 ret = ceph_oid_aprintf(&oid, GFP_KERNEL, "%s%s", RBD_ID_PREFIX,
6691 rbd_dev->spec->image_name);
6692 if (ret)
6693 return ret;
6694
6695 dout("rbd id object name is %s\n", oid.name);
Alex Elder589d30e2012-07-10 20:30:11 -05006696
6697 /* Response will be an encoded string, which includes a length */
Alex Elder589d30e2012-07-10 20:30:11 -05006698 size = sizeof (__le32) + RBD_IMAGE_ID_LEN_MAX;
6699 response = kzalloc(size, GFP_NOIO);
6700 if (!response) {
6701 ret = -ENOMEM;
6702 goto out;
6703 }
6704
Alex Elderc0fba362013-04-25 23:15:08 -05006705 /* If it doesn't exist we'll assume it's a format 1 image */
6706
Ilya Dryomovecd4a682017-01-25 18:16:21 +01006707 ret = rbd_obj_method_sync(rbd_dev, &oid, &rbd_dev->header_oloc,
6708 "get_id", NULL, 0,
Dongsheng Yang5435d2062019-08-09 07:05:27 +00006709 response, size);
Alex Elder36be9a72013-01-19 00:30:28 -06006710 dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
Alex Elderc0fba362013-04-25 23:15:08 -05006711 if (ret == -ENOENT) {
6712 image_id = kstrdup("", GFP_KERNEL);
6713 ret = image_id ? 0 : -ENOMEM;
6714 if (!ret)
6715 rbd_dev->image_format = 1;
Ilya Dryomov7dd440c2014-09-11 18:49:18 +04006716 } else if (ret >= 0) {
Alex Elderc0fba362013-04-25 23:15:08 -05006717 void *p = response;
Alex Elder589d30e2012-07-10 20:30:11 -05006718
Alex Elderc0fba362013-04-25 23:15:08 -05006719 image_id = ceph_extract_encoded_string(&p, p + ret,
Alex Elder979ed482012-11-01 08:39:26 -05006720 NULL, GFP_NOIO);
Duan Jiong461f7582014-04-11 16:38:12 +08006721 ret = PTR_ERR_OR_ZERO(image_id);
Alex Elderc0fba362013-04-25 23:15:08 -05006722 if (!ret)
6723 rbd_dev->image_format = 2;
Alex Elderc0fba362013-04-25 23:15:08 -05006724 }
6725
6726 if (!ret) {
6727 rbd_dev->spec->image_id = image_id;
6728 dout("image_id is %s\n", image_id);
Alex Elder589d30e2012-07-10 20:30:11 -05006729 }
6730out:
6731 kfree(response);
Ilya Dryomovecd4a682017-01-25 18:16:21 +01006732 ceph_oid_destroy(&oid);
Alex Elder589d30e2012-07-10 20:30:11 -05006733 return ret;
6734}
6735
Alex Elder3abef3b2013-05-13 20:35:37 -05006736/*
6737 * Undo whatever state changes were made by the v1 or v2 header info
6738 * call.
6739 */
Alex Elder6fd48b32013-04-28 23:32:34 -05006740static void rbd_dev_unprobe(struct rbd_device *rbd_dev)
6741{
6742 struct rbd_image_header *header;
6743
Ilya Dryomove69b8d42015-01-19 12:06:14 +03006744 rbd_dev_parent_put(rbd_dev);
Ilya Dryomov22e8bd52019-06-05 19:25:11 +02006745 rbd_object_map_free(rbd_dev);
Ilya Dryomovda5ef6be2019-06-17 15:29:49 +02006746 rbd_dev_mapping_clear(rbd_dev);
Alex Elder6fd48b32013-04-28 23:32:34 -05006747
6748 /* Free dynamic fields from the header, then zero it out */
6749
6750 header = &rbd_dev->header;
Alex Elder812164f82013-04-30 00:44:32 -05006751 ceph_put_snap_context(header->snapc);
Alex Elder6fd48b32013-04-28 23:32:34 -05006752 kfree(header->snap_sizes);
6753 kfree(header->snap_names);
6754 kfree(header->object_prefix);
6755 memset(header, 0, sizeof (*header));
6756}
6757
Alex Elder2df3fac2013-05-06 09:51:30 -05006758static int rbd_dev_v2_header_onetime(struct rbd_device *rbd_dev)
Alex Eldera30b71b2012-07-10 20:30:11 -05006759{
6760 int ret;
Alex Eldera30b71b2012-07-10 20:30:11 -05006761
Alex Elder1e130192012-07-03 16:01:19 -05006762 ret = rbd_dev_v2_object_prefix(rbd_dev);
Alex Elder57385b52013-04-21 12:14:45 -05006763 if (ret)
Alex Elder1e130192012-07-03 16:01:19 -05006764 goto out_err;
Alex Elderb1b54022012-07-03 16:01:19 -05006765
Alex Elder2df3fac2013-05-06 09:51:30 -05006766 /*
6767	 * Get and check the features for the image.  Currently the
6768 * features are assumed to never change.
6769 */
Alex Elderb1b54022012-07-03 16:01:19 -05006770 ret = rbd_dev_v2_features(rbd_dev);
Alex Elder57385b52013-04-21 12:14:45 -05006771 if (ret)
Alex Elderb1b54022012-07-03 16:01:19 -05006772 goto out_err;
Alex Elder35d489f2012-07-03 16:01:19 -05006773
Alex Eldercc070d52013-04-21 12:14:45 -05006774 /* If the image supports fancy striping, get its parameters */
6775
6776 if (rbd_dev->header.features & RBD_FEATURE_STRIPINGV2) {
6777 ret = rbd_dev_v2_striping_info(rbd_dev);
6778 if (ret < 0)
6779 goto out_err;
6780 }
Alex Eldera30b71b2012-07-10 20:30:11 -05006781
Ilya Dryomov7e973322017-01-25 18:16:22 +01006782 if (rbd_dev->header.features & RBD_FEATURE_DATA_POOL) {
6783 ret = rbd_dev_v2_data_pool(rbd_dev);
6784 if (ret)
6785 goto out_err;
6786 }
6787
Ilya Dryomov263423f2017-01-25 18:16:22 +01006788 rbd_init_layout(rbd_dev);
Alex Elder35152972012-08-31 17:29:55 -05006789 return 0;
Ilya Dryomov263423f2017-01-25 18:16:22 +01006790
Alex Elder9d475de2012-07-03 16:01:19 -05006791out_err:
Alex Elder642a2532013-05-06 17:40:33 -05006792 rbd_dev->header.features = 0;
Alex Elder1e130192012-07-03 16:01:19 -05006793 kfree(rbd_dev->header.object_prefix);
6794 rbd_dev->header.object_prefix = NULL;
Alex Elder9d475de2012-07-03 16:01:19 -05006795 return ret;
Alex Eldera30b71b2012-07-10 20:30:11 -05006796}
6797
Ilya Dryomov6d69bb532015-10-11 19:38:00 +02006798/*
6799 * @depth is rbd_dev_image_probe() -> rbd_dev_probe_parent() ->
6800 * rbd_dev_image_probe() recursion depth, which means it's also the
6801 * length of the already discovered part of the parent chain.
6802 */
6803static int rbd_dev_probe_parent(struct rbd_device *rbd_dev, int depth)
Alex Elder83a06262012-10-30 15:47:17 -05006804{
Alex Elder2f82ee52012-10-30 19:40:33 -05006805 struct rbd_device *parent = NULL;
Alex Elder124afba2013-04-26 15:44:36 -05006806 int ret;
6807
6808 if (!rbd_dev->parent_spec)
6809 return 0;
Alex Elder124afba2013-04-26 15:44:36 -05006810
Ilya Dryomov6d69bb532015-10-11 19:38:00 +02006811 if (++depth > RBD_MAX_PARENT_CHAIN_LEN) {
6812 pr_info("parent chain is too long (%d)\n", depth);
6813 ret = -EINVAL;
6814 goto out_err;
6815 }
6816
Ilya Dryomov1643dfa2016-08-12 15:45:52 +02006817 parent = __rbd_dev_create(rbd_dev->rbd_client, rbd_dev->parent_spec);
Ilya Dryomov1f2c6652015-10-11 19:38:00 +02006818 if (!parent) {
6819 ret = -ENOMEM;
Alex Elder124afba2013-04-26 15:44:36 -05006820 goto out_err;
Ilya Dryomov1f2c6652015-10-11 19:38:00 +02006821 }
6822
6823 /*
6824 * Images related by parent/child relationships always share
6825 * rbd_client and spec/parent_spec, so bump their refcounts.
6826 */
6827 __rbd_get_client(rbd_dev->rbd_client);
6828 rbd_spec_get(rbd_dev->parent_spec);
Alex Elder124afba2013-04-26 15:44:36 -05006829
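	/* A parent image is only ever read from, so map it read-only. */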
Ilya Dryomov39258aa2019-11-07 17:16:23 +01006830 __set_bit(RBD_DEV_FLAG_READONLY, &parent->flags);
6831
Ilya Dryomov6d69bb532015-10-11 19:38:00 +02006832 ret = rbd_dev_image_probe(parent, depth);
Alex Elder124afba2013-04-26 15:44:36 -05006833 if (ret < 0)
6834 goto out_err;
Ilya Dryomov1f2c6652015-10-11 19:38:00 +02006835
Alex Elder124afba2013-04-26 15:44:36 -05006836 rbd_dev->parent = parent;
Alex Eldera2acd002013-05-08 22:50:04 -05006837 atomic_set(&rbd_dev->parent_ref, 1);
Alex Elder124afba2013-04-26 15:44:36 -05006838 return 0;
Alex Elder124afba2013-04-26 15:44:36 -05006839
Ilya Dryomov1f2c6652015-10-11 19:38:00 +02006840out_err:
6841 rbd_dev_unparent(rbd_dev);
Markus Elfring1761b222015-11-23 20:16:45 +01006842 rbd_dev_destroy(parent);
Alex Elder124afba2013-04-26 15:44:36 -05006843 return ret;
6844}
6845
Ilya Dryomov5769ed02017-04-13 12:17:38 +02006846static void rbd_dev_device_release(struct rbd_device *rbd_dev)
6847{
6848 clear_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags);
Ilya Dryomov5769ed02017-04-13 12:17:38 +02006849 rbd_free_disk(rbd_dev);
6850 if (!single_major)
6851 unregister_blkdev(rbd_dev->major, rbd_dev->name);
6852}
6853
Ilya Dryomov811c6682016-04-15 16:22:16 +02006854/*
6855 * rbd_dev->header_rwsem must be locked for write and will be unlocked
6856 * upon return.
6857 */
Alex Elder200a6a82013-04-28 23:32:34 -05006858static int rbd_dev_device_setup(struct rbd_device *rbd_dev)
Alex Elder124afba2013-04-26 15:44:36 -05006859{
Alex Elder83a06262012-10-30 15:47:17 -05006860 int ret;
Alex Elder83a06262012-10-30 15:47:17 -05006861
Ilya Dryomov9b60e702013-12-13 15:28:57 +02006862 /* Record our major and minor device numbers. */
Alex Elder83a06262012-10-30 15:47:17 -05006863
Ilya Dryomov9b60e702013-12-13 15:28:57 +02006864 if (!single_major) {
6865 ret = register_blkdev(0, rbd_dev->name);
6866 if (ret < 0)
Ilya Dryomov1643dfa2016-08-12 15:45:52 +02006867 goto err_out_unlock;
Ilya Dryomov9b60e702013-12-13 15:28:57 +02006868
6869 rbd_dev->major = ret;
6870 rbd_dev->minor = 0;
6871 } else {
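		/*
		 * single_major mode: all images share rbd_major and the
		 * minor number is derived from the device id.
		 */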
6872 rbd_dev->major = rbd_major;
6873 rbd_dev->minor = rbd_dev_id_to_minor(rbd_dev->dev_id);
6874 }
Alex Elder83a06262012-10-30 15:47:17 -05006875
6876 /* Set up the blkdev mapping. */
6877
6878 ret = rbd_init_disk(rbd_dev);
6879 if (ret)
6880 goto err_out_blkdev;
6881
Alex Elderf35a4de2013-05-06 09:51:29 -05006882 set_capacity(rbd_dev->disk, rbd_dev->mapping.size / SECTOR_SIZE);
Ilya Dryomov39258aa2019-11-07 17:16:23 +01006883 set_disk_ro(rbd_dev->disk, rbd_is_ro(rbd_dev));
Alex Elderf35a4de2013-05-06 09:51:29 -05006884
Ilya Dryomov5769ed02017-04-13 12:17:38 +02006885 ret = dev_set_name(&rbd_dev->dev, "%d", rbd_dev->dev_id);
Alex Elderf35a4de2013-05-06 09:51:29 -05006886 if (ret)
Ilya Dryomovda5ef6be2019-06-17 15:29:49 +02006887 goto err_out_disk;
Alex Elder83a06262012-10-30 15:47:17 -05006888
Alex Elder129b79d2013-04-26 15:44:36 -05006889 set_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags);
Ilya Dryomov811c6682016-04-15 16:22:16 +02006890 up_write(&rbd_dev->header_rwsem);
Ilya Dryomov5769ed02017-04-13 12:17:38 +02006891 return 0;
Alex Elder2f82ee52012-10-30 19:40:33 -05006892
Alex Elder83a06262012-10-30 15:47:17 -05006893err_out_disk:
6894 rbd_free_disk(rbd_dev);
6895err_out_blkdev:
Ilya Dryomov9b60e702013-12-13 15:28:57 +02006896 if (!single_major)
6897 unregister_blkdev(rbd_dev->major, rbd_dev->name);
Ilya Dryomov811c6682016-04-15 16:22:16 +02006898err_out_unlock:
6899 up_write(&rbd_dev->header_rwsem);
Alex Elder83a06262012-10-30 15:47:17 -05006900 return ret;
6901}
6902
Alex Elder332bb122013-04-27 09:59:30 -05006903static int rbd_dev_header_name(struct rbd_device *rbd_dev)
6904{
6905 struct rbd_spec *spec = rbd_dev->spec;
Ilya Dryomovc41d13a2016-04-29 20:01:25 +02006906 int ret;
Alex Elder332bb122013-04-27 09:59:30 -05006907
6908 /* Record the header object name for this rbd image. */
6909
6910 rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
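	/*
	 * Format 1 uses "<image name>" RBD_SUFFIX, format 2 uses
	 * RBD_HEADER_PREFIX "<image id>".
	 */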
Alex Elder332bb122013-04-27 09:59:30 -05006911 if (rbd_dev->image_format == 1)
Ilya Dryomovc41d13a2016-04-29 20:01:25 +02006912 ret = ceph_oid_aprintf(&rbd_dev->header_oid, GFP_KERNEL, "%s%s",
6913 spec->image_name, RBD_SUFFIX);
Alex Elder332bb122013-04-27 09:59:30 -05006914 else
Ilya Dryomovc41d13a2016-04-29 20:01:25 +02006915 ret = ceph_oid_aprintf(&rbd_dev->header_oid, GFP_KERNEL, "%s%s",
6916 RBD_HEADER_PREFIX, spec->image_id);
Alex Elder332bb122013-04-27 09:59:30 -05006917
Ilya Dryomovc41d13a2016-04-29 20:01:25 +02006918 return ret;
Alex Elder332bb122013-04-27 09:59:30 -05006919}
6920
Ilya Dryomovb9ef2b82019-11-12 20:20:04 +01006921static void rbd_print_dne(struct rbd_device *rbd_dev, bool is_snap)
6922{
6923 if (!is_snap) {
6924 pr_info("image %s/%s%s%s does not exist\n",
6925 rbd_dev->spec->pool_name,
6926 rbd_dev->spec->pool_ns ?: "",
6927 rbd_dev->spec->pool_ns ? "/" : "",
6928 rbd_dev->spec->image_name);
6929 } else {
6930 pr_info("snap %s/%s%s%s@%s does not exist\n",
6931 rbd_dev->spec->pool_name,
6932 rbd_dev->spec->pool_ns ?: "",
6933 rbd_dev->spec->pool_ns ? "/" : "",
6934 rbd_dev->spec->image_name,
6935 rbd_dev->spec->snap_name);
6936 }
6937}
6938
Alex Elder200a6a82013-04-28 23:32:34 -05006939static void rbd_dev_image_release(struct rbd_device *rbd_dev)
6940{
Ilya Dryomovb8776052020-03-16 17:16:28 +01006941 if (!rbd_is_ro(rbd_dev))
Ilya Dryomovfd22aef2017-04-13 12:17:37 +02006942 rbd_unregister_watch(rbd_dev);
Ilya Dryomov952c48b2020-03-16 15:52:54 +01006943
6944 rbd_dev_unprobe(rbd_dev);
Alex Elder6fd48b32013-04-28 23:32:34 -05006945 rbd_dev->image_format = 0;
6946 kfree(rbd_dev->spec->image_id);
6947 rbd_dev->spec->image_id = NULL;
Alex Elder200a6a82013-04-28 23:32:34 -05006948}
6949
Alex Eldera30b71b2012-07-10 20:30:11 -05006950/*
6951 * Probe for the existence of the header object for the given rbd
Alex Elder1f3ef782013-05-06 17:40:33 -05006952 * device.  If this image is the one being mapped (i.e., not a
6953 * parent) and is mapped writable, initiate a watch on its header
6954 * object before using that object to get detailed image information.
Ilya Dryomov0e4e1de52020-03-13 11:20:51 +01006955 *
6956 * On success, returns with header_rwsem held for write if called
6957 * with @depth == 0.
Alex Eldera30b71b2012-07-10 20:30:11 -05006958 */
Ilya Dryomov6d69bb532015-10-11 19:38:00 +02006959static int rbd_dev_image_probe(struct rbd_device *rbd_dev, int depth)
Alex Eldera30b71b2012-07-10 20:30:11 -05006960{
Ilya Dryomovb9ef2b82019-11-12 20:20:04 +01006961 bool need_watch = !rbd_is_ro(rbd_dev);
Alex Eldera30b71b2012-07-10 20:30:11 -05006962 int ret;
6963
6964 /*
Alex Elder3abef3b2013-05-13 20:35:37 -05006965 * Get the id from the image id object. Unless there's an
6966 * error, rbd_dev->spec->image_id will be filled in with
6967 * a dynamically-allocated string, and rbd_dev->image_format
6968 * will be set to either 1 or 2.
Alex Eldera30b71b2012-07-10 20:30:11 -05006969 */
6970 ret = rbd_dev_image_id(rbd_dev);
6971 if (ret)
Alex Elderc0fba362013-04-25 23:15:08 -05006972 return ret;
Alex Elderc0fba362013-04-25 23:15:08 -05006973
Alex Elder332bb122013-04-27 09:59:30 -05006974 ret = rbd_dev_header_name(rbd_dev);
6975 if (ret)
6976 goto err_out_format;
6977
Ilya Dryomovb9ef2b82019-11-12 20:20:04 +01006978 if (need_watch) {
Ilya Dryomov99d16942016-08-12 16:11:41 +02006979 ret = rbd_register_watch(rbd_dev);
Ilya Dryomov1fe48022015-03-05 10:47:22 +03006980 if (ret) {
6981 if (ret == -ENOENT)
Ilya Dryomovb9ef2b82019-11-12 20:20:04 +01006982 rbd_print_dne(rbd_dev, false);
Ilya Dryomovc41d13a2016-04-29 20:01:25 +02006983 goto err_out_format;
Ilya Dryomov1fe48022015-03-05 10:47:22 +03006984 }
Alex Elder1f3ef782013-05-06 17:40:33 -05006985 }
Alex Elderb644de22013-04-27 09:59:31 -05006986
Ilya Dryomov0e4e1de52020-03-13 11:20:51 +01006987 if (!depth)
6988 down_write(&rbd_dev->header_rwsem);
6989
Ilya Dryomova720ae02014-07-23 17:11:19 +04006990 ret = rbd_dev_header_info(rbd_dev);
Ilya Dryomovb9ef2b82019-11-12 20:20:04 +01006991 if (ret) {
6992 if (ret == -ENOENT && !need_watch)
6993 rbd_print_dne(rbd_dev, false);
Ilya Dryomov952c48b2020-03-16 15:52:54 +01006994 goto err_out_probe;
Ilya Dryomovb9ef2b82019-11-12 20:20:04 +01006995 }
Alex Elder83a06262012-10-30 15:47:17 -05006996
Ilya Dryomov04077592014-07-23 17:11:20 +04006997 /*
6998 * If this image is the one being mapped, we have pool name and
6999 * id, image name and id, and snap name - need to fill snap id.
7000 * Otherwise this is a parent image, identified by pool, image
7001 * and snap ids - need to fill in names for those ids.
7002 */
Ilya Dryomov6d69bb532015-10-11 19:38:00 +02007003 if (!depth)
Ilya Dryomov04077592014-07-23 17:11:20 +04007004 ret = rbd_spec_fill_snap_id(rbd_dev);
7005 else
7006 ret = rbd_spec_fill_names(rbd_dev);
Ilya Dryomov1fe48022015-03-05 10:47:22 +03007007 if (ret) {
7008 if (ret == -ENOENT)
Ilya Dryomovb9ef2b82019-11-12 20:20:04 +01007009 rbd_print_dne(rbd_dev, true);
Alex Elder33dca392013-04-30 00:44:33 -05007010 goto err_out_probe;
Ilya Dryomov1fe48022015-03-05 10:47:22 +03007011 }
Alex Elder9bb81c92013-04-27 09:59:30 -05007012
Ilya Dryomovda5ef6be2019-06-17 15:29:49 +02007013 ret = rbd_dev_mapping_set(rbd_dev);
7014 if (ret)
7015 goto err_out_probe;
7016
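	/* A snapshot's object map never changes, so load it once here. */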
Ilya Dryomovf3c0e452019-11-07 16:22:10 +01007017 if (rbd_is_snap(rbd_dev) &&
Ilya Dryomov22e8bd52019-06-05 19:25:11 +02007018 (rbd_dev->header.features & RBD_FEATURE_OBJECT_MAP)) {
7019 ret = rbd_object_map_load(rbd_dev);
7020 if (ret)
7021 goto err_out_probe;
7022 }
7023
Ilya Dryomove8f59b52014-07-24 10:42:13 +04007024 if (rbd_dev->header.features & RBD_FEATURE_LAYERING) {
7025 ret = rbd_dev_v2_parent_info(rbd_dev);
7026 if (ret)
7027 goto err_out_probe;
Ilya Dryomove8f59b52014-07-24 10:42:13 +04007028 }
7029
Ilya Dryomov6d69bb532015-10-11 19:38:00 +02007030 ret = rbd_dev_probe_parent(rbd_dev, depth);
Alex Elder30d60ba2013-05-06 09:51:30 -05007031 if (ret)
7032 goto err_out_probe;
Alex Elder83a06262012-10-30 15:47:17 -05007033
Alex Elder30d60ba2013-05-06 09:51:30 -05007034 dout("discovered format %u image, header name is %s\n",
Ilya Dryomovc41d13a2016-04-29 20:01:25 +02007035 rbd_dev->image_format, rbd_dev->header_oid.name);
Alex Elder30d60ba2013-05-06 09:51:30 -05007036 return 0;
Ilya Dryomove8f59b52014-07-24 10:42:13 +04007037
Alex Elder6fd48b32013-04-28 23:32:34 -05007038err_out_probe:
Ilya Dryomov0e4e1de52020-03-13 11:20:51 +01007039 if (!depth)
7040 up_write(&rbd_dev->header_rwsem);
Ilya Dryomovb9ef2b82019-11-12 20:20:04 +01007041 if (need_watch)
Ilya Dryomov99d16942016-08-12 16:11:41 +02007042 rbd_unregister_watch(rbd_dev);
Ilya Dryomov952c48b2020-03-16 15:52:54 +01007043 rbd_dev_unprobe(rbd_dev);
Alex Elder332bb122013-04-27 09:59:30 -05007044err_out_format:
7045 rbd_dev->image_format = 0;
Alex Elder5655c4d2013-04-25 23:15:08 -05007046 kfree(rbd_dev->spec->image_id);
7047 rbd_dev->spec->image_id = NULL;
Alex Elder5655c4d2013-04-25 23:15:08 -05007048 return ret;
Alex Eldera30b71b2012-07-10 20:30:11 -05007049}
7050
Ilya Dryomov9b60e702013-12-13 15:28:57 +02007051static ssize_t do_rbd_add(struct bus_type *bus,
7052 const char *buf,
7053 size_t count)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07007054{
Alex Eldercb8627c2012-07-09 21:04:23 -05007055 struct rbd_device *rbd_dev = NULL;
Alex Elderdc79b112012-10-25 23:34:41 -05007056 struct ceph_options *ceph_opts = NULL;
Alex Elder4e9afeb2012-10-25 23:34:41 -05007057 struct rbd_options *rbd_opts = NULL;
Alex Elder859c31d2012-10-25 23:34:42 -05007058 struct rbd_spec *spec = NULL;
Alex Elder9d3997f2012-10-25 23:34:42 -05007059 struct rbd_client *rbdc;
Ilya Dryomovb51c83c2015-10-15 15:38:57 +02007060 int rc;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07007061
7062 if (!try_module_get(THIS_MODULE))
7063 return -ENODEV;
7064
Alex Eldera725f65e2012-02-02 08:13:30 -06007065 /* parse add command */
Alex Elder859c31d2012-10-25 23:34:42 -05007066 rc = rbd_add_parse_args(buf, &ceph_opts, &rbd_opts, &spec);
Alex Elderdc79b112012-10-25 23:34:41 -05007067 if (rc < 0)
Ilya Dryomovdd5ac322015-10-16 17:09:24 +02007068 goto out;
Alex Eldera725f65e2012-02-02 08:13:30 -06007069
Alex Elder9d3997f2012-10-25 23:34:42 -05007070 rbdc = rbd_get_client(ceph_opts);
7071 if (IS_ERR(rbdc)) {
7072 rc = PTR_ERR(rbdc);
Alex Elder0ddebc02012-10-25 23:34:41 -05007073 goto err_out_args;
Alex Elder9d3997f2012-10-25 23:34:42 -05007074 }
Yehuda Sadeh602adf42010-08-12 16:11:25 -07007075
Yehuda Sadeh602adf42010-08-12 16:11:25 -07007076 /* pick the pool */
Ilya Dryomovdd435852018-02-22 13:43:24 +01007077 rc = ceph_pg_poolid_by_name(rbdc->client->osdc.osdmap, spec->pool_name);
Ilya Dryomov1fe48022015-03-05 10:47:22 +03007078 if (rc < 0) {
7079 if (rc == -ENOENT)
7080 pr_info("pool %s does not exist\n", spec->pool_name);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07007081 goto err_out_client;
Ilya Dryomov1fe48022015-03-05 10:47:22 +03007082 }
Alex Elderc0cd10db2013-04-26 09:43:47 -05007083 spec->pool_id = (u64)rc;
Alex Elder859c31d2012-10-25 23:34:42 -05007084
Ilya Dryomovd1475432015-06-22 13:24:48 +03007085 rbd_dev = rbd_dev_create(rbdc, spec, rbd_opts);
Ilya Dryomovb51c83c2015-10-15 15:38:57 +02007086 if (!rbd_dev) {
7087 rc = -ENOMEM;
Alex Elderbd4ba652012-10-25 23:34:42 -05007088 goto err_out_client;
Ilya Dryomovb51c83c2015-10-15 15:38:57 +02007089 }
Alex Elderc53d5892012-10-25 23:34:42 -05007090 rbdc = NULL; /* rbd_dev now owns this */
7091 spec = NULL; /* rbd_dev now owns this */
Ilya Dryomovd1475432015-06-22 13:24:48 +03007092 rbd_opts = NULL; /* rbd_dev now owns this */
Yehuda Sadeh602adf42010-08-12 16:11:25 -07007093
Ilya Dryomov39258aa2019-11-07 17:16:23 +01007094	/* snapshot mappings and mappings requested read-only are read-only */
7095 if (rbd_dev->opts->read_only ||
7096 strcmp(rbd_dev->spec->snap_name, RBD_SNAP_HEAD_NAME))
7097 __set_bit(RBD_DEV_FLAG_READONLY, &rbd_dev->flags);
7098
Mike Christie0d6d1e9c2016-08-18 18:38:45 +02007099 rbd_dev->config_info = kstrdup(buf, GFP_KERNEL);
7100 if (!rbd_dev->config_info) {
7101 rc = -ENOMEM;
7102 goto err_out_rbd_dev;
7103 }
7104
Ilya Dryomov6d69bb532015-10-11 19:38:00 +02007105 rc = rbd_dev_image_probe(rbd_dev, 0);
Ilya Dryomov0e4e1de52020-03-13 11:20:51 +01007106 if (rc < 0)
Alex Elderc53d5892012-10-25 23:34:42 -05007107 goto err_out_rbd_dev;
Alex Elder05fd6f62012-08-29 17:11:07 -05007108
Ilya Dryomov0c93e1b2019-01-30 15:14:48 +01007109 if (rbd_dev->opts->alloc_size > rbd_dev->layout.object_size) {
7110 rbd_warn(rbd_dev, "alloc_size adjusted to %u",
7111 rbd_dev->layout.object_size);
7112 rbd_dev->opts->alloc_size = rbd_dev->layout.object_size;
7113 }
7114
Alex Elderb536f692013-04-28 23:32:34 -05007115 rc = rbd_dev_device_setup(rbd_dev);
Ilya Dryomovfd22aef2017-04-13 12:17:37 +02007116 if (rc)
Ilya Dryomov8b679ec2017-04-13 12:17:37 +02007117 goto err_out_image_probe;
Alex Elderb536f692013-04-28 23:32:34 -05007118
Ilya Dryomov637cd062019-06-06 17:14:49 +02007119 rc = rbd_add_acquire_lock(rbd_dev);
7120 if (rc)
7121 goto err_out_image_lock;
Alex Elderb536f692013-04-28 23:32:34 -05007122
Ilya Dryomov5769ed02017-04-13 12:17:38 +02007123 /* Everything's ready. Announce the disk to the world. */
7124
7125 rc = device_add(&rbd_dev->dev);
7126 if (rc)
Ilya Dryomove010dd02017-04-13 12:17:39 +02007127 goto err_out_image_lock;
Ilya Dryomov5769ed02017-04-13 12:17:38 +02007128
Hannes Reinecke33253222020-01-23 13:44:33 +01007129 device_add_disk(&rbd_dev->dev, rbd_dev->disk, NULL);
Ilya Dryomov5769ed02017-04-13 12:17:38 +02007130 /* see rbd_init_disk() */
7131 blk_put_queue(rbd_dev->disk->queue);
7132
7133 spin_lock(&rbd_dev_list_lock);
7134 list_add_tail(&rbd_dev->node, &rbd_dev_list);
7135 spin_unlock(&rbd_dev_list_lock);
7136
7137 pr_info("%s: capacity %llu features 0x%llx\n", rbd_dev->disk->disk_name,
7138 (unsigned long long)get_capacity(rbd_dev->disk) << SECTOR_SHIFT,
7139 rbd_dev->header.features);
Ilya Dryomovdd5ac322015-10-16 17:09:24 +02007140 rc = count;
7141out:
7142 module_put(THIS_MODULE);
7143 return rc;
Alex Elder3abef3b2013-05-13 20:35:37 -05007144
Ilya Dryomove010dd02017-04-13 12:17:39 +02007145err_out_image_lock:
7146 rbd_dev_image_unlock(rbd_dev);
Ilya Dryomov5769ed02017-04-13 12:17:38 +02007147 rbd_dev_device_release(rbd_dev);
Ilya Dryomov8b679ec2017-04-13 12:17:37 +02007148err_out_image_probe:
7149 rbd_dev_image_release(rbd_dev);
Alex Elderc53d5892012-10-25 23:34:42 -05007150err_out_rbd_dev:
7151 rbd_dev_destroy(rbd_dev);
Alex Elderbd4ba652012-10-25 23:34:42 -05007152err_out_client:
Alex Elder9d3997f2012-10-25 23:34:42 -05007153 rbd_put_client(rbdc);
Alex Elder0ddebc02012-10-25 23:34:41 -05007154err_out_args:
Alex Elder859c31d2012-10-25 23:34:42 -05007155 rbd_spec_put(spec);
Ilya Dryomovd1475432015-06-22 13:24:48 +03007156 kfree(rbd_opts);
Ilya Dryomovdd5ac322015-10-16 17:09:24 +02007157 goto out;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07007158}
7159
Greg Kroah-Hartman7e9586b2018-12-21 08:54:38 +01007160static ssize_t add_store(struct bus_type *bus, const char *buf, size_t count)
Ilya Dryomov9b60e702013-12-13 15:28:57 +02007161{
7162 if (single_major)
7163 return -EINVAL;
7164
7165 return do_rbd_add(bus, buf, count);
7166}
7167
Greg Kroah-Hartman7e9586b2018-12-21 08:54:38 +01007168static ssize_t add_single_major_store(struct bus_type *bus, const char *buf,
7169 size_t count)
Ilya Dryomov9b60e702013-12-13 15:28:57 +02007170{
7171 return do_rbd_add(bus, buf, count);
7172}
7173
Alex Elder05a46af2013-04-26 15:44:36 -05007174static void rbd_dev_remove_parent(struct rbd_device *rbd_dev)
7175{
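	/* Repeatedly detach and release the deepest ancestor until none remain. */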
Alex Elderad945fc2013-04-26 15:44:36 -05007176 while (rbd_dev->parent) {
Alex Elder05a46af2013-04-26 15:44:36 -05007177 struct rbd_device *first = rbd_dev;
7178 struct rbd_device *second = first->parent;
7179 struct rbd_device *third;
7180
7181 /*
7182		 * Walk down to the deepest parent (the one with no
7183		 * grandparent) and remove it.
7184 */
7185 while (second && (third = second->parent)) {
7186 first = second;
7187 second = third;
7188 }
Alex Elderad945fc2013-04-26 15:44:36 -05007189 rbd_assert(second);
Alex Elder8ad42cd2013-04-28 23:32:34 -05007190 rbd_dev_image_release(second);
Ilya Dryomov8b679ec2017-04-13 12:17:37 +02007191 rbd_dev_destroy(second);
Alex Elderad945fc2013-04-26 15:44:36 -05007192 first->parent = NULL;
7193 first->parent_overlap = 0;
7194
7195 rbd_assert(first->parent_spec);
Alex Elder05a46af2013-04-26 15:44:36 -05007196 rbd_spec_put(first->parent_spec);
7197 first->parent_spec = NULL;
Alex Elder05a46af2013-04-26 15:44:36 -05007198 }
7199}
7200
Ilya Dryomov9b60e702013-12-13 15:28:57 +02007201static ssize_t do_rbd_remove(struct bus_type *bus,
7202 const char *buf,
7203 size_t count)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07007204{
7205 struct rbd_device *rbd_dev = NULL;
Alex Elder751cc0e2013-05-31 15:17:01 -05007206 struct list_head *tmp;
7207 int dev_id;
Mike Christie0276dca2016-08-18 18:38:45 +02007208 char opt_buf[6];
Mike Christie0276dca2016-08-18 18:38:45 +02007209 bool force = false;
Alex Elder0d8189e2013-04-27 09:59:30 -05007210 int ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07007211
Mike Christie0276dca2016-08-18 18:38:45 +02007212 dev_id = -1;
7213 opt_buf[0] = '\0';
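	/* The expected input is "<dev-id> [force]". */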
7214 sscanf(buf, "%d %5s", &dev_id, opt_buf);
7215 if (dev_id < 0) {
7216 pr_err("dev_id out of range\n");
Yehuda Sadeh602adf42010-08-12 16:11:25 -07007217 return -EINVAL;
Mike Christie0276dca2016-08-18 18:38:45 +02007218 }
7219 if (opt_buf[0] != '\0') {
7220 if (!strcmp(opt_buf, "force")) {
7221 force = true;
7222 } else {
7223 pr_err("bad remove option at '%s'\n", opt_buf);
7224 return -EINVAL;
7225 }
7226 }
Yehuda Sadeh602adf42010-08-12 16:11:25 -07007227
Alex Elder751cc0e2013-05-31 15:17:01 -05007228 ret = -ENOENT;
7229 spin_lock(&rbd_dev_list_lock);
7230 list_for_each(tmp, &rbd_dev_list) {
7231 rbd_dev = list_entry(tmp, struct rbd_device, node);
7232 if (rbd_dev->dev_id == dev_id) {
7233 ret = 0;
7234 break;
7235 }
Yehuda Sadeh602adf42010-08-12 16:11:25 -07007236 }
Alex Elder751cc0e2013-05-31 15:17:01 -05007237 if (!ret) {
7238 spin_lock_irq(&rbd_dev->lock);
Mike Christie0276dca2016-08-18 18:38:45 +02007239 if (rbd_dev->open_count && !force)
Alex Elder751cc0e2013-05-31 15:17:01 -05007240 ret = -EBUSY;
Ilya Dryomov85f5a4d2019-01-08 19:47:38 +01007241 else if (test_and_set_bit(RBD_DEV_FLAG_REMOVING,
7242 &rbd_dev->flags))
7243 ret = -EINPROGRESS;
Alex Elder751cc0e2013-05-31 15:17:01 -05007244 spin_unlock_irq(&rbd_dev->lock);
7245 }
7246 spin_unlock(&rbd_dev_list_lock);
Ilya Dryomov85f5a4d2019-01-08 19:47:38 +01007247 if (ret)
Alex Elder1ba0f1e2013-05-31 15:17:01 -05007248 return ret;
Alex Elder751cc0e2013-05-31 15:17:01 -05007249
Mike Christie0276dca2016-08-18 18:38:45 +02007250 if (force) {
7251 /*
7252 * Prevent new IO from being queued and wait for existing
7253 * IO to complete/fail.
7254 */
7255 blk_mq_freeze_queue(rbd_dev->disk->queue);
7256 blk_set_queue_dying(rbd_dev->disk->queue);
7257 }
7258
Ilya Dryomov5769ed02017-04-13 12:17:38 +02007259 del_gendisk(rbd_dev->disk);
7260 spin_lock(&rbd_dev_list_lock);
7261 list_del_init(&rbd_dev->node);
7262 spin_unlock(&rbd_dev_list_lock);
7263 device_del(&rbd_dev->dev);
Ilya Dryomovfca27062013-12-16 18:02:40 +02007264
Ilya Dryomove010dd02017-04-13 12:17:39 +02007265 rbd_dev_image_unlock(rbd_dev);
Ilya Dryomovdd5ac322015-10-16 17:09:24 +02007266 rbd_dev_device_release(rbd_dev);
Alex Elder8ad42cd2013-04-28 23:32:34 -05007267 rbd_dev_image_release(rbd_dev);
Ilya Dryomov8b679ec2017-04-13 12:17:37 +02007268 rbd_dev_destroy(rbd_dev);
Alex Elder1ba0f1e2013-05-31 15:17:01 -05007269 return count;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07007270}
7271
Greg Kroah-Hartman7e9586b2018-12-21 08:54:38 +01007272static ssize_t remove_store(struct bus_type *bus, const char *buf, size_t count)
Ilya Dryomov9b60e702013-12-13 15:28:57 +02007273{
7274 if (single_major)
7275 return -EINVAL;
7276
7277 return do_rbd_remove(bus, buf, count);
7278}
7279
Greg Kroah-Hartman7e9586b2018-12-21 08:54:38 +01007280static ssize_t remove_single_major_store(struct bus_type *bus, const char *buf,
7281 size_t count)
Ilya Dryomov9b60e702013-12-13 15:28:57 +02007282{
7283 return do_rbd_remove(bus, buf, count);
7284}
7285
Yehuda Sadeh602adf42010-08-12 16:11:25 -07007286/*
7287 * create control files in sysfs
Yehuda Sadehdfc56062010-11-19 14:51:04 -08007288 * /sys/bus/rbd/...
Yehuda Sadeh602adf42010-08-12 16:11:25 -07007289 */
Chengguang Xu7d8dc532018-08-12 23:06:54 +08007290static int __init rbd_sysfs_init(void)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07007291{
Yehuda Sadehdfc56062010-11-19 14:51:04 -08007292 int ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07007293
Alex Elderfed4c142012-02-07 12:03:36 -06007294 ret = device_register(&rbd_root_dev);
Alex Elder21079782012-01-24 10:08:36 -06007295 if (ret < 0)
Yehuda Sadehdfc56062010-11-19 14:51:04 -08007296 return ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07007297
Alex Elderfed4c142012-02-07 12:03:36 -06007298 ret = bus_register(&rbd_bus_type);
7299 if (ret < 0)
7300 device_unregister(&rbd_root_dev);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07007301
Yehuda Sadeh602adf42010-08-12 16:11:25 -07007302 return ret;
7303}
7304
Chengguang Xu7d8dc532018-08-12 23:06:54 +08007305static void __exit rbd_sysfs_cleanup(void)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07007306{
Yehuda Sadehdfc56062010-11-19 14:51:04 -08007307 bus_unregister(&rbd_bus_type);
Alex Elderfed4c142012-02-07 12:03:36 -06007308 device_unregister(&rbd_root_dev);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07007309}
7310
Chengguang Xu7d8dc532018-08-12 23:06:54 +08007311static int __init rbd_slab_init(void)
Alex Elder1c2a9df2013-05-01 12:43:03 -05007312{
7313 rbd_assert(!rbd_img_request_cache);
Geliang Tang03d94402016-03-13 15:17:32 +08007314 rbd_img_request_cache = KMEM_CACHE(rbd_img_request, 0);
Alex Elder868311b2013-05-01 12:43:03 -05007315 if (!rbd_img_request_cache)
7316 return -ENOMEM;
7317
7318 rbd_assert(!rbd_obj_request_cache);
Geliang Tang03d94402016-03-13 15:17:32 +08007319 rbd_obj_request_cache = KMEM_CACHE(rbd_obj_request, 0);
Alex Elder78c2a442013-05-01 12:43:04 -05007320 if (!rbd_obj_request_cache)
7321 goto out_err;
7322
Ilya Dryomov6c696d82017-01-25 18:16:23 +01007323 return 0;
Alex Elder1c2a9df2013-05-01 12:43:03 -05007324
Ilya Dryomov6c696d82017-01-25 18:16:23 +01007325out_err:
Alex Elder868311b2013-05-01 12:43:03 -05007326 kmem_cache_destroy(rbd_img_request_cache);
7327 rbd_img_request_cache = NULL;
Alex Elder1c2a9df2013-05-01 12:43:03 -05007328 return -ENOMEM;
7329}
7330
7331static void rbd_slab_exit(void)
7332{
Alex Elder868311b2013-05-01 12:43:03 -05007333 rbd_assert(rbd_obj_request_cache);
7334 kmem_cache_destroy(rbd_obj_request_cache);
7335 rbd_obj_request_cache = NULL;
7336
Alex Elder1c2a9df2013-05-01 12:43:03 -05007337 rbd_assert(rbd_img_request_cache);
7338 kmem_cache_destroy(rbd_img_request_cache);
7339 rbd_img_request_cache = NULL;
7340}
7341
Alex Eldercc344fa2013-02-19 12:25:56 -06007342static int __init rbd_init(void)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07007343{
7344 int rc;
7345
Alex Elder1e32d342013-01-30 11:13:33 -06007346 if (!libceph_compatible(NULL)) {
7347 rbd_warn(NULL, "libceph incompatibility (quitting)");
Alex Elder1e32d342013-01-30 11:13:33 -06007348 return -EINVAL;
7349 }
Ilya Dryomove1b4d962013-12-13 15:28:57 +02007350
Alex Elder1c2a9df2013-05-01 12:43:03 -05007351 rc = rbd_slab_init();
Yehuda Sadeh602adf42010-08-12 16:11:25 -07007352 if (rc)
7353 return rc;
Ilya Dryomove1b4d962013-12-13 15:28:57 +02007354
Ilya Dryomovf5ee37b2014-10-09 17:06:01 +04007355 /*
7356 * The number of active work items is limited by the number of
Ilya Dryomovf77303b2015-04-22 18:28:13 +03007357 * rbd devices * queue depth, so leave @max_active at default.
Ilya Dryomovf5ee37b2014-10-09 17:06:01 +04007358 */
7359 rbd_wq = alloc_workqueue(RBD_DRV_NAME, WQ_MEM_RECLAIM, 0);
7360 if (!rbd_wq) {
7361 rc = -ENOMEM;
7362 goto err_out_slab;
7363 }
7364
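	/*
	 * With single_major, one block device major is registered up
	 * front and shared by all rbd devices.
	 */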
Ilya Dryomov9b60e702013-12-13 15:28:57 +02007365 if (single_major) {
7366 rbd_major = register_blkdev(0, RBD_DRV_NAME);
7367 if (rbd_major < 0) {
7368 rc = rbd_major;
Ilya Dryomovf5ee37b2014-10-09 17:06:01 +04007369 goto err_out_wq;
Ilya Dryomov9b60e702013-12-13 15:28:57 +02007370 }
7371 }
7372
Alex Elder1c2a9df2013-05-01 12:43:03 -05007373 rc = rbd_sysfs_init();
7374 if (rc)
Ilya Dryomov9b60e702013-12-13 15:28:57 +02007375 goto err_out_blkdev;
Alex Elder1c2a9df2013-05-01 12:43:03 -05007376
Ilya Dryomov9b60e702013-12-13 15:28:57 +02007377 if (single_major)
7378 pr_info("loaded (major %d)\n", rbd_major);
7379 else
7380 pr_info("loaded\n");
7381
Ilya Dryomove1b4d962013-12-13 15:28:57 +02007382 return 0;
7383
Ilya Dryomov9b60e702013-12-13 15:28:57 +02007384err_out_blkdev:
7385 if (single_major)
7386 unregister_blkdev(rbd_major, RBD_DRV_NAME);
Ilya Dryomovf5ee37b2014-10-09 17:06:01 +04007387err_out_wq:
7388 destroy_workqueue(rbd_wq);
Ilya Dryomove1b4d962013-12-13 15:28:57 +02007389err_out_slab:
7390 rbd_slab_exit();
Alex Elder1c2a9df2013-05-01 12:43:03 -05007391 return rc;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07007392}
7393
Alex Eldercc344fa2013-02-19 12:25:56 -06007394static void __exit rbd_exit(void)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07007395{
Ilya Dryomovffe312c2014-05-20 15:46:04 +04007396 ida_destroy(&rbd_dev_id_ida);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07007397 rbd_sysfs_cleanup();
Ilya Dryomov9b60e702013-12-13 15:28:57 +02007398 if (single_major)
7399 unregister_blkdev(rbd_major, RBD_DRV_NAME);
Ilya Dryomovf5ee37b2014-10-09 17:06:01 +04007400 destroy_workqueue(rbd_wq);
Alex Elder1c2a9df2013-05-01 12:43:03 -05007401 rbd_slab_exit();
Yehuda Sadeh602adf42010-08-12 16:11:25 -07007402}
7403
7404module_init(rbd_init);
7405module_exit(rbd_exit);
7406
Alex Elderd552c612013-05-31 20:13:09 -05007407MODULE_AUTHOR("Alex Elder <elder@inktank.com>");
Yehuda Sadeh602adf42010-08-12 16:11:25 -07007408MODULE_AUTHOR("Sage Weil <sage@newdream.net>");
7409MODULE_AUTHOR("Yehuda Sadeh <yehuda@hq.newdream.net>");
Yehuda Sadeh602adf42010-08-12 16:11:25 -07007410/* following authorship retained from original osdblk.c */
7411MODULE_AUTHOR("Jeff Garzik <jeff@garzik.org>");
7412
Ilya Dryomov90da2582013-12-13 15:28:56 +02007413MODULE_DESCRIPTION("RADOS Block Device (RBD) driver");
Yehuda Sadeh602adf42010-08-12 16:11:25 -07007414MODULE_LICENSE("GPL");