
/*
   rbd.c -- Export ceph rados objects as a Linux block device


   based on drivers/block/osdblk.c:

   Copyright 2009 Red Hat, Inc.

   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; see the file COPYING.  If not, write to
   the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.



   For usage instructions, please refer to:

		 Documentation/ABI/testing/sysfs-bus-rbd

 */

#include <linux/ceph/libceph.h>
#include <linux/ceph/osd_client.h>
#include <linux/ceph/mon_client.h>
#include <linux/ceph/cls_lock_client.h>
#include <linux/ceph/striper.h>
#include <linux/ceph/decode.h>
#include <linux/fs_parser.h>
#include <linux/bsearch.h>

#include <linux/kernel.h>
#include <linux/device.h>
#include <linux/module.h>
#include <linux/blk-mq.h>
#include <linux/fs.h>
#include <linux/blkdev.h>
#include <linux/slab.h>
#include <linux/idr.h>
#include <linux/workqueue.h>

#include "rbd_types.h"

#define RBD_DEBUG	/* Activate rbd_assert() calls */

/*
 * Increment the given counter and return its updated value.
 * If the counter is already 0 it will not be incremented.
 * If the counter is already at its maximum value, -EINVAL is
 * returned without updating it.
 */
static int atomic_inc_return_safe(atomic_t *v)
{
	unsigned int counter;

	counter = (unsigned int)atomic_fetch_add_unless(v, 1, 0);
	if (counter <= (unsigned int)INT_MAX)
		return (int)counter;

	atomic_dec(v);

	return -EINVAL;
}

/* Decrement the counter.  Return the resulting value, or -EINVAL */
static int atomic_dec_return_safe(atomic_t *v)
{
	int counter;

	counter = atomic_dec_return(v);
	if (counter >= 0)
		return counter;

	atomic_inc(v);

	return -EINVAL;
}

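/*
 * Illustrative note (not part of the original source): these helpers back
 * saturating reference counts such as rbd_dev->parent_ref, where a count
 * that has already dropped to zero must stay at zero.  A typical pairing
 * looks roughly like:
 *
 *	if (atomic_inc_return_safe(&rbd_dev->parent_ref) > 0) {
 *		// got a reference, parent is still usable
 *	}
 *	...
 *	if (!atomic_dec_return_safe(&rbd_dev->parent_ref)) {
 *		// that was the last reference
 *	}
 */
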
#define RBD_DRV_NAME "rbd"

#define RBD_MINORS_PER_MAJOR		256
#define RBD_SINGLE_MAJOR_PART_SHIFT	4

#define RBD_MAX_PARENT_CHAIN_LEN	16

#define RBD_SNAP_DEV_NAME_PREFIX	"snap_"
#define RBD_MAX_SNAP_NAME_LEN	\
			(NAME_MAX - (sizeof (RBD_SNAP_DEV_NAME_PREFIX) - 1))

#define RBD_MAX_SNAP_COUNT	510	/* allows max snapc to fit in 4KB */

#define RBD_SNAP_HEAD_NAME	"-"

#define	BAD_SNAP_INDEX	U32_MAX		/* invalid index into snap array */

/* This allows a single page to hold an image name sent by OSD */
#define RBD_IMAGE_NAME_LEN_MAX	(PAGE_SIZE - sizeof (__le32) - 1)
#define RBD_IMAGE_ID_LEN_MAX	64

#define RBD_OBJ_PREFIX_LEN_MAX	64

#define RBD_NOTIFY_TIMEOUT	5	/* seconds */
#define RBD_RETRY_DELAY		msecs_to_jiffies(1000)

/* Feature bits */

#define RBD_FEATURE_LAYERING		(1ULL<<0)
#define RBD_FEATURE_STRIPINGV2		(1ULL<<1)
#define RBD_FEATURE_EXCLUSIVE_LOCK	(1ULL<<2)
#define RBD_FEATURE_OBJECT_MAP		(1ULL<<3)
#define RBD_FEATURE_FAST_DIFF		(1ULL<<4)
#define RBD_FEATURE_DEEP_FLATTEN	(1ULL<<5)
#define RBD_FEATURE_DATA_POOL		(1ULL<<7)
#define RBD_FEATURE_OPERATIONS		(1ULL<<8)

#define RBD_FEATURES_ALL	(RBD_FEATURE_LAYERING |		\
				 RBD_FEATURE_STRIPINGV2 |	\
				 RBD_FEATURE_EXCLUSIVE_LOCK |	\
				 RBD_FEATURE_OBJECT_MAP |	\
				 RBD_FEATURE_FAST_DIFF |	\
				 RBD_FEATURE_DEEP_FLATTEN |	\
				 RBD_FEATURE_DATA_POOL |	\
				 RBD_FEATURE_OPERATIONS)

/* Features supported by this (client software) implementation. */

#define RBD_FEATURES_SUPPORTED	(RBD_FEATURES_ALL)

138 * An RBD device name will be "rbd#", where the "rbd" comes from
139 * RBD_DRV_NAME above, and # is a unique integer identifier.
Alex Elder81a89792012-02-02 08:13:30 -0600140 */
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700141#define DEV_NAME_LEN 32
142
143/*
144 * block device image metadata (in-memory version)
145 */
146struct rbd_image_header {
Alex Elderf35a4de2013-05-06 09:51:29 -0500147 /* These six fields never change for a given rbd image */
Alex Elder849b4262012-07-09 21:04:24 -0500148 char *object_prefix;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700149 __u8 obj_order;
Alex Elderf35a4de2013-05-06 09:51:29 -0500150 u64 stripe_unit;
151 u64 stripe_count;
Ilya Dryomov7e973322017-01-25 18:16:22 +0100152 s64 data_pool_id;
Alex Elderf35a4de2013-05-06 09:51:29 -0500153 u64 features; /* Might be changeable someday? */
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700154
Alex Elderf84344f2012-08-31 17:29:51 -0500155 /* The remaining fields need to be updated occasionally */
156 u64 image_size;
157 struct ceph_snap_context *snapc;
Alex Elderf35a4de2013-05-06 09:51:29 -0500158 char *snap_names; /* format 1 only */
159 u64 *snap_sizes; /* format 1 only */
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700160};
161
Alex Elder0d7dbfc2012-10-25 23:34:41 -0500162/*
163 * An rbd image specification.
164 *
165 * The tuple (pool_id, image_id, snap_id) is sufficient to uniquely
Alex Elderc66c6e02012-11-01 08:39:26 -0500166 * identify an image. Each rbd_dev structure includes a pointer to
167 * an rbd_spec structure that encapsulates this identity.
168 *
169 * Each of the id's in an rbd_spec has an associated name. For a
170 * user-mapped image, the names are supplied and the id's associated
171 * with them are looked up. For a layered image, a parent image is
172 * defined by the tuple, and the names are looked up.
173 *
174 * An rbd_dev structure contains a parent_spec pointer which is
175 * non-null if the image it represents is a child in a layered
176 * image. This pointer will refer to the rbd_spec structure used
177 * by the parent rbd_dev for its own identity (i.e., the structure
178 * is shared between the parent and child).
179 *
180 * Since these structures are populated once, during the discovery
181 * phase of image construction, they are effectively immutable so
182 * we make no effort to synchronize access to them.
183 *
184 * Note that code herein does not assume the image name is known (it
185 * could be a null pointer).
Alex Elder0d7dbfc2012-10-25 23:34:41 -0500186 */
187struct rbd_spec {
188 u64 pool_id;
Alex Elderecb4dc22013-04-26 09:43:47 -0500189 const char *pool_name;
Ilya Dryomovb26c0472018-07-03 15:28:43 +0200190 const char *pool_ns; /* NULL if default, never "" */
Alex Elder0d7dbfc2012-10-25 23:34:41 -0500191
Alex Elderecb4dc22013-04-26 09:43:47 -0500192 const char *image_id;
193 const char *image_name;
Alex Elder0d7dbfc2012-10-25 23:34:41 -0500194
195 u64 snap_id;
Alex Elderecb4dc22013-04-26 09:43:47 -0500196 const char *snap_name;
Alex Elder0d7dbfc2012-10-25 23:34:41 -0500197
198 struct kref kref;
199};
200
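/*
 * Purely illustrative example (names invented for this comment): mapping
 * image "foo" at snapshot "snap1" from pool "rbd" yields a spec roughly
 * like
 *
 *	pool_name  = "rbd",	pool_id  = <looked up>
 *	image_name = "foo",	image_id = <looked up>
 *	snap_name  = "snap1",	snap_id  = <looked up>
 *
 * with pool_ns left NULL unless a namespace was given.
 */
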
/*
 * an instance of the client.  multiple devices may share an rbd client.
 */
struct rbd_client {
	struct ceph_client	*client;
	struct kref		kref;
	struct list_head	node;
};

struct pending_result {
	int			result;		/* first nonzero result */
	int			num_pending;
};

struct rbd_img_request;

enum obj_request_type {
	OBJ_REQUEST_NODATA = 1,
	OBJ_REQUEST_BIO,	/* pointer into provided bio (list) */
	OBJ_REQUEST_BVECS,	/* pointer into provided bio_vec array */
	OBJ_REQUEST_OWN_BVECS,	/* private bio_vec array, doesn't own pages */
};

enum obj_operation_type {
	OBJ_OP_READ = 1,
	OBJ_OP_WRITE,
	OBJ_OP_DISCARD,
	OBJ_OP_ZEROOUT,
};

#define RBD_OBJ_FLAG_DELETION			(1U << 0)
#define RBD_OBJ_FLAG_COPYUP_ENABLED		(1U << 1)
#define RBD_OBJ_FLAG_COPYUP_ZEROS		(1U << 2)
#define RBD_OBJ_FLAG_MAY_EXIST			(1U << 3)
#define RBD_OBJ_FLAG_NOOP_FOR_NONEXISTENT	(1U << 4)

enum rbd_obj_read_state {
	RBD_OBJ_READ_START = 1,
	RBD_OBJ_READ_OBJECT,
	RBD_OBJ_READ_PARENT,
};

/*
 * Writes go through the following state machine to deal with
 * layering:
 *
 *            . . . . . RBD_OBJ_WRITE_GUARD. . . . . . . . . . . . . .
 *            .                 |                                    .
 *            .                 v                                    .
 *            .    RBD_OBJ_WRITE_READ_FROM_PARENT. . .               .
 *            .                 |                    .               .
 *            .                 v                    v (deep-copyup  .
 *    (image  .   RBD_OBJ_WRITE_COPYUP_EMPTY_SNAPC   .  not needed)  .
 * flattened) v                 |                    .               .
 *            .                 v                    .               .
 *            . . . .RBD_OBJ_WRITE_COPYUP_OPS. . . . .      (copyup  .
 *                              |                        not needed) v
 *                              v                                    .
 *                              done . . . . . . . . . . . . . . . . .
 *                              ^
 *                              |
 *                     RBD_OBJ_WRITE_FLAT
 *
 * Writes start in RBD_OBJ_WRITE_GUARD or _FLAT, depending on whether
 * assert_exists guard is needed or not (in some cases it's not needed
 * even if there is a parent).
 */
enum rbd_obj_write_state {
	RBD_OBJ_WRITE_START = 1,
	RBD_OBJ_WRITE_PRE_OBJECT_MAP,
	RBD_OBJ_WRITE_OBJECT,
	__RBD_OBJ_WRITE_COPYUP,
	RBD_OBJ_WRITE_COPYUP,
	RBD_OBJ_WRITE_POST_OBJECT_MAP,
};

enum rbd_obj_copyup_state {
	RBD_OBJ_COPYUP_START = 1,
	RBD_OBJ_COPYUP_READ_PARENT,
	__RBD_OBJ_COPYUP_OBJECT_MAPS,
	RBD_OBJ_COPYUP_OBJECT_MAPS,
	__RBD_OBJ_COPYUP_WRITE_OBJECT,
	RBD_OBJ_COPYUP_WRITE_OBJECT,
};

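/*
 * Reader's note (an interpretation of the enums, not authoritative):
 * the state names in the diagram above predate the current code.  In
 * this version a write appears to step through rbd_obj_write_state in
 * declaration order (START -> PRE_OBJECT_MAP -> OBJECT -> COPYUP ->
 * POST_OBJECT_MAP), with copyup itself driven by rbd_obj_copyup_state;
 * the double-underscore variants seem to mark "requests issued, waiting
 * for completions" stages.
 */
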
struct rbd_obj_request {
	struct ceph_object_extent ex;
	unsigned int		flags;	/* RBD_OBJ_FLAG_* */
	union {
		enum rbd_obj_read_state	 read_state;	/* for reads */
		enum rbd_obj_write_state write_state;	/* for writes */
	};

	struct rbd_img_request	*img_request;
	struct ceph_file_extent	*img_extents;
	u32			num_img_extents;

	union {
		struct ceph_bio_iter	bio_pos;
		struct {
			struct ceph_bvec_iter	bvec_pos;
			u32			bvec_count;
			u32			bvec_idx;
		};
	};

	enum rbd_obj_copyup_state copyup_state;
	struct bio_vec		*copyup_bvecs;
	u32			copyup_bvec_count;

	struct list_head	osd_reqs;	/* w/ r_private_item */

	struct mutex		state_mutex;
	struct pending_result	pending;
	struct kref		kref;
};

enum img_req_flags {
	IMG_REQ_CHILD,		/* initiator: block = 0, child image = 1 */
	IMG_REQ_LAYERED,	/* ENOENT handling: normal = 0, layered = 1 */
};

enum rbd_img_state {
	RBD_IMG_START = 1,
	RBD_IMG_EXCLUSIVE_LOCK,
	__RBD_IMG_OBJECT_REQUESTS,
	RBD_IMG_OBJECT_REQUESTS,
};

struct rbd_img_request {
	struct rbd_device	*rbd_dev;
	enum obj_operation_type	op_type;
	enum obj_request_type	data_type;
	unsigned long		flags;
	enum rbd_img_state	state;
	union {
		u64			snap_id;	/* for reads */
		struct ceph_snap_context *snapc;	/* for writes */
	};
	struct rbd_obj_request	*obj_request;	/* obj req initiator */

	struct list_head	lock_item;
	struct list_head	object_extents;	/* obj_req.ex structs */

	struct mutex		state_mutex;
	struct pending_result	pending;
	struct work_struct	work;
	int			work_result;
};

#define for_each_obj_request(ireq, oreq) \
	list_for_each_entry(oreq, &(ireq)->object_extents, ex.oe_item)
#define for_each_obj_request_safe(ireq, oreq, n) \
	list_for_each_entry_safe(oreq, n, &(ireq)->object_extents, ex.oe_item)

enum rbd_watch_state {
	RBD_WATCH_STATE_UNREGISTERED,
	RBD_WATCH_STATE_REGISTERED,
	RBD_WATCH_STATE_ERROR,
};

enum rbd_lock_state {
	RBD_LOCK_STATE_UNLOCKED,
	RBD_LOCK_STATE_LOCKED,
	RBD_LOCK_STATE_RELEASING,
};

/* WatchNotify::ClientId */
struct rbd_client_id {
	u64 gid;
	u64 handle;
};

struct rbd_mapping {
	u64			size;
};

/*
 * a single device
 */
struct rbd_device {
	int			dev_id;		/* blkdev unique id */

	int			major;		/* blkdev assigned major */
	int			minor;
	struct gendisk		*disk;		/* blkdev's gendisk and rq */

	u32			image_format;	/* Either 1 or 2 */
	struct rbd_client	*rbd_client;

	char			name[DEV_NAME_LEN]; /* blkdev name, e.g. rbd3 */

	spinlock_t		lock;		/* queue, flags, open_count */

	struct rbd_image_header	header;
	unsigned long		flags;		/* possibly lock protected */
	struct rbd_spec		*spec;
	struct rbd_options	*opts;
	char			*config_info;	/* add{,_single_major} string */

	struct ceph_object_id	header_oid;
	struct ceph_object_locator header_oloc;

	struct ceph_file_layout	layout;		/* used for all rbd requests */

	struct mutex		watch_mutex;
	enum rbd_watch_state	watch_state;
	struct ceph_osd_linger_request *watch_handle;
	u64			watch_cookie;
	struct delayed_work	watch_dwork;

	struct rw_semaphore	lock_rwsem;
	enum rbd_lock_state	lock_state;
	char			lock_cookie[32];
	struct rbd_client_id	owner_cid;
	struct work_struct	acquired_lock_work;
	struct work_struct	released_lock_work;
	struct delayed_work	lock_dwork;
	struct work_struct	unlock_work;
	spinlock_t		lock_lists_lock;
	struct list_head	acquiring_list;
	struct list_head	running_list;
	struct completion	acquire_wait;
	int			acquire_err;
	struct completion	releasing_wait;

	spinlock_t		object_map_lock;
	u8			*object_map;
	u64			object_map_size;	/* in objects */
	u64			object_map_flags;

	struct workqueue_struct	*task_wq;

	struct rbd_spec		*parent_spec;
	u64			parent_overlap;
	atomic_t		parent_ref;
	struct rbd_device	*parent;

	/* Block layer tags. */
	struct blk_mq_tag_set	tag_set;

	/* protects updating the header */
	struct rw_semaphore	header_rwsem;

	struct rbd_mapping	mapping;

	struct list_head	node;

	/* sysfs related */
	struct device		dev;
	unsigned long		open_count;	/* protected by lock */
};

/*
 * Flag bits for rbd_dev->flags:
 * - REMOVING (which is coupled with rbd_dev->open_count) is protected
 *   by rbd_dev->lock
 */
enum rbd_dev_flags {
	RBD_DEV_FLAG_EXISTS,	/* rbd_dev_device_setup() ran */
	RBD_DEV_FLAG_REMOVING,	/* this mapping is being removed */
	RBD_DEV_FLAG_READONLY,	/* -o ro or snapshot */
};

static DEFINE_MUTEX(client_mutex);	/* Serialize client creation */

static LIST_HEAD(rbd_dev_list);		/* devices */
static DEFINE_SPINLOCK(rbd_dev_list_lock);

static LIST_HEAD(rbd_client_list);	/* clients */
static DEFINE_SPINLOCK(rbd_client_list_lock);

/* Slab caches for frequently-allocated structures */

static struct kmem_cache	*rbd_img_request_cache;
static struct kmem_cache	*rbd_obj_request_cache;

static int rbd_major;
static DEFINE_IDA(rbd_dev_id_ida);

static struct workqueue_struct *rbd_wq;

static struct ceph_snap_context rbd_empty_snapc = {
	.nref = REFCOUNT_INIT(1),
};

/*
 * single-major requires >= 0.75 version of userspace rbd utility.
 */
static bool single_major = true;
module_param(single_major, bool, 0444);
MODULE_PARM_DESC(single_major, "Use a single major number for all rbd devices (default: true)");

static ssize_t add_store(struct bus_type *bus, const char *buf, size_t count);
static ssize_t remove_store(struct bus_type *bus, const char *buf,
			    size_t count);
static ssize_t add_single_major_store(struct bus_type *bus, const char *buf,
				      size_t count);
static ssize_t remove_single_major_store(struct bus_type *bus, const char *buf,
					 size_t count);
static int rbd_dev_image_probe(struct rbd_device *rbd_dev, int depth);

static int rbd_dev_id_to_minor(int dev_id)
{
	return dev_id << RBD_SINGLE_MAJOR_PART_SHIFT;
}

static int minor_to_rbd_dev_id(int minor)
{
	return minor >> RBD_SINGLE_MAJOR_PART_SHIFT;
}

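/*
 * With RBD_SINGLE_MAJOR_PART_SHIFT being 4, each device reserves 16
 * minors: for example dev_id 2 maps to minor 32 (the whole device, with
 * minors 33..47 left for its partitions), and minor 35 maps back to
 * dev_id 2.
 */
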
static bool rbd_is_ro(struct rbd_device *rbd_dev)
{
	return test_bit(RBD_DEV_FLAG_READONLY, &rbd_dev->flags);
}

static bool rbd_is_snap(struct rbd_device *rbd_dev)
{
	return rbd_dev->spec->snap_id != CEPH_NOSNAP;
}

static bool __rbd_is_lock_owner(struct rbd_device *rbd_dev)
{
	lockdep_assert_held(&rbd_dev->lock_rwsem);

	return rbd_dev->lock_state == RBD_LOCK_STATE_LOCKED ||
	       rbd_dev->lock_state == RBD_LOCK_STATE_RELEASING;
}

static bool rbd_is_lock_owner(struct rbd_device *rbd_dev)
{
	bool is_lock_owner;

	down_read(&rbd_dev->lock_rwsem);
	is_lock_owner = __rbd_is_lock_owner(rbd_dev);
	up_read(&rbd_dev->lock_rwsem);
	return is_lock_owner;
}

static ssize_t supported_features_show(struct bus_type *bus, char *buf)
{
	return sprintf(buf, "0x%llx\n", RBD_FEATURES_SUPPORTED);
}

static BUS_ATTR_WO(add);
static BUS_ATTR_WO(remove);
static BUS_ATTR_WO(add_single_major);
static BUS_ATTR_WO(remove_single_major);
static BUS_ATTR_RO(supported_features);

static struct attribute *rbd_bus_attrs[] = {
	&bus_attr_add.attr,
	&bus_attr_remove.attr,
	&bus_attr_add_single_major.attr,
	&bus_attr_remove_single_major.attr,
	&bus_attr_supported_features.attr,
	NULL,
};

static umode_t rbd_bus_is_visible(struct kobject *kobj,
				  struct attribute *attr, int index)
{
	if (!single_major &&
	    (attr == &bus_attr_add_single_major.attr ||
	     attr == &bus_attr_remove_single_major.attr))
		return 0;

	return attr->mode;
}

static const struct attribute_group rbd_bus_group = {
	.attrs = rbd_bus_attrs,
	.is_visible = rbd_bus_is_visible,
};
__ATTRIBUTE_GROUPS(rbd_bus);

static struct bus_type rbd_bus_type = {
	.name		= "rbd",
	.bus_groups	= rbd_bus_groups,
};

static void rbd_root_dev_release(struct device *dev)
{
}

static struct device rbd_root_dev = {
	.init_name =    "rbd",
	.release =      rbd_root_dev_release,
};

static __printf(2, 3)
void rbd_warn(struct rbd_device *rbd_dev, const char *fmt, ...)
{
	struct va_format vaf;
	va_list args;

	va_start(args, fmt);
	vaf.fmt = fmt;
	vaf.va = &args;

	if (!rbd_dev)
		printk(KERN_WARNING "%s: %pV\n", RBD_DRV_NAME, &vaf);
	else if (rbd_dev->disk)
		printk(KERN_WARNING "%s: %s: %pV\n",
			RBD_DRV_NAME, rbd_dev->disk->disk_name, &vaf);
	else if (rbd_dev->spec && rbd_dev->spec->image_name)
		printk(KERN_WARNING "%s: image %s: %pV\n",
			RBD_DRV_NAME, rbd_dev->spec->image_name, &vaf);
	else if (rbd_dev->spec && rbd_dev->spec->image_id)
		printk(KERN_WARNING "%s: id %s: %pV\n",
			RBD_DRV_NAME, rbd_dev->spec->image_id, &vaf);
	else	/* punt */
		printk(KERN_WARNING "%s: rbd_dev %p: %pV\n",
			RBD_DRV_NAME, rbd_dev, &vaf);
	va_end(args);
}

#ifdef RBD_DEBUG
#define rbd_assert(expr)						\
		if (unlikely(!(expr))) {				\
			printk(KERN_ERR "\nAssertion failure in %s() "	\
						"at line %d:\n\n"	\
					"\trbd_assert(%s);\n\n",	\
					__func__, __LINE__, #expr);	\
			BUG();						\
		}
#else /* !RBD_DEBUG */
#  define rbd_assert(expr)	((void) 0)
#endif /* !RBD_DEBUG */

static void rbd_dev_remove_parent(struct rbd_device *rbd_dev);

static int rbd_dev_refresh(struct rbd_device *rbd_dev);
static int rbd_dev_v2_header_onetime(struct rbd_device *rbd_dev);
static int rbd_dev_header_info(struct rbd_device *rbd_dev);
static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev);
static const char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev,
					u64 snap_id);
static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id,
		u8 *order, u64 *snap_size);
static int rbd_dev_v2_get_flags(struct rbd_device *rbd_dev);

static void rbd_obj_handle_request(struct rbd_obj_request *obj_req, int result);
static void rbd_img_handle_request(struct rbd_img_request *img_req, int result);

/*
 * Return true if nothing else is pending.
 */
static bool pending_result_dec(struct pending_result *pending, int *result)
{
	rbd_assert(pending->num_pending > 0);

	if (*result && !pending->result)
		pending->result = *result;
	if (--pending->num_pending)
		return false;

	*result = pending->result;
	return true;
}

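/*
 * Illustrative usage pattern (not a verbatim excerpt from this file):
 * a caller that fans out several sub-requests typically sets
 * pending.num_pending to the number issued and then, in each completion
 * path, does something like
 *
 *	if (!pending_result_dec(&req->pending, &result))
 *		return;		// other sub-requests still outstanding
 *	// all done; result now holds the first nonzero result seen
 */
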
static int rbd_open(struct block_device *bdev, fmode_t mode)
{
	struct rbd_device *rbd_dev = bdev->bd_disk->private_data;
	bool removing = false;

	spin_lock_irq(&rbd_dev->lock);
	if (test_bit(RBD_DEV_FLAG_REMOVING, &rbd_dev->flags))
		removing = true;
	else
		rbd_dev->open_count++;
	spin_unlock_irq(&rbd_dev->lock);
	if (removing)
		return -ENOENT;

	(void) get_device(&rbd_dev->dev);

	return 0;
}

static void rbd_release(struct gendisk *disk, fmode_t mode)
{
	struct rbd_device *rbd_dev = disk->private_data;
	unsigned long open_count_before;

	spin_lock_irq(&rbd_dev->lock);
	open_count_before = rbd_dev->open_count--;
	spin_unlock_irq(&rbd_dev->lock);
	rbd_assert(open_count_before > 0);

	put_device(&rbd_dev->dev);
}

static const struct block_device_operations rbd_bd_ops = {
	.owner			= THIS_MODULE,
	.open			= rbd_open,
	.release		= rbd_release,
};

/*
 * Initialize an rbd client instance.  Success or not, this function
 * consumes ceph_opts.  Caller holds client_mutex.
 */
static struct rbd_client *rbd_client_create(struct ceph_options *ceph_opts)
{
	struct rbd_client *rbdc;
	int ret = -ENOMEM;

	dout("%s:\n", __func__);
	rbdc = kmalloc(sizeof(struct rbd_client), GFP_KERNEL);
	if (!rbdc)
		goto out_opt;

	kref_init(&rbdc->kref);
	INIT_LIST_HEAD(&rbdc->node);

	rbdc->client = ceph_create_client(ceph_opts, rbdc);
	if (IS_ERR(rbdc->client))
		goto out_rbdc;
	ceph_opts = NULL; /* Now rbdc->client is responsible for ceph_opts */

	ret = ceph_open_session(rbdc->client);
	if (ret < 0)
		goto out_client;

	spin_lock(&rbd_client_list_lock);
	list_add_tail(&rbdc->node, &rbd_client_list);
	spin_unlock(&rbd_client_list_lock);

	dout("%s: rbdc %p\n", __func__, rbdc);

	return rbdc;
out_client:
	ceph_destroy_client(rbdc->client);
out_rbdc:
	kfree(rbdc);
out_opt:
	if (ceph_opts)
		ceph_destroy_options(ceph_opts);
	dout("%s: error %d\n", __func__, ret);

	return ERR_PTR(ret);
}

static struct rbd_client *__rbd_get_client(struct rbd_client *rbdc)
{
	kref_get(&rbdc->kref);

	return rbdc;
}

/*
 * Find a ceph client with specific addr and configuration.  If
 * found, bump its reference count.
 */
static struct rbd_client *rbd_client_find(struct ceph_options *ceph_opts)
{
	struct rbd_client *client_node;
	bool found = false;

	if (ceph_opts->flags & CEPH_OPT_NOSHARE)
		return NULL;

	spin_lock(&rbd_client_list_lock);
	list_for_each_entry(client_node, &rbd_client_list, node) {
		if (!ceph_compare_options(ceph_opts, client_node->client)) {
			__rbd_get_client(client_node);

			found = true;
			break;
		}
	}
	spin_unlock(&rbd_client_list_lock);

	return found ? client_node : NULL;
}

/*
 * (Per device) rbd map options
 */
enum {
	Opt_queue_depth,
	Opt_alloc_size,
	Opt_lock_timeout,
	/* int args above */
	Opt_pool_ns,
	Opt_compression_hint,
	/* string args above */
	Opt_read_only,
	Opt_read_write,
	Opt_lock_on_read,
	Opt_exclusive,
	Opt_notrim,
};

enum {
	Opt_compression_hint_none,
	Opt_compression_hint_compressible,
	Opt_compression_hint_incompressible,
};

static const struct constant_table rbd_param_compression_hint[] = {
	{"none",		Opt_compression_hint_none},
	{"compressible",	Opt_compression_hint_compressible},
	{"incompressible",	Opt_compression_hint_incompressible},
	{}
};

static const struct fs_parameter_spec rbd_parameters[] = {
	fsparam_u32	("alloc_size",		Opt_alloc_size),
	fsparam_enum	("compression_hint",	Opt_compression_hint,
			 rbd_param_compression_hint),
	fsparam_flag	("exclusive",		Opt_exclusive),
	fsparam_flag	("lock_on_read",	Opt_lock_on_read),
	fsparam_u32	("lock_timeout",	Opt_lock_timeout),
	fsparam_flag	("notrim",		Opt_notrim),
	fsparam_string	("_pool_ns",		Opt_pool_ns),
	fsparam_u32	("queue_depth",		Opt_queue_depth),
	fsparam_flag	("read_only",		Opt_read_only),
	fsparam_flag	("read_write",		Opt_read_write),
	fsparam_flag	("ro",			Opt_read_only),
	fsparam_flag	("rw",			Opt_read_write),
	{}
};

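/*
 * These options arrive via the "rbd map" option string (illustrative
 * example: "queue_depth=128,alloc_size=65536,exclusive") and are parsed
 * with the fs_parser table above into the rbd_options and rbd_spec
 * fields below.
 */
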
struct rbd_options {
	int	queue_depth;
	int	alloc_size;
	unsigned long	lock_timeout;
	bool	read_only;
	bool	lock_on_read;
	bool	exclusive;
	bool	trim;

	u32 alloc_hint_flags;  /* CEPH_OSD_OP_ALLOC_HINT_FLAG_* */
};

#define RBD_QUEUE_DEPTH_DEFAULT	BLKDEV_MAX_RQ
#define RBD_ALLOC_SIZE_DEFAULT	(64 * 1024)
#define RBD_LOCK_TIMEOUT_DEFAULT 0  /* no timeout */
#define RBD_READ_ONLY_DEFAULT	false
#define RBD_LOCK_ON_READ_DEFAULT false
#define RBD_EXCLUSIVE_DEFAULT	false
#define RBD_TRIM_DEFAULT	true

struct rbd_parse_opts_ctx {
	struct rbd_spec		*spec;
	struct ceph_options	*copts;
	struct rbd_options	*opts;
};

static char* obj_op_name(enum obj_operation_type op_type)
{
	switch (op_type) {
	case OBJ_OP_READ:
		return "read";
	case OBJ_OP_WRITE:
		return "write";
	case OBJ_OP_DISCARD:
		return "discard";
	case OBJ_OP_ZEROOUT:
		return "zeroout";
	default:
		return "???";
	}
}

/*
 * Destroy ceph client
 *
 * Caller must hold rbd_client_list_lock.
 */
static void rbd_client_release(struct kref *kref)
{
	struct rbd_client *rbdc = container_of(kref, struct rbd_client, kref);

	dout("%s: rbdc %p\n", __func__, rbdc);
	spin_lock(&rbd_client_list_lock);
	list_del(&rbdc->node);
	spin_unlock(&rbd_client_list_lock);

	ceph_destroy_client(rbdc->client);
	kfree(rbdc);
}

/*
 * Drop reference to ceph client node. If it's not referenced anymore, release
 * it.
 */
static void rbd_put_client(struct rbd_client *rbdc)
{
	if (rbdc)
		kref_put(&rbdc->kref, rbd_client_release);
}

/*
 * Get a ceph client with specific addr and configuration, if one does
 * not exist create it.  Either way, ceph_opts is consumed by this
 * function.
 */
static struct rbd_client *rbd_get_client(struct ceph_options *ceph_opts)
{
	struct rbd_client *rbdc;
	int ret;

	mutex_lock(&client_mutex);
	rbdc = rbd_client_find(ceph_opts);
	if (rbdc) {
		ceph_destroy_options(ceph_opts);

		/*
		 * Using an existing client.  Make sure ->pg_pools is up to
		 * date before we look up the pool id in do_rbd_add().
		 */
		ret = ceph_wait_for_latest_osdmap(rbdc->client,
					rbdc->client->options->mount_timeout);
		if (ret) {
			rbd_warn(NULL, "failed to get latest osdmap: %d", ret);
			rbd_put_client(rbdc);
			rbdc = ERR_PTR(ret);
		}
	} else {
		rbdc = rbd_client_create(ceph_opts);
	}
	mutex_unlock(&client_mutex);

	return rbdc;
}

static bool rbd_image_format_valid(u32 image_format)
{
	return image_format == 1 || image_format == 2;
}

static bool rbd_dev_ondisk_valid(struct rbd_image_header_ondisk *ondisk)
{
	size_t size;
	u32 snap_count;

	/* The header has to start with the magic rbd header text */
	if (memcmp(&ondisk->text, RBD_HEADER_TEXT, sizeof (RBD_HEADER_TEXT)))
		return false;

	/* The bio layer requires at least sector-sized I/O */

	if (ondisk->options.order < SECTOR_SHIFT)
		return false;

	/* If we use u64 in a few spots we may be able to loosen this */

	if (ondisk->options.order > 8 * sizeof (int) - 1)
		return false;

	/*
	 * The size of a snapshot header has to fit in a size_t, and
	 * that limits the number of snapshots.
	 */
	snap_count = le32_to_cpu(ondisk->snap_count);
	size = SIZE_MAX - sizeof (struct ceph_snap_context);
	if (snap_count > size / sizeof (__le64))
		return false;

	/*
	 * Not only that, but the size of the entire snapshot
	 * header must also be representable in a size_t.
	 */
	size -= snap_count * sizeof (__le64);
	if ((u64) size < le64_to_cpu(ondisk->snap_names_len))
		return false;

	return true;
}

/*
 * returns the size of an object in the image
 */
static u32 rbd_obj_bytes(struct rbd_image_header *header)
{
	return 1U << header->obj_order;
}

static void rbd_init_layout(struct rbd_device *rbd_dev)
{
	if (rbd_dev->header.stripe_unit == 0 ||
	    rbd_dev->header.stripe_count == 0) {
		rbd_dev->header.stripe_unit = rbd_obj_bytes(&rbd_dev->header);
		rbd_dev->header.stripe_count = 1;
	}

	rbd_dev->layout.stripe_unit = rbd_dev->header.stripe_unit;
	rbd_dev->layout.stripe_count = rbd_dev->header.stripe_count;
	rbd_dev->layout.object_size = rbd_obj_bytes(&rbd_dev->header);
	rbd_dev->layout.pool_id = rbd_dev->header.data_pool_id == CEPH_NOPOOL ?
			  rbd_dev->spec->pool_id : rbd_dev->header.data_pool_id;
	RCU_INIT_POINTER(rbd_dev->layout.pool_ns, NULL);
}

Alex Elderbb23e372013-05-06 09:51:29 -05001000 * Fill an rbd image header with information from the given format 1
1001 * on-disk header.
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001002 */
Alex Elder662518b2013-05-06 09:51:29 -05001003static int rbd_header_from_disk(struct rbd_device *rbd_dev,
Alex Elder4156d992012-08-02 11:29:46 -05001004 struct rbd_image_header_ondisk *ondisk)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001005{
Alex Elder662518b2013-05-06 09:51:29 -05001006 struct rbd_image_header *header = &rbd_dev->header;
Alex Elderbb23e372013-05-06 09:51:29 -05001007 bool first_time = header->object_prefix == NULL;
1008 struct ceph_snap_context *snapc;
1009 char *object_prefix = NULL;
1010 char *snap_names = NULL;
1011 u64 *snap_sizes = NULL;
Alex Elderccece232012-07-10 20:30:10 -05001012 u32 snap_count;
Alex Elderbb23e372013-05-06 09:51:29 -05001013 int ret = -ENOMEM;
Alex Elder621901d2012-08-23 23:22:06 -05001014 u32 i;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001015
Alex Elderbb23e372013-05-06 09:51:29 -05001016 /* Allocate this now to avoid having to handle failure below */
1017
1018 if (first_time) {
Ilya Dryomov848d7962017-01-25 18:16:21 +01001019 object_prefix = kstrndup(ondisk->object_prefix,
1020 sizeof(ondisk->object_prefix),
1021 GFP_KERNEL);
Alex Elderbb23e372013-05-06 09:51:29 -05001022 if (!object_prefix)
1023 return -ENOMEM;
Alex Elderbb23e372013-05-06 09:51:29 -05001024 }
1025
1026 /* Allocate the snapshot context and fill it in */
Alex Elder6a523252012-07-19 17:12:59 -05001027
Alex Elder103a1502012-08-02 11:29:45 -05001028 snap_count = le32_to_cpu(ondisk->snap_count);
Alex Elderbb23e372013-05-06 09:51:29 -05001029 snapc = ceph_create_snap_context(snap_count, GFP_KERNEL);
1030 if (!snapc)
1031 goto out_err;
1032 snapc->seq = le64_to_cpu(ondisk->snap_seq);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001033 if (snap_count) {
Alex Elderbb23e372013-05-06 09:51:29 -05001034 struct rbd_image_snap_ondisk *snaps;
Alex Elderf785cc12012-08-23 23:22:06 -05001035 u64 snap_names_len = le64_to_cpu(ondisk->snap_names_len);
1036
Alex Elderbb23e372013-05-06 09:51:29 -05001037 /* We'll keep a copy of the snapshot names... */
Alex Elder621901d2012-08-23 23:22:06 -05001038
Alex Elderbb23e372013-05-06 09:51:29 -05001039 if (snap_names_len > (u64)SIZE_MAX)
1040 goto out_2big;
1041 snap_names = kmalloc(snap_names_len, GFP_KERNEL);
1042 if (!snap_names)
Alex Elder6a523252012-07-19 17:12:59 -05001043 goto out_err;
Alex Elderbb23e372013-05-06 09:51:29 -05001044
1045 /* ...as well as the array of their sizes. */
Markus Elfring88a25a52016-09-11 12:21:25 +02001046 snap_sizes = kmalloc_array(snap_count,
1047 sizeof(*header->snap_sizes),
1048 GFP_KERNEL);
Alex Elderbb23e372013-05-06 09:51:29 -05001049 if (!snap_sizes)
1050 goto out_err;
1051
Alex Elderf785cc12012-08-23 23:22:06 -05001052 /*
Alex Elderbb23e372013-05-06 09:51:29 -05001053 * Copy the names, and fill in each snapshot's id
1054 * and size.
1055 *
Alex Elder99a41eb2013-05-06 09:51:30 -05001056 * Note that rbd_dev_v1_header_info() guarantees the
Alex Elderbb23e372013-05-06 09:51:29 -05001057 * ondisk buffer we're working with has
Alex Elderf785cc12012-08-23 23:22:06 -05001058 * snap_names_len bytes beyond the end of the
		 * snapshot id array, so this memcpy() is safe.
		 */
		memcpy(snap_names, &ondisk->snaps[snap_count], snap_names_len);
		snaps = ondisk->snaps;
		for (i = 0; i < snap_count; i++) {
			snapc->snaps[i] = le64_to_cpu(snaps[i].id);
			snap_sizes[i] = le64_to_cpu(snaps[i].image_size);
		}
	}

	/* We won't fail any more, fill in the header */

	if (first_time) {
		header->object_prefix = object_prefix;
		header->obj_order = ondisk->options.order;
		rbd_init_layout(rbd_dev);
	} else {
		ceph_put_snap_context(header->snapc);
		kfree(header->snap_names);
		kfree(header->snap_sizes);
	}

	/* The remaining fields always get updated (when we refresh) */

	header->image_size = le64_to_cpu(ondisk->image_size);
	header->snapc = snapc;
	header->snap_names = snap_names;
	header->snap_sizes = snap_sizes;

	return 0;
out_2big:
	ret = -EIO;
out_err:
	kfree(snap_sizes);
	kfree(snap_names);
	ceph_put_snap_context(snapc);
	kfree(object_prefix);

	return ret;
}

static const char *_rbd_dev_v1_snap_name(struct rbd_device *rbd_dev, u32 which)
{
	const char *snap_name;

	rbd_assert(which < rbd_dev->header.snapc->num_snaps);

	/* Skip over names until we find the one we are looking for */

	snap_name = rbd_dev->header.snap_names;
	while (which--)
		snap_name += strlen(snap_name) + 1;

	return kstrdup(snap_name, GFP_KERNEL);
}

/*
 * Snapshot id comparison function for use with qsort()/bsearch().
 * Note that result is for snapshots in *descending* order.
 */
static int snapid_compare_reverse(const void *s1, const void *s2)
{
	u64 snap_id1 = *(u64 *)s1;
	u64 snap_id2 = *(u64 *)s2;

	if (snap_id1 < snap_id2)
		return 1;
	return snap_id1 == snap_id2 ? 0 : -1;
}

/*
 * Search a snapshot context to see if the given snapshot id is
 * present.
 *
 * Returns the position of the snapshot id in the array if it's found,
 * or BAD_SNAP_INDEX otherwise.
 *
 * Note: The snapshot array is kept sorted (by the osd) in
 * reverse order, highest snapshot id first.
 */
static u32 rbd_dev_snap_index(struct rbd_device *rbd_dev, u64 snap_id)
{
	struct ceph_snap_context *snapc = rbd_dev->header.snapc;
	u64 *found;

	found = bsearch(&snap_id, &snapc->snaps, snapc->num_snaps,
				sizeof (snap_id), snapid_compare_reverse);

	return found ? (u32)(found - &snapc->snaps[0]) : BAD_SNAP_INDEX;
}

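/*
 * For example (illustrative values): with snapc->snaps = { 12, 7, 3 }
 * (newest first, as the OSD keeps it), looking up snap_id 7 returns
 * index 1, while looking up snap_id 5 returns BAD_SNAP_INDEX.
 */
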
Alex Elder2ad3d712013-04-30 00:44:33 -05001150static const char *rbd_dev_v1_snap_name(struct rbd_device *rbd_dev,
1151 u64 snap_id)
Alex Elder54cac612013-04-30 00:44:33 -05001152{
1153 u32 which;
Josh Durginda6a6b62013-09-04 17:57:31 -07001154 const char *snap_name;
Alex Elder54cac612013-04-30 00:44:33 -05001155
1156 which = rbd_dev_snap_index(rbd_dev, snap_id);
1157 if (which == BAD_SNAP_INDEX)
Josh Durginda6a6b62013-09-04 17:57:31 -07001158 return ERR_PTR(-ENOENT);
Alex Elder54cac612013-04-30 00:44:33 -05001159
Josh Durginda6a6b62013-09-04 17:57:31 -07001160 snap_name = _rbd_dev_v1_snap_name(rbd_dev, which);
1161 return snap_name ? snap_name : ERR_PTR(-ENOMEM);
Alex Elder54cac612013-04-30 00:44:33 -05001162}
1163
Alex Elder9e15b772012-10-30 19:40:33 -05001164static const char *rbd_snap_name(struct rbd_device *rbd_dev, u64 snap_id)
1165{
Alex Elder9e15b772012-10-30 19:40:33 -05001166 if (snap_id == CEPH_NOSNAP)
1167 return RBD_SNAP_HEAD_NAME;
1168
Alex Elder54cac612013-04-30 00:44:33 -05001169 rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
1170 if (rbd_dev->image_format == 1)
1171 return rbd_dev_v1_snap_name(rbd_dev, snap_id);
Alex Elder9e15b772012-10-30 19:40:33 -05001172
Alex Elder54cac612013-04-30 00:44:33 -05001173 return rbd_dev_v2_snap_name(rbd_dev, snap_id);
Alex Elder9e15b772012-10-30 19:40:33 -05001174}
1175
Alex Elder2ad3d712013-04-30 00:44:33 -05001176static int rbd_snap_size(struct rbd_device *rbd_dev, u64 snap_id,
1177 u64 *snap_size)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001178{
Alex Elder2ad3d712013-04-30 00:44:33 -05001179 rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
1180 if (snap_id == CEPH_NOSNAP) {
1181 *snap_size = rbd_dev->header.image_size;
1182 } else if (rbd_dev->image_format == 1) {
1183 u32 which;
Alex Elder00f1f362012-02-07 12:03:36 -06001184
Alex Elder2ad3d712013-04-30 00:44:33 -05001185 which = rbd_dev_snap_index(rbd_dev, snap_id);
1186 if (which == BAD_SNAP_INDEX)
1187 return -ENOENT;
Alex Elder00f1f362012-02-07 12:03:36 -06001188
Alex Elder2ad3d712013-04-30 00:44:33 -05001189 *snap_size = rbd_dev->header.snap_sizes[which];
1190 } else {
1191 u64 size = 0;
1192 int ret;
1193
1194 ret = _rbd_dev_v2_snap_size(rbd_dev, snap_id, NULL, &size);
1195 if (ret)
1196 return ret;
1197
1198 *snap_size = size;
1199 }
1200 return 0;
1201}
1202
Alex Elderd1cf5782013-04-27 09:59:30 -05001203static int rbd_dev_mapping_set(struct rbd_device *rbd_dev)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001204{
Alex Elder8f4b7d92013-05-06 07:40:30 -05001205 u64 snap_id = rbd_dev->spec->snap_id;
Alex Elder2ad3d712013-04-30 00:44:33 -05001206 u64 size = 0;
Alex Elder2ad3d712013-04-30 00:44:33 -05001207 int ret;
Alex Elder8b0241f2013-04-25 23:15:08 -05001208
Alex Elder2ad3d712013-04-30 00:44:33 -05001209 ret = rbd_snap_size(rbd_dev, snap_id, &size);
1210 if (ret)
1211 return ret;
Alex Elder2ad3d712013-04-30 00:44:33 -05001212
1213 rbd_dev->mapping.size = size;
Alex Elder8b0241f2013-04-25 23:15:08 -05001214 return 0;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001215}
1216
Alex Elderd1cf5782013-04-27 09:59:30 -05001217static void rbd_dev_mapping_clear(struct rbd_device *rbd_dev)
1218{
1219 rbd_dev->mapping.size = 0;
Alex Elder200a6a82013-04-28 23:32:34 -05001220}
1221
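/* Zero a single bio_vec through a temporary atomic kmap. */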
Ilya Dryomov5359a172018-01-20 10:30:10 +01001222static void zero_bvec(struct bio_vec *bv)
Alex Elder65ccfe22012-08-09 10:33:26 -07001223{
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001224 void *buf;
Ilya Dryomov5359a172018-01-20 10:30:10 +01001225 unsigned long flags;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001226
Ilya Dryomov5359a172018-01-20 10:30:10 +01001227 buf = bvec_kmap_irq(bv, &flags);
1228 memset(buf, 0, bv->bv_len);
1229 flush_dcache_page(bv->bv_page);
1230 bvec_kunmap_irq(buf, &flags);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001231}
1232
Ilya Dryomov5359a172018-01-20 10:30:10 +01001233static void zero_bios(struct ceph_bio_iter *bio_pos, u32 off, u32 bytes)
Alex Elderb9434c52013-04-19 15:34:50 -05001234{
Ilya Dryomov5359a172018-01-20 10:30:10 +01001235 struct ceph_bio_iter it = *bio_pos;
Alex Elderb9434c52013-04-19 15:34:50 -05001236
Ilya Dryomov5359a172018-01-20 10:30:10 +01001237 ceph_bio_iter_advance(&it, off);
1238 ceph_bio_iter_advance_step(&it, bytes, ({
1239 zero_bvec(&bv);
1240 }));
Alex Elderb9434c52013-04-19 15:34:50 -05001241}
1242
Ilya Dryomov7e07efb2018-01-20 10:30:11 +01001243static void zero_bvecs(struct ceph_bvec_iter *bvec_pos, u32 off, u32 bytes)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001244{
Ilya Dryomov7e07efb2018-01-20 10:30:11 +01001245 struct ceph_bvec_iter it = *bvec_pos;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001246
Ilya Dryomov7e07efb2018-01-20 10:30:11 +01001247 ceph_bvec_iter_advance(&it, off);
1248 ceph_bvec_iter_advance_step(&it, bytes, ({
1249 zero_bvec(&bv);
1250 }));
Alex Elderf7760da2012-10-20 22:17:27 -05001251}
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001252
Alex Elderf7760da2012-10-20 22:17:27 -05001253/*
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001254 * Zero a range in @obj_req data buffer defined by a bio (list) or
Ilya Dryomovafb97882018-02-06 19:26:35 +01001255 * (private) bio_vec array.
Alex Elderf7760da2012-10-20 22:17:27 -05001256 *
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001257 * @off is relative to the start of the data buffer.
Alex Elderf7760da2012-10-20 22:17:27 -05001258 */
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001259static void rbd_obj_zero_range(struct rbd_obj_request *obj_req, u32 off,
1260 u32 bytes)
Alex Elderf7760da2012-10-20 22:17:27 -05001261{
Ilya Dryomov54ab3b22019-05-11 16:21:49 +02001262 dout("%s %p data buf %u~%u\n", __func__, obj_req, off, bytes);
1263
Ilya Dryomovecc633c2018-02-01 11:50:47 +01001264 switch (obj_req->img_request->data_type) {
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001265 case OBJ_REQUEST_BIO:
1266 zero_bios(&obj_req->bio_pos, off, bytes);
1267 break;
1268 case OBJ_REQUEST_BVECS:
Ilya Dryomovafb97882018-02-06 19:26:35 +01001269 case OBJ_REQUEST_OWN_BVECS:
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001270 zero_bvecs(&obj_req->bvec_pos, off, bytes);
1271 break;
1272 default:
Arnd Bergmann16809372019-03-22 17:53:56 +01001273 BUG();
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001274 }
Alex Elderbf0d5f502012-11-22 00:00:08 -06001275}
1276
1277static void rbd_obj_request_destroy(struct kref *kref);
1278static void rbd_obj_request_put(struct rbd_obj_request *obj_request)
1279{
1280 rbd_assert(obj_request != NULL);
Alex Elder37206ee2013-02-20 17:32:08 -06001281 dout("%s: obj %p (was %d)\n", __func__, obj_request,
Peter Zijlstra2c935bc2016-11-14 17:29:48 +01001282 kref_read(&obj_request->kref));
Alex Elderbf0d5f502012-11-22 00:00:08 -06001283 kref_put(&obj_request->kref, rbd_obj_request_destroy);
1284}
1285
Alex Elderbf0d5f502012-11-22 00:00:08 -06001286static inline void rbd_img_obj_request_add(struct rbd_img_request *img_request,
1287 struct rbd_obj_request *obj_request)
1288{
Alex Elder25dcf952013-01-25 17:08:55 -06001289 rbd_assert(obj_request->img_request == NULL);
1290
Alex Elderb155e862013-04-15 14:50:37 -05001291 /* Image request now owns object's original reference */
Alex Elderbf0d5f502012-11-22 00:00:08 -06001292 obj_request->img_request = img_request;
Ilya Dryomov15961b42018-02-01 11:50:47 +01001293 dout("%s: img %p obj %p\n", __func__, img_request, obj_request);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001294}
1295
1296static inline void rbd_img_obj_request_del(struct rbd_img_request *img_request,
1297 struct rbd_obj_request *obj_request)
1298{
Ilya Dryomov15961b42018-02-01 11:50:47 +01001299 dout("%s: img %p obj %p\n", __func__, img_request, obj_request);
Ilya Dryomov43df3d32018-02-02 15:23:22 +01001300 list_del(&obj_request->ex.oe_item);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001301 rbd_assert(obj_request->img_request == img_request);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001302 rbd_obj_request_put(obj_request);
1303}
1304
Ilya Dryomova086a1b2019-06-12 18:33:31 +02001305static void rbd_osd_submit(struct ceph_osd_request *osd_req)
Alex Elderbf0d5f502012-11-22 00:00:08 -06001306{
Ilya Dryomova086a1b2019-06-12 18:33:31 +02001307 struct rbd_obj_request *obj_req = osd_req->r_priv;
Ilya Dryomov980917f2016-09-12 18:59:42 +02001308
Ilya Dryomova086a1b2019-06-12 18:33:31 +02001309 dout("%s osd_req %p for obj_req %p objno %llu %llu~%llu\n",
1310 __func__, osd_req, obj_req, obj_req->ex.oe_objno,
1311 obj_req->ex.oe_off, obj_req->ex.oe_len);
Ilya Dryomov980917f2016-09-12 18:59:42 +02001312 ceph_osdc_start_request(osd_req->r_osdc, osd_req, false);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001313}
1314
Alex Elder0c425242013-02-08 09:55:49 -06001315/*
1316 * The default/initial value for all image request flags is 0. Each
1317 * is conditionally set to 1 at image request initialization time
1318 * and currently never changes thereafter.
1319 */
Alex Elderd0b2e942013-01-24 16:13:36 -06001320static void img_request_layered_set(struct rbd_img_request *img_request)
1321{
1322 set_bit(IMG_REQ_LAYERED, &img_request->flags);
Alex Elderd0b2e942013-01-24 16:13:36 -06001323}
1324
1325static bool img_request_layered_test(struct rbd_img_request *img_request)
1326{
Alex Elderd0b2e942013-01-24 16:13:36 -06001327 return test_bit(IMG_REQ_LAYERED, &img_request->flags) != 0;
1328}
1329
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001330static bool rbd_obj_is_entire(struct rbd_obj_request *obj_req)
Josh Durgin3b434a2a2014-04-04 17:32:15 -07001331{
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001332 struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
1333
Ilya Dryomov43df3d32018-02-02 15:23:22 +01001334 return !obj_req->ex.oe_off &&
1335 obj_req->ex.oe_len == rbd_dev->layout.object_size;
Josh Durgin3b434a2a2014-04-04 17:32:15 -07001336}
1337
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001338static bool rbd_obj_is_tail(struct rbd_obj_request *obj_req)
Alex Elder6e2a4502013-03-27 09:16:30 -05001339{
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001340 struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
Alex Elderb9434c52013-04-19 15:34:50 -05001341
Ilya Dryomov43df3d32018-02-02 15:23:22 +01001342 return obj_req->ex.oe_off + obj_req->ex.oe_len ==
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001343 rbd_dev->layout.object_size;
1344}
1345
Ilya Dryomov13488d52019-02-25 12:37:50 +01001346/*
1347 * Must be called after rbd_obj_calc_img_extents().
1348 */
1349static bool rbd_obj_copyup_enabled(struct rbd_obj_request *obj_req)
1350{
1351 if (!obj_req->num_img_extents ||
Ilya Dryomov9b17eb22019-02-28 15:51:39 +01001352 (rbd_obj_is_entire(obj_req) &&
1353 !obj_req->img_request->snapc->num_snaps))
Ilya Dryomov13488d52019-02-25 12:37:50 +01001354 return false;
1355
1356 return true;
1357}
1358
Ilya Dryomov86bd7992018-02-06 19:26:33 +01001359static u64 rbd_obj_img_extents_bytes(struct rbd_obj_request *obj_req)
1360{
1361 return ceph_file_extents_bytes(obj_req->img_extents,
1362 obj_req->num_img_extents);
1363}
1364
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001365static bool rbd_img_is_write(struct rbd_img_request *img_req)
1366{
Ilya Dryomov9bb02482018-01-30 17:52:10 +01001367 switch (img_req->op_type) {
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001368 case OBJ_OP_READ:
1369 return false;
1370 case OBJ_OP_WRITE:
1371 case OBJ_OP_DISCARD:
Ilya Dryomov6484cbe2019-01-29 12:46:25 +01001372 case OBJ_OP_ZEROOUT:
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001373 return true;
1374 default:
Arnd Bergmannc6244b32018-04-04 14:53:39 +02001375 BUG();
Alex Elder6e2a4502013-03-27 09:16:30 -05001376 }
Alex Elder6e2a4502013-03-27 09:16:30 -05001377}
1378
Ilya Dryomov85e084f2016-04-28 16:07:24 +02001379static void rbd_osd_req_callback(struct ceph_osd_request *osd_req)
Alex Elderbf0d5f502012-11-22 00:00:08 -06001380{
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001381 struct rbd_obj_request *obj_req = osd_req->r_priv;
Ilya Dryomov54ab3b22019-05-11 16:21:49 +02001382 int result;
Alex Elderbf0d5f502012-11-22 00:00:08 -06001383
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001384 dout("%s osd_req %p result %d for obj_req %p\n", __func__, osd_req,
1385 osd_req->r_result, obj_req);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001386
Ilya Dryomov54ab3b22019-05-11 16:21:49 +02001387 /*
1388 * Writes aren't allowed to return a data payload. In some
1389 * guarded write cases (e.g. stat + zero on an empty object)
1390 * a stat response makes it through, but we don't care.
1391 */
1392 if (osd_req->r_result > 0 && rbd_img_is_write(obj_req->img_request))
1393 result = 0;
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001394 else
Ilya Dryomov54ab3b22019-05-11 16:21:49 +02001395 result = osd_req->r_result;
Alex Elderbf0d5f502012-11-22 00:00:08 -06001396
Ilya Dryomov54ab3b22019-05-11 16:21:49 +02001397 rbd_obj_handle_request(obj_req, result);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001398}
1399
Ilya Dryomovbcbab1d2019-05-27 11:41:36 +02001400static void rbd_osd_format_read(struct ceph_osd_request *osd_req)
Alex Elder430c28c2013-04-03 21:32:51 -05001401{
Ilya Dryomovbcbab1d2019-05-27 11:41:36 +02001402 struct rbd_obj_request *obj_request = osd_req->r_priv;
Ilya Dryomov22d2cfd2020-06-04 11:12:34 +02001403 struct rbd_device *rbd_dev = obj_request->img_request->rbd_dev;
1404 struct ceph_options *opt = rbd_dev->rbd_client->client->options;
Alex Elder430c28c2013-04-03 21:32:51 -05001405
Ilya Dryomov22d2cfd2020-06-04 11:12:34 +02001406 osd_req->r_flags = CEPH_OSD_FLAG_READ | opt->read_from_replica;
Ilya Dryomov7c848832016-09-15 17:56:39 +02001407 osd_req->r_snapid = obj_request->img_request->snap_id;
Alex Elder9d4df012013-04-19 15:34:50 -05001408}
1409
Ilya Dryomovbcbab1d2019-05-27 11:41:36 +02001410static void rbd_osd_format_write(struct ceph_osd_request *osd_req)
Alex Elder9d4df012013-04-19 15:34:50 -05001411{
Ilya Dryomovbcbab1d2019-05-27 11:41:36 +02001412 struct rbd_obj_request *obj_request = osd_req->r_priv;
Alex Elder9d4df012013-04-19 15:34:50 -05001413
Ilya Dryomova162b302018-01-30 17:52:10 +01001414 osd_req->r_flags = CEPH_OSD_FLAG_WRITE;
Arnd Bergmannfac02dd2018-07-13 22:18:37 +02001415 ktime_get_real_ts64(&osd_req->r_mtime);
Ilya Dryomov43df3d32018-02-02 15:23:22 +01001416 osd_req->r_data_offset = obj_request->ex.oe_off;
Alex Elder430c28c2013-04-03 21:32:51 -05001417}
1418
Ilya Dryomovbc812072017-01-25 18:16:23 +01001419static struct ceph_osd_request *
Ilya Dryomovbcbab1d2019-05-27 11:41:36 +02001420__rbd_obj_add_osd_request(struct rbd_obj_request *obj_req,
1421 struct ceph_snap_context *snapc, int num_ops)
Ilya Dryomovbc812072017-01-25 18:16:23 +01001422{
Ilya Dryomove28eded2019-02-25 11:42:26 +01001423 struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
Ilya Dryomovbc812072017-01-25 18:16:23 +01001424 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
1425 struct ceph_osd_request *req;
Ilya Dryomova90bb0c2017-01-25 18:16:23 +01001426 const char *name_format = rbd_dev->image_format == 1 ?
1427 RBD_V1_DATA_FORMAT : RBD_V2_DATA_FORMAT;
Ilya Dryomovbcbab1d2019-05-27 11:41:36 +02001428 int ret;
Ilya Dryomovbc812072017-01-25 18:16:23 +01001429
Ilya Dryomove28eded2019-02-25 11:42:26 +01001430 req = ceph_osdc_alloc_request(osdc, snapc, num_ops, false, GFP_NOIO);
Ilya Dryomovbc812072017-01-25 18:16:23 +01001431 if (!req)
Ilya Dryomovbcbab1d2019-05-27 11:41:36 +02001432 return ERR_PTR(-ENOMEM);
Ilya Dryomovbc812072017-01-25 18:16:23 +01001433
Ilya Dryomovbcbab1d2019-05-27 11:41:36 +02001434 list_add_tail(&req->r_private_item, &obj_req->osd_reqs);
Ilya Dryomovbc812072017-01-25 18:16:23 +01001435 req->r_callback = rbd_osd_req_callback;
Ilya Dryomova162b302018-01-30 17:52:10 +01001436 req->r_priv = obj_req;
Ilya Dryomovbc812072017-01-25 18:16:23 +01001437
Ilya Dryomovb26c0472018-07-03 15:28:43 +02001438 /*
1439 * Data objects may be stored in a separate pool, but always in
1440 * the same namespace in that pool as the header in its pool.
1441 */
1442 ceph_oloc_copy(&req->r_base_oloc, &rbd_dev->header_oloc);
Ilya Dryomovbc812072017-01-25 18:16:23 +01001443 req->r_base_oloc.pool = rbd_dev->layout.pool_id;
Ilya Dryomovb26c0472018-07-03 15:28:43 +02001444
Ilya Dryomovbcbab1d2019-05-27 11:41:36 +02001445 ret = ceph_oid_aprintf(&req->r_base_oid, GFP_NOIO, name_format,
1446 rbd_dev->header.object_prefix,
1447 obj_req->ex.oe_objno);
1448 if (ret)
1449 return ERR_PTR(ret);
Ilya Dryomovbc812072017-01-25 18:16:23 +01001450
Ilya Dryomovbc812072017-01-25 18:16:23 +01001451 return req;
Ilya Dryomovbc812072017-01-25 18:16:23 +01001452}
1453
Ilya Dryomove28eded2019-02-25 11:42:26 +01001454static struct ceph_osd_request *
Ilya Dryomovbcbab1d2019-05-27 11:41:36 +02001455rbd_obj_add_osd_request(struct rbd_obj_request *obj_req, int num_ops)
Ilya Dryomove28eded2019-02-25 11:42:26 +01001456{
Ilya Dryomovbcbab1d2019-05-27 11:41:36 +02001457 return __rbd_obj_add_osd_request(obj_req, obj_req->img_request->snapc,
1458 num_ops);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001459}
1460
Ilya Dryomovecc633c2018-02-01 11:50:47 +01001461static struct rbd_obj_request *rbd_obj_request_create(void)
Alex Elderbf0d5f502012-11-22 00:00:08 -06001462{
1463 struct rbd_obj_request *obj_request;
Alex Elderbf0d5f502012-11-22 00:00:08 -06001464
Ilya Dryomov5a60e872015-06-24 17:24:33 +03001465 obj_request = kmem_cache_zalloc(rbd_obj_request_cache, GFP_NOIO);
Ilya Dryomov6c696d82017-01-25 18:16:23 +01001466 if (!obj_request)
Alex Elderf907ad52013-05-01 12:43:03 -05001467 return NULL;
Alex Elderf907ad52013-05-01 12:43:03 -05001468
Ilya Dryomov43df3d32018-02-02 15:23:22 +01001469 ceph_object_extent_init(&obj_request->ex);
Ilya Dryomovbcbab1d2019-05-27 11:41:36 +02001470 INIT_LIST_HEAD(&obj_request->osd_reqs);
Ilya Dryomov85b5e6d2019-05-14 21:06:07 +02001471 mutex_init(&obj_request->state_mutex);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001472 kref_init(&obj_request->kref);
1473
Ilya Dryomov67e2b652017-01-25 18:16:22 +01001474 dout("%s %p\n", __func__, obj_request);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001475 return obj_request;
1476}
1477
1478static void rbd_obj_request_destroy(struct kref *kref)
1479{
1480 struct rbd_obj_request *obj_request;
Ilya Dryomovbcbab1d2019-05-27 11:41:36 +02001481 struct ceph_osd_request *osd_req;
Ilya Dryomov7e07efb2018-01-20 10:30:11 +01001482 u32 i;
Alex Elderbf0d5f502012-11-22 00:00:08 -06001483
1484 obj_request = container_of(kref, struct rbd_obj_request, kref);
1485
Alex Elder37206ee2013-02-20 17:32:08 -06001486 dout("%s: obj %p\n", __func__, obj_request);
1487
Ilya Dryomovbcbab1d2019-05-27 11:41:36 +02001488 while (!list_empty(&obj_request->osd_reqs)) {
1489 osd_req = list_first_entry(&obj_request->osd_reqs,
1490 struct ceph_osd_request, r_private_item);
1491 list_del_init(&osd_req->r_private_item);
1492 ceph_osdc_put_request(osd_req);
1493 }
Alex Elderbf0d5f502012-11-22 00:00:08 -06001494
Ilya Dryomovecc633c2018-02-01 11:50:47 +01001495 switch (obj_request->img_request->data_type) {
Alex Elder9969ebc2013-01-18 12:31:10 -06001496 case OBJ_REQUEST_NODATA:
Alex Elderbf0d5f502012-11-22 00:00:08 -06001497 case OBJ_REQUEST_BIO:
Ilya Dryomov7e07efb2018-01-20 10:30:11 +01001498 case OBJ_REQUEST_BVECS:
Ilya Dryomov5359a172018-01-20 10:30:10 +01001499 break; /* Nothing to do */
Ilya Dryomovafb97882018-02-06 19:26:35 +01001500 case OBJ_REQUEST_OWN_BVECS:
1501 kfree(obj_request->bvec_pos.bvecs);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001502 break;
Ilya Dryomov7e07efb2018-01-20 10:30:11 +01001503 default:
Arnd Bergmann16809372019-03-22 17:53:56 +01001504 BUG();
Alex Elderbf0d5f502012-11-22 00:00:08 -06001505 }
1506
Ilya Dryomov86bd7992018-02-06 19:26:33 +01001507 kfree(obj_request->img_extents);
Ilya Dryomov7e07efb2018-01-20 10:30:11 +01001508 if (obj_request->copyup_bvecs) {
1509 for (i = 0; i < obj_request->copyup_bvec_count; i++) {
1510 if (obj_request->copyup_bvecs[i].bv_page)
1511 __free_page(obj_request->copyup_bvecs[i].bv_page);
1512 }
1513 kfree(obj_request->copyup_bvecs);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001514 }
1515
Alex Elder868311b2013-05-01 12:43:03 -05001516 kmem_cache_free(rbd_obj_request_cache, obj_request);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001517}
1518
Alex Elderfb65d2282013-05-08 22:50:04 -05001519/* It's OK to call this for a device with no parent */
1520
1521static void rbd_spec_put(struct rbd_spec *spec);
1522static void rbd_dev_unparent(struct rbd_device *rbd_dev)
1523{
1524 rbd_dev_remove_parent(rbd_dev);
1525 rbd_spec_put(rbd_dev->parent_spec);
1526 rbd_dev->parent_spec = NULL;
1527 rbd_dev->parent_overlap = 0;
1528}
1529
Alex Elderbf0d5f502012-11-22 00:00:08 -06001530/*
Alex Eldera2acd002013-05-08 22:50:04 -05001531 * Parent image reference counting is used to determine when an
1532 * image's parent fields can be safely torn down--after there are no
1533 * more in-flight requests to the parent image. When the last
1534 * reference is dropped, cleaning them up is safe.
1535 */
1536static void rbd_dev_parent_put(struct rbd_device *rbd_dev)
1537{
1538 int counter;
1539
1540 if (!rbd_dev->parent_spec)
1541 return;
1542
1543 counter = atomic_dec_return_safe(&rbd_dev->parent_ref);
1544 if (counter > 0)
1545 return;
1546
1547 /* Last reference; clean up parent data structures */
1548
1549 if (!counter)
1550 rbd_dev_unparent(rbd_dev);
1551 else
Ilya Dryomov9584d502014-07-11 12:11:20 +04001552 rbd_warn(rbd_dev, "parent reference underflow");
Alex Eldera2acd002013-05-08 22:50:04 -05001553}
1554
1555/*
1556 * If an image has a non-zero parent overlap, get a reference to its
1557 * parent.
1558 *
1559 * Returns true if the rbd device has a parent with a non-zero
1560 * overlap and a reference for it was successfully taken, or
1561 * false otherwise.
1562 */
1563static bool rbd_dev_parent_get(struct rbd_device *rbd_dev)
1564{
Ilya Dryomovae43e9d2015-01-19 18:13:43 +03001565 int counter = 0;
Alex Eldera2acd002013-05-08 22:50:04 -05001566
1567 if (!rbd_dev->parent_spec)
1568 return false;
1569
Ilya Dryomovae43e9d2015-01-19 18:13:43 +03001570 if (rbd_dev->parent_overlap)
1571 counter = atomic_inc_return_safe(&rbd_dev->parent_ref);
Alex Eldera2acd002013-05-08 22:50:04 -05001572
1573 if (counter < 0)
Ilya Dryomov9584d502014-07-11 12:11:20 +04001574 rbd_warn(rbd_dev, "parent reference overflow");
Alex Eldera2acd002013-05-08 22:50:04 -05001575
Ilya Dryomovae43e9d2015-01-19 18:13:43 +03001576 return counter > 0;
Alex Eldera2acd002013-05-08 22:50:04 -05001577}
1578
Ilya Dryomov59e542c2020-02-12 15:23:58 +01001579static void rbd_img_request_init(struct rbd_img_request *img_request,
1580 struct rbd_device *rbd_dev,
1581 enum obj_operation_type op_type)
Alex Elderbf0d5f502012-11-22 00:00:08 -06001582{
Ilya Dryomov59e542c2020-02-12 15:23:58 +01001583 memset(img_request, 0, sizeof(*img_request));
Alex Elderbf0d5f502012-11-22 00:00:08 -06001584
Alex Elderbf0d5f502012-11-22 00:00:08 -06001585 img_request->rbd_dev = rbd_dev;
Ilya Dryomov9bb02482018-01-30 17:52:10 +01001586 img_request->op_type = op_type;
Ilya Dryomova0c58952018-01-22 16:03:06 +01001587
Ilya Dryomove1fddc82019-05-30 16:07:48 +02001588 INIT_LIST_HEAD(&img_request->lock_item);
Ilya Dryomov43df3d32018-02-02 15:23:22 +01001589 INIT_LIST_HEAD(&img_request->object_extents);
Ilya Dryomov0192ce22019-05-16 15:06:56 +02001590 mutex_init(&img_request->state_mutex);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001591}
1592
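/*
 * Capture the header state an image request depends on while
 * header_rwsem is held: a snap context reference for writes, the
 * mapped snap_id for reads, and a parent reference if the image is
 * layered.
 */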
Ilya Dryomova52cc682020-02-12 15:08:39 +01001593static void rbd_img_capture_header(struct rbd_img_request *img_req)
1594{
1595 struct rbd_device *rbd_dev = img_req->rbd_dev;
1596
1597 lockdep_assert_held(&rbd_dev->header_rwsem);
1598
1599 if (rbd_img_is_write(img_req))
1600 img_req->snapc = ceph_get_snap_context(rbd_dev->header.snapc);
1601 else
1602 img_req->snap_id = rbd_dev->spec->snap_id;
1603
1604 if (rbd_dev_parent_get(rbd_dev))
1605 img_request_layered_set(img_req);
1606}
1607
Hannes Reinecke679a97d2020-01-31 11:37:36 +01001608static void rbd_img_request_destroy(struct rbd_img_request *img_request)
Alex Elderbf0d5f502012-11-22 00:00:08 -06001609{
Alex Elderbf0d5f502012-11-22 00:00:08 -06001610 struct rbd_obj_request *obj_request;
1611 struct rbd_obj_request *next_obj_request;
1612
Alex Elder37206ee2013-02-20 17:32:08 -06001613 dout("%s: img %p\n", __func__, img_request);
1614
Ilya Dryomove1fddc82019-05-30 16:07:48 +02001615 WARN_ON(!list_empty(&img_request->lock_item));
Alex Elderbf0d5f502012-11-22 00:00:08 -06001616 for_each_obj_request_safe(img_request, obj_request, next_obj_request)
1617 rbd_img_obj_request_del(img_request, obj_request);
1618
Ilya Dryomov78b42a82020-02-12 14:34:03 +01001619 if (img_request_layered_test(img_request))
Alex Eldera2acd002013-05-08 22:50:04 -05001620 rbd_dev_parent_put(img_request->rbd_dev);
Alex Eldera2acd002013-05-08 22:50:04 -05001621
Ilya Dryomov9bb02482018-01-30 17:52:10 +01001622 if (rbd_img_is_write(img_request))
Alex Elder812164f82013-04-30 00:44:32 -05001623 ceph_put_snap_context(img_request->snapc);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001624
Ilya Dryomov59e542c2020-02-12 15:23:58 +01001625 if (test_bit(IMG_REQ_CHILD, &img_request->flags))
1626 kmem_cache_free(rbd_img_request_cache, img_request);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001627}
1628
Ilya Dryomov22e8bd52019-06-05 19:25:11 +02001629#define BITS_PER_OBJ 2
1630#define OBJS_PER_BYTE (BITS_PER_BYTE / BITS_PER_OBJ)
1631#define OBJ_MASK ((1 << BITS_PER_OBJ) - 1)
1632
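/*
 * The object map packs BITS_PER_OBJ (2) state bits per object,
 * OBJS_PER_BYTE (4) objects per byte, with object 0 occupying the
 * most significant bits of byte 0.  For example, objno 5 yields
 * index 1 and shift 4, i.e. bits 5..4 of the second byte.
 */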
1633static void __rbd_object_map_index(struct rbd_device *rbd_dev, u64 objno,
1634 u64 *index, u8 *shift)
1635{
1636 u32 off;
1637
1638 rbd_assert(objno < rbd_dev->object_map_size);
1639 *index = div_u64_rem(objno, OBJS_PER_BYTE, &off);
1640 *shift = (OBJS_PER_BYTE - off - 1) * BITS_PER_OBJ;
1641}
1642
1643static u8 __rbd_object_map_get(struct rbd_device *rbd_dev, u64 objno)
1644{
1645 u64 index;
1646 u8 shift;
1647
1648 lockdep_assert_held(&rbd_dev->object_map_lock);
1649 __rbd_object_map_index(rbd_dev, objno, &index, &shift);
1650 return (rbd_dev->object_map[index] >> shift) & OBJ_MASK;
1651}
1652
1653static void __rbd_object_map_set(struct rbd_device *rbd_dev, u64 objno, u8 val)
1654{
1655 u64 index;
1656 u8 shift;
1657 u8 *p;
1658
1659 lockdep_assert_held(&rbd_dev->object_map_lock);
1660 rbd_assert(!(val & ~OBJ_MASK));
1661
1662 __rbd_object_map_index(rbd_dev, objno, &index, &shift);
1663 p = &rbd_dev->object_map[index];
1664 *p = (*p & ~(OBJ_MASK << shift)) | (val << shift);
1665}
1666
1667static u8 rbd_object_map_get(struct rbd_device *rbd_dev, u64 objno)
1668{
1669 u8 state;
1670
1671 spin_lock(&rbd_dev->object_map_lock);
1672 state = __rbd_object_map_get(rbd_dev, objno);
1673 spin_unlock(&rbd_dev->object_map_lock);
1674 return state;
1675}
1676
1677static bool use_object_map(struct rbd_device *rbd_dev)
1678{
Ilya Dryomov3fe69922019-11-12 19:41:48 +01001679 /*
1680 * An image mapped read-only can't use the object map -- it isn't
1681 * loaded because the header lock isn't acquired. Someone else can
1682 * write to the image and update the object map behind our back.
1683 *
1684 * A snapshot can't be written to, so using the object map is always
1685 * safe.
1686 */
1687 if (!rbd_is_snap(rbd_dev) && rbd_is_ro(rbd_dev))
1688 return false;
1689
Ilya Dryomov22e8bd52019-06-05 19:25:11 +02001690 return ((rbd_dev->header.features & RBD_FEATURE_OBJECT_MAP) &&
1691 !(rbd_dev->object_map_flags & RBD_FLAG_OBJECT_MAP_INVALID));
1692}
1693
1694static bool rbd_object_map_may_exist(struct rbd_device *rbd_dev, u64 objno)
1695{
1696 u8 state;
1697
1698 /* fall back to default logic if object map is disabled or invalid */
1699 if (!use_object_map(rbd_dev))
1700 return true;
1701
1702 state = rbd_object_map_get(rbd_dev, objno);
1703 return state != OBJECT_NONEXISTENT;
1704}
1705
1706static void rbd_object_map_name(struct rbd_device *rbd_dev, u64 snap_id,
1707 struct ceph_object_id *oid)
1708{
1709 if (snap_id == CEPH_NOSNAP)
1710 ceph_oid_printf(oid, "%s%s", RBD_OBJECT_MAP_PREFIX,
1711 rbd_dev->spec->image_id);
1712 else
1713 ceph_oid_printf(oid, "%s%s.%016llx", RBD_OBJECT_MAP_PREFIX,
1714 rbd_dev->spec->image_id, snap_id);
1715}
1716
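/*
 * Acquire the exclusive object map lock.  -EEXIST means we already
 * hold it.  On -EBUSY, look up the current locker, break what is
 * assumed to be a stale lock and retry; the lock is broken at most
 * once (broke_lock).
 */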
1717static int rbd_object_map_lock(struct rbd_device *rbd_dev)
1718{
1719 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
1720 CEPH_DEFINE_OID_ONSTACK(oid);
1721 u8 lock_type;
1722 char *lock_tag;
1723 struct ceph_locker *lockers;
1724 u32 num_lockers;
1725 bool broke_lock = false;
1726 int ret;
1727
1728 rbd_object_map_name(rbd_dev, CEPH_NOSNAP, &oid);
1729
1730again:
1731 ret = ceph_cls_lock(osdc, &oid, &rbd_dev->header_oloc, RBD_LOCK_NAME,
1732 CEPH_CLS_LOCK_EXCLUSIVE, "", "", "", 0);
1733 if (ret != -EBUSY || broke_lock) {
1734 if (ret == -EEXIST)
1735 ret = 0; /* already locked by myself */
1736 if (ret)
1737 rbd_warn(rbd_dev, "failed to lock object map: %d", ret);
1738 return ret;
1739 }
1740
1741 ret = ceph_cls_lock_info(osdc, &oid, &rbd_dev->header_oloc,
1742 RBD_LOCK_NAME, &lock_type, &lock_tag,
1743 &lockers, &num_lockers);
1744 if (ret) {
1745 if (ret == -ENOENT)
1746 goto again;
1747
1748 rbd_warn(rbd_dev, "failed to get object map lockers: %d", ret);
1749 return ret;
1750 }
1751
1752 kfree(lock_tag);
1753 if (num_lockers == 0)
1754 goto again;
1755
1756 rbd_warn(rbd_dev, "breaking object map lock owned by %s%llu",
1757 ENTITY_NAME(lockers[0].id.name));
1758
1759 ret = ceph_cls_break_lock(osdc, &oid, &rbd_dev->header_oloc,
1760 RBD_LOCK_NAME, lockers[0].id.cookie,
1761 &lockers[0].id.name);
1762 ceph_free_lockers(lockers, num_lockers);
1763 if (ret) {
1764 if (ret == -ENOENT)
1765 goto again;
1766
1767 rbd_warn(rbd_dev, "failed to break object map lock: %d", ret);
1768 return ret;
1769 }
1770
1771 broke_lock = true;
1772 goto again;
1773}
1774
1775static void rbd_object_map_unlock(struct rbd_device *rbd_dev)
1776{
1777 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
1778 CEPH_DEFINE_OID_ONSTACK(oid);
1779 int ret;
1780
1781 rbd_object_map_name(rbd_dev, CEPH_NOSNAP, &oid);
1782
1783 ret = ceph_cls_unlock(osdc, &oid, &rbd_dev->header_oloc, RBD_LOCK_NAME,
1784 "");
1785 if (ret && ret != -ENOENT)
1786 rbd_warn(rbd_dev, "failed to unlock object map: %d", ret);
1787}
1788
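/*
 * The object_map_load reply starts with a length-prefixed BitVector
 * header.  Only the element count is of interest here: decode it into
 * *object_map_size and advance *p past the header.
 */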
1789static int decode_object_map_header(void **p, void *end, u64 *object_map_size)
1790{
1791 u8 struct_v;
1792 u32 struct_len;
1793 u32 header_len;
1794 void *header_end;
1795 int ret;
1796
1797 ceph_decode_32_safe(p, end, header_len, e_inval);
1798 header_end = *p + header_len;
1799
1800 ret = ceph_start_decoding(p, end, 1, "BitVector header", &struct_v,
1801 &struct_len);
1802 if (ret)
1803 return ret;
1804
1805 ceph_decode_64_safe(p, end, *object_map_size, e_inval);
1806
1807 *p = header_end;
1808 return 0;
1809
1810e_inval:
1811 return -EINVAL;
1812}
1813
1814static int __rbd_object_map_load(struct rbd_device *rbd_dev)
1815{
1816 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
1817 CEPH_DEFINE_OID_ONSTACK(oid);
1818 struct page **pages;
1819 void *p, *end;
1820 size_t reply_len;
1821 u64 num_objects;
1822 u64 object_map_bytes;
1823 u64 object_map_size;
1824 int num_pages;
1825 int ret;
1826
1827 rbd_assert(!rbd_dev->object_map && !rbd_dev->object_map_size);
1828
1829 num_objects = ceph_get_num_objects(&rbd_dev->layout,
1830 rbd_dev->mapping.size);
1831 object_map_bytes = DIV_ROUND_UP_ULL(num_objects * BITS_PER_OBJ,
1832 BITS_PER_BYTE);
1833 num_pages = calc_pages_for(0, object_map_bytes) + 1;
1834 pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL);
1835 if (IS_ERR(pages))
1836 return PTR_ERR(pages);
1837
1838 reply_len = num_pages * PAGE_SIZE;
1839 rbd_object_map_name(rbd_dev, rbd_dev->spec->snap_id, &oid);
1840 ret = ceph_osdc_call(osdc, &oid, &rbd_dev->header_oloc,
1841 "rbd", "object_map_load", CEPH_OSD_FLAG_READ,
1842 NULL, 0, pages, &reply_len);
1843 if (ret)
1844 goto out;
1845
1846 p = page_address(pages[0]);
1847 end = p + min(reply_len, (size_t)PAGE_SIZE);
1848 ret = decode_object_map_header(&p, end, &object_map_size);
1849 if (ret)
1850 goto out;
1851
1852 if (object_map_size != num_objects) {
1853 rbd_warn(rbd_dev, "object map size mismatch: %llu vs %llu",
1854 object_map_size, num_objects);
1855 ret = -EINVAL;
1856 goto out;
1857 }
1858
1859 if (offset_in_page(p) + object_map_bytes > reply_len) {
1860 ret = -EINVAL;
1861 goto out;
1862 }
1863
1864 rbd_dev->object_map = kvmalloc(object_map_bytes, GFP_KERNEL);
1865 if (!rbd_dev->object_map) {
1866 ret = -ENOMEM;
1867 goto out;
1868 }
1869
1870 rbd_dev->object_map_size = object_map_size;
1871 ceph_copy_from_page_vector(pages, rbd_dev->object_map,
1872 offset_in_page(p), object_map_bytes);
1873
1874out:
1875 ceph_release_page_vector(pages, num_pages);
1876 return ret;
1877}
1878
1879static void rbd_object_map_free(struct rbd_device *rbd_dev)
1880{
1881 kvfree(rbd_dev->object_map);
1882 rbd_dev->object_map = NULL;
1883 rbd_dev->object_map_size = 0;
1884}
1885
1886static int rbd_object_map_load(struct rbd_device *rbd_dev)
1887{
1888 int ret;
1889
1890 ret = __rbd_object_map_load(rbd_dev);
1891 if (ret)
1892 return ret;
1893
1894 ret = rbd_dev_v2_get_flags(rbd_dev);
1895 if (ret) {
1896 rbd_object_map_free(rbd_dev);
1897 return ret;
1898 }
1899
1900 if (rbd_dev->object_map_flags & RBD_FLAG_OBJECT_MAP_INVALID)
1901 rbd_warn(rbd_dev, "object map is invalid");
1902
1903 return 0;
1904}
1905
1906static int rbd_object_map_open(struct rbd_device *rbd_dev)
1907{
1908 int ret;
1909
1910 ret = rbd_object_map_lock(rbd_dev);
1911 if (ret)
1912 return ret;
1913
1914 ret = rbd_object_map_load(rbd_dev);
1915 if (ret) {
1916 rbd_object_map_unlock(rbd_dev);
1917 return ret;
1918 }
1919
1920 return 0;
1921}
1922
1923static void rbd_object_map_close(struct rbd_device *rbd_dev)
1924{
1925 rbd_object_map_free(rbd_dev);
1926 rbd_object_map_unlock(rbd_dev);
1927}
1928
1929/*
1930 * This function needs snap_id (or more precisely just something to
1931 * distinguish between HEAD and snapshot object maps), new_state and
1932 * current_state that were passed to rbd_object_map_update().
1933 *
1934 * To avoid allocating and stashing a context we piggyback on the OSD
1935 * request. A HEAD update has two ops (assert_locked is the extra one). For new_state
1936 * and current_state we decode our own object_map_update op, encoded in
1937 * rbd_cls_object_map_update().
1938 */
1939static int rbd_object_map_update_finish(struct rbd_obj_request *obj_req,
1940 struct ceph_osd_request *osd_req)
1941{
1942 struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
1943 struct ceph_osd_data *osd_data;
1944 u64 objno;
Kees Cook3f649ab2020-06-03 13:09:38 -07001945 u8 state, new_state, current_state;
Ilya Dryomov22e8bd52019-06-05 19:25:11 +02001946 bool has_current_state;
1947 void *p;
1948
1949 if (osd_req->r_result)
1950 return osd_req->r_result;
1951
1952 /*
1953 * Nothing to do for a snapshot object map.
1954 */
1955 if (osd_req->r_num_ops == 1)
1956 return 0;
1957
1958 /*
1959 * Update in-memory HEAD object map.
1960 */
1961 rbd_assert(osd_req->r_num_ops == 2);
1962 osd_data = osd_req_op_data(osd_req, 1, cls, request_data);
1963 rbd_assert(osd_data->type == CEPH_OSD_DATA_TYPE_PAGES);
1964
1965 p = page_address(osd_data->pages[0]);
1966 objno = ceph_decode_64(&p);
1967 rbd_assert(objno == obj_req->ex.oe_objno);
1968 rbd_assert(ceph_decode_64(&p) == objno + 1);
1969 new_state = ceph_decode_8(&p);
1970 has_current_state = ceph_decode_8(&p);
1971 if (has_current_state)
1972 current_state = ceph_decode_8(&p);
1973
1974 spin_lock(&rbd_dev->object_map_lock);
1975 state = __rbd_object_map_get(rbd_dev, objno);
1976 if (!has_current_state || current_state == state ||
1977 (current_state == OBJECT_EXISTS && state == OBJECT_EXISTS_CLEAN))
1978 __rbd_object_map_set(rbd_dev, objno, new_state);
1979 spin_unlock(&rbd_dev->object_map_lock);
1980
1981 return 0;
1982}
1983
1984static void rbd_object_map_callback(struct ceph_osd_request *osd_req)
1985{
1986 struct rbd_obj_request *obj_req = osd_req->r_priv;
1987 int result;
1988
1989 dout("%s osd_req %p result %d for obj_req %p\n", __func__, osd_req,
1990 osd_req->r_result, obj_req);
1991
1992 result = rbd_object_map_update_finish(obj_req, osd_req);
1993 rbd_obj_handle_request(obj_req, result);
1994}
1995
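/*
 * Skip the update if it wouldn't change the in-memory state: the
 * object is already in @new_state, a nonexistent object is being
 * marked PENDING, or an object that isn't PENDING is being marked
 * NONEXISTENT.
 */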
1996static bool update_needed(struct rbd_device *rbd_dev, u64 objno, u8 new_state)
1997{
1998 u8 state = rbd_object_map_get(rbd_dev, objno);
1999
2000 if (state == new_state ||
2001 (new_state == OBJECT_PENDING && state == OBJECT_NONEXISTENT) ||
2002 (new_state == OBJECT_NONEXISTENT && state != OBJECT_PENDING))
2003 return false;
2004
2005 return true;
2006}
2007
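/*
 * Set up an "rbd" "object_map_update" call op at index @which.  The
 * request data page encodes the half-open range [objno, objno + 1),
 * @new_state and an optional @current_state to compare against; this
 * is what rbd_object_map_update_finish() decodes on completion.
 */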
2008static int rbd_cls_object_map_update(struct ceph_osd_request *req,
2009 int which, u64 objno, u8 new_state,
2010 const u8 *current_state)
2011{
2012 struct page **pages;
2013 void *p, *start;
2014 int ret;
2015
2016 ret = osd_req_op_cls_init(req, which, "rbd", "object_map_update");
2017 if (ret)
2018 return ret;
2019
2020 pages = ceph_alloc_page_vector(1, GFP_NOIO);
2021 if (IS_ERR(pages))
2022 return PTR_ERR(pages);
2023
2024 p = start = page_address(pages[0]);
2025 ceph_encode_64(&p, objno);
2026 ceph_encode_64(&p, objno + 1);
2027 ceph_encode_8(&p, new_state);
2028 if (current_state) {
2029 ceph_encode_8(&p, 1);
2030 ceph_encode_8(&p, *current_state);
2031 } else {
2032 ceph_encode_8(&p, 0);
2033 }
2034
2035 osd_req_op_cls_request_data_pages(req, which, pages, p - start, 0,
2036 false, true);
2037 return 0;
2038}
2039
2040/*
2041 * Return:
2042 * 0 - object map update sent
2043 * 1 - object map update isn't needed
2044 * <0 - error
2045 */
2046static int rbd_object_map_update(struct rbd_obj_request *obj_req, u64 snap_id,
2047 u8 new_state, const u8 *current_state)
2048{
2049 struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
2050 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
2051 struct ceph_osd_request *req;
2052 int num_ops = 1;
2053 int which = 0;
2054 int ret;
2055
2056 if (snap_id == CEPH_NOSNAP) {
2057 if (!update_needed(rbd_dev, obj_req->ex.oe_objno, new_state))
2058 return 1;
2059
2060 num_ops++; /* assert_locked */
2061 }
2062
2063 req = ceph_osdc_alloc_request(osdc, NULL, num_ops, false, GFP_NOIO);
2064 if (!req)
2065 return -ENOMEM;
2066
2067 list_add_tail(&req->r_private_item, &obj_req->osd_reqs);
2068 req->r_callback = rbd_object_map_callback;
2069 req->r_priv = obj_req;
2070
2071 rbd_object_map_name(rbd_dev, snap_id, &req->r_base_oid);
2072 ceph_oloc_copy(&req->r_base_oloc, &rbd_dev->header_oloc);
2073 req->r_flags = CEPH_OSD_FLAG_WRITE;
2074 ktime_get_real_ts64(&req->r_mtime);
2075
2076 if (snap_id == CEPH_NOSNAP) {
2077 /*
2078 * Protect against possible race conditions during lock
2079 * ownership transitions.
2080 */
2081 ret = ceph_cls_assert_locked(req, which++, RBD_LOCK_NAME,
2082 CEPH_CLS_LOCK_EXCLUSIVE, "", "");
2083 if (ret)
2084 return ret;
2085 }
2086
2087 ret = rbd_cls_object_map_update(req, which, obj_req->ex.oe_objno,
2088 new_state, current_state);
2089 if (ret)
2090 return ret;
2091
2092 ret = ceph_osdc_alloc_messages(req, GFP_NOIO);
2093 if (ret)
2094 return ret;
2095
2096 ceph_osdc_start_request(osdc, req, false);
2097 return 0;
2098}
2099
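/*
 * Clip reverse-mapped image extents to the parent overlap: drop
 * extents that lie entirely beyond it and trim the last one if it
 * straddles the boundary.
 */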
Ilya Dryomov86bd7992018-02-06 19:26:33 +01002100static void prune_extents(struct ceph_file_extent *img_extents,
2101 u32 *num_img_extents, u64 overlap)
Alex Eldere93f3152013-05-08 22:50:04 -05002102{
Ilya Dryomov86bd7992018-02-06 19:26:33 +01002103 u32 cnt = *num_img_extents;
Alex Eldere93f3152013-05-08 22:50:04 -05002104
Ilya Dryomov86bd7992018-02-06 19:26:33 +01002105 /* drop extents completely beyond the overlap */
2106 while (cnt && img_extents[cnt - 1].fe_off >= overlap)
2107 cnt--;
Alex Eldere93f3152013-05-08 22:50:04 -05002108
Ilya Dryomov86bd7992018-02-06 19:26:33 +01002109 if (cnt) {
2110 struct ceph_file_extent *ex = &img_extents[cnt - 1];
Alex Eldere93f3152013-05-08 22:50:04 -05002111
Ilya Dryomov86bd7992018-02-06 19:26:33 +01002112 /* trim final overlapping extent */
2113 if (ex->fe_off + ex->fe_len > overlap)
2114 ex->fe_len = overlap - ex->fe_off;
Alex Elder12178572013-02-08 09:55:49 -06002115 }
2116
Ilya Dryomov86bd7992018-02-06 19:26:33 +01002117 *num_img_extents = cnt;
Alex Elder21692382013-04-05 01:27:12 -05002118}
2119
Alex Elderf1a47392013-04-19 15:34:50 -05002120/*
Ilya Dryomov86bd7992018-02-06 19:26:33 +01002121 * Determine the byte range(s) covered by either just the object extent
2122 * or the entire object in the parent image.
Josh Durgin3b434a2a2014-04-04 17:32:15 -07002123 */
Ilya Dryomov86bd7992018-02-06 19:26:33 +01002124static int rbd_obj_calc_img_extents(struct rbd_obj_request *obj_req,
2125 bool entire)
Josh Durgin3b434a2a2014-04-04 17:32:15 -07002126{
Ilya Dryomov86bd7992018-02-06 19:26:33 +01002127 struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
Alex Elderc5b5ef62013-02-11 12:33:24 -06002128 int ret;
2129
Ilya Dryomov86bd7992018-02-06 19:26:33 +01002130 if (!rbd_dev->parent_overlap)
2131 return 0;
2132
2133 ret = ceph_extent_to_file(&rbd_dev->layout, obj_req->ex.oe_objno,
2134 entire ? 0 : obj_req->ex.oe_off,
2135 entire ? rbd_dev->layout.object_size :
2136 obj_req->ex.oe_len,
2137 &obj_req->img_extents,
2138 &obj_req->num_img_extents);
2139 if (ret)
2140 return ret;
2141
2142 prune_extents(obj_req->img_extents, &obj_req->num_img_extents,
2143 rbd_dev->parent_overlap);
2144 return 0;
2145}
2146
Ilya Dryomovbcbab1d2019-05-27 11:41:36 +02002147static void rbd_osd_setup_data(struct ceph_osd_request *osd_req, int which)
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002148{
Ilya Dryomovbcbab1d2019-05-27 11:41:36 +02002149 struct rbd_obj_request *obj_req = osd_req->r_priv;
2150
Ilya Dryomovecc633c2018-02-01 11:50:47 +01002151 switch (obj_req->img_request->data_type) {
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002152 case OBJ_REQUEST_BIO:
Ilya Dryomovbcbab1d2019-05-27 11:41:36 +02002153 osd_req_op_extent_osd_data_bio(osd_req, which,
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002154 &obj_req->bio_pos,
Ilya Dryomov43df3d32018-02-02 15:23:22 +01002155 obj_req->ex.oe_len);
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002156 break;
2157 case OBJ_REQUEST_BVECS:
Ilya Dryomovafb97882018-02-06 19:26:35 +01002158 case OBJ_REQUEST_OWN_BVECS:
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002159 rbd_assert(obj_req->bvec_pos.iter.bi_size ==
Ilya Dryomov43df3d32018-02-02 15:23:22 +01002160 obj_req->ex.oe_len);
Ilya Dryomovafb97882018-02-06 19:26:35 +01002161 rbd_assert(obj_req->bvec_idx == obj_req->bvec_count);
Ilya Dryomovbcbab1d2019-05-27 11:41:36 +02002162 osd_req_op_extent_osd_data_bvec_pos(osd_req, which,
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002163 &obj_req->bvec_pos);
2164 break;
2165 default:
Arnd Bergmann16809372019-03-22 17:53:56 +01002166 BUG();
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002167 }
2168}
2169
Ilya Dryomovbcbab1d2019-05-27 11:41:36 +02002170static int rbd_osd_setup_stat(struct ceph_osd_request *osd_req, int which)
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002171{
2172 struct page **pages;
Ilya Dryomov710214e2016-09-15 17:53:32 +02002173
Alex Elderc5b5ef62013-02-11 12:33:24 -06002174 /*
2175 * The response data for a STAT call consists of:
2176 * le64 length;
2177 * struct {
2178 * le32 tv_sec;
2179 * le32 tv_nsec;
2180 * } mtime;
2181 */
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002182 pages = ceph_alloc_page_vector(1, GFP_NOIO);
2183 if (IS_ERR(pages))
2184 return PTR_ERR(pages);
Alex Elderc5b5ef62013-02-11 12:33:24 -06002185
Ilya Dryomovbcbab1d2019-05-27 11:41:36 +02002186 osd_req_op_init(osd_req, which, CEPH_OSD_OP_STAT, 0);
2187 osd_req_op_raw_data_in_pages(osd_req, which, pages,
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002188 8 + sizeof(struct ceph_timespec),
2189 0, false, true);
Ilya Dryomov980917f2016-09-12 18:59:42 +02002190 return 0;
Alex Elderc5b5ef62013-02-11 12:33:24 -06002191}
2192
Ilya Dryomovb5ae8cb2019-05-29 16:53:14 +02002193static int rbd_osd_setup_copyup(struct ceph_osd_request *osd_req, int which,
2194 u32 bytes)
Ilya Dryomov13488d52019-02-25 12:37:50 +01002195{
Ilya Dryomovb5ae8cb2019-05-29 16:53:14 +02002196 struct rbd_obj_request *obj_req = osd_req->r_priv;
2197 int ret;
2198
2199 ret = osd_req_op_cls_init(osd_req, which, "rbd", "copyup");
2200 if (ret)
2201 return ret;
2202
2203 osd_req_op_cls_request_data_bvecs(osd_req, which, obj_req->copyup_bvecs,
2204 obj_req->copyup_bvec_count, bytes);
2205 return 0;
Ilya Dryomov13488d52019-02-25 12:37:50 +01002206}
2207
Ilya Dryomovea9b7432019-05-31 15:11:26 +02002208static int rbd_obj_init_read(struct rbd_obj_request *obj_req)
Alex Elderb454e362013-04-19 15:34:50 -05002209{
Ilya Dryomovea9b7432019-05-31 15:11:26 +02002210 obj_req->read_state = RBD_OBJ_READ_START;
2211 return 0;
2212}
2213
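/*
 * Emit a setallochint op (unless the object map is in use and says
 * the object may already exist) followed by write or, for a
 * whole-object write, writefull.  The op count must match what
 * count_write_ops() returns for OBJ_OP_WRITE.
 */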
Ilya Dryomovbcbab1d2019-05-27 11:41:36 +02002214static void __rbd_osd_setup_write_ops(struct ceph_osd_request *osd_req,
2215 int which)
Alex Elderb454e362013-04-19 15:34:50 -05002216{
Ilya Dryomovbcbab1d2019-05-27 11:41:36 +02002217 struct rbd_obj_request *obj_req = osd_req->r_priv;
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002218 struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
2219 u16 opcode;
Alex Elderb454e362013-04-19 15:34:50 -05002220
Ilya Dryomov8b5bec52019-06-19 15:45:27 +02002221 if (!use_object_map(rbd_dev) ||
2222 !(obj_req->flags & RBD_OBJ_FLAG_MAY_EXIST)) {
2223 osd_req_op_alloc_hint_init(osd_req, which++,
2224 rbd_dev->layout.object_size,
Ilya Dryomovd3798ac2020-05-29 20:31:37 +02002225 rbd_dev->layout.object_size,
Ilya Dryomovdc1dad82020-05-29 20:51:23 +02002226 rbd_dev->opts->alloc_hint_flags);
Ilya Dryomov8b5bec52019-06-19 15:45:27 +02002227 }
Alex Elderb454e362013-04-19 15:34:50 -05002228
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002229 if (rbd_obj_is_entire(obj_req))
2230 opcode = CEPH_OSD_OP_WRITEFULL;
2231 else
2232 opcode = CEPH_OSD_OP_WRITE;
Ilya Dryomov70d045f2014-09-12 16:02:01 +04002233
Ilya Dryomovbcbab1d2019-05-27 11:41:36 +02002234 osd_req_op_extent_init(osd_req, which, opcode,
Ilya Dryomov43df3d32018-02-02 15:23:22 +01002235 obj_req->ex.oe_off, obj_req->ex.oe_len, 0, 0);
Ilya Dryomovbcbab1d2019-05-27 11:41:36 +02002236 rbd_osd_setup_data(osd_req, which);
Ilya Dryomov70d045f2014-09-12 16:02:01 +04002237}
2238
Ilya Dryomovea9b7432019-05-31 15:11:26 +02002239static int rbd_obj_init_write(struct rbd_obj_request *obj_req)
Ilya Dryomov70d045f2014-09-12 16:02:01 +04002240{
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002241 int ret;
Ilya Dryomov058aa992016-09-12 14:44:45 +02002242
Ilya Dryomov86bd7992018-02-06 19:26:33 +01002243 /* reverse map the entire object onto the parent */
2244 ret = rbd_obj_calc_img_extents(obj_req, true);
2245 if (ret)
2246 return ret;
2247
Ilya Dryomov0ad5d952019-05-14 20:45:38 +02002248 if (rbd_obj_copyup_enabled(obj_req))
2249 obj_req->flags |= RBD_OBJ_FLAG_COPYUP_ENABLED;
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002250
Ilya Dryomov85b5e6d2019-05-14 21:06:07 +02002251 obj_req->write_state = RBD_OBJ_WRITE_START;
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002252 return 0;
2253}
2254
Ilya Dryomov6484cbe2019-01-29 12:46:25 +01002255static u16 truncate_or_zero_opcode(struct rbd_obj_request *obj_req)
2256{
2257 return rbd_obj_is_tail(obj_req) ? CEPH_OSD_OP_TRUNCATE :
2258 CEPH_OSD_OP_ZERO;
2259}
2260
Ilya Dryomov27bbd912019-05-29 17:31:37 +02002261static void __rbd_osd_setup_discard_ops(struct ceph_osd_request *osd_req,
2262 int which)
2263{
2264 struct rbd_obj_request *obj_req = osd_req->r_priv;
2265
2266 if (rbd_obj_is_entire(obj_req) && !obj_req->num_img_extents) {
2267 rbd_assert(obj_req->flags & RBD_OBJ_FLAG_DELETION);
2268 osd_req_op_init(osd_req, which, CEPH_OSD_OP_DELETE, 0);
2269 } else {
2270 osd_req_op_extent_init(osd_req, which,
2271 truncate_or_zero_opcode(obj_req),
2272 obj_req->ex.oe_off, obj_req->ex.oe_len,
2273 0, 0);
2274 }
2275}
2276
Ilya Dryomovea9b7432019-05-31 15:11:26 +02002277static int rbd_obj_init_discard(struct rbd_obj_request *obj_req)
Ilya Dryomov6484cbe2019-01-29 12:46:25 +01002278{
Ilya Dryomov0c93e1b2019-01-30 15:14:48 +01002279 struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
Ilya Dryomov27bbd912019-05-29 17:31:37 +02002280 u64 off, next_off;
Ilya Dryomov6484cbe2019-01-29 12:46:25 +01002281 int ret;
2282
Ilya Dryomov0c93e1b2019-01-30 15:14:48 +01002283 /*
2284 * Align the range to alloc_size boundary and punt on discards
2285 * that are too small to free up any space.
2286 *
2287 * alloc_size == object_size && is_tail() is a special case for
2288 * filestore with filestore_punch_hole = false, needed to allow
2289 * truncate (in addition to delete).
2290 */
2291 if (rbd_dev->opts->alloc_size != rbd_dev->layout.object_size ||
2292 !rbd_obj_is_tail(obj_req)) {
Ilya Dryomov27bbd912019-05-29 17:31:37 +02002293 off = round_up(obj_req->ex.oe_off, rbd_dev->opts->alloc_size);
2294 next_off = round_down(obj_req->ex.oe_off + obj_req->ex.oe_len,
2295 rbd_dev->opts->alloc_size);
Ilya Dryomov0c93e1b2019-01-30 15:14:48 +01002296 if (off >= next_off)
2297 return 1;
Ilya Dryomov27bbd912019-05-29 17:31:37 +02002298
2299 dout("%s %p %llu~%llu -> %llu~%llu\n", __func__,
2300 obj_req, obj_req->ex.oe_off, obj_req->ex.oe_len,
2301 off, next_off - off);
2302 obj_req->ex.oe_off = off;
2303 obj_req->ex.oe_len = next_off - off;
Ilya Dryomov0c93e1b2019-01-30 15:14:48 +01002304 }
2305
Ilya Dryomov6484cbe2019-01-29 12:46:25 +01002306 /* reverse map the entire object onto the parent */
2307 ret = rbd_obj_calc_img_extents(obj_req, true);
2308 if (ret)
2309 return ret;
2310
Ilya Dryomov22e8bd52019-06-05 19:25:11 +02002311 obj_req->flags |= RBD_OBJ_FLAG_NOOP_FOR_NONEXISTENT;
Ilya Dryomov0ad5d952019-05-14 20:45:38 +02002312 if (rbd_obj_is_entire(obj_req) && !obj_req->num_img_extents)
2313 obj_req->flags |= RBD_OBJ_FLAG_DELETION;
Ilya Dryomov6484cbe2019-01-29 12:46:25 +01002314
Ilya Dryomov85b5e6d2019-05-14 21:06:07 +02002315 obj_req->write_state = RBD_OBJ_WRITE_START;
Ilya Dryomov6484cbe2019-01-29 12:46:25 +01002316 return 0;
2317}
2318
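/*
 * A whole-object zeroout backed by parent data becomes (create +)
 * truncate so the object keeps masking the parent; without parent
 * data it becomes delete.  A partial zeroout becomes truncate or
 * zero depending on whether it runs to the end of the object.
 */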
Ilya Dryomovbcbab1d2019-05-27 11:41:36 +02002319static void __rbd_osd_setup_zeroout_ops(struct ceph_osd_request *osd_req,
2320 int which)
Ilya Dryomov13488d52019-02-25 12:37:50 +01002321{
Ilya Dryomovbcbab1d2019-05-27 11:41:36 +02002322 struct rbd_obj_request *obj_req = osd_req->r_priv;
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002323 u16 opcode;
2324
2325 if (rbd_obj_is_entire(obj_req)) {
Ilya Dryomov86bd7992018-02-06 19:26:33 +01002326 if (obj_req->num_img_extents) {
Ilya Dryomov0ad5d952019-05-14 20:45:38 +02002327 if (!(obj_req->flags & RBD_OBJ_FLAG_COPYUP_ENABLED))
Ilya Dryomovbcbab1d2019-05-27 11:41:36 +02002328 osd_req_op_init(osd_req, which++,
Ilya Dryomov9b17eb22019-02-28 15:51:39 +01002329 CEPH_OSD_OP_CREATE, 0);
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002330 opcode = CEPH_OSD_OP_TRUNCATE;
2331 } else {
Ilya Dryomov0ad5d952019-05-14 20:45:38 +02002332 rbd_assert(obj_req->flags & RBD_OBJ_FLAG_DELETION);
Ilya Dryomovbcbab1d2019-05-27 11:41:36 +02002333 osd_req_op_init(osd_req, which++,
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002334 CEPH_OSD_OP_DELETE, 0);
2335 opcode = 0;
2336 }
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002337 } else {
Ilya Dryomov6484cbe2019-01-29 12:46:25 +01002338 opcode = truncate_or_zero_opcode(obj_req);
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002339 }
2340
2341 if (opcode)
Ilya Dryomovbcbab1d2019-05-27 11:41:36 +02002342 osd_req_op_extent_init(osd_req, which, opcode,
Ilya Dryomov43df3d32018-02-02 15:23:22 +01002343 obj_req->ex.oe_off, obj_req->ex.oe_len,
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002344 0, 0);
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002345}
2346
Ilya Dryomovea9b7432019-05-31 15:11:26 +02002347static int rbd_obj_init_zeroout(struct rbd_obj_request *obj_req)
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002348{
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002349 int ret;
2350
Ilya Dryomov86bd7992018-02-06 19:26:33 +01002351 /* reverse map the entire object onto the parent */
2352 ret = rbd_obj_calc_img_extents(obj_req, true);
2353 if (ret)
2354 return ret;
2355
Ilya Dryomov0ad5d952019-05-14 20:45:38 +02002356 if (rbd_obj_copyup_enabled(obj_req))
2357 obj_req->flags |= RBD_OBJ_FLAG_COPYUP_ENABLED;
2358 if (!obj_req->num_img_extents) {
Ilya Dryomov22e8bd52019-06-05 19:25:11 +02002359 obj_req->flags |= RBD_OBJ_FLAG_NOOP_FOR_NONEXISTENT;
Ilya Dryomov0ad5d952019-05-14 20:45:38 +02002360 if (rbd_obj_is_entire(obj_req))
2361 obj_req->flags |= RBD_OBJ_FLAG_DELETION;
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002362 }
2363
Ilya Dryomov85b5e6d2019-05-14 21:06:07 +02002364 obj_req->write_state = RBD_OBJ_WRITE_START;
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002365 return 0;
2366}
2367
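/*
 * Number of OSD ops a write-type (write/discard/zeroout) object
 * request will need, so the OSD request can be sized before
 * rbd_osd_setup_write_ops() fills it in.
 */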
Ilya Dryomova086a1b2019-06-12 18:33:31 +02002368static int count_write_ops(struct rbd_obj_request *obj_req)
2369{
Ilya Dryomov8b5bec52019-06-19 15:45:27 +02002370 struct rbd_img_request *img_req = obj_req->img_request;
2371
2372 switch (img_req->op_type) {
Ilya Dryomova086a1b2019-06-12 18:33:31 +02002373 case OBJ_OP_WRITE:
Ilya Dryomov8b5bec52019-06-19 15:45:27 +02002374 if (!use_object_map(img_req->rbd_dev) ||
2375 !(obj_req->flags & RBD_OBJ_FLAG_MAY_EXIST))
2376 return 2; /* setallochint + write/writefull */
2377
2378 return 1; /* write/writefull */
Ilya Dryomova086a1b2019-06-12 18:33:31 +02002379 case OBJ_OP_DISCARD:
2380 return 1; /* delete/truncate/zero */
2381 case OBJ_OP_ZEROOUT:
2382 if (rbd_obj_is_entire(obj_req) && obj_req->num_img_extents &&
2383 !(obj_req->flags & RBD_OBJ_FLAG_COPYUP_ENABLED))
2384 return 2; /* create + truncate */
2385
2386 return 1; /* delete/truncate/zero */
2387 default:
2388 BUG();
2389 }
2390}
2391
2392static void rbd_osd_setup_write_ops(struct ceph_osd_request *osd_req,
2393 int which)
2394{
2395 struct rbd_obj_request *obj_req = osd_req->r_priv;
2396
2397 switch (obj_req->img_request->op_type) {
2398 case OBJ_OP_WRITE:
2399 __rbd_osd_setup_write_ops(osd_req, which);
2400 break;
2401 case OBJ_OP_DISCARD:
2402 __rbd_osd_setup_discard_ops(osd_req, which);
2403 break;
2404 case OBJ_OP_ZEROOUT:
2405 __rbd_osd_setup_zeroout_ops(osd_req, which);
2406 break;
2407 default:
2408 BUG();
2409 }
2410}
2411
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002412/*
Ilya Dryomova086a1b2019-06-12 18:33:31 +02002413 * Prune the list of object requests (adjust offset and/or length, drop
2414 * redundant requests). Prepare object request state machines and image
2415 * request state machine for execution.
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002416 */
2417static int __rbd_img_fill_request(struct rbd_img_request *img_req)
2418{
Ilya Dryomov0c93e1b2019-01-30 15:14:48 +01002419 struct rbd_obj_request *obj_req, *next_obj_req;
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002420 int ret;
2421
Ilya Dryomov0c93e1b2019-01-30 15:14:48 +01002422 for_each_obj_request_safe(img_req, obj_req, next_obj_req) {
Ilya Dryomov9bb02482018-01-30 17:52:10 +01002423 switch (img_req->op_type) {
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002424 case OBJ_OP_READ:
Ilya Dryomovea9b7432019-05-31 15:11:26 +02002425 ret = rbd_obj_init_read(obj_req);
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002426 break;
2427 case OBJ_OP_WRITE:
Ilya Dryomovea9b7432019-05-31 15:11:26 +02002428 ret = rbd_obj_init_write(obj_req);
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002429 break;
2430 case OBJ_OP_DISCARD:
Ilya Dryomovea9b7432019-05-31 15:11:26 +02002431 ret = rbd_obj_init_discard(obj_req);
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002432 break;
Ilya Dryomov6484cbe2019-01-29 12:46:25 +01002433 case OBJ_OP_ZEROOUT:
Ilya Dryomovea9b7432019-05-31 15:11:26 +02002434 ret = rbd_obj_init_zeroout(obj_req);
Ilya Dryomov6484cbe2019-01-29 12:46:25 +01002435 break;
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002436 default:
Arnd Bergmann16809372019-03-22 17:53:56 +01002437 BUG();
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002438 }
Ilya Dryomov0c93e1b2019-01-30 15:14:48 +01002439 if (ret < 0)
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002440 return ret;
Ilya Dryomov0c93e1b2019-01-30 15:14:48 +01002441 if (ret > 0) {
Ilya Dryomov0c93e1b2019-01-30 15:14:48 +01002442 rbd_img_obj_request_del(img_req, obj_req);
2443 continue;
2444 }
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002445 }
2446
Ilya Dryomov0192ce22019-05-16 15:06:56 +02002447 img_req->state = RBD_IMG_START;
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002448 return 0;
2449}
2450
Ilya Dryomov5a237812018-02-06 19:26:34 +01002451union rbd_img_fill_iter {
2452 struct ceph_bio_iter bio_iter;
2453 struct ceph_bvec_iter bvec_iter;
2454};
2455
2456struct rbd_img_fill_ctx {
2457 enum obj_request_type pos_type;
2458 union rbd_img_fill_iter *pos;
2459 union rbd_img_fill_iter iter;
2460 ceph_object_extent_fn_t set_pos_fn;
Ilya Dryomovafb97882018-02-06 19:26:35 +01002461 ceph_object_extent_fn_t count_fn;
2462 ceph_object_extent_fn_t copy_fn;
Ilya Dryomov5a237812018-02-06 19:26:34 +01002463};
2464
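/*
 * Allocation callback for ceph_file_to_extents(): each new object
 * extent gets its own object request, linked into @img_req.
 */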
2465static struct ceph_object_extent *alloc_object_extent(void *arg)
2466{
2467 struct rbd_img_request *img_req = arg;
2468 struct rbd_obj_request *obj_req;
2469
2470 obj_req = rbd_obj_request_create();
2471 if (!obj_req)
2472 return NULL;
2473
2474 rbd_img_obj_request_add(img_req, obj_req);
2475 return &obj_req->ex;
2476}
2477
2478/*
Ilya Dryomovafb97882018-02-06 19:26:35 +01002479 * While su != os && sc == 1 is technically not fancy (it's the same
2480 * layout as su == os && sc == 1), we can't use the nocopy path for it
2481 * because ->set_pos_fn() should be called only once per object.
2482 * ceph_file_to_extents() invokes action_fn once per stripe unit, so
2483 * treat su != os && sc == 1 as fancy.
Ilya Dryomov5a237812018-02-06 19:26:34 +01002484 */
Ilya Dryomovafb97882018-02-06 19:26:35 +01002485static bool rbd_layout_is_fancy(struct ceph_file_layout *l)
2486{
2487 return l->stripe_unit != l->object_size;
2488}
2489
2490static int rbd_img_fill_request_nocopy(struct rbd_img_request *img_req,
2491 struct ceph_file_extent *img_extents,
2492 u32 num_img_extents,
2493 struct rbd_img_fill_ctx *fctx)
Ilya Dryomov5a237812018-02-06 19:26:34 +01002494{
2495 u32 i;
2496 int ret;
2497
2498 img_req->data_type = fctx->pos_type;
2499
2500 /*
2501 * Create object requests and set each object request's starting
2502 * position in the provided bio (list) or bio_vec array.
2503 */
2504 fctx->iter = *fctx->pos;
2505 for (i = 0; i < num_img_extents; i++) {
2506 ret = ceph_file_to_extents(&img_req->rbd_dev->layout,
2507 img_extents[i].fe_off,
2508 img_extents[i].fe_len,
2509 &img_req->object_extents,
2510 alloc_object_extent, img_req,
2511 fctx->set_pos_fn, &fctx->iter);
2512 if (ret)
2513 return ret;
2514 }
2515
2516 return __rbd_img_fill_request(img_req);
2517}
2518
Ilya Dryomovafb97882018-02-06 19:26:35 +01002519/*
2520 * Map a list of image extents to a list of object extents, create the
2521 * corresponding object requests (normally each to a different object,
2522 * but not always) and add them to @img_req. For each object request,
2523 * set up its data descriptor to point to the corresponding chunk(s) of
2524 * @fctx->pos data buffer.
2525 *
2526 * Because ceph_file_to_extents() will merge adjacent object extents
2527 * together, each object request's data descriptor may point to multiple
2528 * different chunks of @fctx->pos data buffer.
2529 *
2530 * @fctx->pos data buffer is assumed to be large enough.
2531 */
2532static int rbd_img_fill_request(struct rbd_img_request *img_req,
2533 struct ceph_file_extent *img_extents,
2534 u32 num_img_extents,
2535 struct rbd_img_fill_ctx *fctx)
2536{
2537 struct rbd_device *rbd_dev = img_req->rbd_dev;
2538 struct rbd_obj_request *obj_req;
2539 u32 i;
2540 int ret;
2541
2542 if (fctx->pos_type == OBJ_REQUEST_NODATA ||
2543 !rbd_layout_is_fancy(&rbd_dev->layout))
2544 return rbd_img_fill_request_nocopy(img_req, img_extents,
2545 num_img_extents, fctx);
2546
2547 img_req->data_type = OBJ_REQUEST_OWN_BVECS;
2548
2549 /*
2550 * Create object requests and determine ->bvec_count for each object
2551 * request. Note that ->bvec_count sum over all object requests may
2552 * be greater than the number of bio_vecs in the provided bio (list)
2553 * or bio_vec array because when mapped, those bio_vecs can straddle
2554 * stripe unit boundaries.
2555 */
2556 fctx->iter = *fctx->pos;
2557 for (i = 0; i < num_img_extents; i++) {
2558 ret = ceph_file_to_extents(&rbd_dev->layout,
2559 img_extents[i].fe_off,
2560 img_extents[i].fe_len,
2561 &img_req->object_extents,
2562 alloc_object_extent, img_req,
2563 fctx->count_fn, &fctx->iter);
2564 if (ret)
2565 return ret;
2566 }
2567
2568 for_each_obj_request(img_req, obj_req) {
2569 obj_req->bvec_pos.bvecs = kmalloc_array(obj_req->bvec_count,
2570 sizeof(*obj_req->bvec_pos.bvecs),
2571 GFP_NOIO);
2572 if (!obj_req->bvec_pos.bvecs)
2573 return -ENOMEM;
Alex Elderb454e362013-04-19 15:34:50 -05002574 }
2575
2576 /*
Ilya Dryomovafb97882018-02-06 19:26:35 +01002577 * Fill in each object request's private bio_vec array, splitting and
2578 * rearranging the provided bio_vecs in stripe unit chunks as needed.
Alex Elderb454e362013-04-19 15:34:50 -05002579 */
Ilya Dryomovafb97882018-02-06 19:26:35 +01002580 fctx->iter = *fctx->pos;
2581 for (i = 0; i < num_img_extents; i++) {
2582 ret = ceph_iterate_extents(&rbd_dev->layout,
2583 img_extents[i].fe_off,
2584 img_extents[i].fe_len,
2585 &img_req->object_extents,
2586 fctx->copy_fn, &fctx->iter);
2587 if (ret)
2588 return ret;
2589 }
Alex Elder3d7efd12013-04-19 15:34:50 -05002590
Ilya Dryomovafb97882018-02-06 19:26:35 +01002591 return __rbd_img_fill_request(img_req);
Alex Elderb454e362013-04-19 15:34:50 -05002592}
2593
Ilya Dryomov5a237812018-02-06 19:26:34 +01002594static int rbd_img_fill_nodata(struct rbd_img_request *img_req,
2595 u64 off, u64 len)
2596{
2597 struct ceph_file_extent ex = { off, len };
Arnd Bergmanna55e6012020-01-07 22:01:04 +01002598 union rbd_img_fill_iter dummy = {};
Ilya Dryomov5a237812018-02-06 19:26:34 +01002599 struct rbd_img_fill_ctx fctx = {
2600 .pos_type = OBJ_REQUEST_NODATA,
2601 .pos = &dummy,
2602 };
2603
2604 return rbd_img_fill_request(img_req, &ex, 1, &fctx);
2605}
2606
2607static void set_bio_pos(struct ceph_object_extent *ex, u32 bytes, void *arg)
2608{
2609 struct rbd_obj_request *obj_req =
2610 container_of(ex, struct rbd_obj_request, ex);
2611 struct ceph_bio_iter *it = arg;
2612
2613 dout("%s objno %llu bytes %u\n", __func__, ex->oe_objno, bytes);
2614 obj_req->bio_pos = *it;
2615 ceph_bio_iter_advance(it, bytes);
2616}
2617
Ilya Dryomovafb97882018-02-06 19:26:35 +01002618static void count_bio_bvecs(struct ceph_object_extent *ex, u32 bytes, void *arg)
2619{
2620 struct rbd_obj_request *obj_req =
2621 container_of(ex, struct rbd_obj_request, ex);
2622 struct ceph_bio_iter *it = arg;
2623
2624 dout("%s objno %llu bytes %u\n", __func__, ex->oe_objno, bytes);
2625 ceph_bio_iter_advance_step(it, bytes, ({
2626 obj_req->bvec_count++;
2627 }));
2628
2629}
2630
2631static void copy_bio_bvecs(struct ceph_object_extent *ex, u32 bytes, void *arg)
2632{
2633 struct rbd_obj_request *obj_req =
2634 container_of(ex, struct rbd_obj_request, ex);
2635 struct ceph_bio_iter *it = arg;
2636
2637 dout("%s objno %llu bytes %u\n", __func__, ex->oe_objno, bytes);
2638 ceph_bio_iter_advance_step(it, bytes, ({
2639 obj_req->bvec_pos.bvecs[obj_req->bvec_idx++] = bv;
2640 obj_req->bvec_pos.iter.bi_size += bv.bv_len;
2641 }));
2642}
2643
Ilya Dryomov5a237812018-02-06 19:26:34 +01002644static int __rbd_img_fill_from_bio(struct rbd_img_request *img_req,
2645 struct ceph_file_extent *img_extents,
2646 u32 num_img_extents,
2647 struct ceph_bio_iter *bio_pos)
2648{
2649 struct rbd_img_fill_ctx fctx = {
2650 .pos_type = OBJ_REQUEST_BIO,
2651 .pos = (union rbd_img_fill_iter *)bio_pos,
2652 .set_pos_fn = set_bio_pos,
Ilya Dryomovafb97882018-02-06 19:26:35 +01002653 .count_fn = count_bio_bvecs,
2654 .copy_fn = copy_bio_bvecs,
Ilya Dryomov5a237812018-02-06 19:26:34 +01002655 };
2656
2657 return rbd_img_fill_request(img_req, img_extents, num_img_extents,
2658 &fctx);
2659}
2660
2661static int rbd_img_fill_from_bio(struct rbd_img_request *img_req,
2662 u64 off, u64 len, struct bio *bio)
2663{
2664 struct ceph_file_extent ex = { off, len };
2665 struct ceph_bio_iter it = { .bio = bio, .iter = bio->bi_iter };
2666
2667 return __rbd_img_fill_from_bio(img_req, &ex, 1, &it);
2668}
2669
2670static void set_bvec_pos(struct ceph_object_extent *ex, u32 bytes, void *arg)
2671{
2672 struct rbd_obj_request *obj_req =
2673 container_of(ex, struct rbd_obj_request, ex);
2674 struct ceph_bvec_iter *it = arg;
2675
2676 obj_req->bvec_pos = *it;
2677 ceph_bvec_iter_shorten(&obj_req->bvec_pos, bytes);
2678 ceph_bvec_iter_advance(it, bytes);
2679}
2680
Ilya Dryomovafb97882018-02-06 19:26:35 +01002681static void count_bvecs(struct ceph_object_extent *ex, u32 bytes, void *arg)
2682{
2683 struct rbd_obj_request *obj_req =
2684 container_of(ex, struct rbd_obj_request, ex);
2685 struct ceph_bvec_iter *it = arg;
2686
2687 ceph_bvec_iter_advance_step(it, bytes, ({
2688 obj_req->bvec_count++;
2689 }));
2690}
2691
2692static void copy_bvecs(struct ceph_object_extent *ex, u32 bytes, void *arg)
2693{
2694 struct rbd_obj_request *obj_req =
2695 container_of(ex, struct rbd_obj_request, ex);
2696 struct ceph_bvec_iter *it = arg;
2697
2698 ceph_bvec_iter_advance_step(it, bytes, ({
2699 obj_req->bvec_pos.bvecs[obj_req->bvec_idx++] = bv;
2700 obj_req->bvec_pos.iter.bi_size += bv.bv_len;
2701 }));
2702}
2703
Ilya Dryomov5a237812018-02-06 19:26:34 +01002704static int __rbd_img_fill_from_bvecs(struct rbd_img_request *img_req,
2705 struct ceph_file_extent *img_extents,
2706 u32 num_img_extents,
2707 struct ceph_bvec_iter *bvec_pos)
2708{
2709 struct rbd_img_fill_ctx fctx = {
2710 .pos_type = OBJ_REQUEST_BVECS,
2711 .pos = (union rbd_img_fill_iter *)bvec_pos,
2712 .set_pos_fn = set_bvec_pos,
Ilya Dryomovafb97882018-02-06 19:26:35 +01002713 .count_fn = count_bvecs,
2714 .copy_fn = copy_bvecs,
Ilya Dryomov5a237812018-02-06 19:26:34 +01002715 };
2716
2717 return rbd_img_fill_request(img_req, img_extents, num_img_extents,
2718 &fctx);
2719}
2720
2721static int rbd_img_fill_from_bvecs(struct rbd_img_request *img_req,
2722 struct ceph_file_extent *img_extents,
2723 u32 num_img_extents,
2724 struct bio_vec *bvecs)
2725{
2726 struct ceph_bvec_iter it = {
2727 .bvecs = bvecs,
2728 .iter = { .bi_size = ceph_file_extents_bytes(img_extents,
2729 num_img_extents) },
2730 };
2731
2732 return __rbd_img_fill_from_bvecs(img_req, img_extents, num_img_extents,
2733 &it);
2734}
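/*
 * rbd_img_fill_nodata(), rbd_img_fill_from_bio() and rbd_img_fill_from_bvecs()
 * are thin wrappers that describe the payload (none, bio chain or bio_vec
 * array) and feed it to rbd_img_fill_request() through the appropriate
 * set_pos/count/copy callbacks.
 */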
2735
Ilya Dryomov0192ce22019-05-16 15:06:56 +02002736static void rbd_img_handle_request_work(struct work_struct *work)
Alex Elderbf0d5f502012-11-22 00:00:08 -06002737{
Ilya Dryomov0192ce22019-05-16 15:06:56 +02002738 struct rbd_img_request *img_req =
2739 container_of(work, struct rbd_img_request, work);
Alex Elderbf0d5f502012-11-22 00:00:08 -06002740
Ilya Dryomov0192ce22019-05-16 15:06:56 +02002741 rbd_img_handle_request(img_req, img_req->work_result);
2742}
Alex Elderbf0d5f502012-11-22 00:00:08 -06002743
Ilya Dryomov0192ce22019-05-16 15:06:56 +02002744static void rbd_img_schedule(struct rbd_img_request *img_req, int result)
2745{
2746 INIT_WORK(&img_req->work, rbd_img_handle_request_work);
2747 img_req->work_result = result;
2748 queue_work(rbd_wq, &img_req->work);
Alex Elderbf0d5f502012-11-22 00:00:08 -06002749}
2750
Ilya Dryomov22e8bd52019-06-05 19:25:11 +02002751static bool rbd_obj_may_exist(struct rbd_obj_request *obj_req)
2752{
2753 struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
2754
2755 if (rbd_object_map_may_exist(rbd_dev, obj_req->ex.oe_objno)) {
2756 obj_req->flags |= RBD_OBJ_FLAG_MAY_EXIST;
2757 return true;
2758 }
2759
2760 dout("%s %p objno %llu assuming dne\n", __func__, obj_req,
2761 obj_req->ex.oe_objno);
2762 return false;
2763}
2764
Ilya Dryomov85b5e6d2019-05-14 21:06:07 +02002765static int rbd_obj_read_object(struct rbd_obj_request *obj_req)
2766{
Ilya Dryomova086a1b2019-06-12 18:33:31 +02002767 struct ceph_osd_request *osd_req;
2768 int ret;
2769
2770 osd_req = __rbd_obj_add_osd_request(obj_req, NULL, 1);
2771 if (IS_ERR(osd_req))
2772 return PTR_ERR(osd_req);
2773
2774 osd_req_op_extent_init(osd_req, 0, CEPH_OSD_OP_READ,
2775 obj_req->ex.oe_off, obj_req->ex.oe_len, 0, 0);
2776 rbd_osd_setup_data(osd_req, 0);
2777 rbd_osd_format_read(osd_req);
2778
2779 ret = ceph_osdc_alloc_messages(osd_req, GFP_NOIO);
2780 if (ret)
2781 return ret;
2782
2783 rbd_osd_submit(osd_req);
Ilya Dryomov85b5e6d2019-05-14 21:06:07 +02002784 return 0;
Alex Elderbf0d5f502012-11-22 00:00:08 -06002785}
2786
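/*
 * Read the parent-overlapping portion of @obj_req from the parent image.
 * A child image request is allocated against the parent device and wired
 * back to @obj_req so that its completion is propagated through
 * rbd_img_handle_request() without recursing down the parent chain.
 */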
Ilya Dryomov86bd7992018-02-06 19:26:33 +01002787static int rbd_obj_read_from_parent(struct rbd_obj_request *obj_req)
Alex Elder8b3e1a52013-01-24 16:13:36 -06002788{
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002789 struct rbd_img_request *img_req = obj_req->img_request;
Ilya Dryomova52cc682020-02-12 15:08:39 +01002790 struct rbd_device *parent = img_req->rbd_dev->parent;
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002791 struct rbd_img_request *child_img_req;
2792 int ret;
Alex Elder8b3e1a52013-01-24 16:13:36 -06002793
Ilya Dryomov59e542c2020-02-12 15:23:58 +01002794 child_img_req = kmem_cache_alloc(rbd_img_request_cache, GFP_NOIO);
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002795 if (!child_img_req)
2796 return -ENOMEM;
Alex Elder8b3e1a52013-01-24 16:13:36 -06002797
Ilya Dryomov59e542c2020-02-12 15:23:58 +01002798 rbd_img_request_init(child_img_req, parent, OBJ_OP_READ);
Ilya Dryomove93aca02018-02-06 19:26:35 +01002799 __set_bit(IMG_REQ_CHILD, &child_img_req->flags);
2800 child_img_req->obj_request = obj_req;
Alex Elder02c74fb2013-05-06 17:40:33 -05002801
Ilya Dryomova52cc682020-02-12 15:08:39 +01002802 down_read(&parent->header_rwsem);
2803 rbd_img_capture_header(child_img_req);
2804 up_read(&parent->header_rwsem);
2805
Ilya Dryomov21ed05a2019-08-30 17:31:06 +02002806 dout("%s child_img_req %p for obj_req %p\n", __func__, child_img_req,
2807 obj_req);
2808
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002809 if (!rbd_img_is_write(img_req)) {
Ilya Dryomovecc633c2018-02-01 11:50:47 +01002810 switch (img_req->data_type) {
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002811 case OBJ_REQUEST_BIO:
Ilya Dryomov5a237812018-02-06 19:26:34 +01002812 ret = __rbd_img_fill_from_bio(child_img_req,
2813 obj_req->img_extents,
2814 obj_req->num_img_extents,
2815 &obj_req->bio_pos);
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002816 break;
2817 case OBJ_REQUEST_BVECS:
Ilya Dryomovafb97882018-02-06 19:26:35 +01002818 case OBJ_REQUEST_OWN_BVECS:
Ilya Dryomov5a237812018-02-06 19:26:34 +01002819 ret = __rbd_img_fill_from_bvecs(child_img_req,
2820 obj_req->img_extents,
2821 obj_req->num_img_extents,
2822 &obj_req->bvec_pos);
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002823 break;
2824 default:
Arnd Bergmannd342a152019-03-22 15:36:37 +01002825 BUG();
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002826 }
2827 } else {
Ilya Dryomov5a237812018-02-06 19:26:34 +01002828 ret = rbd_img_fill_from_bvecs(child_img_req,
2829 obj_req->img_extents,
2830 obj_req->num_img_extents,
2831 obj_req->copyup_bvecs);
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002832 }
2833 if (ret) {
Hannes Reinecke679a97d2020-01-31 11:37:36 +01002834 rbd_img_request_destroy(child_img_req);
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002835 return ret;
2836 }
2837
Ilya Dryomov0192ce22019-05-16 15:06:56 +02002838 /* avoid parent chain recursion */
2839 rbd_img_schedule(child_img_req, 0);
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002840 return 0;
2841}
2842
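/*
 * Read state machine: RBD_OBJ_READ_START issues the object read (or
 * short-circuits to -ENOENT if the object map says the object can't
 * exist), RBD_OBJ_READ_OBJECT falls back to the parent image on -ENOENT
 * within the parent overlap, and RBD_OBJ_READ_PARENT zero-fills whatever
 * the parent doesn't cover.  Returns true when @obj_req is done.
 */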
Ilya Dryomov85b5e6d2019-05-14 21:06:07 +02002843static bool rbd_obj_advance_read(struct rbd_obj_request *obj_req, int *result)
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002844{
2845 struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
2846 int ret;
2847
Ilya Dryomov22e8bd52019-06-05 19:25:11 +02002848again:
Ilya Dryomova9b67e62019-05-08 13:35:57 +02002849 switch (obj_req->read_state) {
Ilya Dryomov85b5e6d2019-05-14 21:06:07 +02002850 case RBD_OBJ_READ_START:
2851 rbd_assert(!*result);
2852
Ilya Dryomov22e8bd52019-06-05 19:25:11 +02002853 if (!rbd_obj_may_exist(obj_req)) {
2854 *result = -ENOENT;
2855 obj_req->read_state = RBD_OBJ_READ_OBJECT;
2856 goto again;
2857 }
2858
Ilya Dryomov85b5e6d2019-05-14 21:06:07 +02002859 ret = rbd_obj_read_object(obj_req);
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002860 if (ret) {
Ilya Dryomov85b5e6d2019-05-14 21:06:07 +02002861 *result = ret;
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002862 return true;
2863 }
Ilya Dryomov85b5e6d2019-05-14 21:06:07 +02002864 obj_req->read_state = RBD_OBJ_READ_OBJECT;
2865 return false;
Ilya Dryomova9b67e62019-05-08 13:35:57 +02002866 case RBD_OBJ_READ_OBJECT:
2867 if (*result == -ENOENT && rbd_dev->parent_overlap) {
2868 /* reverse map this object extent onto the parent */
2869 ret = rbd_obj_calc_img_extents(obj_req, false);
Ilya Dryomov86bd7992018-02-06 19:26:33 +01002870 if (ret) {
Ilya Dryomov54ab3b22019-05-11 16:21:49 +02002871 *result = ret;
Ilya Dryomov86bd7992018-02-06 19:26:33 +01002872 return true;
2873 }
Ilya Dryomova9b67e62019-05-08 13:35:57 +02002874 if (obj_req->num_img_extents) {
2875 ret = rbd_obj_read_from_parent(obj_req);
2876 if (ret) {
2877 *result = ret;
2878 return true;
2879 }
2880 obj_req->read_state = RBD_OBJ_READ_PARENT;
2881 return false;
2882 }
Ilya Dryomov86bd7992018-02-06 19:26:33 +01002883 }
Alex Elder02c74fb2013-05-06 17:40:33 -05002884
Ilya Dryomova9b67e62019-05-08 13:35:57 +02002885 /*
2886 * -ENOENT means a hole in the image -- zero-fill the entire
2887 * length of the request. A short read also implies zero-fill
2888 * to the end of the request.
2889 */
2890 if (*result == -ENOENT) {
2891 rbd_obj_zero_range(obj_req, 0, obj_req->ex.oe_len);
2892 *result = 0;
2893 } else if (*result >= 0) {
2894 if (*result < obj_req->ex.oe_len)
2895 rbd_obj_zero_range(obj_req, *result,
2896 obj_req->ex.oe_len - *result);
2897 else
2898 rbd_assert(*result == obj_req->ex.oe_len);
2899 *result = 0;
2900 }
2901 return true;
2902 case RBD_OBJ_READ_PARENT:
Ilya Dryomovd435c9a2019-08-27 16:45:10 +02002903 /*
2904 * The parent image is read only up to the overlap -- zero-fill
2905 * from the overlap to the end of the request.
2906 */
2907 if (!*result) {
2908 u32 obj_overlap = rbd_obj_img_extents_bytes(obj_req);
2909
2910 if (obj_overlap < obj_req->ex.oe_len)
2911 rbd_obj_zero_range(obj_req, obj_overlap,
2912 obj_req->ex.oe_len - obj_overlap);
2913 }
Ilya Dryomova9b67e62019-05-08 13:35:57 +02002914 return true;
2915 default:
2916 BUG();
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002917 }
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002918}
2919
Ilya Dryomov22e8bd52019-06-05 19:25:11 +02002920static bool rbd_obj_write_is_noop(struct rbd_obj_request *obj_req)
2921{
2922 struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
2923
2924 if (rbd_object_map_may_exist(rbd_dev, obj_req->ex.oe_objno))
2925 obj_req->flags |= RBD_OBJ_FLAG_MAY_EXIST;
2926
2927 if (!(obj_req->flags & RBD_OBJ_FLAG_MAY_EXIST) &&
2928 (obj_req->flags & RBD_OBJ_FLAG_NOOP_FOR_NONEXISTENT)) {
2929 dout("%s %p noop for nonexistent\n", __func__, obj_req);
2930 return true;
Alex Elder02c74fb2013-05-06 17:40:33 -05002931 }
2932
Ilya Dryomov22e8bd52019-06-05 19:25:11 +02002933 return false;
2934}
2935
2936/*
2937 * Return:
2938 * 0 - object map update sent
2939 * 1 - object map update isn't needed
2940 * <0 - error
2941 */
2942static int rbd_obj_write_pre_object_map(struct rbd_obj_request *obj_req)
2943{
2944 struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
2945 u8 new_state;
2946
2947 if (!(rbd_dev->header.features & RBD_FEATURE_OBJECT_MAP))
2948 return 1;
2949
2950 if (obj_req->flags & RBD_OBJ_FLAG_DELETION)
2951 new_state = OBJECT_PENDING;
2952 else
2953 new_state = OBJECT_EXISTS;
2954
2955 return rbd_object_map_update(obj_req, CEPH_NOSNAP, new_state, NULL);
2956}
2957
Ilya Dryomov85b5e6d2019-05-14 21:06:07 +02002958static int rbd_obj_write_object(struct rbd_obj_request *obj_req)
2959{
Ilya Dryomova086a1b2019-06-12 18:33:31 +02002960 struct ceph_osd_request *osd_req;
2961 int num_ops = count_write_ops(obj_req);
2962 int which = 0;
2963 int ret;
2964
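 /*
 * When copyup may be needed, a stat op is prepended so that the OSD
 * returns -ENOENT for a nonexistent object instead of silently
 * creating it -- this is what diverts the request to the copyup path.
 */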
2965 if (obj_req->flags & RBD_OBJ_FLAG_COPYUP_ENABLED)
2966 num_ops++; /* stat */
2967
2968 osd_req = rbd_obj_add_osd_request(obj_req, num_ops);
2969 if (IS_ERR(osd_req))
2970 return PTR_ERR(osd_req);
2971
2972 if (obj_req->flags & RBD_OBJ_FLAG_COPYUP_ENABLED) {
2973 ret = rbd_osd_setup_stat(osd_req, which++);
2974 if (ret)
2975 return ret;
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002976 }
2977
Ilya Dryomova086a1b2019-06-12 18:33:31 +02002978 rbd_osd_setup_write_ops(osd_req, which);
2979 rbd_osd_format_write(osd_req);
2980
2981 ret = ceph_osdc_alloc_messages(osd_req, GFP_NOIO);
2982 if (ret)
2983 return ret;
2984
2985 rbd_osd_submit(osd_req);
Ilya Dryomov85b5e6d2019-05-14 21:06:07 +02002986 return 0;
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002987}
2988
2989/*
2990 * copyup_bvecs pages are never highmem pages
2991 */
2992static bool is_zero_bvecs(struct bio_vec *bvecs, u32 bytes)
2993{
2994 struct ceph_bvec_iter it = {
2995 .bvecs = bvecs,
2996 .iter = { .bi_size = bytes },
2997 };
2998
2999 ceph_bvec_iter_advance_step(&it, bytes, ({
3000 if (memchr_inv(page_address(bv.bv_page) + bv.bv_offset, 0,
3001 bv.bv_len))
3002 return false;
3003 }));
3004 return true;
3005}
3006
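/*
 * Passed as @bytes to rbd_obj_copyup_current_snapc() to request only the
 * modification ops, without a preceding copyup op.
 */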
Ilya Dryomov3a482502019-02-28 10:49:12 +01003007#define MODS_ONLY U32_MAX
3008
Ilya Dryomov793333a302019-06-13 17:44:08 +02003009static int rbd_obj_copyup_empty_snapc(struct rbd_obj_request *obj_req,
3010 u32 bytes)
Ilya Dryomov3da691b2018-01-29 14:04:08 +01003011{
Ilya Dryomovbcbab1d2019-05-27 11:41:36 +02003012 struct ceph_osd_request *osd_req;
Chengguang Xufe943d52018-04-12 12:04:55 +08003013 int ret;
Ilya Dryomov3da691b2018-01-29 14:04:08 +01003014
3015 dout("%s obj_req %p bytes %u\n", __func__, obj_req, bytes);
Ilya Dryomov89a59c12019-02-28 14:20:28 +01003016 rbd_assert(bytes > 0 && bytes != MODS_ONLY);
Ilya Dryomov3da691b2018-01-29 14:04:08 +01003017
Ilya Dryomovbcbab1d2019-05-27 11:41:36 +02003018 osd_req = __rbd_obj_add_osd_request(obj_req, &rbd_empty_snapc, 1);
3019 if (IS_ERR(osd_req))
3020 return PTR_ERR(osd_req);
Ilya Dryomov3da691b2018-01-29 14:04:08 +01003021
Ilya Dryomovb5ae8cb2019-05-29 16:53:14 +02003022 ret = rbd_osd_setup_copyup(osd_req, 0, bytes);
Chengguang Xufe943d52018-04-12 12:04:55 +08003023 if (ret)
3024 return ret;
3025
Ilya Dryomovbcbab1d2019-05-27 11:41:36 +02003026 rbd_osd_format_write(osd_req);
Ilya Dryomov3da691b2018-01-29 14:04:08 +01003027
Ilya Dryomovbcbab1d2019-05-27 11:41:36 +02003028 ret = ceph_osdc_alloc_messages(osd_req, GFP_NOIO);
Ilya Dryomov89a59c12019-02-28 14:20:28 +01003029 if (ret)
3030 return ret;
3031
Ilya Dryomova086a1b2019-06-12 18:33:31 +02003032 rbd_osd_submit(osd_req);
Ilya Dryomov89a59c12019-02-28 14:20:28 +01003033 return 0;
3034}
3035
Ilya Dryomov793333a302019-06-13 17:44:08 +02003036static int rbd_obj_copyup_current_snapc(struct rbd_obj_request *obj_req,
3037 u32 bytes)
Ilya Dryomov3da691b2018-01-29 14:04:08 +01003038{
Ilya Dryomovbcbab1d2019-05-27 11:41:36 +02003039 struct ceph_osd_request *osd_req;
Ilya Dryomova086a1b2019-06-12 18:33:31 +02003040 int num_ops = count_write_ops(obj_req);
3041 int which = 0;
Ilya Dryomov3da691b2018-01-29 14:04:08 +01003042 int ret;
3043
3044 dout("%s obj_req %p bytes %u\n", __func__, obj_req, bytes);
Ilya Dryomov3da691b2018-01-29 14:04:08 +01003045
Ilya Dryomova086a1b2019-06-12 18:33:31 +02003046 if (bytes != MODS_ONLY)
3047 num_ops++; /* copyup */
Ilya Dryomov13488d52019-02-25 12:37:50 +01003048
Ilya Dryomova086a1b2019-06-12 18:33:31 +02003049 osd_req = rbd_obj_add_osd_request(obj_req, num_ops);
Ilya Dryomovbcbab1d2019-05-27 11:41:36 +02003050 if (IS_ERR(osd_req))
3051 return PTR_ERR(osd_req);
Ilya Dryomov3da691b2018-01-29 14:04:08 +01003052
Ilya Dryomov3a482502019-02-28 10:49:12 +01003053 if (bytes != MODS_ONLY) {
Ilya Dryomovb5ae8cb2019-05-29 16:53:14 +02003054 ret = rbd_osd_setup_copyup(osd_req, which++, bytes);
Ilya Dryomov3a482502019-02-28 10:49:12 +01003055 if (ret)
3056 return ret;
Ilya Dryomov3da691b2018-01-29 14:04:08 +01003057 }
Ilya Dryomov3da691b2018-01-29 14:04:08 +01003058
Ilya Dryomova086a1b2019-06-12 18:33:31 +02003059 rbd_osd_setup_write_ops(osd_req, which);
3060 rbd_osd_format_write(osd_req);
Ilya Dryomov3da691b2018-01-29 14:04:08 +01003061
Ilya Dryomovbcbab1d2019-05-27 11:41:36 +02003062 ret = ceph_osdc_alloc_messages(osd_req, GFP_NOIO);
Ilya Dryomov26f887e2018-10-15 16:11:37 +02003063 if (ret)
3064 return ret;
3065
Ilya Dryomova086a1b2019-06-12 18:33:31 +02003066 rbd_osd_submit(osd_req);
Ilya Dryomov3da691b2018-01-29 14:04:08 +01003067 return 0;
3068}
3069
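/*
 * Allocate a page-backed bio_vec array big enough to hold @obj_overlap
 * bytes of parent data for a copyup.
 */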
Ilya Dryomov7e07efb2018-01-20 10:30:11 +01003070static int setup_copyup_bvecs(struct rbd_obj_request *obj_req, u64 obj_overlap)
3071{
3072 u32 i;
3073
3074 rbd_assert(!obj_req->copyup_bvecs);
3075 obj_req->copyup_bvec_count = calc_pages_for(0, obj_overlap);
3076 obj_req->copyup_bvecs = kcalloc(obj_req->copyup_bvec_count,
3077 sizeof(*obj_req->copyup_bvecs),
3078 GFP_NOIO);
3079 if (!obj_req->copyup_bvecs)
3080 return -ENOMEM;
3081
3082 for (i = 0; i < obj_req->copyup_bvec_count; i++) {
3083 unsigned int len = min(obj_overlap, (u64)PAGE_SIZE);
3084
3085 obj_req->copyup_bvecs[i].bv_page = alloc_page(GFP_NOIO);
3086 if (!obj_req->copyup_bvecs[i].bv_page)
3087 return -ENOMEM;
3088
3089 obj_req->copyup_bvecs[i].bv_offset = 0;
3090 obj_req->copyup_bvecs[i].bv_len = len;
3091 obj_overlap -= len;
3092 }
3093
3094 rbd_assert(!obj_overlap);
3095 return 0;
3096}
3097
Ilya Dryomov0ad5d952019-05-14 20:45:38 +02003098/*
3099 * The target object doesn't exist. Read the data for the entire
3100 * target object up to the overlap point (if any) from the parent,
3101 * so we can use it for a copyup.
3102 */
Ilya Dryomov793333a302019-06-13 17:44:08 +02003103static int rbd_obj_copyup_read_parent(struct rbd_obj_request *obj_req)
Ilya Dryomov3da691b2018-01-29 14:04:08 +01003104{
3105 struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
Ilya Dryomov3da691b2018-01-29 14:04:08 +01003106 int ret;
3107
Ilya Dryomov86bd7992018-02-06 19:26:33 +01003108 rbd_assert(obj_req->num_img_extents);
3109 prune_extents(obj_req->img_extents, &obj_req->num_img_extents,
3110 rbd_dev->parent_overlap);
3111 if (!obj_req->num_img_extents) {
Ilya Dryomov3da691b2018-01-29 14:04:08 +01003112 /*
3113 * The overlap has become 0 (most likely because the
Ilya Dryomov3a482502019-02-28 10:49:12 +01003114 * image has been flattened). Re-submit the original write
3115 * request -- pass MODS_ONLY since the copyup isn't needed
3116 * anymore.
Ilya Dryomov3da691b2018-01-29 14:04:08 +01003117 */
Ilya Dryomov793333a302019-06-13 17:44:08 +02003118 return rbd_obj_copyup_current_snapc(obj_req, MODS_ONLY);
Ilya Dryomov3da691b2018-01-29 14:04:08 +01003119 }
3120
Ilya Dryomov86bd7992018-02-06 19:26:33 +01003121 ret = setup_copyup_bvecs(obj_req, rbd_obj_img_extents_bytes(obj_req));
Ilya Dryomov3da691b2018-01-29 14:04:08 +01003122 if (ret)
3123 return ret;
3124
Ilya Dryomov86bd7992018-02-06 19:26:33 +01003125 return rbd_obj_read_from_parent(obj_req);
Ilya Dryomov3da691b2018-01-29 14:04:08 +01003126}
3127
Ilya Dryomov22e8bd52019-06-05 19:25:11 +02003128static void rbd_obj_copyup_object_maps(struct rbd_obj_request *obj_req)
Ilya Dryomov3da691b2018-01-29 14:04:08 +01003129{
Ilya Dryomov22e8bd52019-06-05 19:25:11 +02003130 struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
3131 struct ceph_snap_context *snapc = obj_req->img_request->snapc;
3132 u8 new_state;
3133 u32 i;
Ilya Dryomov3da691b2018-01-29 14:04:08 +01003134 int ret;
3135
Ilya Dryomov22e8bd52019-06-05 19:25:11 +02003136 rbd_assert(!obj_req->pending.result && !obj_req->pending.num_pending);
3137
3138 if (!(rbd_dev->header.features & RBD_FEATURE_OBJECT_MAP))
3139 return;
3140
3141 if (obj_req->flags & RBD_OBJ_FLAG_COPYUP_ZEROS)
3142 return;
3143
3144 for (i = 0; i < snapc->num_snaps; i++) {
3145 if ((rbd_dev->header.features & RBD_FEATURE_FAST_DIFF) &&
3146 i + 1 < snapc->num_snaps)
3147 new_state = OBJECT_EXISTS_CLEAN;
3148 else
3149 new_state = OBJECT_EXISTS;
3150
3151 ret = rbd_object_map_update(obj_req, snapc->snaps[i],
3152 new_state, NULL);
3153 if (ret < 0) {
3154 obj_req->pending.result = ret;
3155 return;
3156 }
3157
3158 rbd_assert(!ret);
3159 obj_req->pending.num_pending++;
3160 }
3161}
3162
Ilya Dryomov793333a302019-06-13 17:44:08 +02003163static void rbd_obj_copyup_write_object(struct rbd_obj_request *obj_req)
3164{
3165 u32 bytes = rbd_obj_img_extents_bytes(obj_req);
3166 int ret;
3167
3168 rbd_assert(!obj_req->pending.result && !obj_req->pending.num_pending);
3169
3170 /*
3171 * Only send non-zero copyup data to save some I/O and network
3172 * bandwidth -- zero copyup data is equivalent to the object not
3173 * existing.
3174 */
3175 if (obj_req->flags & RBD_OBJ_FLAG_COPYUP_ZEROS)
3176 bytes = 0;
3177
3178 if (obj_req->img_request->snapc->num_snaps && bytes > 0) {
3179 /*
3180 * Send a copyup request with an empty snapshot context to
3181 * deep-copyup the object through all existing snapshots.
3182 * A second request with the current snapshot context will be
3183 * sent for the actual modification.
3184 */
3185 ret = rbd_obj_copyup_empty_snapc(obj_req, bytes);
3186 if (ret) {
3187 obj_req->pending.result = ret;
3188 return;
3189 }
3190
3191 obj_req->pending.num_pending++;
3192 bytes = MODS_ONLY;
3193 }
3194
3195 ret = rbd_obj_copyup_current_snapc(obj_req, bytes);
3196 if (ret) {
3197 obj_req->pending.result = ret;
3198 return;
3199 }
3200
3201 obj_req->pending.num_pending++;
3202}
3203
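/*
 * Copyup state machine: read the parent data, update the object map for
 * each snapshot (if the object map feature is enabled), then write the
 * object -- a deep-copyup with an empty snapshot context first if there
 * are snapshots, followed by the actual modification with the current
 * snapshot context.
 */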
3204static bool rbd_obj_advance_copyup(struct rbd_obj_request *obj_req, int *result)
3205{
Ilya Dryomov22e8bd52019-06-05 19:25:11 +02003206 struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
Ilya Dryomov793333a302019-06-13 17:44:08 +02003207 int ret;
3208
3209again:
3210 switch (obj_req->copyup_state) {
3211 case RBD_OBJ_COPYUP_START:
3212 rbd_assert(!*result);
3213
3214 ret = rbd_obj_copyup_read_parent(obj_req);
3215 if (ret) {
3216 *result = ret;
3217 return true;
3218 }
3219 if (obj_req->num_img_extents)
3220 obj_req->copyup_state = RBD_OBJ_COPYUP_READ_PARENT;
3221 else
3222 obj_req->copyup_state = RBD_OBJ_COPYUP_WRITE_OBJECT;
3223 return false;
3224 case RBD_OBJ_COPYUP_READ_PARENT:
3225 if (*result)
3226 return true;
3227
3228 if (is_zero_bvecs(obj_req->copyup_bvecs,
3229 rbd_obj_img_extents_bytes(obj_req))) {
3230 dout("%s %p detected zeros\n", __func__, obj_req);
3231 obj_req->flags |= RBD_OBJ_FLAG_COPYUP_ZEROS;
3232 }
3233
Ilya Dryomov22e8bd52019-06-05 19:25:11 +02003234 rbd_obj_copyup_object_maps(obj_req);
3235 if (!obj_req->pending.num_pending) {
3236 *result = obj_req->pending.result;
3237 obj_req->copyup_state = RBD_OBJ_COPYUP_OBJECT_MAPS;
3238 goto again;
3239 }
3240 obj_req->copyup_state = __RBD_OBJ_COPYUP_OBJECT_MAPS;
3241 return false;
3242 case __RBD_OBJ_COPYUP_OBJECT_MAPS:
3243 if (!pending_result_dec(&obj_req->pending, result))
Ilya Dryomov3da691b2018-01-29 14:04:08 +01003244 return false;
Gustavo A. R. Silvadf561f662020-08-23 17:36:59 -05003245 fallthrough;
Ilya Dryomov22e8bd52019-06-05 19:25:11 +02003246 case RBD_OBJ_COPYUP_OBJECT_MAPS:
3247 if (*result) {
3248 rbd_warn(rbd_dev, "snap object map update failed: %d",
3249 *result);
3250 return true;
3251 }
3252
Ilya Dryomov793333a302019-06-13 17:44:08 +02003253 rbd_obj_copyup_write_object(obj_req);
3254 if (!obj_req->pending.num_pending) {
3255 *result = obj_req->pending.result;
3256 obj_req->copyup_state = RBD_OBJ_COPYUP_WRITE_OBJECT;
3257 goto again;
3258 }
3259 obj_req->copyup_state = __RBD_OBJ_COPYUP_WRITE_OBJECT;
3260 return false;
3261 case __RBD_OBJ_COPYUP_WRITE_OBJECT:
3262 if (!pending_result_dec(&obj_req->pending, result))
3263 return false;
Gustavo A. R. Silvadf561f662020-08-23 17:36:59 -05003264 fallthrough;
Ilya Dryomov793333a302019-06-13 17:44:08 +02003265 case RBD_OBJ_COPYUP_WRITE_OBJECT:
Ilya Dryomov3da691b2018-01-29 14:04:08 +01003266 return true;
Ilya Dryomov3da691b2018-01-29 14:04:08 +01003267 default:
Arnd Bergmannc6244b32018-04-04 14:53:39 +02003268 BUG();
Ilya Dryomov3da691b2018-01-29 14:04:08 +01003269 }
3270}
3271
3272/*
Ilya Dryomov22e8bd52019-06-05 19:25:11 +02003273 * Return:
3274 * 0 - object map update sent
3275 * 1 - object map update isn't needed
3276 * <0 - error
Ilya Dryomov3da691b2018-01-29 14:04:08 +01003277 */
Ilya Dryomov22e8bd52019-06-05 19:25:11 +02003278static int rbd_obj_write_post_object_map(struct rbd_obj_request *obj_req)
Ilya Dryomov3da691b2018-01-29 14:04:08 +01003279{
Ilya Dryomov22e8bd52019-06-05 19:25:11 +02003280 struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
3281 u8 current_state = OBJECT_PENDING;
3282
3283 if (!(rbd_dev->header.features & RBD_FEATURE_OBJECT_MAP))
3284 return 1;
3285
3286 if (!(obj_req->flags & RBD_OBJ_FLAG_DELETION))
3287 return 1;
3288
3289 return rbd_object_map_update(obj_req, CEPH_NOSNAP, OBJECT_NONEXISTENT,
3290 &current_state);
3291}
3292
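/*
 * Write state machine: update the object map (pre), write the object,
 * run the copyup state machine if the object turned out not to exist
 * and copyup is enabled, and finally update the object map again (post,
 * deletions only).  Returns true when @obj_req is done.
 */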
Ilya Dryomov85b5e6d2019-05-14 21:06:07 +02003293static bool rbd_obj_advance_write(struct rbd_obj_request *obj_req, int *result)
Ilya Dryomov3da691b2018-01-29 14:04:08 +01003294{
Ilya Dryomov793333a302019-06-13 17:44:08 +02003295 struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
Ilya Dryomov3da691b2018-01-29 14:04:08 +01003296 int ret;
3297
Ilya Dryomov793333a302019-06-13 17:44:08 +02003298again:
Ilya Dryomov3da691b2018-01-29 14:04:08 +01003299 switch (obj_req->write_state) {
Ilya Dryomov85b5e6d2019-05-14 21:06:07 +02003300 case RBD_OBJ_WRITE_START:
3301 rbd_assert(!*result);
3302
Ilya Dryomov22e8bd52019-06-05 19:25:11 +02003303 if (rbd_obj_write_is_noop(obj_req))
3304 return true;
3305
3306 ret = rbd_obj_write_pre_object_map(obj_req);
3307 if (ret < 0) {
3308 *result = ret;
Ilya Dryomov3da691b2018-01-29 14:04:08 +01003309 return true;
3310 }
Ilya Dryomov22e8bd52019-06-05 19:25:11 +02003311 obj_req->write_state = RBD_OBJ_WRITE_PRE_OBJECT_MAP;
3312 if (ret > 0)
3313 goto again;
Ilya Dryomov3da691b2018-01-29 14:04:08 +01003314 return false;
Ilya Dryomov22e8bd52019-06-05 19:25:11 +02003315 case RBD_OBJ_WRITE_PRE_OBJECT_MAP:
3316 if (*result) {
3317 rbd_warn(rbd_dev, "pre object map update failed: %d",
3318 *result);
3319 return true;
3320 }
Ilya Dryomov85b5e6d2019-05-14 21:06:07 +02003321 ret = rbd_obj_write_object(obj_req);
3322 if (ret) {
3323 *result = ret;
3324 return true;
3325 }
3326 obj_req->write_state = RBD_OBJ_WRITE_OBJECT;
3327 return false;
Ilya Dryomov0ad5d952019-05-14 20:45:38 +02003328 case RBD_OBJ_WRITE_OBJECT:
Ilya Dryomov54ab3b22019-05-11 16:21:49 +02003329 if (*result == -ENOENT) {
Ilya Dryomov0ad5d952019-05-14 20:45:38 +02003330 if (obj_req->flags & RBD_OBJ_FLAG_COPYUP_ENABLED) {
Ilya Dryomov793333a302019-06-13 17:44:08 +02003331 *result = 0;
3332 obj_req->copyup_state = RBD_OBJ_COPYUP_START;
3333 obj_req->write_state = __RBD_OBJ_WRITE_COPYUP;
3334 goto again;
Ilya Dryomov3da691b2018-01-29 14:04:08 +01003335 }
Ilya Dryomov0ad5d952019-05-14 20:45:38 +02003336 /*
3337 * On a non-existent object:
3338 * delete returns -ENOENT, truncate/zero returns 0
3339 */
3340 if (obj_req->flags & RBD_OBJ_FLAG_DELETION)
3341 *result = 0;
Ilya Dryomov3da691b2018-01-29 14:04:08 +01003342 }
Ilya Dryomov793333a302019-06-13 17:44:08 +02003343 if (*result)
3344 return true;
3345
3346 obj_req->write_state = RBD_OBJ_WRITE_COPYUP;
3347 goto again;
3348 case __RBD_OBJ_WRITE_COPYUP:
3349 if (!rbd_obj_advance_copyup(obj_req, result))
3350 return false;
Gustavo A. R. Silvadf561f662020-08-23 17:36:59 -05003351 fallthrough;
Ilya Dryomov793333a302019-06-13 17:44:08 +02003352 case RBD_OBJ_WRITE_COPYUP:
Ilya Dryomov22e8bd52019-06-05 19:25:11 +02003353 if (*result) {
Ilya Dryomov793333a302019-06-13 17:44:08 +02003354 rbd_warn(rbd_dev, "copyup failed: %d", *result);
Ilya Dryomov22e8bd52019-06-05 19:25:11 +02003355 return true;
3356 }
3357 ret = rbd_obj_write_post_object_map(obj_req);
3358 if (ret < 0) {
3359 *result = ret;
3360 return true;
3361 }
3362 obj_req->write_state = RBD_OBJ_WRITE_POST_OBJECT_MAP;
3363 if (ret > 0)
3364 goto again;
3365 return false;
3366 case RBD_OBJ_WRITE_POST_OBJECT_MAP:
3367 if (*result)
3368 rbd_warn(rbd_dev, "post object map update failed: %d",
3369 *result);
Ilya Dryomov3da691b2018-01-29 14:04:08 +01003370 return true;
Ilya Dryomov3da691b2018-01-29 14:04:08 +01003371 default:
Arnd Bergmannc6244b32018-04-04 14:53:39 +02003372 BUG();
Ilya Dryomov3da691b2018-01-29 14:04:08 +01003373 }
3374}
3375
Ilya Dryomov3da691b2018-01-29 14:04:08 +01003376/*
Ilya Dryomov0ad5d952019-05-14 20:45:38 +02003377 * Return true if @obj_req is completed.
Ilya Dryomov3da691b2018-01-29 14:04:08 +01003378 */
Ilya Dryomov54ab3b22019-05-11 16:21:49 +02003379static bool __rbd_obj_handle_request(struct rbd_obj_request *obj_req,
3380 int *result)
Ilya Dryomov7114eda2018-02-01 11:50:47 +01003381{
3382 struct rbd_img_request *img_req = obj_req->img_request;
Ilya Dryomov0192ce22019-05-16 15:06:56 +02003383 struct rbd_device *rbd_dev = img_req->rbd_dev;
Ilya Dryomov0ad5d952019-05-14 20:45:38 +02003384 bool done;
Ilya Dryomov7114eda2018-02-01 11:50:47 +01003385
Ilya Dryomov85b5e6d2019-05-14 21:06:07 +02003386 mutex_lock(&obj_req->state_mutex);
Ilya Dryomov0ad5d952019-05-14 20:45:38 +02003387 if (!rbd_img_is_write(img_req))
Ilya Dryomov85b5e6d2019-05-14 21:06:07 +02003388 done = rbd_obj_advance_read(obj_req, result);
Ilya Dryomov0ad5d952019-05-14 20:45:38 +02003389 else
Ilya Dryomov85b5e6d2019-05-14 21:06:07 +02003390 done = rbd_obj_advance_write(obj_req, result);
3391 mutex_unlock(&obj_req->state_mutex);
Alex Elder02c74fb2013-05-06 17:40:33 -05003392
Ilya Dryomov0192ce22019-05-16 15:06:56 +02003393 if (done && *result) {
3394 rbd_assert(*result < 0);
3395 rbd_warn(rbd_dev, "%s at objno %llu %llu~%llu result %d",
3396 obj_op_name(img_req->op_type), obj_req->ex.oe_objno,
3397 obj_req->ex.oe_off, obj_req->ex.oe_len, *result);
Alex Eldera9e8ba2c2013-04-21 00:32:07 -05003398 }
Ilya Dryomov0ad5d952019-05-14 20:45:38 +02003399 return done;
Alex Elder8b3e1a52013-01-24 16:13:36 -06003400}
3401
Ilya Dryomov0192ce22019-05-16 15:06:56 +02003402/*
3403 * This is open-coded in rbd_img_handle_request() to avoid parent chain
3404 * recursion.
3405 */
Ilya Dryomov54ab3b22019-05-11 16:21:49 +02003406static void rbd_obj_handle_request(struct rbd_obj_request *obj_req, int result)
Alex Elder8b3e1a52013-01-24 16:13:36 -06003407{
Ilya Dryomov0192ce22019-05-16 15:06:56 +02003408 if (__rbd_obj_handle_request(obj_req, &result))
3409 rbd_img_handle_request(obj_req->img_request, result);
Ilya Dryomov7114eda2018-02-01 11:50:47 +01003410}
Alex Elder8b3e1a52013-01-24 16:13:36 -06003411
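/*
 * An exclusive lock is required for writes and, when lock_on_read or the
 * object map feature is enabled, for reads as well.  No lock is needed if
 * the exclusive-lock feature is disabled or the mapping is read-only.
 */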
Ilya Dryomove1fddc82019-05-30 16:07:48 +02003412static bool need_exclusive_lock(struct rbd_img_request *img_req)
Ilya Dryomov7114eda2018-02-01 11:50:47 +01003413{
Ilya Dryomove1fddc82019-05-30 16:07:48 +02003414 struct rbd_device *rbd_dev = img_req->rbd_dev;
3415
3416 if (!(rbd_dev->header.features & RBD_FEATURE_EXCLUSIVE_LOCK))
3417 return false;
3418
Ilya Dryomov3fe69922019-11-12 19:41:48 +01003419 if (rbd_is_ro(rbd_dev))
Ilya Dryomove1fddc82019-05-30 16:07:48 +02003420 return false;
3421
Ilya Dryomov7114eda2018-02-01 11:50:47 +01003422 rbd_assert(!test_bit(IMG_REQ_CHILD, &img_req->flags));
Ilya Dryomov22e8bd52019-06-05 19:25:11 +02003423 if (rbd_dev->opts->lock_on_read ||
3424 (rbd_dev->header.features & RBD_FEATURE_OBJECT_MAP))
Ilya Dryomove1fddc82019-05-30 16:07:48 +02003425 return true;
Alex Elder8b3e1a52013-01-24 16:13:36 -06003426
Ilya Dryomove1fddc82019-05-30 16:07:48 +02003427 return rbd_img_is_write(img_req);
Ilya Dryomov3da691b2018-01-29 14:04:08 +01003428}
Alex Elder8b3e1a52013-01-24 16:13:36 -06003429
Ilya Dryomov637cd062019-06-06 17:14:49 +02003430static bool rbd_lock_add_request(struct rbd_img_request *img_req)
Ilya Dryomov3da691b2018-01-29 14:04:08 +01003431{
Ilya Dryomove1fddc82019-05-30 16:07:48 +02003432 struct rbd_device *rbd_dev = img_req->rbd_dev;
Ilya Dryomov637cd062019-06-06 17:14:49 +02003433 bool locked;
Ilya Dryomove1fddc82019-05-30 16:07:48 +02003434
3435 lockdep_assert_held(&rbd_dev->lock_rwsem);
Ilya Dryomov637cd062019-06-06 17:14:49 +02003436 locked = rbd_dev->lock_state == RBD_LOCK_STATE_LOCKED;
Ilya Dryomove1fddc82019-05-30 16:07:48 +02003437 spin_lock(&rbd_dev->lock_lists_lock);
3438 rbd_assert(list_empty(&img_req->lock_item));
Ilya Dryomov637cd062019-06-06 17:14:49 +02003439 if (!locked)
3440 list_add_tail(&img_req->lock_item, &rbd_dev->acquiring_list);
3441 else
3442 list_add_tail(&img_req->lock_item, &rbd_dev->running_list);
Ilya Dryomove1fddc82019-05-30 16:07:48 +02003443 spin_unlock(&rbd_dev->lock_lists_lock);
Ilya Dryomov637cd062019-06-06 17:14:49 +02003444 return locked;
Ilya Dryomove1fddc82019-05-30 16:07:48 +02003445}
3446
3447static void rbd_lock_del_request(struct rbd_img_request *img_req)
3448{
3449 struct rbd_device *rbd_dev = img_req->rbd_dev;
3450 bool need_wakeup;
3451
3452 lockdep_assert_held(&rbd_dev->lock_rwsem);
3453 spin_lock(&rbd_dev->lock_lists_lock);
3454 rbd_assert(!list_empty(&img_req->lock_item));
3455 list_del_init(&img_req->lock_item);
3456 need_wakeup = (rbd_dev->lock_state == RBD_LOCK_STATE_RELEASING &&
3457 list_empty(&rbd_dev->running_list));
3458 spin_unlock(&rbd_dev->lock_lists_lock);
3459 if (need_wakeup)
3460 complete(&rbd_dev->releasing_wait);
3461}
3462
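/*
 * Return:
 *   1 - no exclusive lock is needed or it is already held (proceed)
 *   0 - lock acquisition queued, the request will be kicked by
 *       wake_lock_waiters() once the lock is acquired
 *  <0 - error
 */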
Ilya Dryomov637cd062019-06-06 17:14:49 +02003463static int rbd_img_exclusive_lock(struct rbd_img_request *img_req)
3464{
3465 struct rbd_device *rbd_dev = img_req->rbd_dev;
3466
3467 if (!need_exclusive_lock(img_req))
3468 return 1;
3469
3470 if (rbd_lock_add_request(img_req))
3471 return 1;
3472
3473 if (rbd_dev->opts->exclusive) {
3474 WARN_ON(1); /* lock got released? */
3475 return -EROFS;
3476 }
3477
3478 /*
3479 * Note the use of mod_delayed_work() in rbd_acquire_lock()
3480 * and cancel_delayed_work() in wake_lock_waiters().
3481 */
3482 dout("%s rbd_dev %p queueing lock_dwork\n", __func__, rbd_dev);
3483 queue_delayed_work(rbd_dev->task_wq, &rbd_dev->lock_dwork, 0);
3484 return 0;
3485}
3486
Ilya Dryomov0192ce22019-05-16 15:06:56 +02003487static void rbd_img_object_requests(struct rbd_img_request *img_req)
3488{
3489 struct rbd_obj_request *obj_req;
3490
3491 rbd_assert(!img_req->pending.result && !img_req->pending.num_pending);
3492
3493 for_each_obj_request(img_req, obj_req) {
3494 int result = 0;
3495
3496 if (__rbd_obj_handle_request(obj_req, &result)) {
3497 if (result) {
3498 img_req->pending.result = result;
3499 return;
3500 }
3501 } else {
3502 img_req->pending.num_pending++;
3503 }
3504 }
3505}
3506
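/*
 * Image request state machine: take the exclusive lock if needed, then
 * kick off all object requests and wait for them to complete.  Returns
 * true when @img_req is done.
 */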
3507static bool rbd_img_advance(struct rbd_img_request *img_req, int *result)
3508{
Ilya Dryomov637cd062019-06-06 17:14:49 +02003509 struct rbd_device *rbd_dev = img_req->rbd_dev;
3510 int ret;
Ilya Dryomov7114eda2018-02-01 11:50:47 +01003511
3512again:
Ilya Dryomov0192ce22019-05-16 15:06:56 +02003513 switch (img_req->state) {
3514 case RBD_IMG_START:
3515 rbd_assert(!*result);
Ilya Dryomov3da691b2018-01-29 14:04:08 +01003516
Ilya Dryomov637cd062019-06-06 17:14:49 +02003517 ret = rbd_img_exclusive_lock(img_req);
3518 if (ret < 0) {
3519 *result = ret;
3520 return true;
3521 }
3522 img_req->state = RBD_IMG_EXCLUSIVE_LOCK;
3523 if (ret > 0)
3524 goto again;
3525 return false;
3526 case RBD_IMG_EXCLUSIVE_LOCK:
3527 if (*result)
3528 return true;
3529
3530 rbd_assert(!need_exclusive_lock(img_req) ||
3531 __rbd_is_lock_owner(rbd_dev));
3532
Ilya Dryomov0192ce22019-05-16 15:06:56 +02003533 rbd_img_object_requests(img_req);
3534 if (!img_req->pending.num_pending) {
3535 *result = img_req->pending.result;
3536 img_req->state = RBD_IMG_OBJECT_REQUESTS;
3537 goto again;
3538 }
3539 img_req->state = __RBD_IMG_OBJECT_REQUESTS;
3540 return false;
3541 case __RBD_IMG_OBJECT_REQUESTS:
3542 if (!pending_result_dec(&img_req->pending, result))
3543 return false;
Gustavo A. R. Silvadf561f662020-08-23 17:36:59 -05003544 fallthrough;
Ilya Dryomov0192ce22019-05-16 15:06:56 +02003545 case RBD_IMG_OBJECT_REQUESTS:
3546 return true;
3547 default:
3548 BUG();
Ilya Dryomov7114eda2018-02-01 11:50:47 +01003549 }
Ilya Dryomov0192ce22019-05-16 15:06:56 +02003550}
Ilya Dryomov7114eda2018-02-01 11:50:47 +01003551
Ilya Dryomov0192ce22019-05-16 15:06:56 +02003552/*
3553 * Return true if @img_req is completed.
3554 */
3555static bool __rbd_img_handle_request(struct rbd_img_request *img_req,
3556 int *result)
3557{
3558 struct rbd_device *rbd_dev = img_req->rbd_dev;
3559 bool done;
3560
Ilya Dryomove1fddc82019-05-30 16:07:48 +02003561 if (need_exclusive_lock(img_req)) {
3562 down_read(&rbd_dev->lock_rwsem);
3563 mutex_lock(&img_req->state_mutex);
3564 done = rbd_img_advance(img_req, result);
3565 if (done)
3566 rbd_lock_del_request(img_req);
3567 mutex_unlock(&img_req->state_mutex);
3568 up_read(&rbd_dev->lock_rwsem);
3569 } else {
3570 mutex_lock(&img_req->state_mutex);
3571 done = rbd_img_advance(img_req, result);
3572 mutex_unlock(&img_req->state_mutex);
Ilya Dryomov7114eda2018-02-01 11:50:47 +01003573 }
3574
Ilya Dryomov0192ce22019-05-16 15:06:56 +02003575 if (done && *result) {
3576 rbd_assert(*result < 0);
3577 rbd_warn(rbd_dev, "%s%s result %d",
3578 test_bit(IMG_REQ_CHILD, &img_req->flags) ? "child " : "",
3579 obj_op_name(img_req->op_type), *result);
3580 }
3581 return done;
3582}
3583
3584static void rbd_img_handle_request(struct rbd_img_request *img_req, int result)
3585{
3586again:
3587 if (!__rbd_img_handle_request(img_req, &result))
3588 return;
3589
Ilya Dryomov7114eda2018-02-01 11:50:47 +01003590 if (test_bit(IMG_REQ_CHILD, &img_req->flags)) {
Ilya Dryomov0192ce22019-05-16 15:06:56 +02003591 struct rbd_obj_request *obj_req = img_req->obj_request;
3592
Hannes Reinecke679a97d2020-01-31 11:37:36 +01003593 rbd_img_request_destroy(img_req);
Ilya Dryomov0192ce22019-05-16 15:06:56 +02003594 if (__rbd_obj_handle_request(obj_req, &result)) {
3595 img_req = obj_req->img_request;
3596 goto again;
3597 }
3598 } else {
Ilya Dryomov59e542c2020-02-12 15:23:58 +01003599 struct request *rq = blk_mq_rq_from_pdu(img_req);
Ilya Dryomov0192ce22019-05-16 15:06:56 +02003600
Hannes Reinecke679a97d2020-01-31 11:37:36 +01003601 rbd_img_request_destroy(img_req);
Ilya Dryomov0192ce22019-05-16 15:06:56 +02003602 blk_mq_end_request(rq, errno_to_blk_status(result));
Ilya Dryomov7114eda2018-02-01 11:50:47 +01003603 }
Alex Elder8b3e1a52013-01-24 16:13:36 -06003604}
3605
Ilya Dryomoved95b212016-08-12 16:40:02 +02003606static const struct rbd_client_id rbd_empty_cid;
3607
3608static bool rbd_cid_equal(const struct rbd_client_id *lhs,
3609 const struct rbd_client_id *rhs)
3610{
3611 return lhs->gid == rhs->gid && lhs->handle == rhs->handle;
3612}
3613
3614static struct rbd_client_id rbd_get_cid(struct rbd_device *rbd_dev)
3615{
3616 struct rbd_client_id cid;
3617
3618 mutex_lock(&rbd_dev->watch_mutex);
3619 cid.gid = ceph_client_gid(rbd_dev->rbd_client->client);
3620 cid.handle = rbd_dev->watch_cookie;
3621 mutex_unlock(&rbd_dev->watch_mutex);
3622 return cid;
3623}
3624
3625/*
3626 * lock_rwsem must be held for write
3627 */
3628static void rbd_set_owner_cid(struct rbd_device *rbd_dev,
3629 const struct rbd_client_id *cid)
3630{
3631 dout("%s rbd_dev %p %llu-%llu -> %llu-%llu\n", __func__, rbd_dev,
3632 rbd_dev->owner_cid.gid, rbd_dev->owner_cid.handle,
3633 cid->gid, cid->handle);
3634 rbd_dev->owner_cid = *cid; /* struct */
3635}
3636
3637static void format_lock_cookie(struct rbd_device *rbd_dev, char *buf)
3638{
3639 mutex_lock(&rbd_dev->watch_mutex);
3640 sprintf(buf, "%s %llu", RBD_LOCK_COOKIE_PREFIX, rbd_dev->watch_cookie);
3641 mutex_unlock(&rbd_dev->watch_mutex);
3642}
3643
Florian Margaineedd8ca82017-12-13 16:43:59 +01003644static void __rbd_lock(struct rbd_device *rbd_dev, const char *cookie)
3645{
3646 struct rbd_client_id cid = rbd_get_cid(rbd_dev);
3647
Ilya Dryomova2b1da02019-05-30 11:15:23 +02003648 rbd_dev->lock_state = RBD_LOCK_STATE_LOCKED;
Florian Margaineedd8ca82017-12-13 16:43:59 +01003649 strcpy(rbd_dev->lock_cookie, cookie);
3650 rbd_set_owner_cid(rbd_dev, &cid);
3651 queue_work(rbd_dev->task_wq, &rbd_dev->acquired_lock_work);
3652}
3653
Ilya Dryomoved95b212016-08-12 16:40:02 +02003654/*
3655 * lock_rwsem must be held for write
3656 */
3657static int rbd_lock(struct rbd_device *rbd_dev)
3658{
3659 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
Ilya Dryomoved95b212016-08-12 16:40:02 +02003660 char cookie[32];
3661 int ret;
3662
Ilya Dryomovcbbfb0f2017-04-13 12:17:38 +02003663 WARN_ON(__rbd_is_lock_owner(rbd_dev) ||
3664 rbd_dev->lock_cookie[0] != '\0');
Ilya Dryomoved95b212016-08-12 16:40:02 +02003665
3666 format_lock_cookie(rbd_dev, cookie);
3667 ret = ceph_cls_lock(osdc, &rbd_dev->header_oid, &rbd_dev->header_oloc,
3668 RBD_LOCK_NAME, CEPH_CLS_LOCK_EXCLUSIVE, cookie,
3669 RBD_LOCK_TAG, "", 0);
3670 if (ret)
3671 return ret;
3672
Florian Margaineedd8ca82017-12-13 16:43:59 +01003673 __rbd_lock(rbd_dev, cookie);
Ilya Dryomoved95b212016-08-12 16:40:02 +02003674 return 0;
3675}
3676
3677/*
3678 * lock_rwsem must be held for write
3679 */
Ilya Dryomovbbead742017-04-13 12:17:38 +02003680static void rbd_unlock(struct rbd_device *rbd_dev)
Ilya Dryomoved95b212016-08-12 16:40:02 +02003681{
3682 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
Ilya Dryomoved95b212016-08-12 16:40:02 +02003683 int ret;
3684
Ilya Dryomovcbbfb0f2017-04-13 12:17:38 +02003685 WARN_ON(!__rbd_is_lock_owner(rbd_dev) ||
3686 rbd_dev->lock_cookie[0] == '\0');
Ilya Dryomoved95b212016-08-12 16:40:02 +02003687
Ilya Dryomoved95b212016-08-12 16:40:02 +02003688 ret = ceph_cls_unlock(osdc, &rbd_dev->header_oid, &rbd_dev->header_oloc,
Ilya Dryomovcbbfb0f2017-04-13 12:17:38 +02003689 RBD_LOCK_NAME, rbd_dev->lock_cookie);
Ilya Dryomovbbead742017-04-13 12:17:38 +02003690 if (ret && ret != -ENOENT)
Ilya Dryomov637cd062019-06-06 17:14:49 +02003691 rbd_warn(rbd_dev, "failed to unlock header: %d", ret);
Ilya Dryomoved95b212016-08-12 16:40:02 +02003692
Ilya Dryomovbbead742017-04-13 12:17:38 +02003693 /* treat errors as the image is unlocked */
3694 rbd_dev->lock_state = RBD_LOCK_STATE_UNLOCKED;
Ilya Dryomovcbbfb0f2017-04-13 12:17:38 +02003695 rbd_dev->lock_cookie[0] = '\0';
Ilya Dryomoved95b212016-08-12 16:40:02 +02003696 rbd_set_owner_cid(rbd_dev, &rbd_empty_cid);
3697 queue_work(rbd_dev->task_wq, &rbd_dev->released_lock_work);
Ilya Dryomoved95b212016-08-12 16:40:02 +02003698}
3699
3700static int __rbd_notify_op_lock(struct rbd_device *rbd_dev,
3701 enum rbd_notify_op notify_op,
3702 struct page ***preply_pages,
3703 size_t *preply_len)
3704{
3705 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
3706 struct rbd_client_id cid = rbd_get_cid(rbd_dev);
Kyle Spiers08a79102018-03-17 09:44:01 -07003707 char buf[4 + 8 + 8 + CEPH_ENCODING_START_BLK_LEN];
3708 int buf_size = sizeof(buf);
Ilya Dryomoved95b212016-08-12 16:40:02 +02003709 void *p = buf;
3710
3711 dout("%s rbd_dev %p notify_op %d\n", __func__, rbd_dev, notify_op);
3712
3713 /* encode *LockPayload NotifyMessage (op + ClientId) */
3714 ceph_start_encoding(&p, 2, 1, buf_size - CEPH_ENCODING_START_BLK_LEN);
3715 ceph_encode_32(&p, notify_op);
3716 ceph_encode_64(&p, cid.gid);
3717 ceph_encode_64(&p, cid.handle);
3718
3719 return ceph_osdc_notify(osdc, &rbd_dev->header_oid,
3720 &rbd_dev->header_oloc, buf, buf_size,
3721 RBD_NOTIFY_TIMEOUT, preply_pages, preply_len);
3722}
3723
3724static void rbd_notify_op_lock(struct rbd_device *rbd_dev,
3725 enum rbd_notify_op notify_op)
3726{
Ilya Dryomov8ae02992020-03-17 15:18:48 +01003727 __rbd_notify_op_lock(rbd_dev, notify_op, NULL, NULL);
Ilya Dryomoved95b212016-08-12 16:40:02 +02003728}
3729
3730static void rbd_notify_acquired_lock(struct work_struct *work)
3731{
3732 struct rbd_device *rbd_dev = container_of(work, struct rbd_device,
3733 acquired_lock_work);
3734
3735 rbd_notify_op_lock(rbd_dev, RBD_NOTIFY_OP_ACQUIRED_LOCK);
3736}
3737
3738static void rbd_notify_released_lock(struct work_struct *work)
3739{
3740 struct rbd_device *rbd_dev = container_of(work, struct rbd_device,
3741 released_lock_work);
3742
3743 rbd_notify_op_lock(rbd_dev, RBD_NOTIFY_OP_RELEASED_LOCK);
3744}
3745
3746static int rbd_request_lock(struct rbd_device *rbd_dev)
3747{
3748 struct page **reply_pages;
3749 size_t reply_len;
3750 bool lock_owner_responded = false;
3751 int ret;
3752
3753 dout("%s rbd_dev %p\n", __func__, rbd_dev);
3754
3755 ret = __rbd_notify_op_lock(rbd_dev, RBD_NOTIFY_OP_REQUEST_LOCK,
3756 &reply_pages, &reply_len);
3757 if (ret && ret != -ETIMEDOUT) {
3758 rbd_warn(rbd_dev, "failed to request lock: %d", ret);
3759 goto out;
3760 }
3761
3762 if (reply_len > 0 && reply_len <= PAGE_SIZE) {
3763 void *p = page_address(reply_pages[0]);
3764 void *const end = p + reply_len;
3765 u32 n;
3766
3767 ceph_decode_32_safe(&p, end, n, e_inval); /* num_acks */
3768 while (n--) {
3769 u8 struct_v;
3770 u32 len;
3771
3772 ceph_decode_need(&p, end, 8 + 8, e_inval);
3773 p += 8 + 8; /* skip gid and cookie */
3774
3775 ceph_decode_32_safe(&p, end, len, e_inval);
3776 if (!len)
3777 continue;
3778
3779 if (lock_owner_responded) {
3780 rbd_warn(rbd_dev,
3781 "duplicate lock owners detected");
3782 ret = -EIO;
3783 goto out;
3784 }
3785
3786 lock_owner_responded = true;
3787 ret = ceph_start_decoding(&p, end, 1, "ResponseMessage",
3788 &struct_v, &len);
3789 if (ret) {
3790 rbd_warn(rbd_dev,
3791 "failed to decode ResponseMessage: %d",
3792 ret);
3793 goto e_inval;
3794 }
3795
3796 ret = ceph_decode_32(&p);
3797 }
3798 }
3799
3800 if (!lock_owner_responded) {
3801 rbd_warn(rbd_dev, "no lock owners detected");
3802 ret = -ETIMEDOUT;
3803 }
3804
3805out:
3806 ceph_release_page_vector(reply_pages, calc_pages_for(0, reply_len));
3807 return ret;
3808
3809e_inval:
3810 ret = -EINVAL;
3811 goto out;
3812}
3813
Ilya Dryomov637cd062019-06-06 17:14:49 +02003814/*
3815 * Either image request state machine(s) or rbd_add_acquire_lock()
3816 * (i.e. "rbd map").
3817 */
3818static void wake_lock_waiters(struct rbd_device *rbd_dev, int result)
Ilya Dryomoved95b212016-08-12 16:40:02 +02003819{
Ilya Dryomov637cd062019-06-06 17:14:49 +02003820 struct rbd_img_request *img_req;
3821
3822 dout("%s rbd_dev %p result %d\n", __func__, rbd_dev, result);
Linus Torvaldsd9b9c892019-07-18 11:05:25 -07003823 lockdep_assert_held_write(&rbd_dev->lock_rwsem);
Ilya Dryomoved95b212016-08-12 16:40:02 +02003824
3825 cancel_delayed_work(&rbd_dev->lock_dwork);
Ilya Dryomov637cd062019-06-06 17:14:49 +02003826 if (!completion_done(&rbd_dev->acquire_wait)) {
3827 rbd_assert(list_empty(&rbd_dev->acquiring_list) &&
3828 list_empty(&rbd_dev->running_list));
3829 rbd_dev->acquire_err = result;
3830 complete_all(&rbd_dev->acquire_wait);
3831 return;
3832 }
3833
3834 list_for_each_entry(img_req, &rbd_dev->acquiring_list, lock_item) {
3835 mutex_lock(&img_req->state_mutex);
3836 rbd_assert(img_req->state == RBD_IMG_EXCLUSIVE_LOCK);
3837 rbd_img_schedule(img_req, result);
3838 mutex_unlock(&img_req->state_mutex);
3839 }
3840
3841 list_splice_tail_init(&rbd_dev->acquiring_list, &rbd_dev->running_list);
Ilya Dryomoved95b212016-08-12 16:40:02 +02003842}
3843
3844static int get_lock_owner_info(struct rbd_device *rbd_dev,
3845 struct ceph_locker **lockers, u32 *num_lockers)
3846{
3847 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
3848 u8 lock_type;
3849 char *lock_tag;
3850 int ret;
3851
3852 dout("%s rbd_dev %p\n", __func__, rbd_dev);
3853
3854 ret = ceph_cls_lock_info(osdc, &rbd_dev->header_oid,
3855 &rbd_dev->header_oloc, RBD_LOCK_NAME,
3856 &lock_type, &lock_tag, lockers, num_lockers);
3857 if (ret)
3858 return ret;
3859
3860 if (*num_lockers == 0) {
3861 dout("%s rbd_dev %p no lockers detected\n", __func__, rbd_dev);
3862 goto out;
3863 }
3864
3865 if (strcmp(lock_tag, RBD_LOCK_TAG)) {
3866 rbd_warn(rbd_dev, "locked by external mechanism, tag %s",
3867 lock_tag);
3868 ret = -EBUSY;
3869 goto out;
3870 }
3871
3872 if (lock_type == CEPH_CLS_LOCK_SHARED) {
3873 rbd_warn(rbd_dev, "shared lock type detected");
3874 ret = -EBUSY;
3875 goto out;
3876 }
3877
3878 if (strncmp((*lockers)[0].id.cookie, RBD_LOCK_COOKIE_PREFIX,
3879 strlen(RBD_LOCK_COOKIE_PREFIX))) {
3880 rbd_warn(rbd_dev, "locked by external mechanism, cookie %s",
3881 (*lockers)[0].id.cookie);
3882 ret = -EBUSY;
3883 goto out;
3884 }
3885
3886out:
3887 kfree(lock_tag);
3888 return ret;
3889}
3890
3891static int find_watcher(struct rbd_device *rbd_dev,
3892 const struct ceph_locker *locker)
3893{
3894 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
3895 struct ceph_watch_item *watchers;
3896 u32 num_watchers;
3897 u64 cookie;
3898 int i;
3899 int ret;
3900
3901 ret = ceph_osdc_list_watchers(osdc, &rbd_dev->header_oid,
3902 &rbd_dev->header_oloc, &watchers,
3903 &num_watchers);
3904 if (ret)
3905 return ret;
3906
3907 sscanf(locker->id.cookie, RBD_LOCK_COOKIE_PREFIX " %llu", &cookie);
3908 for (i = 0; i < num_watchers; i++) {
Ilya Dryomov313771e2020-11-25 14:41:59 +01003909 /*
3910 * Ignore addr->type while comparing. This mimics
3911 * entity_addr_t::get_legacy_str() + strcmp().
3912 */
3913 if (ceph_addr_equal_no_type(&watchers[i].addr,
3914 &locker->info.addr) &&
Ilya Dryomoved95b212016-08-12 16:40:02 +02003915 watchers[i].cookie == cookie) {
3916 struct rbd_client_id cid = {
3917 .gid = le64_to_cpu(watchers[i].name.num),
3918 .handle = cookie,
3919 };
3920
3921 dout("%s rbd_dev %p found cid %llu-%llu\n", __func__,
3922 rbd_dev, cid.gid, cid.handle);
3923 rbd_set_owner_cid(rbd_dev, &cid);
3924 ret = 1;
3925 goto out;
3926 }
3927 }
3928
3929 dout("%s rbd_dev %p no watchers\n", __func__, rbd_dev);
3930 ret = 0;
3931out:
3932 kfree(watchers);
3933 return ret;
3934}
3935
3936/*
3937 * lock_rwsem must be held for write
3938 */
3939static int rbd_try_lock(struct rbd_device *rbd_dev)
3940{
3941 struct ceph_client *client = rbd_dev->rbd_client->client;
3942 struct ceph_locker *lockers;
3943 u32 num_lockers;
3944 int ret;
3945
3946 for (;;) {
3947 ret = rbd_lock(rbd_dev);
3948 if (ret != -EBUSY)
3949 return ret;
3950
3951 /* determine if the current lock holder is still alive */
3952 ret = get_lock_owner_info(rbd_dev, &lockers, &num_lockers);
3953 if (ret)
3954 return ret;
3955
3956 if (num_lockers == 0)
3957 goto again;
3958
3959 ret = find_watcher(rbd_dev, lockers);
Ilya Dryomov637cd062019-06-06 17:14:49 +02003960 if (ret)
3961 goto out; /* request lock or error */
Ilya Dryomoved95b212016-08-12 16:40:02 +02003962
Ilya Dryomov22e8bd52019-06-05 19:25:11 +02003963 rbd_warn(rbd_dev, "breaking header lock owned by %s%llu",
Ilya Dryomoved95b212016-08-12 16:40:02 +02003964 ENTITY_NAME(lockers[0].id.name));
3965
Ilya Dryomov0b98acd2020-09-14 13:39:19 +02003966 ret = ceph_monc_blocklist_add(&client->monc,
Ilya Dryomoved95b212016-08-12 16:40:02 +02003967 &lockers[0].info.addr);
3968 if (ret) {
Ilya Dryomov0b98acd2020-09-14 13:39:19 +02003969 rbd_warn(rbd_dev, "blocklist of %s%llu failed: %d",
Ilya Dryomoved95b212016-08-12 16:40:02 +02003970 ENTITY_NAME(lockers[0].id.name), ret);
3971 goto out;
3972 }
3973
3974 ret = ceph_cls_break_lock(&client->osdc, &rbd_dev->header_oid,
3975 &rbd_dev->header_oloc, RBD_LOCK_NAME,
3976 lockers[0].id.cookie,
3977 &lockers[0].id.name);
3978 if (ret && ret != -ENOENT)
3979 goto out;
3980
3981again:
3982 ceph_free_lockers(lockers, num_lockers);
3983 }
3984
3985out:
3986 ceph_free_lockers(lockers, num_lockers);
3987 return ret;
3988}
3989
Ilya Dryomov22e8bd52019-06-05 19:25:11 +02003990static int rbd_post_acquire_action(struct rbd_device *rbd_dev)
Ilya Dryomoved95b212016-08-12 16:40:02 +02003991{
Ilya Dryomov22e8bd52019-06-05 19:25:11 +02003992 int ret;
3993
3994 if (rbd_dev->header.features & RBD_FEATURE_OBJECT_MAP) {
3995 ret = rbd_object_map_open(rbd_dev);
3996 if (ret)
3997 return ret;
3998 }
3999
4000 return 0;
4001}
4002
Ilya Dryomoved95b212016-08-12 16:40:02 +02004003/*
Ilya Dryomov637cd062019-06-06 17:14:49 +02004004 * Return:
4005 * 0 - lock acquired
4006 * 1 - caller should call rbd_request_lock()
4007 * <0 - error
Ilya Dryomoved95b212016-08-12 16:40:02 +02004008 */
Ilya Dryomov637cd062019-06-06 17:14:49 +02004009static int rbd_try_acquire_lock(struct rbd_device *rbd_dev)
Ilya Dryomoved95b212016-08-12 16:40:02 +02004010{
Ilya Dryomov637cd062019-06-06 17:14:49 +02004011 int ret;
Ilya Dryomoved95b212016-08-12 16:40:02 +02004012
4013 down_read(&rbd_dev->lock_rwsem);
4014 dout("%s rbd_dev %p read lock_state %d\n", __func__, rbd_dev,
4015 rbd_dev->lock_state);
4016 if (__rbd_is_lock_owner(rbd_dev)) {
Ilya Dryomoved95b212016-08-12 16:40:02 +02004017 up_read(&rbd_dev->lock_rwsem);
Ilya Dryomov637cd062019-06-06 17:14:49 +02004018 return 0;
Ilya Dryomoved95b212016-08-12 16:40:02 +02004019 }
4020
4021 up_read(&rbd_dev->lock_rwsem);
4022 down_write(&rbd_dev->lock_rwsem);
4023 dout("%s rbd_dev %p write lock_state %d\n", __func__, rbd_dev,
4024 rbd_dev->lock_state);
Ilya Dryomov637cd062019-06-06 17:14:49 +02004025 if (__rbd_is_lock_owner(rbd_dev)) {
4026 up_write(&rbd_dev->lock_rwsem);
4027 return 0;
Ilya Dryomoved95b212016-08-12 16:40:02 +02004028 }
4029
Ilya Dryomov637cd062019-06-06 17:14:49 +02004030 ret = rbd_try_lock(rbd_dev);
4031 if (ret < 0) {
4032 rbd_warn(rbd_dev, "failed to lock header: %d", ret);
Ilya Dryomov0b98acd2020-09-14 13:39:19 +02004033 if (ret == -EBLOCKLISTED)
Ilya Dryomov637cd062019-06-06 17:14:49 +02004034 goto out;
4035
4036 ret = 1; /* request lock anyway */
4037 }
4038 if (ret > 0) {
4039 up_write(&rbd_dev->lock_rwsem);
4040 return ret;
4041 }
4042
4043 rbd_assert(rbd_dev->lock_state == RBD_LOCK_STATE_LOCKED);
4044 rbd_assert(list_empty(&rbd_dev->running_list));
4045
Ilya Dryomov22e8bd52019-06-05 19:25:11 +02004046 ret = rbd_post_acquire_action(rbd_dev);
4047 if (ret) {
4048 rbd_warn(rbd_dev, "post-acquire action failed: %d", ret);
4049 /*
4050 * Can't stay in RBD_LOCK_STATE_LOCKED because
4051 * rbd_lock_add_request() would let the request through,
4052 * assuming that e.g. object map is locked and loaded.
4053 */
4054 rbd_unlock(rbd_dev);
4055 }
4056
Ilya Dryomov637cd062019-06-06 17:14:49 +02004057out:
4058 wake_lock_waiters(rbd_dev, ret);
Ilya Dryomoved95b212016-08-12 16:40:02 +02004059 up_write(&rbd_dev->lock_rwsem);
Ilya Dryomov637cd062019-06-06 17:14:49 +02004060 return ret;
Ilya Dryomoved95b212016-08-12 16:40:02 +02004061}
4062
4063static void rbd_acquire_lock(struct work_struct *work)
4064{
4065 struct rbd_device *rbd_dev = container_of(to_delayed_work(work),
4066 struct rbd_device, lock_dwork);
Ilya Dryomov637cd062019-06-06 17:14:49 +02004067 int ret;
Ilya Dryomoved95b212016-08-12 16:40:02 +02004068
4069 dout("%s rbd_dev %p\n", __func__, rbd_dev);
4070again:
Ilya Dryomov637cd062019-06-06 17:14:49 +02004071 ret = rbd_try_acquire_lock(rbd_dev);
4072 if (ret <= 0) {
4073 dout("%s rbd_dev %p ret %d - done\n", __func__, rbd_dev, ret);
Ilya Dryomoved95b212016-08-12 16:40:02 +02004074 return;
4075 }
4076
4077 ret = rbd_request_lock(rbd_dev);
4078 if (ret == -ETIMEDOUT) {
4079 goto again; /* treat this as a dead client */
Ilya Dryomove010dd02017-04-13 12:17:39 +02004080 } else if (ret == -EROFS) {
4081 rbd_warn(rbd_dev, "peer will not release lock");
Ilya Dryomov637cd062019-06-06 17:14:49 +02004082 down_write(&rbd_dev->lock_rwsem);
4083 wake_lock_waiters(rbd_dev, ret);
4084 up_write(&rbd_dev->lock_rwsem);
Ilya Dryomoved95b212016-08-12 16:40:02 +02004085 } else if (ret < 0) {
4086 rbd_warn(rbd_dev, "error requesting lock: %d", ret);
4087 mod_delayed_work(rbd_dev->task_wq, &rbd_dev->lock_dwork,
4088 RBD_RETRY_DELAY);
4089 } else {
4090 /*
4091 * lock owner acked, but resend if we don't see them
4092 * release the lock
4093 */
Colin Ian King6b0a8772019-11-07 22:36:46 +00004094 dout("%s rbd_dev %p requeuing lock_dwork\n", __func__,
Ilya Dryomoved95b212016-08-12 16:40:02 +02004095 rbd_dev);
4096 mod_delayed_work(rbd_dev->task_wq, &rbd_dev->lock_dwork,
4097 msecs_to_jiffies(2 * RBD_NOTIFY_TIMEOUT * MSEC_PER_SEC));
4098 }
4099}
4100
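/*
 * Transition to RBD_LOCK_STATE_RELEASING and wait for requests on the
 * running_list to drain.  Returns false if we didn't own the lock to
 * begin with (or it changed state while waiting), true once the lock is
 * quiesced and safe to release.
 *
 * lock_rwsem must be held for write on entry and is again held for write
 * on return.
 */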
Ilya Dryomova2b1da02019-05-30 11:15:23 +02004101static bool rbd_quiesce_lock(struct rbd_device *rbd_dev)
Ilya Dryomoved95b212016-08-12 16:40:02 +02004102{
Ilya Dryomove1fddc82019-05-30 16:07:48 +02004103 bool need_wait;
4104
Ilya Dryomova2b1da02019-05-30 11:15:23 +02004105 dout("%s rbd_dev %p\n", __func__, rbd_dev);
Linus Torvaldsd9b9c892019-07-18 11:05:25 -07004106 lockdep_assert_held_write(&rbd_dev->lock_rwsem);
Ilya Dryomova2b1da02019-05-30 11:15:23 +02004107
Ilya Dryomoved95b212016-08-12 16:40:02 +02004108 if (rbd_dev->lock_state != RBD_LOCK_STATE_LOCKED)
4109 return false;
4110
Ilya Dryomoved95b212016-08-12 16:40:02 +02004111 /*
4112 * Ensure that all in-flight IO is flushed.
Ilya Dryomoved95b212016-08-12 16:40:02 +02004113 */
Ilya Dryomove1fddc82019-05-30 16:07:48 +02004114 rbd_dev->lock_state = RBD_LOCK_STATE_RELEASING;
4115 rbd_assert(!completion_done(&rbd_dev->releasing_wait));
4116 need_wait = !list_empty(&rbd_dev->running_list);
4117 downgrade_write(&rbd_dev->lock_rwsem);
4118 if (need_wait)
4119 wait_for_completion(&rbd_dev->releasing_wait);
Ilya Dryomoved95b212016-08-12 16:40:02 +02004120 up_read(&rbd_dev->lock_rwsem);
4121
4122 down_write(&rbd_dev->lock_rwsem);
Ilya Dryomoved95b212016-08-12 16:40:02 +02004123 if (rbd_dev->lock_state != RBD_LOCK_STATE_RELEASING)
4124 return false;
4125
Ilya Dryomove1fddc82019-05-30 16:07:48 +02004126 rbd_assert(list_empty(&rbd_dev->running_list));
Ilya Dryomova2b1da02019-05-30 11:15:23 +02004127 return true;
4128}
4129
Ilya Dryomov22e8bd52019-06-05 19:25:11 +02004130static void rbd_pre_release_action(struct rbd_device *rbd_dev)
4131{
4132 if (rbd_dev->header.features & RBD_FEATURE_OBJECT_MAP)
4133 rbd_object_map_close(rbd_dev);
4134}
4135
Ilya Dryomove1fddc82019-05-30 16:07:48 +02004136static void __rbd_release_lock(struct rbd_device *rbd_dev)
4137{
4138 rbd_assert(list_empty(&rbd_dev->running_list));
4139
Ilya Dryomov22e8bd52019-06-05 19:25:11 +02004140 rbd_pre_release_action(rbd_dev);
Ilya Dryomovbbead742017-04-13 12:17:38 +02004141 rbd_unlock(rbd_dev);
Ilya Dryomove1fddc82019-05-30 16:07:48 +02004142}
4143
Ilya Dryomova2b1da02019-05-30 11:15:23 +02004144/*
4145 * lock_rwsem must be held for write
4146 */
4147static void rbd_release_lock(struct rbd_device *rbd_dev)
4148{
4149 if (!rbd_quiesce_lock(rbd_dev))
4150 return;
4151
Ilya Dryomove1fddc82019-05-30 16:07:48 +02004152 __rbd_release_lock(rbd_dev);
Ilya Dryomova2b1da02019-05-30 11:15:23 +02004153
Ilya Dryomovbbead742017-04-13 12:17:38 +02004154 /*
4155 * Give others a chance to grab the lock - we would re-acquire
Ilya Dryomov637cd062019-06-06 17:14:49 +02004156 * almost immediately if we got new IO while draining the running
4157 * list otherwise. We need to ack our own notifications, so this
4158 * lock_dwork will be requeued from rbd_handle_released_lock() by
4159 * way of maybe_kick_acquire().
Ilya Dryomovbbead742017-04-13 12:17:38 +02004160 */
4161 cancel_delayed_work(&rbd_dev->lock_dwork);
Ilya Dryomoved95b212016-08-12 16:40:02 +02004162}
4163
4164static void rbd_release_lock_work(struct work_struct *work)
4165{
4166 struct rbd_device *rbd_dev = container_of(work, struct rbd_device,
4167 unlock_work);
4168
4169 down_write(&rbd_dev->lock_rwsem);
4170 rbd_release_lock(rbd_dev);
4171 up_write(&rbd_dev->lock_rwsem);
4172}
4173
Ilya Dryomov637cd062019-06-06 17:14:49 +02004174static void maybe_kick_acquire(struct rbd_device *rbd_dev)
4175{
4176 bool have_requests;
4177
4178 dout("%s rbd_dev %p\n", __func__, rbd_dev);
4179 if (__rbd_is_lock_owner(rbd_dev))
4180 return;
4181
4182 spin_lock(&rbd_dev->lock_lists_lock);
4183 have_requests = !list_empty(&rbd_dev->acquiring_list);
4184 spin_unlock(&rbd_dev->lock_lists_lock);
4185 if (have_requests || delayed_work_pending(&rbd_dev->lock_dwork)) {
4186 dout("%s rbd_dev %p kicking lock_dwork\n", __func__, rbd_dev);
4187 mod_delayed_work(rbd_dev->task_wq, &rbd_dev->lock_dwork, 0);
4188 }
4189}
4190
Ilya Dryomoved95b212016-08-12 16:40:02 +02004191static void rbd_handle_acquired_lock(struct rbd_device *rbd_dev, u8 struct_v,
4192 void **p)
4193{
4194 struct rbd_client_id cid = { 0 };
4195
4196 if (struct_v >= 2) {
4197 cid.gid = ceph_decode_64(p);
4198 cid.handle = ceph_decode_64(p);
4199 }
4200
4201 dout("%s rbd_dev %p cid %llu-%llu\n", __func__, rbd_dev, cid.gid,
4202 cid.handle);
4203 if (!rbd_cid_equal(&cid, &rbd_empty_cid)) {
4204 down_write(&rbd_dev->lock_rwsem);
4205 if (rbd_cid_equal(&cid, &rbd_dev->owner_cid)) {
4206 /*
4207 * we already know that the remote client is
4208 * the owner
4209 */
4210 up_write(&rbd_dev->lock_rwsem);
4211 return;
4212 }
4213
4214 rbd_set_owner_cid(rbd_dev, &cid);
4215 downgrade_write(&rbd_dev->lock_rwsem);
4216 } else {
4217 down_read(&rbd_dev->lock_rwsem);
4218 }
4219
Ilya Dryomov637cd062019-06-06 17:14:49 +02004220 maybe_kick_acquire(rbd_dev);
Ilya Dryomoved95b212016-08-12 16:40:02 +02004221 up_read(&rbd_dev->lock_rwsem);
4222}
4223
4224static void rbd_handle_released_lock(struct rbd_device *rbd_dev, u8 struct_v,
4225 void **p)
4226{
4227 struct rbd_client_id cid = { 0 };
4228
4229 if (struct_v >= 2) {
4230 cid.gid = ceph_decode_64(p);
4231 cid.handle = ceph_decode_64(p);
4232 }
4233
4234 dout("%s rbd_dev %p cid %llu-%llu\n", __func__, rbd_dev, cid.gid,
4235 cid.handle);
4236 if (!rbd_cid_equal(&cid, &rbd_empty_cid)) {
4237 down_write(&rbd_dev->lock_rwsem);
4238 if (!rbd_cid_equal(&cid, &rbd_dev->owner_cid)) {
4239 dout("%s rbd_dev %p unexpected owner, cid %llu-%llu != owner_cid %llu-%llu\n",
4240 __func__, rbd_dev, cid.gid, cid.handle,
4241 rbd_dev->owner_cid.gid, rbd_dev->owner_cid.handle);
4242 up_write(&rbd_dev->lock_rwsem);
4243 return;
4244 }
4245
4246 rbd_set_owner_cid(rbd_dev, &rbd_empty_cid);
4247 downgrade_write(&rbd_dev->lock_rwsem);
4248 } else {
4249 down_read(&rbd_dev->lock_rwsem);
4250 }
4251
Ilya Dryomov637cd062019-06-06 17:14:49 +02004252 maybe_kick_acquire(rbd_dev);
Ilya Dryomoved95b212016-08-12 16:40:02 +02004253 up_read(&rbd_dev->lock_rwsem);
4254}
4255
Ilya Dryomov3b77faa2017-04-13 12:17:39 +02004256/*
4257 * Returns result for ResponseMessage to be encoded (<= 0), or 1 if no
4258 * ResponseMessage is needed.
4259 */
4260static int rbd_handle_request_lock(struct rbd_device *rbd_dev, u8 struct_v,
4261 void **p)
Ilya Dryomoved95b212016-08-12 16:40:02 +02004262{
4263 struct rbd_client_id my_cid = rbd_get_cid(rbd_dev);
4264 struct rbd_client_id cid = { 0 };
Ilya Dryomov3b77faa2017-04-13 12:17:39 +02004265 int result = 1;
Ilya Dryomoved95b212016-08-12 16:40:02 +02004266
4267 if (struct_v >= 2) {
4268 cid.gid = ceph_decode_64(p);
4269 cid.handle = ceph_decode_64(p);
4270 }
4271
4272 dout("%s rbd_dev %p cid %llu-%llu\n", __func__, rbd_dev, cid.gid,
4273 cid.handle);
4274 if (rbd_cid_equal(&cid, &my_cid))
Ilya Dryomov3b77faa2017-04-13 12:17:39 +02004275 return result;
Ilya Dryomoved95b212016-08-12 16:40:02 +02004276
4277 down_read(&rbd_dev->lock_rwsem);
Ilya Dryomov3b77faa2017-04-13 12:17:39 +02004278 if (__rbd_is_lock_owner(rbd_dev)) {
4279 if (rbd_dev->lock_state == RBD_LOCK_STATE_LOCKED &&
4280 rbd_cid_equal(&rbd_dev->owner_cid, &rbd_empty_cid))
4281 goto out_unlock;
4282
4283 /*
4284 * encode ResponseMessage(0) so the peer can detect
4285 * a missing owner
4286 */
4287 result = 0;
4288
4289 if (rbd_dev->lock_state == RBD_LOCK_STATE_LOCKED) {
Ilya Dryomove010dd02017-04-13 12:17:39 +02004290 if (!rbd_dev->opts->exclusive) {
4291 dout("%s rbd_dev %p queueing unlock_work\n",
4292 __func__, rbd_dev);
4293 queue_work(rbd_dev->task_wq,
4294 &rbd_dev->unlock_work);
4295 } else {
4296 /* refuse to release the lock */
4297 result = -EROFS;
4298 }
Ilya Dryomoved95b212016-08-12 16:40:02 +02004299 }
4300 }
Ilya Dryomov3b77faa2017-04-13 12:17:39 +02004301
4302out_unlock:
Ilya Dryomoved95b212016-08-12 16:40:02 +02004303 up_read(&rbd_dev->lock_rwsem);
Ilya Dryomov3b77faa2017-04-13 12:17:39 +02004304 return result;
Ilya Dryomoved95b212016-08-12 16:40:02 +02004305}
4306
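/*
 * Ack a notify, optionally embedding a ResponseMessage result.  When a
 * result is supplied the payload is, assuming the usual ceph start-block
 * encoding (struct_v, struct_compat, 32-bit length):
 *
 *	u8  struct_v = 1
 *	u8  struct_compat = 1
 *	u32 len = 4
 *	s32 result
 *
 * i.e. 4 + CEPH_ENCODING_START_BLK_LEN bytes in total; without a result
 * the ack carries an empty payload.
 */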
4307static void __rbd_acknowledge_notify(struct rbd_device *rbd_dev,
4308 u64 notify_id, u64 cookie, s32 *result)
4309{
4310 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
Kyle Spiers08a79102018-03-17 09:44:01 -07004311 char buf[4 + CEPH_ENCODING_START_BLK_LEN];
4312 int buf_size = sizeof(buf);
Ilya Dryomoved95b212016-08-12 16:40:02 +02004313 int ret;
4314
4315 if (result) {
4316 void *p = buf;
4317
4318 /* encode ResponseMessage */
4319 ceph_start_encoding(&p, 1, 1,
4320 buf_size - CEPH_ENCODING_START_BLK_LEN);
4321 ceph_encode_32(&p, *result);
4322 } else {
4323 buf_size = 0;
4324 }
4325
4326 ret = ceph_osdc_notify_ack(osdc, &rbd_dev->header_oid,
4327 &rbd_dev->header_oloc, notify_id, cookie,
4328 buf, buf_size);
4329 if (ret)
4330 rbd_warn(rbd_dev, "acknowledge_notify failed: %d", ret);
4331}
4332
4333static void rbd_acknowledge_notify(struct rbd_device *rbd_dev, u64 notify_id,
4334 u64 cookie)
4335{
4336 dout("%s rbd_dev %p\n", __func__, rbd_dev);
4337 __rbd_acknowledge_notify(rbd_dev, notify_id, cookie, NULL);
4338}
4339
4340static void rbd_acknowledge_notify_result(struct rbd_device *rbd_dev,
4341 u64 notify_id, u64 cookie, s32 result)
4342{
4343 dout("%s rbd_dev %p result %d\n", __func__, rbd_dev, result);
4344 __rbd_acknowledge_notify(rbd_dev, notify_id, cookie, &result);
4345}
Ilya Dryomov922dab62016-05-26 01:15:02 +02004346
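/*
 * Watch/notify callback.  A non-empty NotifyMessage payload decodes as,
 * assuming the usual ceph start-block encoding:
 *
 *	u8  struct_v, u8 struct_compat, u32 len	(start block)
 *	u32 notify_op				(RBD_NOTIFY_OP_*)
 *	op-specific data; for the lock notifications with struct_v >= 2
 *	this is the sender's client id as two u64s (gid, handle)
 *
 * An empty payload is treated as a legacy header-update notification.
 */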
4347static void rbd_watch_cb(void *arg, u64 notify_id, u64 cookie,
4348 u64 notifier_id, void *data, size_t data_len)
Alex Elderb8d70032012-11-30 17:53:04 -06004349{
Ilya Dryomov922dab62016-05-26 01:15:02 +02004350 struct rbd_device *rbd_dev = arg;
Ilya Dryomoved95b212016-08-12 16:40:02 +02004351 void *p = data;
4352 void *const end = p + data_len;
Ilya Dryomovd4c22692016-09-06 11:15:48 +02004353 u8 struct_v = 0;
Ilya Dryomoved95b212016-08-12 16:40:02 +02004354 u32 len;
4355 u32 notify_op;
Alex Elderb8d70032012-11-30 17:53:04 -06004356 int ret;
4357
Ilya Dryomoved95b212016-08-12 16:40:02 +02004358 dout("%s rbd_dev %p cookie %llu notify_id %llu data_len %zu\n",
4359 __func__, rbd_dev, cookie, notify_id, data_len);
4360 if (data_len) {
4361 ret = ceph_start_decoding(&p, end, 1, "NotifyMessage",
4362 &struct_v, &len);
4363 if (ret) {
4364 rbd_warn(rbd_dev, "failed to decode NotifyMessage: %d",
4365 ret);
4366 return;
4367 }
Ilya Dryomov52bb1f92014-07-23 17:11:20 +04004368
Ilya Dryomoved95b212016-08-12 16:40:02 +02004369 notify_op = ceph_decode_32(&p);
4370 } else {
4371 /* legacy notification for header updates */
4372 notify_op = RBD_NOTIFY_OP_HEADER_UPDATE;
4373 len = 0;
4374 }
Alex Elderb8d70032012-11-30 17:53:04 -06004375
Ilya Dryomoved95b212016-08-12 16:40:02 +02004376 dout("%s rbd_dev %p notify_op %u\n", __func__, rbd_dev, notify_op);
4377 switch (notify_op) {
4378 case RBD_NOTIFY_OP_ACQUIRED_LOCK:
4379 rbd_handle_acquired_lock(rbd_dev, struct_v, &p);
4380 rbd_acknowledge_notify(rbd_dev, notify_id, cookie);
4381 break;
4382 case RBD_NOTIFY_OP_RELEASED_LOCK:
4383 rbd_handle_released_lock(rbd_dev, struct_v, &p);
4384 rbd_acknowledge_notify(rbd_dev, notify_id, cookie);
4385 break;
4386 case RBD_NOTIFY_OP_REQUEST_LOCK:
Ilya Dryomov3b77faa2017-04-13 12:17:39 +02004387 ret = rbd_handle_request_lock(rbd_dev, struct_v, &p);
4388 if (ret <= 0)
Ilya Dryomoved95b212016-08-12 16:40:02 +02004389 rbd_acknowledge_notify_result(rbd_dev, notify_id,
Ilya Dryomov3b77faa2017-04-13 12:17:39 +02004390 cookie, ret);
Ilya Dryomoved95b212016-08-12 16:40:02 +02004391 else
4392 rbd_acknowledge_notify(rbd_dev, notify_id, cookie);
4393 break;
4394 case RBD_NOTIFY_OP_HEADER_UPDATE:
4395 ret = rbd_dev_refresh(rbd_dev);
4396 if (ret)
4397 rbd_warn(rbd_dev, "refresh failed: %d", ret);
4398
4399 rbd_acknowledge_notify(rbd_dev, notify_id, cookie);
4400 break;
4401 default:
4402 if (rbd_is_lock_owner(rbd_dev))
4403 rbd_acknowledge_notify_result(rbd_dev, notify_id,
4404 cookie, -EOPNOTSUPP);
4405 else
4406 rbd_acknowledge_notify(rbd_dev, notify_id, cookie);
4407 break;
4408 }
Alex Elderb8d70032012-11-30 17:53:04 -06004409}
4410
Ilya Dryomov99d16942016-08-12 16:11:41 +02004411static void __rbd_unregister_watch(struct rbd_device *rbd_dev);
4412
Ilya Dryomov922dab62016-05-26 01:15:02 +02004413static void rbd_watch_errcb(void *arg, u64 cookie, int err)
Ilya Dryomovbb040aa2014-06-19 11:38:14 +04004414{
Ilya Dryomov922dab62016-05-26 01:15:02 +02004415 struct rbd_device *rbd_dev = arg;
Ilya Dryomovbb040aa2014-06-19 11:38:14 +04004416
Ilya Dryomov922dab62016-05-26 01:15:02 +02004417 rbd_warn(rbd_dev, "encountered watch error: %d", err);
Ilya Dryomovbb040aa2014-06-19 11:38:14 +04004418
Ilya Dryomoved95b212016-08-12 16:40:02 +02004419 down_write(&rbd_dev->lock_rwsem);
4420 rbd_set_owner_cid(rbd_dev, &rbd_empty_cid);
4421 up_write(&rbd_dev->lock_rwsem);
Ilya Dryomovbb040aa2014-06-19 11:38:14 +04004422
Ilya Dryomov99d16942016-08-12 16:11:41 +02004423 mutex_lock(&rbd_dev->watch_mutex);
4424 if (rbd_dev->watch_state == RBD_WATCH_STATE_REGISTERED) {
4425 __rbd_unregister_watch(rbd_dev);
4426 rbd_dev->watch_state = RBD_WATCH_STATE_ERROR;
Ilya Dryomovbb040aa2014-06-19 11:38:14 +04004427
Ilya Dryomov99d16942016-08-12 16:11:41 +02004428 queue_delayed_work(rbd_dev->task_wq, &rbd_dev->watch_dwork, 0);
Ilya Dryomovbb040aa2014-06-19 11:38:14 +04004429 }
Ilya Dryomov99d16942016-08-12 16:11:41 +02004430 mutex_unlock(&rbd_dev->watch_mutex);
Ilya Dryomovbb040aa2014-06-19 11:38:14 +04004431}
4432
4433/*
Ilya Dryomov99d16942016-08-12 16:11:41 +02004434 * watch_mutex must be locked
Alex Elder9969ebc2013-01-18 12:31:10 -06004435 */
Ilya Dryomov99d16942016-08-12 16:11:41 +02004436static int __rbd_register_watch(struct rbd_device *rbd_dev)
Alex Elder9969ebc2013-01-18 12:31:10 -06004437{
4438 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
Ilya Dryomov922dab62016-05-26 01:15:02 +02004439 struct ceph_osd_linger_request *handle;
Alex Elder9969ebc2013-01-18 12:31:10 -06004440
Ilya Dryomov922dab62016-05-26 01:15:02 +02004441 rbd_assert(!rbd_dev->watch_handle);
Ilya Dryomov99d16942016-08-12 16:11:41 +02004442 dout("%s rbd_dev %p\n", __func__, rbd_dev);
Alex Elder9969ebc2013-01-18 12:31:10 -06004443
Ilya Dryomov922dab62016-05-26 01:15:02 +02004444 handle = ceph_osdc_watch(osdc, &rbd_dev->header_oid,
4445 &rbd_dev->header_oloc, rbd_watch_cb,
4446 rbd_watch_errcb, rbd_dev);
4447 if (IS_ERR(handle))
4448 return PTR_ERR(handle);
Alex Elder9969ebc2013-01-18 12:31:10 -06004449
Ilya Dryomov922dab62016-05-26 01:15:02 +02004450 rbd_dev->watch_handle = handle;
Ilya Dryomovb30a01f2014-05-22 19:28:52 +04004451 return 0;
Alex Elder9969ebc2013-01-18 12:31:10 -06004452}
4453
Ilya Dryomov99d16942016-08-12 16:11:41 +02004454/*
4455 * watch_mutex must be locked
4456 */
4457static void __rbd_unregister_watch(struct rbd_device *rbd_dev)
Ilya Dryomovfca27062013-12-16 18:02:40 +02004458{
Ilya Dryomov922dab62016-05-26 01:15:02 +02004459 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
4460 int ret;
Ilya Dryomovb30a01f2014-05-22 19:28:52 +04004461
Ilya Dryomov99d16942016-08-12 16:11:41 +02004462 rbd_assert(rbd_dev->watch_handle);
4463 dout("%s rbd_dev %p\n", __func__, rbd_dev);
Ilya Dryomovb30a01f2014-05-22 19:28:52 +04004464
Ilya Dryomov922dab62016-05-26 01:15:02 +02004465 ret = ceph_osdc_unwatch(osdc, rbd_dev->watch_handle);
4466 if (ret)
4467 rbd_warn(rbd_dev, "failed to unwatch: %d", ret);
Ilya Dryomovb30a01f2014-05-22 19:28:52 +04004468
Ilya Dryomov922dab62016-05-26 01:15:02 +02004469 rbd_dev->watch_handle = NULL;
Ilya Dryomovc525f032016-04-28 16:07:26 +02004470}
4471
Ilya Dryomov99d16942016-08-12 16:11:41 +02004472static int rbd_register_watch(struct rbd_device *rbd_dev)
Ilya Dryomovc525f032016-04-28 16:07:26 +02004473{
Ilya Dryomov99d16942016-08-12 16:11:41 +02004474 int ret;
Ilya Dryomov811c6682016-04-15 16:22:16 +02004475
Ilya Dryomov99d16942016-08-12 16:11:41 +02004476 mutex_lock(&rbd_dev->watch_mutex);
4477 rbd_assert(rbd_dev->watch_state == RBD_WATCH_STATE_UNREGISTERED);
4478 ret = __rbd_register_watch(rbd_dev);
4479 if (ret)
4480 goto out;
4481
4482 rbd_dev->watch_state = RBD_WATCH_STATE_REGISTERED;
4483 rbd_dev->watch_cookie = rbd_dev->watch_handle->linger_id;
4484
4485out:
4486 mutex_unlock(&rbd_dev->watch_mutex);
4487 return ret;
4488}
4489
4490static void cancel_tasks_sync(struct rbd_device *rbd_dev)
4491{
4492 dout("%s rbd_dev %p\n", __func__, rbd_dev);
4493
Ilya Dryomoved95b212016-08-12 16:40:02 +02004494 cancel_work_sync(&rbd_dev->acquired_lock_work);
4495 cancel_work_sync(&rbd_dev->released_lock_work);
4496 cancel_delayed_work_sync(&rbd_dev->lock_dwork);
4497 cancel_work_sync(&rbd_dev->unlock_work);
Ilya Dryomov99d16942016-08-12 16:11:41 +02004498}
4499
Ilya Dryomov0e4e1de52020-03-13 11:20:51 +01004500/*
4501 * header_rwsem must not be held to avoid a deadlock with
4502 * rbd_dev_refresh() when flushing notifies.
4503 */
Ilya Dryomov99d16942016-08-12 16:11:41 +02004504static void rbd_unregister_watch(struct rbd_device *rbd_dev)
4505{
4506 cancel_tasks_sync(rbd_dev);
4507
4508 mutex_lock(&rbd_dev->watch_mutex);
4509 if (rbd_dev->watch_state == RBD_WATCH_STATE_REGISTERED)
4510 __rbd_unregister_watch(rbd_dev);
4511 rbd_dev->watch_state = RBD_WATCH_STATE_UNREGISTERED;
4512 mutex_unlock(&rbd_dev->watch_mutex);
4513
Dongsheng Yang23edca82018-06-04 06:24:37 -04004514 cancel_delayed_work_sync(&rbd_dev->watch_dwork);
Ilya Dryomov811c6682016-04-15 16:22:16 +02004515 ceph_osdc_flush_notifies(&rbd_dev->rbd_client->client->osdc);
Ilya Dryomovfca27062013-12-16 18:02:40 +02004516}
4517
Ilya Dryomov14bb2112017-04-13 12:17:38 +02004518/*
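 * Called from rbd_reregister_watch() after the watch has been
 * re-established while we still think we own the lock: re-assert
 * ownership by updating the lock cookie to match the new watch.
 *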
4519 * lock_rwsem must be held for write
4520 */
4521static void rbd_reacquire_lock(struct rbd_device *rbd_dev)
4522{
4523 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
4524 char cookie[32];
4525 int ret;
4526
Ilya Dryomova2b1da02019-05-30 11:15:23 +02004527 if (!rbd_quiesce_lock(rbd_dev))
4528 return;
Ilya Dryomov14bb2112017-04-13 12:17:38 +02004529
4530 format_lock_cookie(rbd_dev, cookie);
4531 ret = ceph_cls_set_cookie(osdc, &rbd_dev->header_oid,
4532 &rbd_dev->header_oloc, RBD_LOCK_NAME,
4533 CEPH_CLS_LOCK_EXCLUSIVE, rbd_dev->lock_cookie,
4534 RBD_LOCK_TAG, cookie);
4535 if (ret) {
4536 if (ret != -EOPNOTSUPP)
4537 rbd_warn(rbd_dev, "failed to update lock cookie: %d",
4538 ret);
4539
4540 /*
4541 * Lock cookie cannot be updated on older OSDs, so do
4542 * a manual release and queue an acquire.
4543 */
Ilya Dryomove1fddc82019-05-30 16:07:48 +02004544 __rbd_release_lock(rbd_dev);
Ilya Dryomova2b1da02019-05-30 11:15:23 +02004545 queue_delayed_work(rbd_dev->task_wq, &rbd_dev->lock_dwork, 0);
Ilya Dryomov14bb2112017-04-13 12:17:38 +02004546 } else {
Florian Margaineedd8ca82017-12-13 16:43:59 +01004547 __rbd_lock(rbd_dev, cookie);
Ilya Dryomov637cd062019-06-06 17:14:49 +02004548 wake_lock_waiters(rbd_dev, 0);
Ilya Dryomov14bb2112017-04-13 12:17:38 +02004549 }
4550}
4551
Ilya Dryomov99d16942016-08-12 16:11:41 +02004552static void rbd_reregister_watch(struct work_struct *work)
4553{
4554 struct rbd_device *rbd_dev = container_of(to_delayed_work(work),
4555 struct rbd_device, watch_dwork);
4556 int ret;
4557
4558 dout("%s rbd_dev %p\n", __func__, rbd_dev);
4559
4560 mutex_lock(&rbd_dev->watch_mutex);
Ilya Dryomov87c0fde2016-09-29 13:41:05 +02004561 if (rbd_dev->watch_state != RBD_WATCH_STATE_ERROR) {
4562 mutex_unlock(&rbd_dev->watch_mutex);
Ilya Dryomov14bb2112017-04-13 12:17:38 +02004563 return;
Ilya Dryomov87c0fde2016-09-29 13:41:05 +02004564 }
Ilya Dryomov99d16942016-08-12 16:11:41 +02004565
4566 ret = __rbd_register_watch(rbd_dev);
4567 if (ret) {
4568 rbd_warn(rbd_dev, "failed to reregister watch: %d", ret);
Ilya Dryomov0b98acd2020-09-14 13:39:19 +02004569 if (ret != -EBLOCKLISTED && ret != -ENOENT) {
Ilya Dryomov99d16942016-08-12 16:11:41 +02004570 queue_delayed_work(rbd_dev->task_wq,
4571 &rbd_dev->watch_dwork,
4572 RBD_RETRY_DELAY);
Ilya Dryomov637cd062019-06-06 17:14:49 +02004573 mutex_unlock(&rbd_dev->watch_mutex);
4574 return;
Ilya Dryomov87c0fde2016-09-29 13:41:05 +02004575 }
Ilya Dryomov637cd062019-06-06 17:14:49 +02004576
Ilya Dryomov87c0fde2016-09-29 13:41:05 +02004577 mutex_unlock(&rbd_dev->watch_mutex);
Ilya Dryomov637cd062019-06-06 17:14:49 +02004578 down_write(&rbd_dev->lock_rwsem);
4579 wake_lock_waiters(rbd_dev, ret);
4580 up_write(&rbd_dev->lock_rwsem);
Ilya Dryomov14bb2112017-04-13 12:17:38 +02004581 return;
Ilya Dryomov99d16942016-08-12 16:11:41 +02004582 }
4583
4584 rbd_dev->watch_state = RBD_WATCH_STATE_REGISTERED;
4585 rbd_dev->watch_cookie = rbd_dev->watch_handle->linger_id;
4586 mutex_unlock(&rbd_dev->watch_mutex);
4587
Ilya Dryomov14bb2112017-04-13 12:17:38 +02004588 down_write(&rbd_dev->lock_rwsem);
4589 if (rbd_dev->lock_state == RBD_LOCK_STATE_LOCKED)
4590 rbd_reacquire_lock(rbd_dev);
4591 up_write(&rbd_dev->lock_rwsem);
4592
Ilya Dryomov99d16942016-08-12 16:11:41 +02004593 ret = rbd_dev_refresh(rbd_dev);
4594 if (ret)
Colin Ian Kingf6870cc2018-03-19 13:33:10 +00004595 rbd_warn(rbd_dev, "reregistration refresh failed: %d", ret);
Ilya Dryomov99d16942016-08-12 16:11:41 +02004596}
4597
Alex Elder36be9a72013-01-19 00:30:28 -06004598/*
Alex Elderf40eb342013-04-25 15:09:42 -05004599 * Synchronous osd object method call. Returns the number of bytes
4600 * returned in the inbound buffer, or a negative error code.
Alex Elder36be9a72013-01-19 00:30:28 -06004601 */
4602static int rbd_obj_method_sync(struct rbd_device *rbd_dev,
Ilya Dryomovecd4a682017-01-25 18:16:21 +01004603 struct ceph_object_id *oid,
4604 struct ceph_object_locator *oloc,
Alex Elder36be9a72013-01-19 00:30:28 -06004605 const char *method_name,
Alex Elder41579762013-04-21 12:14:45 -05004606 const void *outbound,
Alex Elder36be9a72013-01-19 00:30:28 -06004607 size_t outbound_size,
Alex Elder41579762013-04-21 12:14:45 -05004608 void *inbound,
Alex Eldere2a58ee2013-04-30 00:44:33 -05004609 size_t inbound_size)
Alex Elder36be9a72013-01-19 00:30:28 -06004610{
Ilya Dryomovecd4a682017-01-25 18:16:21 +01004611 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
4612 struct page *req_page = NULL;
4613 struct page *reply_page;
Alex Elder36be9a72013-01-19 00:30:28 -06004614 int ret;
4615
4616 /*
Alex Elder6010a452013-04-05 01:27:11 -05004617 * Method calls are ultimately read operations. The result
4618 * should be placed into the inbound buffer provided. They
4619 * also supply outbound data--parameters for the object
4620 * method. Currently if this is present it will be a
4621 * snapshot id.
Alex Elder36be9a72013-01-19 00:30:28 -06004622 */
Ilya Dryomovecd4a682017-01-25 18:16:21 +01004623 if (outbound) {
4624 if (outbound_size > PAGE_SIZE)
4625 return -E2BIG;
Alex Elder36be9a72013-01-19 00:30:28 -06004626
Ilya Dryomovecd4a682017-01-25 18:16:21 +01004627 req_page = alloc_page(GFP_KERNEL);
4628 if (!req_page)
4629 return -ENOMEM;
Alex Elder36be9a72013-01-19 00:30:28 -06004630
Ilya Dryomovecd4a682017-01-25 18:16:21 +01004631 memcpy(page_address(req_page), outbound, outbound_size);
Alex Elder04017e22013-04-05 14:46:02 -05004632 }
Alex Elder430c28c2013-04-03 21:32:51 -05004633
Ilya Dryomovecd4a682017-01-25 18:16:21 +01004634 reply_page = alloc_page(GFP_KERNEL);
4635 if (!reply_page) {
4636 if (req_page)
4637 __free_page(req_page);
4638 return -ENOMEM;
4639 }
Alex Elder36be9a72013-01-19 00:30:28 -06004640
Ilya Dryomovecd4a682017-01-25 18:16:21 +01004641 ret = ceph_osdc_call(osdc, oid, oloc, RBD_DRV_NAME, method_name,
4642 CEPH_OSD_FLAG_READ, req_page, outbound_size,
Ilya Dryomov68ada912019-06-14 18:16:51 +02004643 &reply_page, &inbound_size);
Ilya Dryomovecd4a682017-01-25 18:16:21 +01004644 if (!ret) {
4645 memcpy(inbound, page_address(reply_page), inbound_size);
4646 ret = inbound_size;
4647 }
Alex Elder57385b52013-04-21 12:14:45 -05004648
Ilya Dryomovecd4a682017-01-25 18:16:21 +01004649 if (req_page)
4650 __free_page(req_page);
4651 __free_page(reply_page);
Alex Elder36be9a72013-01-19 00:30:28 -06004652 return ret;
4653}
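
/*
 * Illustrative sketch only, not used by the driver: how a caller might
 * wrap rbd_obj_method_sync() for a class method such as "get_size" that
 * takes a snapshot id and returns a small fixed-size reply.  The helper
 * name is hypothetical; the call pattern mirrors how the v2 image helpers
 * elsewhere in this file use rbd_obj_method_sync().
 */
static int __maybe_unused rbd_example_get_size(struct rbd_device *rbd_dev,
					       u64 snap_id, u64 *size)
{
	__le64 snapid = cpu_to_le64(snap_id);
	struct {
		u8 order;
		__le64 size;
	} __attribute__ ((packed)) size_buf = { 0 };
	int ret;

	ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
				  &rbd_dev->header_oloc, "get_size",
				  &snapid, sizeof(snapid),
				  &size_buf, sizeof(size_buf));
	if (ret < 0)
		return ret;
	if (ret < sizeof(size_buf))
		return -ERANGE;

	*size = le64_to_cpu(size_buf.size);
	return 0;
}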
4654
Christoph Hellwig7ad18af2015-01-13 17:20:04 +01004655static void rbd_queue_workfn(struct work_struct *work)
Ilya Dryomovbc1ecc62014-08-04 18:04:39 +04004656{
Ilya Dryomov59e542c2020-02-12 15:23:58 +01004657 struct rbd_img_request *img_request =
4658 container_of(work, struct rbd_img_request, work);
4659 struct rbd_device *rbd_dev = img_request->rbd_dev;
4660 enum obj_operation_type op_type = img_request->op_type;
4661 struct request *rq = blk_mq_rq_from_pdu(img_request);
Ilya Dryomovbc1ecc62014-08-04 18:04:39 +04004662 u64 offset = (u64)blk_rq_pos(rq) << SECTOR_SHIFT;
4663 u64 length = blk_rq_bytes(rq);
Josh Durgin4e752f02014-04-08 11:12:11 -07004664 u64 mapping_size;
Ilya Dryomovbc1ecc62014-08-04 18:04:39 +04004665 int result;
4666
4667 /* Ignore/skip any zero-length requests */
Ilya Dryomovbc1ecc62014-08-04 18:04:39 +04004668 if (!length) {
4669 dout("%s: zero-length request\n", __func__);
4670 result = 0;
Ilya Dryomov59e542c2020-02-12 15:23:58 +01004671 goto err_img_request;
Ilya Dryomovbc1ecc62014-08-04 18:04:39 +04004672 }
4673
Christoph Hellwig7ad18af2015-01-13 17:20:04 +01004674 blk_mq_start_request(rq);
4675
Josh Durgin4e752f02014-04-08 11:12:11 -07004676 down_read(&rbd_dev->header_rwsem);
4677 mapping_size = rbd_dev->mapping.size;
Ilya Dryomova52cc682020-02-12 15:08:39 +01004678 rbd_img_capture_header(img_request);
Josh Durgin4e752f02014-04-08 11:12:11 -07004679 up_read(&rbd_dev->header_rwsem);
4680
4681 if (offset + length > mapping_size) {
Ilya Dryomovbc1ecc62014-08-04 18:04:39 +04004682 rbd_warn(rbd_dev, "beyond EOD (%llu~%llu > %llu)", offset,
Josh Durgin4e752f02014-04-08 11:12:11 -07004683 length, mapping_size);
Ilya Dryomovbc1ecc62014-08-04 18:04:39 +04004684 result = -EIO;
Ilya Dryomova52cc682020-02-12 15:08:39 +01004685 goto err_img_request;
Ilya Dryomovbc1ecc62014-08-04 18:04:39 +04004686 }
4687
Ilya Dryomov21ed05a2019-08-30 17:31:06 +02004688 dout("%s rbd_dev %p img_req %p %s %llu~%llu\n", __func__, rbd_dev,
4689 img_request, obj_op_name(op_type), offset, length);
4690
Ilya Dryomov6484cbe2019-01-29 12:46:25 +01004691 if (op_type == OBJ_OP_DISCARD || op_type == OBJ_OP_ZEROOUT)
Ilya Dryomov5a237812018-02-06 19:26:34 +01004692 result = rbd_img_fill_nodata(img_request, offset, length);
Guangliang Zhao90e98c52014-04-01 22:22:16 +08004693 else
Ilya Dryomov5a237812018-02-06 19:26:34 +01004694 result = rbd_img_fill_from_bio(img_request, offset, length,
4695 rq->bio);
Ilya Dryomov0192ce22019-05-16 15:06:56 +02004696 if (result)
Ilya Dryomovbc1ecc62014-08-04 18:04:39 +04004697 goto err_img_request;
4698
Ilya Dryomove1fddc82019-05-30 16:07:48 +02004699 rbd_img_handle_request(img_request, 0);
Ilya Dryomovbc1ecc62014-08-04 18:04:39 +04004700 return;
4701
4702err_img_request:
Hannes Reinecke679a97d2020-01-31 11:37:36 +01004703 rbd_img_request_destroy(img_request);
Ilya Dryomovbc1ecc62014-08-04 18:04:39 +04004704 if (result)
4705 rbd_warn(rbd_dev, "%s %llx at %llx result %d",
Guangliang Zhao6d2940c2014-03-13 11:21:35 +08004706 obj_op_name(op_type), length, offset, result);
Christoph Hellwig2a842ac2017-06-03 09:38:04 +02004707 blk_mq_end_request(rq, errno_to_blk_status(result));
Ilya Dryomovbc1ecc62014-08-04 18:04:39 +04004708}
4709
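/*
 * blk-mq queue_rq entry point.  The rbd_img_request is embedded in the
 * request PDU (see the cmd_size set up in rbd_init_disk() below), so no
 * allocation happens here; the heavy lifting is deferred to
 * rbd_queue_workfn() on rbd_wq, which also calls blk_mq_start_request().
 */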
Christoph Hellwigfc17b652017-06-03 09:38:05 +02004710static blk_status_t rbd_queue_rq(struct blk_mq_hw_ctx *hctx,
Christoph Hellwig7ad18af2015-01-13 17:20:04 +01004711 const struct blk_mq_queue_data *bd)
Ilya Dryomovbc1ecc62014-08-04 18:04:39 +04004712{
Ilya Dryomov59e542c2020-02-12 15:23:58 +01004713 struct rbd_device *rbd_dev = hctx->queue->queuedata;
4714 struct rbd_img_request *img_req = blk_mq_rq_to_pdu(bd->rq);
4715 enum obj_operation_type op_type;
Ilya Dryomovbc1ecc62014-08-04 18:04:39 +04004716
Ilya Dryomov59e542c2020-02-12 15:23:58 +01004717 switch (req_op(bd->rq)) {
4718 case REQ_OP_DISCARD:
4719 op_type = OBJ_OP_DISCARD;
4720 break;
4721 case REQ_OP_WRITE_ZEROES:
4722 op_type = OBJ_OP_ZEROOUT;
4723 break;
4724 case REQ_OP_WRITE:
4725 op_type = OBJ_OP_WRITE;
4726 break;
4727 case REQ_OP_READ:
4728 op_type = OBJ_OP_READ;
4729 break;
4730 default:
4731 rbd_warn(rbd_dev, "unknown req_op %d", req_op(bd->rq));
4732 return BLK_STS_IOERR;
4733 }
4734
4735 rbd_img_request_init(img_req, rbd_dev, op_type);
4736
4737 if (rbd_img_is_write(img_req)) {
4738 if (rbd_is_ro(rbd_dev)) {
4739 rbd_warn(rbd_dev, "%s on read-only mapping",
4740 obj_op_name(img_req->op_type));
4741 return BLK_STS_IOERR;
4742 }
4743 rbd_assert(!rbd_is_snap(rbd_dev));
4744 }
4745
4746 INIT_WORK(&img_req->work, rbd_queue_workfn);
4747 queue_work(rbd_wq, &img_req->work);
Christoph Hellwigfc17b652017-06-03 09:38:05 +02004748 return BLK_STS_OK;
Alex Elderbf0d5f502012-11-22 00:00:08 -06004749}
4750
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004751static void rbd_free_disk(struct rbd_device *rbd_dev)
4752{
Christoph Hellwig195b1952021-06-02 09:53:37 +03004753 blk_cleanup_disk(rbd_dev->disk);
Ilya Dryomov5769ed02017-04-13 12:17:38 +02004754 blk_mq_free_tag_set(&rbd_dev->tag_set);
Alex Eldera0cab922013-04-25 23:15:08 -05004755 rbd_dev->disk = NULL;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004756}
4757
Alex Elder788e2df2013-01-17 12:25:27 -06004758static int rbd_obj_read_sync(struct rbd_device *rbd_dev,
Ilya Dryomovfe5478e2017-01-25 18:16:21 +01004759 struct ceph_object_id *oid,
4760 struct ceph_object_locator *oloc,
4761 void *buf, int buf_len)
4763{
Ilya Dryomovfe5478e2017-01-25 18:16:21 +01004764 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
4765 struct ceph_osd_request *req;
4766 struct page **pages;
4767 int num_pages = calc_pages_for(0, buf_len);
Alex Elder788e2df2013-01-17 12:25:27 -06004768 int ret;
4769
Ilya Dryomovfe5478e2017-01-25 18:16:21 +01004770 req = ceph_osdc_alloc_request(osdc, NULL, 1, false, GFP_KERNEL);
4771 if (!req)
4772 return -ENOMEM;
Alex Elder788e2df2013-01-17 12:25:27 -06004773
Ilya Dryomovfe5478e2017-01-25 18:16:21 +01004774 ceph_oid_copy(&req->r_base_oid, oid);
4775 ceph_oloc_copy(&req->r_base_oloc, oloc);
4776 req->r_flags = CEPH_OSD_FLAG_READ;
Alex Elder788e2df2013-01-17 12:25:27 -06004777
Ilya Dryomovfe5478e2017-01-25 18:16:21 +01004778 pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL);
4779 if (IS_ERR(pages)) {
4780 ret = PTR_ERR(pages);
4781 goto out_req;
4782 }
Alex Elder1ceae7e2013-02-06 13:11:38 -06004783
Ilya Dryomovfe5478e2017-01-25 18:16:21 +01004784 osd_req_op_extent_init(req, 0, CEPH_OSD_OP_READ, 0, buf_len, 0, 0);
4785 osd_req_op_extent_osd_data_pages(req, 0, pages, buf_len, 0, false,
4786 true);
Alex Elder788e2df2013-01-17 12:25:27 -06004787
Ilya Dryomov26f887e2018-10-15 16:11:37 +02004788 ret = ceph_osdc_alloc_messages(req, GFP_KERNEL);
4789 if (ret)
4790 goto out_req;
4791
Ilya Dryomovfe5478e2017-01-25 18:16:21 +01004792 ceph_osdc_start_request(osdc, req, false);
4793 ret = ceph_osdc_wait_request(osdc, req);
4794 if (ret >= 0)
4795 ceph_copy_from_page_vector(pages, buf, 0, ret);
4796
4797out_req:
4798 ceph_osdc_put_request(req);
Alex Elder788e2df2013-01-17 12:25:27 -06004799 return ret;
4800}
4801
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004802/*
Alex Elder662518b2013-05-06 09:51:29 -05004803 * Read the complete header for the given rbd device. On successful
4804 * return, the rbd_dev->header field will contain up-to-date
4805 * information about the image.
Alex Elder4156d992012-08-02 11:29:46 -05004806 */
Alex Elder99a41eb2013-05-06 09:51:30 -05004807static int rbd_dev_v1_header_info(struct rbd_device *rbd_dev)
Alex Elder4156d992012-08-02 11:29:46 -05004808{
4809 struct rbd_image_header_ondisk *ondisk = NULL;
4810 u32 snap_count = 0;
4811 u64 names_size = 0;
4812 u32 want_count;
4813 int ret;
4814
4815 /*
4816 * The complete header will include an array of its 64-bit
4817 * snapshot ids, followed by the names of those snapshots as
4818 * a contiguous block of NUL-terminated strings. Note that
4819 * the number of snapshots could change by the time we read
4820 * it in, in which case we re-read it.
4821 */
4822 do {
4823 size_t size;
4824
4825 kfree(ondisk);
4826
4827 size = sizeof (*ondisk);
4828 size += snap_count * sizeof (struct rbd_image_snap_ondisk);
4829 size += names_size;
4830 ondisk = kmalloc(size, GFP_KERNEL);
4831 if (!ondisk)
Alex Elder662518b2013-05-06 09:51:29 -05004832 return -ENOMEM;
Alex Elder4156d992012-08-02 11:29:46 -05004833
Ilya Dryomovfe5478e2017-01-25 18:16:21 +01004834 ret = rbd_obj_read_sync(rbd_dev, &rbd_dev->header_oid,
4835 &rbd_dev->header_oloc, ondisk, size);
Alex Elder4156d992012-08-02 11:29:46 -05004836 if (ret < 0)
Alex Elder662518b2013-05-06 09:51:29 -05004837 goto out;
Alex Elderc0cd10db2013-04-26 09:43:47 -05004838 if ((size_t)ret < size) {
Alex Elder4156d992012-08-02 11:29:46 -05004839 ret = -ENXIO;
Alex Elder06ecc6c2012-11-01 10:17:15 -05004840 rbd_warn(rbd_dev, "short header read (want %zd got %d)",
4841 size, ret);
Alex Elder662518b2013-05-06 09:51:29 -05004842 goto out;
Alex Elder4156d992012-08-02 11:29:46 -05004843 }
4844 if (!rbd_dev_ondisk_valid(ondisk)) {
4845 ret = -ENXIO;
Alex Elder06ecc6c2012-11-01 10:17:15 -05004846 rbd_warn(rbd_dev, "invalid header");
Alex Elder662518b2013-05-06 09:51:29 -05004847 goto out;
Alex Elder4156d992012-08-02 11:29:46 -05004848 }
4849
4850 names_size = le64_to_cpu(ondisk->snap_names_len);
4851 want_count = snap_count;
4852 snap_count = le32_to_cpu(ondisk->snap_count);
4853 } while (snap_count != want_count);
4854
Alex Elder662518b2013-05-06 09:51:29 -05004855 ret = rbd_header_from_disk(rbd_dev, ondisk);
4856out:
Alex Elder4156d992012-08-02 11:29:46 -05004857 kfree(ondisk);
4858
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004859 return ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004860}
4861
Josh Durgin98752012013-08-29 17:26:31 -07004862static void rbd_dev_update_size(struct rbd_device *rbd_dev)
4863{
4864 sector_t size;
Josh Durgin98752012013-08-29 17:26:31 -07004865
4866 /*
Ilya Dryomov811c6682016-04-15 16:22:16 +02004867 * If EXISTS is not set, rbd_dev->disk may be NULL, so don't
4868 * try to update its size. If REMOVING is set, updating size
4869 * is just useless work since the device can't be opened.
Josh Durgin98752012013-08-29 17:26:31 -07004870 */
Ilya Dryomov811c6682016-04-15 16:22:16 +02004871 if (test_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags) &&
4872 !test_bit(RBD_DEV_FLAG_REMOVING, &rbd_dev->flags)) {
Josh Durgin98752012013-08-29 17:26:31 -07004873 size = (sector_t)rbd_dev->mapping.size / SECTOR_SIZE;
4874 dout("setting size to %llu sectors", (unsigned long long)size);
Christoph Hellwige864e492020-11-16 15:57:07 +01004875 set_capacity_and_notify(rbd_dev->disk, size);
Josh Durgin98752012013-08-29 17:26:31 -07004876 }
4877}
4878
Alex Eldercc4a38bd2013-04-30 00:44:33 -05004879static int rbd_dev_refresh(struct rbd_device *rbd_dev)
Alex Elder1fe5e992012-07-25 09:32:41 -05004880{
Alex Eldere627db02013-05-06 07:40:30 -05004881 u64 mapping_size;
Alex Elder1fe5e992012-07-25 09:32:41 -05004882 int ret;
4883
Alex Eldercfbf6372013-05-31 17:40:45 -05004884 down_write(&rbd_dev->header_rwsem);
Alex Elder3b5cf2a2013-05-29 11:18:59 -05004885 mapping_size = rbd_dev->mapping.size;
Ilya Dryomova720ae02014-07-23 17:11:19 +04004886
4887 ret = rbd_dev_header_info(rbd_dev);
Ilya Dryomov52bb1f92014-07-23 17:11:20 +04004888 if (ret)
Ilya Dryomov73e39e42015-01-08 20:18:22 +03004889 goto out;
Alex Elder15228ed2013-05-01 12:43:03 -05004890
Ilya Dryomove8f59b52014-07-24 10:42:13 +04004891 /*
4892 * If there is a parent, see if it has disappeared due to the
4893 * mapped image getting flattened.
4894 */
4895 if (rbd_dev->parent) {
4896 ret = rbd_dev_v2_parent_info(rbd_dev);
4897 if (ret)
Ilya Dryomov73e39e42015-01-08 20:18:22 +03004898 goto out;
Ilya Dryomove8f59b52014-07-24 10:42:13 +04004899 }
4900
Ilya Dryomov686238b2019-11-18 12:51:02 +01004901 rbd_assert(!rbd_is_snap(rbd_dev));
4902 rbd_dev->mapping.size = rbd_dev->header.image_size;
Alex Elder15228ed2013-05-01 12:43:03 -05004903
Ilya Dryomov73e39e42015-01-08 20:18:22 +03004904out:
Alex Eldercfbf6372013-05-31 17:40:45 -05004905 up_write(&rbd_dev->header_rwsem);
Ilya Dryomov73e39e42015-01-08 20:18:22 +03004906 if (!ret && mapping_size != rbd_dev->mapping.size)
Josh Durgin98752012013-08-29 17:26:31 -07004907 rbd_dev_update_size(rbd_dev);
Alex Elder1fe5e992012-07-25 09:32:41 -05004908
Ilya Dryomov73e39e42015-01-08 20:18:22 +03004909 return ret;
Alex Elder1fe5e992012-07-25 09:32:41 -05004910}
4911
Eric Biggersf363b082017-03-30 13:39:16 -07004912static const struct blk_mq_ops rbd_mq_ops = {
Christoph Hellwig7ad18af2015-01-13 17:20:04 +01004913 .queue_rq = rbd_queue_rq,
Christoph Hellwig7ad18af2015-01-13 17:20:04 +01004914};
4915
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004916static int rbd_init_disk(struct rbd_device *rbd_dev)
4917{
4918 struct gendisk *disk;
4919 struct request_queue *q;
Ilya Dryomov420efbd2018-04-16 09:32:18 +02004920 unsigned int objset_bytes =
4921 rbd_dev->layout.object_size * rbd_dev->layout.stripe_count;
Christoph Hellwig7ad18af2015-01-13 17:20:04 +01004922 int err;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004923
Christoph Hellwig7ad18af2015-01-13 17:20:04 +01004924 memset(&rbd_dev->tag_set, 0, sizeof(rbd_dev->tag_set));
4925 rbd_dev->tag_set.ops = &rbd_mq_ops;
Ilya Dryomovb5584182015-06-23 16:21:19 +03004926 rbd_dev->tag_set.queue_depth = rbd_dev->opts->queue_depth;
Christoph Hellwig7ad18af2015-01-13 17:20:04 +01004927 rbd_dev->tag_set.numa_node = NUMA_NO_NODE;
Ming Lei56d18f62019-02-15 19:13:24 +08004928 rbd_dev->tag_set.flags = BLK_MQ_F_SHOULD_MERGE;
Hannes Reineckef9b6b982020-01-31 11:37:39 +01004929 rbd_dev->tag_set.nr_hw_queues = num_present_cpus();
Ilya Dryomov59e542c2020-02-12 15:23:58 +01004930 rbd_dev->tag_set.cmd_size = sizeof(struct rbd_img_request);
Christoph Hellwig7ad18af2015-01-13 17:20:04 +01004931
4932 err = blk_mq_alloc_tag_set(&rbd_dev->tag_set);
4933 if (err)
Christoph Hellwig195b1952021-06-02 09:53:37 +03004934 return err;
Josh Durgin029bcbd2011-07-22 11:35:23 -07004935
Christoph Hellwig195b1952021-06-02 09:53:37 +03004936 disk = blk_mq_alloc_disk(&rbd_dev->tag_set, rbd_dev);
4937 if (IS_ERR(disk)) {
4938 err = PTR_ERR(disk);
Christoph Hellwig7ad18af2015-01-13 17:20:04 +01004939 goto out_tag_set;
4940 }
Christoph Hellwig195b1952021-06-02 09:53:37 +03004941 q = disk->queue;
4942
4943 snprintf(disk->disk_name, sizeof(disk->disk_name), RBD_DRV_NAME "%d",
4944 rbd_dev->dev_id);
4945 disk->major = rbd_dev->major;
4946 disk->first_minor = rbd_dev->minor;
4947 if (single_major) {
4948 disk->minors = (1 << RBD_SINGLE_MAJOR_PART_SHIFT);
4949 disk->flags |= GENHD_FL_EXT_DEVT;
4950 } else {
4951 disk->minors = RBD_MINORS_PER_MAJOR;
4952 }
4953 disk->fops = &rbd_bd_ops;
Christoph Hellwig7ad18af2015-01-13 17:20:04 +01004954
Bart Van Assche8b904b52018-03-07 17:10:10 -08004955 blk_queue_flag_set(QUEUE_FLAG_NONROT, q);
Ilya Dryomovd8a2c892015-03-24 16:15:17 +03004956 /* QUEUE_FLAG_ADD_RANDOM is off by default for blk-mq */
Alex Elder593a9e72012-02-07 12:03:37 -06004957
Ilya Dryomov420efbd2018-04-16 09:32:18 +02004958 blk_queue_max_hw_sectors(q, objset_bytes >> SECTOR_SHIFT);
Ilya Dryomov0d9fde42015-10-07 16:09:35 +02004959 q->limits.max_sectors = queue_max_hw_sectors(q);
Ilya Dryomov21acdf42017-12-21 15:35:11 +01004960 blk_queue_max_segments(q, USHRT_MAX);
Ilya Dryomov24f1df62018-01-12 17:22:10 +01004961 blk_queue_max_segment_size(q, UINT_MAX);
Ilya Dryomov16d80c52019-03-15 14:50:04 +01004962 blk_queue_io_min(q, rbd_dev->opts->alloc_size);
4963 blk_queue_io_opt(q, rbd_dev->opts->alloc_size);
Josh Durgin029bcbd2011-07-22 11:35:23 -07004964
Ilya Dryomovd9360542018-03-23 06:14:47 +01004965 if (rbd_dev->opts->trim) {
4966 blk_queue_flag_set(QUEUE_FLAG_DISCARD, q);
Ilya Dryomov16d80c52019-03-15 14:50:04 +01004967 q->limits.discard_granularity = rbd_dev->opts->alloc_size;
Ilya Dryomovd9360542018-03-23 06:14:47 +01004968 blk_queue_max_discard_sectors(q, objset_bytes >> SECTOR_SHIFT);
4969 blk_queue_max_write_zeroes_sectors(q, objset_bytes >> SECTOR_SHIFT);
4970 }
Guangliang Zhao90e98c52014-04-01 22:22:16 +08004971
Ronny Hegewaldbae818e2015-10-15 18:50:46 +00004972 if (!ceph_test_opt(rbd_dev->rbd_client->client, NOCRC))
Christoph Hellwig1cb039f2020-09-24 08:51:38 +02004973 blk_queue_flag_set(QUEUE_FLAG_STABLE_WRITES, q);
Ronny Hegewaldbae818e2015-10-15 18:50:46 +00004974
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004975 rbd_dev->disk = disk;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004976
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004977 return 0;
Christoph Hellwig7ad18af2015-01-13 17:20:04 +01004978out_tag_set:
4979 blk_mq_free_tag_set(&rbd_dev->tag_set);
Christoph Hellwig7ad18af2015-01-13 17:20:04 +01004980 return err;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004981}
4982
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004983/*
4984 sysfs
4985*/
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004986
Alex Elder593a9e72012-02-07 12:03:37 -06004987static struct rbd_device *dev_to_rbd_dev(struct device *dev)
4988{
4989 return container_of(dev, struct rbd_device, dev);
4990}
4991
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004992static ssize_t rbd_size_show(struct device *dev,
4993 struct device_attribute *attr, char *buf)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004994{
Alex Elder593a9e72012-02-07 12:03:37 -06004995 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004996
Alex Elderfc71d832013-04-26 15:44:36 -05004997 return sprintf(buf, "%llu\n",
4998 (unsigned long long)rbd_dev->mapping.size);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004999}
5000
Alex Elder34b13182012-07-13 20:35:12 -05005001static ssize_t rbd_features_show(struct device *dev,
5002 struct device_attribute *attr, char *buf)
5003{
5004 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
5005
Ilya Dryomovfa58bca2019-11-05 13:16:52 +01005006 return sprintf(buf, "0x%016llx\n", rbd_dev->header.features);
Alex Elder34b13182012-07-13 20:35:12 -05005007}
5008
Yehuda Sadehdfc56062010-11-19 14:51:04 -08005009static ssize_t rbd_major_show(struct device *dev,
5010 struct device_attribute *attr, char *buf)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07005011{
Alex Elder593a9e72012-02-07 12:03:37 -06005012 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08005013
Alex Elderfc71d832013-04-26 15:44:36 -05005014 if (rbd_dev->major)
5015 return sprintf(buf, "%d\n", rbd_dev->major);
5016
5017 return sprintf(buf, "(none)\n");
Ilya Dryomovdd82fff2013-12-13 15:28:57 +02005018}
Alex Elderfc71d832013-04-26 15:44:36 -05005019
Ilya Dryomovdd82fff2013-12-13 15:28:57 +02005020static ssize_t rbd_minor_show(struct device *dev,
5021 struct device_attribute *attr, char *buf)
5022{
5023 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
5024
5025 return sprintf(buf, "%d\n", rbd_dev->minor);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08005026}
5027
Ilya Dryomov005a07bf2016-08-18 18:38:43 +02005028static ssize_t rbd_client_addr_show(struct device *dev,
5029 struct device_attribute *attr, char *buf)
5030{
5031 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
5032 struct ceph_entity_addr *client_addr =
5033 ceph_client_addr(rbd_dev->rbd_client->client);
5034
5035 return sprintf(buf, "%pISpc/%u\n", &client_addr->in_addr,
5036 le32_to_cpu(client_addr->nonce));
5037}
5038
Yehuda Sadehdfc56062010-11-19 14:51:04 -08005039static ssize_t rbd_client_id_show(struct device *dev,
5040 struct device_attribute *attr, char *buf)
5041{
Alex Elder593a9e72012-02-07 12:03:37 -06005042 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08005043
Alex Elder1dbb4392012-01-24 10:08:37 -06005044 return sprintf(buf, "client%lld\n",
Ilya Dryomov033268a2016-08-12 14:59:58 +02005045 ceph_client_gid(rbd_dev->rbd_client->client));
Yehuda Sadehdfc56062010-11-19 14:51:04 -08005046}
5047
Mike Christie267fb902016-08-18 18:38:43 +02005048static ssize_t rbd_cluster_fsid_show(struct device *dev,
5049 struct device_attribute *attr, char *buf)
5050{
5051 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
5052
5053 return sprintf(buf, "%pU\n", &rbd_dev->rbd_client->client->fsid);
5054}
5055
Mike Christie0d6d1e9c2016-08-18 18:38:45 +02005056static ssize_t rbd_config_info_show(struct device *dev,
5057 struct device_attribute *attr, char *buf)
5058{
5059 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
5060
Ilya Dryomovf44d04e2020-09-03 13:24:11 +02005061 if (!capable(CAP_SYS_ADMIN))
5062 return -EPERM;
5063
Mike Christie0d6d1e9c2016-08-18 18:38:45 +02005064 return sprintf(buf, "%s\n", rbd_dev->config_info);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08005065}
5066
5067static ssize_t rbd_pool_show(struct device *dev,
5068 struct device_attribute *attr, char *buf)
5069{
Alex Elder593a9e72012-02-07 12:03:37 -06005070 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08005071
Alex Elder0d7dbfc2012-10-25 23:34:41 -05005072 return sprintf(buf, "%s\n", rbd_dev->spec->pool_name);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08005073}
5074
Alex Elder9bb2f332012-07-12 10:46:35 -05005075static ssize_t rbd_pool_id_show(struct device *dev,
5076 struct device_attribute *attr, char *buf)
5077{
5078 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
5079
Alex Elder0d7dbfc2012-10-25 23:34:41 -05005080 return sprintf(buf, "%llu\n",
Alex Elderfc71d832013-04-26 15:44:36 -05005081 (unsigned long long) rbd_dev->spec->pool_id);
Alex Elder9bb2f332012-07-12 10:46:35 -05005082}
5083
Ilya Dryomovb26c0472018-07-03 15:28:43 +02005084static ssize_t rbd_pool_ns_show(struct device *dev,
5085 struct device_attribute *attr, char *buf)
5086{
5087 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
5088
5089 return sprintf(buf, "%s\n", rbd_dev->spec->pool_ns ?: "");
5090}
5091
Yehuda Sadehdfc56062010-11-19 14:51:04 -08005092static ssize_t rbd_name_show(struct device *dev,
5093 struct device_attribute *attr, char *buf)
5094{
Alex Elder593a9e72012-02-07 12:03:37 -06005095 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08005096
Alex Eldera92ffdf2012-10-30 19:40:33 -05005097 if (rbd_dev->spec->image_name)
5098 return sprintf(buf, "%s\n", rbd_dev->spec->image_name);
5099
5100 return sprintf(buf, "(unknown)\n");
Yehuda Sadehdfc56062010-11-19 14:51:04 -08005101}
5102
Alex Elder589d30e2012-07-10 20:30:11 -05005103static ssize_t rbd_image_id_show(struct device *dev,
5104 struct device_attribute *attr, char *buf)
5105{
5106 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
5107
Alex Elder0d7dbfc2012-10-25 23:34:41 -05005108 return sprintf(buf, "%s\n", rbd_dev->spec->image_id);
Alex Elder589d30e2012-07-10 20:30:11 -05005109}
5110
Alex Elder34b13182012-07-13 20:35:12 -05005111/*
5112 * Shows the name of the currently-mapped snapshot (or
5113 * RBD_SNAP_HEAD_NAME for the base image).
5114 */
Yehuda Sadehdfc56062010-11-19 14:51:04 -08005115static ssize_t rbd_snap_show(struct device *dev,
5116 struct device_attribute *attr,
5117 char *buf)
5118{
Alex Elder593a9e72012-02-07 12:03:37 -06005119 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08005120
Alex Elder0d7dbfc2012-10-25 23:34:41 -05005121 return sprintf(buf, "%s\n", rbd_dev->spec->snap_name);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08005122}
5123
Mike Christie92a58672016-08-18 18:38:44 +02005124static ssize_t rbd_snap_id_show(struct device *dev,
5125 struct device_attribute *attr, char *buf)
5126{
5127 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
5128
5129 return sprintf(buf, "%llu\n", rbd_dev->spec->snap_id);
5130}
5131
Alex Elder86b00e02012-10-25 23:34:42 -05005132/*
Ilya Dryomovff961282014-07-22 21:53:07 +04005133 * For a v2 image, shows the chain of parent images, separated by empty
5134 * lines. For v1 images or if there is no parent, shows "(no parent
5135 * image)".
Alex Elder86b00e02012-10-25 23:34:42 -05005136 */
5137static ssize_t rbd_parent_show(struct device *dev,
Ilya Dryomovff961282014-07-22 21:53:07 +04005138 struct device_attribute *attr,
5139 char *buf)
Alex Elder86b00e02012-10-25 23:34:42 -05005140{
5141 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Ilya Dryomovff961282014-07-22 21:53:07 +04005142 ssize_t count = 0;
Alex Elder86b00e02012-10-25 23:34:42 -05005143
Ilya Dryomovff961282014-07-22 21:53:07 +04005144 if (!rbd_dev->parent)
Alex Elder86b00e02012-10-25 23:34:42 -05005145 return sprintf(buf, "(no parent image)\n");
5146
Ilya Dryomovff961282014-07-22 21:53:07 +04005147 for ( ; rbd_dev->parent; rbd_dev = rbd_dev->parent) {
5148 struct rbd_spec *spec = rbd_dev->parent_spec;
Alex Elder86b00e02012-10-25 23:34:42 -05005149
Ilya Dryomovff961282014-07-22 21:53:07 +04005150 count += sprintf(&buf[count], "%s"
5151 "pool_id %llu\npool_name %s\n"
Ilya Dryomove92c0ea2018-08-22 17:26:10 +02005152 "pool_ns %s\n"
Ilya Dryomovff961282014-07-22 21:53:07 +04005153 "image_id %s\nimage_name %s\n"
5154 "snap_id %llu\nsnap_name %s\n"
5155 "overlap %llu\n",
5156 !count ? "" : "\n", /* first? */
5157 spec->pool_id, spec->pool_name,
Ilya Dryomove92c0ea2018-08-22 17:26:10 +02005158 spec->pool_ns ?: "",
Ilya Dryomovff961282014-07-22 21:53:07 +04005159 spec->image_id, spec->image_name ?: "(unknown)",
5160 spec->snap_id, spec->snap_name,
5161 rbd_dev->parent_overlap);
5162 }
Alex Elder86b00e02012-10-25 23:34:42 -05005163
Ilya Dryomovff961282014-07-22 21:53:07 +04005164 return count;
Alex Elder86b00e02012-10-25 23:34:42 -05005165}
5166
Yehuda Sadehdfc56062010-11-19 14:51:04 -08005167static ssize_t rbd_image_refresh(struct device *dev,
5168 struct device_attribute *attr,
5169 const char *buf,
5170 size_t size)
5171{
Alex Elder593a9e72012-02-07 12:03:37 -06005172 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Alex Elderb8136232012-07-25 09:32:41 -05005173 int ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07005174
Ilya Dryomovf44d04e2020-09-03 13:24:11 +02005175 if (!capable(CAP_SYS_ADMIN))
5176 return -EPERM;
5177
Alex Eldercc4a38bd2013-04-30 00:44:33 -05005178 ret = rbd_dev_refresh(rbd_dev);
Alex Eldere627db02013-05-06 07:40:30 -05005179 if (ret)
Ilya Dryomov52bb1f92014-07-23 17:11:20 +04005180 return ret;
Alex Elderb8136232012-07-25 09:32:41 -05005181
Ilya Dryomov52bb1f92014-07-23 17:11:20 +04005182 return size;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08005183}
Yehuda Sadeh602adf42010-08-12 16:11:25 -07005184
Joe Perches5657a812018-05-24 13:38:59 -06005185static DEVICE_ATTR(size, 0444, rbd_size_show, NULL);
5186static DEVICE_ATTR(features, 0444, rbd_features_show, NULL);
5187static DEVICE_ATTR(major, 0444, rbd_major_show, NULL);
5188static DEVICE_ATTR(minor, 0444, rbd_minor_show, NULL);
5189static DEVICE_ATTR(client_addr, 0444, rbd_client_addr_show, NULL);
5190static DEVICE_ATTR(client_id, 0444, rbd_client_id_show, NULL);
5191static DEVICE_ATTR(cluster_fsid, 0444, rbd_cluster_fsid_show, NULL);
5192static DEVICE_ATTR(config_info, 0400, rbd_config_info_show, NULL);
5193static DEVICE_ATTR(pool, 0444, rbd_pool_show, NULL);
5194static DEVICE_ATTR(pool_id, 0444, rbd_pool_id_show, NULL);
Ilya Dryomovb26c0472018-07-03 15:28:43 +02005195static DEVICE_ATTR(pool_ns, 0444, rbd_pool_ns_show, NULL);
Joe Perches5657a812018-05-24 13:38:59 -06005196static DEVICE_ATTR(name, 0444, rbd_name_show, NULL);
5197static DEVICE_ATTR(image_id, 0444, rbd_image_id_show, NULL);
5198static DEVICE_ATTR(refresh, 0200, NULL, rbd_image_refresh);
5199static DEVICE_ATTR(current_snap, 0444, rbd_snap_show, NULL);
5200static DEVICE_ATTR(snap_id, 0444, rbd_snap_id_show, NULL);
5201static DEVICE_ATTR(parent, 0444, rbd_parent_show, NULL);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08005202
5203static struct attribute *rbd_attrs[] = {
5204 &dev_attr_size.attr,
Alex Elder34b13182012-07-13 20:35:12 -05005205 &dev_attr_features.attr,
Yehuda Sadehdfc56062010-11-19 14:51:04 -08005206 &dev_attr_major.attr,
Ilya Dryomovdd82fff2013-12-13 15:28:57 +02005207 &dev_attr_minor.attr,
Ilya Dryomov005a07bf2016-08-18 18:38:43 +02005208 &dev_attr_client_addr.attr,
Yehuda Sadehdfc56062010-11-19 14:51:04 -08005209 &dev_attr_client_id.attr,
Mike Christie267fb902016-08-18 18:38:43 +02005210 &dev_attr_cluster_fsid.attr,
Mike Christie0d6d1e9c2016-08-18 18:38:45 +02005211 &dev_attr_config_info.attr,
Yehuda Sadehdfc56062010-11-19 14:51:04 -08005212 &dev_attr_pool.attr,
Alex Elder9bb2f332012-07-12 10:46:35 -05005213 &dev_attr_pool_id.attr,
Ilya Dryomovb26c0472018-07-03 15:28:43 +02005214 &dev_attr_pool_ns.attr,
Yehuda Sadehdfc56062010-11-19 14:51:04 -08005215 &dev_attr_name.attr,
Alex Elder589d30e2012-07-10 20:30:11 -05005216 &dev_attr_image_id.attr,
Yehuda Sadehdfc56062010-11-19 14:51:04 -08005217 &dev_attr_current_snap.attr,
Mike Christie92a58672016-08-18 18:38:44 +02005218 &dev_attr_snap_id.attr,
Alex Elder86b00e02012-10-25 23:34:42 -05005219 &dev_attr_parent.attr,
Yehuda Sadehdfc56062010-11-19 14:51:04 -08005220 &dev_attr_refresh.attr,
Yehuda Sadehdfc56062010-11-19 14:51:04 -08005221 NULL
5222};
5223
5224static struct attribute_group rbd_attr_group = {
5225 .attrs = rbd_attrs,
5226};
5227
5228static const struct attribute_group *rbd_attr_groups[] = {
5229 &rbd_attr_group,
5230 NULL
5231};
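
/*
 * These attributes are exposed for each mapped image under
 * /sys/bus/rbd/devices/<dev-id>/; e.g. "size" reports the mapped image
 * size in bytes and writing to "refresh" forces a header re-read via
 * rbd_dev_refresh().
 */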
5232
Ilya Dryomov6cac4692015-10-16 20:11:25 +02005233static void rbd_dev_release(struct device *dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08005234
Bhumika Goyalb9942bc2017-02-11 12:14:38 +05305235static const struct device_type rbd_device_type = {
Yehuda Sadehdfc56062010-11-19 14:51:04 -08005236 .name = "rbd",
5237 .groups = rbd_attr_groups,
Ilya Dryomov6cac4692015-10-16 20:11:25 +02005238 .release = rbd_dev_release,
Yehuda Sadehdfc56062010-11-19 14:51:04 -08005239};
5240
Alex Elder8b8fb992012-10-26 17:25:24 -05005241static struct rbd_spec *rbd_spec_get(struct rbd_spec *spec)
5242{
5243 kref_get(&spec->kref);
5244
5245 return spec;
5246}
5247
5248static void rbd_spec_free(struct kref *kref);
5249static void rbd_spec_put(struct rbd_spec *spec)
5250{
5251 if (spec)
5252 kref_put(&spec->kref, rbd_spec_free);
5253}
5254
5255static struct rbd_spec *rbd_spec_alloc(void)
5256{
5257 struct rbd_spec *spec;
5258
5259 spec = kzalloc(sizeof (*spec), GFP_KERNEL);
5260 if (!spec)
5261 return NULL;
Ilya Dryomov04077592014-07-23 17:11:20 +04005262
5263 spec->pool_id = CEPH_NOPOOL;
5264 spec->snap_id = CEPH_NOSNAP;
Alex Elder8b8fb992012-10-26 17:25:24 -05005265 kref_init(&spec->kref);
5266
Alex Elder8b8fb992012-10-26 17:25:24 -05005267 return spec;
5268}
5269
5270static void rbd_spec_free(struct kref *kref)
5271{
5272 struct rbd_spec *spec = container_of(kref, struct rbd_spec, kref);
5273
5274 kfree(spec->pool_name);
Ilya Dryomovb26c0472018-07-03 15:28:43 +02005275 kfree(spec->pool_ns);
Alex Elder8b8fb992012-10-26 17:25:24 -05005276 kfree(spec->image_id);
5277 kfree(spec->image_name);
5278 kfree(spec->snap_name);
5279 kfree(spec);
5280}
5281
Ilya Dryomov1643dfa2016-08-12 15:45:52 +02005282static void rbd_dev_free(struct rbd_device *rbd_dev)
Ilya Dryomovdd5ac322015-10-16 17:09:24 +02005283{
Ilya Dryomov99d16942016-08-12 16:11:41 +02005284 WARN_ON(rbd_dev->watch_state != RBD_WATCH_STATE_UNREGISTERED);
Ilya Dryomoved95b212016-08-12 16:40:02 +02005285 WARN_ON(rbd_dev->lock_state != RBD_LOCK_STATE_UNLOCKED);
Ilya Dryomovdd5ac322015-10-16 17:09:24 +02005286
Ilya Dryomovc41d13a2016-04-29 20:01:25 +02005287 ceph_oid_destroy(&rbd_dev->header_oid);
Ilya Dryomov6b6dddb2016-08-05 16:15:38 +02005288 ceph_oloc_destroy(&rbd_dev->header_oloc);
Mike Christie0d6d1e9c2016-08-18 18:38:45 +02005289 kfree(rbd_dev->config_info);
Ilya Dryomovc41d13a2016-04-29 20:01:25 +02005290
Ilya Dryomovdd5ac322015-10-16 17:09:24 +02005291 rbd_put_client(rbd_dev->rbd_client);
5292 rbd_spec_put(rbd_dev->spec);
5293 kfree(rbd_dev->opts);
5294 kfree(rbd_dev);
Ilya Dryomov1643dfa2016-08-12 15:45:52 +02005295}
5296
5297static void rbd_dev_release(struct device *dev)
5298{
5299 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
5300 bool need_put = !!rbd_dev->opts;
5301
5302 if (need_put) {
5303 destroy_workqueue(rbd_dev->task_wq);
5304 ida_simple_remove(&rbd_dev_id_ida, rbd_dev->dev_id);
5305 }
5306
5307 rbd_dev_free(rbd_dev);
Ilya Dryomovdd5ac322015-10-16 17:09:24 +02005308
5309 /*
5310	 * This is racy, but way better than dropping the module reference
5311	 * outside of the release callback.  The race window is pretty small, so
5312 * doing something similar to dm (dm-builtin.c) is overkill.
5313 */
5314 if (need_put)
5315 module_put(THIS_MODULE);
5316}
5317
Ilya Dryomov1643dfa2016-08-12 15:45:52 +02005318static struct rbd_device *__rbd_dev_create(struct rbd_client *rbdc,
5319 struct rbd_spec *spec)
Alex Elderc53d5892012-10-25 23:34:42 -05005320{
5321 struct rbd_device *rbd_dev;
5322
Ilya Dryomov1643dfa2016-08-12 15:45:52 +02005323 rbd_dev = kzalloc(sizeof(*rbd_dev), GFP_KERNEL);
Alex Elderc53d5892012-10-25 23:34:42 -05005324 if (!rbd_dev)
5325 return NULL;
5326
5327 spin_lock_init(&rbd_dev->lock);
5328 INIT_LIST_HEAD(&rbd_dev->node);
Alex Elderc53d5892012-10-25 23:34:42 -05005329 init_rwsem(&rbd_dev->header_rwsem);
5330
Ilya Dryomov7e973322017-01-25 18:16:22 +01005331 rbd_dev->header.data_pool_id = CEPH_NOPOOL;
Ilya Dryomovc41d13a2016-04-29 20:01:25 +02005332 ceph_oid_init(&rbd_dev->header_oid);
Ilya Dryomov431a02c2017-01-25 18:16:21 +01005333 rbd_dev->header_oloc.pool = spec->pool_id;
Ilya Dryomovb26c0472018-07-03 15:28:43 +02005334 if (spec->pool_ns) {
5335 WARN_ON(!*spec->pool_ns);
5336 rbd_dev->header_oloc.pool_ns =
5337 ceph_find_or_create_string(spec->pool_ns,
5338 strlen(spec->pool_ns));
5339 }
Ilya Dryomovc41d13a2016-04-29 20:01:25 +02005340
Ilya Dryomov99d16942016-08-12 16:11:41 +02005341 mutex_init(&rbd_dev->watch_mutex);
5342 rbd_dev->watch_state = RBD_WATCH_STATE_UNREGISTERED;
5343 INIT_DELAYED_WORK(&rbd_dev->watch_dwork, rbd_reregister_watch);
5344
Ilya Dryomoved95b212016-08-12 16:40:02 +02005345 init_rwsem(&rbd_dev->lock_rwsem);
5346 rbd_dev->lock_state = RBD_LOCK_STATE_UNLOCKED;
5347 INIT_WORK(&rbd_dev->acquired_lock_work, rbd_notify_acquired_lock);
5348 INIT_WORK(&rbd_dev->released_lock_work, rbd_notify_released_lock);
5349 INIT_DELAYED_WORK(&rbd_dev->lock_dwork, rbd_acquire_lock);
5350 INIT_WORK(&rbd_dev->unlock_work, rbd_release_lock_work);
Ilya Dryomove1fddc82019-05-30 16:07:48 +02005351 spin_lock_init(&rbd_dev->lock_lists_lock);
Ilya Dryomov637cd062019-06-06 17:14:49 +02005352 INIT_LIST_HEAD(&rbd_dev->acquiring_list);
Ilya Dryomove1fddc82019-05-30 16:07:48 +02005353 INIT_LIST_HEAD(&rbd_dev->running_list);
Ilya Dryomov637cd062019-06-06 17:14:49 +02005354 init_completion(&rbd_dev->acquire_wait);
Ilya Dryomove1fddc82019-05-30 16:07:48 +02005355 init_completion(&rbd_dev->releasing_wait);
Ilya Dryomoved95b212016-08-12 16:40:02 +02005356
Ilya Dryomov22e8bd52019-06-05 19:25:11 +02005357 spin_lock_init(&rbd_dev->object_map_lock);
Alex Elderc53d5892012-10-25 23:34:42 -05005358
Ilya Dryomovdd5ac322015-10-16 17:09:24 +02005359 rbd_dev->dev.bus = &rbd_bus_type;
5360 rbd_dev->dev.type = &rbd_device_type;
5361 rbd_dev->dev.parent = &rbd_root_dev;
Ilya Dryomovdd5ac322015-10-16 17:09:24 +02005362 device_initialize(&rbd_dev->dev);
5363
Alex Elderc53d5892012-10-25 23:34:42 -05005364 rbd_dev->rbd_client = rbdc;
Ilya Dryomovd1475432015-06-22 13:24:48 +03005365 rbd_dev->spec = spec;
Alex Elder0903e872012-11-14 12:25:19 -06005366
Alex Elderc53d5892012-10-25 23:34:42 -05005367 return rbd_dev;
5368}
5369
Ilya Dryomov1643dfa2016-08-12 15:45:52 +02005370/*
5371 * Create a mapping rbd_dev.
5372 */
5373static struct rbd_device *rbd_dev_create(struct rbd_client *rbdc,
5374 struct rbd_spec *spec,
5375 struct rbd_options *opts)
5376{
5377 struct rbd_device *rbd_dev;
5378
5379 rbd_dev = __rbd_dev_create(rbdc, spec);
5380 if (!rbd_dev)
5381 return NULL;
5382
5383 rbd_dev->opts = opts;
5384
5385 /* get an id and fill in device name */
5386 rbd_dev->dev_id = ida_simple_get(&rbd_dev_id_ida, 0,
5387 minor_to_rbd_dev_id(1 << MINORBITS),
5388 GFP_KERNEL);
5389 if (rbd_dev->dev_id < 0)
5390 goto fail_rbd_dev;
5391
5392 sprintf(rbd_dev->name, RBD_DRV_NAME "%d", rbd_dev->dev_id);
5393 rbd_dev->task_wq = alloc_ordered_workqueue("%s-tasks", WQ_MEM_RECLAIM,
5394 rbd_dev->name);
5395 if (!rbd_dev->task_wq)
5396 goto fail_dev_id;
5397
5398 /* we have a ref from do_rbd_add() */
5399 __module_get(THIS_MODULE);
5400
5401 dout("%s rbd_dev %p dev_id %d\n", __func__, rbd_dev, rbd_dev->dev_id);
5402 return rbd_dev;
5403
5404fail_dev_id:
5405 ida_simple_remove(&rbd_dev_id_ida, rbd_dev->dev_id);
5406fail_rbd_dev:
5407 rbd_dev_free(rbd_dev);
5408 return NULL;
5409}
5410
Alex Elderc53d5892012-10-25 23:34:42 -05005411static void rbd_dev_destroy(struct rbd_device *rbd_dev)
5412{
Ilya Dryomovdd5ac322015-10-16 17:09:24 +02005413 if (rbd_dev)
5414 put_device(&rbd_dev->dev);
Alex Elderc53d5892012-10-25 23:34:42 -05005415}
5416
Yehuda Sadehdfc56062010-11-19 14:51:04 -08005417/*
Alex Elder9d475de2012-07-03 16:01:19 -05005418 * Get the size and object order for an image snapshot, or if
5419 * snap_id is CEPH_NOSNAP, get this information for the base
5420 * image.
5421 */
5422static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id,
5423 u8 *order, u64 *snap_size)
5424{
5425 __le64 snapid = cpu_to_le64(snap_id);
5426 int ret;
5427 struct {
5428 u8 order;
5429 __le64 size;
5430 } __attribute__ ((packed)) size_buf = { 0 };
5431
Ilya Dryomovecd4a682017-01-25 18:16:21 +01005432 ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
5433 &rbd_dev->header_oloc, "get_size",
5434 &snapid, sizeof(snapid),
5435 &size_buf, sizeof(size_buf));
Alex Elder36be9a72013-01-19 00:30:28 -06005436 dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
Alex Elder9d475de2012-07-03 16:01:19 -05005437 if (ret < 0)
5438 return ret;
Alex Elder57385b52013-04-21 12:14:45 -05005439 if (ret < sizeof (size_buf))
5440 return -ERANGE;
Alex Elder9d475de2012-07-03 16:01:19 -05005441
Josh Durginc3545572013-08-28 17:08:10 -07005442 if (order) {
Alex Elderc86f86e2013-04-25 15:09:41 -05005443 *order = size_buf.order;
Josh Durginc3545572013-08-28 17:08:10 -07005444 dout(" order %u", (unsigned int)*order);
5445 }
Alex Elder9d475de2012-07-03 16:01:19 -05005446 *snap_size = le64_to_cpu(size_buf.size);
5447
Josh Durginc3545572013-08-28 17:08:10 -07005448 dout(" snap_id 0x%016llx snap_size = %llu\n",
5449 (unsigned long long)snap_id,
Alex Elder57385b52013-04-21 12:14:45 -05005450 (unsigned long long)*snap_size);
Alex Elder9d475de2012-07-03 16:01:19 -05005451
5452 return 0;
5453}
5454
5455static int rbd_dev_v2_image_size(struct rbd_device *rbd_dev)
5456{
5457 return _rbd_dev_v2_snap_size(rbd_dev, CEPH_NOSNAP,
5458 &rbd_dev->header.obj_order,
5459 &rbd_dev->header.image_size);
5460}
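
/*
 * Editor's sketch, not part of the original driver: the same helper can
 * be pointed at an individual snapshot.  The snap_id would typically come
 * from rbd_snap_id_by_name() or the snap context; passing a NULL order
 * pointer skips the object order, which _rbd_dev_v2_snap_size() allows.
 */
static int __maybe_unused rbd_dev_v2_snap_size_example(struct rbd_device *rbd_dev,
							u64 snap_id, u64 *snap_size)
{
	return _rbd_dev_v2_snap_size(rbd_dev, snap_id, NULL, snap_size);
}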
5461
Alex Elder1e130192012-07-03 16:01:19 -05005462static int rbd_dev_v2_object_prefix(struct rbd_device *rbd_dev)
5463{
Dongsheng Yang5435d2062019-08-09 07:05:27 +00005464 size_t size;
Alex Elder1e130192012-07-03 16:01:19 -05005465 void *reply_buf;
5466 int ret;
5467 void *p;
5468
Dongsheng Yang5435d2062019-08-09 07:05:27 +00005469 /* Response will be an encoded string, which includes a length */
5470 size = sizeof(__le32) + RBD_OBJ_PREFIX_LEN_MAX;
5471 reply_buf = kzalloc(size, GFP_KERNEL);
Alex Elder1e130192012-07-03 16:01:19 -05005472 if (!reply_buf)
5473 return -ENOMEM;
5474
Ilya Dryomovecd4a682017-01-25 18:16:21 +01005475 ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
5476 &rbd_dev->header_oloc, "get_object_prefix",
Dongsheng Yang5435d2062019-08-09 07:05:27 +00005477 NULL, 0, reply_buf, size);
Alex Elder36be9a72013-01-19 00:30:28 -06005478 dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
Alex Elder1e130192012-07-03 16:01:19 -05005479 if (ret < 0)
5480 goto out;
5481
5482 p = reply_buf;
5483 rbd_dev->header.object_prefix = ceph_extract_encoded_string(&p,
Alex Elder57385b52013-04-21 12:14:45 -05005484 p + ret, NULL, GFP_NOIO);
5485 ret = 0;
Alex Elder1e130192012-07-03 16:01:19 -05005486
5487 if (IS_ERR(rbd_dev->header.object_prefix)) {
5488 ret = PTR_ERR(rbd_dev->header.object_prefix);
5489 rbd_dev->header.object_prefix = NULL;
5490 } else {
5491 dout(" object_prefix = %s\n", rbd_dev->header.object_prefix);
5492 }
Alex Elder1e130192012-07-03 16:01:19 -05005493out:
5494 kfree(reply_buf);
5495
5496 return ret;
5497}
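
/*
 * Editor's note (illustrative sketch, not from the original source): the
 * "get_object_prefix" reply above is a ceph-encoded string, i.e.
 *
 *	__le32 len;
 *	u8     data[len];	(not NUL-terminated on the wire)
 *
 * ceph_extract_encoded_string() bounds-checks the buffer and hands back a
 * kmalloc()'ed, NUL-terminated copy (or an ERR_PTR on failure), which is
 * why the result is checked with IS_ERR() rather than for NULL.
 */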
5498
Alex Elderb1b54022012-07-03 16:01:19 -05005499static int _rbd_dev_v2_snap_features(struct rbd_device *rbd_dev, u64 snap_id,
Ilya Dryomov196e2d62019-11-05 15:38:46 +01005500 bool read_only, u64 *snap_features)
Alex Elderb1b54022012-07-03 16:01:19 -05005501{
Ilya Dryomov196e2d62019-11-05 15:38:46 +01005502 struct {
5503 __le64 snap_id;
5504 u8 read_only;
5505 } features_in;
Alex Elderb1b54022012-07-03 16:01:19 -05005506 struct {
5507 __le64 features;
5508 __le64 incompat;
Alex Elder41579762013-04-21 12:14:45 -05005509 } __attribute__ ((packed)) features_buf = { 0 };
Ilya Dryomovd3767f02016-04-13 14:15:50 +02005510 u64 unsup;
Alex Elderb1b54022012-07-03 16:01:19 -05005511 int ret;
5512
Ilya Dryomov196e2d62019-11-05 15:38:46 +01005513 features_in.snap_id = cpu_to_le64(snap_id);
5514 features_in.read_only = read_only;
5515
Ilya Dryomovecd4a682017-01-25 18:16:21 +01005516 ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
5517 &rbd_dev->header_oloc, "get_features",
Ilya Dryomov196e2d62019-11-05 15:38:46 +01005518 &features_in, sizeof(features_in),
Ilya Dryomovecd4a682017-01-25 18:16:21 +01005519 &features_buf, sizeof(features_buf));
Alex Elder36be9a72013-01-19 00:30:28 -06005520 dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
Alex Elderb1b54022012-07-03 16:01:19 -05005521 if (ret < 0)
5522 return ret;
Alex Elder57385b52013-04-21 12:14:45 -05005523 if (ret < sizeof (features_buf))
5524 return -ERANGE;
Alex Elderd8891402012-10-09 13:50:17 -07005525
Ilya Dryomovd3767f02016-04-13 14:15:50 +02005526 unsup = le64_to_cpu(features_buf.incompat) & ~RBD_FEATURES_SUPPORTED;
5527 if (unsup) {
5528 rbd_warn(rbd_dev, "image uses unsupported features: 0x%llx",
5529 unsup);
Alex Elderb8f5c6e2012-11-01 08:39:26 -05005530 return -ENXIO;
Ilya Dryomovd3767f02016-04-13 14:15:50 +02005531 }
Alex Elderd8891402012-10-09 13:50:17 -07005532
Alex Elderb1b54022012-07-03 16:01:19 -05005533 *snap_features = le64_to_cpu(features_buf.features);
5534
5535 dout(" snap_id 0x%016llx features = 0x%016llx incompat = 0x%016llx\n",
Alex Elder57385b52013-04-21 12:14:45 -05005536 (unsigned long long)snap_id,
5537 (unsigned long long)*snap_features,
5538 (unsigned long long)le64_to_cpu(features_buf.incompat));
Alex Elderb1b54022012-07-03 16:01:19 -05005539
5540 return 0;
5541}
5542
5543static int rbd_dev_v2_features(struct rbd_device *rbd_dev)
5544{
5545 return _rbd_dev_v2_snap_features(rbd_dev, CEPH_NOSNAP,
Ilya Dryomov196e2d62019-11-05 15:38:46 +01005546 rbd_is_ro(rbd_dev),
5547 &rbd_dev->header.features);
Alex Elderb1b54022012-07-03 16:01:19 -05005548}
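
/*
 * Editor's sketch, not part of the original driver: once
 * rbd_dev_v2_features() has run, the negotiated bits live in
 * rbd_dev->header.features and are tested with the RBD_FEATURE_* masks
 * defined earlier in this file, e.g. to check whether the image can be
 * layered (have a parent) at all.
 */
static bool __maybe_unused rbd_dev_example_is_layered(struct rbd_device *rbd_dev)
{
	return rbd_dev->header.features & RBD_FEATURE_LAYERING;
}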
5549
Ilya Dryomov22e8bd52019-06-05 19:25:11 +02005550/*
5551 * These are generic image flags, but since they are used only for
5552 * the object map, store them in rbd_dev->object_map_flags.
5553 *
5554 * For the same reason, this function is called only on object map
5555 * (re)load and not on header refresh.
5556 */
5557static int rbd_dev_v2_get_flags(struct rbd_device *rbd_dev)
5558{
5559 __le64 snapid = cpu_to_le64(rbd_dev->spec->snap_id);
5560 __le64 flags;
5561 int ret;
5562
5563 ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
5564 &rbd_dev->header_oloc, "get_flags",
5565 &snapid, sizeof(snapid),
5566 &flags, sizeof(flags));
5567 if (ret < 0)
5568 return ret;
5569 if (ret < sizeof(flags))
5570 return -EBADMSG;
5571
5572 rbd_dev->object_map_flags = le64_to_cpu(flags);
5573 return 0;
5574}
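
/*
 * Editor's sketch, not part of the original driver: a typical consumer of
 * the flags fetched above.  RBD_FLAG_OBJECT_MAP_INVALID is assumed to be
 * the "object map needs a rebuild" bit from rbd_types.h.
 */
static bool __maybe_unused rbd_object_map_example_invalid(struct rbd_device *rbd_dev)
{
	return rbd_dev->object_map_flags & RBD_FLAG_OBJECT_MAP_INVALID;
}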
5575
Ilya Dryomoveb3b2d62018-08-22 17:11:27 +02005576struct parent_image_info {
5577 u64 pool_id;
Ilya Dryomove92c0ea2018-08-22 17:26:10 +02005578 const char *pool_ns;
Ilya Dryomoveb3b2d62018-08-22 17:11:27 +02005579 const char *image_id;
5580 u64 snap_id;
5581
Ilya Dryomove92c0ea2018-08-22 17:26:10 +02005582 bool has_overlap;
Ilya Dryomoveb3b2d62018-08-22 17:11:27 +02005583 u64 overlap;
5584};
5585
5586/*
5587 * The caller is responsible for @pii.
5588 */
Ilya Dryomove92c0ea2018-08-22 17:26:10 +02005589static int decode_parent_image_spec(void **p, void *end,
5590 struct parent_image_info *pii)
5591{
5592 u8 struct_v;
5593 u32 struct_len;
5594 int ret;
5595
5596 ret = ceph_start_decoding(p, end, 1, "ParentImageSpec",
5597 &struct_v, &struct_len);
5598 if (ret)
5599 return ret;
5600
5601 ceph_decode_64_safe(p, end, pii->pool_id, e_inval);
5602 pii->pool_ns = ceph_extract_encoded_string(p, end, NULL, GFP_KERNEL);
5603 if (IS_ERR(pii->pool_ns)) {
5604 ret = PTR_ERR(pii->pool_ns);
5605 pii->pool_ns = NULL;
5606 return ret;
5607 }
5608 pii->image_id = ceph_extract_encoded_string(p, end, NULL, GFP_KERNEL);
5609 if (IS_ERR(pii->image_id)) {
5610 ret = PTR_ERR(pii->image_id);
5611 pii->image_id = NULL;
5612 return ret;
5613 }
5614 ceph_decode_64_safe(p, end, pii->snap_id, e_inval);
5615 return 0;
5616
5617e_inval:
5618 return -EINVAL;
5619}
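
/*
 * Editor's note (illustrative sketch, not from the original source):
 * decode_parent_image_spec() above walks a versioned ParentImageSpec blob
 * laid out roughly as
 *
 *	<struct_v, struct_compat, struct_len>	consumed by ceph_start_decoding()
 *	__le64  pool_id;
 *	string  pool_ns;	(__le32 length + bytes)
 *	string  image_id;	(__le32 length + bytes)
 *	__le64  snap_id;
 *
 * The two strings are kmalloc()'ed by ceph_extract_encoded_string() and
 * must be freed by whoever owns the parent_image_info, hence the
 * "caller is responsible for @pii" notes.
 */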
5620
5621static int __get_parent_info(struct rbd_device *rbd_dev,
5622 struct page *req_page,
5623 struct page *reply_page,
5624 struct parent_image_info *pii)
5625{
5626 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
5627 size_t reply_len = PAGE_SIZE;
5628 void *p, *end;
5629 int ret;
5630
5631 ret = ceph_osdc_call(osdc, &rbd_dev->header_oid, &rbd_dev->header_oloc,
5632 "rbd", "parent_get", CEPH_OSD_FLAG_READ,
Ilya Dryomov68ada912019-06-14 18:16:51 +02005633 req_page, sizeof(u64), &reply_page, &reply_len);
Ilya Dryomove92c0ea2018-08-22 17:26:10 +02005634 if (ret)
5635 return ret == -EOPNOTSUPP ? 1 : ret;
5636
5637 p = page_address(reply_page);
5638 end = p + reply_len;
5639 ret = decode_parent_image_spec(&p, end, pii);
5640 if (ret)
5641 return ret;
5642
5643 ret = ceph_osdc_call(osdc, &rbd_dev->header_oid, &rbd_dev->header_oloc,
5644 "rbd", "parent_overlap_get", CEPH_OSD_FLAG_READ,
Ilya Dryomov68ada912019-06-14 18:16:51 +02005645 req_page, sizeof(u64), &reply_page, &reply_len);
Ilya Dryomove92c0ea2018-08-22 17:26:10 +02005646 if (ret)
5647 return ret;
5648
5649 p = page_address(reply_page);
5650 end = p + reply_len;
5651 ceph_decode_8_safe(&p, end, pii->has_overlap, e_inval);
5652 if (pii->has_overlap)
5653 ceph_decode_64_safe(&p, end, pii->overlap, e_inval);
5654
5655 return 0;
5656
5657e_inval:
5658 return -EINVAL;
5659}
5660
5661/*
5662 * The caller is responsible for @pii.
5663 */
Ilya Dryomoveb3b2d62018-08-22 17:11:27 +02005664static int __get_parent_info_legacy(struct rbd_device *rbd_dev,
5665 struct page *req_page,
5666 struct page *reply_page,
5667 struct parent_image_info *pii)
5668{
5669 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
5670 size_t reply_len = PAGE_SIZE;
5671 void *p, *end;
5672 int ret;
5673
5674 ret = ceph_osdc_call(osdc, &rbd_dev->header_oid, &rbd_dev->header_oloc,
5675 "rbd", "get_parent", CEPH_OSD_FLAG_READ,
Ilya Dryomov68ada912019-06-14 18:16:51 +02005676 req_page, sizeof(u64), &reply_page, &reply_len);
Ilya Dryomoveb3b2d62018-08-22 17:11:27 +02005677 if (ret)
5678 return ret;
5679
5680 p = page_address(reply_page);
5681 end = p + reply_len;
5682 ceph_decode_64_safe(&p, end, pii->pool_id, e_inval);
5683 pii->image_id = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL);
5684 if (IS_ERR(pii->image_id)) {
5685 ret = PTR_ERR(pii->image_id);
5686 pii->image_id = NULL;
5687 return ret;
5688 }
5689 ceph_decode_64_safe(&p, end, pii->snap_id, e_inval);
Ilya Dryomove92c0ea2018-08-22 17:26:10 +02005690 pii->has_overlap = true;
Ilya Dryomoveb3b2d62018-08-22 17:11:27 +02005691 ceph_decode_64_safe(&p, end, pii->overlap, e_inval);
5692
5693 return 0;
5694
5695e_inval:
5696 return -EINVAL;
5697}
5698
5699static int get_parent_info(struct rbd_device *rbd_dev,
5700 struct parent_image_info *pii)
5701{
5702 struct page *req_page, *reply_page;
5703 void *p;
5704 int ret;
5705
5706 req_page = alloc_page(GFP_KERNEL);
5707 if (!req_page)
5708 return -ENOMEM;
5709
5710 reply_page = alloc_page(GFP_KERNEL);
5711 if (!reply_page) {
5712 __free_page(req_page);
5713 return -ENOMEM;
5714 }
5715
5716 p = page_address(req_page);
5717 ceph_encode_64(&p, rbd_dev->spec->snap_id);
Ilya Dryomove92c0ea2018-08-22 17:26:10 +02005718 ret = __get_parent_info(rbd_dev, req_page, reply_page, pii);
5719 if (ret > 0)
5720 ret = __get_parent_info_legacy(rbd_dev, req_page, reply_page,
5721 pii);
Ilya Dryomoveb3b2d62018-08-22 17:11:27 +02005722
5723 __free_page(req_page);
5724 __free_page(reply_page);
5725 return ret;
5726}
5727
Alex Elder86b00e02012-10-25 23:34:42 -05005728static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev)
5729{
5730 struct rbd_spec *parent_spec;
Ilya Dryomoveb3b2d62018-08-22 17:11:27 +02005731 struct parent_image_info pii = { 0 };
Alex Elder86b00e02012-10-25 23:34:42 -05005732 int ret;
5733
5734 parent_spec = rbd_spec_alloc();
5735 if (!parent_spec)
5736 return -ENOMEM;
5737
Ilya Dryomoveb3b2d62018-08-22 17:11:27 +02005738 ret = get_parent_info(rbd_dev, &pii);
5739 if (ret)
Alex Elder86b00e02012-10-25 23:34:42 -05005740 goto out_err;
5741
Ilya Dryomove92c0ea2018-08-22 17:26:10 +02005742 dout("%s pool_id %llu pool_ns %s image_id %s snap_id %llu has_overlap %d overlap %llu\n",
5743 __func__, pii.pool_id, pii.pool_ns, pii.image_id, pii.snap_id,
5744 pii.has_overlap, pii.overlap);
Ilya Dryomoveb3b2d62018-08-22 17:11:27 +02005745
Ilya Dryomove92c0ea2018-08-22 17:26:10 +02005746 if (pii.pool_id == CEPH_NOPOOL || !pii.has_overlap) {
Alex Elder392a9da2013-05-06 17:40:33 -05005747 /*
5748 * Either the parent never existed, or we have
5749		 * a record of it but the image got flattened so it no
5750 * longer has a parent. When the parent of a
5751 * layered image disappears we immediately set the
5752 * overlap to 0. The effect of this is that all new
5753 * requests will be treated as if the image had no
5754 * parent.
Ilya Dryomove92c0ea2018-08-22 17:26:10 +02005755 *
5756 * If !pii.has_overlap, the parent image spec is not
5757 * applicable. It's there to avoid duplication in each
5758 * snapshot record.
Alex Elder392a9da2013-05-06 17:40:33 -05005759 */
5760 if (rbd_dev->parent_overlap) {
5761 rbd_dev->parent_overlap = 0;
Alex Elder392a9da2013-05-06 17:40:33 -05005762 rbd_dev_parent_put(rbd_dev);
5763 pr_info("%s: clone image has been flattened\n",
5764 rbd_dev->disk->disk_name);
5765 }
5766
Alex Elder86b00e02012-10-25 23:34:42 -05005767 goto out; /* No parent? No problem. */
Alex Elder392a9da2013-05-06 17:40:33 -05005768 }
Alex Elder86b00e02012-10-25 23:34:42 -05005769
Alex Elder0903e872012-11-14 12:25:19 -06005770 /* The ceph file layout needs to fit pool id in 32 bits */
5771
5772 ret = -EIO;
Ilya Dryomoveb3b2d62018-08-22 17:11:27 +02005773 if (pii.pool_id > (u64)U32_MAX) {
Ilya Dryomov9584d502014-07-11 12:11:20 +04005774 rbd_warn(NULL, "parent pool id too large (%llu > %u)",
Ilya Dryomoveb3b2d62018-08-22 17:11:27 +02005775 (unsigned long long)pii.pool_id, U32_MAX);
Alex Elder57385b52013-04-21 12:14:45 -05005776 goto out_err;
Alex Elderc0cd10db2013-04-26 09:43:47 -05005777 }
Alex Elder0903e872012-11-14 12:25:19 -06005778
Alex Elder3b5cf2a2013-05-29 11:18:59 -05005779 /*
5780 * The parent won't change (except when the clone is
5781 * flattened, already handled that). So we only need to
5782	 * record the parent spec if we have not already done so.
5783 */
5784 if (!rbd_dev->parent_spec) {
Ilya Dryomoveb3b2d62018-08-22 17:11:27 +02005785 parent_spec->pool_id = pii.pool_id;
Ilya Dryomove92c0ea2018-08-22 17:26:10 +02005786 if (pii.pool_ns && *pii.pool_ns) {
5787 parent_spec->pool_ns = pii.pool_ns;
5788 pii.pool_ns = NULL;
5789 }
Ilya Dryomoveb3b2d62018-08-22 17:11:27 +02005790 parent_spec->image_id = pii.image_id;
5791 pii.image_id = NULL;
5792 parent_spec->snap_id = pii.snap_id;
Ilya Dryomovb26c0472018-07-03 15:28:43 +02005793
Alex Elder70cf49c2013-05-06 17:40:33 -05005794 rbd_dev->parent_spec = parent_spec;
5795 parent_spec = NULL; /* rbd_dev now owns this */
Alex Elder3b5cf2a2013-05-29 11:18:59 -05005796 }
5797
5798 /*
Ilya Dryomovcf32bd92015-01-19 22:57:39 +03005799 * We always update the parent overlap. If it's zero we issue
5800 * a warning, as we will proceed as if there was no parent.
Alex Elder3b5cf2a2013-05-29 11:18:59 -05005801 */
Ilya Dryomoveb3b2d62018-08-22 17:11:27 +02005802 if (!pii.overlap) {
Alex Elder3b5cf2a2013-05-29 11:18:59 -05005803 if (parent_spec) {
Ilya Dryomovcf32bd92015-01-19 22:57:39 +03005804 /* refresh, careful to warn just once */
5805 if (rbd_dev->parent_overlap)
5806 rbd_warn(rbd_dev,
5807 "clone now standalone (overlap became 0)");
Alex Elder3b5cf2a2013-05-29 11:18:59 -05005808 } else {
Ilya Dryomovcf32bd92015-01-19 22:57:39 +03005809 /* initial probe */
5810 rbd_warn(rbd_dev, "clone is standalone (overlap 0)");
Alex Elder3b5cf2a2013-05-29 11:18:59 -05005811 }
Alex Elder70cf49c2013-05-06 17:40:33 -05005812 }
Ilya Dryomoveb3b2d62018-08-22 17:11:27 +02005813 rbd_dev->parent_overlap = pii.overlap;
Ilya Dryomovcf32bd92015-01-19 22:57:39 +03005814
Alex Elder86b00e02012-10-25 23:34:42 -05005815out:
5816 ret = 0;
5817out_err:
Ilya Dryomove92c0ea2018-08-22 17:26:10 +02005818 kfree(pii.pool_ns);
Ilya Dryomoveb3b2d62018-08-22 17:11:27 +02005819 kfree(pii.image_id);
Alex Elder86b00e02012-10-25 23:34:42 -05005820 rbd_spec_put(parent_spec);
Alex Elder86b00e02012-10-25 23:34:42 -05005821 return ret;
5822}
5823
Alex Eldercc070d52013-04-21 12:14:45 -05005824static int rbd_dev_v2_striping_info(struct rbd_device *rbd_dev)
5825{
5826 struct {
5827 __le64 stripe_unit;
5828 __le64 stripe_count;
5829 } __attribute__ ((packed)) striping_info_buf = { 0 };
5830 size_t size = sizeof (striping_info_buf);
5831 void *p;
Alex Eldercc070d52013-04-21 12:14:45 -05005832 int ret;
5833
Ilya Dryomovecd4a682017-01-25 18:16:21 +01005834 ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
5835 &rbd_dev->header_oloc, "get_stripe_unit_count",
5836 NULL, 0, &striping_info_buf, size);
Alex Eldercc070d52013-04-21 12:14:45 -05005837 dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
5838 if (ret < 0)
5839 return ret;
5840 if (ret < size)
5841 return -ERANGE;
5842
Alex Eldercc070d52013-04-21 12:14:45 -05005843 p = &striping_info_buf;
Ilya Dryomovb1331852018-02-07 12:09:12 +01005844 rbd_dev->header.stripe_unit = ceph_decode_64(&p);
5845 rbd_dev->header.stripe_count = ceph_decode_64(&p);
Alex Eldercc070d52013-04-21 12:14:45 -05005846 return 0;
5847}
5848
Ilya Dryomov7e973322017-01-25 18:16:22 +01005849static int rbd_dev_v2_data_pool(struct rbd_device *rbd_dev)
5850{
5851 __le64 data_pool_id;
5852 int ret;
5853
5854 ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
5855 &rbd_dev->header_oloc, "get_data_pool",
5856 NULL, 0, &data_pool_id, sizeof(data_pool_id));
5857 if (ret < 0)
5858 return ret;
5859 if (ret < sizeof(data_pool_id))
5860 return -EBADMSG;
5861
5862 rbd_dev->header.data_pool_id = le64_to_cpu(data_pool_id);
5863 WARN_ON(rbd_dev->header.data_pool_id == CEPH_NOPOOL);
5864 return 0;
5865}
5866
Alex Elder9e15b772012-10-30 19:40:33 -05005867static char *rbd_dev_image_name(struct rbd_device *rbd_dev)
5868{
Ilya Dryomovecd4a682017-01-25 18:16:21 +01005869 CEPH_DEFINE_OID_ONSTACK(oid);
Alex Elder9e15b772012-10-30 19:40:33 -05005870 size_t image_id_size;
5871 char *image_id;
5872 void *p;
5873 void *end;
5874 size_t size;
5875 void *reply_buf = NULL;
5876 size_t len = 0;
5877 char *image_name = NULL;
5878 int ret;
5879
5880 rbd_assert(!rbd_dev->spec->image_name);
5881
Alex Elder69e7a022012-11-01 08:39:26 -05005882 len = strlen(rbd_dev->spec->image_id);
5883 image_id_size = sizeof (__le32) + len;
Alex Elder9e15b772012-10-30 19:40:33 -05005884 image_id = kmalloc(image_id_size, GFP_KERNEL);
5885 if (!image_id)
5886 return NULL;
5887
5888 p = image_id;
Alex Elder41579762013-04-21 12:14:45 -05005889 end = image_id + image_id_size;
Alex Elder57385b52013-04-21 12:14:45 -05005890 ceph_encode_string(&p, end, rbd_dev->spec->image_id, (u32)len);
Alex Elder9e15b772012-10-30 19:40:33 -05005891
5892 size = sizeof (__le32) + RBD_IMAGE_NAME_LEN_MAX;
5893 reply_buf = kmalloc(size, GFP_KERNEL);
5894 if (!reply_buf)
5895 goto out;
5896
Ilya Dryomovecd4a682017-01-25 18:16:21 +01005897 ceph_oid_printf(&oid, "%s", RBD_DIRECTORY);
5898 ret = rbd_obj_method_sync(rbd_dev, &oid, &rbd_dev->header_oloc,
5899 "dir_get_name", image_id, image_id_size,
5900 reply_buf, size);
Alex Elder9e15b772012-10-30 19:40:33 -05005901 if (ret < 0)
5902 goto out;
5903 p = reply_buf;
Alex Elderf40eb342013-04-25 15:09:42 -05005904 end = reply_buf + ret;
5905
Alex Elder9e15b772012-10-30 19:40:33 -05005906 image_name = ceph_extract_encoded_string(&p, end, &len, GFP_KERNEL);
5907 if (IS_ERR(image_name))
5908 image_name = NULL;
5909 else
5910 dout("%s: name is %s len is %zd\n", __func__, image_name, len);
5911out:
5912 kfree(reply_buf);
5913 kfree(image_id);
5914
5915 return image_name;
5916}
5917
Alex Elder2ad3d712013-04-30 00:44:33 -05005918static u64 rbd_v1_snap_id_by_name(struct rbd_device *rbd_dev, const char *name)
5919{
5920 struct ceph_snap_context *snapc = rbd_dev->header.snapc;
5921 const char *snap_name;
5922 u32 which = 0;
5923
5924 /* Skip over names until we find the one we are looking for */
5925
5926 snap_name = rbd_dev->header.snap_names;
5927 while (which < snapc->num_snaps) {
5928 if (!strcmp(name, snap_name))
5929 return snapc->snaps[which];
5930 snap_name += strlen(snap_name) + 1;
5931 which++;
5932 }
5933 return CEPH_NOSNAP;
5934}
5935
5936static u64 rbd_v2_snap_id_by_name(struct rbd_device *rbd_dev, const char *name)
5937{
5938 struct ceph_snap_context *snapc = rbd_dev->header.snapc;
5939 u32 which;
5940 bool found = false;
5941 u64 snap_id;
5942
5943 for (which = 0; !found && which < snapc->num_snaps; which++) {
5944 const char *snap_name;
5945
5946 snap_id = snapc->snaps[which];
5947 snap_name = rbd_dev_v2_snap_name(rbd_dev, snap_id);
Josh Durginefadc982013-08-29 19:16:42 -07005948 if (IS_ERR(snap_name)) {
5949 /* ignore no-longer existing snapshots */
5950 if (PTR_ERR(snap_name) == -ENOENT)
5951 continue;
5952 else
5953 break;
5954 }
Alex Elder2ad3d712013-04-30 00:44:33 -05005955 found = !strcmp(name, snap_name);
5956 kfree(snap_name);
5957 }
5958 return found ? snap_id : CEPH_NOSNAP;
5959}
5960
5961/*
5962 * Assumes name is never RBD_SNAP_HEAD_NAME; returns CEPH_NOSNAP if
5963 * no snapshot by that name is found, or if an error occurs.
5964 */
5965static u64 rbd_snap_id_by_name(struct rbd_device *rbd_dev, const char *name)
5966{
5967 if (rbd_dev->image_format == 1)
5968 return rbd_v1_snap_id_by_name(rbd_dev, name);
5969
5970 return rbd_v2_snap_id_by_name(rbd_dev, name);
5971}
5972
Alex Elder9e15b772012-10-30 19:40:33 -05005973/*
Ilya Dryomov04077592014-07-23 17:11:20 +04005974 * An image being mapped will have everything but the snap id.
Alex Elder9e15b772012-10-30 19:40:33 -05005975 */
Ilya Dryomov04077592014-07-23 17:11:20 +04005976static int rbd_spec_fill_snap_id(struct rbd_device *rbd_dev)
5977{
5978 struct rbd_spec *spec = rbd_dev->spec;
5979
5980 rbd_assert(spec->pool_id != CEPH_NOPOOL && spec->pool_name);
5981 rbd_assert(spec->image_id && spec->image_name);
5982 rbd_assert(spec->snap_name);
5983
5984 if (strcmp(spec->snap_name, RBD_SNAP_HEAD_NAME)) {
5985 u64 snap_id;
5986
5987 snap_id = rbd_snap_id_by_name(rbd_dev, spec->snap_name);
5988 if (snap_id == CEPH_NOSNAP)
5989 return -ENOENT;
5990
5991 spec->snap_id = snap_id;
5992 } else {
5993 spec->snap_id = CEPH_NOSNAP;
5994 }
5995
5996 return 0;
5997}
5998
5999/*
6000 * A parent image will have all ids but none of the names.
6001 *
6002 * All names in an rbd spec are dynamically allocated. It's OK if we
6003 * can't figure out the name for an image id.
6004 */
6005static int rbd_spec_fill_names(struct rbd_device *rbd_dev)
Alex Elder9e15b772012-10-30 19:40:33 -05006006{
Alex Elder2e9f7f12013-04-26 09:43:48 -05006007 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
6008 struct rbd_spec *spec = rbd_dev->spec;
6009 const char *pool_name;
6010 const char *image_name;
6011 const char *snap_name;
Alex Elder9e15b772012-10-30 19:40:33 -05006012 int ret;
6013
Ilya Dryomov04077592014-07-23 17:11:20 +04006014 rbd_assert(spec->pool_id != CEPH_NOPOOL);
6015 rbd_assert(spec->image_id);
6016 rbd_assert(spec->snap_id != CEPH_NOSNAP);
Alex Elder9e15b772012-10-30 19:40:33 -05006017
Alex Elder2e9f7f12013-04-26 09:43:48 -05006018 /* Get the pool name; we have to make our own copy of this */
Alex Elder9e15b772012-10-30 19:40:33 -05006019
Alex Elder2e9f7f12013-04-26 09:43:48 -05006020 pool_name = ceph_pg_pool_name_by_id(osdc->osdmap, spec->pool_id);
6021 if (!pool_name) {
6022 rbd_warn(rbd_dev, "no pool with id %llu", spec->pool_id);
Alex Elder935dc892012-11-01 10:17:15 -05006023 return -EIO;
6024 }
Alex Elder2e9f7f12013-04-26 09:43:48 -05006025 pool_name = kstrdup(pool_name, GFP_KERNEL);
6026 if (!pool_name)
Alex Elder9e15b772012-10-30 19:40:33 -05006027 return -ENOMEM;
6028
6029 /* Fetch the image name; tolerate failure here */
6030
Alex Elder2e9f7f12013-04-26 09:43:48 -05006031 image_name = rbd_dev_image_name(rbd_dev);
6032 if (!image_name)
Alex Elder06ecc6c2012-11-01 10:17:15 -05006033 rbd_warn(rbd_dev, "unable to get image name");
Alex Elder9e15b772012-10-30 19:40:33 -05006034
Ilya Dryomov04077592014-07-23 17:11:20 +04006035 /* Fetch the snapshot name */
Alex Elder9e15b772012-10-30 19:40:33 -05006036
Alex Elder2e9f7f12013-04-26 09:43:48 -05006037 snap_name = rbd_snap_name(rbd_dev, spec->snap_id);
Josh Durginda6a6b62013-09-04 17:57:31 -07006038 if (IS_ERR(snap_name)) {
6039 ret = PTR_ERR(snap_name);
Alex Elder9e15b772012-10-30 19:40:33 -05006040 goto out_err;
Alex Elder2e9f7f12013-04-26 09:43:48 -05006041 }
6042
6043 spec->pool_name = pool_name;
6044 spec->image_name = image_name;
6045 spec->snap_name = snap_name;
Alex Elder9e15b772012-10-30 19:40:33 -05006046
6047 return 0;
Ilya Dryomov04077592014-07-23 17:11:20 +04006048
Alex Elder9e15b772012-10-30 19:40:33 -05006049out_err:
Alex Elder2e9f7f12013-04-26 09:43:48 -05006050 kfree(image_name);
6051 kfree(pool_name);
Alex Elder9e15b772012-10-30 19:40:33 -05006052 return ret;
6053}
6054
Alex Eldercc4a38bd2013-04-30 00:44:33 -05006055static int rbd_dev_v2_snap_context(struct rbd_device *rbd_dev)
Alex Elder35d489f2012-07-03 16:01:19 -05006056{
6057 size_t size;
6058 int ret;
6059 void *reply_buf;
6060 void *p;
6061 void *end;
6062 u64 seq;
6063 u32 snap_count;
6064 struct ceph_snap_context *snapc;
6065 u32 i;
6066
6067 /*
6068 * We'll need room for the seq value (maximum snapshot id),
6069 * snapshot count, and array of that many snapshot ids.
6070 * For now we have a fixed upper limit on the number we're
6071 * prepared to receive.
6072 */
6073 size = sizeof (__le64) + sizeof (__le32) +
6074 RBD_MAX_SNAP_COUNT * sizeof (__le64);
6075 reply_buf = kzalloc(size, GFP_KERNEL);
6076 if (!reply_buf)
6077 return -ENOMEM;
6078
Ilya Dryomovecd4a682017-01-25 18:16:21 +01006079 ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
6080 &rbd_dev->header_oloc, "get_snapcontext",
6081 NULL, 0, reply_buf, size);
Alex Elder36be9a72013-01-19 00:30:28 -06006082 dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
Alex Elder35d489f2012-07-03 16:01:19 -05006083 if (ret < 0)
6084 goto out;
6085
Alex Elder35d489f2012-07-03 16:01:19 -05006086 p = reply_buf;
Alex Elder57385b52013-04-21 12:14:45 -05006087 end = reply_buf + ret;
6088 ret = -ERANGE;
Alex Elder35d489f2012-07-03 16:01:19 -05006089 ceph_decode_64_safe(&p, end, seq, out);
6090 ceph_decode_32_safe(&p, end, snap_count, out);
6091
6092 /*
6093 * Make sure the reported number of snapshot ids wouldn't go
6094 * beyond the end of our buffer. But before checking that,
6095 * make sure the computed size of the snapshot context we
6096 * allocate is representable in a size_t.
6097 */
6098 if (snap_count > (SIZE_MAX - sizeof (struct ceph_snap_context))
6099 / sizeof (u64)) {
6100 ret = -EINVAL;
6101 goto out;
6102 }
6103 if (!ceph_has_room(&p, end, snap_count * sizeof (__le64)))
6104 goto out;
Alex Elder468521c2013-04-26 09:43:47 -05006105 ret = 0;
Alex Elder35d489f2012-07-03 16:01:19 -05006106
Alex Elder812164f82013-04-30 00:44:32 -05006107 snapc = ceph_create_snap_context(snap_count, GFP_KERNEL);
Alex Elder35d489f2012-07-03 16:01:19 -05006108 if (!snapc) {
6109 ret = -ENOMEM;
6110 goto out;
6111 }
Alex Elder35d489f2012-07-03 16:01:19 -05006112 snapc->seq = seq;
Alex Elder35d489f2012-07-03 16:01:19 -05006113 for (i = 0; i < snap_count; i++)
6114 snapc->snaps[i] = ceph_decode_64(&p);
6115
Alex Elder49ece552013-05-06 08:37:00 -05006116 ceph_put_snap_context(rbd_dev->header.snapc);
Alex Elder35d489f2012-07-03 16:01:19 -05006117 rbd_dev->header.snapc = snapc;
6118
6119 dout(" snap context seq = %llu, snap_count = %u\n",
Alex Elder57385b52013-04-21 12:14:45 -05006120 (unsigned long long)seq, (unsigned int)snap_count);
Alex Elder35d489f2012-07-03 16:01:19 -05006121out:
6122 kfree(reply_buf);
6123
Alex Elder57385b52013-04-21 12:14:45 -05006124 return ret;
Alex Elder35d489f2012-07-03 16:01:19 -05006125}
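
/*
 * Editor's note (illustrative sketch, not from the original source): the
 * "get_snapcontext" reply decoded above is simply
 *
 *	__le64  seq;			highest snapshot id so far
 *	__le32  snap_count;
 *	__le64  snaps[snap_count];	snapshot ids
 *
 * which is why the reply buffer is sized for at most RBD_MAX_SNAP_COUNT
 * ids plus the two fixed fields.
 */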
6126
Alex Elder54cac612013-04-30 00:44:33 -05006127static const char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev,
6128 u64 snap_id)
Alex Elderb8b1e2d2012-07-03 16:01:19 -05006129{
6130 size_t size;
6131 void *reply_buf;
Alex Elder54cac612013-04-30 00:44:33 -05006132 __le64 snapid;
Alex Elderb8b1e2d2012-07-03 16:01:19 -05006133 int ret;
6134 void *p;
6135 void *end;
Alex Elderb8b1e2d2012-07-03 16:01:19 -05006136 char *snap_name;
6137
6138 size = sizeof (__le32) + RBD_MAX_SNAP_NAME_LEN;
6139 reply_buf = kmalloc(size, GFP_KERNEL);
6140 if (!reply_buf)
6141 return ERR_PTR(-ENOMEM);
6142
Alex Elder54cac612013-04-30 00:44:33 -05006143 snapid = cpu_to_le64(snap_id);
Ilya Dryomovecd4a682017-01-25 18:16:21 +01006144 ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
6145 &rbd_dev->header_oloc, "get_snapshot_name",
6146 &snapid, sizeof(snapid), reply_buf, size);
Alex Elder36be9a72013-01-19 00:30:28 -06006147 dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
Alex Elderf40eb342013-04-25 15:09:42 -05006148 if (ret < 0) {
6149 snap_name = ERR_PTR(ret);
Alex Elderb8b1e2d2012-07-03 16:01:19 -05006150 goto out;
Alex Elderf40eb342013-04-25 15:09:42 -05006151 }
Alex Elderb8b1e2d2012-07-03 16:01:19 -05006152
6153 p = reply_buf;
Alex Elderf40eb342013-04-25 15:09:42 -05006154 end = reply_buf + ret;
Alex Eldere5c35532012-10-25 23:34:41 -05006155 snap_name = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL);
Alex Elderf40eb342013-04-25 15:09:42 -05006156 if (IS_ERR(snap_name))
Alex Elderb8b1e2d2012-07-03 16:01:19 -05006157 goto out;
Alex Elderb8b1e2d2012-07-03 16:01:19 -05006158
Alex Elderf40eb342013-04-25 15:09:42 -05006159 dout(" snap_id 0x%016llx snap_name = %s\n",
Alex Elder54cac612013-04-30 00:44:33 -05006160 (unsigned long long)snap_id, snap_name);
Alex Elderb8b1e2d2012-07-03 16:01:19 -05006161out:
6162 kfree(reply_buf);
6163
Alex Elderf40eb342013-04-25 15:09:42 -05006164 return snap_name;
Alex Elderb8b1e2d2012-07-03 16:01:19 -05006165}
6166
Alex Elder2df3fac2013-05-06 09:51:30 -05006167static int rbd_dev_v2_header_info(struct rbd_device *rbd_dev)
Alex Elder117973f2012-08-31 17:29:55 -05006168{
Alex Elder2df3fac2013-05-06 09:51:30 -05006169 bool first_time = rbd_dev->header.object_prefix == NULL;
Alex Elder117973f2012-08-31 17:29:55 -05006170 int ret;
Alex Elder117973f2012-08-31 17:29:55 -05006171
Josh Durgin1617e402013-06-12 14:43:10 -07006172 ret = rbd_dev_v2_image_size(rbd_dev);
6173 if (ret)
Alex Eldercfbf6372013-05-31 17:40:45 -05006174 return ret;
Josh Durgin1617e402013-06-12 14:43:10 -07006175
Alex Elder2df3fac2013-05-06 09:51:30 -05006176 if (first_time) {
6177 ret = rbd_dev_v2_header_onetime(rbd_dev);
6178 if (ret)
Alex Eldercfbf6372013-05-31 17:40:45 -05006179 return ret;
Alex Elder2df3fac2013-05-06 09:51:30 -05006180 }
6181
Alex Eldercc4a38bd2013-04-30 00:44:33 -05006182 ret = rbd_dev_v2_snap_context(rbd_dev);
Ilya Dryomovd194cd12015-08-31 18:22:10 +03006183 if (ret && first_time) {
6184 kfree(rbd_dev->header.object_prefix);
6185 rbd_dev->header.object_prefix = NULL;
6186 }
Alex Elder117973f2012-08-31 17:29:55 -05006187
6188 return ret;
6189}
6190
Ilya Dryomova720ae02014-07-23 17:11:19 +04006191static int rbd_dev_header_info(struct rbd_device *rbd_dev)
6192{
6193 rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
6194
6195 if (rbd_dev->image_format == 1)
6196 return rbd_dev_v1_header_info(rbd_dev);
6197
6198 return rbd_dev_v2_header_info(rbd_dev);
6199}
6200
Alex Elder1ddbe942012-01-29 13:57:44 -06006201/*
Alex Eldere28fff262012-02-02 08:13:30 -06006202 * Skips over white space at *buf, and updates *buf to point to the
6203 * first found non-space character (if any). Returns the length of
Alex Elder593a9e72012-02-07 12:03:37 -06006204 * the token (string of non-white space characters) found. Note
6205 * that *buf must be terminated with '\0'.
Alex Eldere28fff262012-02-02 08:13:30 -06006206 */
6207static inline size_t next_token(const char **buf)
6208{
6209 /*
6210 * These are the characters that produce nonzero for
6211 * isspace() in the "C" and "POSIX" locales.
6212 */
6213 const char *spaces = " \f\n\r\t\v";
6214
6215 *buf += strspn(*buf, spaces); /* Find start of token */
6216
6217 return strcspn(*buf, spaces); /* Return token length */
6218}
6219
6220/*
Alex Elderea3352f2012-07-09 21:04:23 -05006221 * Finds the next token in *buf, dynamically allocates a buffer big
6222 * enough to hold a copy of it, and copies the token into the new
6223 * buffer. The copy is guaranteed to be terminated with '\0'. Note
6224 * that a duplicate buffer is created even for a zero-length token.
6225 *
6226 * Returns a pointer to the newly-allocated duplicate, or a null
6227 * pointer if memory for the duplicate was not available. If
6228 * the lenp argument is a non-null pointer, the length of the token
6229 * (not including the '\0') is returned in *lenp.
6230 *
6231 * If successful, the *buf pointer will be updated to point beyond
6232 * the end of the found token.
6233 *
6234 * Note: uses GFP_KERNEL for allocation.
6235 */
6236static inline char *dup_token(const char **buf, size_t *lenp)
6237{
6238 char *dup;
6239 size_t len;
6240
6241 len = next_token(buf);
Alex Elder4caf35f2012-11-01 08:39:27 -05006242 dup = kmemdup(*buf, len + 1, GFP_KERNEL);
Alex Elderea3352f2012-07-09 21:04:23 -05006243 if (!dup)
6244 return NULL;
Alex Elderea3352f2012-07-09 21:04:23 -05006245 *(dup + len) = '\0';
6246 *buf += len;
6247
6248 if (lenp)
6249 *lenp = len;
6250
6251 return dup;
6252}
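
/*
 * Editor's sketch, not part of the original driver: how the two helpers
 * above cooperate when picking apart an "rbd add" style buffer.  Names
 * and the sample buffer are purely illustrative.
 */
static int __maybe_unused dup_token_example(void)
{
	const char *buf = "rbd myimage";	/* pool name, image name */
	char *pool, *image;
	size_t len;

	pool = dup_token(&buf, &len);		/* "rbd", len == 3 */
	if (!pool)
		return -ENOMEM;

	image = dup_token(&buf, NULL);		/* "myimage" */
	if (!image) {
		kfree(pool);
		return -ENOMEM;
	}

	kfree(image);
	kfree(pool);
	return 0;
}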
6253
David Howells82995cc2019-03-25 16:38:32 +00006254static int rbd_parse_param(struct fs_parameter *param,
6255 struct rbd_parse_opts_ctx *pctx)
6256{
6257 struct rbd_options *opt = pctx->opts;
6258 struct fs_parse_result result;
Al Viro3fbb8d52019-12-20 23:43:32 -05006259 struct p_log log = {.prefix = "rbd"};
David Howells82995cc2019-03-25 16:38:32 +00006260 int token, ret;
6261
6262 ret = ceph_parse_param(param, pctx->copts, NULL);
6263 if (ret != -ENOPARAM)
6264 return ret;
6265
Al Virod7167b12019-09-07 07:23:15 -04006266 token = __fs_parse(&log, rbd_parameters, param, &result);
David Howells82995cc2019-03-25 16:38:32 +00006267 dout("%s fs_parse '%s' token %d\n", __func__, param->key, token);
6268 if (token < 0) {
Al Viro2c3f3dc2019-12-20 23:43:32 -05006269 if (token == -ENOPARAM)
6270 return inval_plog(&log, "Unknown parameter '%s'",
6271 param->key);
David Howells82995cc2019-03-25 16:38:32 +00006272 return token;
6273 }
6274
6275 switch (token) {
6276 case Opt_queue_depth:
6277 if (result.uint_32 < 1)
6278 goto out_of_range;
6279 opt->queue_depth = result.uint_32;
6280 break;
6281 case Opt_alloc_size:
6282 if (result.uint_32 < SECTOR_SIZE)
6283 goto out_of_range;
Al Viro2c3f3dc2019-12-20 23:43:32 -05006284 if (!is_power_of_2(result.uint_32))
6285 return inval_plog(&log, "alloc_size must be a power of 2");
David Howells82995cc2019-03-25 16:38:32 +00006286 opt->alloc_size = result.uint_32;
6287 break;
6288 case Opt_lock_timeout:
6289 /* 0 is "wait forever" (i.e. infinite timeout) */
6290 if (result.uint_32 > INT_MAX / 1000)
6291 goto out_of_range;
6292 opt->lock_timeout = msecs_to_jiffies(result.uint_32 * 1000);
6293 break;
6294 case Opt_pool_ns:
6295 kfree(pctx->spec->pool_ns);
6296 pctx->spec->pool_ns = param->string;
6297 param->string = NULL;
6298 break;
Ilya Dryomovdc1dad82020-05-29 20:51:23 +02006299 case Opt_compression_hint:
6300 switch (result.uint_32) {
6301 case Opt_compression_hint_none:
6302 opt->alloc_hint_flags &=
6303 ~(CEPH_OSD_ALLOC_HINT_FLAG_COMPRESSIBLE |
6304 CEPH_OSD_ALLOC_HINT_FLAG_INCOMPRESSIBLE);
6305 break;
6306 case Opt_compression_hint_compressible:
6307 opt->alloc_hint_flags |=
6308 CEPH_OSD_ALLOC_HINT_FLAG_COMPRESSIBLE;
6309 opt->alloc_hint_flags &=
6310 ~CEPH_OSD_ALLOC_HINT_FLAG_INCOMPRESSIBLE;
6311 break;
6312 case Opt_compression_hint_incompressible:
6313 opt->alloc_hint_flags |=
6314 CEPH_OSD_ALLOC_HINT_FLAG_INCOMPRESSIBLE;
6315 opt->alloc_hint_flags &=
6316 ~CEPH_OSD_ALLOC_HINT_FLAG_COMPRESSIBLE;
6317 break;
6318 default:
6319 BUG();
6320 }
6321 break;
David Howells82995cc2019-03-25 16:38:32 +00006322 case Opt_read_only:
6323 opt->read_only = true;
6324 break;
6325 case Opt_read_write:
6326 opt->read_only = false;
6327 break;
6328 case Opt_lock_on_read:
6329 opt->lock_on_read = true;
6330 break;
6331 case Opt_exclusive:
6332 opt->exclusive = true;
6333 break;
6334 case Opt_notrim:
6335 opt->trim = false;
6336 break;
6337 default:
6338 BUG();
6339 }
6340
6341 return 0;
6342
6343out_of_range:
Al Viro2c3f3dc2019-12-20 23:43:32 -05006344 return inval_plog(&log, "%s out of range", param->key);
David Howells82995cc2019-03-25 16:38:32 +00006345}
6346
6347/*
6348 * This duplicates most of generic_parse_monolithic(), untying it from
6349 * fs_context and skipping standard superblock and security options.
6350 */
6351static int rbd_parse_options(char *options, struct rbd_parse_opts_ctx *pctx)
6352{
6353 char *key;
6354 int ret = 0;
6355
6356 dout("%s '%s'\n", __func__, options);
6357 while ((key = strsep(&options, ",")) != NULL) {
6358 if (*key) {
6359 struct fs_parameter param = {
6360 .key = key,
Al Viro0f895892019-12-17 14:15:04 -05006361 .type = fs_value_is_flag,
David Howells82995cc2019-03-25 16:38:32 +00006362 };
6363 char *value = strchr(key, '=');
6364 size_t v_len = 0;
6365
6366 if (value) {
6367 if (value == key)
6368 continue;
6369 *value++ = 0;
6370 v_len = strlen(value);
David Howells82995cc2019-03-25 16:38:32 +00006371 param.string = kmemdup_nul(value, v_len,
6372 GFP_KERNEL);
6373 if (!param.string)
6374 return -ENOMEM;
Al Viro0f895892019-12-17 14:15:04 -05006375 param.type = fs_value_is_string;
David Howells82995cc2019-03-25 16:38:32 +00006376 }
6377 param.size = v_len;
6378
6379 ret = rbd_parse_param(&param, pctx);
6380 kfree(param.string);
6381 if (ret)
6382 break;
6383 }
6384 }
6385
6386 return ret;
6387}
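
/*
 * Editor's note (illustrative, not from the original source): the options
 * string handed to rbd_parse_options() is the comma-separated <options>
 * token from an "rbd add" write, mixing libceph parameters (consumed by
 * ceph_parse_param()) with the rbd-specific ones handled in
 * rbd_parse_param() above, e.g. something of the form
 *
 *	"name=admin,queue_depth=128,alloc_size=65536,lock_on_read"
 *
 * The exact parameter spellings come from the rbd_parameters table
 * defined earlier in this file.
 */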
6388
Alex Elderea3352f2012-07-09 21:04:23 -05006389/*
Alex Elder859c31d2012-10-25 23:34:42 -05006390 * Parse the options provided for an "rbd add" (i.e., rbd image
6391 * mapping) request. These arrive via a write to /sys/bus/rbd/add,
6392 * and the data written is passed here via a NUL-terminated buffer.
6393 * Returns 0 if successful or an error code otherwise.
Alex Elderd22f76e2012-07-12 10:46:35 -05006394 *
Alex Elder859c31d2012-10-25 23:34:42 -05006395 * The information extracted from these options is recorded in
6396 * the other parameters which return dynamically-allocated
6397 * structures:
6398 * ceph_opts
6399 * The address of a pointer that will refer to a ceph options
6400 * structure. Caller must release the returned pointer using
6401 * ceph_destroy_options() when it is no longer needed.
6402 * rbd_opts
6403 * Address of an rbd options pointer. Fully initialized by
6404 * this function; caller must release with kfree().
6405 * spec
6406 * Address of an rbd image specification pointer. Fully
6407 * initialized by this function based on parsed options.
6408 * Caller must release with rbd_spec_put().
6409 *
6410 * The options passed take this form:
6411 *  <mon_addrs> <options> <pool_name> <image_name> [<snap_name>]
6412 * where:
6413 * <mon_addrs>
6414 * A comma-separated list of one or more monitor addresses.
6415 * A monitor address is an ip address, optionally followed
6416 * by a port number (separated by a colon).
6417 * I.e.: ip1[:port1][,ip2[:port2]...]
6418 * <options>
6419 * A comma-separated list of ceph and/or rbd options.
6420 * <pool_name>
6421 * The name of the rados pool containing the rbd image.
6422 * <image_name>
6423 * The name of the image in that pool to map.
6424 *  <snap_name>
6425 *	An optional snapshot name.  If provided, the mapping will
6426 *	present data from the image at the time that snapshot was
6427 *	created.  The image head is used if no snapshot name is
6428 *	provided.  Snapshot mappings are always read-only.
Alex Eldera725f65e2012-02-02 08:13:30 -06006429 */
Alex Elder859c31d2012-10-25 23:34:42 -05006430static int rbd_add_parse_args(const char *buf,
Alex Elderdc79b112012-10-25 23:34:41 -05006431 struct ceph_options **ceph_opts,
Alex Elder859c31d2012-10-25 23:34:42 -05006432 struct rbd_options **opts,
6433 struct rbd_spec **rbd_spec)
Alex Eldera725f65e2012-02-02 08:13:30 -06006434{
Alex Elderd22f76e2012-07-12 10:46:35 -05006435 size_t len;
Alex Elder859c31d2012-10-25 23:34:42 -05006436 char *options;
Alex Elder0ddebc02012-10-25 23:34:41 -05006437 const char *mon_addrs;
Alex Elderecb4dc22013-04-26 09:43:47 -05006438 char *snap_name;
Alex Elder0ddebc02012-10-25 23:34:41 -05006439 size_t mon_addrs_size;
David Howells82995cc2019-03-25 16:38:32 +00006440 struct rbd_parse_opts_ctx pctx = { 0 };
Alex Elderdc79b112012-10-25 23:34:41 -05006441 int ret;
Alex Eldere28fff262012-02-02 08:13:30 -06006442
6443 /* The first four tokens are required */
6444
Alex Elder7ef32142012-02-02 08:13:30 -06006445 len = next_token(&buf);
Alex Elder4fb5d6712012-11-01 10:17:15 -05006446 if (!len) {
6447 rbd_warn(NULL, "no monitor address(es) provided");
6448 return -EINVAL;
6449 }
Alex Elder0ddebc02012-10-25 23:34:41 -05006450 mon_addrs = buf;
David Howells82995cc2019-03-25 16:38:32 +00006451 mon_addrs_size = len;
Alex Elder7ef32142012-02-02 08:13:30 -06006452 buf += len;
Alex Eldera725f65e2012-02-02 08:13:30 -06006453
Alex Elderdc79b112012-10-25 23:34:41 -05006454 ret = -EINVAL;
Alex Elderf28e5652012-10-25 23:34:41 -05006455 options = dup_token(&buf, NULL);
6456 if (!options)
Alex Elderdc79b112012-10-25 23:34:41 -05006457 return -ENOMEM;
Alex Elder4fb5d6712012-11-01 10:17:15 -05006458 if (!*options) {
6459 rbd_warn(NULL, "no options provided");
6460 goto out_err;
6461 }
Alex Eldera725f65e2012-02-02 08:13:30 -06006462
Ilya Dryomovc3001562018-07-03 15:28:43 +02006463 pctx.spec = rbd_spec_alloc();
6464 if (!pctx.spec)
Alex Elderf28e5652012-10-25 23:34:41 -05006465 goto out_mem;
Alex Elder859c31d2012-10-25 23:34:42 -05006466
Ilya Dryomovc3001562018-07-03 15:28:43 +02006467 pctx.spec->pool_name = dup_token(&buf, NULL);
6468 if (!pctx.spec->pool_name)
Alex Elder859c31d2012-10-25 23:34:42 -05006469 goto out_mem;
Ilya Dryomovc3001562018-07-03 15:28:43 +02006470 if (!*pctx.spec->pool_name) {
Alex Elder4fb5d6712012-11-01 10:17:15 -05006471 rbd_warn(NULL, "no pool name provided");
6472 goto out_err;
6473 }
Alex Eldere28fff262012-02-02 08:13:30 -06006474
Ilya Dryomovc3001562018-07-03 15:28:43 +02006475 pctx.spec->image_name = dup_token(&buf, NULL);
6476 if (!pctx.spec->image_name)
Alex Elderf28e5652012-10-25 23:34:41 -05006477 goto out_mem;
Ilya Dryomovc3001562018-07-03 15:28:43 +02006478 if (!*pctx.spec->image_name) {
Alex Elder4fb5d6712012-11-01 10:17:15 -05006479 rbd_warn(NULL, "no image name provided");
6480 goto out_err;
6481 }
Alex Eldere28fff262012-02-02 08:13:30 -06006482
Alex Elderf28e5652012-10-25 23:34:41 -05006483 /*
6484 * Snapshot name is optional; default is to use "-"
6485 * (indicating the head/no snapshot).
6486 */
Alex Elder3feeb8942012-08-31 17:29:52 -05006487 len = next_token(&buf);
Alex Elder820a5f32012-07-09 21:04:24 -05006488 if (!len) {
Alex Elder3feeb8942012-08-31 17:29:52 -05006489 buf = RBD_SNAP_HEAD_NAME; /* No snapshot supplied */
6490 len = sizeof (RBD_SNAP_HEAD_NAME) - 1;
Alex Elderf28e5652012-10-25 23:34:41 -05006491 } else if (len > RBD_MAX_SNAP_NAME_LEN) {
Alex Elderdc79b112012-10-25 23:34:41 -05006492 ret = -ENAMETOOLONG;
Alex Elderf28e5652012-10-25 23:34:41 -05006493 goto out_err;
Alex Elder849b4262012-07-09 21:04:24 -05006494 }
Alex Elderecb4dc22013-04-26 09:43:47 -05006495 snap_name = kmemdup(buf, len + 1, GFP_KERNEL);
6496 if (!snap_name)
Alex Elderf28e5652012-10-25 23:34:41 -05006497 goto out_mem;
Alex Elderecb4dc22013-04-26 09:43:47 -05006498 *(snap_name + len) = '\0';
Ilya Dryomovc3001562018-07-03 15:28:43 +02006499 pctx.spec->snap_name = snap_name;
Alex Eldere5c35532012-10-25 23:34:41 -05006500
David Howells82995cc2019-03-25 16:38:32 +00006501 pctx.copts = ceph_alloc_options();
6502 if (!pctx.copts)
6503 goto out_mem;
6504
Alex Elder0ddebc02012-10-25 23:34:41 -05006505 /* Initialize all rbd options to the defaults */
Alex Eldere28fff262012-02-02 08:13:30 -06006506
Ilya Dryomovc3001562018-07-03 15:28:43 +02006507 pctx.opts = kzalloc(sizeof(*pctx.opts), GFP_KERNEL);
6508 if (!pctx.opts)
Alex Elder4e9afeb2012-10-25 23:34:41 -05006509 goto out_mem;
6510
Ilya Dryomovc3001562018-07-03 15:28:43 +02006511 pctx.opts->read_only = RBD_READ_ONLY_DEFAULT;
6512 pctx.opts->queue_depth = RBD_QUEUE_DEPTH_DEFAULT;
Ilya Dryomov0c93e1b2019-01-30 15:14:48 +01006513 pctx.opts->alloc_size = RBD_ALLOC_SIZE_DEFAULT;
Ilya Dryomovc3001562018-07-03 15:28:43 +02006514 pctx.opts->lock_timeout = RBD_LOCK_TIMEOUT_DEFAULT;
6515 pctx.opts->lock_on_read = RBD_LOCK_ON_READ_DEFAULT;
6516 pctx.opts->exclusive = RBD_EXCLUSIVE_DEFAULT;
6517 pctx.opts->trim = RBD_TRIM_DEFAULT;
Alex Elderd22f76e2012-07-12 10:46:35 -05006518
David Howells82995cc2019-03-25 16:38:32 +00006519 ret = ceph_parse_mon_ips(mon_addrs, mon_addrs_size, pctx.copts, NULL);
6520 if (ret)
Alex Elderdc79b112012-10-25 23:34:41 -05006521 goto out_err;
Alex Elder859c31d2012-10-25 23:34:42 -05006522
David Howells82995cc2019-03-25 16:38:32 +00006523 ret = rbd_parse_options(options, &pctx);
6524 if (ret)
6525 goto out_err;
6526
6527 *ceph_opts = pctx.copts;
Ilya Dryomovc3001562018-07-03 15:28:43 +02006528 *opts = pctx.opts;
6529 *rbd_spec = pctx.spec;
David Howells82995cc2019-03-25 16:38:32 +00006530 kfree(options);
Alex Elderdc79b112012-10-25 23:34:41 -05006531 return 0;
David Howells82995cc2019-03-25 16:38:32 +00006532
Alex Elderf28e5652012-10-25 23:34:41 -05006533out_mem:
Alex Elderdc79b112012-10-25 23:34:41 -05006534 ret = -ENOMEM;
Alex Elderd22f76e2012-07-12 10:46:35 -05006535out_err:
Ilya Dryomovc3001562018-07-03 15:28:43 +02006536 kfree(pctx.opts);
David Howells82995cc2019-03-25 16:38:32 +00006537 ceph_destroy_options(pctx.copts);
Ilya Dryomovc3001562018-07-03 15:28:43 +02006538 rbd_spec_put(pctx.spec);
Alex Elderf28e5652012-10-25 23:34:41 -05006539 kfree(options);
Alex Elderdc79b112012-10-25 23:34:41 -05006540 return ret;
Alex Eldera725f65e2012-02-02 08:13:30 -06006541}
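
/*
 * Editor's note (illustrative, not from the original source): a complete
 * buffer written to /sys/bus/rbd/add and parsed above might look like
 *
 *	"1.2.3.4:6789 name=admin rbd myimage -"
 *
 * i.e. monitor address(es), an options token, pool name, image name and
 * an optional snapshot name (the trailing "-" selects the image head).
 * Documentation/ABI/testing/sysfs-bus-rbd has the authoritative format.
 */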
6542
Ilya Dryomove010dd02017-04-13 12:17:39 +02006543static void rbd_dev_image_unlock(struct rbd_device *rbd_dev)
6544{
6545 down_write(&rbd_dev->lock_rwsem);
6546 if (__rbd_is_lock_owner(rbd_dev))
Ilya Dryomove1fddc82019-05-30 16:07:48 +02006547 __rbd_release_lock(rbd_dev);
Ilya Dryomove010dd02017-04-13 12:17:39 +02006548 up_write(&rbd_dev->lock_rwsem);
6549}
6550
Ilya Dryomov637cd062019-06-06 17:14:49 +02006551/*
6552 * If the wait is interrupted, an error is returned even if the lock
6553 * was successfully acquired. rbd_dev_image_unlock() will release it
6554 * if needed.
6555 */
Ilya Dryomove010dd02017-04-13 12:17:39 +02006556static int rbd_add_acquire_lock(struct rbd_device *rbd_dev)
6557{
Ilya Dryomov637cd062019-06-06 17:14:49 +02006558 long ret;
Ilya Dryomov2f18d462018-04-04 10:15:38 +02006559
Ilya Dryomove010dd02017-04-13 12:17:39 +02006560 if (!(rbd_dev->header.features & RBD_FEATURE_EXCLUSIVE_LOCK)) {
Ilya Dryomov637cd062019-06-06 17:14:49 +02006561 if (!rbd_dev->opts->exclusive && !rbd_dev->opts->lock_on_read)
6562 return 0;
6563
Ilya Dryomove010dd02017-04-13 12:17:39 +02006564 rbd_warn(rbd_dev, "exclusive-lock feature is not enabled");
6565 return -EINVAL;
6566 }
6567
Ilya Dryomov3fe69922019-11-12 19:41:48 +01006568 if (rbd_is_ro(rbd_dev))
Ilya Dryomov637cd062019-06-06 17:14:49 +02006569 return 0;
6570
6571 rbd_assert(!rbd_is_lock_owner(rbd_dev));
6572 queue_delayed_work(rbd_dev->task_wq, &rbd_dev->lock_dwork, 0);
6573 ret = wait_for_completion_killable_timeout(&rbd_dev->acquire_wait,
6574 ceph_timeout_jiffies(rbd_dev->opts->lock_timeout));
Dongsheng Yang25e6be22019-09-27 15:33:22 +00006575 if (ret > 0) {
Ilya Dryomov637cd062019-06-06 17:14:49 +02006576 ret = rbd_dev->acquire_err;
Dongsheng Yang25e6be22019-09-27 15:33:22 +00006577 } else {
6578 cancel_delayed_work_sync(&rbd_dev->lock_dwork);
6579 if (!ret)
6580 ret = -ETIMEDOUT;
6581 }
Ilya Dryomov637cd062019-06-06 17:14:49 +02006582
Ilya Dryomov2f18d462018-04-04 10:15:38 +02006583 if (ret) {
Ilya Dryomov637cd062019-06-06 17:14:49 +02006584 rbd_warn(rbd_dev, "failed to acquire exclusive lock: %ld", ret);
6585 return ret;
Ilya Dryomove010dd02017-04-13 12:17:39 +02006586 }
6587
Ilya Dryomov637cd062019-06-06 17:14:49 +02006588 /*
6589 * The lock may have been released by now, unless automatic lock
6590 * transitions are disabled.
6591 */
6592 rbd_assert(!rbd_dev->opts->exclusive || rbd_is_lock_owner(rbd_dev));
Ilya Dryomove010dd02017-04-13 12:17:39 +02006593 return 0;
6594}
6595
Ilya Dryomov30ba1f02014-05-13 11:19:27 +04006596/*
Alex Elder589d30e2012-07-10 20:30:11 -05006597 * An rbd format 2 image has a unique identifier, distinct from the
6598 * name given to it by the user. Internally, that identifier is
6599 * what's used to specify the names of objects related to the image.
6600 *
6601 * A special "rbd id" object is used to map an rbd image name to its
6602 * id. If that object doesn't exist, then there is no v2 rbd image
6603 * with the supplied name.
6604 *
6605 * This function will set the given rbd_dev's image_id field if
6606 * it can be determined, and in that case will return 0. If any
6607 * errors occur a negative errno will be returned and the rbd_dev's
6608 * image_id field will be unchanged (and should be NULL).
6609 */
6610static int rbd_dev_image_id(struct rbd_device *rbd_dev)
6611{
6612 int ret;
6613 size_t size;
Ilya Dryomovecd4a682017-01-25 18:16:21 +01006614 CEPH_DEFINE_OID_ONSTACK(oid);
Alex Elder589d30e2012-07-10 20:30:11 -05006615 void *response;
Alex Elderc0fba362013-04-25 23:15:08 -05006616 char *image_id;
Alex Elder2f82ee52012-10-30 19:40:33 -05006617
Alex Elder589d30e2012-07-10 20:30:11 -05006618 /*
Alex Elder2c0d0a12012-10-30 19:40:33 -05006619 * When probing a parent image, the image id is already
6620 * known (and the image name likely is not). There's no
Alex Elderc0fba362013-04-25 23:15:08 -05006621 * need to fetch the image id again in this case. We
6622 * do still need to set the image format though.
Alex Elder2c0d0a12012-10-30 19:40:33 -05006623 */
Alex Elderc0fba362013-04-25 23:15:08 -05006624 if (rbd_dev->spec->image_id) {
6625 rbd_dev->image_format = *rbd_dev->spec->image_id ? 2 : 1;
6626
Alex Elder2c0d0a12012-10-30 19:40:33 -05006627 return 0;
Alex Elderc0fba362013-04-25 23:15:08 -05006628 }
Alex Elder2c0d0a12012-10-30 19:40:33 -05006629
6630 /*
Alex Elder589d30e2012-07-10 20:30:11 -05006631 * First, see if the format 2 image id file exists, and if
6632 * so, get the image's persistent id from it.
6633 */
Ilya Dryomovecd4a682017-01-25 18:16:21 +01006634 ret = ceph_oid_aprintf(&oid, GFP_KERNEL, "%s%s", RBD_ID_PREFIX,
6635 rbd_dev->spec->image_name);
6636 if (ret)
6637 return ret;
6638
6639 dout("rbd id object name is %s\n", oid.name);
Alex Elder589d30e2012-07-10 20:30:11 -05006640
6641 /* Response will be an encoded string, which includes a length */
Alex Elder589d30e2012-07-10 20:30:11 -05006642 size = sizeof (__le32) + RBD_IMAGE_ID_LEN_MAX;
6643 response = kzalloc(size, GFP_NOIO);
6644 if (!response) {
6645 ret = -ENOMEM;
6646 goto out;
6647 }
6648
Alex Elderc0fba362013-04-25 23:15:08 -05006649 /* If it doesn't exist we'll assume it's a format 1 image */
6650
Ilya Dryomovecd4a682017-01-25 18:16:21 +01006651 ret = rbd_obj_method_sync(rbd_dev, &oid, &rbd_dev->header_oloc,
6652 "get_id", NULL, 0,
Dongsheng Yang5435d2062019-08-09 07:05:27 +00006653 response, size);
Alex Elder36be9a72013-01-19 00:30:28 -06006654 dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
Alex Elderc0fba362013-04-25 23:15:08 -05006655 if (ret == -ENOENT) {
6656 image_id = kstrdup("", GFP_KERNEL);
6657 ret = image_id ? 0 : -ENOMEM;
6658 if (!ret)
6659 rbd_dev->image_format = 1;
Ilya Dryomov7dd440c2014-09-11 18:49:18 +04006660 } else if (ret >= 0) {
Alex Elderc0fba362013-04-25 23:15:08 -05006661 void *p = response;
Alex Elder589d30e2012-07-10 20:30:11 -05006662
Alex Elderc0fba362013-04-25 23:15:08 -05006663 image_id = ceph_extract_encoded_string(&p, p + ret,
Alex Elder979ed482012-11-01 08:39:26 -05006664 NULL, GFP_NOIO);
Duan Jiong461f7582014-04-11 16:38:12 +08006665 ret = PTR_ERR_OR_ZERO(image_id);
Alex Elderc0fba362013-04-25 23:15:08 -05006666 if (!ret)
6667 rbd_dev->image_format = 2;
Alex Elderc0fba362013-04-25 23:15:08 -05006668 }
6669
6670 if (!ret) {
6671 rbd_dev->spec->image_id = image_id;
6672 dout("image_id is %s\n", image_id);
Alex Elder589d30e2012-07-10 20:30:11 -05006673 }
6674out:
6675 kfree(response);
Ilya Dryomovecd4a682017-01-25 18:16:21 +01006676 ceph_oid_destroy(&oid);
Alex Elder589d30e2012-07-10 20:30:11 -05006677 return ret;
6678}
6679
/*
 * Undo whatever state changes are made by the v1 or v2 header info
 * call.
 */
static void rbd_dev_unprobe(struct rbd_device *rbd_dev)
{
	struct rbd_image_header *header;

	rbd_dev_parent_put(rbd_dev);
	rbd_object_map_free(rbd_dev);
	rbd_dev_mapping_clear(rbd_dev);

	/* Free dynamic fields from the header, then zero it out */

	header = &rbd_dev->header;
	ceph_put_snap_context(header->snapc);
	kfree(header->snap_sizes);
	kfree(header->snap_names);
	kfree(header->object_prefix);
	memset(header, 0, sizeof (*header));
}

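/*
 * Read the format 2 header fields that are fixed once the image has
 * been created: object prefix, features, striping parameters and the
 * data pool.
 */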
static int rbd_dev_v2_header_onetime(struct rbd_device *rbd_dev)
{
	int ret;

	ret = rbd_dev_v2_object_prefix(rbd_dev);
	if (ret)
		goto out_err;

	/*
	 * Get and check the features for the image.  Currently the
	 * features are assumed to never change.
	 */
	ret = rbd_dev_v2_features(rbd_dev);
	if (ret)
		goto out_err;

	/* If the image supports fancy striping, get its parameters */

	if (rbd_dev->header.features & RBD_FEATURE_STRIPINGV2) {
		ret = rbd_dev_v2_striping_info(rbd_dev);
		if (ret < 0)
			goto out_err;
	}

	if (rbd_dev->header.features & RBD_FEATURE_DATA_POOL) {
		ret = rbd_dev_v2_data_pool(rbd_dev);
		if (ret)
			goto out_err;
	}

	rbd_init_layout(rbd_dev);
	return 0;

out_err:
	rbd_dev->header.features = 0;
	kfree(rbd_dev->header.object_prefix);
	rbd_dev->header.object_prefix = NULL;
	return ret;
}

/*
 * @depth is rbd_dev_image_probe() -> rbd_dev_probe_parent() ->
 * rbd_dev_image_probe() recursion depth, which means it's also the
 * length of the already discovered part of the parent chain.
 */
static int rbd_dev_probe_parent(struct rbd_device *rbd_dev, int depth)
{
	struct rbd_device *parent = NULL;
	int ret;

	if (!rbd_dev->parent_spec)
		return 0;

	if (++depth > RBD_MAX_PARENT_CHAIN_LEN) {
		pr_info("parent chain is too long (%d)\n", depth);
		ret = -EINVAL;
		goto out_err;
	}

	parent = __rbd_dev_create(rbd_dev->rbd_client, rbd_dev->parent_spec);
	if (!parent) {
		ret = -ENOMEM;
		goto out_err;
	}

	/*
	 * Images related by parent/child relationships always share
	 * rbd_client and spec/parent_spec, so bump their refcounts.
	 */
	__rbd_get_client(rbd_dev->rbd_client);
	rbd_spec_get(rbd_dev->parent_spec);

	__set_bit(RBD_DEV_FLAG_READONLY, &parent->flags);

	ret = rbd_dev_image_probe(parent, depth);
	if (ret < 0)
		goto out_err;

	rbd_dev->parent = parent;
	atomic_set(&rbd_dev->parent_ref, 1);
	return 0;

out_err:
	rbd_dev_unparent(rbd_dev);
	rbd_dev_destroy(parent);
	return ret;
}

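/*
 * Undo rbd_dev_device_setup(): free the disk and, unless single_major
 * is in use, give back the dynamically allocated major number.
 */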
static void rbd_dev_device_release(struct rbd_device *rbd_dev)
{
	clear_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags);
	rbd_free_disk(rbd_dev);
	if (!single_major)
		unregister_blkdev(rbd_dev->major, rbd_dev->name);
}

/*
 * rbd_dev->header_rwsem must be locked for write and will be unlocked
 * upon return.
 */
static int rbd_dev_device_setup(struct rbd_device *rbd_dev)
{
	int ret;

	/* Record our major and minor device numbers. */

	if (!single_major) {
		ret = register_blkdev(0, rbd_dev->name);
		if (ret < 0)
			goto err_out_unlock;

		rbd_dev->major = ret;
		rbd_dev->minor = 0;
	} else {
		rbd_dev->major = rbd_major;
		rbd_dev->minor = rbd_dev_id_to_minor(rbd_dev->dev_id);
	}

	/* Set up the blkdev mapping. */

	ret = rbd_init_disk(rbd_dev);
	if (ret)
		goto err_out_blkdev;

	set_capacity(rbd_dev->disk, rbd_dev->mapping.size / SECTOR_SIZE);
	set_disk_ro(rbd_dev->disk, rbd_is_ro(rbd_dev));

	ret = dev_set_name(&rbd_dev->dev, "%d", rbd_dev->dev_id);
	if (ret)
		goto err_out_disk;

	set_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags);
	up_write(&rbd_dev->header_rwsem);
	return 0;

err_out_disk:
	rbd_free_disk(rbd_dev);
err_out_blkdev:
	if (!single_major)
		unregister_blkdev(rbd_dev->major, rbd_dev->name);
err_out_unlock:
	up_write(&rbd_dev->header_rwsem);
	return ret;
}

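/*
 * The header object name depends on the image format: v1 appends
 * RBD_SUFFIX to the user-supplied image name, v2 prepends
 * RBD_HEADER_PREFIX to the image id.
 */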
static int rbd_dev_header_name(struct rbd_device *rbd_dev)
{
	struct rbd_spec *spec = rbd_dev->spec;
	int ret;

	/* Record the header object name for this rbd image. */

	rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
	if (rbd_dev->image_format == 1)
		ret = ceph_oid_aprintf(&rbd_dev->header_oid, GFP_KERNEL, "%s%s",
				       spec->image_name, RBD_SUFFIX);
	else
		ret = ceph_oid_aprintf(&rbd_dev->header_oid, GFP_KERNEL, "%s%s",
				       RBD_HEADER_PREFIX, spec->image_id);

	return ret;
}

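/* Report that the image or snapshot being mapped does not exist. */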
static void rbd_print_dne(struct rbd_device *rbd_dev, bool is_snap)
{
	if (!is_snap) {
		pr_info("image %s/%s%s%s does not exist\n",
			rbd_dev->spec->pool_name,
			rbd_dev->spec->pool_ns ?: "",
			rbd_dev->spec->pool_ns ? "/" : "",
			rbd_dev->spec->image_name);
	} else {
		pr_info("snap %s/%s%s%s@%s does not exist\n",
			rbd_dev->spec->pool_name,
			rbd_dev->spec->pool_ns ?: "",
			rbd_dev->spec->pool_ns ? "/" : "",
			rbd_dev->spec->image_name,
			rbd_dev->spec->snap_name);
	}
}

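/*
 * Undo rbd_dev_image_probe(): drop the header watch (if one was set
 * up) and discard the probed image state.
 */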
static void rbd_dev_image_release(struct rbd_device *rbd_dev)
{
	if (!rbd_is_ro(rbd_dev))
		rbd_unregister_watch(rbd_dev);

	rbd_dev_unprobe(rbd_dev);
	rbd_dev->image_format = 0;
	kfree(rbd_dev->spec->image_id);
	rbd_dev->spec->image_id = NULL;
}

/*
 * Probe for the existence of the header object for the given rbd
 * device.  If this image is the one being mapped (i.e., not a
 * parent), initiate a watch on its header object before using that
 * object to get detailed information about the rbd image.
 *
 * On success, returns with header_rwsem held for write if called
 * with @depth == 0.
 */
static int rbd_dev_image_probe(struct rbd_device *rbd_dev, int depth)
{
	bool need_watch = !rbd_is_ro(rbd_dev);
	int ret;

	/*
	 * Get the id from the image id object.  Unless there's an
	 * error, rbd_dev->spec->image_id will be filled in with
	 * a dynamically-allocated string, and rbd_dev->image_format
	 * will be set to either 1 or 2.
	 */
	ret = rbd_dev_image_id(rbd_dev);
	if (ret)
		return ret;

	ret = rbd_dev_header_name(rbd_dev);
	if (ret)
		goto err_out_format;

	if (need_watch) {
		ret = rbd_register_watch(rbd_dev);
		if (ret) {
			if (ret == -ENOENT)
				rbd_print_dne(rbd_dev, false);
			goto err_out_format;
		}
	}

	if (!depth)
		down_write(&rbd_dev->header_rwsem);

	ret = rbd_dev_header_info(rbd_dev);
	if (ret) {
		if (ret == -ENOENT && !need_watch)
			rbd_print_dne(rbd_dev, false);
		goto err_out_probe;
	}

	/*
	 * If this image is the one being mapped, we have pool name and
	 * id, image name and id, and snap name - need to fill snap id.
	 * Otherwise this is a parent image, identified by pool, image
	 * and snap ids - need to fill in names for those ids.
	 */
	if (!depth)
		ret = rbd_spec_fill_snap_id(rbd_dev);
	else
		ret = rbd_spec_fill_names(rbd_dev);
	if (ret) {
		if (ret == -ENOENT)
			rbd_print_dne(rbd_dev, true);
		goto err_out_probe;
	}

	ret = rbd_dev_mapping_set(rbd_dev);
	if (ret)
		goto err_out_probe;

	if (rbd_is_snap(rbd_dev) &&
	    (rbd_dev->header.features & RBD_FEATURE_OBJECT_MAP)) {
		ret = rbd_object_map_load(rbd_dev);
		if (ret)
			goto err_out_probe;
	}

	if (rbd_dev->header.features & RBD_FEATURE_LAYERING) {
		ret = rbd_dev_v2_parent_info(rbd_dev);
		if (ret)
			goto err_out_probe;
	}

	ret = rbd_dev_probe_parent(rbd_dev, depth);
	if (ret)
		goto err_out_probe;

	dout("discovered format %u image, header name is %s\n",
		rbd_dev->image_format, rbd_dev->header_oid.name);
	return 0;

err_out_probe:
	if (!depth)
		up_write(&rbd_dev->header_rwsem);
	if (need_watch)
		rbd_unregister_watch(rbd_dev);
	rbd_dev_unprobe(rbd_dev);
err_out_format:
	rbd_dev->image_format = 0;
	kfree(rbd_dev->spec->image_id);
	rbd_dev->spec->image_id = NULL;
	return ret;
}

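/*
 * Core of the sysfs "add" interface: parse the mapping spec written by
 * userspace, connect to the cluster, probe the image and announce the
 * resulting block device.
 */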
static ssize_t do_rbd_add(struct bus_type *bus,
			  const char *buf,
			  size_t count)
{
	struct rbd_device *rbd_dev = NULL;
	struct ceph_options *ceph_opts = NULL;
	struct rbd_options *rbd_opts = NULL;
	struct rbd_spec *spec = NULL;
	struct rbd_client *rbdc;
	int rc;

	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;

	if (!try_module_get(THIS_MODULE))
		return -ENODEV;

	/* parse add command */
	rc = rbd_add_parse_args(buf, &ceph_opts, &rbd_opts, &spec);
	if (rc < 0)
		goto out;

	rbdc = rbd_get_client(ceph_opts);
	if (IS_ERR(rbdc)) {
		rc = PTR_ERR(rbdc);
		goto err_out_args;
	}

	/* pick the pool */
	rc = ceph_pg_poolid_by_name(rbdc->client->osdc.osdmap, spec->pool_name);
	if (rc < 0) {
		if (rc == -ENOENT)
			pr_info("pool %s does not exist\n", spec->pool_name);
		goto err_out_client;
	}
	spec->pool_id = (u64)rc;

	rbd_dev = rbd_dev_create(rbdc, spec, rbd_opts);
	if (!rbd_dev) {
		rc = -ENOMEM;
		goto err_out_client;
	}
	rbdc = NULL;		/* rbd_dev now owns this */
	spec = NULL;		/* rbd_dev now owns this */
	rbd_opts = NULL;	/* rbd_dev now owns this */

	/* if we are mapping a snapshot it will be a read-only mapping */
	if (rbd_dev->opts->read_only ||
	    strcmp(rbd_dev->spec->snap_name, RBD_SNAP_HEAD_NAME))
		__set_bit(RBD_DEV_FLAG_READONLY, &rbd_dev->flags);

	rbd_dev->config_info = kstrdup(buf, GFP_KERNEL);
	if (!rbd_dev->config_info) {
		rc = -ENOMEM;
		goto err_out_rbd_dev;
	}

	rc = rbd_dev_image_probe(rbd_dev, 0);
	if (rc < 0)
		goto err_out_rbd_dev;

	if (rbd_dev->opts->alloc_size > rbd_dev->layout.object_size) {
		rbd_warn(rbd_dev, "alloc_size adjusted to %u",
			 rbd_dev->layout.object_size);
		rbd_dev->opts->alloc_size = rbd_dev->layout.object_size;
	}

	rc = rbd_dev_device_setup(rbd_dev);
	if (rc)
		goto err_out_image_probe;

	rc = rbd_add_acquire_lock(rbd_dev);
	if (rc)
		goto err_out_image_lock;

	/* Everything's ready.  Announce the disk to the world. */

	rc = device_add(&rbd_dev->dev);
	if (rc)
		goto err_out_image_lock;

	device_add_disk(&rbd_dev->dev, rbd_dev->disk, NULL);

	spin_lock(&rbd_dev_list_lock);
	list_add_tail(&rbd_dev->node, &rbd_dev_list);
	spin_unlock(&rbd_dev_list_lock);

	pr_info("%s: capacity %llu features 0x%llx\n", rbd_dev->disk->disk_name,
		(unsigned long long)get_capacity(rbd_dev->disk) << SECTOR_SHIFT,
		rbd_dev->header.features);
	rc = count;
out:
	module_put(THIS_MODULE);
	return rc;

err_out_image_lock:
	rbd_dev_image_unlock(rbd_dev);
	rbd_dev_device_release(rbd_dev);
err_out_image_probe:
	rbd_dev_image_release(rbd_dev);
err_out_rbd_dev:
	rbd_dev_destroy(rbd_dev);
err_out_client:
	rbd_put_client(rbdc);
err_out_args:
	rbd_spec_put(spec);
	kfree(rbd_opts);
	goto out;
}

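/*
 * Writes to /sys/bus/rbd/add are rejected in single_major mode, where
 * the add_single_major attribute must be used instead.
 */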
static ssize_t add_store(struct bus_type *bus, const char *buf, size_t count)
{
	if (single_major)
		return -EINVAL;

	return do_rbd_add(bus, buf, count);
}

static ssize_t add_single_major_store(struct bus_type *bus, const char *buf,
				      size_t count)
{
	return do_rbd_add(bus, buf, count);
}

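/* Tear down a layered image's parent chain, deepest ancestor first. */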
static void rbd_dev_remove_parent(struct rbd_device *rbd_dev)
{
	while (rbd_dev->parent) {
		struct rbd_device *first = rbd_dev;
		struct rbd_device *second = first->parent;
		struct rbd_device *third;

		/*
		 * Follow to the parent with no grandparent and
		 * remove it.
		 */
		while (second && (third = second->parent)) {
			first = second;
			second = third;
		}
		rbd_assert(second);
		rbd_dev_image_release(second);
		rbd_dev_destroy(second);
		first->parent = NULL;
		first->parent_overlap = 0;

		rbd_assert(first->parent_spec);
		rbd_spec_put(first->parent_spec);
		first->parent_spec = NULL;
	}
}

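/*
 * Input is "<dev-id> [force]".  Without "force", removing a mapping
 * that is still open fails with -EBUSY; with it, new I/O is blocked
 * and existing I/O is allowed to complete or fail before teardown.
 */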
static ssize_t do_rbd_remove(struct bus_type *bus,
			     const char *buf,
			     size_t count)
{
	struct rbd_device *rbd_dev = NULL;
	struct list_head *tmp;
	int dev_id;
	char opt_buf[6];
	bool force = false;
	int ret;

	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;

	dev_id = -1;
	opt_buf[0] = '\0';
	sscanf(buf, "%d %5s", &dev_id, opt_buf);
	if (dev_id < 0) {
		pr_err("dev_id out of range\n");
		return -EINVAL;
	}
	if (opt_buf[0] != '\0') {
		if (!strcmp(opt_buf, "force")) {
			force = true;
		} else {
			pr_err("bad remove option at '%s'\n", opt_buf);
			return -EINVAL;
		}
	}

	ret = -ENOENT;
	spin_lock(&rbd_dev_list_lock);
	list_for_each(tmp, &rbd_dev_list) {
		rbd_dev = list_entry(tmp, struct rbd_device, node);
		if (rbd_dev->dev_id == dev_id) {
			ret = 0;
			break;
		}
	}
	if (!ret) {
		spin_lock_irq(&rbd_dev->lock);
		if (rbd_dev->open_count && !force)
			ret = -EBUSY;
		else if (test_and_set_bit(RBD_DEV_FLAG_REMOVING,
					  &rbd_dev->flags))
			ret = -EINPROGRESS;
		spin_unlock_irq(&rbd_dev->lock);
	}
	spin_unlock(&rbd_dev_list_lock);
	if (ret)
		return ret;

	if (force) {
		/*
		 * Prevent new IO from being queued and wait for existing
		 * IO to complete/fail.
		 */
		blk_mq_freeze_queue(rbd_dev->disk->queue);
		blk_set_queue_dying(rbd_dev->disk->queue);
	}

	del_gendisk(rbd_dev->disk);
	spin_lock(&rbd_dev_list_lock);
	list_del_init(&rbd_dev->node);
	spin_unlock(&rbd_dev_list_lock);
	device_del(&rbd_dev->dev);

	rbd_dev_image_unlock(rbd_dev);
	rbd_dev_device_release(rbd_dev);
	rbd_dev_image_release(rbd_dev);
	rbd_dev_destroy(rbd_dev);
	return count;
}

static ssize_t remove_store(struct bus_type *bus, const char *buf, size_t count)
{
	if (single_major)
		return -EINVAL;

	return do_rbd_remove(bus, buf, count);
}

static ssize_t remove_single_major_store(struct bus_type *bus, const char *buf,
					 size_t count)
{
	return do_rbd_remove(bus, buf, count);
}

/*
 * create control files in sysfs
 * /sys/bus/rbd/...
 */
static int __init rbd_sysfs_init(void)
{
	int ret;

	ret = device_register(&rbd_root_dev);
	if (ret < 0)
		return ret;

	ret = bus_register(&rbd_bus_type);
	if (ret < 0)
		device_unregister(&rbd_root_dev);

	return ret;
}

static void __exit rbd_sysfs_cleanup(void)
{
	bus_unregister(&rbd_bus_type);
	device_unregister(&rbd_root_dev);
}

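/* Create the slab caches backing image and object requests. */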
static int __init rbd_slab_init(void)
{
	rbd_assert(!rbd_img_request_cache);
	rbd_img_request_cache = KMEM_CACHE(rbd_img_request, 0);
	if (!rbd_img_request_cache)
		return -ENOMEM;

	rbd_assert(!rbd_obj_request_cache);
	rbd_obj_request_cache = KMEM_CACHE(rbd_obj_request, 0);
	if (!rbd_obj_request_cache)
		goto out_err;

	return 0;

out_err:
	kmem_cache_destroy(rbd_img_request_cache);
	rbd_img_request_cache = NULL;
	return -ENOMEM;
}

static void rbd_slab_exit(void)
{
	rbd_assert(rbd_obj_request_cache);
	kmem_cache_destroy(rbd_obj_request_cache);
	rbd_obj_request_cache = NULL;

	rbd_assert(rbd_img_request_cache);
	kmem_cache_destroy(rbd_img_request_cache);
	rbd_img_request_cache = NULL;
}

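/*
 * Module init: set up the slab caches, the I/O workqueue, an optional
 * single major number and the sysfs interface.
 */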
static int __init rbd_init(void)
{
	int rc;

	if (!libceph_compatible(NULL)) {
		rbd_warn(NULL, "libceph incompatibility (quitting)");
		return -EINVAL;
	}

	rc = rbd_slab_init();
	if (rc)
		return rc;

	/*
	 * The number of active work items is limited by the number of
	 * rbd devices * queue depth, so leave @max_active at default.
	 */
	rbd_wq = alloc_workqueue(RBD_DRV_NAME, WQ_MEM_RECLAIM, 0);
	if (!rbd_wq) {
		rc = -ENOMEM;
		goto err_out_slab;
	}

	if (single_major) {
		rbd_major = register_blkdev(0, RBD_DRV_NAME);
		if (rbd_major < 0) {
			rc = rbd_major;
			goto err_out_wq;
		}
	}

	rc = rbd_sysfs_init();
	if (rc)
		goto err_out_blkdev;

	if (single_major)
		pr_info("loaded (major %d)\n", rbd_major);
	else
		pr_info("loaded\n");

	return 0;

err_out_blkdev:
	if (single_major)
		unregister_blkdev(rbd_major, RBD_DRV_NAME);
err_out_wq:
	destroy_workqueue(rbd_wq);
err_out_slab:
	rbd_slab_exit();
	return rc;
}

static void __exit rbd_exit(void)
{
	ida_destroy(&rbd_dev_id_ida);
	rbd_sysfs_cleanup();
	if (single_major)
		unregister_blkdev(rbd_major, RBD_DRV_NAME);
	destroy_workqueue(rbd_wq);
	rbd_slab_exit();
}

module_init(rbd_init);
module_exit(rbd_exit);

MODULE_AUTHOR("Alex Elder <elder@inktank.com>");
MODULE_AUTHOR("Sage Weil <sage@newdream.net>");
MODULE_AUTHOR("Yehuda Sadeh <yehuda@hq.newdream.net>");
/* following authorship retained from original osdblk.c */
MODULE_AUTHOR("Jeff Garzik <jeff@garzik.org>");

MODULE_DESCRIPTION("RADOS Block Device (RBD) driver");
MODULE_LICENSE("GPL");