/*
   rbd.c -- Export ceph rados objects as a Linux block device


   based on drivers/block/osdblk.c:

   Copyright 2009 Red Hat, Inc.

   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; see the file COPYING.  If not, write to
   the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.



   For usage instructions, please refer to:

                 Documentation/ABI/testing/sysfs-bus-rbd

 */

#include <linux/ceph/libceph.h>
#include <linux/ceph/osd_client.h>
#include <linux/ceph/mon_client.h>
#include <linux/ceph/cls_lock_client.h>
#include <linux/ceph/striper.h>
#include <linux/ceph/decode.h>
#include <linux/parser.h>
#include <linux/bsearch.h>

#include <linux/kernel.h>
#include <linux/device.h>
#include <linux/module.h>
#include <linux/blk-mq.h>
#include <linux/fs.h>
#include <linux/blkdev.h>
#include <linux/slab.h>
#include <linux/idr.h>
#include <linux/workqueue.h>

#include "rbd_types.h"

#define RBD_DEBUG	/* Activate rbd_assert() calls */

/*
 * Increment the given counter and return its updated value.
 * If the counter is already 0 it will not be incremented.
 * If the counter is already at its maximum value returns
 * -EINVAL without updating it.
 */
static int atomic_inc_return_safe(atomic_t *v)
{
        unsigned int counter;

        counter = (unsigned int)atomic_fetch_add_unless(v, 1, 0);
        if (counter <= (unsigned int)INT_MAX)
                return (int)counter;

        atomic_dec(v);

        return -EINVAL;
}

/* Decrement the counter.  Return the resulting value, or -EINVAL */
static int atomic_dec_return_safe(atomic_t *v)
{
        int counter;

        counter = atomic_dec_return(v);
        if (counter >= 0)
                return counter;

        atomic_inc(v);

        return -EINVAL;
}

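/*
 * Illustrative sketch (not driver code): together these helpers form a
 * "sticky zero" counter; once it drops to 0 it can never be raised
 * again.  The intended pairing, as used for rbd_dev->parent_ref, is
 * roughly
 *
 *	if (atomic_inc_return_safe(&rbd_dev->parent_ref) > 0) {
 *		...	use the pinned parent	...
 *		atomic_dec_return_safe(&rbd_dev->parent_ref);
 *	}
 *
 * where a return of 0 means the count was already 0 (nothing was
 * pinned) and -EINVAL means the counter would have left [0, INT_MAX].
 */
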
#define RBD_DRV_NAME "rbd"

#define RBD_MINORS_PER_MAJOR		256
#define RBD_SINGLE_MAJOR_PART_SHIFT	4

#define RBD_MAX_PARENT_CHAIN_LEN	16

#define RBD_SNAP_DEV_NAME_PREFIX	"snap_"
#define RBD_MAX_SNAP_NAME_LEN	\
			(NAME_MAX - (sizeof (RBD_SNAP_DEV_NAME_PREFIX) - 1))

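/*
 * Worked example (illustrative): sizeof(RBD_SNAP_DEV_NAME_PREFIX) is 6
 * (five characters plus the NUL), so with NAME_MAX at 255 a snapshot
 * name may be at most 255 - 5 = 250 bytes and the "snap_"-prefixed
 * sysfs name always fits.
 */
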
#define RBD_MAX_SNAP_COUNT	510	/* allows max snapc to fit in 4KB */

#define RBD_SNAP_HEAD_NAME	"-"

#define BAD_SNAP_INDEX	U32_MAX	/* invalid index into snap array */

/* This allows a single page to hold an image name sent by OSD */
#define RBD_IMAGE_NAME_LEN_MAX	(PAGE_SIZE - sizeof (__le32) - 1)
#define RBD_IMAGE_ID_LEN_MAX	64

#define RBD_OBJ_PREFIX_LEN_MAX	64

#define RBD_NOTIFY_TIMEOUT	5	/* seconds */
#define RBD_RETRY_DELAY		msecs_to_jiffies(1000)

/* Feature bits */

#define RBD_FEATURE_LAYERING		(1ULL<<0)
#define RBD_FEATURE_STRIPINGV2		(1ULL<<1)
#define RBD_FEATURE_EXCLUSIVE_LOCK	(1ULL<<2)
#define RBD_FEATURE_DEEP_FLATTEN	(1ULL<<5)
#define RBD_FEATURE_DATA_POOL		(1ULL<<7)
#define RBD_FEATURE_OPERATIONS		(1ULL<<8)

#define RBD_FEATURES_ALL	(RBD_FEATURE_LAYERING |		\
				 RBD_FEATURE_STRIPINGV2 |	\
				 RBD_FEATURE_EXCLUSIVE_LOCK |	\
				 RBD_FEATURE_DEEP_FLATTEN |	\
				 RBD_FEATURE_DATA_POOL |	\
				 RBD_FEATURE_OPERATIONS)

/* Features supported by this (client software) implementation. */

#define RBD_FEATURES_SUPPORTED	(RBD_FEATURES_ALL)

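/*
 * Illustrative sketch (not driver code): during image probe the
 * reported feature mask is tested against RBD_FEATURES_SUPPORTED and
 * the map attempt is refused if the image needs anything this client
 * lacks, along the lines of
 *
 *	u64 unsup = features & ~RBD_FEATURES_SUPPORTED;
 *	if (unsup)
 *		return -ENXIO;
 */
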
/*
 * An RBD device name will be "rbd#", where the "rbd" comes from
 * RBD_DRV_NAME above, and # is a unique integer identifier.
 */
#define DEV_NAME_LEN		32

/*
 * block device image metadata (in-memory version)
 */
struct rbd_image_header {
        /* These six fields never change for a given rbd image */
        char *object_prefix;
        __u8 obj_order;
        u64 stripe_unit;
        u64 stripe_count;
        s64 data_pool_id;
        u64 features;		/* Might be changeable someday? */

        /* The remaining fields need to be updated occasionally */
        u64 image_size;
        struct ceph_snap_context *snapc;
        char *snap_names;	/* format 1 only */
        u64 *snap_sizes;	/* format 1 only */
};

/*
 * An rbd image specification.
 *
 * The tuple (pool_id, image_id, snap_id) is sufficient to uniquely
 * identify an image.  Each rbd_dev structure includes a pointer to
 * an rbd_spec structure that encapsulates this identity.
 *
 * Each of the id's in an rbd_spec has an associated name.  For a
 * user-mapped image, the names are supplied and the id's associated
 * with them are looked up.  For a layered image, a parent image is
 * defined by the tuple, and the names are looked up.
 *
 * An rbd_dev structure contains a parent_spec pointer which is
 * non-null if the image it represents is a child in a layered
 * image.  This pointer will refer to the rbd_spec structure used
 * by the parent rbd_dev for its own identity (i.e., the structure
 * is shared between the parent and child).
 *
 * Since these structures are populated once, during the discovery
 * phase of image construction, they are effectively immutable so
 * we make no effort to synchronize access to them.
 *
 * Note that code herein does not assume the image name is known (it
 * could be a null pointer).
 */
struct rbd_spec {
        u64             pool_id;
        const char      *pool_name;
        const char      *pool_ns;	/* NULL if default, never "" */

        const char      *image_id;
        const char      *image_name;

        u64             snap_id;
        const char      *snap_name;

        struct kref     kref;
};

/*
 * an instance of the client.  multiple devices may share an rbd client.
 */
struct rbd_client {
        struct ceph_client      *client;
        struct kref             kref;
        struct list_head        node;
};

struct pending_result {
        int                     result;		/* first nonzero result */
        int                     num_pending;
};

struct rbd_img_request;

enum obj_request_type {
        OBJ_REQUEST_NODATA = 1,
        OBJ_REQUEST_BIO,	/* pointer into provided bio (list) */
        OBJ_REQUEST_BVECS,	/* pointer into provided bio_vec array */
        OBJ_REQUEST_OWN_BVECS,	/* private bio_vec array, doesn't own pages */
};

enum obj_operation_type {
        OBJ_OP_READ = 1,
        OBJ_OP_WRITE,
        OBJ_OP_DISCARD,
        OBJ_OP_ZEROOUT,
};

#define RBD_OBJ_FLAG_DELETION			(1U << 0)
#define RBD_OBJ_FLAG_COPYUP_ENABLED		(1U << 1)

enum rbd_obj_read_state {
        RBD_OBJ_READ_START = 1,
        RBD_OBJ_READ_OBJECT,
        RBD_OBJ_READ_PARENT,
};

/*
 * Writes go through the following state machine to deal with
 * layering:
 *
 * . . . . . RBD_OBJ_WRITE_GUARD . . . . . . . . . . . . . .
 * .                 |                                     .
 * .                 v                                     .
 * .    RBD_OBJ_WRITE_READ_FROM_PARENT . . .               .
 * .                 |                     .               .
 * .                 v                     v (deep-copyup  .
 * . (image   RBD_OBJ_WRITE_COPYUP_EMPTY_SNAPC not needed) .
 * . flattened)      v                     |               .
 * .                 v                     .               .
 * . . . . RBD_OBJ_WRITE_COPYUP_OPS . . . .     (copyup    .
 *                   |                       not needed)   v
 *                   v                                     .
 *                 done . . . . . . . . . . . . . . . . . .
 *                   ^
 *                   |
 *         RBD_OBJ_WRITE_FLAT
 *
 * Writes start in RBD_OBJ_WRITE_GUARD or _FLAT, depending on whether
 * assert_exists guard is needed or not (in some cases it's not needed
 * even if there is a parent).
 */
enum rbd_obj_write_state {
        RBD_OBJ_WRITE_START = 1,
        RBD_OBJ_WRITE_OBJECT,
        RBD_OBJ_WRITE_READ_FROM_PARENT,
        RBD_OBJ_WRITE_COPYUP_EMPTY_SNAPC,
        RBD_OBJ_WRITE_COPYUP_OPS,
};

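/*
 * Worked example (illustrative, not driver code): a write to an object
 * that does not yet exist in a child image typically walks
 *
 *	RBD_OBJ_WRITE_START -> RBD_OBJ_WRITE_OBJECT (guarded write
 *	fails with -ENOENT) -> RBD_OBJ_WRITE_READ_FROM_PARENT ->
 *	RBD_OBJ_WRITE_COPYUP_OPS -> done
 *
 * while a write with no parent overlap completes straight from
 * RBD_OBJ_WRITE_OBJECT.  Note that the diagram above predates this
 * enum and still names the older GUARD/FLAT entry states.
 */
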
struct rbd_obj_request {
        struct ceph_object_extent ex;
        unsigned int            flags;	/* RBD_OBJ_FLAG_* */
        union {
                enum rbd_obj_read_state  read_state;	/* for reads */
                enum rbd_obj_write_state write_state;	/* for writes */
        };

        struct rbd_img_request  *img_request;
        struct ceph_file_extent *img_extents;
        u32                     num_img_extents;

        union {
                struct ceph_bio_iter    bio_pos;
                struct {
                        struct ceph_bvec_iter   bvec_pos;
                        u32                     bvec_count;
                        u32                     bvec_idx;
                };
        };
        struct bio_vec          *copyup_bvecs;
        u32                     copyup_bvec_count;

        struct list_head        osd_reqs;	/* w/ r_private_item */

        struct mutex            state_mutex;
        struct kref             kref;
};

enum img_req_flags {
        IMG_REQ_CHILD,		/* initiator: block = 0, child image = 1 */
        IMG_REQ_LAYERED,	/* ENOENT handling: normal = 0, layered = 1 */
};

enum rbd_img_state {
        RBD_IMG_START = 1,
        __RBD_IMG_OBJECT_REQUESTS,
        RBD_IMG_OBJECT_REQUESTS,
};

struct rbd_img_request {
        struct rbd_device       *rbd_dev;
        enum obj_operation_type op_type;
        enum obj_request_type   data_type;
        unsigned long           flags;
        enum rbd_img_state      state;
        union {
                u64                     snap_id;	/* for reads */
                struct ceph_snap_context *snapc;	/* for writes */
        };
        union {
                struct request          *rq;		/* block request */
                struct rbd_obj_request  *obj_request;	/* obj req initiator */
        };

        struct list_head        object_extents;	/* obj_req.ex structs */

        struct mutex            state_mutex;
        struct pending_result   pending;
        struct work_struct      work;
        int                     work_result;
        struct kref             kref;
};

#define for_each_obj_request(ireq, oreq) \
        list_for_each_entry(oreq, &(ireq)->object_extents, ex.oe_item)
#define for_each_obj_request_safe(ireq, oreq, n) \
        list_for_each_entry_safe(oreq, n, &(ireq)->object_extents, ex.oe_item)

enum rbd_watch_state {
        RBD_WATCH_STATE_UNREGISTERED,
        RBD_WATCH_STATE_REGISTERED,
        RBD_WATCH_STATE_ERROR,
};

enum rbd_lock_state {
        RBD_LOCK_STATE_UNLOCKED,
        RBD_LOCK_STATE_LOCKED,
        RBD_LOCK_STATE_RELEASING,
};

/* WatchNotify::ClientId */
struct rbd_client_id {
        u64 gid;
        u64 handle;
};

struct rbd_mapping {
        u64                     size;
        u64                     features;
};

/*
 * a single device
 */
struct rbd_device {
        int                     dev_id;		/* blkdev unique id */

        int                     major;		/* blkdev assigned major */
        int                     minor;
        struct gendisk          *disk;		/* blkdev's gendisk and rq */

        u32                     image_format;	/* Either 1 or 2 */
        struct rbd_client       *rbd_client;

        char                    name[DEV_NAME_LEN]; /* blkdev name, e.g. rbd3 */

        spinlock_t              lock;		/* queue, flags, open_count */

        struct rbd_image_header header;
        unsigned long           flags;		/* possibly lock protected */
        struct rbd_spec         *spec;
        struct rbd_options      *opts;
        char                    *config_info;	/* add{,_single_major} string */

        struct ceph_object_id   header_oid;
        struct ceph_object_locator header_oloc;

        struct ceph_file_layout layout;		/* used for all rbd requests */

        struct mutex            watch_mutex;
        enum rbd_watch_state    watch_state;
        struct ceph_osd_linger_request *watch_handle;
        u64                     watch_cookie;
        struct delayed_work     watch_dwork;

        struct rw_semaphore     lock_rwsem;
        enum rbd_lock_state     lock_state;
        char                    lock_cookie[32];
        struct rbd_client_id    owner_cid;
        struct work_struct      acquired_lock_work;
        struct work_struct      released_lock_work;
        struct delayed_work     lock_dwork;
        struct work_struct      unlock_work;
        wait_queue_head_t       lock_waitq;

        struct workqueue_struct *task_wq;

        struct rbd_spec         *parent_spec;
        u64                     parent_overlap;
        atomic_t                parent_ref;
        struct rbd_device       *parent;

        /* Block layer tags. */
        struct blk_mq_tag_set   tag_set;

        /* protects updating the header */
        struct rw_semaphore     header_rwsem;

        struct rbd_mapping      mapping;

        struct list_head        node;

        /* sysfs related */
        struct device           dev;
        unsigned long           open_count;	/* protected by lock */
};

/*
 * Flag bits for rbd_dev->flags:
 * - REMOVING (which is coupled with rbd_dev->open_count) is protected
 *   by rbd_dev->lock
 * - BLACKLISTED is protected by rbd_dev->lock_rwsem
 */
enum rbd_dev_flags {
        RBD_DEV_FLAG_EXISTS,	/* mapped snapshot has not been deleted */
        RBD_DEV_FLAG_REMOVING,	/* this mapping is being removed */
        RBD_DEV_FLAG_BLACKLISTED, /* our ceph_client is blacklisted */
};

static DEFINE_MUTEX(client_mutex);	/* Serialize client creation */

static LIST_HEAD(rbd_dev_list);		/* devices */
static DEFINE_SPINLOCK(rbd_dev_list_lock);

static LIST_HEAD(rbd_client_list);	/* clients */
static DEFINE_SPINLOCK(rbd_client_list_lock);

/* Slab caches for frequently-allocated structures */

static struct kmem_cache        *rbd_img_request_cache;
static struct kmem_cache        *rbd_obj_request_cache;

static int rbd_major;
static DEFINE_IDA(rbd_dev_id_ida);

static struct workqueue_struct *rbd_wq;

static struct ceph_snap_context rbd_empty_snapc = {
        .nref = REFCOUNT_INIT(1),
};

/*
 * single-major requires >= 0.75 version of userspace rbd utility.
 */
static bool single_major = true;
module_param(single_major, bool, 0444);
MODULE_PARM_DESC(single_major, "Use a single major number for all rbd devices (default: true)");

static ssize_t add_store(struct bus_type *bus, const char *buf, size_t count);
static ssize_t remove_store(struct bus_type *bus, const char *buf,
                            size_t count);
static ssize_t add_single_major_store(struct bus_type *bus, const char *buf,
                                      size_t count);
static ssize_t remove_single_major_store(struct bus_type *bus, const char *buf,
                                         size_t count);
static int rbd_dev_image_probe(struct rbd_device *rbd_dev, int depth);

static int rbd_dev_id_to_minor(int dev_id)
{
        return dev_id << RBD_SINGLE_MAJOR_PART_SHIFT;
}

static int minor_to_rbd_dev_id(int minor)
{
        return minor >> RBD_SINGLE_MAJOR_PART_SHIFT;
}

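/*
 * Worked example (illustrative): with RBD_SINGLE_MAJOR_PART_SHIFT of 4,
 * rbd_dev_id_to_minor(3) yields minor 48, and minors 48..63 all map
 * back to dev_id 3; the low 4 bits leave room for 15 partitions of
 * /dev/rbd3 under the single-major scheme.
 */
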
static bool __rbd_is_lock_owner(struct rbd_device *rbd_dev)
{
        return rbd_dev->lock_state == RBD_LOCK_STATE_LOCKED ||
               rbd_dev->lock_state == RBD_LOCK_STATE_RELEASING;
}

static bool rbd_is_lock_owner(struct rbd_device *rbd_dev)
{
        bool is_lock_owner;

        down_read(&rbd_dev->lock_rwsem);
        is_lock_owner = __rbd_is_lock_owner(rbd_dev);
        up_read(&rbd_dev->lock_rwsem);
        return is_lock_owner;
}

static ssize_t supported_features_show(struct bus_type *bus, char *buf)
{
        return sprintf(buf, "0x%llx\n", RBD_FEATURES_SUPPORTED);
}

static BUS_ATTR_WO(add);
static BUS_ATTR_WO(remove);
static BUS_ATTR_WO(add_single_major);
static BUS_ATTR_WO(remove_single_major);
static BUS_ATTR_RO(supported_features);

static struct attribute *rbd_bus_attrs[] = {
        &bus_attr_add.attr,
        &bus_attr_remove.attr,
        &bus_attr_add_single_major.attr,
        &bus_attr_remove_single_major.attr,
        &bus_attr_supported_features.attr,
        NULL,
};

static umode_t rbd_bus_is_visible(struct kobject *kobj,
                                  struct attribute *attr, int index)
{
        if (!single_major &&
            (attr == &bus_attr_add_single_major.attr ||
             attr == &bus_attr_remove_single_major.attr))
                return 0;

        return attr->mode;
}

static const struct attribute_group rbd_bus_group = {
        .attrs = rbd_bus_attrs,
        .is_visible = rbd_bus_is_visible,
};
__ATTRIBUTE_GROUPS(rbd_bus);

static struct bus_type rbd_bus_type = {
        .name           = "rbd",
        .bus_groups     = rbd_bus_groups,
};

static void rbd_root_dev_release(struct device *dev)
{
}

static struct device rbd_root_dev = {
        .init_name =    "rbd",
        .release =      rbd_root_dev_release,
};

static __printf(2, 3)
void rbd_warn(struct rbd_device *rbd_dev, const char *fmt, ...)
{
        struct va_format vaf;
        va_list args;

        va_start(args, fmt);
        vaf.fmt = fmt;
        vaf.va = &args;

        if (!rbd_dev)
                printk(KERN_WARNING "%s: %pV\n", RBD_DRV_NAME, &vaf);
        else if (rbd_dev->disk)
                printk(KERN_WARNING "%s: %s: %pV\n",
                        RBD_DRV_NAME, rbd_dev->disk->disk_name, &vaf);
        else if (rbd_dev->spec && rbd_dev->spec->image_name)
                printk(KERN_WARNING "%s: image %s: %pV\n",
                        RBD_DRV_NAME, rbd_dev->spec->image_name, &vaf);
        else if (rbd_dev->spec && rbd_dev->spec->image_id)
                printk(KERN_WARNING "%s: id %s: %pV\n",
                        RBD_DRV_NAME, rbd_dev->spec->image_id, &vaf);
        else    /* punt */
                printk(KERN_WARNING "%s: rbd_dev %p: %pV\n",
                        RBD_DRV_NAME, rbd_dev, &vaf);
        va_end(args);
}

#ifdef RBD_DEBUG
#define rbd_assert(expr)                                                \
                if (unlikely(!(expr))) {                                \
                        printk(KERN_ERR "\nAssertion failure in %s() "  \
                                                "at line %d:\n\n"       \
                                        "\trbd_assert(%s);\n\n",        \
                                        __func__, __LINE__, #expr);     \
                        BUG();                                          \
                }
#else /* !RBD_DEBUG */
#  define rbd_assert(expr)      ((void) 0)
#endif /* !RBD_DEBUG */

static void rbd_dev_remove_parent(struct rbd_device *rbd_dev);

static int rbd_dev_refresh(struct rbd_device *rbd_dev);
static int rbd_dev_v2_header_onetime(struct rbd_device *rbd_dev);
static int rbd_dev_header_info(struct rbd_device *rbd_dev);
static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev);
static const char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev,
                                        u64 snap_id);
static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id,
                u8 *order, u64 *snap_size);
static int _rbd_dev_v2_snap_features(struct rbd_device *rbd_dev, u64 snap_id,
                u64 *snap_features);

static void rbd_obj_handle_request(struct rbd_obj_request *obj_req, int result);
static void rbd_img_handle_request(struct rbd_img_request *img_req, int result);

/*
 * Return true if nothing else is pending.
 */
static bool pending_result_dec(struct pending_result *pending, int *result)
{
        rbd_assert(pending->num_pending > 0);

        if (*result && !pending->result)
                pending->result = *result;
        if (--pending->num_pending)
                return false;

        *result = pending->result;
        return true;
}

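/*
 * Illustrative sketch (not driver code): struct pending_result is a
 * completion fan-in.  The initiator records in num_pending how many
 * operations it dispatched, every completion path calls
 * pending_result_dec(), and only the final decrement returns true,
 * carrying the first nonzero result seen, roughly
 *
 *	if (pending_result_dec(&img_req->pending, &result))
 *		rbd_img_handle_request(img_req, result);
 */
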
static int rbd_open(struct block_device *bdev, fmode_t mode)
{
        struct rbd_device *rbd_dev = bdev->bd_disk->private_data;
        bool removing = false;

        spin_lock_irq(&rbd_dev->lock);
        if (test_bit(RBD_DEV_FLAG_REMOVING, &rbd_dev->flags))
                removing = true;
        else
                rbd_dev->open_count++;
        spin_unlock_irq(&rbd_dev->lock);
        if (removing)
                return -ENOENT;

        (void) get_device(&rbd_dev->dev);

        return 0;
}

static void rbd_release(struct gendisk *disk, fmode_t mode)
{
        struct rbd_device *rbd_dev = disk->private_data;
        unsigned long open_count_before;

        spin_lock_irq(&rbd_dev->lock);
        open_count_before = rbd_dev->open_count--;
        spin_unlock_irq(&rbd_dev->lock);
        rbd_assert(open_count_before > 0);

        put_device(&rbd_dev->dev);
}

static int rbd_ioctl_set_ro(struct rbd_device *rbd_dev, unsigned long arg)
{
        int ro;

        if (get_user(ro, (int __user *)arg))
                return -EFAULT;

        /* Snapshots can't be marked read-write */
        if (rbd_dev->spec->snap_id != CEPH_NOSNAP && !ro)
                return -EROFS;

        /* Let blkdev_roset() handle it */
        return -ENOTTY;
}

static int rbd_ioctl(struct block_device *bdev, fmode_t mode,
                        unsigned int cmd, unsigned long arg)
{
        struct rbd_device *rbd_dev = bdev->bd_disk->private_data;
        int ret;

        switch (cmd) {
        case BLKROSET:
                ret = rbd_ioctl_set_ro(rbd_dev, arg);
                break;
        default:
                ret = -ENOTTY;
        }

        return ret;
}

#ifdef CONFIG_COMPAT
static int rbd_compat_ioctl(struct block_device *bdev, fmode_t mode,
                                unsigned int cmd, unsigned long arg)
{
        return rbd_ioctl(bdev, mode, cmd, arg);
}
#endif /* CONFIG_COMPAT */

static const struct block_device_operations rbd_bd_ops = {
        .owner                  = THIS_MODULE,
        .open                   = rbd_open,
        .release                = rbd_release,
        .ioctl                  = rbd_ioctl,
#ifdef CONFIG_COMPAT
        .compat_ioctl           = rbd_compat_ioctl,
#endif
};

/*
 * Initialize an rbd client instance.  Success or not, this function
 * consumes ceph_opts.  Caller holds client_mutex.
 */
static struct rbd_client *rbd_client_create(struct ceph_options *ceph_opts)
{
        struct rbd_client *rbdc;
        int ret = -ENOMEM;

        dout("%s:\n", __func__);
        rbdc = kmalloc(sizeof(struct rbd_client), GFP_KERNEL);
        if (!rbdc)
                goto out_opt;

        kref_init(&rbdc->kref);
        INIT_LIST_HEAD(&rbdc->node);

        rbdc->client = ceph_create_client(ceph_opts, rbdc);
        if (IS_ERR(rbdc->client))
                goto out_rbdc;
        ceph_opts = NULL; /* Now rbdc->client is responsible for ceph_opts */

        ret = ceph_open_session(rbdc->client);
        if (ret < 0)
                goto out_client;

        spin_lock(&rbd_client_list_lock);
        list_add_tail(&rbdc->node, &rbd_client_list);
        spin_unlock(&rbd_client_list_lock);

        dout("%s: rbdc %p\n", __func__, rbdc);

        return rbdc;
out_client:
        ceph_destroy_client(rbdc->client);
out_rbdc:
        kfree(rbdc);
out_opt:
        if (ceph_opts)
                ceph_destroy_options(ceph_opts);
        dout("%s: error %d\n", __func__, ret);

        return ERR_PTR(ret);
}

static struct rbd_client *__rbd_get_client(struct rbd_client *rbdc)
{
        kref_get(&rbdc->kref);

        return rbdc;
}

/*
 * Find a ceph client with specific addr and configuration.  If
 * found, bump its reference count.
 */
static struct rbd_client *rbd_client_find(struct ceph_options *ceph_opts)
{
        struct rbd_client *client_node;
        bool found = false;

        if (ceph_opts->flags & CEPH_OPT_NOSHARE)
                return NULL;

        spin_lock(&rbd_client_list_lock);
        list_for_each_entry(client_node, &rbd_client_list, node) {
                if (!ceph_compare_options(ceph_opts, client_node->client)) {
                        __rbd_get_client(client_node);

                        found = true;
                        break;
                }
        }
        spin_unlock(&rbd_client_list_lock);

        return found ? client_node : NULL;
}

/*
 * (Per device) rbd map options
 */
enum {
        Opt_queue_depth,
        Opt_alloc_size,
        Opt_lock_timeout,
        Opt_last_int,
        /* int args above */
        Opt_pool_ns,
        Opt_last_string,
        /* string args above */
        Opt_read_only,
        Opt_read_write,
        Opt_lock_on_read,
        Opt_exclusive,
        Opt_notrim,
        Opt_err
};

static match_table_t rbd_opts_tokens = {
        {Opt_queue_depth, "queue_depth=%d"},
        {Opt_alloc_size, "alloc_size=%d"},
        {Opt_lock_timeout, "lock_timeout=%d"},
        /* int args above */
        {Opt_pool_ns, "_pool_ns=%s"},
        /* string args above */
        {Opt_read_only, "read_only"},
        {Opt_read_only, "ro"},		/* Alternate spelling */
        {Opt_read_write, "read_write"},
        {Opt_read_write, "rw"},		/* Alternate spelling */
        {Opt_lock_on_read, "lock_on_read"},
        {Opt_exclusive, "exclusive"},
        {Opt_notrim, "notrim"},
        {Opt_err, NULL}
};

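/*
 * Illustrative example (not driver code): these tokens cover the
 * comma-separated options portion of the string written to the add
 * sysfs file, so an options string such as
 *
 *	queue_depth=128,alloc_size=65536,lock_on_read,_pool_ns=myns
 *
 * would raise the queue depth, use 64K allocation chunks, take the
 * exclusive lock for reads as well as writes, and look up the image
 * in pool namespace "myns".
 */
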
struct rbd_options {
        int     queue_depth;
        int     alloc_size;
        unsigned long   lock_timeout;
        bool    read_only;
        bool    lock_on_read;
        bool    exclusive;
        bool    trim;
};

#define RBD_QUEUE_DEPTH_DEFAULT	BLKDEV_MAX_RQ
#define RBD_ALLOC_SIZE_DEFAULT	(64 * 1024)
#define RBD_LOCK_TIMEOUT_DEFAULT 0  /* no timeout */
#define RBD_READ_ONLY_DEFAULT	false
#define RBD_LOCK_ON_READ_DEFAULT false
#define RBD_EXCLUSIVE_DEFAULT	false
#define RBD_TRIM_DEFAULT	true

struct parse_rbd_opts_ctx {
        struct rbd_spec         *spec;
        struct rbd_options      *opts;
};

static int parse_rbd_opts_token(char *c, void *private)
{
        struct parse_rbd_opts_ctx *pctx = private;
        substring_t argstr[MAX_OPT_ARGS];
        int token, intval, ret;

        token = match_token(c, rbd_opts_tokens, argstr);
        if (token < Opt_last_int) {
                ret = match_int(&argstr[0], &intval);
                if (ret < 0) {
                        pr_err("bad option arg (not int) at '%s'\n", c);
                        return ret;
                }
                dout("got int token %d val %d\n", token, intval);
        } else if (token > Opt_last_int && token < Opt_last_string) {
                dout("got string token %d val %s\n", token, argstr[0].from);
        } else {
                dout("got token %d\n", token);
        }

        switch (token) {
        case Opt_queue_depth:
                if (intval < 1) {
                        pr_err("queue_depth out of range\n");
                        return -EINVAL;
                }
                pctx->opts->queue_depth = intval;
                break;
        case Opt_alloc_size:
                if (intval < SECTOR_SIZE) {
                        pr_err("alloc_size out of range\n");
                        return -EINVAL;
                }
                if (!is_power_of_2(intval)) {
                        pr_err("alloc_size must be a power of 2\n");
                        return -EINVAL;
                }
                pctx->opts->alloc_size = intval;
                break;
        case Opt_lock_timeout:
                /* 0 is "wait forever" (i.e. infinite timeout) */
                if (intval < 0 || intval > INT_MAX / 1000) {
                        pr_err("lock_timeout out of range\n");
                        return -EINVAL;
                }
                pctx->opts->lock_timeout = msecs_to_jiffies(intval * 1000);
                break;
        case Opt_pool_ns:
                kfree(pctx->spec->pool_ns);
                pctx->spec->pool_ns = match_strdup(argstr);
                if (!pctx->spec->pool_ns)
                        return -ENOMEM;
                break;
        case Opt_read_only:
                pctx->opts->read_only = true;
                break;
        case Opt_read_write:
                pctx->opts->read_only = false;
                break;
        case Opt_lock_on_read:
                pctx->opts->lock_on_read = true;
                break;
        case Opt_exclusive:
                pctx->opts->exclusive = true;
                break;
        case Opt_notrim:
                pctx->opts->trim = false;
                break;
        default:
                /* libceph prints "bad option" msg */
                return -EINVAL;
        }

        return 0;
}

static char *obj_op_name(enum obj_operation_type op_type)
{
        switch (op_type) {
        case OBJ_OP_READ:
                return "read";
        case OBJ_OP_WRITE:
                return "write";
        case OBJ_OP_DISCARD:
                return "discard";
        case OBJ_OP_ZEROOUT:
                return "zeroout";
        default:
                return "???";
        }
}

/*
 * Destroy ceph client
 *
 * Removes the client from rbd_client_list; takes rbd_client_list_lock
 * itself, so the caller must not hold it.
 */
static void rbd_client_release(struct kref *kref)
{
        struct rbd_client *rbdc = container_of(kref, struct rbd_client, kref);

        dout("%s: rbdc %p\n", __func__, rbdc);
        spin_lock(&rbd_client_list_lock);
        list_del(&rbdc->node);
        spin_unlock(&rbd_client_list_lock);

        ceph_destroy_client(rbdc->client);
        kfree(rbdc);
}

/*
 * Drop reference to ceph client node. If it's not referenced anymore, release
 * it.
 */
static void rbd_put_client(struct rbd_client *rbdc)
{
        if (rbdc)
                kref_put(&rbdc->kref, rbd_client_release);
}

/*
 * Get a ceph client with specific addr and configuration, if one does
 * not exist create it.  Either way, ceph_opts is consumed by this
 * function.
 */
static struct rbd_client *rbd_get_client(struct ceph_options *ceph_opts)
{
        struct rbd_client *rbdc;
        int ret;

        mutex_lock(&client_mutex);
        rbdc = rbd_client_find(ceph_opts);
        if (rbdc) {
                ceph_destroy_options(ceph_opts);

                /*
                 * Using an existing client.  Make sure ->pg_pools is up to
                 * date before we look up the pool id in do_rbd_add().
                 */
                ret = ceph_wait_for_latest_osdmap(rbdc->client,
                                        rbdc->client->options->mount_timeout);
                if (ret) {
                        rbd_warn(NULL, "failed to get latest osdmap: %d", ret);
                        rbd_put_client(rbdc);
                        rbdc = ERR_PTR(ret);
                }
        } else {
                rbdc = rbd_client_create(ceph_opts);
        }
        mutex_unlock(&client_mutex);

        return rbdc;
}

static bool rbd_image_format_valid(u32 image_format)
{
        return image_format == 1 || image_format == 2;
}

static bool rbd_dev_ondisk_valid(struct rbd_image_header_ondisk *ondisk)
{
        size_t size;
        u32 snap_count;

        /* The header has to start with the magic rbd header text */
        if (memcmp(&ondisk->text, RBD_HEADER_TEXT, sizeof (RBD_HEADER_TEXT)))
                return false;

        /* The bio layer requires at least sector-sized I/O */

        if (ondisk->options.order < SECTOR_SHIFT)
                return false;

        /* If we use u64 in a few spots we may be able to loosen this */

        if (ondisk->options.order > 8 * sizeof (int) - 1)
                return false;

        /*
         * The size of a snapshot header has to fit in a size_t, and
         * that limits the number of snapshots.
         */
        snap_count = le32_to_cpu(ondisk->snap_count);
        size = SIZE_MAX - sizeof (struct ceph_snap_context);
        if (snap_count > size / sizeof (__le64))
                return false;

        /*
         * Not only that, but the size of the entire snapshot
         * header must also be representable in a size_t.
         */
        size -= snap_count * sizeof (__le64);
        if ((u64) size < le64_to_cpu(ondisk->snap_names_len))
                return false;

        return true;
}

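/*
 * Worked example (illustrative): with SECTOR_SHIFT 9 and 32-bit int,
 * the order checks above constrain a format 1 object to between
 * 2^9 = 512 bytes and 2^31 bytes, which also keeps rbd_obj_bytes()
 * below from overflowing its u32 return type.
 */
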
/*
 * returns the size of an object in the image
 */
static u32 rbd_obj_bytes(struct rbd_image_header *header)
{
        return 1U << header->obj_order;
}

static void rbd_init_layout(struct rbd_device *rbd_dev)
{
        if (rbd_dev->header.stripe_unit == 0 ||
            rbd_dev->header.stripe_count == 0) {
                rbd_dev->header.stripe_unit = rbd_obj_bytes(&rbd_dev->header);
                rbd_dev->header.stripe_count = 1;
        }

        rbd_dev->layout.stripe_unit = rbd_dev->header.stripe_unit;
        rbd_dev->layout.stripe_count = rbd_dev->header.stripe_count;
        rbd_dev->layout.object_size = rbd_obj_bytes(&rbd_dev->header);
        rbd_dev->layout.pool_id = rbd_dev->header.data_pool_id == CEPH_NOPOOL ?
                          rbd_dev->spec->pool_id : rbd_dev->header.data_pool_id;
        RCU_INIT_POINTER(rbd_dev->layout.pool_ns, NULL);
}

/*
 * Fill an rbd image header with information from the given format 1
 * on-disk header.
 */
static int rbd_header_from_disk(struct rbd_device *rbd_dev,
                                 struct rbd_image_header_ondisk *ondisk)
{
        struct rbd_image_header *header = &rbd_dev->header;
        bool first_time = header->object_prefix == NULL;
        struct ceph_snap_context *snapc;
        char *object_prefix = NULL;
        char *snap_names = NULL;
        u64 *snap_sizes = NULL;
        u32 snap_count;
        int ret = -ENOMEM;
        u32 i;

        /* Allocate this now to avoid having to handle failure below */

        if (first_time) {
                object_prefix = kstrndup(ondisk->object_prefix,
                                         sizeof(ondisk->object_prefix),
                                         GFP_KERNEL);
                if (!object_prefix)
                        return -ENOMEM;
        }

        /* Allocate the snapshot context and fill it in */

        snap_count = le32_to_cpu(ondisk->snap_count);
        snapc = ceph_create_snap_context(snap_count, GFP_KERNEL);
        if (!snapc)
                goto out_err;
        snapc->seq = le64_to_cpu(ondisk->snap_seq);
        if (snap_count) {
                struct rbd_image_snap_ondisk *snaps;
                u64 snap_names_len = le64_to_cpu(ondisk->snap_names_len);

                /* We'll keep a copy of the snapshot names... */

                if (snap_names_len > (u64)SIZE_MAX)
                        goto out_2big;
                snap_names = kmalloc(snap_names_len, GFP_KERNEL);
                if (!snap_names)
                        goto out_err;

                /* ...as well as the array of their sizes. */
                snap_sizes = kmalloc_array(snap_count,
                                           sizeof(*header->snap_sizes),
                                           GFP_KERNEL);
                if (!snap_sizes)
                        goto out_err;

                /*
                 * Copy the names, and fill in each snapshot's id
                 * and size.
                 *
                 * Note that rbd_dev_v1_header_info() guarantees the
                 * ondisk buffer we're working with has
                 * snap_names_len bytes beyond the end of the
                 * snapshot id array, so this memcpy() is safe.
                 */
                memcpy(snap_names, &ondisk->snaps[snap_count], snap_names_len);
                snaps = ondisk->snaps;
                for (i = 0; i < snap_count; i++) {
                        snapc->snaps[i] = le64_to_cpu(snaps[i].id);
                        snap_sizes[i] = le64_to_cpu(snaps[i].image_size);
                }
        }

        /* We won't fail any more, fill in the header */

        if (first_time) {
                header->object_prefix = object_prefix;
                header->obj_order = ondisk->options.order;
                rbd_init_layout(rbd_dev);
        } else {
                ceph_put_snap_context(header->snapc);
                kfree(header->snap_names);
                kfree(header->snap_sizes);
        }

        /* The remaining fields always get updated (when we refresh) */

        header->image_size = le64_to_cpu(ondisk->image_size);
        header->snapc = snapc;
        header->snap_names = snap_names;
        header->snap_sizes = snap_sizes;

        return 0;
out_2big:
        ret = -EIO;
out_err:
        kfree(snap_sizes);
        kfree(snap_names);
        ceph_put_snap_context(snapc);
        kfree(object_prefix);

        return ret;
}

static const char *_rbd_dev_v1_snap_name(struct rbd_device *rbd_dev, u32 which)
{
        const char *snap_name;

        rbd_assert(which < rbd_dev->header.snapc->num_snaps);

        /* Skip over names until we find the one we are looking for */

        snap_name = rbd_dev->header.snap_names;
        while (which--)
                snap_name += strlen(snap_name) + 1;

        return kstrdup(snap_name, GFP_KERNEL);
}

/*
 * Snapshot id comparison function for use with qsort()/bsearch().
 * Note that result is for snapshots in *descending* order.
 */
static int snapid_compare_reverse(const void *s1, const void *s2)
{
        u64 snap_id1 = *(u64 *)s1;
        u64 snap_id2 = *(u64 *)s2;

        if (snap_id1 < snap_id2)
                return 1;
        return snap_id1 == snap_id2 ? 0 : -1;
}

/*
 * Search a snapshot context to see if the given snapshot id is
 * present.
 *
 * Returns the position of the snapshot id in the array if it's found,
 * or BAD_SNAP_INDEX otherwise.
 *
 * Note: The snapshot array is kept sorted (by the osd) in
 * reverse order, highest snapshot id first.
 */
static u32 rbd_dev_snap_index(struct rbd_device *rbd_dev, u64 snap_id)
{
        struct ceph_snap_context *snapc = rbd_dev->header.snapc;
        u64 *found;

        found = bsearch(&snap_id, &snapc->snaps, snapc->num_snaps,
                                sizeof (snap_id), snapid_compare_reverse);

        return found ? (u32)(found - &snapc->snaps[0]) : BAD_SNAP_INDEX;
}

static const char *rbd_dev_v1_snap_name(struct rbd_device *rbd_dev,
					u64 snap_id)
{
	u32 which;
	const char *snap_name;

	which = rbd_dev_snap_index(rbd_dev, snap_id);
	if (which == BAD_SNAP_INDEX)
		return ERR_PTR(-ENOENT);

	snap_name = _rbd_dev_v1_snap_name(rbd_dev, which);
	return snap_name ? snap_name : ERR_PTR(-ENOMEM);
}

static const char *rbd_snap_name(struct rbd_device *rbd_dev, u64 snap_id)
{
	if (snap_id == CEPH_NOSNAP)
		return RBD_SNAP_HEAD_NAME;

	rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
	if (rbd_dev->image_format == 1)
		return rbd_dev_v1_snap_name(rbd_dev, snap_id);

	return rbd_dev_v2_snap_name(rbd_dev, snap_id);
}

static int rbd_snap_size(struct rbd_device *rbd_dev, u64 snap_id,
				u64 *snap_size)
{
	rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
	if (snap_id == CEPH_NOSNAP) {
		*snap_size = rbd_dev->header.image_size;
	} else if (rbd_dev->image_format == 1) {
		u32 which;

		which = rbd_dev_snap_index(rbd_dev, snap_id);
		if (which == BAD_SNAP_INDEX)
			return -ENOENT;

		*snap_size = rbd_dev->header.snap_sizes[which];
	} else {
		u64 size = 0;
		int ret;

		ret = _rbd_dev_v2_snap_size(rbd_dev, snap_id, NULL, &size);
		if (ret)
			return ret;

		*snap_size = size;
	}
	return 0;
}

static int rbd_snap_features(struct rbd_device *rbd_dev, u64 snap_id,
			u64 *snap_features)
{
	rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
	if (snap_id == CEPH_NOSNAP) {
		*snap_features = rbd_dev->header.features;
	} else if (rbd_dev->image_format == 1) {
		*snap_features = 0;	/* No features for format 1 */
	} else {
		u64 features = 0;
		int ret;

		ret = _rbd_dev_v2_snap_features(rbd_dev, snap_id, &features);
		if (ret)
			return ret;

		*snap_features = features;
	}
	return 0;
}

static int rbd_dev_mapping_set(struct rbd_device *rbd_dev)
{
	u64 snap_id = rbd_dev->spec->snap_id;
	u64 size = 0;
	u64 features = 0;
	int ret;

	ret = rbd_snap_size(rbd_dev, snap_id, &size);
	if (ret)
		return ret;
	ret = rbd_snap_features(rbd_dev, snap_id, &features);
	if (ret)
		return ret;

	rbd_dev->mapping.size = size;
	rbd_dev->mapping.features = features;

	return 0;
}

static void rbd_dev_mapping_clear(struct rbd_device *rbd_dev)
{
	rbd_dev->mapping.size = 0;
	rbd_dev->mapping.features = 0;
}

static void zero_bvec(struct bio_vec *bv)
{
	void *buf;
	unsigned long flags;

	buf = bvec_kmap_irq(bv, &flags);
	memset(buf, 0, bv->bv_len);
	flush_dcache_page(bv->bv_page);
	bvec_kunmap_irq(buf, &flags);
}

static void zero_bios(struct ceph_bio_iter *bio_pos, u32 off, u32 bytes)
{
	struct ceph_bio_iter it = *bio_pos;

	ceph_bio_iter_advance(&it, off);
	ceph_bio_iter_advance_step(&it, bytes, ({
		zero_bvec(&bv);
	}));
}

static void zero_bvecs(struct ceph_bvec_iter *bvec_pos, u32 off, u32 bytes)
{
	struct ceph_bvec_iter it = *bvec_pos;

	ceph_bvec_iter_advance(&it, off);
	ceph_bvec_iter_advance_step(&it, bytes, ({
		zero_bvec(&bv);
	}));
}

/*
 * Zero a range in @obj_req data buffer defined by a bio (list) or
 * (private) bio_vec array.
 *
 * @off is relative to the start of the data buffer.
 */
static void rbd_obj_zero_range(struct rbd_obj_request *obj_req, u32 off,
			       u32 bytes)
{
	dout("%s %p data buf %u~%u\n", __func__, obj_req, off, bytes);

	switch (obj_req->img_request->data_type) {
	case OBJ_REQUEST_BIO:
		zero_bios(&obj_req->bio_pos, off, bytes);
		break;
	case OBJ_REQUEST_BVECS:
	case OBJ_REQUEST_OWN_BVECS:
		zero_bvecs(&obj_req->bvec_pos, off, bytes);
		break;
	default:
		BUG();
	}
}
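
/*
 * Worked example (sketch, numbers made up): if a 16K read comes back
 * short with only 4K of data, the read completion path calls
 *
 *	rbd_obj_zero_range(obj_req, 4096, 12288);
 *
 * zero-filling bytes 4096..16383 of the data buffer via whichever of
 * the bio or bio_vec descriptors the image request carries.
 */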

static void rbd_obj_request_destroy(struct kref *kref);
static void rbd_obj_request_put(struct rbd_obj_request *obj_request)
{
	rbd_assert(obj_request != NULL);
	dout("%s: obj %p (was %d)\n", __func__, obj_request,
	     kref_read(&obj_request->kref));
	kref_put(&obj_request->kref, rbd_obj_request_destroy);
}

static void rbd_img_request_destroy(struct kref *kref);
static void rbd_img_request_put(struct rbd_img_request *img_request)
{
	rbd_assert(img_request != NULL);
	dout("%s: img %p (was %d)\n", __func__, img_request,
	     kref_read(&img_request->kref));
	kref_put(&img_request->kref, rbd_img_request_destroy);
}

static inline void rbd_img_obj_request_add(struct rbd_img_request *img_request,
					struct rbd_obj_request *obj_request)
{
	rbd_assert(obj_request->img_request == NULL);

	/* Image request now owns object's original reference */
	obj_request->img_request = img_request;
	dout("%s: img %p obj %p\n", __func__, img_request, obj_request);
}

static inline void rbd_img_obj_request_del(struct rbd_img_request *img_request,
					struct rbd_obj_request *obj_request)
{
	dout("%s: img %p obj %p\n", __func__, img_request, obj_request);
	list_del(&obj_request->ex.oe_item);
	rbd_assert(obj_request->img_request == img_request);
	rbd_obj_request_put(obj_request);
}

static void rbd_obj_request_submit(struct rbd_obj_request *obj_request)
{
	struct ceph_osd_request *osd_req =
	    list_last_entry(&obj_request->osd_reqs, struct ceph_osd_request,
			    r_private_item);

	dout("%s %p object_no %016llx %llu~%llu osd_req %p\n", __func__,
	     obj_request, obj_request->ex.oe_objno, obj_request->ex.oe_off,
	     obj_request->ex.oe_len, osd_req);
	ceph_osdc_start_request(osd_req->r_osdc, osd_req, false);
}

/*
 * The default/initial value for all image request flags is 0.  Each
 * is conditionally set to 1 at image request initialization time
 * and currently never changes thereafter.
 */
static void img_request_layered_set(struct rbd_img_request *img_request)
{
	set_bit(IMG_REQ_LAYERED, &img_request->flags);
	smp_mb();
}

static void img_request_layered_clear(struct rbd_img_request *img_request)
{
	clear_bit(IMG_REQ_LAYERED, &img_request->flags);
	smp_mb();
}

static bool img_request_layered_test(struct rbd_img_request *img_request)
{
	smp_mb();
	return test_bit(IMG_REQ_LAYERED, &img_request->flags) != 0;
}

static bool rbd_obj_is_entire(struct rbd_obj_request *obj_req)
{
	struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;

	return !obj_req->ex.oe_off &&
	       obj_req->ex.oe_len == rbd_dev->layout.object_size;
}

static bool rbd_obj_is_tail(struct rbd_obj_request *obj_req)
{
	struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;

	return obj_req->ex.oe_off + obj_req->ex.oe_len ==
	       rbd_dev->layout.object_size;
}
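
/*
 * Example (sketch, assuming the default 4M object size): a request
 * covering 0~4M of an object is "entire"; one covering 3M~1M ends at
 * the object boundary and is a "tail"; a 1M~1M request is neither.
 */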

/*
 * Must be called after rbd_obj_calc_img_extents().
 */
static bool rbd_obj_copyup_enabled(struct rbd_obj_request *obj_req)
{
	if (!obj_req->num_img_extents ||
	    (rbd_obj_is_entire(obj_req) &&
	     !obj_req->img_request->snapc->num_snaps))
		return false;

	return true;
}
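
/*
 * Example (sketch): with a parent overlap in place, a 4K write into
 * an object has image extents and is not entire, so copyup is
 * enabled.  A full-object write to an image with no snapshots skips
 * copyup: the copied-up parent data would be overwritten by the
 * write itself.
 */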

static u64 rbd_obj_img_extents_bytes(struct rbd_obj_request *obj_req)
{
	return ceph_file_extents_bytes(obj_req->img_extents,
				       obj_req->num_img_extents);
}

static bool rbd_img_is_write(struct rbd_img_request *img_req)
{
	switch (img_req->op_type) {
	case OBJ_OP_READ:
		return false;
	case OBJ_OP_WRITE:
	case OBJ_OP_DISCARD:
	case OBJ_OP_ZEROOUT:
		return true;
	default:
		BUG();
	}
}

static void rbd_osd_req_callback(struct ceph_osd_request *osd_req)
{
	struct rbd_obj_request *obj_req = osd_req->r_priv;
	int result;

	dout("%s osd_req %p result %d for obj_req %p\n", __func__, osd_req,
	     osd_req->r_result, obj_req);

	/*
	 * Writes aren't allowed to return a data payload.  In some
	 * guarded write cases (e.g. stat + zero on an empty object)
	 * a stat response makes it through, but we don't care.
	 */
	if (osd_req->r_result > 0 && rbd_img_is_write(obj_req->img_request))
		result = 0;
	else
		result = osd_req->r_result;

	rbd_obj_handle_request(obj_req, result);
}

static void rbd_osd_format_read(struct ceph_osd_request *osd_req)
{
	struct rbd_obj_request *obj_request = osd_req->r_priv;

	osd_req->r_flags = CEPH_OSD_FLAG_READ;
	osd_req->r_snapid = obj_request->img_request->snap_id;
}

static void rbd_osd_format_write(struct ceph_osd_request *osd_req)
{
	struct rbd_obj_request *obj_request = osd_req->r_priv;

	osd_req->r_flags = CEPH_OSD_FLAG_WRITE;
	ktime_get_real_ts64(&osd_req->r_mtime);
	osd_req->r_data_offset = obj_request->ex.oe_off;
}

static struct ceph_osd_request *
__rbd_obj_add_osd_request(struct rbd_obj_request *obj_req,
			  struct ceph_snap_context *snapc, int num_ops)
{
	struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
	struct ceph_osd_request *req;
	const char *name_format = rbd_dev->image_format == 1 ?
				      RBD_V1_DATA_FORMAT : RBD_V2_DATA_FORMAT;
	int ret;

	req = ceph_osdc_alloc_request(osdc, snapc, num_ops, false, GFP_NOIO);
	if (!req)
		return ERR_PTR(-ENOMEM);

	list_add_tail(&req->r_private_item, &obj_req->osd_reqs);
	req->r_callback = rbd_osd_req_callback;
	req->r_priv = obj_req;

	/*
	 * Data objects may be stored in a separate pool, but always in
	 * the same namespace in that pool as the header in its pool.
	 */
	ceph_oloc_copy(&req->r_base_oloc, &rbd_dev->header_oloc);
	req->r_base_oloc.pool = rbd_dev->layout.pool_id;

	ret = ceph_oid_aprintf(&req->r_base_oid, GFP_NOIO, name_format,
			       rbd_dev->header.object_prefix,
			       obj_req->ex.oe_objno);
	if (ret)
		return ERR_PTR(ret);

	return req;
}

static struct ceph_osd_request *
rbd_obj_add_osd_request(struct rbd_obj_request *obj_req, int num_ops)
{
	return __rbd_obj_add_osd_request(obj_req, obj_req->img_request->snapc,
					 num_ops);
}

static struct rbd_obj_request *rbd_obj_request_create(void)
{
	struct rbd_obj_request *obj_request;

	obj_request = kmem_cache_zalloc(rbd_obj_request_cache, GFP_NOIO);
	if (!obj_request)
		return NULL;

	ceph_object_extent_init(&obj_request->ex);
	INIT_LIST_HEAD(&obj_request->osd_reqs);
	mutex_init(&obj_request->state_mutex);
	kref_init(&obj_request->kref);

	dout("%s %p\n", __func__, obj_request);
	return obj_request;
}

static void rbd_obj_request_destroy(struct kref *kref)
{
	struct rbd_obj_request *obj_request;
	struct ceph_osd_request *osd_req;
	u32 i;

	obj_request = container_of(kref, struct rbd_obj_request, kref);

	dout("%s: obj %p\n", __func__, obj_request);

	while (!list_empty(&obj_request->osd_reqs)) {
		osd_req = list_first_entry(&obj_request->osd_reqs,
				    struct ceph_osd_request, r_private_item);
		list_del_init(&osd_req->r_private_item);
		ceph_osdc_put_request(osd_req);
	}

	switch (obj_request->img_request->data_type) {
	case OBJ_REQUEST_NODATA:
	case OBJ_REQUEST_BIO:
	case OBJ_REQUEST_BVECS:
		break;		/* Nothing to do */
	case OBJ_REQUEST_OWN_BVECS:
		kfree(obj_request->bvec_pos.bvecs);
		break;
	default:
		BUG();
	}

	kfree(obj_request->img_extents);
	if (obj_request->copyup_bvecs) {
		for (i = 0; i < obj_request->copyup_bvec_count; i++) {
			if (obj_request->copyup_bvecs[i].bv_page)
				__free_page(obj_request->copyup_bvecs[i].bv_page);
		}
		kfree(obj_request->copyup_bvecs);
	}

	kmem_cache_free(rbd_obj_request_cache, obj_request);
}

/* It's OK to call this for a device with no parent */

static void rbd_spec_put(struct rbd_spec *spec);
static void rbd_dev_unparent(struct rbd_device *rbd_dev)
{
	rbd_dev_remove_parent(rbd_dev);
	rbd_spec_put(rbd_dev->parent_spec);
	rbd_dev->parent_spec = NULL;
	rbd_dev->parent_overlap = 0;
}

/*
 * Parent image reference counting is used to determine when an
 * image's parent fields can be safely torn down--after there are no
 * more in-flight requests to the parent image.  When the last
 * reference is dropped, cleaning them up is safe.
 */
static void rbd_dev_parent_put(struct rbd_device *rbd_dev)
{
	int counter;

	if (!rbd_dev->parent_spec)
		return;

	counter = atomic_dec_return_safe(&rbd_dev->parent_ref);
	if (counter > 0)
		return;

	/* Last reference; clean up parent data structures */

	if (!counter)
		rbd_dev_unparent(rbd_dev);
	else
		rbd_warn(rbd_dev, "parent reference underflow");
}

/*
 * If an image has a non-zero parent overlap, get a reference to its
 * parent.
 *
 * Returns true if the rbd device has a parent with a non-zero
 * overlap and a reference for it was successfully taken, or
 * false otherwise.
 */
static bool rbd_dev_parent_get(struct rbd_device *rbd_dev)
{
	int counter = 0;

	if (!rbd_dev->parent_spec)
		return false;

	down_read(&rbd_dev->header_rwsem);
	if (rbd_dev->parent_overlap)
		counter = atomic_inc_return_safe(&rbd_dev->parent_ref);
	up_read(&rbd_dev->header_rwsem);

	if (counter < 0)
		rbd_warn(rbd_dev, "parent reference overflow");

	return counter > 0;
}
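
/*
 * Sketch of the intended pairing (not a real call site):
 *
 *	if (rbd_dev_parent_get(rbd_dev)) {
 *		... issue I/O against rbd_dev->parent ...
 *		rbd_dev_parent_put(rbd_dev);
 *	}
 *
 * In this driver the get happens in rbd_img_request_create() below
 * and the put in rbd_img_request_destroy(), bracketing the image
 * request's lifetime.
 */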

/*
 * Caller is responsible for filling in the list of object requests
 * that comprises the image request, and the Linux request pointer
 * (if there is one).
 */
static struct rbd_img_request *rbd_img_request_create(
					struct rbd_device *rbd_dev,
					enum obj_operation_type op_type,
					struct ceph_snap_context *snapc)
{
	struct rbd_img_request *img_request;

	img_request = kmem_cache_zalloc(rbd_img_request_cache, GFP_NOIO);
	if (!img_request)
		return NULL;

	img_request->rbd_dev = rbd_dev;
	img_request->op_type = op_type;
	if (!rbd_img_is_write(img_request))
		img_request->snap_id = rbd_dev->spec->snap_id;
	else
		img_request->snapc = snapc;

	if (rbd_dev_parent_get(rbd_dev))
		img_request_layered_set(img_request);

	INIT_LIST_HEAD(&img_request->object_extents);
	mutex_init(&img_request->state_mutex);
	kref_init(&img_request->kref);

	dout("%s: rbd_dev %p %s -> img %p\n", __func__, rbd_dev,
	     obj_op_name(op_type), img_request);
	return img_request;
}

static void rbd_img_request_destroy(struct kref *kref)
{
	struct rbd_img_request *img_request;
	struct rbd_obj_request *obj_request;
	struct rbd_obj_request *next_obj_request;

	img_request = container_of(kref, struct rbd_img_request, kref);

	dout("%s: img %p\n", __func__, img_request);

	for_each_obj_request_safe(img_request, obj_request, next_obj_request)
		rbd_img_obj_request_del(img_request, obj_request);

	if (img_request_layered_test(img_request)) {
		img_request_layered_clear(img_request);
		rbd_dev_parent_put(img_request->rbd_dev);
	}

	if (rbd_img_is_write(img_request))
		ceph_put_snap_context(img_request->snapc);

	kmem_cache_free(rbd_img_request_cache, img_request);
}

static void prune_extents(struct ceph_file_extent *img_extents,
			  u32 *num_img_extents, u64 overlap)
{
	u32 cnt = *num_img_extents;

	/* drop extents completely beyond the overlap */
	while (cnt && img_extents[cnt - 1].fe_off >= overlap)
		cnt--;

	if (cnt) {
		struct ceph_file_extent *ex = &img_extents[cnt - 1];

		/* trim final overlapping extent */
		if (ex->fe_off + ex->fe_len > overlap)
			ex->fe_len = overlap - ex->fe_off;
	}

	*num_img_extents = cnt;
}

/*
 * Determine the byte range(s) covered by either just the object extent
 * or the entire object in the parent image.
 */
static int rbd_obj_calc_img_extents(struct rbd_obj_request *obj_req,
				    bool entire)
{
	struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
	int ret;

	if (!rbd_dev->parent_overlap)
		return 0;

	ret = ceph_extent_to_file(&rbd_dev->layout, obj_req->ex.oe_objno,
				  entire ? 0 : obj_req->ex.oe_off,
				  entire ? rbd_dev->layout.object_size :
					   obj_req->ex.oe_len,
				  &obj_req->img_extents,
				  &obj_req->num_img_extents);
	if (ret)
		return ret;

	prune_extents(obj_req->img_extents, &obj_req->num_img_extents,
		      rbd_dev->parent_overlap);
	return 0;
}
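
/*
 * Worked example (sketch, simple layout): with a 4M object size and
 * a parent_overlap of 6M, reverse mapping all of object 1 yields the
 * image extent 4M~4M, which prune_extents() trims to 4M~2M.  Object 2
 * maps entirely beyond the overlap, so it is left with
 * num_img_extents == 0 and is never read from the parent.
 */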

static void rbd_osd_setup_data(struct ceph_osd_request *osd_req, int which)
{
	struct rbd_obj_request *obj_req = osd_req->r_priv;

	switch (obj_req->img_request->data_type) {
	case OBJ_REQUEST_BIO:
		osd_req_op_extent_osd_data_bio(osd_req, which,
					       &obj_req->bio_pos,
					       obj_req->ex.oe_len);
		break;
	case OBJ_REQUEST_BVECS:
	case OBJ_REQUEST_OWN_BVECS:
		rbd_assert(obj_req->bvec_pos.iter.bi_size ==
							obj_req->ex.oe_len);
		rbd_assert(obj_req->bvec_idx == obj_req->bvec_count);
		osd_req_op_extent_osd_data_bvec_pos(osd_req, which,
						    &obj_req->bvec_pos);
		break;
	default:
		BUG();
	}
}

static int rbd_obj_setup_read(struct rbd_obj_request *obj_req)
{
	struct ceph_osd_request *osd_req;

	osd_req = __rbd_obj_add_osd_request(obj_req, NULL, 1);
	if (IS_ERR(osd_req))
		return PTR_ERR(osd_req);

	osd_req_op_extent_init(osd_req, 0, CEPH_OSD_OP_READ,
			       obj_req->ex.oe_off, obj_req->ex.oe_len, 0, 0);
	rbd_osd_setup_data(osd_req, 0);

	rbd_osd_format_read(osd_req);
	obj_req->read_state = RBD_OBJ_READ_START;
	return 0;
}

static int rbd_osd_setup_stat(struct ceph_osd_request *osd_req, int which)
{
	struct page **pages;

	/*
	 * The response data for a STAT call consists of:
	 *     le64 length;
	 *     struct {
	 *         le32 tv_sec;
	 *         le32 tv_nsec;
	 *     } mtime;
	 */
	pages = ceph_alloc_page_vector(1, GFP_NOIO);
	if (IS_ERR(pages))
		return PTR_ERR(pages);

	osd_req_op_init(osd_req, which, CEPH_OSD_OP_STAT, 0);
	osd_req_op_raw_data_in_pages(osd_req, which, pages,
				     8 + sizeof(struct ceph_timespec),
				     0, false, true);
	return 0;
}
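
/*
 * Illustrative decode of that reply (sketch, not driver code; the
 * driver only cares whether the object exists, i.e. whether the
 * stat op succeeds):
 *
 *	void *p = page_address(pages[0]);
 *	u64 size = le64_to_cpu(*(__le64 *)p);
 *	struct ceph_timespec *mtime = p + 8;
 */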

static int rbd_osd_setup_copyup(struct ceph_osd_request *osd_req, int which,
				u32 bytes)
{
	struct rbd_obj_request *obj_req = osd_req->r_priv;
	int ret;

	ret = osd_req_op_cls_init(osd_req, which, "rbd", "copyup");
	if (ret)
		return ret;

	osd_req_op_cls_request_data_bvecs(osd_req, which, obj_req->copyup_bvecs,
					  obj_req->copyup_bvec_count, bytes);
	return 0;
}

static int count_write_ops(struct rbd_obj_request *obj_req)
{
	return 2; /* setallochint + write/writefull */
}

static void __rbd_osd_setup_write_ops(struct ceph_osd_request *osd_req,
				      int which)
{
	struct rbd_obj_request *obj_req = osd_req->r_priv;
	struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
	u16 opcode;

	osd_req_op_alloc_hint_init(osd_req, which++,
				   rbd_dev->layout.object_size,
				   rbd_dev->layout.object_size);

	if (rbd_obj_is_entire(obj_req))
		opcode = CEPH_OSD_OP_WRITEFULL;
	else
		opcode = CEPH_OSD_OP_WRITE;

	osd_req_op_extent_init(osd_req, which, opcode,
			       obj_req->ex.oe_off, obj_req->ex.oe_len, 0, 0);
	rbd_osd_setup_data(osd_req, which);

	rbd_osd_format_write(osd_req);
}

static int rbd_obj_setup_write(struct rbd_obj_request *obj_req)
{
	struct ceph_osd_request *osd_req;
	unsigned int num_osd_ops, which = 0;
	int ret;

	/* reverse map the entire object onto the parent */
	ret = rbd_obj_calc_img_extents(obj_req, true);
	if (ret)
		return ret;

	if (rbd_obj_copyup_enabled(obj_req))
		obj_req->flags |= RBD_OBJ_FLAG_COPYUP_ENABLED;

	num_osd_ops = count_write_ops(obj_req);
	if (obj_req->flags & RBD_OBJ_FLAG_COPYUP_ENABLED)
		num_osd_ops++; /* stat */

	osd_req = rbd_obj_add_osd_request(obj_req, num_osd_ops);
	if (IS_ERR(osd_req))
		return PTR_ERR(osd_req);

	if (obj_req->flags & RBD_OBJ_FLAG_COPYUP_ENABLED) {
		ret = rbd_osd_setup_stat(osd_req, which++);
		if (ret)
			return ret;
	}

	obj_req->write_state = RBD_OBJ_WRITE_START;
	__rbd_osd_setup_write_ops(osd_req, which);
	return 0;
}

static u16 truncate_or_zero_opcode(struct rbd_obj_request *obj_req)
{
	return rbd_obj_is_tail(obj_req) ? CEPH_OSD_OP_TRUNCATE :
					  CEPH_OSD_OP_ZERO;
}

static int rbd_obj_setup_discard(struct rbd_obj_request *obj_req)
{
	struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
	struct ceph_osd_request *osd_req;
	u64 off = obj_req->ex.oe_off;
	u64 next_off = obj_req->ex.oe_off + obj_req->ex.oe_len;
	int ret;

	/*
	 * Align the range to alloc_size boundary and punt on discards
	 * that are too small to free up any space.
	 *
	 * alloc_size == object_size && is_tail() is a special case for
	 * filestore with filestore_punch_hole = false, needed to allow
	 * truncate (in addition to delete).
	 */
	if (rbd_dev->opts->alloc_size != rbd_dev->layout.object_size ||
	    !rbd_obj_is_tail(obj_req)) {
		off = round_up(off, rbd_dev->opts->alloc_size);
		next_off = round_down(next_off, rbd_dev->opts->alloc_size);
		if (off >= next_off)
			return 1;
	}
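
	/*
	 * Worked example (sketch): with alloc_size = 64K, a discard of
	 * 60K~72K within an object is clipped to 64K~64K by the
	 * rounding above; a discard smaller than alloc_size rounds to
	 * an empty range and is punted back to the caller (return 1).
	 */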

	/* reverse map the entire object onto the parent */
	ret = rbd_obj_calc_img_extents(obj_req, true);
	if (ret)
		return ret;

	if (rbd_obj_is_entire(obj_req) && !obj_req->num_img_extents)
		obj_req->flags |= RBD_OBJ_FLAG_DELETION;

	osd_req = rbd_obj_add_osd_request(obj_req, 1);
	if (IS_ERR(osd_req))
		return PTR_ERR(osd_req);

	if (rbd_obj_is_entire(obj_req) && !obj_req->num_img_extents) {
		rbd_assert(obj_req->flags & RBD_OBJ_FLAG_DELETION);
		osd_req_op_init(osd_req, 0, CEPH_OSD_OP_DELETE, 0);
	} else {
		dout("%s %p %llu~%llu -> %llu~%llu\n", __func__,
		     obj_req, obj_req->ex.oe_off, obj_req->ex.oe_len,
		     off, next_off - off);
		osd_req_op_extent_init(osd_req, 0,
				       truncate_or_zero_opcode(obj_req),
				       off, next_off - off, 0, 0);
	}

	obj_req->write_state = RBD_OBJ_WRITE_START;
	rbd_osd_format_write(osd_req);
	return 0;
}

static int count_zeroout_ops(struct rbd_obj_request *obj_req)
{
	int num_osd_ops;

	if (rbd_obj_is_entire(obj_req) && obj_req->num_img_extents &&
	    !rbd_obj_copyup_enabled(obj_req))
		num_osd_ops = 2; /* create + truncate */
	else
		num_osd_ops = 1; /* delete/truncate/zero */

	return num_osd_ops;
}

static void __rbd_osd_setup_zeroout_ops(struct ceph_osd_request *osd_req,
					int which)
{
	struct rbd_obj_request *obj_req = osd_req->r_priv;
	u16 opcode;

	if (rbd_obj_is_entire(obj_req)) {
		if (obj_req->num_img_extents) {
			if (!(obj_req->flags & RBD_OBJ_FLAG_COPYUP_ENABLED))
				osd_req_op_init(osd_req, which++,
						CEPH_OSD_OP_CREATE, 0);
			opcode = CEPH_OSD_OP_TRUNCATE;
		} else {
			rbd_assert(obj_req->flags & RBD_OBJ_FLAG_DELETION);
			osd_req_op_init(osd_req, which++,
					CEPH_OSD_OP_DELETE, 0);
			opcode = 0;
		}
	} else {
		opcode = truncate_or_zero_opcode(obj_req);
	}

	if (opcode)
		osd_req_op_extent_init(osd_req, which, opcode,
				       obj_req->ex.oe_off, obj_req->ex.oe_len,
				       0, 0);

	rbd_osd_format_write(osd_req);
}

static int rbd_obj_setup_zeroout(struct rbd_obj_request *obj_req)
{
	struct ceph_osd_request *osd_req;
	unsigned int num_osd_ops, which = 0;
	int ret;

	/* reverse map the entire object onto the parent */
	ret = rbd_obj_calc_img_extents(obj_req, true);
	if (ret)
		return ret;

	if (rbd_obj_copyup_enabled(obj_req))
		obj_req->flags |= RBD_OBJ_FLAG_COPYUP_ENABLED;
	if (!obj_req->num_img_extents) {
		if (rbd_obj_is_entire(obj_req))
			obj_req->flags |= RBD_OBJ_FLAG_DELETION;
	}

	num_osd_ops = count_zeroout_ops(obj_req);
	if (obj_req->flags & RBD_OBJ_FLAG_COPYUP_ENABLED)
		num_osd_ops++; /* stat */

	osd_req = rbd_obj_add_osd_request(obj_req, num_osd_ops);
	if (IS_ERR(osd_req))
		return PTR_ERR(osd_req);

	if (obj_req->flags & RBD_OBJ_FLAG_COPYUP_ENABLED) {
		ret = rbd_osd_setup_stat(osd_req, which++);
		if (ret)
			return ret;
	}

	obj_req->write_state = RBD_OBJ_WRITE_START;
	__rbd_osd_setup_zeroout_ops(osd_req, which);
	return 0;
}

/*
 * For each object request in @img_req, allocate an OSD request, add
 * individual OSD ops and prepare them for submission.  The number of
 * OSD ops depends on op_type and the overlap point (if any).
 */
static int __rbd_img_fill_request(struct rbd_img_request *img_req)
{
	struct rbd_obj_request *obj_req, *next_obj_req;
	struct ceph_osd_request *osd_req;
	int ret;

	for_each_obj_request_safe(img_req, obj_req, next_obj_req) {
		switch (img_req->op_type) {
		case OBJ_OP_READ:
			ret = rbd_obj_setup_read(obj_req);
			break;
		case OBJ_OP_WRITE:
			ret = rbd_obj_setup_write(obj_req);
			break;
		case OBJ_OP_DISCARD:
			ret = rbd_obj_setup_discard(obj_req);
			break;
		case OBJ_OP_ZEROOUT:
			ret = rbd_obj_setup_zeroout(obj_req);
			break;
		default:
			BUG();
		}
		if (ret < 0)
			return ret;
		if (ret > 0) {
			rbd_img_obj_request_del(img_req, obj_req);
			continue;
		}

		osd_req = list_last_entry(&obj_req->osd_reqs,
					  struct ceph_osd_request,
					  r_private_item);
		ret = ceph_osdc_alloc_messages(osd_req, GFP_NOIO);
		if (ret)
			return ret;
	}

	img_req->state = RBD_IMG_START;
	return 0;
}

union rbd_img_fill_iter {
	struct ceph_bio_iter	bio_iter;
	struct ceph_bvec_iter	bvec_iter;
};

struct rbd_img_fill_ctx {
	enum obj_request_type	pos_type;
	union rbd_img_fill_iter	*pos;
	union rbd_img_fill_iter	iter;
	ceph_object_extent_fn_t	set_pos_fn;
	ceph_object_extent_fn_t	count_fn;
	ceph_object_extent_fn_t	copy_fn;
};

static struct ceph_object_extent *alloc_object_extent(void *arg)
{
	struct rbd_img_request *img_req = arg;
	struct rbd_obj_request *obj_req;

	obj_req = rbd_obj_request_create();
	if (!obj_req)
		return NULL;

	rbd_img_obj_request_add(img_req, obj_req);
	return &obj_req->ex;
}

/*
 * While su != os && sc == 1 is technically not fancy (it's the same
 * layout as su == os && sc == 1), we can't use the nocopy path for it
 * because ->set_pos_fn() should be called only once per object.
 * ceph_file_to_extents() invokes action_fn once per stripe unit, so
 * treat su != os && sc == 1 as fancy.
 */
static bool rbd_layout_is_fancy(struct ceph_file_layout *l)
{
	return l->stripe_unit != l->object_size;
}
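
/*
 * Example (sketch): the default layout (su == os == 4M, sc == 1) is
 * not fancy, so data is mapped in place.  A striped image with
 * su = 64K, os = 4M, sc = 8 is fancy: a single bio_vec may straddle
 * a 64K stripe unit boundary, hence the count + copy passes in
 * rbd_img_fill_request() below.
 */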

static int rbd_img_fill_request_nocopy(struct rbd_img_request *img_req,
				       struct ceph_file_extent *img_extents,
				       u32 num_img_extents,
				       struct rbd_img_fill_ctx *fctx)
{
	u32 i;
	int ret;

	img_req->data_type = fctx->pos_type;

	/*
	 * Create object requests and set each object request's starting
	 * position in the provided bio (list) or bio_vec array.
	 */
	fctx->iter = *fctx->pos;
	for (i = 0; i < num_img_extents; i++) {
		ret = ceph_file_to_extents(&img_req->rbd_dev->layout,
					   img_extents[i].fe_off,
					   img_extents[i].fe_len,
					   &img_req->object_extents,
					   alloc_object_extent, img_req,
					   fctx->set_pos_fn, &fctx->iter);
		if (ret)
			return ret;
	}

	return __rbd_img_fill_request(img_req);
}

/*
 * Map a list of image extents to a list of object extents, create the
 * corresponding object requests (normally each to a different object,
 * but not always) and add them to @img_req.  For each object request,
 * set up its data descriptor to point to the corresponding chunk(s) of
 * @fctx->pos data buffer.
 *
 * Because ceph_file_to_extents() will merge adjacent object extents
 * together, each object request's data descriptor may point to multiple
 * different chunks of @fctx->pos data buffer.
 *
 * @fctx->pos data buffer is assumed to be large enough.
 */
static int rbd_img_fill_request(struct rbd_img_request *img_req,
				struct ceph_file_extent *img_extents,
				u32 num_img_extents,
				struct rbd_img_fill_ctx *fctx)
{
	struct rbd_device *rbd_dev = img_req->rbd_dev;
	struct rbd_obj_request *obj_req;
	u32 i;
	int ret;

	if (fctx->pos_type == OBJ_REQUEST_NODATA ||
	    !rbd_layout_is_fancy(&rbd_dev->layout))
		return rbd_img_fill_request_nocopy(img_req, img_extents,
						   num_img_extents, fctx);

	img_req->data_type = OBJ_REQUEST_OWN_BVECS;

	/*
	 * Create object requests and determine ->bvec_count for each object
	 * request.  Note that ->bvec_count sum over all object requests may
	 * be greater than the number of bio_vecs in the provided bio (list)
	 * or bio_vec array because when mapped, those bio_vecs can straddle
	 * stripe unit boundaries.
	 */
	fctx->iter = *fctx->pos;
	for (i = 0; i < num_img_extents; i++) {
		ret = ceph_file_to_extents(&rbd_dev->layout,
					   img_extents[i].fe_off,
					   img_extents[i].fe_len,
					   &img_req->object_extents,
					   alloc_object_extent, img_req,
					   fctx->count_fn, &fctx->iter);
		if (ret)
			return ret;
	}

	for_each_obj_request(img_req, obj_req) {
		obj_req->bvec_pos.bvecs = kmalloc_array(obj_req->bvec_count,
					      sizeof(*obj_req->bvec_pos.bvecs),
					      GFP_NOIO);
		if (!obj_req->bvec_pos.bvecs)
			return -ENOMEM;
	}

	/*
	 * Fill in each object request's private bio_vec array, splitting and
	 * rearranging the provided bio_vecs in stripe unit chunks as needed.
	 */
	fctx->iter = *fctx->pos;
	for (i = 0; i < num_img_extents; i++) {
		ret = ceph_iterate_extents(&rbd_dev->layout,
					   img_extents[i].fe_off,
					   img_extents[i].fe_len,
					   &img_req->object_extents,
					   fctx->copy_fn, &fctx->iter);
		if (ret)
			return ret;
	}

	return __rbd_img_fill_request(img_req);
}

static int rbd_img_fill_nodata(struct rbd_img_request *img_req,
			       u64 off, u64 len)
{
	struct ceph_file_extent ex = { off, len };
	union rbd_img_fill_iter dummy;
	struct rbd_img_fill_ctx fctx = {
		.pos_type = OBJ_REQUEST_NODATA,
		.pos = &dummy,
	};

	return rbd_img_fill_request(img_req, &ex, 1, &fctx);
}

static void set_bio_pos(struct ceph_object_extent *ex, u32 bytes, void *arg)
{
	struct rbd_obj_request *obj_req =
	    container_of(ex, struct rbd_obj_request, ex);
	struct ceph_bio_iter *it = arg;

	dout("%s objno %llu bytes %u\n", __func__, ex->oe_objno, bytes);
	obj_req->bio_pos = *it;
	ceph_bio_iter_advance(it, bytes);
}

static void count_bio_bvecs(struct ceph_object_extent *ex, u32 bytes, void *arg)
{
	struct rbd_obj_request *obj_req =
	    container_of(ex, struct rbd_obj_request, ex);
	struct ceph_bio_iter *it = arg;

	dout("%s objno %llu bytes %u\n", __func__, ex->oe_objno, bytes);
	ceph_bio_iter_advance_step(it, bytes, ({
		obj_req->bvec_count++;
	}));
}

static void copy_bio_bvecs(struct ceph_object_extent *ex, u32 bytes, void *arg)
{
	struct rbd_obj_request *obj_req =
	    container_of(ex, struct rbd_obj_request, ex);
	struct ceph_bio_iter *it = arg;

	dout("%s objno %llu bytes %u\n", __func__, ex->oe_objno, bytes);
	ceph_bio_iter_advance_step(it, bytes, ({
		obj_req->bvec_pos.bvecs[obj_req->bvec_idx++] = bv;
		obj_req->bvec_pos.iter.bi_size += bv.bv_len;
	}));
}

static int __rbd_img_fill_from_bio(struct rbd_img_request *img_req,
				   struct ceph_file_extent *img_extents,
				   u32 num_img_extents,
				   struct ceph_bio_iter *bio_pos)
{
	struct rbd_img_fill_ctx fctx = {
		.pos_type = OBJ_REQUEST_BIO,
		.pos = (union rbd_img_fill_iter *)bio_pos,
		.set_pos_fn = set_bio_pos,
		.count_fn = count_bio_bvecs,
		.copy_fn = copy_bio_bvecs,
	};

	return rbd_img_fill_request(img_req, img_extents, num_img_extents,
				    &fctx);
}

static int rbd_img_fill_from_bio(struct rbd_img_request *img_req,
				 u64 off, u64 len, struct bio *bio)
{
	struct ceph_file_extent ex = { off, len };
	struct ceph_bio_iter it = { .bio = bio, .iter = bio->bi_iter };

	return __rbd_img_fill_from_bio(img_req, &ex, 1, &it);
}
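
/*
 * Sketch of the typical caller (assumed shape, not a quote of the
 * actual queue function):
 *
 *	u64 off = (u64)blk_rq_pos(rq) << SECTOR_SHIFT;
 *	u64 len = blk_rq_bytes(rq);
 *
 *	ret = rbd_img_fill_from_bio(img_req, off, len, rq->bio);
 */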
2344
2345static void set_bvec_pos(struct ceph_object_extent *ex, u32 bytes, void *arg)
2346{
2347 struct rbd_obj_request *obj_req =
2348 container_of(ex, struct rbd_obj_request, ex);
2349 struct ceph_bvec_iter *it = arg;
2350
2351 obj_req->bvec_pos = *it;
2352 ceph_bvec_iter_shorten(&obj_req->bvec_pos, bytes);
2353 ceph_bvec_iter_advance(it, bytes);
2354}
2355
Ilya Dryomovafb97882018-02-06 19:26:35 +01002356static void count_bvecs(struct ceph_object_extent *ex, u32 bytes, void *arg)
2357{
2358 struct rbd_obj_request *obj_req =
2359 container_of(ex, struct rbd_obj_request, ex);
2360 struct ceph_bvec_iter *it = arg;
2361
2362 ceph_bvec_iter_advance_step(it, bytes, ({
2363 obj_req->bvec_count++;
2364 }));
2365}
2366
2367static void copy_bvecs(struct ceph_object_extent *ex, u32 bytes, void *arg)
2368{
2369 struct rbd_obj_request *obj_req =
2370 container_of(ex, struct rbd_obj_request, ex);
2371 struct ceph_bvec_iter *it = arg;
2372
2373 ceph_bvec_iter_advance_step(it, bytes, ({
2374 obj_req->bvec_pos.bvecs[obj_req->bvec_idx++] = bv;
2375 obj_req->bvec_pos.iter.bi_size += bv.bv_len;
2376 }));
2377}
2378
Ilya Dryomov5a237812018-02-06 19:26:34 +01002379static int __rbd_img_fill_from_bvecs(struct rbd_img_request *img_req,
2380 struct ceph_file_extent *img_extents,
2381 u32 num_img_extents,
2382 struct ceph_bvec_iter *bvec_pos)
2383{
2384 struct rbd_img_fill_ctx fctx = {
2385 .pos_type = OBJ_REQUEST_BVECS,
2386 .pos = (union rbd_img_fill_iter *)bvec_pos,
2387 .set_pos_fn = set_bvec_pos,
Ilya Dryomovafb97882018-02-06 19:26:35 +01002388 .count_fn = count_bvecs,
2389 .copy_fn = copy_bvecs,
Ilya Dryomov5a237812018-02-06 19:26:34 +01002390 };
2391
2392 return rbd_img_fill_request(img_req, img_extents, num_img_extents,
2393 &fctx);
2394}
2395
2396static int rbd_img_fill_from_bvecs(struct rbd_img_request *img_req,
2397 struct ceph_file_extent *img_extents,
2398 u32 num_img_extents,
2399 struct bio_vec *bvecs)
2400{
2401 struct ceph_bvec_iter it = {
2402 .bvecs = bvecs,
2403 .iter = { .bi_size = ceph_file_extents_bytes(img_extents,
2404 num_img_extents) },
2405 };
2406
2407 return __rbd_img_fill_from_bvecs(img_req, img_extents, num_img_extents,
2408 &it);
2409}
2410
Ilya Dryomov0192ce22019-05-16 15:06:56 +02002411static void rbd_img_handle_request_work(struct work_struct *work)
Alex Elderbf0d5f502012-11-22 00:00:08 -06002412{
Ilya Dryomov0192ce22019-05-16 15:06:56 +02002413 struct rbd_img_request *img_req =
2414 container_of(work, struct rbd_img_request, work);
Alex Elderbf0d5f502012-11-22 00:00:08 -06002415
Ilya Dryomov0192ce22019-05-16 15:06:56 +02002416 rbd_img_handle_request(img_req, img_req->work_result);
2417}
Alex Elderbf0d5f502012-11-22 00:00:08 -06002418
Ilya Dryomov0192ce22019-05-16 15:06:56 +02002419static void rbd_img_schedule(struct rbd_img_request *img_req, int result)
2420{
2421 INIT_WORK(&img_req->work, rbd_img_handle_request_work);
2422 img_req->work_result = result;
2423 queue_work(rbd_wq, &img_req->work);
Alex Elderbf0d5f502012-11-22 00:00:08 -06002424}
2425
static int rbd_obj_read_object(struct rbd_obj_request *obj_req)
{
	rbd_obj_request_submit(obj_req);
	return 0;
}

static int rbd_obj_read_from_parent(struct rbd_obj_request *obj_req)
{
	struct rbd_img_request *img_req = obj_req->img_request;
	struct rbd_img_request *child_img_req;
	int ret;

	child_img_req = rbd_img_request_create(img_req->rbd_dev->parent,
					       OBJ_OP_READ, NULL);
	if (!child_img_req)
		return -ENOMEM;

	__set_bit(IMG_REQ_CHILD, &child_img_req->flags);
	child_img_req->obj_request = obj_req;

	if (!rbd_img_is_write(img_req)) {
		switch (img_req->data_type) {
		case OBJ_REQUEST_BIO:
			ret = __rbd_img_fill_from_bio(child_img_req,
						      obj_req->img_extents,
						      obj_req->num_img_extents,
						      &obj_req->bio_pos);
			break;
		case OBJ_REQUEST_BVECS:
		case OBJ_REQUEST_OWN_BVECS:
			ret = __rbd_img_fill_from_bvecs(child_img_req,
						      obj_req->img_extents,
						      obj_req->num_img_extents,
						      &obj_req->bvec_pos);
			break;
		default:
			BUG();
		}
	} else {
		ret = rbd_img_fill_from_bvecs(child_img_req,
					      obj_req->img_extents,
					      obj_req->num_img_extents,
					      obj_req->copyup_bvecs);
	}
	if (ret) {
		rbd_img_request_put(child_img_req);
		return ret;
	}

	/* avoid parent chain recursion */
	rbd_img_schedule(child_img_req, 0);
	return 0;
}

static bool rbd_obj_advance_read(struct rbd_obj_request *obj_req, int *result)
{
	struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
	int ret;

	switch (obj_req->read_state) {
	case RBD_OBJ_READ_START:
		rbd_assert(!*result);

		ret = rbd_obj_read_object(obj_req);
		if (ret) {
			*result = ret;
			return true;
		}
		obj_req->read_state = RBD_OBJ_READ_OBJECT;
		return false;
	case RBD_OBJ_READ_OBJECT:
		if (*result == -ENOENT && rbd_dev->parent_overlap) {
			/* reverse map this object extent onto the parent */
			ret = rbd_obj_calc_img_extents(obj_req, false);
			if (ret) {
				*result = ret;
				return true;
			}
			if (obj_req->num_img_extents) {
				ret = rbd_obj_read_from_parent(obj_req);
				if (ret) {
					*result = ret;
					return true;
				}
				obj_req->read_state = RBD_OBJ_READ_PARENT;
				return false;
			}
		}

		/*
		 * -ENOENT means a hole in the image -- zero-fill the entire
		 * length of the request.  A short read also implies zero-fill
		 * to the end of the request.
		 */
		if (*result == -ENOENT) {
			rbd_obj_zero_range(obj_req, 0, obj_req->ex.oe_len);
			*result = 0;
		} else if (*result >= 0) {
			if (*result < obj_req->ex.oe_len)
				rbd_obj_zero_range(obj_req, *result,
						   obj_req->ex.oe_len - *result);
			else
				rbd_assert(*result == obj_req->ex.oe_len);
			*result = 0;
		}
		return true;
	case RBD_OBJ_READ_PARENT:
		return true;
	default:
		BUG();
	}
}

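/*
 * Read state machine for a single object request, as driven above:
 *
 *	RBD_OBJ_READ_START -> RBD_OBJ_READ_OBJECT -> done
 *
 * with a detour through RBD_OBJ_READ_PARENT if the object doesn't
 * exist and the extent reverse-maps onto the parent.  A hole (-ENOENT)
 * or a short read is zero-filled to the end of the request.
 */
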
static int rbd_obj_write_object(struct rbd_obj_request *obj_req)
{
	rbd_obj_request_submit(obj_req);
	return 0;
}

/*
 * copyup_bvecs pages are never highmem pages
 */
static bool is_zero_bvecs(struct bio_vec *bvecs, u32 bytes)
{
	struct ceph_bvec_iter it = {
		.bvecs = bvecs,
		.iter = { .bi_size = bytes },
	};

	ceph_bvec_iter_advance_step(&it, bytes, ({
		if (memchr_inv(page_address(bv.bv_page) + bv.bv_offset, 0,
			       bv.bv_len))
			return false;
	}));
	return true;
}

#define MODS_ONLY	U32_MAX

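/*
 * MODS_ONLY is passed for @bytes when only the modification ops should
 * be issued -- no copyup data accompanies the request, either because
 * the parent overlap is gone or because the deep-copyup has already
 * been sent.
 */
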
static int rbd_obj_issue_copyup_empty_snapc(struct rbd_obj_request *obj_req,
					    u32 bytes)
{
	struct ceph_osd_request *osd_req;
	int ret;

	dout("%s obj_req %p bytes %u\n", __func__, obj_req, bytes);
	rbd_assert(bytes > 0 && bytes != MODS_ONLY);

	osd_req = __rbd_obj_add_osd_request(obj_req, &rbd_empty_snapc, 1);
	if (IS_ERR(osd_req))
		return PTR_ERR(osd_req);

	ret = rbd_osd_setup_copyup(osd_req, 0, bytes);
	if (ret)
		return ret;

	rbd_osd_format_write(osd_req);

	ret = ceph_osdc_alloc_messages(osd_req, GFP_NOIO);
	if (ret)
		return ret;

	rbd_obj_request_submit(obj_req);
	return 0;
}

static int rbd_obj_issue_copyup_ops(struct rbd_obj_request *obj_req, u32 bytes)
{
	struct rbd_img_request *img_req = obj_req->img_request;
	struct ceph_osd_request *osd_req;
	unsigned int num_osd_ops = (bytes != MODS_ONLY);
	unsigned int which = 0;
	int ret;

	dout("%s obj_req %p bytes %u\n", __func__, obj_req, bytes);

	switch (img_req->op_type) {
	case OBJ_OP_WRITE:
		num_osd_ops += count_write_ops(obj_req);
		break;
	case OBJ_OP_ZEROOUT:
		num_osd_ops += count_zeroout_ops(obj_req);
		break;
	default:
		BUG();
	}

	osd_req = rbd_obj_add_osd_request(obj_req, num_osd_ops);
	if (IS_ERR(osd_req))
		return PTR_ERR(osd_req);

	if (bytes != MODS_ONLY) {
		ret = rbd_osd_setup_copyup(osd_req, which++, bytes);
		if (ret)
			return ret;
	}

	switch (img_req->op_type) {
	case OBJ_OP_WRITE:
		__rbd_osd_setup_write_ops(osd_req, which);
		break;
	case OBJ_OP_ZEROOUT:
		__rbd_osd_setup_zeroout_ops(osd_req, which);
		break;
	default:
		BUG();
	}

	ret = ceph_osdc_alloc_messages(osd_req, GFP_NOIO);
	if (ret)
		return ret;

	rbd_obj_request_submit(obj_req);
	return 0;
}

static int rbd_obj_issue_copyup(struct rbd_obj_request *obj_req, u32 bytes)
{
	/*
	 * Only send non-zero copyup data to save some I/O and network
	 * bandwidth -- zero copyup data is equivalent to the object not
	 * existing.
	 */
	if (is_zero_bvecs(obj_req->copyup_bvecs, bytes)) {
		dout("%s obj_req %p detected zeroes\n", __func__, obj_req);
		bytes = 0;
	}

	if (obj_req->img_request->snapc->num_snaps && bytes > 0) {
		/*
		 * Send a copyup request with an empty snapshot context to
		 * deep-copyup the object through all existing snapshots.
		 * A second request with the current snapshot context will be
		 * sent for the actual modification.
		 */
		obj_req->write_state = RBD_OBJ_WRITE_COPYUP_EMPTY_SNAPC;
		return rbd_obj_issue_copyup_empty_snapc(obj_req, bytes);
	}

	obj_req->write_state = RBD_OBJ_WRITE_COPYUP_OPS;
	return rbd_obj_issue_copyup_ops(obj_req, bytes);
}

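/*
 * Copyup sequence selected above, assuming a layered image:
 *
 *	1. if snapshots exist and the data is not all zeroes, a
 *	   deep-copyup with an empty snapshot context, which clones the
 *	   parent data through all existing snapshots;
 *	2. the actual modification (write/zeroout ops, plus a copyup op
 *	   -- zero-length for all-zero data -- when step 1 was skipped),
 *	   issued with the current snapshot context.
 */
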
static int setup_copyup_bvecs(struct rbd_obj_request *obj_req, u64 obj_overlap)
{
	u32 i;

	rbd_assert(!obj_req->copyup_bvecs);
	obj_req->copyup_bvec_count = calc_pages_for(0, obj_overlap);
	obj_req->copyup_bvecs = kcalloc(obj_req->copyup_bvec_count,
					sizeof(*obj_req->copyup_bvecs),
					GFP_NOIO);
	if (!obj_req->copyup_bvecs)
		return -ENOMEM;

	for (i = 0; i < obj_req->copyup_bvec_count; i++) {
		unsigned int len = min(obj_overlap, (u64)PAGE_SIZE);

		obj_req->copyup_bvecs[i].bv_page = alloc_page(GFP_NOIO);
		if (!obj_req->copyup_bvecs[i].bv_page)
			return -ENOMEM;

		obj_req->copyup_bvecs[i].bv_offset = 0;
		obj_req->copyup_bvecs[i].bv_len = len;
		obj_overlap -= len;
	}

	rbd_assert(!obj_overlap);
	return 0;
}

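/*
 * calc_pages_for(0, obj_overlap) rounds the overlap up to whole pages:
 * e.g. with 4 KiB pages, a 5-byte overlap needs one page and a 4 MiB
 * overlap needs 1024, with only the final bvec shortened.
 */
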
/*
 * The target object doesn't exist.  Read the data for the entire
 * target object up to the overlap point (if any) from the parent,
 * so we can use it for a copyup.
 */
static int rbd_obj_handle_write_guard(struct rbd_obj_request *obj_req)
{
	struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
	int ret;

	rbd_assert(obj_req->num_img_extents);
	prune_extents(obj_req->img_extents, &obj_req->num_img_extents,
		      rbd_dev->parent_overlap);
	if (!obj_req->num_img_extents) {
		/*
		 * The overlap has become 0 (most likely because the
		 * image has been flattened).  Re-submit the original write
		 * request -- pass MODS_ONLY since the copyup isn't needed
		 * anymore.
		 */
		obj_req->write_state = RBD_OBJ_WRITE_COPYUP_OPS;
		return rbd_obj_issue_copyup_ops(obj_req, MODS_ONLY);
	}

	ret = setup_copyup_bvecs(obj_req, rbd_obj_img_extents_bytes(obj_req));
	if (ret)
		return ret;

	obj_req->write_state = RBD_OBJ_WRITE_READ_FROM_PARENT;
	return rbd_obj_read_from_parent(obj_req);
}

static bool rbd_obj_advance_write(struct rbd_obj_request *obj_req, int *result)
{
	int ret;

	switch (obj_req->write_state) {
	case RBD_OBJ_WRITE_START:
		rbd_assert(!*result);

		ret = rbd_obj_write_object(obj_req);
		if (ret) {
			*result = ret;
			return true;
		}
		obj_req->write_state = RBD_OBJ_WRITE_OBJECT;
		return false;
	case RBD_OBJ_WRITE_OBJECT:
		if (*result == -ENOENT) {
			if (obj_req->flags & RBD_OBJ_FLAG_COPYUP_ENABLED) {
				ret = rbd_obj_handle_write_guard(obj_req);
				if (ret) {
					*result = ret;
					return true;
				}
				return false;
			}
			/*
			 * On a non-existent object:
			 *   delete - -ENOENT, truncate/zero - 0
			 */
			if (obj_req->flags & RBD_OBJ_FLAG_DELETION)
				*result = 0;
		}
		/* fall through */
	case RBD_OBJ_WRITE_COPYUP_OPS:
		return true;
	case RBD_OBJ_WRITE_READ_FROM_PARENT:
		if (*result)
			return true;

		ret = rbd_obj_issue_copyup(obj_req,
					   rbd_obj_img_extents_bytes(obj_req));
		if (ret) {
			*result = ret;
			return true;
		}
		return false;
	case RBD_OBJ_WRITE_COPYUP_EMPTY_SNAPC:
		if (*result)
			return true;

		obj_req->write_state = RBD_OBJ_WRITE_COPYUP_OPS;
		ret = rbd_obj_issue_copyup_ops(obj_req, MODS_ONLY);
		if (ret) {
			*result = ret;
			return true;
		}
		return false;
	default:
		BUG();
	}
}

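/*
 * Write state machine for a single object request, as driven above:
 *
 *	RBD_OBJ_WRITE_START -> RBD_OBJ_WRITE_OBJECT -> done
 *
 * unless the object doesn't exist and copyup is enabled, in which case
 *
 *	RBD_OBJ_WRITE_OBJECT -> RBD_OBJ_WRITE_READ_FROM_PARENT ->
 *	    [RBD_OBJ_WRITE_COPYUP_EMPTY_SNAPC ->]
 *	    RBD_OBJ_WRITE_COPYUP_OPS -> done
 *
 * with the deep-copyup step taken only if snapshots exist and the
 * copyup data is not all zeroes.  If the parent overlap has shrunk to
 * 0 in the meantime, the original ops are simply re-issued.
 */
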
/*
 * Return true if @obj_req is completed.
 */
static bool __rbd_obj_handle_request(struct rbd_obj_request *obj_req,
				     int *result)
{
	struct rbd_img_request *img_req = obj_req->img_request;
	struct rbd_device *rbd_dev = img_req->rbd_dev;
	bool done;

	mutex_lock(&obj_req->state_mutex);
	if (!rbd_img_is_write(img_req))
		done = rbd_obj_advance_read(obj_req, result);
	else
		done = rbd_obj_advance_write(obj_req, result);
	mutex_unlock(&obj_req->state_mutex);

	if (done && *result) {
		rbd_assert(*result < 0);
		rbd_warn(rbd_dev, "%s at objno %llu %llu~%llu result %d",
			 obj_op_name(img_req->op_type), obj_req->ex.oe_objno,
			 obj_req->ex.oe_off, obj_req->ex.oe_len, *result);
	}
	return done;
}

/*
 * This is open-coded in rbd_img_handle_request() to avoid parent chain
 * recursion.
 */
static void rbd_obj_handle_request(struct rbd_obj_request *obj_req, int result)
{
	if (__rbd_obj_handle_request(obj_req, &result))
		rbd_img_handle_request(obj_req->img_request, result);
}

static void rbd_img_object_requests(struct rbd_img_request *img_req)
{
	struct rbd_obj_request *obj_req;

	rbd_assert(!img_req->pending.result && !img_req->pending.num_pending);

	for_each_obj_request(img_req, obj_req) {
		int result = 0;

		if (__rbd_obj_handle_request(obj_req, &result)) {
			if (result) {
				img_req->pending.result = result;
				return;
			}
		} else {
			img_req->pending.num_pending++;
		}
	}
}

static bool rbd_img_advance(struct rbd_img_request *img_req, int *result)
{
again:
	switch (img_req->state) {
	case RBD_IMG_START:
		rbd_assert(!*result);

		rbd_img_object_requests(img_req);
		if (!img_req->pending.num_pending) {
			*result = img_req->pending.result;
			img_req->state = RBD_IMG_OBJECT_REQUESTS;
			goto again;
		}
		img_req->state = __RBD_IMG_OBJECT_REQUESTS;
		return false;
	case __RBD_IMG_OBJECT_REQUESTS:
		if (!pending_result_dec(&img_req->pending, result))
			return false;
		/* fall through */
	case RBD_IMG_OBJECT_REQUESTS:
		return true;
	default:
		BUG();
	}
}

/*
 * Return true if @img_req is completed.
 */
static bool __rbd_img_handle_request(struct rbd_img_request *img_req,
				     int *result)
{
	struct rbd_device *rbd_dev = img_req->rbd_dev;
	bool done;

	mutex_lock(&img_req->state_mutex);
	done = rbd_img_advance(img_req, result);
	mutex_unlock(&img_req->state_mutex);

	if (done && *result) {
		rbd_assert(*result < 0);
		rbd_warn(rbd_dev, "%s%s result %d",
			 test_bit(IMG_REQ_CHILD, &img_req->flags) ? "child " : "",
			 obj_op_name(img_req->op_type), *result);
	}
	return done;
}

static void rbd_img_handle_request(struct rbd_img_request *img_req, int result)
{
again:
	if (!__rbd_img_handle_request(img_req, &result))
		return;

	if (test_bit(IMG_REQ_CHILD, &img_req->flags)) {
		struct rbd_obj_request *obj_req = img_req->obj_request;

		rbd_img_request_put(img_req);
		if (__rbd_obj_handle_request(obj_req, &result)) {
			img_req = obj_req->img_request;
			goto again;
		}
	} else {
		struct request *rq = img_req->rq;

		rbd_img_request_put(img_req);
		blk_mq_end_request(rq, errno_to_blk_status(result));
	}
}

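/*
 * Image request state machine, as driven above:
 *
 *	RBD_IMG_START -> [__RBD_IMG_OBJECT_REQUESTS ->]
 *	    RBD_IMG_OBJECT_REQUESTS -> done
 *
 * where the bracketed state is entered only if some object requests
 * are still pending.  Child (parent chain) completions are propagated
 * back to the originating object request iteratively via the again:
 * loop rather than by recursion.
 */
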
static const struct rbd_client_id rbd_empty_cid;

static bool rbd_cid_equal(const struct rbd_client_id *lhs,
			  const struct rbd_client_id *rhs)
{
	return lhs->gid == rhs->gid && lhs->handle == rhs->handle;
}

static struct rbd_client_id rbd_get_cid(struct rbd_device *rbd_dev)
{
	struct rbd_client_id cid;

	mutex_lock(&rbd_dev->watch_mutex);
	cid.gid = ceph_client_gid(rbd_dev->rbd_client->client);
	cid.handle = rbd_dev->watch_cookie;
	mutex_unlock(&rbd_dev->watch_mutex);
	return cid;
}

/*
 * lock_rwsem must be held for write
 */
static void rbd_set_owner_cid(struct rbd_device *rbd_dev,
			      const struct rbd_client_id *cid)
{
	dout("%s rbd_dev %p %llu-%llu -> %llu-%llu\n", __func__, rbd_dev,
	     rbd_dev->owner_cid.gid, rbd_dev->owner_cid.handle,
	     cid->gid, cid->handle);
	rbd_dev->owner_cid = *cid; /* struct */
}

static void format_lock_cookie(struct rbd_device *rbd_dev, char *buf)
{
	mutex_lock(&rbd_dev->watch_mutex);
	sprintf(buf, "%s %llu", RBD_LOCK_COOKIE_PREFIX, rbd_dev->watch_cookie);
	mutex_unlock(&rbd_dev->watch_mutex);
}

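/*
 * The lock cookie ties the exclusive lock to this client's watch on
 * the header object: "<RBD_LOCK_COOKIE_PREFIX> <watch_cookie>", e.g.
 * "auto 123456789".  find_watcher() below depends on being able to
 * sscanf() the watch cookie back out of it.
 */
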
static void __rbd_lock(struct rbd_device *rbd_dev, const char *cookie)
{
	struct rbd_client_id cid = rbd_get_cid(rbd_dev);

	strcpy(rbd_dev->lock_cookie, cookie);
	rbd_set_owner_cid(rbd_dev, &cid);
	queue_work(rbd_dev->task_wq, &rbd_dev->acquired_lock_work);
}

/*
 * lock_rwsem must be held for write
 */
static int rbd_lock(struct rbd_device *rbd_dev)
{
	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
	char cookie[32];
	int ret;

	WARN_ON(__rbd_is_lock_owner(rbd_dev) ||
		rbd_dev->lock_cookie[0] != '\0');

	format_lock_cookie(rbd_dev, cookie);
	ret = ceph_cls_lock(osdc, &rbd_dev->header_oid, &rbd_dev->header_oloc,
			    RBD_LOCK_NAME, CEPH_CLS_LOCK_EXCLUSIVE, cookie,
			    RBD_LOCK_TAG, "", 0);
	if (ret)
		return ret;

	rbd_dev->lock_state = RBD_LOCK_STATE_LOCKED;
	__rbd_lock(rbd_dev, cookie);
	return 0;
}

/*
 * lock_rwsem must be held for write
 */
static void rbd_unlock(struct rbd_device *rbd_dev)
{
	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
	int ret;

	WARN_ON(!__rbd_is_lock_owner(rbd_dev) ||
		rbd_dev->lock_cookie[0] == '\0');

	ret = ceph_cls_unlock(osdc, &rbd_dev->header_oid, &rbd_dev->header_oloc,
			      RBD_LOCK_NAME, rbd_dev->lock_cookie);
	if (ret && ret != -ENOENT)
		rbd_warn(rbd_dev, "failed to unlock: %d", ret);

	/* treat errors as the image is unlocked */
	rbd_dev->lock_state = RBD_LOCK_STATE_UNLOCKED;
	rbd_dev->lock_cookie[0] = '\0';
	rbd_set_owner_cid(rbd_dev, &rbd_empty_cid);
	queue_work(rbd_dev->task_wq, &rbd_dev->released_lock_work);
}

static int __rbd_notify_op_lock(struct rbd_device *rbd_dev,
				enum rbd_notify_op notify_op,
				struct page ***preply_pages,
				size_t *preply_len)
{
	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
	struct rbd_client_id cid = rbd_get_cid(rbd_dev);
	char buf[4 + 8 + 8 + CEPH_ENCODING_START_BLK_LEN];
	int buf_size = sizeof(buf);
	void *p = buf;

	dout("%s rbd_dev %p notify_op %d\n", __func__, rbd_dev, notify_op);

	/* encode *LockPayload NotifyMessage (op + ClientId) */
	ceph_start_encoding(&p, 2, 1, buf_size - CEPH_ENCODING_START_BLK_LEN);
	ceph_encode_32(&p, notify_op);
	ceph_encode_64(&p, cid.gid);
	ceph_encode_64(&p, cid.handle);

	return ceph_osdc_notify(osdc, &rbd_dev->header_oid,
				&rbd_dev->header_oloc, buf, buf_size,
				RBD_NOTIFY_TIMEOUT, preply_pages, preply_len);
}

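/*
 * Wire format of the NotifyMessage payload built above, which accounts
 * for the 4 + 8 + 8 + CEPH_ENCODING_START_BLK_LEN buffer size:
 *
 *	encoding start block (struct_v 2, compat 1, payload length)
 *	__le32 notify_op
 *	__le64 gid    \ ClientId
 *	__le64 handle /
 */
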
static void rbd_notify_op_lock(struct rbd_device *rbd_dev,
			       enum rbd_notify_op notify_op)
{
	struct page **reply_pages;
	size_t reply_len;

	__rbd_notify_op_lock(rbd_dev, notify_op, &reply_pages, &reply_len);
	ceph_release_page_vector(reply_pages, calc_pages_for(0, reply_len));
}

static void rbd_notify_acquired_lock(struct work_struct *work)
{
	struct rbd_device *rbd_dev = container_of(work, struct rbd_device,
						  acquired_lock_work);

	rbd_notify_op_lock(rbd_dev, RBD_NOTIFY_OP_ACQUIRED_LOCK);
}

static void rbd_notify_released_lock(struct work_struct *work)
{
	struct rbd_device *rbd_dev = container_of(work, struct rbd_device,
						  released_lock_work);

	rbd_notify_op_lock(rbd_dev, RBD_NOTIFY_OP_RELEASED_LOCK);
}

static int rbd_request_lock(struct rbd_device *rbd_dev)
{
	struct page **reply_pages;
	size_t reply_len;
	bool lock_owner_responded = false;
	int ret;

	dout("%s rbd_dev %p\n", __func__, rbd_dev);

	ret = __rbd_notify_op_lock(rbd_dev, RBD_NOTIFY_OP_REQUEST_LOCK,
				   &reply_pages, &reply_len);
	if (ret && ret != -ETIMEDOUT) {
		rbd_warn(rbd_dev, "failed to request lock: %d", ret);
		goto out;
	}

	if (reply_len > 0 && reply_len <= PAGE_SIZE) {
		void *p = page_address(reply_pages[0]);
		void *const end = p + reply_len;
		u32 n;

		ceph_decode_32_safe(&p, end, n, e_inval); /* num_acks */
		while (n--) {
			u8 struct_v;
			u32 len;

			ceph_decode_need(&p, end, 8 + 8, e_inval);
			p += 8 + 8; /* skip gid and cookie */

			ceph_decode_32_safe(&p, end, len, e_inval);
			if (!len)
				continue;

			if (lock_owner_responded) {
				rbd_warn(rbd_dev,
					 "duplicate lock owners detected");
				ret = -EIO;
				goto out;
			}

			lock_owner_responded = true;
			ret = ceph_start_decoding(&p, end, 1, "ResponseMessage",
						  &struct_v, &len);
			if (ret) {
				rbd_warn(rbd_dev,
					 "failed to decode ResponseMessage: %d",
					 ret);
				goto e_inval;
			}

			ret = ceph_decode_32(&p);
		}
	}

	if (!lock_owner_responded) {
		rbd_warn(rbd_dev, "no lock owners detected");
		ret = -ETIMEDOUT;
	}

out:
	ceph_release_page_vector(reply_pages, calc_pages_for(0, reply_len));
	return ret;

e_inval:
	ret = -EINVAL;
	goto out;
}

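/*
 * The notify reply decoded above carries one ack per watcher: gid,
 * cookie and an optional payload.  Only the lock owner replies with a
 * payload (an encoded ResponseMessage): 0 means the request was
 * acknowledged, -EROFS that the owner refuses to release the lock.
 */
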
static void wake_requests(struct rbd_device *rbd_dev, bool wake_all)
{
	dout("%s rbd_dev %p wake_all %d\n", __func__, rbd_dev, wake_all);

	cancel_delayed_work(&rbd_dev->lock_dwork);
	if (wake_all)
		wake_up_all(&rbd_dev->lock_waitq);
	else
		wake_up(&rbd_dev->lock_waitq);
}

static int get_lock_owner_info(struct rbd_device *rbd_dev,
			       struct ceph_locker **lockers, u32 *num_lockers)
{
	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
	u8 lock_type;
	char *lock_tag;
	int ret;

	dout("%s rbd_dev %p\n", __func__, rbd_dev);

	ret = ceph_cls_lock_info(osdc, &rbd_dev->header_oid,
				 &rbd_dev->header_oloc, RBD_LOCK_NAME,
				 &lock_type, &lock_tag, lockers, num_lockers);
	if (ret)
		return ret;

	if (*num_lockers == 0) {
		dout("%s rbd_dev %p no lockers detected\n", __func__, rbd_dev);
		goto out;
	}

	if (strcmp(lock_tag, RBD_LOCK_TAG)) {
		rbd_warn(rbd_dev, "locked by external mechanism, tag %s",
			 lock_tag);
		ret = -EBUSY;
		goto out;
	}

	if (lock_type == CEPH_CLS_LOCK_SHARED) {
		rbd_warn(rbd_dev, "shared lock type detected");
		ret = -EBUSY;
		goto out;
	}

	if (strncmp((*lockers)[0].id.cookie, RBD_LOCK_COOKIE_PREFIX,
		    strlen(RBD_LOCK_COOKIE_PREFIX))) {
		rbd_warn(rbd_dev, "locked by external mechanism, cookie %s",
			 (*lockers)[0].id.cookie);
		ret = -EBUSY;
		goto out;
	}

out:
	kfree(lock_tag);
	return ret;
}

static int find_watcher(struct rbd_device *rbd_dev,
			const struct ceph_locker *locker)
{
	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
	struct ceph_watch_item *watchers;
	u32 num_watchers;
	u64 cookie;
	int i;
	int ret;

	ret = ceph_osdc_list_watchers(osdc, &rbd_dev->header_oid,
				      &rbd_dev->header_oloc, &watchers,
				      &num_watchers);
	if (ret)
		return ret;

	sscanf(locker->id.cookie, RBD_LOCK_COOKIE_PREFIX " %llu", &cookie);
	for (i = 0; i < num_watchers; i++) {
		if (!memcmp(&watchers[i].addr, &locker->info.addr,
			    sizeof(locker->info.addr)) &&
		    watchers[i].cookie == cookie) {
			struct rbd_client_id cid = {
				.gid = le64_to_cpu(watchers[i].name.num),
				.handle = cookie,
			};

			dout("%s rbd_dev %p found cid %llu-%llu\n", __func__,
			     rbd_dev, cid.gid, cid.handle);
			rbd_set_owner_cid(rbd_dev, &cid);
			ret = 1;
			goto out;
		}
	}

	dout("%s rbd_dev %p no watchers\n", __func__, rbd_dev);
	ret = 0;
out:
	kfree(watchers);
	return ret;
}

/*
 * lock_rwsem must be held for write
 */
static int rbd_try_lock(struct rbd_device *rbd_dev)
{
	struct ceph_client *client = rbd_dev->rbd_client->client;
	struct ceph_locker *lockers;
	u32 num_lockers;
	int ret;

	for (;;) {
		ret = rbd_lock(rbd_dev);
		if (ret != -EBUSY)
			return ret;

		/* determine if the current lock holder is still alive */
		ret = get_lock_owner_info(rbd_dev, &lockers, &num_lockers);
		if (ret)
			return ret;

		if (num_lockers == 0)
			goto again;

		ret = find_watcher(rbd_dev, lockers);
		if (ret) {
			if (ret > 0)
				ret = 0; /* have to request lock */
			goto out;
		}

		rbd_warn(rbd_dev, "%s%llu seems dead, breaking lock",
			 ENTITY_NAME(lockers[0].id.name));

		ret = ceph_monc_blacklist_add(&client->monc,
					      &lockers[0].info.addr);
		if (ret) {
			rbd_warn(rbd_dev, "blacklist of %s%llu failed: %d",
				 ENTITY_NAME(lockers[0].id.name), ret);
			goto out;
		}

		ret = ceph_cls_break_lock(&client->osdc, &rbd_dev->header_oid,
					  &rbd_dev->header_oloc, RBD_LOCK_NAME,
					  lockers[0].id.cookie,
					  &lockers[0].id.name);
		if (ret && ret != -ENOENT)
			goto out;

again:
		ceph_free_lockers(lockers, num_lockers);
	}

out:
	ceph_free_lockers(lockers, num_lockers);
	return ret;
}

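/*
 * Acquisition loop above, in short: try to lock; on -EBUSY, check
 * whether the holder still has a watch on the header object.  If it
 * does, fall back to requesting the lock; if it doesn't, the holder is
 * presumed dead -- blacklist it, break its lock and retry.
 */
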
/*
 * ret is set only if lock_state is RBD_LOCK_STATE_UNLOCKED
 */
static enum rbd_lock_state rbd_try_acquire_lock(struct rbd_device *rbd_dev,
						int *pret)
{
	enum rbd_lock_state lock_state;

	down_read(&rbd_dev->lock_rwsem);
	dout("%s rbd_dev %p read lock_state %d\n", __func__, rbd_dev,
	     rbd_dev->lock_state);
	if (__rbd_is_lock_owner(rbd_dev)) {
		lock_state = rbd_dev->lock_state;
		up_read(&rbd_dev->lock_rwsem);
		return lock_state;
	}

	up_read(&rbd_dev->lock_rwsem);
	down_write(&rbd_dev->lock_rwsem);
	dout("%s rbd_dev %p write lock_state %d\n", __func__, rbd_dev,
	     rbd_dev->lock_state);
	if (!__rbd_is_lock_owner(rbd_dev)) {
		*pret = rbd_try_lock(rbd_dev);
		if (*pret)
			rbd_warn(rbd_dev, "failed to acquire lock: %d", *pret);
	}

	lock_state = rbd_dev->lock_state;
	up_write(&rbd_dev->lock_rwsem);
	return lock_state;
}

static void rbd_acquire_lock(struct work_struct *work)
{
	struct rbd_device *rbd_dev = container_of(to_delayed_work(work),
						  struct rbd_device, lock_dwork);
	enum rbd_lock_state lock_state;
	int ret = 0;

	dout("%s rbd_dev %p\n", __func__, rbd_dev);
again:
	lock_state = rbd_try_acquire_lock(rbd_dev, &ret);
	if (lock_state != RBD_LOCK_STATE_UNLOCKED || ret == -EBLACKLISTED) {
		if (lock_state == RBD_LOCK_STATE_LOCKED)
			wake_requests(rbd_dev, true);
		dout("%s rbd_dev %p lock_state %d ret %d - done\n", __func__,
		     rbd_dev, lock_state, ret);
		return;
	}

	ret = rbd_request_lock(rbd_dev);
	if (ret == -ETIMEDOUT) {
		goto again; /* treat this as a dead client */
	} else if (ret == -EROFS) {
		rbd_warn(rbd_dev, "peer will not release lock");
		/*
		 * If this is rbd_add_acquire_lock(), we want to fail
		 * immediately -- reuse BLACKLISTED flag.  Otherwise we
		 * want to block.
		 */
		if (!(rbd_dev->disk->flags & GENHD_FL_UP)) {
			set_bit(RBD_DEV_FLAG_BLACKLISTED, &rbd_dev->flags);
			/* wake "rbd map --exclusive" process */
			wake_requests(rbd_dev, false);
		}
	} else if (ret < 0) {
		rbd_warn(rbd_dev, "error requesting lock: %d", ret);
		mod_delayed_work(rbd_dev->task_wq, &rbd_dev->lock_dwork,
				 RBD_RETRY_DELAY);
	} else {
		/*
		 * lock owner acked, but resend if we don't see them
		 * release the lock
		 */
		dout("%s rbd_dev %p requeueing lock_dwork\n", __func__,
		     rbd_dev);
		mod_delayed_work(rbd_dev->task_wq, &rbd_dev->lock_dwork,
		    msecs_to_jiffies(2 * RBD_NOTIFY_TIMEOUT * MSEC_PER_SEC));
	}
}

/*
 * lock_rwsem must be held for write
 */
static bool rbd_release_lock(struct rbd_device *rbd_dev)
{
	dout("%s rbd_dev %p read lock_state %d\n", __func__, rbd_dev,
	     rbd_dev->lock_state);
	if (rbd_dev->lock_state != RBD_LOCK_STATE_LOCKED)
		return false;

	rbd_dev->lock_state = RBD_LOCK_STATE_RELEASING;
	downgrade_write(&rbd_dev->lock_rwsem);
	/*
	 * Ensure that all in-flight IO is flushed.
	 *
	 * FIXME: ceph_osdc_sync() flushes the entire OSD client, which
	 * may be shared with other devices.
	 */
	ceph_osdc_sync(&rbd_dev->rbd_client->client->osdc);
	up_read(&rbd_dev->lock_rwsem);

	down_write(&rbd_dev->lock_rwsem);
	dout("%s rbd_dev %p write lock_state %d\n", __func__, rbd_dev,
	     rbd_dev->lock_state);
	if (rbd_dev->lock_state != RBD_LOCK_STATE_RELEASING)
		return false;

	rbd_unlock(rbd_dev);
	/*
	 * Give others a chance to grab the lock - we would re-acquire
	 * almost immediately if we got new IO during ceph_osdc_sync()
	 * otherwise.  We need to ack our own notifications, so this
	 * lock_dwork will be requeued from rbd_wait_state_locked()
	 * after wake_requests() in rbd_handle_released_lock().
	 */
	cancel_delayed_work(&rbd_dev->lock_dwork);
	return true;
}

static void rbd_release_lock_work(struct work_struct *work)
{
	struct rbd_device *rbd_dev = container_of(work, struct rbd_device,
						  unlock_work);

	down_write(&rbd_dev->lock_rwsem);
	rbd_release_lock(rbd_dev);
	up_write(&rbd_dev->lock_rwsem);
}

static void rbd_handle_acquired_lock(struct rbd_device *rbd_dev, u8 struct_v,
				     void **p)
{
	struct rbd_client_id cid = { 0 };

	if (struct_v >= 2) {
		cid.gid = ceph_decode_64(p);
		cid.handle = ceph_decode_64(p);
	}

	dout("%s rbd_dev %p cid %llu-%llu\n", __func__, rbd_dev, cid.gid,
	     cid.handle);
	if (!rbd_cid_equal(&cid, &rbd_empty_cid)) {
		down_write(&rbd_dev->lock_rwsem);
		if (rbd_cid_equal(&cid, &rbd_dev->owner_cid)) {
			/*
			 * we already know that the remote client is
			 * the owner
			 */
			up_write(&rbd_dev->lock_rwsem);
			return;
		}

		rbd_set_owner_cid(rbd_dev, &cid);
		downgrade_write(&rbd_dev->lock_rwsem);
	} else {
		down_read(&rbd_dev->lock_rwsem);
	}

	if (!__rbd_is_lock_owner(rbd_dev))
		wake_requests(rbd_dev, false);
	up_read(&rbd_dev->lock_rwsem);
}

static void rbd_handle_released_lock(struct rbd_device *rbd_dev, u8 struct_v,
				     void **p)
{
	struct rbd_client_id cid = { 0 };

	if (struct_v >= 2) {
		cid.gid = ceph_decode_64(p);
		cid.handle = ceph_decode_64(p);
	}

	dout("%s rbd_dev %p cid %llu-%llu\n", __func__, rbd_dev, cid.gid,
	     cid.handle);
	if (!rbd_cid_equal(&cid, &rbd_empty_cid)) {
		down_write(&rbd_dev->lock_rwsem);
		if (!rbd_cid_equal(&cid, &rbd_dev->owner_cid)) {
			dout("%s rbd_dev %p unexpected owner, cid %llu-%llu != owner_cid %llu-%llu\n",
			     __func__, rbd_dev, cid.gid, cid.handle,
			     rbd_dev->owner_cid.gid, rbd_dev->owner_cid.handle);
			up_write(&rbd_dev->lock_rwsem);
			return;
		}

		rbd_set_owner_cid(rbd_dev, &rbd_empty_cid);
		downgrade_write(&rbd_dev->lock_rwsem);
	} else {
		down_read(&rbd_dev->lock_rwsem);
	}

	if (!__rbd_is_lock_owner(rbd_dev))
		wake_requests(rbd_dev, false);
	up_read(&rbd_dev->lock_rwsem);
}

/*
 * Returns result for ResponseMessage to be encoded (<= 0), or 1 if no
 * ResponseMessage is needed.
 */
static int rbd_handle_request_lock(struct rbd_device *rbd_dev, u8 struct_v,
				   void **p)
{
	struct rbd_client_id my_cid = rbd_get_cid(rbd_dev);
	struct rbd_client_id cid = { 0 };
	int result = 1;

	if (struct_v >= 2) {
		cid.gid = ceph_decode_64(p);
		cid.handle = ceph_decode_64(p);
	}

	dout("%s rbd_dev %p cid %llu-%llu\n", __func__, rbd_dev, cid.gid,
	     cid.handle);
	if (rbd_cid_equal(&cid, &my_cid))
		return result;

	down_read(&rbd_dev->lock_rwsem);
	if (__rbd_is_lock_owner(rbd_dev)) {
		if (rbd_dev->lock_state == RBD_LOCK_STATE_LOCKED &&
		    rbd_cid_equal(&rbd_dev->owner_cid, &rbd_empty_cid))
			goto out_unlock;

		/*
		 * encode ResponseMessage(0) so the peer can detect
		 * a missing owner
		 */
		result = 0;

		if (rbd_dev->lock_state == RBD_LOCK_STATE_LOCKED) {
			if (!rbd_dev->opts->exclusive) {
				dout("%s rbd_dev %p queueing unlock_work\n",
				     __func__, rbd_dev);
				queue_work(rbd_dev->task_wq,
					   &rbd_dev->unlock_work);
			} else {
				/* refuse to release the lock */
				result = -EROFS;
			}
		}
	}

out_unlock:
	up_read(&rbd_dev->lock_rwsem);
	return result;
}

static void __rbd_acknowledge_notify(struct rbd_device *rbd_dev,
				     u64 notify_id, u64 cookie, s32 *result)
{
	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
	char buf[4 + CEPH_ENCODING_START_BLK_LEN];
	int buf_size = sizeof(buf);
	int ret;

	if (result) {
		void *p = buf;

		/* encode ResponseMessage */
		ceph_start_encoding(&p, 1, 1,
				    buf_size - CEPH_ENCODING_START_BLK_LEN);
		ceph_encode_32(&p, *result);
	} else {
		buf_size = 0;
	}

	ret = ceph_osdc_notify_ack(osdc, &rbd_dev->header_oid,
				   &rbd_dev->header_oloc, notify_id, cookie,
				   buf, buf_size);
	if (ret)
		rbd_warn(rbd_dev, "acknowledge_notify failed: %d", ret);
}

static void rbd_acknowledge_notify(struct rbd_device *rbd_dev, u64 notify_id,
				   u64 cookie)
{
	dout("%s rbd_dev %p\n", __func__, rbd_dev);
	__rbd_acknowledge_notify(rbd_dev, notify_id, cookie, NULL);
}

static void rbd_acknowledge_notify_result(struct rbd_device *rbd_dev,
					  u64 notify_id, u64 cookie, s32 result)
{
	dout("%s rbd_dev %p result %d\n", __func__, rbd_dev, result);
	__rbd_acknowledge_notify(rbd_dev, notify_id, cookie, &result);
}

static void rbd_watch_cb(void *arg, u64 notify_id, u64 cookie,
			 u64 notifier_id, void *data, size_t data_len)
{
	struct rbd_device *rbd_dev = arg;
	void *p = data;
	void *const end = p + data_len;
	u8 struct_v = 0;
	u32 len;
	u32 notify_op;
	int ret;

	dout("%s rbd_dev %p cookie %llu notify_id %llu data_len %zu\n",
	     __func__, rbd_dev, cookie, notify_id, data_len);
	if (data_len) {
		ret = ceph_start_decoding(&p, end, 1, "NotifyMessage",
					  &struct_v, &len);
		if (ret) {
			rbd_warn(rbd_dev, "failed to decode NotifyMessage: %d",
				 ret);
			return;
		}

		notify_op = ceph_decode_32(&p);
	} else {
		/* legacy notification for header updates */
		notify_op = RBD_NOTIFY_OP_HEADER_UPDATE;
		len = 0;
	}

	dout("%s rbd_dev %p notify_op %u\n", __func__, rbd_dev, notify_op);
	switch (notify_op) {
	case RBD_NOTIFY_OP_ACQUIRED_LOCK:
		rbd_handle_acquired_lock(rbd_dev, struct_v, &p);
		rbd_acknowledge_notify(rbd_dev, notify_id, cookie);
		break;
	case RBD_NOTIFY_OP_RELEASED_LOCK:
		rbd_handle_released_lock(rbd_dev, struct_v, &p);
		rbd_acknowledge_notify(rbd_dev, notify_id, cookie);
		break;
	case RBD_NOTIFY_OP_REQUEST_LOCK:
		ret = rbd_handle_request_lock(rbd_dev, struct_v, &p);
		if (ret <= 0)
			rbd_acknowledge_notify_result(rbd_dev, notify_id,
						      cookie, ret);
		else
			rbd_acknowledge_notify(rbd_dev, notify_id, cookie);
		break;
	case RBD_NOTIFY_OP_HEADER_UPDATE:
		ret = rbd_dev_refresh(rbd_dev);
		if (ret)
			rbd_warn(rbd_dev, "refresh failed: %d", ret);

		rbd_acknowledge_notify(rbd_dev, notify_id, cookie);
		break;
	default:
		if (rbd_is_lock_owner(rbd_dev))
			rbd_acknowledge_notify_result(rbd_dev, notify_id,
						      cookie, -EOPNOTSUPP);
		else
			rbd_acknowledge_notify(rbd_dev, notify_id, cookie);
		break;
	}
}

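/*
 * Every notify is acknowledged, including unsupported ops -- the
 * notifier blocks until all watchers ack or RBD_NOTIFY_TIMEOUT
 * expires.
 */
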
static void __rbd_unregister_watch(struct rbd_device *rbd_dev);

static void rbd_watch_errcb(void *arg, u64 cookie, int err)
{
	struct rbd_device *rbd_dev = arg;

	rbd_warn(rbd_dev, "encountered watch error: %d", err);

	down_write(&rbd_dev->lock_rwsem);
	rbd_set_owner_cid(rbd_dev, &rbd_empty_cid);
	up_write(&rbd_dev->lock_rwsem);

	mutex_lock(&rbd_dev->watch_mutex);
	if (rbd_dev->watch_state == RBD_WATCH_STATE_REGISTERED) {
		__rbd_unregister_watch(rbd_dev);
		rbd_dev->watch_state = RBD_WATCH_STATE_ERROR;

		queue_delayed_work(rbd_dev->task_wq, &rbd_dev->watch_dwork, 0);
	}
	mutex_unlock(&rbd_dev->watch_mutex);
}

/*
 * watch_mutex must be locked
 */
static int __rbd_register_watch(struct rbd_device *rbd_dev)
{
	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
	struct ceph_osd_linger_request *handle;

	rbd_assert(!rbd_dev->watch_handle);
	dout("%s rbd_dev %p\n", __func__, rbd_dev);

	handle = ceph_osdc_watch(osdc, &rbd_dev->header_oid,
				 &rbd_dev->header_oloc, rbd_watch_cb,
				 rbd_watch_errcb, rbd_dev);
	if (IS_ERR(handle))
		return PTR_ERR(handle);

	rbd_dev->watch_handle = handle;
	return 0;
}

/*
 * watch_mutex must be locked
 */
static void __rbd_unregister_watch(struct rbd_device *rbd_dev)
{
	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
	int ret;

	rbd_assert(rbd_dev->watch_handle);
	dout("%s rbd_dev %p\n", __func__, rbd_dev);

	ret = ceph_osdc_unwatch(osdc, rbd_dev->watch_handle);
	if (ret)
		rbd_warn(rbd_dev, "failed to unwatch: %d", ret);

	rbd_dev->watch_handle = NULL;
}

static int rbd_register_watch(struct rbd_device *rbd_dev)
{
	int ret;

	mutex_lock(&rbd_dev->watch_mutex);
	rbd_assert(rbd_dev->watch_state == RBD_WATCH_STATE_UNREGISTERED);
	ret = __rbd_register_watch(rbd_dev);
	if (ret)
		goto out;

	rbd_dev->watch_state = RBD_WATCH_STATE_REGISTERED;
	rbd_dev->watch_cookie = rbd_dev->watch_handle->linger_id;

out:
	mutex_unlock(&rbd_dev->watch_mutex);
	return ret;
}

static void cancel_tasks_sync(struct rbd_device *rbd_dev)
{
	dout("%s rbd_dev %p\n", __func__, rbd_dev);

	cancel_work_sync(&rbd_dev->acquired_lock_work);
	cancel_work_sync(&rbd_dev->released_lock_work);
	cancel_delayed_work_sync(&rbd_dev->lock_dwork);
	cancel_work_sync(&rbd_dev->unlock_work);
}

static void rbd_unregister_watch(struct rbd_device *rbd_dev)
{
	WARN_ON(waitqueue_active(&rbd_dev->lock_waitq));
	cancel_tasks_sync(rbd_dev);

	mutex_lock(&rbd_dev->watch_mutex);
	if (rbd_dev->watch_state == RBD_WATCH_STATE_REGISTERED)
		__rbd_unregister_watch(rbd_dev);
	rbd_dev->watch_state = RBD_WATCH_STATE_UNREGISTERED;
	mutex_unlock(&rbd_dev->watch_mutex);

	cancel_delayed_work_sync(&rbd_dev->watch_dwork);
	ceph_osdc_flush_notifies(&rbd_dev->rbd_client->client->osdc);
}

/*
 * lock_rwsem must be held for write
 */
static void rbd_reacquire_lock(struct rbd_device *rbd_dev)
{
	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
	char cookie[32];
	int ret;

	WARN_ON(rbd_dev->lock_state != RBD_LOCK_STATE_LOCKED);

	format_lock_cookie(rbd_dev, cookie);
	ret = ceph_cls_set_cookie(osdc, &rbd_dev->header_oid,
				  &rbd_dev->header_oloc, RBD_LOCK_NAME,
				  CEPH_CLS_LOCK_EXCLUSIVE, rbd_dev->lock_cookie,
				  RBD_LOCK_TAG, cookie);
	if (ret) {
		if (ret != -EOPNOTSUPP)
			rbd_warn(rbd_dev, "failed to update lock cookie: %d",
				 ret);

		/*
		 * Lock cookie cannot be updated on older OSDs, so do
		 * a manual release and queue an acquire.
		 */
		if (rbd_release_lock(rbd_dev))
			queue_delayed_work(rbd_dev->task_wq,
					   &rbd_dev->lock_dwork, 0);
	} else {
		__rbd_lock(rbd_dev, cookie);
	}
}

static void rbd_reregister_watch(struct work_struct *work)
{
	struct rbd_device *rbd_dev = container_of(to_delayed_work(work),
					    struct rbd_device, watch_dwork);
	int ret;

	dout("%s rbd_dev %p\n", __func__, rbd_dev);

	mutex_lock(&rbd_dev->watch_mutex);
	if (rbd_dev->watch_state != RBD_WATCH_STATE_ERROR) {
		mutex_unlock(&rbd_dev->watch_mutex);
		return;
	}

	ret = __rbd_register_watch(rbd_dev);
	if (ret) {
		rbd_warn(rbd_dev, "failed to reregister watch: %d", ret);
		if (ret == -EBLACKLISTED || ret == -ENOENT) {
			set_bit(RBD_DEV_FLAG_BLACKLISTED, &rbd_dev->flags);
			wake_requests(rbd_dev, true);
		} else {
			queue_delayed_work(rbd_dev->task_wq,
					   &rbd_dev->watch_dwork,
					   RBD_RETRY_DELAY);
		}
		mutex_unlock(&rbd_dev->watch_mutex);
		return;
	}

	rbd_dev->watch_state = RBD_WATCH_STATE_REGISTERED;
	rbd_dev->watch_cookie = rbd_dev->watch_handle->linger_id;
	mutex_unlock(&rbd_dev->watch_mutex);

	down_write(&rbd_dev->lock_rwsem);
	if (rbd_dev->lock_state == RBD_LOCK_STATE_LOCKED)
		rbd_reacquire_lock(rbd_dev);
	up_write(&rbd_dev->lock_rwsem);

	ret = rbd_dev_refresh(rbd_dev);
	if (ret)
		rbd_warn(rbd_dev, "reregistration refresh failed: %d", ret);
}

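/*
 * Watch state transitions: UNREGISTERED -> REGISTERED on map,
 * REGISTERED -> ERROR in rbd_watch_errcb(), and ERROR -> REGISTERED
 * here, with the exclusive lock reacquired (its cookie rotated to
 * match the new watch) if it was held across the error.
 */
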
/*
 * Synchronous osd object method call.  Returns the number of bytes
 * returned in the inbound buffer, or a negative error code.
 */
static int rbd_obj_method_sync(struct rbd_device *rbd_dev,
			       struct ceph_object_id *oid,
			       struct ceph_object_locator *oloc,
			       const char *method_name,
			       const void *outbound,
			       size_t outbound_size,
			       void *inbound,
			       size_t inbound_size)
{
	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
	struct page *req_page = NULL;
	struct page *reply_page;
	int ret;

	/*
	 * Method calls are ultimately read operations.  The result
	 * should be placed into the inbound buffer provided.  They
	 * also supply outbound data--parameters for the object
	 * method.  Currently if this is present it will be a
	 * snapshot id.
	 */
	if (outbound) {
		if (outbound_size > PAGE_SIZE)
			return -E2BIG;

		req_page = alloc_page(GFP_KERNEL);
		if (!req_page)
			return -ENOMEM;

		memcpy(page_address(req_page), outbound, outbound_size);
	}

	reply_page = alloc_page(GFP_KERNEL);
	if (!reply_page) {
		if (req_page)
			__free_page(req_page);
		return -ENOMEM;
	}

	ret = ceph_osdc_call(osdc, oid, oloc, RBD_DRV_NAME, method_name,
			     CEPH_OSD_FLAG_READ, req_page, outbound_size,
			     reply_page, &inbound_size);
	if (!ret) {
		memcpy(inbound, page_address(reply_page), inbound_size);
		ret = inbound_size;
	}

	if (req_page)
		__free_page(req_page);
	__free_page(reply_page);
	return ret;
}

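/*
 * A minimal sketch of a caller, assuming the "get_size" method of the
 * rbd header object, which takes an encoded snapshot id and returns
 * the encoded order and size:
 *
 *	__le64 snapid = cpu_to_le64(CEPH_NOSNAP);
 *	struct {
 *		u8 order;
 *		__le64 size;
 *	} __attribute__ ((packed)) size_buf = { 0 };
 *
 *	ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
 *				  &rbd_dev->header_oloc, "get_size",
 *				  &snapid, sizeof(snapid),
 *				  &size_buf, sizeof(size_buf));
 */
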
/*
 * lock_rwsem must be held for read
 */
static int rbd_wait_state_locked(struct rbd_device *rbd_dev, bool may_acquire)
{
	DEFINE_WAIT(wait);
	unsigned long timeout;
	int ret = 0;

	if (test_bit(RBD_DEV_FLAG_BLACKLISTED, &rbd_dev->flags))
		return -EBLACKLISTED;

	if (rbd_dev->lock_state == RBD_LOCK_STATE_LOCKED)
		return 0;

	if (!may_acquire) {
		rbd_warn(rbd_dev, "exclusive lock required");
		return -EROFS;
	}

	do {
		/*
		 * Note the use of mod_delayed_work() in rbd_acquire_lock()
		 * and cancel_delayed_work() in wake_requests().
		 */
		dout("%s rbd_dev %p queueing lock_dwork\n", __func__, rbd_dev);
		queue_delayed_work(rbd_dev->task_wq, &rbd_dev->lock_dwork, 0);
		prepare_to_wait_exclusive(&rbd_dev->lock_waitq, &wait,
					  TASK_UNINTERRUPTIBLE);
		up_read(&rbd_dev->lock_rwsem);
		timeout = schedule_timeout(ceph_timeout_jiffies(
						rbd_dev->opts->lock_timeout));
		down_read(&rbd_dev->lock_rwsem);
		if (test_bit(RBD_DEV_FLAG_BLACKLISTED, &rbd_dev->flags)) {
			ret = -EBLACKLISTED;
			break;
		}
		if (!timeout) {
			rbd_warn(rbd_dev, "timed out waiting for lock");
			ret = -ETIMEDOUT;
			break;
		}
	} while (rbd_dev->lock_state != RBD_LOCK_STATE_LOCKED);

	finish_wait(&rbd_dev->lock_waitq, &wait);
	return ret;
}

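/*
 * Handle one queued block-layer request.  Runs from the rbd
 * workqueue (see rbd_queue_rq()), where it is safe to sleep while
 * waiting for the exclusive lock.
 */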
static void rbd_queue_workfn(struct work_struct *work)
{
	struct request *rq = blk_mq_rq_from_pdu(work);
	struct rbd_device *rbd_dev = rq->q->queuedata;
	struct rbd_img_request *img_request;
	struct ceph_snap_context *snapc = NULL;
	u64 offset = (u64)blk_rq_pos(rq) << SECTOR_SHIFT;
	u64 length = blk_rq_bytes(rq);
	enum obj_operation_type op_type;
	u64 mapping_size;
	bool must_be_locked;
	int result;

	switch (req_op(rq)) {
	case REQ_OP_DISCARD:
		op_type = OBJ_OP_DISCARD;
		break;
	case REQ_OP_WRITE_ZEROES:
		op_type = OBJ_OP_ZEROOUT;
		break;
	case REQ_OP_WRITE:
		op_type = OBJ_OP_WRITE;
		break;
	case REQ_OP_READ:
		op_type = OBJ_OP_READ;
		break;
	default:
		dout("%s: non-fs request type %d\n", __func__, req_op(rq));
		result = -EIO;
		goto err;
	}

	/* Ignore/skip any zero-length requests */

	if (!length) {
		dout("%s: zero-length request\n", __func__);
		result = 0;
		goto err_rq;
	}

	if (op_type != OBJ_OP_READ && rbd_dev->spec->snap_id != CEPH_NOSNAP) {
		rbd_warn(rbd_dev, "%s on read-only snapshot",
			 obj_op_name(op_type));
		result = -EIO;
		goto err;
	}

	/*
	 * Quit early if the mapped snapshot no longer exists.  It's
	 * still possible the snapshot will have disappeared by the
	 * time our request arrives at the osd, but there's no sense in
	 * sending it if we already know.
	 */
	if (!test_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags)) {
		dout("request for non-existent snapshot");
		rbd_assert(rbd_dev->spec->snap_id != CEPH_NOSNAP);
		result = -ENXIO;
		goto err_rq;
	}

	if (offset && length > U64_MAX - offset + 1) {
		rbd_warn(rbd_dev, "bad request range (%llu~%llu)", offset,
			 length);
		result = -EINVAL;
		goto err_rq;	/* Shouldn't happen */
	}

	blk_mq_start_request(rq);

	down_read(&rbd_dev->header_rwsem);
	mapping_size = rbd_dev->mapping.size;
	if (op_type != OBJ_OP_READ) {
		snapc = rbd_dev->header.snapc;
		ceph_get_snap_context(snapc);
	}
	up_read(&rbd_dev->header_rwsem);

	if (offset + length > mapping_size) {
		rbd_warn(rbd_dev, "beyond EOD (%llu~%llu > %llu)", offset,
			 length, mapping_size);
		result = -EIO;
		goto err_rq;
	}

	must_be_locked =
	    (rbd_dev->header.features & RBD_FEATURE_EXCLUSIVE_LOCK) &&
	    (op_type != OBJ_OP_READ || rbd_dev->opts->lock_on_read);
	if (must_be_locked) {
		down_read(&rbd_dev->lock_rwsem);
		result = rbd_wait_state_locked(rbd_dev,
					       !rbd_dev->opts->exclusive);
		if (result)
			goto err_unlock;
	}

	img_request = rbd_img_request_create(rbd_dev, op_type, snapc);
	if (!img_request) {
		result = -ENOMEM;
		goto err_unlock;
	}
	img_request->rq = rq;
	snapc = NULL; /* img_request consumes a ref */

	if (op_type == OBJ_OP_DISCARD || op_type == OBJ_OP_ZEROOUT)
		result = rbd_img_fill_nodata(img_request, offset, length);
	else
		result = rbd_img_fill_from_bio(img_request, offset, length,
					       rq->bio);
	if (result)
		goto err_img_request;

	rbd_img_handle_request(img_request, 0);
	if (must_be_locked)
		up_read(&rbd_dev->lock_rwsem);
	return;

err_img_request:
	rbd_img_request_put(img_request);
err_unlock:
	if (must_be_locked)
		up_read(&rbd_dev->lock_rwsem);
err_rq:
	if (result)
		rbd_warn(rbd_dev, "%s %llx at %llx result %d",
			 obj_op_name(op_type), length, offset, result);
	ceph_put_snap_context(snapc);
err:
	blk_mq_end_request(rq, errno_to_blk_status(result));
}

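/* blk-mq ->queue_rq(): defer the request to the rbd workqueue */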
static blk_status_t rbd_queue_rq(struct blk_mq_hw_ctx *hctx,
		const struct blk_mq_queue_data *bd)
{
	struct request *rq = bd->rq;
	struct work_struct *work = blk_mq_rq_to_pdu(rq);

	queue_work(rbd_wq, work);
	return BLK_STS_OK;
}

static void rbd_free_disk(struct rbd_device *rbd_dev)
{
	blk_cleanup_queue(rbd_dev->disk->queue);
	blk_mq_free_tag_set(&rbd_dev->tag_set);
	put_disk(rbd_dev->disk);
	rbd_dev->disk = NULL;
}

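/*
 * Synchronously read up to @buf_len bytes from the start of the given
 * object into @buf.  Used below to fetch the v1 on-disk image header.
 */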
static int rbd_obj_read_sync(struct rbd_device *rbd_dev,
			     struct ceph_object_id *oid,
			     struct ceph_object_locator *oloc,
			     void *buf, int buf_len)
{
	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
	struct ceph_osd_request *req;
	struct page **pages;
	int num_pages = calc_pages_for(0, buf_len);
	int ret;

	req = ceph_osdc_alloc_request(osdc, NULL, 1, false, GFP_KERNEL);
	if (!req)
		return -ENOMEM;

	ceph_oid_copy(&req->r_base_oid, oid);
	ceph_oloc_copy(&req->r_base_oloc, oloc);
	req->r_flags = CEPH_OSD_FLAG_READ;

	pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL);
	if (IS_ERR(pages)) {
		ret = PTR_ERR(pages);
		goto out_req;
	}

	osd_req_op_extent_init(req, 0, CEPH_OSD_OP_READ, 0, buf_len, 0, 0);
	osd_req_op_extent_osd_data_pages(req, 0, pages, buf_len, 0, false,
					 true);

	ret = ceph_osdc_alloc_messages(req, GFP_KERNEL);
	if (ret)
		goto out_req;

	ceph_osdc_start_request(osdc, req, false);
	ret = ceph_osdc_wait_request(osdc, req);
	if (ret >= 0)
		ceph_copy_from_page_vector(pages, buf, 0, ret);

out_req:
	ceph_osdc_put_request(req);
	return ret;
}

/*
 * Read the complete header for the given rbd device.  On successful
 * return, the rbd_dev->header field will contain up-to-date
 * information about the image.
 */
static int rbd_dev_v1_header_info(struct rbd_device *rbd_dev)
{
	struct rbd_image_header_ondisk *ondisk = NULL;
	u32 snap_count = 0;
	u64 names_size = 0;
	u32 want_count;
	int ret;

	/*
	 * The complete header will include an array of its 64-bit
	 * snapshot ids, followed by the names of those snapshots as
	 * a contiguous block of NUL-terminated strings.  Note that
	 * the number of snapshots could change by the time we read
	 * it in, in which case we re-read it.
	 */
	do {
		size_t size;

		kfree(ondisk);

		size = sizeof (*ondisk);
		size += snap_count * sizeof (struct rbd_image_snap_ondisk);
		size += names_size;
		ondisk = kmalloc(size, GFP_KERNEL);
		if (!ondisk)
			return -ENOMEM;

		ret = rbd_obj_read_sync(rbd_dev, &rbd_dev->header_oid,
					&rbd_dev->header_oloc, ondisk, size);
		if (ret < 0)
			goto out;
		if ((size_t)ret < size) {
			ret = -ENXIO;
			rbd_warn(rbd_dev, "short header read (want %zd got %d)",
				 size, ret);
			goto out;
		}
		if (!rbd_dev_ondisk_valid(ondisk)) {
			ret = -ENXIO;
			rbd_warn(rbd_dev, "invalid header");
			goto out;
		}

		names_size = le64_to_cpu(ondisk->snap_names_len);
		want_count = snap_count;
		snap_count = le32_to_cpu(ondisk->snap_count);
	} while (snap_count != want_count);

	ret = rbd_header_from_disk(rbd_dev, ondisk);
out:
	kfree(ondisk);

	return ret;
}

/*
 * Clear the rbd device's EXISTS flag if the snapshot it's mapped to
 * has disappeared from the (just updated) snapshot context.
 */
static void rbd_exists_validate(struct rbd_device *rbd_dev)
{
	u64 snap_id;

	if (!test_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags))
		return;

	snap_id = rbd_dev->spec->snap_id;
	if (snap_id == CEPH_NOSNAP)
		return;

	if (rbd_dev_snap_index(rbd_dev, snap_id) == BAD_SNAP_INDEX)
		clear_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags);
}

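/* Propagate a mapping size change to the block layer */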
static void rbd_dev_update_size(struct rbd_device *rbd_dev)
{
	sector_t size;

	/*
	 * If EXISTS is not set, rbd_dev->disk may be NULL, so don't
	 * try to update its size.  If REMOVING is set, updating size
	 * is just useless work since the device can't be opened.
	 */
	if (test_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags) &&
	    !test_bit(RBD_DEV_FLAG_REMOVING, &rbd_dev->flags)) {
		size = (sector_t)rbd_dev->mapping.size / SECTOR_SIZE;
		dout("setting size to %llu sectors", (unsigned long long)size);
		set_capacity(rbd_dev->disk, size);
		revalidate_disk(rbd_dev->disk);
	}
}

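/*
 * Re-read the image header and, for a clone, the parent info, then
 * update the mapping size if it changed.
 */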
static int rbd_dev_refresh(struct rbd_device *rbd_dev)
{
	u64 mapping_size;
	int ret;

	down_write(&rbd_dev->header_rwsem);
	mapping_size = rbd_dev->mapping.size;

	ret = rbd_dev_header_info(rbd_dev);
	if (ret)
		goto out;

	/*
	 * If there is a parent, see if it has disappeared due to the
	 * mapped image getting flattened.
	 */
	if (rbd_dev->parent) {
		ret = rbd_dev_v2_parent_info(rbd_dev);
		if (ret)
			goto out;
	}

	if (rbd_dev->spec->snap_id == CEPH_NOSNAP) {
		rbd_dev->mapping.size = rbd_dev->header.image_size;
	} else {
		/* validate mapped snapshot's EXISTS flag */
		rbd_exists_validate(rbd_dev);
	}

out:
	up_write(&rbd_dev->header_rwsem);
	if (!ret && mapping_size != rbd_dev->mapping.size)
		rbd_dev_update_size(rbd_dev);

	return ret;
}

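/* blk-mq ->init_request(): set up the per-request work item */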
static int rbd_init_request(struct blk_mq_tag_set *set, struct request *rq,
		unsigned int hctx_idx, unsigned int numa_node)
{
	struct work_struct *work = blk_mq_rq_to_pdu(rq);

	INIT_WORK(work, rbd_queue_workfn);
	return 0;
}

static const struct blk_mq_ops rbd_mq_ops = {
	.queue_rq	= rbd_queue_rq,
	.init_request	= rbd_init_request,
};

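/*
 * Allocate the gendisk, set up the blk-mq tag set and request queue,
 * and apply the rbd-specific queue limits.
 */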
static int rbd_init_disk(struct rbd_device *rbd_dev)
{
	struct gendisk *disk;
	struct request_queue *q;
	unsigned int objset_bytes =
	    rbd_dev->layout.object_size * rbd_dev->layout.stripe_count;
	int err;

	/* create gendisk info */
	disk = alloc_disk(single_major ?
			  (1 << RBD_SINGLE_MAJOR_PART_SHIFT) :
			  RBD_MINORS_PER_MAJOR);
	if (!disk)
		return -ENOMEM;

	snprintf(disk->disk_name, sizeof(disk->disk_name), RBD_DRV_NAME "%d",
		 rbd_dev->dev_id);
	disk->major = rbd_dev->major;
	disk->first_minor = rbd_dev->minor;
	if (single_major)
		disk->flags |= GENHD_FL_EXT_DEVT;
	disk->fops = &rbd_bd_ops;
	disk->private_data = rbd_dev;

	memset(&rbd_dev->tag_set, 0, sizeof(rbd_dev->tag_set));
	rbd_dev->tag_set.ops = &rbd_mq_ops;
	rbd_dev->tag_set.queue_depth = rbd_dev->opts->queue_depth;
	rbd_dev->tag_set.numa_node = NUMA_NO_NODE;
	rbd_dev->tag_set.flags = BLK_MQ_F_SHOULD_MERGE;
	rbd_dev->tag_set.nr_hw_queues = 1;
	rbd_dev->tag_set.cmd_size = sizeof(struct work_struct);

	err = blk_mq_alloc_tag_set(&rbd_dev->tag_set);
	if (err)
		goto out_disk;

	q = blk_mq_init_queue(&rbd_dev->tag_set);
	if (IS_ERR(q)) {
		err = PTR_ERR(q);
		goto out_tag_set;
	}

	blk_queue_flag_set(QUEUE_FLAG_NONROT, q);
	/* QUEUE_FLAG_ADD_RANDOM is off by default for blk-mq */

	blk_queue_max_hw_sectors(q, objset_bytes >> SECTOR_SHIFT);
	q->limits.max_sectors = queue_max_hw_sectors(q);
	blk_queue_max_segments(q, USHRT_MAX);
	blk_queue_max_segment_size(q, UINT_MAX);
	blk_queue_io_min(q, rbd_dev->opts->alloc_size);
	blk_queue_io_opt(q, rbd_dev->opts->alloc_size);

	if (rbd_dev->opts->trim) {
		blk_queue_flag_set(QUEUE_FLAG_DISCARD, q);
		q->limits.discard_granularity = rbd_dev->opts->alloc_size;
		blk_queue_max_discard_sectors(q, objset_bytes >> SECTOR_SHIFT);
		blk_queue_max_write_zeroes_sectors(q, objset_bytes >> SECTOR_SHIFT);
	}

	if (!ceph_test_opt(rbd_dev->rbd_client->client, NOCRC))
		q->backing_dev_info->capabilities |= BDI_CAP_STABLE_WRITES;

	/*
	 * disk_release() expects a queue ref from add_disk() and will
	 * put it.  Hold an extra ref until add_disk() is called.
	 */
	WARN_ON(!blk_get_queue(q));
	disk->queue = q;
	q->queuedata = rbd_dev;

	rbd_dev->disk = disk;

	return 0;
out_tag_set:
	blk_mq_free_tag_set(&rbd_dev->tag_set);
out_disk:
	put_disk(disk);
	return err;
}

/*
  sysfs
*/

static struct rbd_device *dev_to_rbd_dev(struct device *dev)
{
	return container_of(dev, struct rbd_device, dev);
}

static ssize_t rbd_size_show(struct device *dev,
			     struct device_attribute *attr, char *buf)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);

	return sprintf(buf, "%llu\n",
		       (unsigned long long)rbd_dev->mapping.size);
}

/*
 * Note this shows the features for whatever's mapped, which is not
 * necessarily the base image.
 */
static ssize_t rbd_features_show(struct device *dev,
				 struct device_attribute *attr, char *buf)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);

	return sprintf(buf, "0x%016llx\n",
		       (unsigned long long)rbd_dev->mapping.features);
}

static ssize_t rbd_major_show(struct device *dev,
			      struct device_attribute *attr, char *buf)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);

	if (rbd_dev->major)
		return sprintf(buf, "%d\n", rbd_dev->major);

	return sprintf(buf, "(none)\n");
}

static ssize_t rbd_minor_show(struct device *dev,
			      struct device_attribute *attr, char *buf)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);

	return sprintf(buf, "%d\n", rbd_dev->minor);
}

static ssize_t rbd_client_addr_show(struct device *dev,
				    struct device_attribute *attr, char *buf)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
	struct ceph_entity_addr *client_addr =
	    ceph_client_addr(rbd_dev->rbd_client->client);

	return sprintf(buf, "%pISpc/%u\n", &client_addr->in_addr,
		       le32_to_cpu(client_addr->nonce));
}

static ssize_t rbd_client_id_show(struct device *dev,
				  struct device_attribute *attr, char *buf)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);

	return sprintf(buf, "client%lld\n",
		       ceph_client_gid(rbd_dev->rbd_client->client));
}

static ssize_t rbd_cluster_fsid_show(struct device *dev,
				     struct device_attribute *attr, char *buf)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);

	return sprintf(buf, "%pU\n", &rbd_dev->rbd_client->client->fsid);
}

static ssize_t rbd_config_info_show(struct device *dev,
				    struct device_attribute *attr, char *buf)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);

	return sprintf(buf, "%s\n", rbd_dev->config_info);
}

static ssize_t rbd_pool_show(struct device *dev,
			     struct device_attribute *attr, char *buf)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);

	return sprintf(buf, "%s\n", rbd_dev->spec->pool_name);
}

static ssize_t rbd_pool_id_show(struct device *dev,
				struct device_attribute *attr, char *buf)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);

	return sprintf(buf, "%llu\n",
		       (unsigned long long) rbd_dev->spec->pool_id);
}

static ssize_t rbd_pool_ns_show(struct device *dev,
				struct device_attribute *attr, char *buf)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);

	return sprintf(buf, "%s\n", rbd_dev->spec->pool_ns ?: "");
}

static ssize_t rbd_name_show(struct device *dev,
			     struct device_attribute *attr, char *buf)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);

	if (rbd_dev->spec->image_name)
		return sprintf(buf, "%s\n", rbd_dev->spec->image_name);

	return sprintf(buf, "(unknown)\n");
}

static ssize_t rbd_image_id_show(struct device *dev,
				 struct device_attribute *attr, char *buf)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);

	return sprintf(buf, "%s\n", rbd_dev->spec->image_id);
}

/*
 * Shows the name of the currently-mapped snapshot (or
 * RBD_SNAP_HEAD_NAME for the base image).
 */
static ssize_t rbd_snap_show(struct device *dev,
			     struct device_attribute *attr,
			     char *buf)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);

	return sprintf(buf, "%s\n", rbd_dev->spec->snap_name);
}

static ssize_t rbd_snap_id_show(struct device *dev,
				struct device_attribute *attr, char *buf)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);

	return sprintf(buf, "%llu\n", rbd_dev->spec->snap_id);
}

/*
 * For a v2 image, shows the chain of parent images, separated by empty
 * lines.  For v1 images or if there is no parent, shows "(no parent
 * image)".
 */
static ssize_t rbd_parent_show(struct device *dev,
			       struct device_attribute *attr,
			       char *buf)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
	ssize_t count = 0;

	if (!rbd_dev->parent)
		return sprintf(buf, "(no parent image)\n");

	for ( ; rbd_dev->parent; rbd_dev = rbd_dev->parent) {
		struct rbd_spec *spec = rbd_dev->parent_spec;

		count += sprintf(&buf[count], "%s"
			    "pool_id %llu\npool_name %s\n"
			    "pool_ns %s\n"
			    "image_id %s\nimage_name %s\n"
			    "snap_id %llu\nsnap_name %s\n"
			    "overlap %llu\n",
			    !count ? "" : "\n", /* first? */
			    spec->pool_id, spec->pool_name,
			    spec->pool_ns ?: "",
			    spec->image_id, spec->image_name ?: "(unknown)",
			    spec->snap_id, spec->snap_name,
			    rbd_dev->parent_overlap);
	}

	return count;
}

static ssize_t rbd_image_refresh(struct device *dev,
				 struct device_attribute *attr,
				 const char *buf,
				 size_t size)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
	int ret;

	ret = rbd_dev_refresh(rbd_dev);
	if (ret)
		return ret;

	return size;
}

static DEVICE_ATTR(size, 0444, rbd_size_show, NULL);
static DEVICE_ATTR(features, 0444, rbd_features_show, NULL);
static DEVICE_ATTR(major, 0444, rbd_major_show, NULL);
static DEVICE_ATTR(minor, 0444, rbd_minor_show, NULL);
static DEVICE_ATTR(client_addr, 0444, rbd_client_addr_show, NULL);
static DEVICE_ATTR(client_id, 0444, rbd_client_id_show, NULL);
static DEVICE_ATTR(cluster_fsid, 0444, rbd_cluster_fsid_show, NULL);
static DEVICE_ATTR(config_info, 0400, rbd_config_info_show, NULL);
static DEVICE_ATTR(pool, 0444, rbd_pool_show, NULL);
static DEVICE_ATTR(pool_id, 0444, rbd_pool_id_show, NULL);
static DEVICE_ATTR(pool_ns, 0444, rbd_pool_ns_show, NULL);
static DEVICE_ATTR(name, 0444, rbd_name_show, NULL);
static DEVICE_ATTR(image_id, 0444, rbd_image_id_show, NULL);
static DEVICE_ATTR(refresh, 0200, NULL, rbd_image_refresh);
static DEVICE_ATTR(current_snap, 0444, rbd_snap_show, NULL);
static DEVICE_ATTR(snap_id, 0444, rbd_snap_id_show, NULL);
static DEVICE_ATTR(parent, 0444, rbd_parent_show, NULL);

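/* the attributes above are exported under /sys/bus/rbd/devices/<dev-id>/ */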
static struct attribute *rbd_attrs[] = {
	&dev_attr_size.attr,
	&dev_attr_features.attr,
	&dev_attr_major.attr,
	&dev_attr_minor.attr,
	&dev_attr_client_addr.attr,
	&dev_attr_client_id.attr,
	&dev_attr_cluster_fsid.attr,
	&dev_attr_config_info.attr,
	&dev_attr_pool.attr,
	&dev_attr_pool_id.attr,
	&dev_attr_pool_ns.attr,
	&dev_attr_name.attr,
	&dev_attr_image_id.attr,
	&dev_attr_current_snap.attr,
	&dev_attr_snap_id.attr,
	&dev_attr_parent.attr,
	&dev_attr_refresh.attr,
	NULL
};

static struct attribute_group rbd_attr_group = {
	.attrs = rbd_attrs,
};

static const struct attribute_group *rbd_attr_groups[] = {
	&rbd_attr_group,
	NULL
};

static void rbd_dev_release(struct device *dev);

static const struct device_type rbd_device_type = {
	.name		= "rbd",
	.groups		= rbd_attr_groups,
	.release	= rbd_dev_release,
};

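/*
 * An rbd_spec is a reference-counted identifier for an image or
 * snapshot: pool, image and snap, by id and by name where known.
 */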
static struct rbd_spec *rbd_spec_get(struct rbd_spec *spec)
{
	kref_get(&spec->kref);

	return spec;
}

static void rbd_spec_free(struct kref *kref);
static void rbd_spec_put(struct rbd_spec *spec)
{
	if (spec)
		kref_put(&spec->kref, rbd_spec_free);
}

static struct rbd_spec *rbd_spec_alloc(void)
{
	struct rbd_spec *spec;

	spec = kzalloc(sizeof (*spec), GFP_KERNEL);
	if (!spec)
		return NULL;

	spec->pool_id = CEPH_NOPOOL;
	spec->snap_id = CEPH_NOSNAP;
	kref_init(&spec->kref);

	return spec;
}

static void rbd_spec_free(struct kref *kref)
{
	struct rbd_spec *spec = container_of(kref, struct rbd_spec, kref);

	kfree(spec->pool_name);
	kfree(spec->pool_ns);
	kfree(spec->image_id);
	kfree(spec->image_name);
	kfree(spec->snap_name);
	kfree(spec);
}

static void rbd_dev_free(struct rbd_device *rbd_dev)
{
	WARN_ON(rbd_dev->watch_state != RBD_WATCH_STATE_UNREGISTERED);
	WARN_ON(rbd_dev->lock_state != RBD_LOCK_STATE_UNLOCKED);

	ceph_oid_destroy(&rbd_dev->header_oid);
	ceph_oloc_destroy(&rbd_dev->header_oloc);
	kfree(rbd_dev->config_info);

	rbd_put_client(rbd_dev->rbd_client);
	rbd_spec_put(rbd_dev->spec);
	kfree(rbd_dev->opts);
	kfree(rbd_dev);
}

static void rbd_dev_release(struct device *dev)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
	bool need_put = !!rbd_dev->opts;

	if (need_put) {
		destroy_workqueue(rbd_dev->task_wq);
		ida_simple_remove(&rbd_dev_id_ida, rbd_dev->dev_id);
	}

	rbd_dev_free(rbd_dev);

	/*
	 * This is racy, but way better than putting module outside of
	 * the release callback.  The race window is pretty small, so
	 * doing something similar to dm (dm-builtin.c) is overkill.
	 */
	if (need_put)
		module_put(THIS_MODULE);
}

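/*
 * Allocate an rbd_device and initialize its locks, work items, header
 * oid/oloc and the embedded struct device.
 */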
static struct rbd_device *__rbd_dev_create(struct rbd_client *rbdc,
					   struct rbd_spec *spec)
{
	struct rbd_device *rbd_dev;

	rbd_dev = kzalloc(sizeof(*rbd_dev), GFP_KERNEL);
	if (!rbd_dev)
		return NULL;

	spin_lock_init(&rbd_dev->lock);
	INIT_LIST_HEAD(&rbd_dev->node);
	init_rwsem(&rbd_dev->header_rwsem);

	rbd_dev->header.data_pool_id = CEPH_NOPOOL;
	ceph_oid_init(&rbd_dev->header_oid);
	rbd_dev->header_oloc.pool = spec->pool_id;
	if (spec->pool_ns) {
		WARN_ON(!*spec->pool_ns);
		rbd_dev->header_oloc.pool_ns =
		    ceph_find_or_create_string(spec->pool_ns,
					       strlen(spec->pool_ns));
	}

	mutex_init(&rbd_dev->watch_mutex);
	rbd_dev->watch_state = RBD_WATCH_STATE_UNREGISTERED;
	INIT_DELAYED_WORK(&rbd_dev->watch_dwork, rbd_reregister_watch);

	init_rwsem(&rbd_dev->lock_rwsem);
	rbd_dev->lock_state = RBD_LOCK_STATE_UNLOCKED;
	INIT_WORK(&rbd_dev->acquired_lock_work, rbd_notify_acquired_lock);
	INIT_WORK(&rbd_dev->released_lock_work, rbd_notify_released_lock);
	INIT_DELAYED_WORK(&rbd_dev->lock_dwork, rbd_acquire_lock);
	INIT_WORK(&rbd_dev->unlock_work, rbd_release_lock_work);
	init_waitqueue_head(&rbd_dev->lock_waitq);

	rbd_dev->dev.bus = &rbd_bus_type;
	rbd_dev->dev.type = &rbd_device_type;
	rbd_dev->dev.parent = &rbd_root_dev;
	device_initialize(&rbd_dev->dev);

	rbd_dev->rbd_client = rbdc;
	rbd_dev->spec = spec;

	return rbd_dev;
}

/*
 * Create a mapping rbd_dev.
 */
static struct rbd_device *rbd_dev_create(struct rbd_client *rbdc,
					 struct rbd_spec *spec,
					 struct rbd_options *opts)
{
	struct rbd_device *rbd_dev;

	rbd_dev = __rbd_dev_create(rbdc, spec);
	if (!rbd_dev)
		return NULL;

	rbd_dev->opts = opts;

	/* get an id and fill in device name */
	rbd_dev->dev_id = ida_simple_get(&rbd_dev_id_ida, 0,
					 minor_to_rbd_dev_id(1 << MINORBITS),
					 GFP_KERNEL);
	if (rbd_dev->dev_id < 0)
		goto fail_rbd_dev;

	sprintf(rbd_dev->name, RBD_DRV_NAME "%d", rbd_dev->dev_id);
	rbd_dev->task_wq = alloc_ordered_workqueue("%s-tasks", WQ_MEM_RECLAIM,
						   rbd_dev->name);
	if (!rbd_dev->task_wq)
		goto fail_dev_id;

	/* we have a ref from do_rbd_add() */
	__module_get(THIS_MODULE);

	dout("%s rbd_dev %p dev_id %d\n", __func__, rbd_dev, rbd_dev->dev_id);
	return rbd_dev;

fail_dev_id:
	ida_simple_remove(&rbd_dev_id_ida, rbd_dev->dev_id);
fail_rbd_dev:
	rbd_dev_free(rbd_dev);
	return NULL;
}

static void rbd_dev_destroy(struct rbd_device *rbd_dev)
{
	if (rbd_dev)
		put_device(&rbd_dev->dev);
}

/*
 * Get the size and object order for an image snapshot, or if
 * snap_id is CEPH_NOSNAP, gets this information for the base
 * image.
 */
static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id,
				 u8 *order, u64 *snap_size)
{
	__le64 snapid = cpu_to_le64(snap_id);
	int ret;
	struct {
		u8 order;
		__le64 size;
	} __attribute__ ((packed)) size_buf = { 0 };

	ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
				  &rbd_dev->header_oloc, "get_size",
				  &snapid, sizeof(snapid),
				  &size_buf, sizeof(size_buf));
	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
	if (ret < 0)
		return ret;
	if (ret < sizeof (size_buf))
		return -ERANGE;

	if (order) {
		*order = size_buf.order;
		dout(" order %u", (unsigned int)*order);
	}
	*snap_size = le64_to_cpu(size_buf.size);

	dout(" snap_id 0x%016llx snap_size = %llu\n",
	     (unsigned long long)snap_id,
	     (unsigned long long)*snap_size);

	return 0;
}

static int rbd_dev_v2_image_size(struct rbd_device *rbd_dev)
{
	return _rbd_dev_v2_snap_size(rbd_dev, CEPH_NOSNAP,
				     &rbd_dev->header.obj_order,
				     &rbd_dev->header.image_size);
}

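/* Fetch the data object name prefix for a v2 image */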
static int rbd_dev_v2_object_prefix(struct rbd_device *rbd_dev)
{
	void *reply_buf;
	int ret;
	void *p;

	reply_buf = kzalloc(RBD_OBJ_PREFIX_LEN_MAX, GFP_KERNEL);
	if (!reply_buf)
		return -ENOMEM;

	ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
				  &rbd_dev->header_oloc, "get_object_prefix",
				  NULL, 0, reply_buf, RBD_OBJ_PREFIX_LEN_MAX);
	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
	if (ret < 0)
		goto out;

	p = reply_buf;
	rbd_dev->header.object_prefix = ceph_extract_encoded_string(&p,
						p + ret, NULL, GFP_NOIO);
	ret = 0;

	if (IS_ERR(rbd_dev->header.object_prefix)) {
		ret = PTR_ERR(rbd_dev->header.object_prefix);
		rbd_dev->header.object_prefix = NULL;
	} else {
		dout(" object_prefix = %s\n", rbd_dev->header.object_prefix);
	}
out:
	kfree(reply_buf);

	return ret;
}

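/*
 * Fetch the feature bits for the given snapshot (or for the base
 * image if snap_id is CEPH_NOSNAP).  Fails with -ENXIO if the image
 * uses incompatible features we don't support.
 */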
static int _rbd_dev_v2_snap_features(struct rbd_device *rbd_dev, u64 snap_id,
				     u64 *snap_features)
{
	__le64 snapid = cpu_to_le64(snap_id);
	struct {
		__le64 features;
		__le64 incompat;
	} __attribute__ ((packed)) features_buf = { 0 };
	u64 unsup;
	int ret;

	ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
				  &rbd_dev->header_oloc, "get_features",
				  &snapid, sizeof(snapid),
				  &features_buf, sizeof(features_buf));
	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
	if (ret < 0)
		return ret;
	if (ret < sizeof (features_buf))
		return -ERANGE;

	unsup = le64_to_cpu(features_buf.incompat) & ~RBD_FEATURES_SUPPORTED;
	if (unsup) {
		rbd_warn(rbd_dev, "image uses unsupported features: 0x%llx",
			 unsup);
		return -ENXIO;
	}

	*snap_features = le64_to_cpu(features_buf.features);

	dout(" snap_id 0x%016llx features = 0x%016llx incompat = 0x%016llx\n",
	     (unsigned long long)snap_id,
	     (unsigned long long)*snap_features,
	     (unsigned long long)le64_to_cpu(features_buf.incompat));

	return 0;
}

static int rbd_dev_v2_features(struct rbd_device *rbd_dev)
{
	return _rbd_dev_v2_snap_features(rbd_dev, CEPH_NOSNAP,
					 &rbd_dev->header.features);
}

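/*
 * Parent image info for a cloned image: where the parent lives and
 * how much of it (the overlap) still backs this clone.
 */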
struct parent_image_info {
	u64		pool_id;
	const char	*pool_ns;
	const char	*image_id;
	u64		snap_id;

	bool		has_overlap;
	u64		overlap;
};

/*
 * The caller is responsible for @pii.
 */
static int decode_parent_image_spec(void **p, void *end,
				    struct parent_image_info *pii)
{
	u8 struct_v;
	u32 struct_len;
	int ret;

	ret = ceph_start_decoding(p, end, 1, "ParentImageSpec",
				  &struct_v, &struct_len);
	if (ret)
		return ret;

	ceph_decode_64_safe(p, end, pii->pool_id, e_inval);
	pii->pool_ns = ceph_extract_encoded_string(p, end, NULL, GFP_KERNEL);
	if (IS_ERR(pii->pool_ns)) {
		ret = PTR_ERR(pii->pool_ns);
		pii->pool_ns = NULL;
		return ret;
	}
	pii->image_id = ceph_extract_encoded_string(p, end, NULL, GFP_KERNEL);
	if (IS_ERR(pii->image_id)) {
		ret = PTR_ERR(pii->image_id);
		pii->image_id = NULL;
		return ret;
	}
	ceph_decode_64_safe(p, end, pii->snap_id, e_inval);
	return 0;

e_inval:
	return -EINVAL;
}

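/*
 * Fetch parent info using the parent_get and parent_overlap_get
 * methods.  Returns 1 if the OSD doesn't support them, so that the
 * caller can fall back to the legacy get_parent method.
 */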
static int __get_parent_info(struct rbd_device *rbd_dev,
			     struct page *req_page,
			     struct page *reply_page,
			     struct parent_image_info *pii)
{
	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
	size_t reply_len = PAGE_SIZE;
	void *p, *end;
	int ret;

	ret = ceph_osdc_call(osdc, &rbd_dev->header_oid, &rbd_dev->header_oloc,
			     "rbd", "parent_get", CEPH_OSD_FLAG_READ,
			     req_page, sizeof(u64), reply_page, &reply_len);
	if (ret)
		return ret == -EOPNOTSUPP ? 1 : ret;

	p = page_address(reply_page);
	end = p + reply_len;
	ret = decode_parent_image_spec(&p, end, pii);
	if (ret)
		return ret;

	ret = ceph_osdc_call(osdc, &rbd_dev->header_oid, &rbd_dev->header_oloc,
			     "rbd", "parent_overlap_get", CEPH_OSD_FLAG_READ,
			     req_page, sizeof(u64), reply_page, &reply_len);
	if (ret)
		return ret;

	p = page_address(reply_page);
	end = p + reply_len;
	ceph_decode_8_safe(&p, end, pii->has_overlap, e_inval);
	if (pii->has_overlap)
		ceph_decode_64_safe(&p, end, pii->overlap, e_inval);

	return 0;

e_inval:
	return -EINVAL;
}

/*
 * The caller is responsible for @pii.
 */
static int __get_parent_info_legacy(struct rbd_device *rbd_dev,
				    struct page *req_page,
				    struct page *reply_page,
				    struct parent_image_info *pii)
{
	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
	size_t reply_len = PAGE_SIZE;
	void *p, *end;
	int ret;

	ret = ceph_osdc_call(osdc, &rbd_dev->header_oid, &rbd_dev->header_oloc,
			     "rbd", "get_parent", CEPH_OSD_FLAG_READ,
			     req_page, sizeof(u64), reply_page, &reply_len);
	if (ret)
		return ret;

	p = page_address(reply_page);
	end = p + reply_len;
	ceph_decode_64_safe(&p, end, pii->pool_id, e_inval);
	pii->image_id = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL);
	if (IS_ERR(pii->image_id)) {
		ret = PTR_ERR(pii->image_id);
		pii->image_id = NULL;
		return ret;
	}
	ceph_decode_64_safe(&p, end, pii->snap_id, e_inval);
	pii->has_overlap = true;
	ceph_decode_64_safe(&p, end, pii->overlap, e_inval);

	return 0;

e_inval:
	return -EINVAL;
}

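/*
 * Fetch parent info for rbd_dev->spec->snap_id, preferring the modern
 * methods and falling back to get_parent on older OSDs.
 */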
static int get_parent_info(struct rbd_device *rbd_dev,
			   struct parent_image_info *pii)
{
	struct page *req_page, *reply_page;
	void *p;
	int ret;

	req_page = alloc_page(GFP_KERNEL);
	if (!req_page)
		return -ENOMEM;

	reply_page = alloc_page(GFP_KERNEL);
	if (!reply_page) {
		__free_page(req_page);
		return -ENOMEM;
	}

	p = page_address(req_page);
	ceph_encode_64(&p, rbd_dev->spec->snap_id);
	ret = __get_parent_info(rbd_dev, req_page, reply_page, pii);
	if (ret > 0)
		ret = __get_parent_info_legacy(rbd_dev, req_page, reply_page,
					       pii);

	__free_page(req_page);
	__free_page(reply_page);
	return ret;
}

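/*
 * Update rbd_dev's parent spec and overlap from freshly fetched
 * parent info, handling the case where the clone got flattened.
 */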
static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev)
{
	struct rbd_spec *parent_spec;
	struct parent_image_info pii = { 0 };
	int ret;

	parent_spec = rbd_spec_alloc();
	if (!parent_spec)
		return -ENOMEM;

	ret = get_parent_info(rbd_dev, &pii);
	if (ret)
		goto out_err;

	dout("%s pool_id %llu pool_ns %s image_id %s snap_id %llu has_overlap %d overlap %llu\n",
	     __func__, pii.pool_id, pii.pool_ns, pii.image_id, pii.snap_id,
	     pii.has_overlap, pii.overlap);

	if (pii.pool_id == CEPH_NOPOOL || !pii.has_overlap) {
		/*
		 * Either the parent never existed, or we have a
		 * record of it but the image got flattened so it no
		 * longer has a parent.  When the parent of a
		 * layered image disappears we immediately set the
		 * overlap to 0.  The effect of this is that all new
		 * requests will be treated as if the image had no
		 * parent.
		 *
		 * If !pii.has_overlap, the parent image spec is not
		 * applicable.  It's there to avoid duplication in each
		 * snapshot record.
		 */
		if (rbd_dev->parent_overlap) {
			rbd_dev->parent_overlap = 0;
			rbd_dev_parent_put(rbd_dev);
			pr_info("%s: clone image has been flattened\n",
				rbd_dev->disk->disk_name);
		}

		goto out;	/* No parent?  No problem. */
	}

	/* The ceph file layout needs to fit pool id in 32 bits */

	ret = -EIO;
	if (pii.pool_id > (u64)U32_MAX) {
		rbd_warn(NULL, "parent pool id too large (%llu > %u)",
			 (unsigned long long)pii.pool_id, U32_MAX);
		goto out_err;
	}

	/*
	 * The parent won't change (except when the clone is
	 * flattened, already handled that).  So we only need to
	 * record the parent spec if we have not already done so.
	 */
	if (!rbd_dev->parent_spec) {
		parent_spec->pool_id = pii.pool_id;
		if (pii.pool_ns && *pii.pool_ns) {
			parent_spec->pool_ns = pii.pool_ns;
			pii.pool_ns = NULL;
		}
		parent_spec->image_id = pii.image_id;
		pii.image_id = NULL;
		parent_spec->snap_id = pii.snap_id;

		rbd_dev->parent_spec = parent_spec;
		parent_spec = NULL;	/* rbd_dev now owns this */
	}

	/*
	 * We always update the parent overlap.  If it's zero we issue
	 * a warning, as we will proceed as if there was no parent.
	 */
	if (!pii.overlap) {
		if (parent_spec) {
			/* refresh, careful to warn just once */
			if (rbd_dev->parent_overlap)
				rbd_warn(rbd_dev,
				    "clone now standalone (overlap became 0)");
		} else {
			/* initial probe */
			rbd_warn(rbd_dev, "clone is standalone (overlap 0)");
		}
	}
	rbd_dev->parent_overlap = pii.overlap;

out:
	ret = 0;
out_err:
	kfree(pii.pool_ns);
	kfree(pii.image_id);
	rbd_spec_put(parent_spec);
	return ret;
}

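/* Fetch the stripe unit and stripe count for a v2 image */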
static int rbd_dev_v2_striping_info(struct rbd_device *rbd_dev)
{
	struct {
		__le64 stripe_unit;
		__le64 stripe_count;
	} __attribute__ ((packed)) striping_info_buf = { 0 };
	size_t size = sizeof (striping_info_buf);
	void *p;
	int ret;

	ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
				  &rbd_dev->header_oloc, "get_stripe_unit_count",
				  NULL, 0, &striping_info_buf, size);
	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
	if (ret < 0)
		return ret;
	if (ret < size)
		return -ERANGE;

	p = &striping_info_buf;
	rbd_dev->header.stripe_unit = ceph_decode_64(&p);
	rbd_dev->header.stripe_count = ceph_decode_64(&p);
	return 0;
}

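/* Fetch the id of the separate data pool, if the image uses one */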
static int rbd_dev_v2_data_pool(struct rbd_device *rbd_dev)
{
	__le64 data_pool_id;
	int ret;

	ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
				  &rbd_dev->header_oloc, "get_data_pool",
				  NULL, 0, &data_pool_id, sizeof(data_pool_id));
	if (ret < 0)
		return ret;
	if (ret < sizeof(data_pool_id))
		return -EBADMSG;

	rbd_dev->header.data_pool_id = le64_to_cpu(data_pool_id);
	WARN_ON(rbd_dev->header.data_pool_id == CEPH_NOPOOL);
	return 0;
}

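/*
 * Look up the image name for rbd_dev's image id in the pool's
 * RBD_DIRECTORY object.  Returns NULL on any failure.
 */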
static char *rbd_dev_image_name(struct rbd_device *rbd_dev)
{
	CEPH_DEFINE_OID_ONSTACK(oid);
	size_t image_id_size;
	char *image_id;
	void *p;
	void *end;
	size_t size;
	void *reply_buf = NULL;
	size_t len = 0;
	char *image_name = NULL;
	int ret;

	rbd_assert(!rbd_dev->spec->image_name);

	len = strlen(rbd_dev->spec->image_id);
	image_id_size = sizeof (__le32) + len;
	image_id = kmalloc(image_id_size, GFP_KERNEL);
	if (!image_id)
		return NULL;

	p = image_id;
	end = image_id + image_id_size;
	ceph_encode_string(&p, end, rbd_dev->spec->image_id, (u32)len);

	size = sizeof (__le32) + RBD_IMAGE_NAME_LEN_MAX;
	reply_buf = kmalloc(size, GFP_KERNEL);
	if (!reply_buf)
		goto out;

	ceph_oid_printf(&oid, "%s", RBD_DIRECTORY);
	ret = rbd_obj_method_sync(rbd_dev, &oid, &rbd_dev->header_oloc,
				  "dir_get_name", image_id, image_id_size,
				  reply_buf, size);
	if (ret < 0)
		goto out;
	p = reply_buf;
	end = reply_buf + ret;

	image_name = ceph_extract_encoded_string(&p, end, &len, GFP_KERNEL);
	if (IS_ERR(image_name))
		image_name = NULL;
	else
		dout("%s: name is %s len is %zd\n", __func__, image_name, len);
out:
	kfree(reply_buf);
	kfree(image_id);

	return image_name;
}

Alex Elder2ad3d712013-04-30 00:44:33 -05005234static u64 rbd_v1_snap_id_by_name(struct rbd_device *rbd_dev, const char *name)
5235{
5236 struct ceph_snap_context *snapc = rbd_dev->header.snapc;
5237 const char *snap_name;
5238 u32 which = 0;
5239
5240 /* Skip over names until we find the one we are looking for */
5241
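	/*
	 * Illustrative layout (not from the original source): the v1
	 * names buffer packs NUL-terminated strings back to back, one
	 * per entry of snapc->snaps, e.g. "snap1\0snap2\0".
	 */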
5242 snap_name = rbd_dev->header.snap_names;
5243 while (which < snapc->num_snaps) {
5244 if (!strcmp(name, snap_name))
5245 return snapc->snaps[which];
5246 snap_name += strlen(snap_name) + 1;
5247 which++;
5248 }
5249 return CEPH_NOSNAP;
5250}
5251
5252static u64 rbd_v2_snap_id_by_name(struct rbd_device *rbd_dev, const char *name)
5253{
5254 struct ceph_snap_context *snapc = rbd_dev->header.snapc;
5255 u32 which;
5256 bool found = false;
5257 u64 snap_id;
5258
5259 for (which = 0; !found && which < snapc->num_snaps; which++) {
5260 const char *snap_name;
5261
5262 snap_id = snapc->snaps[which];
5263 snap_name = rbd_dev_v2_snap_name(rbd_dev, snap_id);
Josh Durginefadc982013-08-29 19:16:42 -07005264 if (IS_ERR(snap_name)) {
5265 /* ignore no-longer existing snapshots */
5266 if (PTR_ERR(snap_name) == -ENOENT)
5267 continue;
5268 else
5269 break;
5270 }
Alex Elder2ad3d712013-04-30 00:44:33 -05005271 found = !strcmp(name, snap_name);
5272 kfree(snap_name);
5273 }
5274 return found ? snap_id : CEPH_NOSNAP;
5275}
5276
5277/*
5278 * Assumes name is never RBD_SNAP_HEAD_NAME; returns CEPH_NOSNAP if
5279 * no snapshot by that name is found, or if an error occurs.
5280 */
5281static u64 rbd_snap_id_by_name(struct rbd_device *rbd_dev, const char *name)
5282{
5283 if (rbd_dev->image_format == 1)
5284 return rbd_v1_snap_id_by_name(rbd_dev, name);
5285
5286 return rbd_v2_snap_id_by_name(rbd_dev, name);
5287}
5288
Alex Elder9e15b772012-10-30 19:40:33 -05005289/*
Ilya Dryomov04077592014-07-23 17:11:20 +04005290 * An image being mapped will have everything but the snap id.
Alex Elder9e15b772012-10-30 19:40:33 -05005291 */
Ilya Dryomov04077592014-07-23 17:11:20 +04005292static int rbd_spec_fill_snap_id(struct rbd_device *rbd_dev)
5293{
5294 struct rbd_spec *spec = rbd_dev->spec;
5295
5296 rbd_assert(spec->pool_id != CEPH_NOPOOL && spec->pool_name);
5297 rbd_assert(spec->image_id && spec->image_name);
5298 rbd_assert(spec->snap_name);
5299
5300 if (strcmp(spec->snap_name, RBD_SNAP_HEAD_NAME)) {
5301 u64 snap_id;
5302
5303 snap_id = rbd_snap_id_by_name(rbd_dev, spec->snap_name);
5304 if (snap_id == CEPH_NOSNAP)
5305 return -ENOENT;
5306
5307 spec->snap_id = snap_id;
5308 } else {
5309 spec->snap_id = CEPH_NOSNAP;
5310 }
5311
5312 return 0;
5313}
5314
5315/*
5316 * A parent image will have all ids but none of the names.
5317 *
5318 * All names in an rbd spec are dynamically allocated. It's OK if we
5319 * can't figure out the name for an image id.
5320 */
5321static int rbd_spec_fill_names(struct rbd_device *rbd_dev)
Alex Elder9e15b772012-10-30 19:40:33 -05005322{
Alex Elder2e9f7f12013-04-26 09:43:48 -05005323 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
5324 struct rbd_spec *spec = rbd_dev->spec;
5325 const char *pool_name;
5326 const char *image_name;
5327 const char *snap_name;
Alex Elder9e15b772012-10-30 19:40:33 -05005328 int ret;
5329
Ilya Dryomov04077592014-07-23 17:11:20 +04005330 rbd_assert(spec->pool_id != CEPH_NOPOOL);
5331 rbd_assert(spec->image_id);
5332 rbd_assert(spec->snap_id != CEPH_NOSNAP);
Alex Elder9e15b772012-10-30 19:40:33 -05005333
Alex Elder2e9f7f12013-04-26 09:43:48 -05005334 /* Get the pool name; we have to make our own copy of this */
Alex Elder9e15b772012-10-30 19:40:33 -05005335
Alex Elder2e9f7f12013-04-26 09:43:48 -05005336 pool_name = ceph_pg_pool_name_by_id(osdc->osdmap, spec->pool_id);
5337 if (!pool_name) {
5338 rbd_warn(rbd_dev, "no pool with id %llu", spec->pool_id);
Alex Elder935dc892012-11-01 10:17:15 -05005339 return -EIO;
5340 }
Alex Elder2e9f7f12013-04-26 09:43:48 -05005341 pool_name = kstrdup(pool_name, GFP_KERNEL);
5342 if (!pool_name)
Alex Elder9e15b772012-10-30 19:40:33 -05005343 return -ENOMEM;
5344
5345 /* Fetch the image name; tolerate failure here */
5346
Alex Elder2e9f7f12013-04-26 09:43:48 -05005347 image_name = rbd_dev_image_name(rbd_dev);
5348 if (!image_name)
Alex Elder06ecc6c2012-11-01 10:17:15 -05005349 rbd_warn(rbd_dev, "unable to get image name");
Alex Elder9e15b772012-10-30 19:40:33 -05005350
Ilya Dryomov04077592014-07-23 17:11:20 +04005351 /* Fetch the snapshot name */
Alex Elder9e15b772012-10-30 19:40:33 -05005352
Alex Elder2e9f7f12013-04-26 09:43:48 -05005353 snap_name = rbd_snap_name(rbd_dev, spec->snap_id);
Josh Durginda6a6b62013-09-04 17:57:31 -07005354 if (IS_ERR(snap_name)) {
5355 ret = PTR_ERR(snap_name);
Alex Elder9e15b772012-10-30 19:40:33 -05005356 goto out_err;
Alex Elder2e9f7f12013-04-26 09:43:48 -05005357 }
5358
5359 spec->pool_name = pool_name;
5360 spec->image_name = image_name;
5361 spec->snap_name = snap_name;
Alex Elder9e15b772012-10-30 19:40:33 -05005362
5363 return 0;
Ilya Dryomov04077592014-07-23 17:11:20 +04005364
Alex Elder9e15b772012-10-30 19:40:33 -05005365out_err:
Alex Elder2e9f7f12013-04-26 09:43:48 -05005366 kfree(image_name);
5367 kfree(pool_name);
Alex Elder9e15b772012-10-30 19:40:33 -05005368 return ret;
5369}
5370
Alex Eldercc4a38bd2013-04-30 00:44:33 -05005371static int rbd_dev_v2_snap_context(struct rbd_device *rbd_dev)
Alex Elder35d489f2012-07-03 16:01:19 -05005372{
5373 size_t size;
5374 int ret;
5375 void *reply_buf;
5376 void *p;
5377 void *end;
5378 u64 seq;
5379 u32 snap_count;
5380 struct ceph_snap_context *snapc;
5381 u32 i;
5382
5383 /*
5384 * We'll need room for the seq value (maximum snapshot id),
5385 * snapshot count, and array of that many snapshot ids.
5386 * For now we have a fixed upper limit on the number we're
5387 * prepared to receive.
5388 */
5389 size = sizeof (__le64) + sizeof (__le32) +
5390 RBD_MAX_SNAP_COUNT * sizeof (__le64);
5391 reply_buf = kzalloc(size, GFP_KERNEL);
5392 if (!reply_buf)
5393 return -ENOMEM;
5394
Ilya Dryomovecd4a682017-01-25 18:16:21 +01005395 ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
5396 &rbd_dev->header_oloc, "get_snapcontext",
5397 NULL, 0, reply_buf, size);
Alex Elder36be9a72013-01-19 00:30:28 -06005398 dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
Alex Elder35d489f2012-07-03 16:01:19 -05005399 if (ret < 0)
5400 goto out;
5401
Alex Elder35d489f2012-07-03 16:01:19 -05005402 p = reply_buf;
Alex Elder57385b52013-04-21 12:14:45 -05005403 end = reply_buf + ret;
5404 ret = -ERANGE;
Alex Elder35d489f2012-07-03 16:01:19 -05005405 ceph_decode_64_safe(&p, end, seq, out);
5406 ceph_decode_32_safe(&p, end, snap_count, out);
5407
5408 /*
5409 * Make sure the reported number of snapshot ids wouldn't go
5410 * beyond the end of our buffer. But before checking that,
5411 * make sure the computed size of the snapshot context we
5412 * allocate is representable in a size_t.
5413 */
5414 if (snap_count > (SIZE_MAX - sizeof (struct ceph_snap_context))
5415 / sizeof (u64)) {
5416 ret = -EINVAL;
5417 goto out;
5418 }
5419 if (!ceph_has_room(&p, end, snap_count * sizeof (__le64)))
5420 goto out;
Alex Elder468521c2013-04-26 09:43:47 -05005421 ret = 0;
Alex Elder35d489f2012-07-03 16:01:19 -05005422
Alex Elder812164f82013-04-30 00:44:32 -05005423 snapc = ceph_create_snap_context(snap_count, GFP_KERNEL);
Alex Elder35d489f2012-07-03 16:01:19 -05005424 if (!snapc) {
5425 ret = -ENOMEM;
5426 goto out;
5427 }
Alex Elder35d489f2012-07-03 16:01:19 -05005428 snapc->seq = seq;
Alex Elder35d489f2012-07-03 16:01:19 -05005429 for (i = 0; i < snap_count; i++)
5430 snapc->snaps[i] = ceph_decode_64(&p);
5431
Alex Elder49ece552013-05-06 08:37:00 -05005432 ceph_put_snap_context(rbd_dev->header.snapc);
Alex Elder35d489f2012-07-03 16:01:19 -05005433 rbd_dev->header.snapc = snapc;
5434
5435 dout(" snap context seq = %llu, snap_count = %u\n",
Alex Elder57385b52013-04-21 12:14:45 -05005436 (unsigned long long)seq, (unsigned int)snap_count);
Alex Elder35d489f2012-07-03 16:01:19 -05005437out:
5438 kfree(reply_buf);
5439
Alex Elder57385b52013-04-21 12:14:45 -05005440 return ret;
Alex Elder35d489f2012-07-03 16:01:19 -05005441}
5442
Alex Elder54cac612013-04-30 00:44:33 -05005443static const char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev,
5444 u64 snap_id)
Alex Elderb8b1e2d2012-07-03 16:01:19 -05005445{
5446 size_t size;
5447 void *reply_buf;
Alex Elder54cac612013-04-30 00:44:33 -05005448 __le64 snapid;
Alex Elderb8b1e2d2012-07-03 16:01:19 -05005449 int ret;
5450 void *p;
5451 void *end;
Alex Elderb8b1e2d2012-07-03 16:01:19 -05005452 char *snap_name;
5453
5454 size = sizeof (__le32) + RBD_MAX_SNAP_NAME_LEN;
5455 reply_buf = kmalloc(size, GFP_KERNEL);
5456 if (!reply_buf)
5457 return ERR_PTR(-ENOMEM);
5458
Alex Elder54cac612013-04-30 00:44:33 -05005459 snapid = cpu_to_le64(snap_id);
Ilya Dryomovecd4a682017-01-25 18:16:21 +01005460 ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
5461 &rbd_dev->header_oloc, "get_snapshot_name",
5462 &snapid, sizeof(snapid), reply_buf, size);
Alex Elder36be9a72013-01-19 00:30:28 -06005463 dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
Alex Elderf40eb342013-04-25 15:09:42 -05005464 if (ret < 0) {
5465 snap_name = ERR_PTR(ret);
Alex Elderb8b1e2d2012-07-03 16:01:19 -05005466 goto out;
Alex Elderf40eb342013-04-25 15:09:42 -05005467 }
Alex Elderb8b1e2d2012-07-03 16:01:19 -05005468
5469 p = reply_buf;
Alex Elderf40eb342013-04-25 15:09:42 -05005470 end = reply_buf + ret;
Alex Eldere5c35532012-10-25 23:34:41 -05005471 snap_name = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL);
Alex Elderf40eb342013-04-25 15:09:42 -05005472 if (IS_ERR(snap_name))
Alex Elderb8b1e2d2012-07-03 16:01:19 -05005473 goto out;
Alex Elderb8b1e2d2012-07-03 16:01:19 -05005474
Alex Elderf40eb342013-04-25 15:09:42 -05005475 dout(" snap_id 0x%016llx snap_name = %s\n",
Alex Elder54cac612013-04-30 00:44:33 -05005476 (unsigned long long)snap_id, snap_name);
Alex Elderb8b1e2d2012-07-03 16:01:19 -05005477out:
5478 kfree(reply_buf);
5479
Alex Elderf40eb342013-04-25 15:09:42 -05005480 return snap_name;
Alex Elderb8b1e2d2012-07-03 16:01:19 -05005481}
5482
Alex Elder2df3fac2013-05-06 09:51:30 -05005483static int rbd_dev_v2_header_info(struct rbd_device *rbd_dev)
Alex Elder117973f2012-08-31 17:29:55 -05005484{
Alex Elder2df3fac2013-05-06 09:51:30 -05005485 bool first_time = rbd_dev->header.object_prefix == NULL;
Alex Elder117973f2012-08-31 17:29:55 -05005486 int ret;
Alex Elder117973f2012-08-31 17:29:55 -05005487
Josh Durgin1617e402013-06-12 14:43:10 -07005488 ret = rbd_dev_v2_image_size(rbd_dev);
5489 if (ret)
Alex Eldercfbf6372013-05-31 17:40:45 -05005490 return ret;
Josh Durgin1617e402013-06-12 14:43:10 -07005491
Alex Elder2df3fac2013-05-06 09:51:30 -05005492 if (first_time) {
5493 ret = rbd_dev_v2_header_onetime(rbd_dev);
5494 if (ret)
Alex Eldercfbf6372013-05-31 17:40:45 -05005495 return ret;
Alex Elder2df3fac2013-05-06 09:51:30 -05005496 }
5497
Alex Eldercc4a38bd2013-04-30 00:44:33 -05005498 ret = rbd_dev_v2_snap_context(rbd_dev);
Ilya Dryomovd194cd12015-08-31 18:22:10 +03005499 if (ret && first_time) {
5500 kfree(rbd_dev->header.object_prefix);
5501 rbd_dev->header.object_prefix = NULL;
5502 }
Alex Elder117973f2012-08-31 17:29:55 -05005503
5504 return ret;
5505}
5506
Ilya Dryomova720ae02014-07-23 17:11:19 +04005507static int rbd_dev_header_info(struct rbd_device *rbd_dev)
5508{
5509 rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
5510
5511 if (rbd_dev->image_format == 1)
5512 return rbd_dev_v1_header_info(rbd_dev);
5513
5514 return rbd_dev_v2_header_info(rbd_dev);
5515}
5516
Alex Elder1ddbe942012-01-29 13:57:44 -06005517/*
Alex Eldere28fff262012-02-02 08:13:30 -06005518 * Skips over white space at *buf, and updates *buf to point to the
5519 * first found non-space character (if any). Returns the length of
Alex Elder593a9e72012-02-07 12:03:37 -06005520 * the token (string of non-white space characters) found. Note
5521 * that *buf must be terminated with '\0'.
Alex Eldere28fff262012-02-02 08:13:30 -06005522 */
5523static inline size_t next_token(const char **buf)
5524{
5525 /*
5526 * These are the characters that produce nonzero for
5527 * isspace() in the "C" and "POSIX" locales.
5528 */
5529 const char *spaces = " \f\n\r\t\v";
5530
5531 *buf += strspn(*buf, spaces); /* Find start of token */
5532
5533 return strcspn(*buf, spaces); /* Return token length */
5534}
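/*
 * Worked example (illustrative only): with *buf pointing at "  pool img",
 * next_token(&buf) advances *buf to "pool img" and returns 4, the length
 * of the token "pool".
 */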
5535
5536/*
Alex Elderea3352f2012-07-09 21:04:23 -05005537 * Finds the next token in *buf, dynamically allocates a buffer big
5538 * enough to hold a copy of it, and copies the token into the new
5539 * buffer. The copy is guaranteed to be terminated with '\0'. Note
5540 * that a duplicate buffer is created even for a zero-length token.
5541 *
5542 * Returns a pointer to the newly-allocated duplicate, or a null
5543 * pointer if memory for the duplicate was not available. If
5544 * the lenp argument is a non-null pointer, the length of the token
5545 * (not including the '\0') is returned in *lenp.
5546 *
5547 * If successful, the *buf pointer will be updated to point beyond
5548 * the end of the found token.
5549 *
5550 * Note: uses GFP_KERNEL for allocation.
5551 */
5552static inline char *dup_token(const char **buf, size_t *lenp)
5553{
5554 char *dup;
5555 size_t len;
5556
5557 len = next_token(buf);
Alex Elder4caf35f2012-11-01 08:39:27 -05005558 dup = kmemdup(*buf, len + 1, GFP_KERNEL);
Alex Elderea3352f2012-07-09 21:04:23 -05005559 if (!dup)
5560 return NULL;
Alex Elderea3352f2012-07-09 21:04:23 -05005561 *(dup + len) = '\0';
5562 *buf += len;
5563
5564 if (lenp)
5565 *lenp = len;
5566
5567 return dup;
5568}
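/*
 * Illustrative use (names made up): successive calls peel
 * whitespace-separated tokens off a buffer:
 *
 *	const char *p = "mypool myimage";
 *	char *pool = dup_token(&p, NULL);	(pool is "mypool")
 *	char *image = dup_token(&p, NULL);	(image is "myimage")
 *
 * Each duplicate is kmalloc'ed and must be freed with kfree().
 */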
5569
5570/*
Alex Elder859c31d2012-10-25 23:34:42 -05005571 * Parse the options provided for an "rbd add" (i.e., rbd image
5572 * mapping) request. These arrive via a write to /sys/bus/rbd/add,
5573 * and the data written is passed here via a NUL-terminated buffer.
5574 * Returns 0 if successful or an error code otherwise.
Alex Elderd22f76e2012-07-12 10:46:35 -05005575 *
Alex Elder859c31d2012-10-25 23:34:42 -05005576 * The information extracted from these options is recorded in
5577 * the other parameters which return dynamically-allocated
5578 * structures:
5579 * ceph_opts
5580 * The address of a pointer that will refer to a ceph options
5581 * structure. Caller must release the returned pointer using
5582 * ceph_destroy_options() when it is no longer needed.
5583 * rbd_opts
5584 * Address of an rbd options pointer. Fully initialized by
5585 * this function; caller must release with kfree().
5586 * spec
5587 * Address of an rbd image specification pointer. Fully
5588 * initialized by this function based on parsed options.
5589 * Caller must release with rbd_spec_put().
5590 *
5591 * The options passed take this form:
5592 * <mon_addrs> <options> <pool_name> <image_name> [<snap_name>]
5593 * where:
5594 * <mon_addrs>
5595 * A comma-separated list of one or more monitor addresses.
5596 * A monitor address is an ip address, optionally followed
5597 * by a port number (separated by a colon).
5598 * I.e.: ip1[:port1][,ip2[:port2]...]
5599 * <options>
5600 * A comma-separated list of ceph and/or rbd options.
5601 * <pool_name>
5602 * The name of the rados pool containing the rbd image.
5603 * <image_name>
5604 * The name of the image in that pool to map.
5605 * <snap_name>
5606 * An optional snapshot name. If provided, the mapping will
5607 * present data from the image at the time that snapshot was
5608 * created. The image head is used if no snapshot name is
5609 * provided. Snapshot mappings are always read-only.
Alex Eldera725f65e2012-02-02 08:13:30 -06005610 */
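/*
 * For example (hypothetical values), a map request could be written as
 *
 *	$ echo "1.2.3.4:6789 name=admin,secret=<key> mypool myimage -" \
 *		> /sys/bus/rbd/add
 *
 * which maps the head of image "myimage" in pool "mypool"; the monitor
 * address and credentials above are placeholders.
 */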
Alex Elder859c31d2012-10-25 23:34:42 -05005611static int rbd_add_parse_args(const char *buf,
Alex Elderdc79b112012-10-25 23:34:41 -05005612 struct ceph_options **ceph_opts,
Alex Elder859c31d2012-10-25 23:34:42 -05005613 struct rbd_options **opts,
5614 struct rbd_spec **rbd_spec)
Alex Eldera725f65e2012-02-02 08:13:30 -06005615{
Alex Elderd22f76e2012-07-12 10:46:35 -05005616 size_t len;
Alex Elder859c31d2012-10-25 23:34:42 -05005617 char *options;
Alex Elder0ddebc02012-10-25 23:34:41 -05005618 const char *mon_addrs;
Alex Elderecb4dc222013-04-26 09:43:47 -05005619 char *snap_name;
Alex Elder0ddebc02012-10-25 23:34:41 -05005620 size_t mon_addrs_size;
Ilya Dryomovc3001562018-07-03 15:28:43 +02005621 struct parse_rbd_opts_ctx pctx = { 0 };
Alex Elder859c31d2012-10-25 23:34:42 -05005622 struct ceph_options *copts;
Alex Elderdc79b112012-10-25 23:34:41 -05005623 int ret;
Alex Eldere28fff262012-02-02 08:13:30 -06005624
5625 /* The first four tokens are required */
5626
Alex Elder7ef32142012-02-02 08:13:30 -06005627 len = next_token(&buf);
Alex Elder4fb5d6712012-11-01 10:17:15 -05005628 if (!len) {
5629 rbd_warn(NULL, "no monitor address(es) provided");
5630 return -EINVAL;
5631 }
Alex Elder0ddebc02012-10-25 23:34:41 -05005632 mon_addrs = buf;
Alex Elderf28e5652012-10-25 23:34:41 -05005633 mon_addrs_size = len + 1;
Alex Elder7ef32142012-02-02 08:13:30 -06005634 buf += len;
Alex Eldera725f65e2012-02-02 08:13:30 -06005635
Alex Elderdc79b112012-10-25 23:34:41 -05005636 ret = -EINVAL;
Alex Elderf28e5652012-10-25 23:34:41 -05005637 options = dup_token(&buf, NULL);
5638 if (!options)
Alex Elderdc79b112012-10-25 23:34:41 -05005639 return -ENOMEM;
Alex Elder4fb5d6712012-11-01 10:17:15 -05005640 if (!*options) {
5641 rbd_warn(NULL, "no options provided");
5642 goto out_err;
5643 }
Alex Eldera725f65e2012-02-02 08:13:30 -06005644
Ilya Dryomovc3001562018-07-03 15:28:43 +02005645 pctx.spec = rbd_spec_alloc();
5646 if (!pctx.spec)
Alex Elderf28e5652012-10-25 23:34:41 -05005647 goto out_mem;
Alex Elder859c31d2012-10-25 23:34:42 -05005648
Ilya Dryomovc3001562018-07-03 15:28:43 +02005649 pctx.spec->pool_name = dup_token(&buf, NULL);
5650 if (!pctx.spec->pool_name)
Alex Elder859c31d2012-10-25 23:34:42 -05005651 goto out_mem;
Ilya Dryomovc3001562018-07-03 15:28:43 +02005652 if (!*pctx.spec->pool_name) {
Alex Elder4fb5d6712012-11-01 10:17:15 -05005653 rbd_warn(NULL, "no pool name provided");
5654 goto out_err;
5655 }
Alex Eldere28fff262012-02-02 08:13:30 -06005656
Ilya Dryomovc3001562018-07-03 15:28:43 +02005657 pctx.spec->image_name = dup_token(&buf, NULL);
5658 if (!pctx.spec->image_name)
Alex Elderf28e5652012-10-25 23:34:41 -05005659 goto out_mem;
Ilya Dryomovc3001562018-07-03 15:28:43 +02005660 if (!*pctx.spec->image_name) {
Alex Elder4fb5d6712012-11-01 10:17:15 -05005661 rbd_warn(NULL, "no image name provided");
5662 goto out_err;
5663 }
Alex Eldere28fff262012-02-02 08:13:30 -06005664
Alex Elderf28e5652012-10-25 23:34:41 -05005665 /*
5666 * Snapshot name is optional; default is to use "-"
5667 * (indicating the head/no snapshot).
5668 */
Alex Elder3feeb8942012-08-31 17:29:52 -05005669 len = next_token(&buf);
Alex Elder820a5f32012-07-09 21:04:24 -05005670 if (!len) {
Alex Elder3feeb8942012-08-31 17:29:52 -05005671 buf = RBD_SNAP_HEAD_NAME; /* No snapshot supplied */
5672 len = sizeof (RBD_SNAP_HEAD_NAME) - 1;
Alex Elderf28e5652012-10-25 23:34:41 -05005673 } else if (len > RBD_MAX_SNAP_NAME_LEN) {
Alex Elderdc79b112012-10-25 23:34:41 -05005674 ret = -ENAMETOOLONG;
Alex Elderf28e5652012-10-25 23:34:41 -05005675 goto out_err;
Alex Elder849b4262012-07-09 21:04:24 -05005676 }
Alex Elderecb4dc222013-04-26 09:43:47 -05005677 snap_name = kmemdup(buf, len + 1, GFP_KERNEL);
5678 if (!snap_name)
Alex Elderf28e5652012-10-25 23:34:41 -05005679 goto out_mem;
Alex Elderecb4dc222013-04-26 09:43:47 -05005680 *(snap_name + len) = '\0';
Ilya Dryomovc3001562018-07-03 15:28:43 +02005681 pctx.spec->snap_name = snap_name;
Alex Eldere5c35532012-10-25 23:34:41 -05005682
Alex Elder0ddebc02012-10-25 23:34:41 -05005683 /* Initialize all rbd options to the defaults */
Alex Eldere28fff262012-02-02 08:13:30 -06005684
Ilya Dryomovc3001562018-07-03 15:28:43 +02005685 pctx.opts = kzalloc(sizeof(*pctx.opts), GFP_KERNEL);
5686 if (!pctx.opts)
Alex Elder4e9afeb2012-10-25 23:34:41 -05005687 goto out_mem;
5688
Ilya Dryomovc3001562018-07-03 15:28:43 +02005689 pctx.opts->read_only = RBD_READ_ONLY_DEFAULT;
5690 pctx.opts->queue_depth = RBD_QUEUE_DEPTH_DEFAULT;
Ilya Dryomov0c93e1b2019-01-30 15:14:48 +01005691 pctx.opts->alloc_size = RBD_ALLOC_SIZE_DEFAULT;
Ilya Dryomovc3001562018-07-03 15:28:43 +02005692 pctx.opts->lock_timeout = RBD_LOCK_TIMEOUT_DEFAULT;
5693 pctx.opts->lock_on_read = RBD_LOCK_ON_READ_DEFAULT;
5694 pctx.opts->exclusive = RBD_EXCLUSIVE_DEFAULT;
5695 pctx.opts->trim = RBD_TRIM_DEFAULT;
Alex Elderd22f76e2012-07-12 10:46:35 -05005696
Alex Elder859c31d2012-10-25 23:34:42 -05005697 copts = ceph_parse_options(options, mon_addrs,
Ilya Dryomovc3001562018-07-03 15:28:43 +02005698 mon_addrs + mon_addrs_size - 1,
5699 parse_rbd_opts_token, &pctx);
Alex Elder859c31d2012-10-25 23:34:42 -05005700 if (IS_ERR(copts)) {
5701 ret = PTR_ERR(copts);
Alex Elderdc79b112012-10-25 23:34:41 -05005702 goto out_err;
5703 }
Alex Elder859c31d2012-10-25 23:34:42 -05005704 kfree(options);
5705
5706 *ceph_opts = copts;
Ilya Dryomovc3001562018-07-03 15:28:43 +02005707 *opts = pctx.opts;
5708 *rbd_spec = pctx.spec;
Alex Elder0ddebc02012-10-25 23:34:41 -05005709
Alex Elderdc79b112012-10-25 23:34:41 -05005710 return 0;
Alex Elderf28e5652012-10-25 23:34:41 -05005711out_mem:
Alex Elderdc79b112012-10-25 23:34:41 -05005712 ret = -ENOMEM;
Alex Elderd22f76e2012-07-12 10:46:35 -05005713out_err:
Ilya Dryomovc3001562018-07-03 15:28:43 +02005714 kfree(pctx.opts);
5715 rbd_spec_put(pctx.spec);
Alex Elderf28e5652012-10-25 23:34:41 -05005716 kfree(options);
Alex Elderd22f76e2012-07-12 10:46:35 -05005717
Alex Elderdc79b112012-10-25 23:34:41 -05005718 return ret;
Alex Eldera725f65e2012-02-02 08:13:30 -06005719}
5720
Ilya Dryomove010dd02017-04-13 12:17:39 +02005721static void rbd_dev_image_unlock(struct rbd_device *rbd_dev)
5722{
5723 down_write(&rbd_dev->lock_rwsem);
5724 if (__rbd_is_lock_owner(rbd_dev))
5725 rbd_unlock(rbd_dev);
5726 up_write(&rbd_dev->lock_rwsem);
5727}
5728
5729static int rbd_add_acquire_lock(struct rbd_device *rbd_dev)
5730{
Ilya Dryomov2f18d462018-04-04 10:15:38 +02005731 int ret;
5732
Ilya Dryomove010dd02017-04-13 12:17:39 +02005733 if (!(rbd_dev->header.features & RBD_FEATURE_EXCLUSIVE_LOCK)) {
5734 rbd_warn(rbd_dev, "exclusive-lock feature is not enabled");
5735 return -EINVAL;
5736 }
5737
5738 /* FIXME: "rbd map --exclusive" should be interruptible */
5739 down_read(&rbd_dev->lock_rwsem);
Ilya Dryomov2f18d462018-04-04 10:15:38 +02005740 ret = rbd_wait_state_locked(rbd_dev, true);
Ilya Dryomove010dd02017-04-13 12:17:39 +02005741 up_read(&rbd_dev->lock_rwsem);
Ilya Dryomov2f18d462018-04-04 10:15:38 +02005742 if (ret) {
Ilya Dryomove010dd02017-04-13 12:17:39 +02005743 rbd_warn(rbd_dev, "failed to acquire exclusive lock");
5744 return -EROFS;
5745 }
5746
5747 return 0;
5748}
5749
Ilya Dryomov30ba1f02014-05-13 11:19:27 +04005750/*
Alex Elder589d30e2012-07-10 20:30:11 -05005751 * An rbd format 2 image has a unique identifier, distinct from the
5752 * name given to it by the user. Internally, that identifier is
5753 * what's used to specify the names of objects related to the image.
5754 *
5755 * A special "rbd id" object is used to map an rbd image name to its
5756 * id. If that object doesn't exist, then there is no v2 rbd image
5757 * with the supplied name.
5758 *
5759 * This function will record the given rbd_dev's image_id field if
5760 * it can be determined, and in that case will return 0. If any
5761 * errors occur a negative errno will be returned and the rbd_dev's
5762 * image_id field will be unchanged (and should be NULL).
5763 */
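/*
 * Illustration: for an image named "foo", the id is expected to live in
 * the "rbd_id.foo" object (RBD_ID_PREFIX + image name, built below), and
 * the "get_id" class method returns the encoded id string.
 */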
5764static int rbd_dev_image_id(struct rbd_device *rbd_dev)
5765{
5766 int ret;
5767 size_t size;
Ilya Dryomovecd4a682017-01-25 18:16:21 +01005768 CEPH_DEFINE_OID_ONSTACK(oid);
Alex Elder589d30e2012-07-10 20:30:11 -05005769 void *response;
Alex Elderc0fba362013-04-25 23:15:08 -05005770 char *image_id;
Alex Elder2f82ee52012-10-30 19:40:33 -05005771
Alex Elder589d30e2012-07-10 20:30:11 -05005772 /*
Alex Elder2c0d0a12012-10-30 19:40:33 -05005773 * When probing a parent image, the image id is already
5774 * known (and the image name likely is not). There's no
Alex Elderc0fba362013-04-25 23:15:08 -05005775 * need to fetch the image id again in this case. We
5776 * do still need to set the image format though.
Alex Elder2c0d0a12012-10-30 19:40:33 -05005777 */
Alex Elderc0fba362013-04-25 23:15:08 -05005778 if (rbd_dev->spec->image_id) {
5779 rbd_dev->image_format = *rbd_dev->spec->image_id ? 2 : 1;
5780
Alex Elder2c0d0a12012-10-30 19:40:33 -05005781 return 0;
Alex Elderc0fba362013-04-25 23:15:08 -05005782 }
Alex Elder2c0d0a12012-10-30 19:40:33 -05005783
5784 /*
Alex Elder589d30e2012-07-10 20:30:11 -05005785 * First, see if the format 2 image id file exists, and if
5786 * so, get the image's persistent id from it.
5787 */
Ilya Dryomovecd4a682017-01-25 18:16:21 +01005788 ret = ceph_oid_aprintf(&oid, GFP_KERNEL, "%s%s", RBD_ID_PREFIX,
5789 rbd_dev->spec->image_name);
5790 if (ret)
5791 return ret;
5792
5793 dout("rbd id object name is %s\n", oid.name);
Alex Elder589d30e2012-07-10 20:30:11 -05005794
5795 /* Response will be an encoded string, which includes a length */
5796
5797 size = sizeof (__le32) + RBD_IMAGE_ID_LEN_MAX;
5798 response = kzalloc(size, GFP_NOIO);
5799 if (!response) {
5800 ret = -ENOMEM;
5801 goto out;
5802 }
5803
Alex Elderc0fba362013-04-25 23:15:08 -05005804 /* If it doesn't exist we'll assume it's a format 1 image */
5805
Ilya Dryomovecd4a682017-01-25 18:16:21 +01005806 ret = rbd_obj_method_sync(rbd_dev, &oid, &rbd_dev->header_oloc,
5807 "get_id", NULL, 0,
5808 response, RBD_IMAGE_ID_LEN_MAX);
Alex Elder36be9a72013-01-19 00:30:28 -06005809 dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
Alex Elderc0fba362013-04-25 23:15:08 -05005810 if (ret == -ENOENT) {
5811 image_id = kstrdup("", GFP_KERNEL);
5812 ret = image_id ? 0 : -ENOMEM;
5813 if (!ret)
5814 rbd_dev->image_format = 1;
Ilya Dryomov7dd440c2014-09-11 18:49:18 +04005815 } else if (ret >= 0) {
Alex Elderc0fba362013-04-25 23:15:08 -05005816 void *p = response;
Alex Elder589d30e2012-07-10 20:30:11 -05005817
Alex Elderc0fba362013-04-25 23:15:08 -05005818 image_id = ceph_extract_encoded_string(&p, p + ret,
Alex Elder979ed482012-11-01 08:39:26 -05005819 NULL, GFP_NOIO);
Duan Jiong461f7582014-04-11 16:38:12 +08005820 ret = PTR_ERR_OR_ZERO(image_id);
Alex Elderc0fba362013-04-25 23:15:08 -05005821 if (!ret)
5822 rbd_dev->image_format = 2;
Alex Elderc0fba362013-04-25 23:15:08 -05005823 }
5824
5825 if (!ret) {
5826 rbd_dev->spec->image_id = image_id;
5827 dout("image_id is %s\n", image_id);
Alex Elder589d30e2012-07-10 20:30:11 -05005828 }
5829out:
5830 kfree(response);
Ilya Dryomovecd4a682017-01-25 18:16:21 +01005831 ceph_oid_destroy(&oid);
Alex Elder589d30e2012-07-10 20:30:11 -05005832 return ret;
5833}
5834
Alex Elder3abef3b2013-05-13 20:35:37 -05005835/*
5836 * Undo whatever state changes are made by a v1 or v2 header info
5837 * call.
5838 */
Alex Elder6fd48b32013-04-28 23:32:34 -05005839static void rbd_dev_unprobe(struct rbd_device *rbd_dev)
5840{
5841 struct rbd_image_header *header;
5842
Ilya Dryomove69b8d42015-01-19 12:06:14 +03005843 rbd_dev_parent_put(rbd_dev);
Alex Elder6fd48b32013-04-28 23:32:34 -05005844
5845 /* Free dynamic fields from the header, then zero it out */
5846
5847 header = &rbd_dev->header;
Alex Elder812164f82013-04-30 00:44:32 -05005848 ceph_put_snap_context(header->snapc);
Alex Elder6fd48b32013-04-28 23:32:34 -05005849 kfree(header->snap_sizes);
5850 kfree(header->snap_names);
5851 kfree(header->object_prefix);
5852 memset(header, 0, sizeof (*header));
5853}
5854
Alex Elder2df3fac2013-05-06 09:51:30 -05005855static int rbd_dev_v2_header_onetime(struct rbd_device *rbd_dev)
Alex Eldera30b71b2012-07-10 20:30:11 -05005856{
5857 int ret;
Alex Eldera30b71b2012-07-10 20:30:11 -05005858
Alex Elder1e130192012-07-03 16:01:19 -05005859 ret = rbd_dev_v2_object_prefix(rbd_dev);
Alex Elder57385b52013-04-21 12:14:45 -05005860 if (ret)
Alex Elder1e130192012-07-03 16:01:19 -05005861 goto out_err;
Alex Elderb1b54022012-07-03 16:01:19 -05005862
Alex Elder2df3fac2013-05-06 09:51:30 -05005863 /*
5864 * Get and check the features for the image. Currently the
5865 * features are assumed to never change.
5866 */
Alex Elderb1b54022012-07-03 16:01:19 -05005867 ret = rbd_dev_v2_features(rbd_dev);
Alex Elder57385b52013-04-21 12:14:45 -05005868 if (ret)
Alex Elderb1b54022012-07-03 16:01:19 -05005869 goto out_err;
Alex Elder35d489f2012-07-03 16:01:19 -05005870
Alex Eldercc070d52013-04-21 12:14:45 -05005871 /* If the image supports fancy striping, get its parameters */
5872
5873 if (rbd_dev->header.features & RBD_FEATURE_STRIPINGV2) {
5874 ret = rbd_dev_v2_striping_info(rbd_dev);
5875 if (ret < 0)
5876 goto out_err;
5877 }
Alex Eldera30b71b2012-07-10 20:30:11 -05005878
Ilya Dryomov7e973322017-01-25 18:16:22 +01005879 if (rbd_dev->header.features & RBD_FEATURE_DATA_POOL) {
5880 ret = rbd_dev_v2_data_pool(rbd_dev);
5881 if (ret)
5882 goto out_err;
5883 }
5884
Ilya Dryomov263423f2017-01-25 18:16:22 +01005885 rbd_init_layout(rbd_dev);
Alex Elder35152972012-08-31 17:29:55 -05005886 return 0;
Ilya Dryomov263423f2017-01-25 18:16:22 +01005887
Alex Elder9d475de2012-07-03 16:01:19 -05005888out_err:
Alex Elder642a2532013-05-06 17:40:33 -05005889 rbd_dev->header.features = 0;
Alex Elder1e130192012-07-03 16:01:19 -05005890 kfree(rbd_dev->header.object_prefix);
5891 rbd_dev->header.object_prefix = NULL;
Alex Elder9d475de2012-07-03 16:01:19 -05005892 return ret;
Alex Eldera30b71b2012-07-10 20:30:11 -05005893}
5894
Ilya Dryomov6d69bb532015-10-11 19:38:00 +02005895/*
5896 * @depth is rbd_dev_image_probe() -> rbd_dev_probe_parent() ->
5897 * rbd_dev_image_probe() recursion depth, which means it's also the
5898 * length of the already discovered part of the parent chain.
5899 */
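/*
 * E.g. mapping image C cloned from B, itself cloned from A, recurses
 * C (depth 0) -> B (depth 1) -> A (depth 2); the probe fails once depth
 * would exceed RBD_MAX_PARENT_CHAIN_LEN.
 */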
5900static int rbd_dev_probe_parent(struct rbd_device *rbd_dev, int depth)
Alex Elder83a06262012-10-30 15:47:17 -05005901{
Alex Elder2f82ee52012-10-30 19:40:33 -05005902 struct rbd_device *parent = NULL;
Alex Elder124afba2013-04-26 15:44:36 -05005903 int ret;
5904
5905 if (!rbd_dev->parent_spec)
5906 return 0;
Alex Elder124afba2013-04-26 15:44:36 -05005907
Ilya Dryomov6d69bb532015-10-11 19:38:00 +02005908 if (++depth > RBD_MAX_PARENT_CHAIN_LEN) {
5909 pr_info("parent chain is too long (%d)\n", depth);
5910 ret = -EINVAL;
5911 goto out_err;
5912 }
5913
Ilya Dryomov1643dfa2016-08-12 15:45:52 +02005914 parent = __rbd_dev_create(rbd_dev->rbd_client, rbd_dev->parent_spec);
Ilya Dryomov1f2c6652015-10-11 19:38:00 +02005915 if (!parent) {
5916 ret = -ENOMEM;
Alex Elder124afba2013-04-26 15:44:36 -05005917 goto out_err;
Ilya Dryomov1f2c6652015-10-11 19:38:00 +02005918 }
5919
5920 /*
5921 * Images related by parent/child relationships always share
5922 * rbd_client and spec/parent_spec, so bump their refcounts.
5923 */
5924 __rbd_get_client(rbd_dev->rbd_client);
5925 rbd_spec_get(rbd_dev->parent_spec);
Alex Elder124afba2013-04-26 15:44:36 -05005926
Ilya Dryomov6d69bb532015-10-11 19:38:00 +02005927 ret = rbd_dev_image_probe(parent, depth);
Alex Elder124afba2013-04-26 15:44:36 -05005928 if (ret < 0)
5929 goto out_err;
Ilya Dryomov1f2c6652015-10-11 19:38:00 +02005930
Alex Elder124afba2013-04-26 15:44:36 -05005931 rbd_dev->parent = parent;
Alex Eldera2acd002013-05-08 22:50:04 -05005932 atomic_set(&rbd_dev->parent_ref, 1);
Alex Elder124afba2013-04-26 15:44:36 -05005933 return 0;
Alex Elder124afba2013-04-26 15:44:36 -05005934
Ilya Dryomov1f2c6652015-10-11 19:38:00 +02005935out_err:
5936 rbd_dev_unparent(rbd_dev);
Markus Elfring1761b222015-11-23 20:16:45 +01005937 rbd_dev_destroy(parent);
Alex Elder124afba2013-04-26 15:44:36 -05005938 return ret;
5939}
5940
Ilya Dryomov5769ed02017-04-13 12:17:38 +02005941static void rbd_dev_device_release(struct rbd_device *rbd_dev)
5942{
5943 clear_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags);
5944 rbd_dev_mapping_clear(rbd_dev);
5945 rbd_free_disk(rbd_dev);
5946 if (!single_major)
5947 unregister_blkdev(rbd_dev->major, rbd_dev->name);
5948}
5949
Ilya Dryomov811c6682016-04-15 16:22:16 +02005950/*
5951 * rbd_dev->header_rwsem must be locked for write and will be unlocked
5952 * upon return.
5953 */
Alex Elder200a6a82013-04-28 23:32:34 -05005954static int rbd_dev_device_setup(struct rbd_device *rbd_dev)
Alex Elder124afba2013-04-26 15:44:36 -05005955{
Alex Elder83a06262012-10-30 15:47:17 -05005956 int ret;
Alex Elder83a06262012-10-30 15:47:17 -05005957
Ilya Dryomov9b60e702013-12-13 15:28:57 +02005958 /* Record our major and minor device numbers. */
Alex Elder83a06262012-10-30 15:47:17 -05005959
Ilya Dryomov9b60e702013-12-13 15:28:57 +02005960 if (!single_major) {
5961 ret = register_blkdev(0, rbd_dev->name);
5962 if (ret < 0)
Ilya Dryomov1643dfa2016-08-12 15:45:52 +02005963 goto err_out_unlock;
Ilya Dryomov9b60e702013-12-13 15:28:57 +02005964
5965 rbd_dev->major = ret;
5966 rbd_dev->minor = 0;
5967 } else {
5968 rbd_dev->major = rbd_major;
5969 rbd_dev->minor = rbd_dev_id_to_minor(rbd_dev->dev_id);
5970 }
Alex Elder83a06262012-10-30 15:47:17 -05005971
5972 /* Set up the blkdev mapping. */
5973
5974 ret = rbd_init_disk(rbd_dev);
5975 if (ret)
5976 goto err_out_blkdev;
5977
Alex Elderf35a4de2013-05-06 09:51:29 -05005978 ret = rbd_dev_mapping_set(rbd_dev);
Alex Elder83a06262012-10-30 15:47:17 -05005979 if (ret)
5980 goto err_out_disk;
Ilya Dryomovbc1ecc62014-08-04 18:04:39 +04005981
Alex Elderf35a4de2013-05-06 09:51:29 -05005982 set_capacity(rbd_dev->disk, rbd_dev->mapping.size / SECTOR_SIZE);
Ilya Dryomov9568c932017-10-12 12:35:19 +02005983 set_disk_ro(rbd_dev->disk, rbd_dev->opts->read_only);
Alex Elderf35a4de2013-05-06 09:51:29 -05005984
Ilya Dryomov5769ed02017-04-13 12:17:38 +02005985 ret = dev_set_name(&rbd_dev->dev, "%d", rbd_dev->dev_id);
Alex Elderf35a4de2013-05-06 09:51:29 -05005986 if (ret)
Ilya Dryomovf5ee37b2014-10-09 17:06:01 +04005987 goto err_out_mapping;
Alex Elder83a06262012-10-30 15:47:17 -05005988
Alex Elder129b79d2013-04-26 15:44:36 -05005989 set_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags);
Ilya Dryomov811c6682016-04-15 16:22:16 +02005990 up_write(&rbd_dev->header_rwsem);
Ilya Dryomov5769ed02017-04-13 12:17:38 +02005991 return 0;
Alex Elder2f82ee52012-10-30 19:40:33 -05005992
Alex Elderf35a4de2013-05-06 09:51:29 -05005993err_out_mapping:
5994 rbd_dev_mapping_clear(rbd_dev);
Alex Elder83a06262012-10-30 15:47:17 -05005995err_out_disk:
5996 rbd_free_disk(rbd_dev);
5997err_out_blkdev:
Ilya Dryomov9b60e702013-12-13 15:28:57 +02005998 if (!single_major)
5999 unregister_blkdev(rbd_dev->major, rbd_dev->name);
Ilya Dryomov811c6682016-04-15 16:22:16 +02006000err_out_unlock:
6001 up_write(&rbd_dev->header_rwsem);
Alex Elder83a06262012-10-30 15:47:17 -05006002 return ret;
6003}
6004
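/*
 * Illustration: a format 1 image named "foo" is expected to use header
 * object "foo.rbd" (RBD_SUFFIX), while a format 2 image with id "1234"
 * uses "rbd_header.1234" (RBD_HEADER_PREFIX).
 */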
Alex Elder332bb122013-04-27 09:59:30 -05006005static int rbd_dev_header_name(struct rbd_device *rbd_dev)
6006{
6007 struct rbd_spec *spec = rbd_dev->spec;
Ilya Dryomovc41d13a2016-04-29 20:01:25 +02006008 int ret;
Alex Elder332bb122013-04-27 09:59:30 -05006009
6010 /* Record the header object name for this rbd image. */
6011
6012 rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
Alex Elder332bb122013-04-27 09:59:30 -05006013 if (rbd_dev->image_format == 1)
Ilya Dryomovc41d13a2016-04-29 20:01:25 +02006014 ret = ceph_oid_aprintf(&rbd_dev->header_oid, GFP_KERNEL, "%s%s",
6015 spec->image_name, RBD_SUFFIX);
Alex Elder332bb122013-04-27 09:59:30 -05006016 else
Ilya Dryomovc41d13a2016-04-29 20:01:25 +02006017 ret = ceph_oid_aprintf(&rbd_dev->header_oid, GFP_KERNEL, "%s%s",
6018 RBD_HEADER_PREFIX, spec->image_id);
Alex Elder332bb122013-04-27 09:59:30 -05006019
Ilya Dryomovc41d13a2016-04-29 20:01:25 +02006020 return ret;
Alex Elder332bb122013-04-27 09:59:30 -05006021}
6022
Alex Elder200a6a82013-04-28 23:32:34 -05006023static void rbd_dev_image_release(struct rbd_device *rbd_dev)
6024{
Alex Elder6fd48b32013-04-28 23:32:34 -05006025 rbd_dev_unprobe(rbd_dev);
Ilya Dryomovfd22aef2017-04-13 12:17:37 +02006026 if (rbd_dev->opts)
6027 rbd_unregister_watch(rbd_dev);
Alex Elder6fd48b32013-04-28 23:32:34 -05006028 rbd_dev->image_format = 0;
6029 kfree(rbd_dev->spec->image_id);
6030 rbd_dev->spec->image_id = NULL;
Alex Elder200a6a82013-04-28 23:32:34 -05006031}
6032
Alex Eldera30b71b2012-07-10 20:30:11 -05006033/*
6034 * Probe for the existence of the header object for the given rbd
Alex Elder1f3ef782013-05-06 17:40:33 -05006035 * device. If this image is the one being mapped (i.e., not a
6036 * parent), initiate a watch on its header object before using that
6037 * object to get detailed information about the rbd image.
Alex Eldera30b71b2012-07-10 20:30:11 -05006038 */
Ilya Dryomov6d69bb532015-10-11 19:38:00 +02006039static int rbd_dev_image_probe(struct rbd_device *rbd_dev, int depth)
Alex Eldera30b71b2012-07-10 20:30:11 -05006040{
6041 int ret;
6042
6043 /*
Alex Elder3abef3b2013-05-13 20:35:37 -05006044 * Get the id from the image id object. Unless there's an
6045 * error, rbd_dev->spec->image_id will be filled in with
6046 * a dynamically-allocated string, and rbd_dev->image_format
6047 * will be set to either 1 or 2.
Alex Eldera30b71b2012-07-10 20:30:11 -05006048 */
6049 ret = rbd_dev_image_id(rbd_dev);
6050 if (ret)
Alex Elderc0fba362013-04-25 23:15:08 -05006051 return ret;
Alex Elderc0fba362013-04-25 23:15:08 -05006052
Alex Elder332bb122013-04-27 09:59:30 -05006053 ret = rbd_dev_header_name(rbd_dev);
6054 if (ret)
6055 goto err_out_format;
6056
Ilya Dryomov6d69bb532015-10-11 19:38:00 +02006057 if (!depth) {
Ilya Dryomov99d16942016-08-12 16:11:41 +02006058 ret = rbd_register_watch(rbd_dev);
Ilya Dryomov1fe48022015-03-05 10:47:22 +03006059 if (ret) {
6060 if (ret == -ENOENT)
Ilya Dryomovb26c0472018-07-03 15:28:43 +02006061 pr_info("image %s/%s%s%s does not exist\n",
Ilya Dryomov1fe48022015-03-05 10:47:22 +03006062 rbd_dev->spec->pool_name,
Ilya Dryomovb26c0472018-07-03 15:28:43 +02006063 rbd_dev->spec->pool_ns ?: "",
6064 rbd_dev->spec->pool_ns ? "/" : "",
Ilya Dryomov1fe48022015-03-05 10:47:22 +03006065 rbd_dev->spec->image_name);
Ilya Dryomovc41d13a2016-04-29 20:01:25 +02006066 goto err_out_format;
Ilya Dryomov1fe48022015-03-05 10:47:22 +03006067 }
Alex Elder1f3ef782013-05-06 17:40:33 -05006068 }
Alex Elderb644de22013-04-27 09:59:31 -05006069
Ilya Dryomova720ae02014-07-23 17:11:19 +04006070 ret = rbd_dev_header_info(rbd_dev);
Alex Elder5655c4d2013-04-25 23:15:08 -05006071 if (ret)
Alex Elderb644de22013-04-27 09:59:31 -05006072 goto err_out_watch;
Alex Elder83a06262012-10-30 15:47:17 -05006073
Ilya Dryomov04077592014-07-23 17:11:20 +04006074 /*
6075 * If this image is the one being mapped, we have pool name and
6076 * id, image name and id, and snap name - need to fill snap id.
6077 * Otherwise this is a parent image, identified by pool, image
6078 * and snap ids - need to fill in names for those ids.
6079 */
Ilya Dryomov6d69bb532015-10-11 19:38:00 +02006080 if (!depth)
Ilya Dryomov04077592014-07-23 17:11:20 +04006081 ret = rbd_spec_fill_snap_id(rbd_dev);
6082 else
6083 ret = rbd_spec_fill_names(rbd_dev);
Ilya Dryomov1fe48022015-03-05 10:47:22 +03006084 if (ret) {
6085 if (ret == -ENOENT)
Ilya Dryomovb26c0472018-07-03 15:28:43 +02006086 pr_info("snap %s/%s%s%s@%s does not exist\n",
Ilya Dryomov1fe48022015-03-05 10:47:22 +03006087 rbd_dev->spec->pool_name,
Ilya Dryomovb26c0472018-07-03 15:28:43 +02006088 rbd_dev->spec->pool_ns ?: "",
6089 rbd_dev->spec->pool_ns ? "/" : "",
Ilya Dryomov1fe48022015-03-05 10:47:22 +03006090 rbd_dev->spec->image_name,
6091 rbd_dev->spec->snap_name);
Alex Elder33dca392013-04-30 00:44:33 -05006092 goto err_out_probe;
Ilya Dryomov1fe48022015-03-05 10:47:22 +03006093 }
Alex Elder9bb81c92013-04-27 09:59:30 -05006094
Ilya Dryomove8f59b52014-07-24 10:42:13 +04006095 if (rbd_dev->header.features & RBD_FEATURE_LAYERING) {
6096 ret = rbd_dev_v2_parent_info(rbd_dev);
6097 if (ret)
6098 goto err_out_probe;
Ilya Dryomove8f59b52014-07-24 10:42:13 +04006099 }
6100
Ilya Dryomov6d69bb532015-10-11 19:38:00 +02006101 ret = rbd_dev_probe_parent(rbd_dev, depth);
Alex Elder30d60ba2013-05-06 09:51:30 -05006102 if (ret)
6103 goto err_out_probe;
Alex Elder83a06262012-10-30 15:47:17 -05006104
Alex Elder30d60ba2013-05-06 09:51:30 -05006105 dout("discovered format %u image, header name is %s\n",
Ilya Dryomovc41d13a2016-04-29 20:01:25 +02006106 rbd_dev->image_format, rbd_dev->header_oid.name);
Alex Elder30d60ba2013-05-06 09:51:30 -05006107 return 0;
Ilya Dryomove8f59b52014-07-24 10:42:13 +04006108
Alex Elder6fd48b32013-04-28 23:32:34 -05006109err_out_probe:
6110 rbd_dev_unprobe(rbd_dev);
Alex Elderb644de22013-04-27 09:59:31 -05006111err_out_watch:
Ilya Dryomov6d69bb532015-10-11 19:38:00 +02006112 if (!depth)
Ilya Dryomov99d16942016-08-12 16:11:41 +02006113 rbd_unregister_watch(rbd_dev);
Alex Elder332bb122013-04-27 09:59:30 -05006114err_out_format:
6115 rbd_dev->image_format = 0;
Alex Elder5655c4d2013-04-25 23:15:08 -05006116 kfree(rbd_dev->spec->image_id);
6117 rbd_dev->spec->image_id = NULL;
Alex Elder5655c4d2013-04-25 23:15:08 -05006118 return ret;
Alex Eldera30b71b2012-07-10 20:30:11 -05006119}
6120
Ilya Dryomov9b60e702013-12-13 15:28:57 +02006121static ssize_t do_rbd_add(struct bus_type *bus,
6122 const char *buf,
6123 size_t count)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07006124{
Alex Eldercb8627c2012-07-09 21:04:23 -05006125 struct rbd_device *rbd_dev = NULL;
Alex Elderdc79b112012-10-25 23:34:41 -05006126 struct ceph_options *ceph_opts = NULL;
Alex Elder4e9afeb2012-10-25 23:34:41 -05006127 struct rbd_options *rbd_opts = NULL;
Alex Elder859c31d2012-10-25 23:34:42 -05006128 struct rbd_spec *spec = NULL;
Alex Elder9d3997f2012-10-25 23:34:42 -05006129 struct rbd_client *rbdc;
Ilya Dryomovb51c83c2015-10-15 15:38:57 +02006130 int rc;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07006131
6132 if (!try_module_get(THIS_MODULE))
6133 return -ENODEV;
6134
Alex Eldera725f65e2012-02-02 08:13:30 -06006135 /* parse add command */
Alex Elder859c31d2012-10-25 23:34:42 -05006136 rc = rbd_add_parse_args(buf, &ceph_opts, &rbd_opts, &spec);
Alex Elderdc79b112012-10-25 23:34:41 -05006137 if (rc < 0)
Ilya Dryomovdd5ac322015-10-16 17:09:24 +02006138 goto out;
Alex Eldera725f65e2012-02-02 08:13:30 -06006139
Alex Elder9d3997f2012-10-25 23:34:42 -05006140 rbdc = rbd_get_client(ceph_opts);
6141 if (IS_ERR(rbdc)) {
6142 rc = PTR_ERR(rbdc);
Alex Elder0ddebc02012-10-25 23:34:41 -05006143 goto err_out_args;
Alex Elder9d3997f2012-10-25 23:34:42 -05006144 }
Yehuda Sadeh602adf42010-08-12 16:11:25 -07006145
Yehuda Sadeh602adf42010-08-12 16:11:25 -07006146 /* pick the pool */
Ilya Dryomovdd435852018-02-22 13:43:24 +01006147 rc = ceph_pg_poolid_by_name(rbdc->client->osdc.osdmap, spec->pool_name);
Ilya Dryomov1fe48022015-03-05 10:47:22 +03006148 if (rc < 0) {
6149 if (rc == -ENOENT)
6150 pr_info("pool %s does not exist\n", spec->pool_name);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07006151 goto err_out_client;
Ilya Dryomov1fe48022015-03-05 10:47:22 +03006152 }
Alex Elderc0cd10db2013-04-26 09:43:47 -05006153 spec->pool_id = (u64)rc;
Alex Elder859c31d2012-10-25 23:34:42 -05006154
Ilya Dryomovd1475432015-06-22 13:24:48 +03006155 rbd_dev = rbd_dev_create(rbdc, spec, rbd_opts);
Ilya Dryomovb51c83c2015-10-15 15:38:57 +02006156 if (!rbd_dev) {
6157 rc = -ENOMEM;
Alex Elderbd4ba652012-10-25 23:34:42 -05006158 goto err_out_client;
Ilya Dryomovb51c83c2015-10-15 15:38:57 +02006159 }
Alex Elderc53d5892012-10-25 23:34:42 -05006160 rbdc = NULL; /* rbd_dev now owns this */
6161 spec = NULL; /* rbd_dev now owns this */
Ilya Dryomovd1475432015-06-22 13:24:48 +03006162 rbd_opts = NULL; /* rbd_dev now owns this */
Yehuda Sadeh602adf42010-08-12 16:11:25 -07006163
Mike Christie0d6d1e9c2016-08-18 18:38:45 +02006164 rbd_dev->config_info = kstrdup(buf, GFP_KERNEL);
6165 if (!rbd_dev->config_info) {
6166 rc = -ENOMEM;
6167 goto err_out_rbd_dev;
6168 }
6169
Ilya Dryomov811c6682016-04-15 16:22:16 +02006170 down_write(&rbd_dev->header_rwsem);
Ilya Dryomov6d69bb532015-10-11 19:38:00 +02006171 rc = rbd_dev_image_probe(rbd_dev, 0);
Mike Christie0d6d1e9c2016-08-18 18:38:45 +02006172 if (rc < 0) {
6173 up_write(&rbd_dev->header_rwsem);
Alex Elderc53d5892012-10-25 23:34:42 -05006174 goto err_out_rbd_dev;
Mike Christie0d6d1e9c2016-08-18 18:38:45 +02006175 }
Alex Elder05fd6f62012-08-29 17:11:07 -05006176
Alex Elder7ce4eef2013-05-06 17:40:33 -05006177 /* If we are mapping a snapshot it must be marked read-only */
Alex Elder7ce4eef2013-05-06 17:40:33 -05006178 if (rbd_dev->spec->snap_id != CEPH_NOSNAP)
Ilya Dryomov9568c932017-10-12 12:35:19 +02006179 rbd_dev->opts->read_only = true;
Alex Elder7ce4eef2013-05-06 17:40:33 -05006180
Ilya Dryomov0c93e1b2019-01-30 15:14:48 +01006181 if (rbd_dev->opts->alloc_size > rbd_dev->layout.object_size) {
6182 rbd_warn(rbd_dev, "alloc_size adjusted to %u",
6183 rbd_dev->layout.object_size);
6184 rbd_dev->opts->alloc_size = rbd_dev->layout.object_size;
6185 }
6186
Alex Elderb536f692013-04-28 23:32:34 -05006187 rc = rbd_dev_device_setup(rbd_dev);
Ilya Dryomovfd22aef2017-04-13 12:17:37 +02006188 if (rc)
Ilya Dryomov8b679ec2017-04-13 12:17:37 +02006189 goto err_out_image_probe;
Alex Elderb536f692013-04-28 23:32:34 -05006190
Ilya Dryomove010dd02017-04-13 12:17:39 +02006191 if (rbd_dev->opts->exclusive) {
6192 rc = rbd_add_acquire_lock(rbd_dev);
6193 if (rc)
6194 goto err_out_device_setup;
Alex Elderb536f692013-04-28 23:32:34 -05006195 }
6196
Ilya Dryomov5769ed02017-04-13 12:17:38 +02006197 /* Everything's ready. Announce the disk to the world. */
6198
6199 rc = device_add(&rbd_dev->dev);
6200 if (rc)
Ilya Dryomove010dd02017-04-13 12:17:39 +02006201 goto err_out_image_lock;
Ilya Dryomov5769ed02017-04-13 12:17:38 +02006202
6203 add_disk(rbd_dev->disk);
6204 /* see rbd_init_disk() */
6205 blk_put_queue(rbd_dev->disk->queue);
6206
6207 spin_lock(&rbd_dev_list_lock);
6208 list_add_tail(&rbd_dev->node, &rbd_dev_list);
6209 spin_unlock(&rbd_dev_list_lock);
6210
6211 pr_info("%s: capacity %llu features 0x%llx\n", rbd_dev->disk->disk_name,
6212 (unsigned long long)get_capacity(rbd_dev->disk) << SECTOR_SHIFT,
6213 rbd_dev->header.features);
Ilya Dryomovdd5ac322015-10-16 17:09:24 +02006214 rc = count;
6215out:
6216 module_put(THIS_MODULE);
6217 return rc;
Alex Elder3abef3b2013-05-13 20:35:37 -05006218
Ilya Dryomove010dd02017-04-13 12:17:39 +02006219err_out_image_lock:
6220 rbd_dev_image_unlock(rbd_dev);
Ilya Dryomov5769ed02017-04-13 12:17:38 +02006221err_out_device_setup:
6222 rbd_dev_device_release(rbd_dev);
Ilya Dryomov8b679ec2017-04-13 12:17:37 +02006223err_out_image_probe:
6224 rbd_dev_image_release(rbd_dev);
Alex Elderc53d5892012-10-25 23:34:42 -05006225err_out_rbd_dev:
6226 rbd_dev_destroy(rbd_dev);
Alex Elderbd4ba652012-10-25 23:34:42 -05006227err_out_client:
Alex Elder9d3997f2012-10-25 23:34:42 -05006228 rbd_put_client(rbdc);
Alex Elder0ddebc02012-10-25 23:34:41 -05006229err_out_args:
Alex Elder859c31d2012-10-25 23:34:42 -05006230 rbd_spec_put(spec);
Ilya Dryomovd1475432015-06-22 13:24:48 +03006231 kfree(rbd_opts);
Ilya Dryomovdd5ac322015-10-16 17:09:24 +02006232 goto out;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07006233}
6234
Greg Kroah-Hartman7e9586b2018-12-21 08:54:38 +01006235static ssize_t add_store(struct bus_type *bus, const char *buf, size_t count)
Ilya Dryomov9b60e702013-12-13 15:28:57 +02006236{
6237 if (single_major)
6238 return -EINVAL;
6239
6240 return do_rbd_add(bus, buf, count);
6241}
6242
Greg Kroah-Hartman7e9586b2018-12-21 08:54:38 +01006243static ssize_t add_single_major_store(struct bus_type *bus, const char *buf,
6244 size_t count)
Ilya Dryomov9b60e702013-12-13 15:28:57 +02006245{
6246 return do_rbd_add(bus, buf, count);
6247}
6248
Alex Elder05a46af2013-04-26 15:44:36 -05006249static void rbd_dev_remove_parent(struct rbd_device *rbd_dev)
6250{
Alex Elderad945fc2013-04-26 15:44:36 -05006251 while (rbd_dev->parent) {
Alex Elder05a46af2013-04-26 15:44:36 -05006252 struct rbd_device *first = rbd_dev;
6253 struct rbd_device *second = first->parent;
6254 struct rbd_device *third;
6255
6256 /*
6257 * Follow to the parent with no grandparent and
6258 * remove it.
6259 */
6260 while (second && (third = second->parent)) {
6261 first = second;
6262 second = third;
6263 }
Alex Elderad945fc2013-04-26 15:44:36 -05006264 rbd_assert(second);
Alex Elder8ad42cd2013-04-28 23:32:34 -05006265 rbd_dev_image_release(second);
Ilya Dryomov8b679ec2017-04-13 12:17:37 +02006266 rbd_dev_destroy(second);
Alex Elderad945fc2013-04-26 15:44:36 -05006267 first->parent = NULL;
6268 first->parent_overlap = 0;
6269
6270 rbd_assert(first->parent_spec);
Alex Elder05a46af2013-04-26 15:44:36 -05006271 rbd_spec_put(first->parent_spec);
6272 first->parent_spec = NULL;
Alex Elder05a46af2013-04-26 15:44:36 -05006273 }
6274}
6275
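/*
 * Illustrative usage: "echo 0 > /sys/bus/rbd/remove" unmaps device id 0,
 * failing with -EBUSY while the device is open; "echo 0 force" proceeds
 * anyway, freezing the queue and failing outstanding I/O.
 */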
Ilya Dryomov9b60e702013-12-13 15:28:57 +02006276static ssize_t do_rbd_remove(struct bus_type *bus,
6277 const char *buf,
6278 size_t count)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07006279{
6280 struct rbd_device *rbd_dev = NULL;
Alex Elder751cc0e2013-05-31 15:17:01 -05006281 struct list_head *tmp;
6282 int dev_id;
Mike Christie0276dca2016-08-18 18:38:45 +02006283 char opt_buf[6];
Mike Christie0276dca2016-08-18 18:38:45 +02006284 bool force = false;
Alex Elder0d8189e2013-04-27 09:59:30 -05006285 int ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07006286
Mike Christie0276dca2016-08-18 18:38:45 +02006287 dev_id = -1;
6288 opt_buf[0] = '\0';
6289 sscanf(buf, "%d %5s", &dev_id, opt_buf);
6290 if (dev_id < 0) {
6291 pr_err("dev_id out of range\n");
Yehuda Sadeh602adf42010-08-12 16:11:25 -07006292 return -EINVAL;
Mike Christie0276dca2016-08-18 18:38:45 +02006293 }
6294 if (opt_buf[0] != '\0') {
6295 if (!strcmp(opt_buf, "force")) {
6296 force = true;
6297 } else {
6298 pr_err("bad remove option at '%s'\n", opt_buf);
6299 return -EINVAL;
6300 }
6301 }
Yehuda Sadeh602adf42010-08-12 16:11:25 -07006302
Alex Elder751cc0e2013-05-31 15:17:01 -05006303 ret = -ENOENT;
6304 spin_lock(&rbd_dev_list_lock);
6305 list_for_each(tmp, &rbd_dev_list) {
6306 rbd_dev = list_entry(tmp, struct rbd_device, node);
6307 if (rbd_dev->dev_id == dev_id) {
6308 ret = 0;
6309 break;
6310 }
Yehuda Sadeh602adf42010-08-12 16:11:25 -07006311 }
Alex Elder751cc0e2013-05-31 15:17:01 -05006312 if (!ret) {
6313 spin_lock_irq(&rbd_dev->lock);
Mike Christie0276dca2016-08-18 18:38:45 +02006314 if (rbd_dev->open_count && !force)
Alex Elder751cc0e2013-05-31 15:17:01 -05006315 ret = -EBUSY;
Ilya Dryomov85f5a4d2019-01-08 19:47:38 +01006316 else if (test_and_set_bit(RBD_DEV_FLAG_REMOVING,
6317 &rbd_dev->flags))
6318 ret = -EINPROGRESS;
Alex Elder751cc0e2013-05-31 15:17:01 -05006319 spin_unlock_irq(&rbd_dev->lock);
6320 }
6321 spin_unlock(&rbd_dev_list_lock);
Ilya Dryomov85f5a4d2019-01-08 19:47:38 +01006322 if (ret)
Alex Elder1ba0f1e2013-05-31 15:17:01 -05006323 return ret;
Alex Elder751cc0e2013-05-31 15:17:01 -05006324
Mike Christie0276dca2016-08-18 18:38:45 +02006325 if (force) {
6326 /*
6327 * Prevent new IO from being queued and wait for existing
6328 * IO to complete/fail.
6329 */
6330 blk_mq_freeze_queue(rbd_dev->disk->queue);
6331 blk_set_queue_dying(rbd_dev->disk->queue);
6332 }
6333
Ilya Dryomov5769ed02017-04-13 12:17:38 +02006334 del_gendisk(rbd_dev->disk);
6335 spin_lock(&rbd_dev_list_lock);
6336 list_del_init(&rbd_dev->node);
6337 spin_unlock(&rbd_dev_list_lock);
6338 device_del(&rbd_dev->dev);
Ilya Dryomovfca27062013-12-16 18:02:40 +02006339
Ilya Dryomove010dd02017-04-13 12:17:39 +02006340 rbd_dev_image_unlock(rbd_dev);
Ilya Dryomovdd5ac322015-10-16 17:09:24 +02006341 rbd_dev_device_release(rbd_dev);
Alex Elder8ad42cd2013-04-28 23:32:34 -05006342 rbd_dev_image_release(rbd_dev);
Ilya Dryomov8b679ec2017-04-13 12:17:37 +02006343 rbd_dev_destroy(rbd_dev);
Alex Elder1ba0f1e2013-05-31 15:17:01 -05006344 return count;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07006345}
6346
Greg Kroah-Hartman7e9586b2018-12-21 08:54:38 +01006347static ssize_t remove_store(struct bus_type *bus, const char *buf, size_t count)
Ilya Dryomov9b60e702013-12-13 15:28:57 +02006348{
6349 if (single_major)
6350 return -EINVAL;
6351
6352 return do_rbd_remove(bus, buf, count);
6353}
6354
Greg Kroah-Hartman7e9586b2018-12-21 08:54:38 +01006355static ssize_t remove_single_major_store(struct bus_type *bus, const char *buf,
6356 size_t count)
Ilya Dryomov9b60e702013-12-13 15:28:57 +02006357{
6358 return do_rbd_remove(bus, buf, count);
6359}
6360
Yehuda Sadeh602adf42010-08-12 16:11:25 -07006361/*
6362 * create control files in sysfs
Yehuda Sadehdfc56062010-11-19 14:51:04 -08006363 * /sys/bus/rbd/...
Yehuda Sadeh602adf42010-08-12 16:11:25 -07006364 */
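/*
 * The bus attributes registered via rbd_bus_type are expected to show up
 * as /sys/bus/rbd/add, /sys/bus/rbd/remove and their *_single_major
 * variants (see the *_store handlers above).
 */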
Chengguang Xu7d8dc532018-08-12 23:06:54 +08006365static int __init rbd_sysfs_init(void)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07006366{
Yehuda Sadehdfc56062010-11-19 14:51:04 -08006367 int ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07006368
Alex Elderfed4c142012-02-07 12:03:36 -06006369 ret = device_register(&rbd_root_dev);
Alex Elder21079782012-01-24 10:08:36 -06006370 if (ret < 0)
Yehuda Sadehdfc56062010-11-19 14:51:04 -08006371 return ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07006372
Alex Elderfed4c142012-02-07 12:03:36 -06006373 ret = bus_register(&rbd_bus_type);
6374 if (ret < 0)
6375 device_unregister(&rbd_root_dev);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07006376
Yehuda Sadeh602adf42010-08-12 16:11:25 -07006377 return ret;
6378}
6379
Chengguang Xu7d8dc532018-08-12 23:06:54 +08006380static void __exit rbd_sysfs_cleanup(void)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07006381{
Yehuda Sadehdfc56062010-11-19 14:51:04 -08006382 bus_unregister(&rbd_bus_type);
Alex Elderfed4c142012-02-07 12:03:36 -06006383 device_unregister(&rbd_root_dev);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07006384}
6385
Chengguang Xu7d8dc532018-08-12 23:06:54 +08006386static int __init rbd_slab_init(void)
Alex Elder1c2a9df2013-05-01 12:43:03 -05006387{
6388 rbd_assert(!rbd_img_request_cache);
Geliang Tang03d94402016-03-13 15:17:32 +08006389 rbd_img_request_cache = KMEM_CACHE(rbd_img_request, 0);
Alex Elder868311b2013-05-01 12:43:03 -05006390 if (!rbd_img_request_cache)
6391 return -ENOMEM;
6392
6393 rbd_assert(!rbd_obj_request_cache);
Geliang Tang03d94402016-03-13 15:17:32 +08006394 rbd_obj_request_cache = KMEM_CACHE(rbd_obj_request, 0);
Alex Elder78c2a442013-05-01 12:43:04 -05006395 if (!rbd_obj_request_cache)
6396 goto out_err;
6397
Ilya Dryomov6c696d82017-01-25 18:16:23 +01006398 return 0;
Alex Elder1c2a9df2013-05-01 12:43:03 -05006399
Ilya Dryomov6c696d82017-01-25 18:16:23 +01006400out_err:
Alex Elder868311b2013-05-01 12:43:03 -05006401 kmem_cache_destroy(rbd_img_request_cache);
6402 rbd_img_request_cache = NULL;
Alex Elder1c2a9df2013-05-01 12:43:03 -05006403 return -ENOMEM;
6404}
6405
6406static void rbd_slab_exit(void)
6407{
Alex Elder868311b2013-05-01 12:43:03 -05006408 rbd_assert(rbd_obj_request_cache);
6409 kmem_cache_destroy(rbd_obj_request_cache);
6410 rbd_obj_request_cache = NULL;
6411
Alex Elder1c2a9df2013-05-01 12:43:03 -05006412 rbd_assert(rbd_img_request_cache);
6413 kmem_cache_destroy(rbd_img_request_cache);
6414 rbd_img_request_cache = NULL;
6415}
6416
Alex Eldercc344fa2013-02-19 12:25:56 -06006417static int __init rbd_init(void)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07006418{
6419 int rc;
6420
Alex Elder1e32d342013-01-30 11:13:33 -06006421 if (!libceph_compatible(NULL)) {
6422 rbd_warn(NULL, "libceph incompatibility (quitting)");
Alex Elder1e32d342013-01-30 11:13:33 -06006423 return -EINVAL;
6424 }
Ilya Dryomove1b4d962013-12-13 15:28:57 +02006425
Alex Elder1c2a9df2013-05-01 12:43:03 -05006426 rc = rbd_slab_init();
Yehuda Sadeh602adf42010-08-12 16:11:25 -07006427 if (rc)
6428 return rc;
Ilya Dryomove1b4d962013-12-13 15:28:57 +02006429
Ilya Dryomovf5ee37b2014-10-09 17:06:01 +04006430 /*
6431 * The number of active work items is limited by the number of
Ilya Dryomovf77303b2015-04-22 18:28:13 +03006432 * rbd devices * queue depth, so leave @max_active at default.
Ilya Dryomovf5ee37b2014-10-09 17:06:01 +04006433 */
6434 rbd_wq = alloc_workqueue(RBD_DRV_NAME, WQ_MEM_RECLAIM, 0);
6435 if (!rbd_wq) {
6436 rc = -ENOMEM;
6437 goto err_out_slab;
6438 }
6439
Ilya Dryomov9b60e702013-12-13 15:28:57 +02006440 if (single_major) {
6441 rbd_major = register_blkdev(0, RBD_DRV_NAME);
6442 if (rbd_major < 0) {
6443 rc = rbd_major;
Ilya Dryomovf5ee37b2014-10-09 17:06:01 +04006444 goto err_out_wq;
Ilya Dryomov9b60e702013-12-13 15:28:57 +02006445 }
6446 }
6447
Alex Elder1c2a9df2013-05-01 12:43:03 -05006448 rc = rbd_sysfs_init();
6449 if (rc)
Ilya Dryomov9b60e702013-12-13 15:28:57 +02006450 goto err_out_blkdev;
Alex Elder1c2a9df2013-05-01 12:43:03 -05006451
Ilya Dryomov9b60e702013-12-13 15:28:57 +02006452 if (single_major)
6453 pr_info("loaded (major %d)\n", rbd_major);
6454 else
6455 pr_info("loaded\n");
6456
Ilya Dryomove1b4d962013-12-13 15:28:57 +02006457 return 0;
6458
Ilya Dryomov9b60e702013-12-13 15:28:57 +02006459err_out_blkdev:
6460 if (single_major)
6461 unregister_blkdev(rbd_major, RBD_DRV_NAME);
Ilya Dryomovf5ee37b2014-10-09 17:06:01 +04006462err_out_wq:
6463 destroy_workqueue(rbd_wq);
Ilya Dryomove1b4d962013-12-13 15:28:57 +02006464err_out_slab:
6465 rbd_slab_exit();
Alex Elder1c2a9df2013-05-01 12:43:03 -05006466 return rc;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07006467}
6468
Alex Eldercc344fa2013-02-19 12:25:56 -06006469static void __exit rbd_exit(void)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07006470{
Ilya Dryomovffe312c2014-05-20 15:46:04 +04006471 ida_destroy(&rbd_dev_id_ida);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07006472 rbd_sysfs_cleanup();
Ilya Dryomov9b60e702013-12-13 15:28:57 +02006473 if (single_major)
6474 unregister_blkdev(rbd_major, RBD_DRV_NAME);
Ilya Dryomovf5ee37b2014-10-09 17:06:01 +04006475 destroy_workqueue(rbd_wq);
Alex Elder1c2a9df2013-05-01 12:43:03 -05006476 rbd_slab_exit();
Yehuda Sadeh602adf42010-08-12 16:11:25 -07006477}
6478
6479module_init(rbd_init);
6480module_exit(rbd_exit);
6481
Alex Elderd552c612013-05-31 20:13:09 -05006482MODULE_AUTHOR("Alex Elder <elder@inktank.com>");
Yehuda Sadeh602adf42010-08-12 16:11:25 -07006483MODULE_AUTHOR("Sage Weil <sage@newdream.net>");
6484MODULE_AUTHOR("Yehuda Sadeh <yehuda@hq.newdream.net>");
Yehuda Sadeh602adf42010-08-12 16:11:25 -07006485/* following authorship retained from original osdblk.c */
6486MODULE_AUTHOR("Jeff Garzik <jeff@garzik.org>");
6487
Ilya Dryomov90da2582013-12-13 15:28:56 +02006488MODULE_DESCRIPTION("RADOS Block Device (RBD) driver");
Yehuda Sadeh602adf42010-08-12 16:11:25 -07006489MODULE_LICENSE("GPL");