
/*
   rbd.c -- Export ceph rados objects as a Linux block device


   based on drivers/block/osdblk.c:

   Copyright 2009 Red Hat, Inc.

   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; see the file COPYING.  If not, write to
   the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.



   For usage instructions, please refer to:

                 Documentation/ABI/testing/sysfs-bus-rbd

 */
30
31#include <linux/ceph/libceph.h>
32#include <linux/ceph/osd_client.h>
33#include <linux/ceph/mon_client.h>
Ilya Dryomoved95b212016-08-12 16:40:02 +020034#include <linux/ceph/cls_lock_client.h>
Ilya Dryomov43df3d32018-02-02 15:23:22 +010035#include <linux/ceph/striper.h>
Yehuda Sadeh602adf42010-08-12 16:11:25 -070036#include <linux/ceph/decode.h>
Yehuda Sadeh59c2be12011-03-21 15:10:11 -070037#include <linux/parser.h>
Alex Elder30d1cff2013-05-01 12:43:03 -050038#include <linux/bsearch.h>
Yehuda Sadeh602adf42010-08-12 16:11:25 -070039
40#include <linux/kernel.h>
41#include <linux/device.h>
42#include <linux/module.h>
Christoph Hellwig7ad18af2015-01-13 17:20:04 +010043#include <linux/blk-mq.h>
Yehuda Sadeh602adf42010-08-12 16:11:25 -070044#include <linux/fs.h>
45#include <linux/blkdev.h>
Alex Elder1c2a9df2013-05-01 12:43:03 -050046#include <linux/slab.h>
Ilya Dryomovf8a22fc2013-12-13 15:28:57 +020047#include <linux/idr.h>
Ilya Dryomovbc1ecc62014-08-04 18:04:39 +040048#include <linux/workqueue.h>
Yehuda Sadeh602adf42010-08-12 16:11:25 -070049
50#include "rbd_types.h"
51
/* Defining RBD_DEBUG turns rbd_assert() into a real check */
#define RBD_DEBUG
53
Alex Elder593a9e72012-02-07 12:03:37 -060054/*
Alex Eldera2acd002013-05-08 22:50:04 -050055 * Increment the given counter and return its updated value.
56 * If the counter is already 0 it will not be incremented.
57 * If the counter is already at its maximum value returns
58 * -EINVAL without updating it.
59 */
60static int atomic_inc_return_safe(atomic_t *v)
61{
62 unsigned int counter;
63
Mark Rutlandbfc18e32018-06-21 13:13:04 +010064 counter = (unsigned int)atomic_fetch_add_unless(v, 1, 0);
Alex Eldera2acd002013-05-08 22:50:04 -050065 if (counter <= (unsigned int)INT_MAX)
66 return (int)counter;
67
68 atomic_dec(v);
69
70 return -EINVAL;
71}
72
73/* Decrement the counter. Return the resulting value, or -EINVAL */
74static int atomic_dec_return_safe(atomic_t *v)
75{
76 int counter;
77
78 counter = atomic_dec_return(v);
79 if (counter >= 0)
80 return counter;
81
82 atomic_inc(v);
83
84 return -EINVAL;
85}
86
#define RBD_DRV_NAME "rbd"

#define RBD_MINORS_PER_MAJOR		256
#define RBD_SINGLE_MAJOR_PART_SHIFT	4

#define RBD_MAX_PARENT_CHAIN_LEN	16

#define RBD_SNAP_DEV_NAME_PREFIX	"snap_"
/* NAME_MAX less the prefix length (sizeof minus the terminating NUL) */
#define RBD_MAX_SNAP_NAME_LEN	\
			(NAME_MAX - (sizeof (RBD_SNAP_DEV_NAME_PREFIX) - 1))

#define RBD_MAX_SNAP_COUNT	510	/* allows max snapc to fit in 4KB */

#define RBD_SNAP_HEAD_NAME	"-"

#define	BAD_SNAP_INDEX	U32_MAX		/* invalid index into snap array */

/* This allows a single page to hold an image name sent by OSD */
#define RBD_IMAGE_NAME_LEN_MAX	(PAGE_SIZE - sizeof (__le32) - 1)
#define RBD_IMAGE_ID_LEN_MAX	64

#define RBD_OBJ_PREFIX_LEN_MAX	64

#define RBD_NOTIFY_TIMEOUT	5	/* seconds */
#define RBD_RETRY_DELAY		msecs_to_jiffies(1000)

112
/* Feature bits */

#define RBD_FEATURE_LAYERING		(1ULL<<0)
#define RBD_FEATURE_STRIPINGV2		(1ULL<<1)
#define RBD_FEATURE_EXCLUSIVE_LOCK	(1ULL<<2)
#define RBD_FEATURE_DEEP_FLATTEN	(1ULL<<5)
#define RBD_FEATURE_DATA_POOL		(1ULL<<7)
#define RBD_FEATURE_OPERATIONS		(1ULL<<8)

/* Union of every feature bit defined above */
#define RBD_FEATURES_ALL	(RBD_FEATURE_LAYERING |		\
				 RBD_FEATURE_STRIPINGV2 |	\
				 RBD_FEATURE_EXCLUSIVE_LOCK |	\
				 RBD_FEATURE_DEEP_FLATTEN |	\
				 RBD_FEATURE_DATA_POOL |	\
				 RBD_FEATURE_OPERATIONS)

/* Features supported by this (client software) implementation. */

#define RBD_FEATURES_SUPPORTED	(RBD_FEATURES_ALL)

/*
 * An RBD device name will be "rbd#", where the "rbd" comes from
 * RBD_DRV_NAME above, and # is a unique integer identifier.
 */
#define DEV_NAME_LEN		32
138
139/*
140 * block device image metadata (in-memory version)
141 */
142struct rbd_image_header {
Alex Elderf35a4de2013-05-06 09:51:29 -0500143 /* These six fields never change for a given rbd image */
Alex Elder849b4262012-07-09 21:04:24 -0500144 char *object_prefix;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700145 __u8 obj_order;
Alex Elderf35a4de2013-05-06 09:51:29 -0500146 u64 stripe_unit;
147 u64 stripe_count;
Ilya Dryomov7e973322017-01-25 18:16:22 +0100148 s64 data_pool_id;
Alex Elderf35a4de2013-05-06 09:51:29 -0500149 u64 features; /* Might be changeable someday? */
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700150
Alex Elderf84344f2012-08-31 17:29:51 -0500151 /* The remaining fields need to be updated occasionally */
152 u64 image_size;
153 struct ceph_snap_context *snapc;
Alex Elderf35a4de2013-05-06 09:51:29 -0500154 char *snap_names; /* format 1 only */
155 u64 *snap_sizes; /* format 1 only */
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700156};
157
Alex Elder0d7dbfc2012-10-25 23:34:41 -0500158/*
159 * An rbd image specification.
160 *
161 * The tuple (pool_id, image_id, snap_id) is sufficient to uniquely
Alex Elderc66c6e02012-11-01 08:39:26 -0500162 * identify an image. Each rbd_dev structure includes a pointer to
163 * an rbd_spec structure that encapsulates this identity.
164 *
165 * Each of the id's in an rbd_spec has an associated name. For a
166 * user-mapped image, the names are supplied and the id's associated
167 * with them are looked up. For a layered image, a parent image is
168 * defined by the tuple, and the names are looked up.
169 *
170 * An rbd_dev structure contains a parent_spec pointer which is
171 * non-null if the image it represents is a child in a layered
172 * image. This pointer will refer to the rbd_spec structure used
173 * by the parent rbd_dev for its own identity (i.e., the structure
174 * is shared between the parent and child).
175 *
176 * Since these structures are populated once, during the discovery
177 * phase of image construction, they are effectively immutable so
178 * we make no effort to synchronize access to them.
179 *
180 * Note that code herein does not assume the image name is known (it
181 * could be a null pointer).
Alex Elder0d7dbfc2012-10-25 23:34:41 -0500182 */
183struct rbd_spec {
184 u64 pool_id;
Alex Elderecb4dc222013-04-26 09:43:47 -0500185 const char *pool_name;
Ilya Dryomovb26c0472018-07-03 15:28:43 +0200186 const char *pool_ns; /* NULL if default, never "" */
Alex Elder0d7dbfc2012-10-25 23:34:41 -0500187
Alex Elderecb4dc222013-04-26 09:43:47 -0500188 const char *image_id;
189 const char *image_name;
Alex Elder0d7dbfc2012-10-25 23:34:41 -0500190
191 u64 snap_id;
Alex Elderecb4dc222013-04-26 09:43:47 -0500192 const char *snap_name;
Alex Elder0d7dbfc2012-10-25 23:34:41 -0500193
194 struct kref kref;
195};
196
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700197/*
Alex Elderf0f8cef2012-01-29 13:57:44 -0600198 * an instance of the client. multiple devices may share an rbd client.
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700199 */
200struct rbd_client {
201 struct ceph_client *client;
202 struct kref kref;
203 struct list_head node;
204};
205
Alex Elderbf0d5f502012-11-22 00:00:08 -0600206struct rbd_img_request;
Alex Elderbf0d5f502012-11-22 00:00:08 -0600207
/* How the data buffer of an object request is described (values start at 1) */
enum obj_request_type {
	OBJ_REQUEST_NODATA = 1,
	OBJ_REQUEST_BIO,	/* pointer into provided bio (list) */
	OBJ_REQUEST_BVECS,	/* pointer into provided bio_vec array */
	OBJ_REQUEST_OWN_BVECS,	/* private bio_vec array, doesn't own pages */
};
Alex Elderbf0d5f502012-11-22 00:00:08 -0600214
/* Kind of operation an image request carries out (values start at 1) */
enum obj_operation_type {
	OBJ_OP_READ = 1,
	OBJ_OP_WRITE,
	OBJ_OP_DISCARD,
	OBJ_OP_ZEROOUT,
};

221
/*
 * Writes go through the following state machine to deal with
 * layering:
 *
 *            . . . . . RBD_OBJ_WRITE_GUARD. . . . . . . . . . . . . .
 *            .                 |                                    .
 *            .                 v                                    .
 *            .    RBD_OBJ_WRITE_READ_FROM_PARENT. . .               .
 *            .                 |                    .               .
 *            .                 v                    v (deep-copyup  .
 *    (image  .   RBD_OBJ_WRITE_COPYUP_EMPTY_SNAPC   .  not needed)  .
 * flattened) v                 |                    .               .
 *            .                 v                    .               .
 *            . . . .RBD_OBJ_WRITE_COPYUP_OPS. . . . .      (copyup  .
 *                              |                        not needed) v
 *                              v                                    .
 *                            done . . . . . . . . . . . . . . . . . .
 *                              ^
 *                              |
 *                     RBD_OBJ_WRITE_FLAT
 *
 * Writes start in RBD_OBJ_WRITE_GUARD or _FLAT, depending on whether
 * assert_exists guard is needed or not (in some cases it's not needed
 * even if there is a parent).
 */
enum rbd_obj_write_state {
	RBD_OBJ_WRITE_FLAT = 1,			/* no guard needed */
	RBD_OBJ_WRITE_GUARD,			/* guarded write issued */
	RBD_OBJ_WRITE_READ_FROM_PARENT,
	RBD_OBJ_WRITE_COPYUP_EMPTY_SNAPC,
	RBD_OBJ_WRITE_COPYUP_OPS,
};

254
Alex Elderbf0d5f502012-11-22 00:00:08 -0600255struct rbd_obj_request {
Ilya Dryomov43df3d32018-02-02 15:23:22 +0100256 struct ceph_object_extent ex;
Alex Elderc5b5ef62013-02-11 12:33:24 -0600257 union {
Ilya Dryomov3da691b2018-01-29 14:04:08 +0100258 bool tried_parent; /* for reads */
259 enum rbd_obj_write_state write_state; /* for writes */
260 };
Alex Elderbf0d5f502012-11-22 00:00:08 -0600261
Ilya Dryomov51c35092018-01-29 14:04:08 +0100262 struct rbd_img_request *img_request;
Ilya Dryomov86bd7992018-02-06 19:26:33 +0100263 struct ceph_file_extent *img_extents;
264 u32 num_img_extents;
Alex Elderbf0d5f502012-11-22 00:00:08 -0600265
Alex Elder788e2df2013-01-17 12:25:27 -0600266 union {
Ilya Dryomov5359a172018-01-20 10:30:10 +0100267 struct ceph_bio_iter bio_pos;
Alex Elderbf0d5f502012-11-22 00:00:08 -0600268 struct {
Ilya Dryomov7e07efb2018-01-20 10:30:11 +0100269 struct ceph_bvec_iter bvec_pos;
270 u32 bvec_count;
Ilya Dryomovafb97882018-02-06 19:26:35 +0100271 u32 bvec_idx;
Alex Elderbf0d5f502012-11-22 00:00:08 -0600272 };
273 };
Ilya Dryomov7e07efb2018-01-20 10:30:11 +0100274 struct bio_vec *copyup_bvecs;
275 u32 copyup_bvec_count;
Alex Elderbf0d5f502012-11-22 00:00:08 -0600276
277 struct ceph_osd_request *osd_req;
278
279 u64 xferred; /* bytes transferred */
Sage Weil1b83bef2013-02-25 16:11:12 -0800280 int result;
Alex Elderbf0d5f502012-11-22 00:00:08 -0600281
Alex Elderbf0d5f502012-11-22 00:00:08 -0600282 struct kref kref;
283};
284
/* Bit numbers used in rbd_img_request->flags */
enum img_req_flags {
	IMG_REQ_CHILD,		/* initiator: block = 0, child image = 1 */
	IMG_REQ_LAYERED,	/* ENOENT handling: normal = 0, layered = 1 */
};

289
Alex Elderbf0d5f502012-11-22 00:00:08 -0600290struct rbd_img_request {
Alex Elderbf0d5f502012-11-22 00:00:08 -0600291 struct rbd_device *rbd_dev;
Ilya Dryomov9bb02482018-01-30 17:52:10 +0100292 enum obj_operation_type op_type;
Ilya Dryomovecc633c2018-02-01 11:50:47 +0100293 enum obj_request_type data_type;
Alex Elder0c425242013-02-08 09:55:49 -0600294 unsigned long flags;
Alex Elderbf0d5f502012-11-22 00:00:08 -0600295 union {
Alex Elder9849e982013-01-24 16:13:36 -0600296 u64 snap_id; /* for reads */
Alex Elderbf0d5f502012-11-22 00:00:08 -0600297 struct ceph_snap_context *snapc; /* for writes */
Alex Elder9849e982013-01-24 16:13:36 -0600298 };
299 union {
300 struct request *rq; /* block request */
301 struct rbd_obj_request *obj_request; /* obj req initiator */
Alex Elderbf0d5f502012-11-22 00:00:08 -0600302 };
Ilya Dryomov15961b42018-02-01 11:50:47 +0100303 spinlock_t completion_lock;
Alex Elder55f27e02013-04-10 12:34:25 -0500304 u64 xferred;/* aggregate bytes transferred */
Alex Eldera5a337d2013-01-24 16:13:36 -0600305 int result; /* first nonzero obj_request result */
Alex Elderbf0d5f502012-11-22 00:00:08 -0600306
Ilya Dryomov43df3d32018-02-02 15:23:22 +0100307 struct list_head object_extents; /* obj_req.ex structs */
Ilya Dryomov7114eda2018-02-01 11:50:47 +0100308 u32 pending_count;
Alex Elderbf0d5f502012-11-22 00:00:08 -0600309
310 struct kref kref;
311};
312
/* Walk the object requests of @ireq; the _safe form tolerates removal */
#define for_each_obj_request(ireq, oreq) \
	list_for_each_entry(oreq, &(ireq)->object_extents, ex.oe_item)
#define for_each_obj_request_safe(ireq, oreq, n) \
	list_for_each_entry_safe(oreq, n, &(ireq)->object_extents, ex.oe_item)

Alex Elderbf0d5f502012-11-22 00:00:08 -0600317
/* Values of rbd_device->watch_state */
enum rbd_watch_state {
	RBD_WATCH_STATE_UNREGISTERED,
	RBD_WATCH_STATE_REGISTERED,
	RBD_WATCH_STATE_ERROR,
};

323
/* Values of rbd_device->lock_state */
enum rbd_lock_state {
	RBD_LOCK_STATE_UNLOCKED,
	RBD_LOCK_STATE_LOCKED,
	RBD_LOCK_STATE_RELEASING,
};

329
330/* WatchNotify::ClientId */
331struct rbd_client_id {
332 u64 gid;
333 u64 handle;
334};
335
Alex Elderf84344f2012-08-31 17:29:51 -0500336struct rbd_mapping {
Alex Elder99c1f082012-08-30 14:42:15 -0500337 u64 size;
Alex Elder34b13182012-07-13 20:35:12 -0500338 u64 features;
Alex Elderf84344f2012-08-31 17:29:51 -0500339};
340
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700341/*
342 * a single device
343 */
344struct rbd_device {
Alex Elderde71a292012-07-03 16:01:19 -0500345 int dev_id; /* blkdev unique id */
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700346
347 int major; /* blkdev assigned major */
Ilya Dryomovdd82fff2013-12-13 15:28:57 +0200348 int minor;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700349 struct gendisk *disk; /* blkdev's gendisk and rq */
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700350
Alex Eldera30b71b2012-07-10 20:30:11 -0500351 u32 image_format; /* Either 1 or 2 */
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700352 struct rbd_client *rbd_client;
353
354 char name[DEV_NAME_LEN]; /* blkdev name, e.g. rbd3 */
355
Alex Elderb82d1672013-01-14 12:43:31 -0600356 spinlock_t lock; /* queue, flags, open_count */
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700357
358 struct rbd_image_header header;
Alex Elderb82d1672013-01-14 12:43:31 -0600359 unsigned long flags; /* possibly lock protected */
Alex Elder0d7dbfc2012-10-25 23:34:41 -0500360 struct rbd_spec *spec;
Ilya Dryomovd1475432015-06-22 13:24:48 +0300361 struct rbd_options *opts;
Mike Christie0d6d1e9c2016-08-18 18:38:45 +0200362 char *config_info; /* add{,_single_major} string */
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700363
Ilya Dryomovc41d13a2016-04-29 20:01:25 +0200364 struct ceph_object_id header_oid;
Ilya Dryomov922dab62016-05-26 01:15:02 +0200365 struct ceph_object_locator header_oloc;
Alex Elder971f8392012-10-25 23:34:41 -0500366
Ilya Dryomov1643dfa2016-08-12 15:45:52 +0200367 struct ceph_file_layout layout; /* used for all rbd requests */
Alex Elder0903e872012-11-14 12:25:19 -0600368
Ilya Dryomov99d16942016-08-12 16:11:41 +0200369 struct mutex watch_mutex;
370 enum rbd_watch_state watch_state;
Ilya Dryomov922dab62016-05-26 01:15:02 +0200371 struct ceph_osd_linger_request *watch_handle;
Ilya Dryomov99d16942016-08-12 16:11:41 +0200372 u64 watch_cookie;
373 struct delayed_work watch_dwork;
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700374
Ilya Dryomoved95b212016-08-12 16:40:02 +0200375 struct rw_semaphore lock_rwsem;
376 enum rbd_lock_state lock_state;
Ilya Dryomovcbbfb0f2017-04-13 12:17:38 +0200377 char lock_cookie[32];
Ilya Dryomoved95b212016-08-12 16:40:02 +0200378 struct rbd_client_id owner_cid;
379 struct work_struct acquired_lock_work;
380 struct work_struct released_lock_work;
381 struct delayed_work lock_dwork;
382 struct work_struct unlock_work;
383 wait_queue_head_t lock_waitq;
384
Ilya Dryomov1643dfa2016-08-12 15:45:52 +0200385 struct workqueue_struct *task_wq;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700386
Alex Elder86b00e02012-10-25 23:34:42 -0500387 struct rbd_spec *parent_spec;
388 u64 parent_overlap;
Alex Eldera2acd002013-05-08 22:50:04 -0500389 atomic_t parent_ref;
Alex Elder2f82ee52012-10-30 19:40:33 -0500390 struct rbd_device *parent;
Alex Elder86b00e02012-10-25 23:34:42 -0500391
Christoph Hellwig7ad18af2015-01-13 17:20:04 +0100392 /* Block layer tags. */
393 struct blk_mq_tag_set tag_set;
394
Josh Durginc6666012011-11-21 17:11:12 -0800395 /* protects updating the header */
396 struct rw_semaphore header_rwsem;
Alex Elderf84344f2012-08-31 17:29:51 -0500397
398 struct rbd_mapping mapping;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700399
400 struct list_head node;
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800401
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800402 /* sysfs related */
403 struct device dev;
Alex Elderb82d1672013-01-14 12:43:31 -0600404 unsigned long open_count; /* protected by lock */
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800405};
406
/*
 * Flag bits for rbd_dev->flags:
 * - REMOVING (which is coupled with rbd_dev->open_count) is protected
 *   by rbd_dev->lock
 * - BLACKLISTED is protected by rbd_dev->lock_rwsem
 */
enum rbd_dev_flags {
	RBD_DEV_FLAG_EXISTS,	/* mapped snapshot has not been deleted */
	RBD_DEV_FLAG_REMOVING,	/* this mapping is being removed */
	RBD_DEV_FLAG_BLACKLISTED, /* our ceph_client is blacklisted */
};

418
Alex Eldercfbf6372013-05-31 17:40:45 -0500419static DEFINE_MUTEX(client_mutex); /* Serialize client creation */
Alex Eldere124a82f2012-01-29 13:57:44 -0600420
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700421static LIST_HEAD(rbd_dev_list); /* devices */
Alex Eldere124a82f2012-01-29 13:57:44 -0600422static DEFINE_SPINLOCK(rbd_dev_list_lock);
423
Alex Elder432b8582012-01-29 13:57:44 -0600424static LIST_HEAD(rbd_client_list); /* clients */
425static DEFINE_SPINLOCK(rbd_client_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700426
Alex Elder78c2a442013-05-01 12:43:04 -0500427/* Slab caches for frequently-allocated structures */
428
Alex Elder1c2a9df2013-05-01 12:43:03 -0500429static struct kmem_cache *rbd_img_request_cache;
Alex Elder868311b2013-05-01 12:43:03 -0500430static struct kmem_cache *rbd_obj_request_cache;
Alex Elder1c2a9df2013-05-01 12:43:03 -0500431
Ilya Dryomov9b60e702013-12-13 15:28:57 +0200432static int rbd_major;
Ilya Dryomovf8a22fc2013-12-13 15:28:57 +0200433static DEFINE_IDA(rbd_dev_id_ida);
434
Ilya Dryomovf5ee37b2014-10-09 17:06:01 +0400435static struct workqueue_struct *rbd_wq;
436
Ilya Dryomov89a59c12019-02-28 14:20:28 +0100437static struct ceph_snap_context rbd_empty_snapc = {
438 .nref = REFCOUNT_INIT(1),
439};
440
Ilya Dryomov9b60e702013-12-13 15:28:57 +0200441/*
Ilya Dryomov3cfa3b12017-11-13 10:35:40 +0100442 * single-major requires >= 0.75 version of userspace rbd utility.
Ilya Dryomov9b60e702013-12-13 15:28:57 +0200443 */
Ilya Dryomov3cfa3b12017-11-13 10:35:40 +0100444static bool single_major = true;
Joe Perches5657a812018-05-24 13:38:59 -0600445module_param(single_major, bool, 0444);
Ilya Dryomov3cfa3b12017-11-13 10:35:40 +0100446MODULE_PARM_DESC(single_major, "Use a single major number for all rbd devices (default: true)");
Ilya Dryomov9b60e702013-12-13 15:28:57 +0200447
Greg Kroah-Hartman7e9586b2018-12-21 08:54:38 +0100448static ssize_t add_store(struct bus_type *bus, const char *buf, size_t count);
449static ssize_t remove_store(struct bus_type *bus, const char *buf,
450 size_t count);
451static ssize_t add_single_major_store(struct bus_type *bus, const char *buf,
452 size_t count);
453static ssize_t remove_single_major_store(struct bus_type *bus, const char *buf,
454 size_t count);
Ilya Dryomov6d69bb532015-10-11 19:38:00 +0200455static int rbd_dev_image_probe(struct rbd_device *rbd_dev, int depth);
Alex Elderf0f8cef2012-01-29 13:57:44 -0600456
Ilya Dryomov9b60e702013-12-13 15:28:57 +0200457static int rbd_dev_id_to_minor(int dev_id)
458{
Ilya Dryomov7e513d42013-12-16 19:26:32 +0200459 return dev_id << RBD_SINGLE_MAJOR_PART_SHIFT;
Ilya Dryomov9b60e702013-12-13 15:28:57 +0200460}
461
462static int minor_to_rbd_dev_id(int minor)
463{
Ilya Dryomov7e513d42013-12-16 19:26:32 +0200464 return minor >> RBD_SINGLE_MAJOR_PART_SHIFT;
Ilya Dryomov9b60e702013-12-13 15:28:57 +0200465}
466
Ilya Dryomoved95b212016-08-12 16:40:02 +0200467static bool __rbd_is_lock_owner(struct rbd_device *rbd_dev)
468{
469 return rbd_dev->lock_state == RBD_LOCK_STATE_LOCKED ||
470 rbd_dev->lock_state == RBD_LOCK_STATE_RELEASING;
471}
472
473static bool rbd_is_lock_owner(struct rbd_device *rbd_dev)
474{
475 bool is_lock_owner;
476
477 down_read(&rbd_dev->lock_rwsem);
478 is_lock_owner = __rbd_is_lock_owner(rbd_dev);
479 up_read(&rbd_dev->lock_rwsem);
480 return is_lock_owner;
481}
482
Greg Kroah-Hartman7e9586b2018-12-21 08:54:38 +0100483static ssize_t supported_features_show(struct bus_type *bus, char *buf)
Ilya Dryomov8767b292017-03-02 19:56:57 +0100484{
485 return sprintf(buf, "0x%llx\n", RBD_FEATURES_SUPPORTED);
486}
487
Greg Kroah-Hartman7e9586b2018-12-21 08:54:38 +0100488static BUS_ATTR_WO(add);
489static BUS_ATTR_WO(remove);
490static BUS_ATTR_WO(add_single_major);
491static BUS_ATTR_WO(remove_single_major);
492static BUS_ATTR_RO(supported_features);
Greg Kroah-Hartmanb15a21d2013-08-23 14:24:28 -0700493
494static struct attribute *rbd_bus_attrs[] = {
495 &bus_attr_add.attr,
496 &bus_attr_remove.attr,
Ilya Dryomov9b60e702013-12-13 15:28:57 +0200497 &bus_attr_add_single_major.attr,
498 &bus_attr_remove_single_major.attr,
Ilya Dryomov8767b292017-03-02 19:56:57 +0100499 &bus_attr_supported_features.attr,
Greg Kroah-Hartmanb15a21d2013-08-23 14:24:28 -0700500 NULL,
Alex Elderf0f8cef2012-01-29 13:57:44 -0600501};
Ilya Dryomov92c76dc2013-12-13 15:28:57 +0200502
503static umode_t rbd_bus_is_visible(struct kobject *kobj,
504 struct attribute *attr, int index)
505{
Ilya Dryomov9b60e702013-12-13 15:28:57 +0200506 if (!single_major &&
507 (attr == &bus_attr_add_single_major.attr ||
508 attr == &bus_attr_remove_single_major.attr))
509 return 0;
510
Ilya Dryomov92c76dc2013-12-13 15:28:57 +0200511 return attr->mode;
512}
513
514static const struct attribute_group rbd_bus_group = {
515 .attrs = rbd_bus_attrs,
516 .is_visible = rbd_bus_is_visible,
517};
518__ATTRIBUTE_GROUPS(rbd_bus);
Alex Elderf0f8cef2012-01-29 13:57:44 -0600519
520static struct bus_type rbd_bus_type = {
521 .name = "rbd",
Greg Kroah-Hartmanb15a21d2013-08-23 14:24:28 -0700522 .bus_groups = rbd_bus_groups,
Alex Elderf0f8cef2012-01-29 13:57:44 -0600523};
524
/* Release callback for rbd_root_dev; deliberately does nothing */
static void rbd_root_dev_release(struct device *dev)
{
}
528
529static struct device rbd_root_dev = {
530 .init_name = "rbd",
531 .release = rbd_root_dev_release,
532};
533
Alex Elder06ecc6c2012-11-01 10:17:15 -0500534static __printf(2, 3)
535void rbd_warn(struct rbd_device *rbd_dev, const char *fmt, ...)
536{
537 struct va_format vaf;
538 va_list args;
539
540 va_start(args, fmt);
541 vaf.fmt = fmt;
542 vaf.va = &args;
543
544 if (!rbd_dev)
545 printk(KERN_WARNING "%s: %pV\n", RBD_DRV_NAME, &vaf);
546 else if (rbd_dev->disk)
547 printk(KERN_WARNING "%s: %s: %pV\n",
548 RBD_DRV_NAME, rbd_dev->disk->disk_name, &vaf);
549 else if (rbd_dev->spec && rbd_dev->spec->image_name)
550 printk(KERN_WARNING "%s: image %s: %pV\n",
551 RBD_DRV_NAME, rbd_dev->spec->image_name, &vaf);
552 else if (rbd_dev->spec && rbd_dev->spec->image_id)
553 printk(KERN_WARNING "%s: id %s: %pV\n",
554 RBD_DRV_NAME, rbd_dev->spec->image_id, &vaf);
555 else /* punt */
556 printk(KERN_WARNING "%s: rbd_dev %p: %pV\n",
557 RBD_DRV_NAME, rbd_dev, &vaf);
558 va_end(args);
559}
560
/*
 * rbd_assert(expr) - log a diagnostic and BUG() if @expr is false.
 *
 * Only active when RBD_DEBUG is defined; otherwise it expands to a
 * no-op expression and @expr is not evaluated.  The active form is
 * wrapped in do { } while (0) so it behaves as a single statement
 * (safe as the body of an unbraced if/else).
 */
#ifdef RBD_DEBUG
#define rbd_assert(expr)						\
	do {								\
		if (unlikely(!(expr))) {				\
			printk(KERN_ERR "\nAssertion failure in %s() "	\
						"at line %d:\n\n"	\
					"\trbd_assert(%s);\n\n",	\
					__func__, __LINE__, #expr);	\
			BUG();						\
		}							\
	} while (0)
#else /* !RBD_DEBUG */
#  define rbd_assert(expr)	((void) 0)
#endif /* !RBD_DEBUG */

Yehuda Sadehdfc56062010-11-19 14:51:04 -0800573
Alex Elder05a46af2013-04-26 15:44:36 -0500574static void rbd_dev_remove_parent(struct rbd_device *rbd_dev);
Alex Elder8b3e1a52013-01-24 16:13:36 -0600575
Alex Eldercc4a38bd2013-04-30 00:44:33 -0500576static int rbd_dev_refresh(struct rbd_device *rbd_dev);
Alex Elder2df3fac2013-05-06 09:51:30 -0500577static int rbd_dev_v2_header_onetime(struct rbd_device *rbd_dev);
Ilya Dryomova720ae02014-07-23 17:11:19 +0400578static int rbd_dev_header_info(struct rbd_device *rbd_dev);
Ilya Dryomove8f59b52014-07-24 10:42:13 +0400579static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev);
Alex Elder54cac612013-04-30 00:44:33 -0500580static const char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev,
581 u64 snap_id);
Alex Elder2ad3d712013-04-30 00:44:33 -0500582static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id,
583 u8 *order, u64 *snap_size);
584static int _rbd_dev_v2_snap_features(struct rbd_device *rbd_dev, u64 snap_id,
585 u64 *snap_features);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700586
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700587static int rbd_open(struct block_device *bdev, fmode_t mode)
588{
Alex Elderf0f8cef2012-01-29 13:57:44 -0600589 struct rbd_device *rbd_dev = bdev->bd_disk->private_data;
Alex Elderb82d1672013-01-14 12:43:31 -0600590 bool removing = false;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700591
Alex Eldera14ea262013-02-05 13:23:12 -0600592 spin_lock_irq(&rbd_dev->lock);
Alex Elderb82d1672013-01-14 12:43:31 -0600593 if (test_bit(RBD_DEV_FLAG_REMOVING, &rbd_dev->flags))
594 removing = true;
595 else
596 rbd_dev->open_count++;
Alex Eldera14ea262013-02-05 13:23:12 -0600597 spin_unlock_irq(&rbd_dev->lock);
Alex Elderb82d1672013-01-14 12:43:31 -0600598 if (removing)
599 return -ENOENT;
600
Alex Elderc3e946c2012-11-16 09:29:16 -0600601 (void) get_device(&rbd_dev->dev);
Alex Elder340c7a22012-08-10 13:12:07 -0700602
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700603 return 0;
604}
605
Al Virodb2a1442013-05-05 21:52:57 -0400606static void rbd_release(struct gendisk *disk, fmode_t mode)
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800607{
608 struct rbd_device *rbd_dev = disk->private_data;
Alex Elderb82d1672013-01-14 12:43:31 -0600609 unsigned long open_count_before;
610
Alex Eldera14ea262013-02-05 13:23:12 -0600611 spin_lock_irq(&rbd_dev->lock);
Alex Elderb82d1672013-01-14 12:43:31 -0600612 open_count_before = rbd_dev->open_count--;
Alex Eldera14ea262013-02-05 13:23:12 -0600613 spin_unlock_irq(&rbd_dev->lock);
Alex Elderb82d1672013-01-14 12:43:31 -0600614 rbd_assert(open_count_before > 0);
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800615
Alex Elderc3e946c2012-11-16 09:29:16 -0600616 put_device(&rbd_dev->dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800617}
618
Guangliang Zhao131fd9f2013-09-24 11:25:36 +0800619static int rbd_ioctl_set_ro(struct rbd_device *rbd_dev, unsigned long arg)
620{
Ilya Dryomov1de797b2017-10-12 12:35:19 +0200621 int ro;
Guangliang Zhao131fd9f2013-09-24 11:25:36 +0800622
Ilya Dryomov1de797b2017-10-12 12:35:19 +0200623 if (get_user(ro, (int __user *)arg))
Guangliang Zhao131fd9f2013-09-24 11:25:36 +0800624 return -EFAULT;
625
Ilya Dryomov1de797b2017-10-12 12:35:19 +0200626 /* Snapshots can't be marked read-write */
Guangliang Zhao131fd9f2013-09-24 11:25:36 +0800627 if (rbd_dev->spec->snap_id != CEPH_NOSNAP && !ro)
628 return -EROFS;
629
Ilya Dryomov1de797b2017-10-12 12:35:19 +0200630 /* Let blkdev_roset() handle it */
631 return -ENOTTY;
Guangliang Zhao131fd9f2013-09-24 11:25:36 +0800632}
633
634static int rbd_ioctl(struct block_device *bdev, fmode_t mode,
635 unsigned int cmd, unsigned long arg)
636{
637 struct rbd_device *rbd_dev = bdev->bd_disk->private_data;
Ilya Dryomov1de797b2017-10-12 12:35:19 +0200638 int ret;
Guangliang Zhao131fd9f2013-09-24 11:25:36 +0800639
Guangliang Zhao131fd9f2013-09-24 11:25:36 +0800640 switch (cmd) {
641 case BLKROSET:
642 ret = rbd_ioctl_set_ro(rbd_dev, arg);
643 break;
644 default:
645 ret = -ENOTTY;
646 }
647
Guangliang Zhao131fd9f2013-09-24 11:25:36 +0800648 return ret;
649}
650
#ifdef CONFIG_COMPAT
/* Compat ioctl: forwards unchanged to rbd_ioctl() */
static int rbd_compat_ioctl(struct block_device *bdev, fmode_t mode,
				unsigned int cmd, unsigned long arg)
{
	int ret = rbd_ioctl(bdev, mode, cmd, arg);

	return ret;
}
#endif /* CONFIG_COMPAT */

658
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700659static const struct block_device_operations rbd_bd_ops = {
660 .owner = THIS_MODULE,
661 .open = rbd_open,
Yehuda Sadehdfc56062010-11-19 14:51:04 -0800662 .release = rbd_release,
Guangliang Zhao131fd9f2013-09-24 11:25:36 +0800663 .ioctl = rbd_ioctl,
664#ifdef CONFIG_COMPAT
665 .compat_ioctl = rbd_compat_ioctl,
666#endif
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700667};
668
669/*
Alex Elder7262cfc2013-05-16 15:04:20 -0500670 * Initialize an rbd client instance. Success or not, this function
Alex Eldercfbf6372013-05-31 17:40:45 -0500671 * consumes ceph_opts. Caller holds client_mutex.
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700672 */
Alex Elderf8c38922012-08-10 13:12:07 -0700673static struct rbd_client *rbd_client_create(struct ceph_options *ceph_opts)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700674{
675 struct rbd_client *rbdc;
676 int ret = -ENOMEM;
677
Alex Elder37206ee2013-02-20 17:32:08 -0600678 dout("%s:\n", __func__);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700679 rbdc = kmalloc(sizeof(struct rbd_client), GFP_KERNEL);
680 if (!rbdc)
681 goto out_opt;
682
683 kref_init(&rbdc->kref);
684 INIT_LIST_HEAD(&rbdc->node);
685
Ilya Dryomov74da4a0f2017-03-03 18:16:07 +0100686 rbdc->client = ceph_create_client(ceph_opts, rbdc);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700687 if (IS_ERR(rbdc->client))
Alex Elder08f75462013-05-29 11:19:00 -0500688 goto out_rbdc;
Alex Elder43ae4702012-07-03 16:01:18 -0500689 ceph_opts = NULL; /* Now rbdc->client is responsible for ceph_opts */
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700690
691 ret = ceph_open_session(rbdc->client);
692 if (ret < 0)
Alex Elder08f75462013-05-29 11:19:00 -0500693 goto out_client;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700694
Alex Elder432b8582012-01-29 13:57:44 -0600695 spin_lock(&rbd_client_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700696 list_add_tail(&rbdc->node, &rbd_client_list);
Alex Elder432b8582012-01-29 13:57:44 -0600697 spin_unlock(&rbd_client_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700698
Alex Elder37206ee2013-02-20 17:32:08 -0600699 dout("%s: rbdc %p\n", __func__, rbdc);
Alex Elderbc534d82012-01-29 13:57:44 -0600700
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700701 return rbdc;
Alex Elder08f75462013-05-29 11:19:00 -0500702out_client:
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700703 ceph_destroy_client(rbdc->client);
Alex Elder08f75462013-05-29 11:19:00 -0500704out_rbdc:
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700705 kfree(rbdc);
706out_opt:
Alex Elder43ae4702012-07-03 16:01:18 -0500707 if (ceph_opts)
708 ceph_destroy_options(ceph_opts);
Alex Elder37206ee2013-02-20 17:32:08 -0600709 dout("%s: error %d\n", __func__, ret);
710
Vasiliy Kulikov28f259b2010-09-26 12:59:37 +0400711 return ERR_PTR(ret);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700712}
713
Alex Elder2f82ee52012-10-30 19:40:33 -0500714static struct rbd_client *__rbd_get_client(struct rbd_client *rbdc)
715{
716 kref_get(&rbdc->kref);
717
718 return rbdc;
719}
720
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700721/*
Alex Elder1f7ba332012-08-10 13:12:07 -0700722 * Find a ceph client with specific addr and configuration. If
723 * found, bump its reference count.
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700724 */
Alex Elder1f7ba332012-08-10 13:12:07 -0700725static struct rbd_client *rbd_client_find(struct ceph_options *ceph_opts)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700726{
727 struct rbd_client *client_node;
Alex Elder1f7ba332012-08-10 13:12:07 -0700728 bool found = false;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700729
Alex Elder43ae4702012-07-03 16:01:18 -0500730 if (ceph_opts->flags & CEPH_OPT_NOSHARE)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700731 return NULL;
732
Alex Elder1f7ba332012-08-10 13:12:07 -0700733 spin_lock(&rbd_client_list_lock);
734 list_for_each_entry(client_node, &rbd_client_list, node) {
735 if (!ceph_compare_options(ceph_opts, client_node->client)) {
Alex Elder2f82ee52012-10-30 19:40:33 -0500736 __rbd_get_client(client_node);
737
Alex Elder1f7ba332012-08-10 13:12:07 -0700738 found = true;
739 break;
740 }
741 }
742 spin_unlock(&rbd_client_list_lock);
743
744 return found ? client_node : NULL;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700745}
746
/*
 * (Per device) rbd map options
 *
 * The order is significant: parse_rbd_opts_token() classifies a token
 * by comparing it against Opt_last_int and Opt_last_string, so integer
 * options must precede Opt_last_int and string options must sit
 * between Opt_last_int and Opt_last_string.
 */
enum {
	/* int args */
	Opt_queue_depth,
	Opt_alloc_size,
	Opt_lock_timeout,
	Opt_last_int,

	/* string args */
	Opt_pool_ns,
	Opt_last_string,

	/* flags without an argument */
	Opt_read_only,
	Opt_read_write,
	Opt_lock_on_read,
	Opt_exclusive,
	Opt_notrim,
	Opt_err
};
766
Alex Elder43ae4702012-07-03 16:01:18 -0500767static match_table_t rbd_opts_tokens = {
Ilya Dryomovb5584182015-06-23 16:21:19 +0300768 {Opt_queue_depth, "queue_depth=%d"},
Ilya Dryomov0c93e1b2019-01-30 15:14:48 +0100769 {Opt_alloc_size, "alloc_size=%d"},
Dongsheng Yang34f55d02018-03-26 10:22:55 -0400770 {Opt_lock_timeout, "lock_timeout=%d"},
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700771 /* int args above */
Ilya Dryomovb26c0472018-07-03 15:28:43 +0200772 {Opt_pool_ns, "_pool_ns=%s"},
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700773 /* string args above */
Alex Elderbe466c12012-10-22 11:31:26 -0500774 {Opt_read_only, "read_only"},
Alex Eldercc0538b2012-08-10 13:12:07 -0700775 {Opt_read_only, "ro"}, /* Alternate spelling */
776 {Opt_read_write, "read_write"},
777 {Opt_read_write, "rw"}, /* Alternate spelling */
Ilya Dryomov80de1912016-09-20 14:23:17 +0200778 {Opt_lock_on_read, "lock_on_read"},
Ilya Dryomove010dd02017-04-13 12:17:39 +0200779 {Opt_exclusive, "exclusive"},
Ilya Dryomovd9360542018-03-23 06:14:47 +0100780 {Opt_notrim, "notrim"},
Ilya Dryomov210c1042015-06-22 13:24:48 +0300781 {Opt_err, NULL}
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700782};
783
/* Parsed per-device map options; filled in by parse_rbd_opts_token(). */
struct rbd_options {
	int		queue_depth;
	int		alloc_size;
	unsigned long	lock_timeout;	/* stored in jiffies */
	bool		read_only;
	bool		lock_on_read;
	bool		exclusive;
	bool		trim;		/* cleared by "notrim" */
};
793
/* Default values for the members of struct rbd_options */
#define RBD_QUEUE_DEPTH_DEFAULT		BLKDEV_MAX_RQ
#define RBD_ALLOC_SIZE_DEFAULT		(64 * 1024)
#define RBD_LOCK_TIMEOUT_DEFAULT	0  /* no timeout */
#define RBD_READ_ONLY_DEFAULT		false
#define RBD_LOCK_ON_READ_DEFAULT	false
#define RBD_EXCLUSIVE_DEFAULT		false
#define RBD_TRIM_DEFAULT		true
Alex Elder98571b52013-01-20 14:44:42 -0600801
/*
 * Context passed as the private argument of parse_rbd_opts_token():
 * the spec receives pool_ns, opts receives everything else.
 */
struct parse_rbd_opts_ctx {
	struct rbd_spec		*spec;
	struct rbd_options	*opts;
};
806
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700807static int parse_rbd_opts_token(char *c, void *private)
808{
Ilya Dryomovc3001562018-07-03 15:28:43 +0200809 struct parse_rbd_opts_ctx *pctx = private;
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700810 substring_t argstr[MAX_OPT_ARGS];
811 int token, intval, ret;
812
Alex Elder43ae4702012-07-03 16:01:18 -0500813 token = match_token(c, rbd_opts_tokens, argstr);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700814 if (token < Opt_last_int) {
815 ret = match_int(&argstr[0], &intval);
816 if (ret < 0) {
Ilya Dryomov2f56b6b2018-06-27 16:38:13 +0200817 pr_err("bad option arg (not int) at '%s'\n", c);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700818 return ret;
819 }
820 dout("got int token %d val %d\n", token, intval);
821 } else if (token > Opt_last_int && token < Opt_last_string) {
Ilya Dryomov210c1042015-06-22 13:24:48 +0300822 dout("got string token %d val %s\n", token, argstr[0].from);
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700823 } else {
824 dout("got token %d\n", token);
825 }
826
827 switch (token) {
Ilya Dryomovb5584182015-06-23 16:21:19 +0300828 case Opt_queue_depth:
829 if (intval < 1) {
830 pr_err("queue_depth out of range\n");
831 return -EINVAL;
832 }
Ilya Dryomovc3001562018-07-03 15:28:43 +0200833 pctx->opts->queue_depth = intval;
Ilya Dryomovb5584182015-06-23 16:21:19 +0300834 break;
Ilya Dryomov0c93e1b2019-01-30 15:14:48 +0100835 case Opt_alloc_size:
Ilya Dryomov16d80c52019-03-15 14:50:04 +0100836 if (intval < SECTOR_SIZE) {
Ilya Dryomov0c93e1b2019-01-30 15:14:48 +0100837 pr_err("alloc_size out of range\n");
838 return -EINVAL;
839 }
840 if (!is_power_of_2(intval)) {
841 pr_err("alloc_size must be a power of 2\n");
842 return -EINVAL;
843 }
844 pctx->opts->alloc_size = intval;
845 break;
Dongsheng Yang34f55d02018-03-26 10:22:55 -0400846 case Opt_lock_timeout:
847 /* 0 is "wait forever" (i.e. infinite timeout) */
848 if (intval < 0 || intval > INT_MAX / 1000) {
849 pr_err("lock_timeout out of range\n");
850 return -EINVAL;
851 }
Ilya Dryomovc3001562018-07-03 15:28:43 +0200852 pctx->opts->lock_timeout = msecs_to_jiffies(intval * 1000);
Dongsheng Yang34f55d02018-03-26 10:22:55 -0400853 break;
Ilya Dryomovb26c0472018-07-03 15:28:43 +0200854 case Opt_pool_ns:
855 kfree(pctx->spec->pool_ns);
856 pctx->spec->pool_ns = match_strdup(argstr);
857 if (!pctx->spec->pool_ns)
858 return -ENOMEM;
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700859 break;
Alex Eldercc0538b2012-08-10 13:12:07 -0700860 case Opt_read_only:
Ilya Dryomovc3001562018-07-03 15:28:43 +0200861 pctx->opts->read_only = true;
Alex Eldercc0538b2012-08-10 13:12:07 -0700862 break;
863 case Opt_read_write:
Ilya Dryomovc3001562018-07-03 15:28:43 +0200864 pctx->opts->read_only = false;
Alex Eldercc0538b2012-08-10 13:12:07 -0700865 break;
Ilya Dryomov80de1912016-09-20 14:23:17 +0200866 case Opt_lock_on_read:
Ilya Dryomovc3001562018-07-03 15:28:43 +0200867 pctx->opts->lock_on_read = true;
Ilya Dryomov80de1912016-09-20 14:23:17 +0200868 break;
Ilya Dryomove010dd02017-04-13 12:17:39 +0200869 case Opt_exclusive:
Ilya Dryomovc3001562018-07-03 15:28:43 +0200870 pctx->opts->exclusive = true;
Ilya Dryomove010dd02017-04-13 12:17:39 +0200871 break;
Ilya Dryomovd9360542018-03-23 06:14:47 +0100872 case Opt_notrim:
Ilya Dryomovc3001562018-07-03 15:28:43 +0200873 pctx->opts->trim = false;
Ilya Dryomovd9360542018-03-23 06:14:47 +0100874 break;
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700875 default:
Ilya Dryomov210c1042015-06-22 13:24:48 +0300876 /* libceph prints "bad option" msg */
877 return -EINVAL;
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700878 }
Ilya Dryomov210c1042015-06-22 13:24:48 +0300879
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700880 return 0;
881}
882
Guangliang Zhao6d2940c2014-03-13 11:21:35 +0800883static char* obj_op_name(enum obj_operation_type op_type)
884{
885 switch (op_type) {
886 case OBJ_OP_READ:
887 return "read";
888 case OBJ_OP_WRITE:
889 return "write";
Guangliang Zhao90e98c52014-04-01 22:22:16 +0800890 case OBJ_OP_DISCARD:
891 return "discard";
Ilya Dryomov6484cbe2019-01-29 12:46:25 +0100892 case OBJ_OP_ZEROOUT:
893 return "zeroout";
Guangliang Zhao6d2940c2014-03-13 11:21:35 +0800894 default:
895 return "???";
896 }
897}
898
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700899/*
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700900 * Destroy ceph client
Alex Elderd23a4b32012-01-29 13:57:43 -0600901 *
Alex Elder432b8582012-01-29 13:57:44 -0600902 * Caller must hold rbd_client_list_lock.
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700903 */
904static void rbd_client_release(struct kref *kref)
905{
906 struct rbd_client *rbdc = container_of(kref, struct rbd_client, kref);
907
Alex Elder37206ee2013-02-20 17:32:08 -0600908 dout("%s: rbdc %p\n", __func__, rbdc);
Alex Eldercd9d9f52012-04-04 13:35:44 -0500909 spin_lock(&rbd_client_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700910 list_del(&rbdc->node);
Alex Eldercd9d9f52012-04-04 13:35:44 -0500911 spin_unlock(&rbd_client_list_lock);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700912
913 ceph_destroy_client(rbdc->client);
914 kfree(rbdc);
915}
916
917/*
918 * Drop reference to ceph client node. If it's not referenced anymore, release
919 * it.
920 */
Alex Elder9d3997f2012-10-25 23:34:42 -0500921static void rbd_put_client(struct rbd_client *rbdc)
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700922{
Alex Elderc53d5892012-10-25 23:34:42 -0500923 if (rbdc)
924 kref_put(&rbdc->kref, rbd_client_release);
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700925}
926
Ilya Dryomov5feb0d8d2018-02-22 13:19:04 +0100927/*
928 * Get a ceph client with specific addr and configuration, if one does
929 * not exist create it. Either way, ceph_opts is consumed by this
930 * function.
931 */
932static struct rbd_client *rbd_get_client(struct ceph_options *ceph_opts)
933{
934 struct rbd_client *rbdc;
Ilya Dryomovdd435852018-02-22 13:43:24 +0100935 int ret;
Ilya Dryomov5feb0d8d2018-02-22 13:19:04 +0100936
937 mutex_lock_nested(&client_mutex, SINGLE_DEPTH_NESTING);
938 rbdc = rbd_client_find(ceph_opts);
Ilya Dryomovdd435852018-02-22 13:43:24 +0100939 if (rbdc) {
Ilya Dryomov5feb0d8d2018-02-22 13:19:04 +0100940 ceph_destroy_options(ceph_opts);
Ilya Dryomovdd435852018-02-22 13:43:24 +0100941
942 /*
943 * Using an existing client. Make sure ->pg_pools is up to
944 * date before we look up the pool id in do_rbd_add().
945 */
Ilya Dryomov9d4a2272019-03-20 10:58:05 +0100946 ret = ceph_wait_for_latest_osdmap(rbdc->client,
947 rbdc->client->options->mount_timeout);
Ilya Dryomovdd435852018-02-22 13:43:24 +0100948 if (ret) {
949 rbd_warn(NULL, "failed to get latest osdmap: %d", ret);
950 rbd_put_client(rbdc);
951 rbdc = ERR_PTR(ret);
952 }
953 } else {
Ilya Dryomov5feb0d8d2018-02-22 13:19:04 +0100954 rbdc = rbd_client_create(ceph_opts);
Ilya Dryomovdd435852018-02-22 13:43:24 +0100955 }
Ilya Dryomov5feb0d8d2018-02-22 13:19:04 +0100956 mutex_unlock(&client_mutex);
957
958 return rbdc;
959}
960
Alex Eldera30b71b2012-07-10 20:30:11 -0500961static bool rbd_image_format_valid(u32 image_format)
962{
963 return image_format == 1 || image_format == 2;
964}
965
Alex Elder8e94af82012-07-25 09:32:40 -0500966static bool rbd_dev_ondisk_valid(struct rbd_image_header_ondisk *ondisk)
967{
Alex Elder103a1502012-08-02 11:29:45 -0500968 size_t size;
969 u32 snap_count;
970
971 /* The header has to start with the magic rbd header text */
972 if (memcmp(&ondisk->text, RBD_HEADER_TEXT, sizeof (RBD_HEADER_TEXT)))
973 return false;
974
Alex Elderdb2388b2012-10-20 22:17:27 -0500975 /* The bio layer requires at least sector-sized I/O */
976
977 if (ondisk->options.order < SECTOR_SHIFT)
978 return false;
979
980 /* If we use u64 in a few spots we may be able to loosen this */
981
982 if (ondisk->options.order > 8 * sizeof (int) - 1)
983 return false;
984
Alex Elder103a1502012-08-02 11:29:45 -0500985 /*
986 * The size of a snapshot header has to fit in a size_t, and
987 * that limits the number of snapshots.
988 */
989 snap_count = le32_to_cpu(ondisk->snap_count);
990 size = SIZE_MAX - sizeof (struct ceph_snap_context);
991 if (snap_count > size / sizeof (__le64))
992 return false;
993
994 /*
995 * Not only that, but the size of the entire the snapshot
996 * header must also be representable in a size_t.
997 */
998 size -= snap_count * sizeof (__le64);
999 if ((u64) size < le64_to_cpu(ondisk->snap_names_len))
1000 return false;
1001
1002 return true;
Alex Elder8e94af82012-07-25 09:32:40 -05001003}
1004
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001005/*
Ilya Dryomov5bc3fb12017-01-25 18:16:22 +01001006 * returns the size of an object in the image
1007 */
1008static u32 rbd_obj_bytes(struct rbd_image_header *header)
1009{
1010 return 1U << header->obj_order;
1011}
1012
Ilya Dryomov263423f2017-01-25 18:16:22 +01001013static void rbd_init_layout(struct rbd_device *rbd_dev)
1014{
1015 if (rbd_dev->header.stripe_unit == 0 ||
1016 rbd_dev->header.stripe_count == 0) {
1017 rbd_dev->header.stripe_unit = rbd_obj_bytes(&rbd_dev->header);
1018 rbd_dev->header.stripe_count = 1;
1019 }
1020
1021 rbd_dev->layout.stripe_unit = rbd_dev->header.stripe_unit;
1022 rbd_dev->layout.stripe_count = rbd_dev->header.stripe_count;
1023 rbd_dev->layout.object_size = rbd_obj_bytes(&rbd_dev->header);
Ilya Dryomov7e973322017-01-25 18:16:22 +01001024 rbd_dev->layout.pool_id = rbd_dev->header.data_pool_id == CEPH_NOPOOL ?
1025 rbd_dev->spec->pool_id : rbd_dev->header.data_pool_id;
Ilya Dryomov263423f2017-01-25 18:16:22 +01001026 RCU_INIT_POINTER(rbd_dev->layout.pool_ns, NULL);
1027}
1028
Ilya Dryomov5bc3fb12017-01-25 18:16:22 +01001029/*
Alex Elderbb23e372013-05-06 09:51:29 -05001030 * Fill an rbd image header with information from the given format 1
1031 * on-disk header.
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001032 */
Alex Elder662518b2013-05-06 09:51:29 -05001033static int rbd_header_from_disk(struct rbd_device *rbd_dev,
Alex Elder4156d992012-08-02 11:29:46 -05001034 struct rbd_image_header_ondisk *ondisk)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001035{
Alex Elder662518b2013-05-06 09:51:29 -05001036 struct rbd_image_header *header = &rbd_dev->header;
Alex Elderbb23e372013-05-06 09:51:29 -05001037 bool first_time = header->object_prefix == NULL;
1038 struct ceph_snap_context *snapc;
1039 char *object_prefix = NULL;
1040 char *snap_names = NULL;
1041 u64 *snap_sizes = NULL;
Alex Elderccece232012-07-10 20:30:10 -05001042 u32 snap_count;
Alex Elderbb23e372013-05-06 09:51:29 -05001043 int ret = -ENOMEM;
Alex Elder621901d2012-08-23 23:22:06 -05001044 u32 i;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001045
Alex Elderbb23e372013-05-06 09:51:29 -05001046 /* Allocate this now to avoid having to handle failure below */
1047
1048 if (first_time) {
Ilya Dryomov848d7962017-01-25 18:16:21 +01001049 object_prefix = kstrndup(ondisk->object_prefix,
1050 sizeof(ondisk->object_prefix),
1051 GFP_KERNEL);
Alex Elderbb23e372013-05-06 09:51:29 -05001052 if (!object_prefix)
1053 return -ENOMEM;
Alex Elderbb23e372013-05-06 09:51:29 -05001054 }
1055
1056 /* Allocate the snapshot context and fill it in */
Alex Elder6a523252012-07-19 17:12:59 -05001057
Alex Elder103a1502012-08-02 11:29:45 -05001058 snap_count = le32_to_cpu(ondisk->snap_count);
Alex Elderbb23e372013-05-06 09:51:29 -05001059 snapc = ceph_create_snap_context(snap_count, GFP_KERNEL);
1060 if (!snapc)
1061 goto out_err;
1062 snapc->seq = le64_to_cpu(ondisk->snap_seq);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001063 if (snap_count) {
Alex Elderbb23e372013-05-06 09:51:29 -05001064 struct rbd_image_snap_ondisk *snaps;
Alex Elderf785cc12012-08-23 23:22:06 -05001065 u64 snap_names_len = le64_to_cpu(ondisk->snap_names_len);
1066
Alex Elderbb23e372013-05-06 09:51:29 -05001067 /* We'll keep a copy of the snapshot names... */
Alex Elder621901d2012-08-23 23:22:06 -05001068
Alex Elderbb23e372013-05-06 09:51:29 -05001069 if (snap_names_len > (u64)SIZE_MAX)
1070 goto out_2big;
1071 snap_names = kmalloc(snap_names_len, GFP_KERNEL);
1072 if (!snap_names)
Alex Elder6a523252012-07-19 17:12:59 -05001073 goto out_err;
Alex Elderbb23e372013-05-06 09:51:29 -05001074
1075 /* ...as well as the array of their sizes. */
Markus Elfring88a25a52016-09-11 12:21:25 +02001076 snap_sizes = kmalloc_array(snap_count,
1077 sizeof(*header->snap_sizes),
1078 GFP_KERNEL);
Alex Elderbb23e372013-05-06 09:51:29 -05001079 if (!snap_sizes)
1080 goto out_err;
1081
Alex Elderf785cc12012-08-23 23:22:06 -05001082 /*
Alex Elderbb23e372013-05-06 09:51:29 -05001083 * Copy the names, and fill in each snapshot's id
1084 * and size.
1085 *
Alex Elder99a41eb2013-05-06 09:51:30 -05001086 * Note that rbd_dev_v1_header_info() guarantees the
Alex Elderbb23e372013-05-06 09:51:29 -05001087 * ondisk buffer we're working with has
Alex Elderf785cc12012-08-23 23:22:06 -05001088 * snap_names_len bytes beyond the end of the
1089 * snapshot id array, this memcpy() is safe.
1090 */
Alex Elderbb23e372013-05-06 09:51:29 -05001091 memcpy(snap_names, &ondisk->snaps[snap_count], snap_names_len);
1092 snaps = ondisk->snaps;
1093 for (i = 0; i < snap_count; i++) {
1094 snapc->snaps[i] = le64_to_cpu(snaps[i].id);
1095 snap_sizes[i] = le64_to_cpu(snaps[i].image_size);
1096 }
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001097 }
Alex Elder849b4262012-07-09 21:04:24 -05001098
Alex Elderbb23e372013-05-06 09:51:29 -05001099 /* We won't fail any more, fill in the header */
Alex Elder6a523252012-07-19 17:12:59 -05001100
Alex Elderbb23e372013-05-06 09:51:29 -05001101 if (first_time) {
1102 header->object_prefix = object_prefix;
1103 header->obj_order = ondisk->options.order;
Ilya Dryomov263423f2017-01-25 18:16:22 +01001104 rbd_init_layout(rbd_dev);
Alex Elder662518b2013-05-06 09:51:29 -05001105 } else {
1106 ceph_put_snap_context(header->snapc);
1107 kfree(header->snap_names);
1108 kfree(header->snap_sizes);
Alex Elderbb23e372013-05-06 09:51:29 -05001109 }
1110
1111 /* The remaining fields always get updated (when we refresh) */
Alex Elder621901d2012-08-23 23:22:06 -05001112
Alex Elderf84344f2012-08-31 17:29:51 -05001113 header->image_size = le64_to_cpu(ondisk->image_size);
Alex Elderbb23e372013-05-06 09:51:29 -05001114 header->snapc = snapc;
1115 header->snap_names = snap_names;
1116 header->snap_sizes = snap_sizes;
Alex Elder468521c2013-04-26 09:43:47 -05001117
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001118 return 0;
Alex Elderbb23e372013-05-06 09:51:29 -05001119out_2big:
1120 ret = -EIO;
Alex Elder6a523252012-07-19 17:12:59 -05001121out_err:
Alex Elderbb23e372013-05-06 09:51:29 -05001122 kfree(snap_sizes);
1123 kfree(snap_names);
1124 ceph_put_snap_context(snapc);
1125 kfree(object_prefix);
Alex Elderccece232012-07-10 20:30:10 -05001126
Alex Elderbb23e372013-05-06 09:51:29 -05001127 return ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001128}
1129
Alex Elder9682fc62013-04-30 00:44:33 -05001130static const char *_rbd_dev_v1_snap_name(struct rbd_device *rbd_dev, u32 which)
1131{
1132 const char *snap_name;
1133
1134 rbd_assert(which < rbd_dev->header.snapc->num_snaps);
1135
1136 /* Skip over names until we find the one we are looking for */
1137
1138 snap_name = rbd_dev->header.snap_names;
1139 while (which--)
1140 snap_name += strlen(snap_name) + 1;
1141
1142 return kstrdup(snap_name, GFP_KERNEL);
1143}
1144
Alex Elder30d1cff2013-05-01 12:43:03 -05001145/*
1146 * Snapshot id comparison function for use with qsort()/bsearch().
1147 * Note that result is for snapshots in *descending* order.
1148 */
1149static int snapid_compare_reverse(const void *s1, const void *s2)
1150{
1151 u64 snap_id1 = *(u64 *)s1;
1152 u64 snap_id2 = *(u64 *)s2;
1153
1154 if (snap_id1 < snap_id2)
1155 return 1;
1156 return snap_id1 == snap_id2 ? 0 : -1;
1157}
1158
1159/*
1160 * Search a snapshot context to see if the given snapshot id is
1161 * present.
1162 *
1163 * Returns the position of the snapshot id in the array if it's found,
1164 * or BAD_SNAP_INDEX otherwise.
1165 *
1166 * Note: The snapshot array is in kept sorted (by the osd) in
1167 * reverse order, highest snapshot id first.
1168 */
Alex Elder9682fc62013-04-30 00:44:33 -05001169static u32 rbd_dev_snap_index(struct rbd_device *rbd_dev, u64 snap_id)
1170{
1171 struct ceph_snap_context *snapc = rbd_dev->header.snapc;
Alex Elder30d1cff2013-05-01 12:43:03 -05001172 u64 *found;
Alex Elder9682fc62013-04-30 00:44:33 -05001173
Alex Elder30d1cff2013-05-01 12:43:03 -05001174 found = bsearch(&snap_id, &snapc->snaps, snapc->num_snaps,
1175 sizeof (snap_id), snapid_compare_reverse);
Alex Elder9682fc62013-04-30 00:44:33 -05001176
Alex Elder30d1cff2013-05-01 12:43:03 -05001177 return found ? (u32)(found - &snapc->snaps[0]) : BAD_SNAP_INDEX;
Alex Elder9682fc62013-04-30 00:44:33 -05001178}
1179
Alex Elder2ad3d712013-04-30 00:44:33 -05001180static const char *rbd_dev_v1_snap_name(struct rbd_device *rbd_dev,
1181 u64 snap_id)
Alex Elder54cac612013-04-30 00:44:33 -05001182{
1183 u32 which;
Josh Durginda6a6b62013-09-04 17:57:31 -07001184 const char *snap_name;
Alex Elder54cac612013-04-30 00:44:33 -05001185
1186 which = rbd_dev_snap_index(rbd_dev, snap_id);
1187 if (which == BAD_SNAP_INDEX)
Josh Durginda6a6b62013-09-04 17:57:31 -07001188 return ERR_PTR(-ENOENT);
Alex Elder54cac612013-04-30 00:44:33 -05001189
Josh Durginda6a6b62013-09-04 17:57:31 -07001190 snap_name = _rbd_dev_v1_snap_name(rbd_dev, which);
1191 return snap_name ? snap_name : ERR_PTR(-ENOMEM);
Alex Elder54cac612013-04-30 00:44:33 -05001192}
1193
Alex Elder9e15b772012-10-30 19:40:33 -05001194static const char *rbd_snap_name(struct rbd_device *rbd_dev, u64 snap_id)
1195{
Alex Elder9e15b772012-10-30 19:40:33 -05001196 if (snap_id == CEPH_NOSNAP)
1197 return RBD_SNAP_HEAD_NAME;
1198
Alex Elder54cac612013-04-30 00:44:33 -05001199 rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
1200 if (rbd_dev->image_format == 1)
1201 return rbd_dev_v1_snap_name(rbd_dev, snap_id);
Alex Elder9e15b772012-10-30 19:40:33 -05001202
Alex Elder54cac612013-04-30 00:44:33 -05001203 return rbd_dev_v2_snap_name(rbd_dev, snap_id);
Alex Elder9e15b772012-10-30 19:40:33 -05001204}
1205
Alex Elder2ad3d712013-04-30 00:44:33 -05001206static int rbd_snap_size(struct rbd_device *rbd_dev, u64 snap_id,
1207 u64 *snap_size)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001208{
Alex Elder2ad3d712013-04-30 00:44:33 -05001209 rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
1210 if (snap_id == CEPH_NOSNAP) {
1211 *snap_size = rbd_dev->header.image_size;
1212 } else if (rbd_dev->image_format == 1) {
1213 u32 which;
Alex Elder00f1f362012-02-07 12:03:36 -06001214
Alex Elder2ad3d712013-04-30 00:44:33 -05001215 which = rbd_dev_snap_index(rbd_dev, snap_id);
1216 if (which == BAD_SNAP_INDEX)
1217 return -ENOENT;
Alex Elder00f1f362012-02-07 12:03:36 -06001218
Alex Elder2ad3d712013-04-30 00:44:33 -05001219 *snap_size = rbd_dev->header.snap_sizes[which];
1220 } else {
1221 u64 size = 0;
1222 int ret;
1223
1224 ret = _rbd_dev_v2_snap_size(rbd_dev, snap_id, NULL, &size);
1225 if (ret)
1226 return ret;
1227
1228 *snap_size = size;
1229 }
1230 return 0;
1231}
1232
1233static int rbd_snap_features(struct rbd_device *rbd_dev, u64 snap_id,
1234 u64 *snap_features)
1235{
1236 rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
1237 if (snap_id == CEPH_NOSNAP) {
1238 *snap_features = rbd_dev->header.features;
1239 } else if (rbd_dev->image_format == 1) {
1240 *snap_features = 0; /* No features for format 1 */
1241 } else {
1242 u64 features = 0;
1243 int ret;
1244
1245 ret = _rbd_dev_v2_snap_features(rbd_dev, snap_id, &features);
1246 if (ret)
1247 return ret;
1248
1249 *snap_features = features;
1250 }
1251 return 0;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001252}
1253
Alex Elderd1cf5782013-04-27 09:59:30 -05001254static int rbd_dev_mapping_set(struct rbd_device *rbd_dev)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001255{
Alex Elder8f4b7d92013-05-06 07:40:30 -05001256 u64 snap_id = rbd_dev->spec->snap_id;
Alex Elder2ad3d712013-04-30 00:44:33 -05001257 u64 size = 0;
1258 u64 features = 0;
1259 int ret;
Alex Elder8b0241f2013-04-25 23:15:08 -05001260
Alex Elder2ad3d712013-04-30 00:44:33 -05001261 ret = rbd_snap_size(rbd_dev, snap_id, &size);
1262 if (ret)
1263 return ret;
1264 ret = rbd_snap_features(rbd_dev, snap_id, &features);
1265 if (ret)
1266 return ret;
1267
1268 rbd_dev->mapping.size = size;
1269 rbd_dev->mapping.features = features;
1270
Alex Elder8b0241f2013-04-25 23:15:08 -05001271 return 0;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001272}
1273
Alex Elderd1cf5782013-04-27 09:59:30 -05001274static void rbd_dev_mapping_clear(struct rbd_device *rbd_dev)
1275{
1276 rbd_dev->mapping.size = 0;
1277 rbd_dev->mapping.features = 0;
Alex Elder200a6a82013-04-28 23:32:34 -05001278}
1279
Ilya Dryomov5359a172018-01-20 10:30:10 +01001280static void zero_bvec(struct bio_vec *bv)
Alex Elder65ccfe22012-08-09 10:33:26 -07001281{
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001282 void *buf;
Ilya Dryomov5359a172018-01-20 10:30:10 +01001283 unsigned long flags;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001284
Ilya Dryomov5359a172018-01-20 10:30:10 +01001285 buf = bvec_kmap_irq(bv, &flags);
1286 memset(buf, 0, bv->bv_len);
1287 flush_dcache_page(bv->bv_page);
1288 bvec_kunmap_irq(buf, &flags);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001289}
1290
Ilya Dryomov5359a172018-01-20 10:30:10 +01001291static void zero_bios(struct ceph_bio_iter *bio_pos, u32 off, u32 bytes)
Alex Elderb9434c52013-04-19 15:34:50 -05001292{
Ilya Dryomov5359a172018-01-20 10:30:10 +01001293 struct ceph_bio_iter it = *bio_pos;
Alex Elderb9434c52013-04-19 15:34:50 -05001294
Ilya Dryomov5359a172018-01-20 10:30:10 +01001295 ceph_bio_iter_advance(&it, off);
1296 ceph_bio_iter_advance_step(&it, bytes, ({
1297 zero_bvec(&bv);
1298 }));
Alex Elderb9434c52013-04-19 15:34:50 -05001299}
1300
Ilya Dryomov7e07efb2018-01-20 10:30:11 +01001301static void zero_bvecs(struct ceph_bvec_iter *bvec_pos, u32 off, u32 bytes)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001302{
Ilya Dryomov7e07efb2018-01-20 10:30:11 +01001303 struct ceph_bvec_iter it = *bvec_pos;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001304
Ilya Dryomov7e07efb2018-01-20 10:30:11 +01001305 ceph_bvec_iter_advance(&it, off);
1306 ceph_bvec_iter_advance_step(&it, bytes, ({
1307 zero_bvec(&bv);
1308 }));
Alex Elderf7760da2012-10-20 22:17:27 -05001309}
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001310
Alex Elderf7760da2012-10-20 22:17:27 -05001311/*
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001312 * Zero a range in @obj_req data buffer defined by a bio (list) or
Ilya Dryomovafb97882018-02-06 19:26:35 +01001313 * (private) bio_vec array.
Alex Elderf7760da2012-10-20 22:17:27 -05001314 *
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001315 * @off is relative to the start of the data buffer.
Alex Elderf7760da2012-10-20 22:17:27 -05001316 */
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001317static void rbd_obj_zero_range(struct rbd_obj_request *obj_req, u32 off,
1318 u32 bytes)
Alex Elderf7760da2012-10-20 22:17:27 -05001319{
Ilya Dryomovecc633c2018-02-01 11:50:47 +01001320 switch (obj_req->img_request->data_type) {
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001321 case OBJ_REQUEST_BIO:
1322 zero_bios(&obj_req->bio_pos, off, bytes);
1323 break;
1324 case OBJ_REQUEST_BVECS:
Ilya Dryomovafb97882018-02-06 19:26:35 +01001325 case OBJ_REQUEST_OWN_BVECS:
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001326 zero_bvecs(&obj_req->bvec_pos, off, bytes);
1327 break;
1328 default:
1329 rbd_assert(0);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001330 }
Alex Elderbf0d5f502012-11-22 00:00:08 -06001331}
1332
1333static void rbd_obj_request_destroy(struct kref *kref);
1334static void rbd_obj_request_put(struct rbd_obj_request *obj_request)
1335{
1336 rbd_assert(obj_request != NULL);
Alex Elder37206ee2013-02-20 17:32:08 -06001337 dout("%s: obj %p (was %d)\n", __func__, obj_request,
Peter Zijlstra2c935bc2016-11-14 17:29:48 +01001338 kref_read(&obj_request->kref));
Alex Elderbf0d5f502012-11-22 00:00:08 -06001339 kref_put(&obj_request->kref, rbd_obj_request_destroy);
1340}
1341
Alex Elder0f2d5be2014-04-26 14:21:44 +04001342static void rbd_img_request_get(struct rbd_img_request *img_request)
1343{
1344 dout("%s: img %p (was %d)\n", __func__, img_request,
Peter Zijlstra2c935bc2016-11-14 17:29:48 +01001345 kref_read(&img_request->kref));
Alex Elder0f2d5be2014-04-26 14:21:44 +04001346 kref_get(&img_request->kref);
1347}
1348
Alex Elderbf0d5f502012-11-22 00:00:08 -06001349static void rbd_img_request_destroy(struct kref *kref);
1350static void rbd_img_request_put(struct rbd_img_request *img_request)
1351{
1352 rbd_assert(img_request != NULL);
Alex Elder37206ee2013-02-20 17:32:08 -06001353 dout("%s: img %p (was %d)\n", __func__, img_request,
Peter Zijlstra2c935bc2016-11-14 17:29:48 +01001354 kref_read(&img_request->kref));
Ilya Dryomove93aca02018-02-06 19:26:35 +01001355 kref_put(&img_request->kref, rbd_img_request_destroy);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001356}
1357
1358static inline void rbd_img_obj_request_add(struct rbd_img_request *img_request,
1359 struct rbd_obj_request *obj_request)
1360{
Alex Elder25dcf952013-01-25 17:08:55 -06001361 rbd_assert(obj_request->img_request == NULL);
1362
Alex Elderb155e862013-04-15 14:50:37 -05001363 /* Image request now owns object's original reference */
Alex Elderbf0d5f502012-11-22 00:00:08 -06001364 obj_request->img_request = img_request;
Ilya Dryomov7114eda2018-02-01 11:50:47 +01001365 img_request->pending_count++;
Ilya Dryomov15961b42018-02-01 11:50:47 +01001366 dout("%s: img %p obj %p\n", __func__, img_request, obj_request);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001367}
1368
1369static inline void rbd_img_obj_request_del(struct rbd_img_request *img_request,
1370 struct rbd_obj_request *obj_request)
1371{
Ilya Dryomov15961b42018-02-01 11:50:47 +01001372 dout("%s: img %p obj %p\n", __func__, img_request, obj_request);
Ilya Dryomov43df3d32018-02-02 15:23:22 +01001373 list_del(&obj_request->ex.oe_item);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001374 rbd_assert(obj_request->img_request == img_request);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001375 rbd_obj_request_put(obj_request);
1376}
1377
Ilya Dryomov980917f2016-09-12 18:59:42 +02001378static void rbd_obj_request_submit(struct rbd_obj_request *obj_request)
Alex Elderbf0d5f502012-11-22 00:00:08 -06001379{
Ilya Dryomov980917f2016-09-12 18:59:42 +02001380 struct ceph_osd_request *osd_req = obj_request->osd_req;
1381
Ilya Dryomova90bb0c2017-01-25 18:16:23 +01001382 dout("%s %p object_no %016llx %llu~%llu osd_req %p\n", __func__,
Ilya Dryomov43df3d32018-02-02 15:23:22 +01001383 obj_request, obj_request->ex.oe_objno, obj_request->ex.oe_off,
1384 obj_request->ex.oe_len, osd_req);
Ilya Dryomov980917f2016-09-12 18:59:42 +02001385 ceph_osdc_start_request(osd_req->r_osdc, osd_req, false);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001386}
1387
/*
 * The default/initial value for all image request flags is 0.  Each
 * is conditionally set to 1 at image request initialization time
 * and currently never changes thereafter.
 */
Alex Elderd0b2e942013-01-24 16:13:36 -06001393static void img_request_layered_set(struct rbd_img_request *img_request)
1394{
1395 set_bit(IMG_REQ_LAYERED, &img_request->flags);
1396 smp_mb();
1397}
1398
Alex Eldera2acd002013-05-08 22:50:04 -05001399static void img_request_layered_clear(struct rbd_img_request *img_request)
1400{
1401 clear_bit(IMG_REQ_LAYERED, &img_request->flags);
1402 smp_mb();
1403}
1404
Alex Elderd0b2e942013-01-24 16:13:36 -06001405static bool img_request_layered_test(struct rbd_img_request *img_request)
1406{
1407 smp_mb();
1408 return test_bit(IMG_REQ_LAYERED, &img_request->flags) != 0;
1409}
1410
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001411static bool rbd_obj_is_entire(struct rbd_obj_request *obj_req)
Josh Durgin3b434a2a2014-04-04 17:32:15 -07001412{
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001413 struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
1414
Ilya Dryomov43df3d32018-02-02 15:23:22 +01001415 return !obj_req->ex.oe_off &&
1416 obj_req->ex.oe_len == rbd_dev->layout.object_size;
Josh Durgin3b434a2a2014-04-04 17:32:15 -07001417}
1418
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001419static bool rbd_obj_is_tail(struct rbd_obj_request *obj_req)
Alex Elder6e2a4502013-03-27 09:16:30 -05001420{
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001421 struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
Alex Elderb9434c52013-04-19 15:34:50 -05001422
Ilya Dryomov43df3d32018-02-02 15:23:22 +01001423 return obj_req->ex.oe_off + obj_req->ex.oe_len ==
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001424 rbd_dev->layout.object_size;
1425}
1426
Ilya Dryomov13488d52019-02-25 12:37:50 +01001427/*
1428 * Must be called after rbd_obj_calc_img_extents().
1429 */
1430static bool rbd_obj_copyup_enabled(struct rbd_obj_request *obj_req)
1431{
1432 if (!obj_req->num_img_extents ||
Ilya Dryomov9b17eb22019-02-28 15:51:39 +01001433 (rbd_obj_is_entire(obj_req) &&
1434 !obj_req->img_request->snapc->num_snaps))
Ilya Dryomov13488d52019-02-25 12:37:50 +01001435 return false;
1436
1437 return true;
1438}
1439
Ilya Dryomov86bd7992018-02-06 19:26:33 +01001440static u64 rbd_obj_img_extents_bytes(struct rbd_obj_request *obj_req)
1441{
1442 return ceph_file_extents_bytes(obj_req->img_extents,
1443 obj_req->num_img_extents);
1444}
1445
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001446static bool rbd_img_is_write(struct rbd_img_request *img_req)
1447{
Ilya Dryomov9bb02482018-01-30 17:52:10 +01001448 switch (img_req->op_type) {
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001449 case OBJ_OP_READ:
1450 return false;
1451 case OBJ_OP_WRITE:
1452 case OBJ_OP_DISCARD:
Ilya Dryomov6484cbe2019-01-29 12:46:25 +01001453 case OBJ_OP_ZEROOUT:
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001454 return true;
1455 default:
Arnd Bergmannc6244b32018-04-04 14:53:39 +02001456 BUG();
Alex Elder6e2a4502013-03-27 09:16:30 -05001457 }
Alex Elder6e2a4502013-03-27 09:16:30 -05001458}
1459
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001460static void rbd_obj_handle_request(struct rbd_obj_request *obj_req);
Ilya Dryomov27617132015-07-16 17:36:11 +03001461
Ilya Dryomov85e084f2016-04-28 16:07:24 +02001462static void rbd_osd_req_callback(struct ceph_osd_request *osd_req)
Alex Elderbf0d5f502012-11-22 00:00:08 -06001463{
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001464 struct rbd_obj_request *obj_req = osd_req->r_priv;
Alex Elderbf0d5f502012-11-22 00:00:08 -06001465
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001466 dout("%s osd_req %p result %d for obj_req %p\n", __func__, osd_req,
1467 osd_req->r_result, obj_req);
1468 rbd_assert(osd_req == obj_req->osd_req);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001469
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001470 obj_req->result = osd_req->r_result < 0 ? osd_req->r_result : 0;
1471 if (!obj_req->result && !rbd_img_is_write(obj_req->img_request))
1472 obj_req->xferred = osd_req->r_result;
1473 else
1474 /*
1475 * Writes aren't allowed to return a data payload. In some
1476 * guarded write cases (e.g. stat + zero on an empty object)
1477 * a stat response makes it through, but we don't care.
1478 */
1479 obj_req->xferred = 0;
Alex Elderbf0d5f502012-11-22 00:00:08 -06001480
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001481 rbd_obj_handle_request(obj_req);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001482}
1483
Alex Elder9d4df012013-04-19 15:34:50 -05001484static void rbd_osd_req_format_read(struct rbd_obj_request *obj_request)
Alex Elder430c28c2013-04-03 21:32:51 -05001485{
Alex Elder8c042b02013-04-03 01:28:58 -05001486 struct ceph_osd_request *osd_req = obj_request->osd_req;
Alex Elder430c28c2013-04-03 21:32:51 -05001487
Ilya Dryomova162b302018-01-30 17:52:10 +01001488 osd_req->r_flags = CEPH_OSD_FLAG_READ;
Ilya Dryomov7c848832016-09-15 17:56:39 +02001489 osd_req->r_snapid = obj_request->img_request->snap_id;
Alex Elder9d4df012013-04-19 15:34:50 -05001490}
1491
1492static void rbd_osd_req_format_write(struct rbd_obj_request *obj_request)
1493{
Alex Elder9d4df012013-04-19 15:34:50 -05001494 struct ceph_osd_request *osd_req = obj_request->osd_req;
Alex Elder9d4df012013-04-19 15:34:50 -05001495
Ilya Dryomova162b302018-01-30 17:52:10 +01001496 osd_req->r_flags = CEPH_OSD_FLAG_WRITE;
Arnd Bergmannfac02dd2018-07-13 22:18:37 +02001497 ktime_get_real_ts64(&osd_req->r_mtime);
Ilya Dryomov43df3d32018-02-02 15:23:22 +01001498 osd_req->r_data_offset = obj_request->ex.oe_off;
Alex Elder430c28c2013-04-03 21:32:51 -05001499}
1500
Ilya Dryomovbc812072017-01-25 18:16:23 +01001501static struct ceph_osd_request *
Ilya Dryomove28eded2019-02-25 11:42:26 +01001502__rbd_osd_req_create(struct rbd_obj_request *obj_req,
1503 struct ceph_snap_context *snapc, unsigned int num_ops)
Ilya Dryomovbc812072017-01-25 18:16:23 +01001504{
Ilya Dryomove28eded2019-02-25 11:42:26 +01001505 struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
Ilya Dryomovbc812072017-01-25 18:16:23 +01001506 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
1507 struct ceph_osd_request *req;
Ilya Dryomova90bb0c2017-01-25 18:16:23 +01001508 const char *name_format = rbd_dev->image_format == 1 ?
1509 RBD_V1_DATA_FORMAT : RBD_V2_DATA_FORMAT;
Ilya Dryomovbc812072017-01-25 18:16:23 +01001510
Ilya Dryomove28eded2019-02-25 11:42:26 +01001511 req = ceph_osdc_alloc_request(osdc, snapc, num_ops, false, GFP_NOIO);
Ilya Dryomovbc812072017-01-25 18:16:23 +01001512 if (!req)
1513 return NULL;
1514
Ilya Dryomovbc812072017-01-25 18:16:23 +01001515 req->r_callback = rbd_osd_req_callback;
Ilya Dryomova162b302018-01-30 17:52:10 +01001516 req->r_priv = obj_req;
Ilya Dryomovbc812072017-01-25 18:16:23 +01001517
Ilya Dryomovb26c0472018-07-03 15:28:43 +02001518 /*
1519 * Data objects may be stored in a separate pool, but always in
1520 * the same namespace in that pool as the header in its pool.
1521 */
1522 ceph_oloc_copy(&req->r_base_oloc, &rbd_dev->header_oloc);
Ilya Dryomovbc812072017-01-25 18:16:23 +01001523 req->r_base_oloc.pool = rbd_dev->layout.pool_id;
Ilya Dryomovb26c0472018-07-03 15:28:43 +02001524
Ilya Dryomova90bb0c2017-01-25 18:16:23 +01001525 if (ceph_oid_aprintf(&req->r_base_oid, GFP_NOIO, name_format,
Ilya Dryomov43df3d32018-02-02 15:23:22 +01001526 rbd_dev->header.object_prefix, obj_req->ex.oe_objno))
Ilya Dryomovbc812072017-01-25 18:16:23 +01001527 goto err_req;
1528
Ilya Dryomovbc812072017-01-25 18:16:23 +01001529 return req;
1530
1531err_req:
1532 ceph_osdc_put_request(req);
1533 return NULL;
1534}
1535
Ilya Dryomove28eded2019-02-25 11:42:26 +01001536static struct ceph_osd_request *
1537rbd_osd_req_create(struct rbd_obj_request *obj_req, unsigned int num_ops)
1538{
1539 return __rbd_osd_req_create(obj_req, obj_req->img_request->snapc,
1540 num_ops);
1541}
1542
/*
 * Release an OSD request by dropping a reference on it via
 * ceph_osdc_put_request().
 */
static void rbd_osd_req_destroy(struct ceph_osd_request *osd_req)
{
	ceph_osdc_put_request(osd_req);
}
1547
Ilya Dryomovecc633c2018-02-01 11:50:47 +01001548static struct rbd_obj_request *rbd_obj_request_create(void)
Alex Elderbf0d5f502012-11-22 00:00:08 -06001549{
1550 struct rbd_obj_request *obj_request;
Alex Elderbf0d5f502012-11-22 00:00:08 -06001551
Ilya Dryomov5a60e872015-06-24 17:24:33 +03001552 obj_request = kmem_cache_zalloc(rbd_obj_request_cache, GFP_NOIO);
Ilya Dryomov6c696d82017-01-25 18:16:23 +01001553 if (!obj_request)
Alex Elderf907ad52013-05-01 12:43:03 -05001554 return NULL;
Alex Elderf907ad52013-05-01 12:43:03 -05001555
Ilya Dryomov43df3d32018-02-02 15:23:22 +01001556 ceph_object_extent_init(&obj_request->ex);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001557 kref_init(&obj_request->kref);
1558
Ilya Dryomov67e2b652017-01-25 18:16:22 +01001559 dout("%s %p\n", __func__, obj_request);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001560 return obj_request;
1561}
1562
1563static void rbd_obj_request_destroy(struct kref *kref)
1564{
1565 struct rbd_obj_request *obj_request;
Ilya Dryomov7e07efb2018-01-20 10:30:11 +01001566 u32 i;
Alex Elderbf0d5f502012-11-22 00:00:08 -06001567
1568 obj_request = container_of(kref, struct rbd_obj_request, kref);
1569
Alex Elder37206ee2013-02-20 17:32:08 -06001570 dout("%s: obj %p\n", __func__, obj_request);
1571
Alex Elderbf0d5f502012-11-22 00:00:08 -06001572 if (obj_request->osd_req)
1573 rbd_osd_req_destroy(obj_request->osd_req);
1574
Ilya Dryomovecc633c2018-02-01 11:50:47 +01001575 switch (obj_request->img_request->data_type) {
Alex Elder9969ebc2013-01-18 12:31:10 -06001576 case OBJ_REQUEST_NODATA:
Alex Elderbf0d5f502012-11-22 00:00:08 -06001577 case OBJ_REQUEST_BIO:
Ilya Dryomov7e07efb2018-01-20 10:30:11 +01001578 case OBJ_REQUEST_BVECS:
Ilya Dryomov5359a172018-01-20 10:30:10 +01001579 break; /* Nothing to do */
Ilya Dryomovafb97882018-02-06 19:26:35 +01001580 case OBJ_REQUEST_OWN_BVECS:
1581 kfree(obj_request->bvec_pos.bvecs);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001582 break;
Ilya Dryomov7e07efb2018-01-20 10:30:11 +01001583 default:
1584 rbd_assert(0);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001585 }
1586
Ilya Dryomov86bd7992018-02-06 19:26:33 +01001587 kfree(obj_request->img_extents);
Ilya Dryomov7e07efb2018-01-20 10:30:11 +01001588 if (obj_request->copyup_bvecs) {
1589 for (i = 0; i < obj_request->copyup_bvec_count; i++) {
1590 if (obj_request->copyup_bvecs[i].bv_page)
1591 __free_page(obj_request->copyup_bvecs[i].bv_page);
1592 }
1593 kfree(obj_request->copyup_bvecs);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001594 }
1595
Alex Elder868311b2013-05-01 12:43:03 -05001596 kmem_cache_free(rbd_obj_request_cache, obj_request);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001597}
1598
Alex Elderfb65d2282013-05-08 22:50:04 -05001599/* It's OK to call this for a device with no parent */
1600
1601static void rbd_spec_put(struct rbd_spec *spec);
1602static void rbd_dev_unparent(struct rbd_device *rbd_dev)
1603{
1604 rbd_dev_remove_parent(rbd_dev);
1605 rbd_spec_put(rbd_dev->parent_spec);
1606 rbd_dev->parent_spec = NULL;
1607 rbd_dev->parent_overlap = 0;
1608}
1609
Alex Elderbf0d5f502012-11-22 00:00:08 -06001610/*
Alex Eldera2acd002013-05-08 22:50:04 -05001611 * Parent image reference counting is used to determine when an
1612 * image's parent fields can be safely torn down--after there are no
1613 * more in-flight requests to the parent image. When the last
1614 * reference is dropped, cleaning them up is safe.
1615 */
1616static void rbd_dev_parent_put(struct rbd_device *rbd_dev)
1617{
1618 int counter;
1619
1620 if (!rbd_dev->parent_spec)
1621 return;
1622
1623 counter = atomic_dec_return_safe(&rbd_dev->parent_ref);
1624 if (counter > 0)
1625 return;
1626
1627 /* Last reference; clean up parent data structures */
1628
1629 if (!counter)
1630 rbd_dev_unparent(rbd_dev);
1631 else
Ilya Dryomov9584d502014-07-11 12:11:20 +04001632 rbd_warn(rbd_dev, "parent reference underflow");
Alex Eldera2acd002013-05-08 22:50:04 -05001633}
1634
1635/*
1636 * If an image has a non-zero parent overlap, get a reference to its
1637 * parent.
1638 *
1639 * Returns true if the rbd device has a parent with a non-zero
1640 * overlap and a reference for it was successfully taken, or
1641 * false otherwise.
1642 */
1643static bool rbd_dev_parent_get(struct rbd_device *rbd_dev)
1644{
Ilya Dryomovae43e9d2015-01-19 18:13:43 +03001645 int counter = 0;
Alex Eldera2acd002013-05-08 22:50:04 -05001646
1647 if (!rbd_dev->parent_spec)
1648 return false;
1649
Ilya Dryomovae43e9d2015-01-19 18:13:43 +03001650 down_read(&rbd_dev->header_rwsem);
1651 if (rbd_dev->parent_overlap)
1652 counter = atomic_inc_return_safe(&rbd_dev->parent_ref);
1653 up_read(&rbd_dev->header_rwsem);
Alex Eldera2acd002013-05-08 22:50:04 -05001654
1655 if (counter < 0)
Ilya Dryomov9584d502014-07-11 12:11:20 +04001656 rbd_warn(rbd_dev, "parent reference overflow");
Alex Eldera2acd002013-05-08 22:50:04 -05001657
Ilya Dryomovae43e9d2015-01-19 18:13:43 +03001658 return counter > 0;
Alex Eldera2acd002013-05-08 22:50:04 -05001659}
1660
Alex Elderbf0d5f502012-11-22 00:00:08 -06001661/*
1662 * Caller is responsible for filling in the list of object requests
1663 * that comprises the image request, and the Linux request pointer
1664 * (if there is one).
1665 */
Alex Eldercc344fa2013-02-19 12:25:56 -06001666static struct rbd_img_request *rbd_img_request_create(
1667 struct rbd_device *rbd_dev,
Guangliang Zhao6d2940c2014-03-13 11:21:35 +08001668 enum obj_operation_type op_type,
Josh Durgin4e752f02014-04-08 11:12:11 -07001669 struct ceph_snap_context *snapc)
Alex Elderbf0d5f502012-11-22 00:00:08 -06001670{
1671 struct rbd_img_request *img_request;
Alex Elderbf0d5f502012-11-22 00:00:08 -06001672
Ilya Dryomova0c58952018-01-22 16:03:06 +01001673 img_request = kmem_cache_zalloc(rbd_img_request_cache, GFP_NOIO);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001674 if (!img_request)
1675 return NULL;
1676
Alex Elderbf0d5f502012-11-22 00:00:08 -06001677 img_request->rbd_dev = rbd_dev;
Ilya Dryomov9bb02482018-01-30 17:52:10 +01001678 img_request->op_type = op_type;
Ilya Dryomov9bb02482018-01-30 17:52:10 +01001679 if (!rbd_img_is_write(img_request))
Alex Elderbf0d5f502012-11-22 00:00:08 -06001680 img_request->snap_id = rbd_dev->spec->snap_id;
Ilya Dryomov9bb02482018-01-30 17:52:10 +01001681 else
1682 img_request->snapc = snapc;
1683
Alex Eldera2acd002013-05-08 22:50:04 -05001684 if (rbd_dev_parent_get(rbd_dev))
Alex Elderd0b2e942013-01-24 16:13:36 -06001685 img_request_layered_set(img_request);
Ilya Dryomova0c58952018-01-22 16:03:06 +01001686
Alex Elderbf0d5f502012-11-22 00:00:08 -06001687 spin_lock_init(&img_request->completion_lock);
Ilya Dryomov43df3d32018-02-02 15:23:22 +01001688 INIT_LIST_HEAD(&img_request->object_extents);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001689 kref_init(&img_request->kref);
1690
Ilya Dryomovdfd98752018-02-06 19:26:35 +01001691 dout("%s: rbd_dev %p %s -> img %p\n", __func__, rbd_dev,
1692 obj_op_name(op_type), img_request);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001693 return img_request;
1694}
1695
1696static void rbd_img_request_destroy(struct kref *kref)
1697{
1698 struct rbd_img_request *img_request;
1699 struct rbd_obj_request *obj_request;
1700 struct rbd_obj_request *next_obj_request;
1701
1702 img_request = container_of(kref, struct rbd_img_request, kref);
1703
Alex Elder37206ee2013-02-20 17:32:08 -06001704 dout("%s: img %p\n", __func__, img_request);
1705
Alex Elderbf0d5f502012-11-22 00:00:08 -06001706 for_each_obj_request_safe(img_request, obj_request, next_obj_request)
1707 rbd_img_obj_request_del(img_request, obj_request);
1708
Alex Eldera2acd002013-05-08 22:50:04 -05001709 if (img_request_layered_test(img_request)) {
1710 img_request_layered_clear(img_request);
1711 rbd_dev_parent_put(img_request->rbd_dev);
1712 }
1713
Ilya Dryomov9bb02482018-01-30 17:52:10 +01001714 if (rbd_img_is_write(img_request))
Alex Elder812164f82013-04-30 00:44:32 -05001715 ceph_put_snap_context(img_request->snapc);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001716
Alex Elder1c2a9df2013-05-01 12:43:03 -05001717 kmem_cache_free(rbd_img_request_cache, img_request);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001718}
1719
Ilya Dryomov86bd7992018-02-06 19:26:33 +01001720static void prune_extents(struct ceph_file_extent *img_extents,
1721 u32 *num_img_extents, u64 overlap)
Alex Eldere93f3152013-05-08 22:50:04 -05001722{
Ilya Dryomov86bd7992018-02-06 19:26:33 +01001723 u32 cnt = *num_img_extents;
Alex Eldere93f3152013-05-08 22:50:04 -05001724
Ilya Dryomov86bd7992018-02-06 19:26:33 +01001725 /* drop extents completely beyond the overlap */
1726 while (cnt && img_extents[cnt - 1].fe_off >= overlap)
1727 cnt--;
Alex Eldere93f3152013-05-08 22:50:04 -05001728
Ilya Dryomov86bd7992018-02-06 19:26:33 +01001729 if (cnt) {
1730 struct ceph_file_extent *ex = &img_extents[cnt - 1];
Alex Eldere93f3152013-05-08 22:50:04 -05001731
Ilya Dryomov86bd7992018-02-06 19:26:33 +01001732 /* trim final overlapping extent */
1733 if (ex->fe_off + ex->fe_len > overlap)
1734 ex->fe_len = overlap - ex->fe_off;
Alex Elder12178572013-02-08 09:55:49 -06001735 }
1736
Ilya Dryomov86bd7992018-02-06 19:26:33 +01001737 *num_img_extents = cnt;
Alex Elder21692382013-04-05 01:27:12 -05001738}
1739
Alex Elderf1a47392013-04-19 15:34:50 -05001740/*
Ilya Dryomov86bd7992018-02-06 19:26:33 +01001741 * Determine the byte range(s) covered by either just the object extent
1742 * or the entire object in the parent image.
Josh Durgin3b434a2a2014-04-04 17:32:15 -07001743 */
Ilya Dryomov86bd7992018-02-06 19:26:33 +01001744static int rbd_obj_calc_img_extents(struct rbd_obj_request *obj_req,
1745 bool entire)
Josh Durgin3b434a2a2014-04-04 17:32:15 -07001746{
Ilya Dryomov86bd7992018-02-06 19:26:33 +01001747 struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
Alex Elderc5b5ef62013-02-11 12:33:24 -06001748 int ret;
1749
Ilya Dryomov86bd7992018-02-06 19:26:33 +01001750 if (!rbd_dev->parent_overlap)
1751 return 0;
1752
1753 ret = ceph_extent_to_file(&rbd_dev->layout, obj_req->ex.oe_objno,
1754 entire ? 0 : obj_req->ex.oe_off,
1755 entire ? rbd_dev->layout.object_size :
1756 obj_req->ex.oe_len,
1757 &obj_req->img_extents,
1758 &obj_req->num_img_extents);
1759 if (ret)
1760 return ret;
1761
1762 prune_extents(obj_req->img_extents, &obj_req->num_img_extents,
1763 rbd_dev->parent_overlap);
1764 return 0;
1765}
1766
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001767static void rbd_osd_req_setup_data(struct rbd_obj_request *obj_req, u32 which)
1768{
Ilya Dryomovecc633c2018-02-01 11:50:47 +01001769 switch (obj_req->img_request->data_type) {
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001770 case OBJ_REQUEST_BIO:
1771 osd_req_op_extent_osd_data_bio(obj_req->osd_req, which,
1772 &obj_req->bio_pos,
Ilya Dryomov43df3d32018-02-02 15:23:22 +01001773 obj_req->ex.oe_len);
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001774 break;
1775 case OBJ_REQUEST_BVECS:
Ilya Dryomovafb97882018-02-06 19:26:35 +01001776 case OBJ_REQUEST_OWN_BVECS:
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001777 rbd_assert(obj_req->bvec_pos.iter.bi_size ==
Ilya Dryomov43df3d32018-02-02 15:23:22 +01001778 obj_req->ex.oe_len);
Ilya Dryomovafb97882018-02-06 19:26:35 +01001779 rbd_assert(obj_req->bvec_idx == obj_req->bvec_count);
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001780 osd_req_op_extent_osd_data_bvec_pos(obj_req->osd_req, which,
1781 &obj_req->bvec_pos);
1782 break;
1783 default:
1784 rbd_assert(0);
1785 }
1786}
1787
1788static int rbd_obj_setup_read(struct rbd_obj_request *obj_req)
1789{
Ilya Dryomove28eded2019-02-25 11:42:26 +01001790 obj_req->osd_req = __rbd_osd_req_create(obj_req, NULL, 1);
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001791 if (!obj_req->osd_req)
Ilya Dryomov710214e2016-09-15 17:53:32 +02001792 return -ENOMEM;
1793
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001794 osd_req_op_extent_init(obj_req->osd_req, 0, CEPH_OSD_OP_READ,
Ilya Dryomov43df3d32018-02-02 15:23:22 +01001795 obj_req->ex.oe_off, obj_req->ex.oe_len, 0, 0);
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001796 rbd_osd_req_setup_data(obj_req, 0);
Ilya Dryomova90bb0c2017-01-25 18:16:23 +01001797
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001798 rbd_osd_req_format_read(obj_req);
1799 return 0;
1800}
1801
1802static int __rbd_obj_setup_stat(struct rbd_obj_request *obj_req,
1803 unsigned int which)
1804{
1805 struct page **pages;
Ilya Dryomov710214e2016-09-15 17:53:32 +02001806
Alex Elderc5b5ef62013-02-11 12:33:24 -06001807 /*
1808 * The response data for a STAT call consists of:
1809 * le64 length;
1810 * struct {
1811 * le32 tv_sec;
1812 * le32 tv_nsec;
1813 * } mtime;
1814 */
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001815 pages = ceph_alloc_page_vector(1, GFP_NOIO);
1816 if (IS_ERR(pages))
1817 return PTR_ERR(pages);
Alex Elderc5b5ef62013-02-11 12:33:24 -06001818
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001819 osd_req_op_init(obj_req->osd_req, which, CEPH_OSD_OP_STAT, 0);
1820 osd_req_op_raw_data_in_pages(obj_req->osd_req, which, pages,
1821 8 + sizeof(struct ceph_timespec),
1822 0, false, true);
Ilya Dryomov980917f2016-09-12 18:59:42 +02001823 return 0;
Alex Elderc5b5ef62013-02-11 12:33:24 -06001824}
1825
/*
 * Number of OSD ops needed for the write itself (not counting a guard
 * op).  @obj_req is currently not consulted.
 */
static int count_write_ops(struct rbd_obj_request *obj_req)
{
	const int num_ops = 1	/* setallochint */
			  + 1;	/* write/writefull */

	return num_ops;
}
1830
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001831static void __rbd_obj_setup_write(struct rbd_obj_request *obj_req,
1832 unsigned int which)
Alex Elderb454e362013-04-19 15:34:50 -05001833{
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001834 struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
1835 u16 opcode;
Alex Elderb454e362013-04-19 15:34:50 -05001836
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001837 osd_req_op_alloc_hint_init(obj_req->osd_req, which++,
1838 rbd_dev->layout.object_size,
1839 rbd_dev->layout.object_size);
Alex Elderb454e362013-04-19 15:34:50 -05001840
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001841 if (rbd_obj_is_entire(obj_req))
1842 opcode = CEPH_OSD_OP_WRITEFULL;
1843 else
1844 opcode = CEPH_OSD_OP_WRITE;
Ilya Dryomov70d045f2014-09-12 16:02:01 +04001845
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001846 osd_req_op_extent_init(obj_req->osd_req, which, opcode,
Ilya Dryomov43df3d32018-02-02 15:23:22 +01001847 obj_req->ex.oe_off, obj_req->ex.oe_len, 0, 0);
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001848 rbd_osd_req_setup_data(obj_req, which++);
Ilya Dryomov70d045f2014-09-12 16:02:01 +04001849
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001850 rbd_assert(which == obj_req->osd_req->r_num_ops);
1851 rbd_osd_req_format_write(obj_req);
Ilya Dryomov70d045f2014-09-12 16:02:01 +04001852}
1853
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001854static int rbd_obj_setup_write(struct rbd_obj_request *obj_req)
Ilya Dryomov70d045f2014-09-12 16:02:01 +04001855{
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001856 unsigned int num_osd_ops, which = 0;
Ilya Dryomov13488d52019-02-25 12:37:50 +01001857 bool need_guard;
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001858 int ret;
Ilya Dryomov058aa992016-09-12 14:44:45 +02001859
Ilya Dryomov86bd7992018-02-06 19:26:33 +01001860 /* reverse map the entire object onto the parent */
1861 ret = rbd_obj_calc_img_extents(obj_req, true);
1862 if (ret)
1863 return ret;
1864
Ilya Dryomov13488d52019-02-25 12:37:50 +01001865 need_guard = rbd_obj_copyup_enabled(obj_req);
1866 num_osd_ops = need_guard + count_write_ops(obj_req);
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001867
Ilya Dryomova162b302018-01-30 17:52:10 +01001868 obj_req->osd_req = rbd_osd_req_create(obj_req, num_osd_ops);
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001869 if (!obj_req->osd_req)
1870 return -ENOMEM;
1871
Ilya Dryomov13488d52019-02-25 12:37:50 +01001872 if (need_guard) {
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001873 ret = __rbd_obj_setup_stat(obj_req, which++);
1874 if (ret)
1875 return ret;
Ilya Dryomov13488d52019-02-25 12:37:50 +01001876
1877 obj_req->write_state = RBD_OBJ_WRITE_GUARD;
1878 } else {
1879 obj_req->write_state = RBD_OBJ_WRITE_FLAT;
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001880 }
1881
1882 __rbd_obj_setup_write(obj_req, which);
1883 return 0;
1884}
1885
Ilya Dryomov6484cbe2019-01-29 12:46:25 +01001886static u16 truncate_or_zero_opcode(struct rbd_obj_request *obj_req)
1887{
1888 return rbd_obj_is_tail(obj_req) ? CEPH_OSD_OP_TRUNCATE :
1889 CEPH_OSD_OP_ZERO;
1890}
1891
1892static int rbd_obj_setup_discard(struct rbd_obj_request *obj_req)
1893{
Ilya Dryomov0c93e1b2019-01-30 15:14:48 +01001894 struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
1895 u64 off = obj_req->ex.oe_off;
1896 u64 next_off = obj_req->ex.oe_off + obj_req->ex.oe_len;
Ilya Dryomov6484cbe2019-01-29 12:46:25 +01001897 int ret;
1898
Ilya Dryomov0c93e1b2019-01-30 15:14:48 +01001899 /*
1900 * Align the range to alloc_size boundary and punt on discards
1901 * that are too small to free up any space.
1902 *
1903 * alloc_size == object_size && is_tail() is a special case for
1904 * filestore with filestore_punch_hole = false, needed to allow
1905 * truncate (in addition to delete).
1906 */
1907 if (rbd_dev->opts->alloc_size != rbd_dev->layout.object_size ||
1908 !rbd_obj_is_tail(obj_req)) {
1909 off = round_up(off, rbd_dev->opts->alloc_size);
1910 next_off = round_down(next_off, rbd_dev->opts->alloc_size);
1911 if (off >= next_off)
1912 return 1;
1913 }
1914
Ilya Dryomov6484cbe2019-01-29 12:46:25 +01001915 /* reverse map the entire object onto the parent */
1916 ret = rbd_obj_calc_img_extents(obj_req, true);
1917 if (ret)
1918 return ret;
1919
1920 obj_req->osd_req = rbd_osd_req_create(obj_req, 1);
1921 if (!obj_req->osd_req)
1922 return -ENOMEM;
1923
1924 if (rbd_obj_is_entire(obj_req) && !obj_req->num_img_extents) {
1925 osd_req_op_init(obj_req->osd_req, 0, CEPH_OSD_OP_DELETE, 0);
1926 } else {
Ilya Dryomov0c93e1b2019-01-30 15:14:48 +01001927 dout("%s %p %llu~%llu -> %llu~%llu\n", __func__,
1928 obj_req, obj_req->ex.oe_off, obj_req->ex.oe_len,
1929 off, next_off - off);
Ilya Dryomov6484cbe2019-01-29 12:46:25 +01001930 osd_req_op_extent_init(obj_req->osd_req, 0,
1931 truncate_or_zero_opcode(obj_req),
Ilya Dryomov0c93e1b2019-01-30 15:14:48 +01001932 off, next_off - off, 0, 0);
Ilya Dryomov6484cbe2019-01-29 12:46:25 +01001933 }
1934
1935 obj_req->write_state = RBD_OBJ_WRITE_FLAT;
1936 rbd_osd_req_format_write(obj_req);
1937 return 0;
1938}
1939
Ilya Dryomov13488d52019-02-25 12:37:50 +01001940static int count_zeroout_ops(struct rbd_obj_request *obj_req)
1941{
1942 int num_osd_ops;
1943
Ilya Dryomov9b17eb22019-02-28 15:51:39 +01001944 if (rbd_obj_is_entire(obj_req) && obj_req->num_img_extents &&
1945 !rbd_obj_copyup_enabled(obj_req))
Ilya Dryomov13488d52019-02-25 12:37:50 +01001946 num_osd_ops = 2; /* create + truncate */
1947 else
1948 num_osd_ops = 1; /* delete/truncate/zero */
1949
1950 return num_osd_ops;
1951}
1952
Ilya Dryomov6484cbe2019-01-29 12:46:25 +01001953static void __rbd_obj_setup_zeroout(struct rbd_obj_request *obj_req,
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001954 unsigned int which)
1955{
1956 u16 opcode;
1957
1958 if (rbd_obj_is_entire(obj_req)) {
Ilya Dryomov86bd7992018-02-06 19:26:33 +01001959 if (obj_req->num_img_extents) {
Ilya Dryomov9b17eb22019-02-28 15:51:39 +01001960 if (!rbd_obj_copyup_enabled(obj_req))
1961 osd_req_op_init(obj_req->osd_req, which++,
1962 CEPH_OSD_OP_CREATE, 0);
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001963 opcode = CEPH_OSD_OP_TRUNCATE;
1964 } else {
1965 osd_req_op_init(obj_req->osd_req, which++,
1966 CEPH_OSD_OP_DELETE, 0);
1967 opcode = 0;
1968 }
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001969 } else {
Ilya Dryomov6484cbe2019-01-29 12:46:25 +01001970 opcode = truncate_or_zero_opcode(obj_req);
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001971 }
1972
1973 if (opcode)
1974 osd_req_op_extent_init(obj_req->osd_req, which++, opcode,
Ilya Dryomov43df3d32018-02-02 15:23:22 +01001975 obj_req->ex.oe_off, obj_req->ex.oe_len,
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001976 0, 0);
1977
1978 rbd_assert(which == obj_req->osd_req->r_num_ops);
1979 rbd_osd_req_format_write(obj_req);
1980}
1981
Ilya Dryomov6484cbe2019-01-29 12:46:25 +01001982static int rbd_obj_setup_zeroout(struct rbd_obj_request *obj_req)
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001983{
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001984 unsigned int num_osd_ops, which = 0;
Ilya Dryomov13488d52019-02-25 12:37:50 +01001985 bool need_guard;
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001986 int ret;
1987
Ilya Dryomov86bd7992018-02-06 19:26:33 +01001988 /* reverse map the entire object onto the parent */
1989 ret = rbd_obj_calc_img_extents(obj_req, true);
1990 if (ret)
1991 return ret;
1992
Ilya Dryomov13488d52019-02-25 12:37:50 +01001993 need_guard = rbd_obj_copyup_enabled(obj_req);
1994 num_osd_ops = need_guard + count_zeroout_ops(obj_req);
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001995
Ilya Dryomova162b302018-01-30 17:52:10 +01001996 obj_req->osd_req = rbd_osd_req_create(obj_req, num_osd_ops);
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001997 if (!obj_req->osd_req)
1998 return -ENOMEM;
1999
Ilya Dryomov13488d52019-02-25 12:37:50 +01002000 if (need_guard) {
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002001 ret = __rbd_obj_setup_stat(obj_req, which++);
2002 if (ret)
2003 return ret;
Ilya Dryomov13488d52019-02-25 12:37:50 +01002004
2005 obj_req->write_state = RBD_OBJ_WRITE_GUARD;
2006 } else {
2007 obj_req->write_state = RBD_OBJ_WRITE_FLAT;
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002008 }
2009
Ilya Dryomov6484cbe2019-01-29 12:46:25 +01002010 __rbd_obj_setup_zeroout(obj_req, which);
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002011 return 0;
2012}
2013
2014/*
2015 * For each object request in @img_req, allocate an OSD request, add
2016 * individual OSD ops and prepare them for submission. The number of
2017 * OSD ops depends on op_type and the overlap point (if any).
2018 */
2019static int __rbd_img_fill_request(struct rbd_img_request *img_req)
2020{
Ilya Dryomov0c93e1b2019-01-30 15:14:48 +01002021 struct rbd_obj_request *obj_req, *next_obj_req;
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002022 int ret;
2023
Ilya Dryomov0c93e1b2019-01-30 15:14:48 +01002024 for_each_obj_request_safe(img_req, obj_req, next_obj_req) {
Ilya Dryomov9bb02482018-01-30 17:52:10 +01002025 switch (img_req->op_type) {
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002026 case OBJ_OP_READ:
2027 ret = rbd_obj_setup_read(obj_req);
2028 break;
2029 case OBJ_OP_WRITE:
2030 ret = rbd_obj_setup_write(obj_req);
2031 break;
2032 case OBJ_OP_DISCARD:
2033 ret = rbd_obj_setup_discard(obj_req);
2034 break;
Ilya Dryomov6484cbe2019-01-29 12:46:25 +01002035 case OBJ_OP_ZEROOUT:
2036 ret = rbd_obj_setup_zeroout(obj_req);
2037 break;
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002038 default:
2039 rbd_assert(0);
2040 }
Ilya Dryomov0c93e1b2019-01-30 15:14:48 +01002041 if (ret < 0)
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002042 return ret;
Ilya Dryomov0c93e1b2019-01-30 15:14:48 +01002043 if (ret > 0) {
2044 img_req->xferred += obj_req->ex.oe_len;
2045 img_req->pending_count--;
2046 rbd_img_obj_request_del(img_req, obj_req);
2047 continue;
2048 }
Ilya Dryomov26f887e2018-10-15 16:11:37 +02002049
2050 ret = ceph_osdc_alloc_messages(obj_req->osd_req, GFP_NOIO);
2051 if (ret)
2052 return ret;
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002053 }
2054
2055 return 0;
2056}
2057
Ilya Dryomov5a237812018-02-06 19:26:34 +01002058union rbd_img_fill_iter {
2059 struct ceph_bio_iter bio_iter;
2060 struct ceph_bvec_iter bvec_iter;
2061};
2062
2063struct rbd_img_fill_ctx {
2064 enum obj_request_type pos_type;
2065 union rbd_img_fill_iter *pos;
2066 union rbd_img_fill_iter iter;
2067 ceph_object_extent_fn_t set_pos_fn;
Ilya Dryomovafb97882018-02-06 19:26:35 +01002068 ceph_object_extent_fn_t count_fn;
2069 ceph_object_extent_fn_t copy_fn;
Ilya Dryomov5a237812018-02-06 19:26:34 +01002070};
2071
2072static struct ceph_object_extent *alloc_object_extent(void *arg)
2073{
2074 struct rbd_img_request *img_req = arg;
2075 struct rbd_obj_request *obj_req;
2076
2077 obj_req = rbd_obj_request_create();
2078 if (!obj_req)
2079 return NULL;
2080
2081 rbd_img_obj_request_add(img_req, obj_req);
2082 return &obj_req->ex;
2083}
2084
2085/*
Ilya Dryomovafb97882018-02-06 19:26:35 +01002086 * While su != os && sc == 1 is technically not fancy (it's the same
2087 * layout as su == os && sc == 1), we can't use the nocopy path for it
2088 * because ->set_pos_fn() should be called only once per object.
2089 * ceph_file_to_extents() invokes action_fn once per stripe unit, so
2090 * treat su != os && sc == 1 as fancy.
Ilya Dryomov5a237812018-02-06 19:26:34 +01002091 */
Ilya Dryomovafb97882018-02-06 19:26:35 +01002092static bool rbd_layout_is_fancy(struct ceph_file_layout *l)
2093{
2094 return l->stripe_unit != l->object_size;
2095}
2096
2097static int rbd_img_fill_request_nocopy(struct rbd_img_request *img_req,
2098 struct ceph_file_extent *img_extents,
2099 u32 num_img_extents,
2100 struct rbd_img_fill_ctx *fctx)
Ilya Dryomov5a237812018-02-06 19:26:34 +01002101{
2102 u32 i;
2103 int ret;
2104
2105 img_req->data_type = fctx->pos_type;
2106
2107 /*
2108 * Create object requests and set each object request's starting
2109 * position in the provided bio (list) or bio_vec array.
2110 */
2111 fctx->iter = *fctx->pos;
2112 for (i = 0; i < num_img_extents; i++) {
2113 ret = ceph_file_to_extents(&img_req->rbd_dev->layout,
2114 img_extents[i].fe_off,
2115 img_extents[i].fe_len,
2116 &img_req->object_extents,
2117 alloc_object_extent, img_req,
2118 fctx->set_pos_fn, &fctx->iter);
2119 if (ret)
2120 return ret;
2121 }
2122
2123 return __rbd_img_fill_request(img_req);
2124}
2125
Ilya Dryomovafb97882018-02-06 19:26:35 +01002126/*
2127 * Map a list of image extents to a list of object extents, create the
2128 * corresponding object requests (normally each to a different object,
2129 * but not always) and add them to @img_req. For each object request,
2130 * set up its data descriptor to point to the corresponding chunk(s) of
2131 * @fctx->pos data buffer.
2132 *
2133 * Because ceph_file_to_extents() will merge adjacent object extents
2134 * together, each object request's data descriptor may point to multiple
2135 * different chunks of @fctx->pos data buffer.
2136 *
2137 * @fctx->pos data buffer is assumed to be large enough.
2138 */
2139static int rbd_img_fill_request(struct rbd_img_request *img_req,
2140 struct ceph_file_extent *img_extents,
2141 u32 num_img_extents,
2142 struct rbd_img_fill_ctx *fctx)
2143{
2144 struct rbd_device *rbd_dev = img_req->rbd_dev;
2145 struct rbd_obj_request *obj_req;
2146 u32 i;
2147 int ret;
2148
2149 if (fctx->pos_type == OBJ_REQUEST_NODATA ||
2150 !rbd_layout_is_fancy(&rbd_dev->layout))
2151 return rbd_img_fill_request_nocopy(img_req, img_extents,
2152 num_img_extents, fctx);
2153
2154 img_req->data_type = OBJ_REQUEST_OWN_BVECS;
2155
2156 /*
2157 * Create object requests and determine ->bvec_count for each object
2158 * request. Note that ->bvec_count sum over all object requests may
2159 * be greater than the number of bio_vecs in the provided bio (list)
2160 * or bio_vec array because when mapped, those bio_vecs can straddle
2161 * stripe unit boundaries.
2162 */
2163 fctx->iter = *fctx->pos;
2164 for (i = 0; i < num_img_extents; i++) {
2165 ret = ceph_file_to_extents(&rbd_dev->layout,
2166 img_extents[i].fe_off,
2167 img_extents[i].fe_len,
2168 &img_req->object_extents,
2169 alloc_object_extent, img_req,
2170 fctx->count_fn, &fctx->iter);
2171 if (ret)
2172 return ret;
2173 }
2174
2175 for_each_obj_request(img_req, obj_req) {
2176 obj_req->bvec_pos.bvecs = kmalloc_array(obj_req->bvec_count,
2177 sizeof(*obj_req->bvec_pos.bvecs),
2178 GFP_NOIO);
2179 if (!obj_req->bvec_pos.bvecs)
2180 return -ENOMEM;
Alex Elderb454e362013-04-19 15:34:50 -05002181 }
2182
2183 /*
Ilya Dryomovafb97882018-02-06 19:26:35 +01002184 * Fill in each object request's private bio_vec array, splitting and
2185 * rearranging the provided bio_vecs in stripe unit chunks as needed.
Alex Elderb454e362013-04-19 15:34:50 -05002186 */
Ilya Dryomovafb97882018-02-06 19:26:35 +01002187 fctx->iter = *fctx->pos;
2188 for (i = 0; i < num_img_extents; i++) {
2189 ret = ceph_iterate_extents(&rbd_dev->layout,
2190 img_extents[i].fe_off,
2191 img_extents[i].fe_len,
2192 &img_req->object_extents,
2193 fctx->copy_fn, &fctx->iter);
2194 if (ret)
2195 return ret;
2196 }
Alex Elder3d7efd12013-04-19 15:34:50 -05002197
Ilya Dryomovafb97882018-02-06 19:26:35 +01002198 return __rbd_img_fill_request(img_req);
Alex Elderb454e362013-04-19 15:34:50 -05002199}
2200
Ilya Dryomov5a237812018-02-06 19:26:34 +01002201static int rbd_img_fill_nodata(struct rbd_img_request *img_req,
2202 u64 off, u64 len)
2203{
2204 struct ceph_file_extent ex = { off, len };
2205 union rbd_img_fill_iter dummy;
2206 struct rbd_img_fill_ctx fctx = {
2207 .pos_type = OBJ_REQUEST_NODATA,
2208 .pos = &dummy,
2209 };
2210
2211 return rbd_img_fill_request(img_req, &ex, 1, &fctx);
2212}
2213
2214static void set_bio_pos(struct ceph_object_extent *ex, u32 bytes, void *arg)
2215{
2216 struct rbd_obj_request *obj_req =
2217 container_of(ex, struct rbd_obj_request, ex);
2218 struct ceph_bio_iter *it = arg;
2219
2220 dout("%s objno %llu bytes %u\n", __func__, ex->oe_objno, bytes);
2221 obj_req->bio_pos = *it;
2222 ceph_bio_iter_advance(it, bytes);
2223}
2224
Ilya Dryomovafb97882018-02-06 19:26:35 +01002225static void count_bio_bvecs(struct ceph_object_extent *ex, u32 bytes, void *arg)
2226{
2227 struct rbd_obj_request *obj_req =
2228 container_of(ex, struct rbd_obj_request, ex);
2229 struct ceph_bio_iter *it = arg;
2230
2231 dout("%s objno %llu bytes %u\n", __func__, ex->oe_objno, bytes);
2232 ceph_bio_iter_advance_step(it, bytes, ({
2233 obj_req->bvec_count++;
2234 }));
2235
2236}
2237
2238static void copy_bio_bvecs(struct ceph_object_extent *ex, u32 bytes, void *arg)
2239{
2240 struct rbd_obj_request *obj_req =
2241 container_of(ex, struct rbd_obj_request, ex);
2242 struct ceph_bio_iter *it = arg;
2243
2244 dout("%s objno %llu bytes %u\n", __func__, ex->oe_objno, bytes);
2245 ceph_bio_iter_advance_step(it, bytes, ({
2246 obj_req->bvec_pos.bvecs[obj_req->bvec_idx++] = bv;
2247 obj_req->bvec_pos.iter.bi_size += bv.bv_len;
2248 }));
2249}
2250
Ilya Dryomov5a237812018-02-06 19:26:34 +01002251static int __rbd_img_fill_from_bio(struct rbd_img_request *img_req,
2252 struct ceph_file_extent *img_extents,
2253 u32 num_img_extents,
2254 struct ceph_bio_iter *bio_pos)
2255{
2256 struct rbd_img_fill_ctx fctx = {
2257 .pos_type = OBJ_REQUEST_BIO,
2258 .pos = (union rbd_img_fill_iter *)bio_pos,
2259 .set_pos_fn = set_bio_pos,
Ilya Dryomovafb97882018-02-06 19:26:35 +01002260 .count_fn = count_bio_bvecs,
2261 .copy_fn = copy_bio_bvecs,
Ilya Dryomov5a237812018-02-06 19:26:34 +01002262 };
2263
2264 return rbd_img_fill_request(img_req, img_extents, num_img_extents,
2265 &fctx);
2266}
2267
2268static int rbd_img_fill_from_bio(struct rbd_img_request *img_req,
2269 u64 off, u64 len, struct bio *bio)
2270{
2271 struct ceph_file_extent ex = { off, len };
2272 struct ceph_bio_iter it = { .bio = bio, .iter = bio->bi_iter };
2273
2274 return __rbd_img_fill_from_bio(img_req, &ex, 1, &it);
2275}
2276
2277static void set_bvec_pos(struct ceph_object_extent *ex, u32 bytes, void *arg)
2278{
2279 struct rbd_obj_request *obj_req =
2280 container_of(ex, struct rbd_obj_request, ex);
2281 struct ceph_bvec_iter *it = arg;
2282
2283 obj_req->bvec_pos = *it;
2284 ceph_bvec_iter_shorten(&obj_req->bvec_pos, bytes);
2285 ceph_bvec_iter_advance(it, bytes);
2286}
2287
Ilya Dryomovafb97882018-02-06 19:26:35 +01002288static void count_bvecs(struct ceph_object_extent *ex, u32 bytes, void *arg)
2289{
2290 struct rbd_obj_request *obj_req =
2291 container_of(ex, struct rbd_obj_request, ex);
2292 struct ceph_bvec_iter *it = arg;
2293
2294 ceph_bvec_iter_advance_step(it, bytes, ({
2295 obj_req->bvec_count++;
2296 }));
2297}
2298
2299static void copy_bvecs(struct ceph_object_extent *ex, u32 bytes, void *arg)
2300{
2301 struct rbd_obj_request *obj_req =
2302 container_of(ex, struct rbd_obj_request, ex);
2303 struct ceph_bvec_iter *it = arg;
2304
2305 ceph_bvec_iter_advance_step(it, bytes, ({
2306 obj_req->bvec_pos.bvecs[obj_req->bvec_idx++] = bv;
2307 obj_req->bvec_pos.iter.bi_size += bv.bv_len;
2308 }));
2309}
2310
Ilya Dryomov5a237812018-02-06 19:26:34 +01002311static int __rbd_img_fill_from_bvecs(struct rbd_img_request *img_req,
2312 struct ceph_file_extent *img_extents,
2313 u32 num_img_extents,
2314 struct ceph_bvec_iter *bvec_pos)
2315{
2316 struct rbd_img_fill_ctx fctx = {
2317 .pos_type = OBJ_REQUEST_BVECS,
2318 .pos = (union rbd_img_fill_iter *)bvec_pos,
2319 .set_pos_fn = set_bvec_pos,
Ilya Dryomovafb97882018-02-06 19:26:35 +01002320 .count_fn = count_bvecs,
2321 .copy_fn = copy_bvecs,
Ilya Dryomov5a237812018-02-06 19:26:34 +01002322 };
2323
2324 return rbd_img_fill_request(img_req, img_extents, num_img_extents,
2325 &fctx);
2326}
2327
2328static int rbd_img_fill_from_bvecs(struct rbd_img_request *img_req,
2329 struct ceph_file_extent *img_extents,
2330 u32 num_img_extents,
2331 struct bio_vec *bvecs)
2332{
2333 struct ceph_bvec_iter it = {
2334 .bvecs = bvecs,
2335 .iter = { .bi_size = ceph_file_extents_bytes(img_extents,
2336 num_img_extents) },
2337 };
2338
2339 return __rbd_img_fill_from_bvecs(img_req, img_extents, num_img_extents,
2340 &it);
2341}
2342
Ilya Dryomovefbd1a12018-01-30 17:52:11 +01002343static void rbd_img_request_submit(struct rbd_img_request *img_request)
Alex Elderbf0d5f502012-11-22 00:00:08 -06002344{
Alex Elderbf0d5f502012-11-22 00:00:08 -06002345 struct rbd_obj_request *obj_request;
2346
Alex Elder37206ee2013-02-20 17:32:08 -06002347 dout("%s: img %p\n", __func__, img_request);
Alex Elderbf0d5f502012-11-22 00:00:08 -06002348
Ilya Dryomov663ae2c2016-05-16 13:18:57 +02002349 rbd_img_request_get(img_request);
Ilya Dryomovefbd1a12018-01-30 17:52:11 +01002350 for_each_obj_request(img_request, obj_request)
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002351 rbd_obj_request_submit(obj_request);
Alex Elderbf0d5f502012-11-22 00:00:08 -06002352
Ilya Dryomov663ae2c2016-05-16 13:18:57 +02002353 rbd_img_request_put(img_request);
Alex Elderbf0d5f502012-11-22 00:00:08 -06002354}
2355
Ilya Dryomov86bd7992018-02-06 19:26:33 +01002356static int rbd_obj_read_from_parent(struct rbd_obj_request *obj_req)
Alex Elder8b3e1a52013-01-24 16:13:36 -06002357{
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002358 struct rbd_img_request *img_req = obj_req->img_request;
2359 struct rbd_img_request *child_img_req;
2360 int ret;
Alex Elder8b3e1a52013-01-24 16:13:36 -06002361
Ilya Dryomove93aca02018-02-06 19:26:35 +01002362 child_img_req = rbd_img_request_create(img_req->rbd_dev->parent,
2363 OBJ_OP_READ, NULL);
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002364 if (!child_img_req)
2365 return -ENOMEM;
Alex Elder8b3e1a52013-01-24 16:13:36 -06002366
Ilya Dryomove93aca02018-02-06 19:26:35 +01002367 __set_bit(IMG_REQ_CHILD, &child_img_req->flags);
2368 child_img_req->obj_request = obj_req;
Alex Elder02c74fb2013-05-06 17:40:33 -05002369
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002370 if (!rbd_img_is_write(img_req)) {
Ilya Dryomovecc633c2018-02-01 11:50:47 +01002371 switch (img_req->data_type) {
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002372 case OBJ_REQUEST_BIO:
Ilya Dryomov5a237812018-02-06 19:26:34 +01002373 ret = __rbd_img_fill_from_bio(child_img_req,
2374 obj_req->img_extents,
2375 obj_req->num_img_extents,
2376 &obj_req->bio_pos);
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002377 break;
2378 case OBJ_REQUEST_BVECS:
Ilya Dryomovafb97882018-02-06 19:26:35 +01002379 case OBJ_REQUEST_OWN_BVECS:
Ilya Dryomov5a237812018-02-06 19:26:34 +01002380 ret = __rbd_img_fill_from_bvecs(child_img_req,
2381 obj_req->img_extents,
2382 obj_req->num_img_extents,
2383 &obj_req->bvec_pos);
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002384 break;
2385 default:
Arnd Bergmannd342a152019-03-22 15:36:37 +01002386 BUG();
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002387 }
2388 } else {
Ilya Dryomov5a237812018-02-06 19:26:34 +01002389 ret = rbd_img_fill_from_bvecs(child_img_req,
2390 obj_req->img_extents,
2391 obj_req->num_img_extents,
2392 obj_req->copyup_bvecs);
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002393 }
2394 if (ret) {
2395 rbd_img_request_put(child_img_req);
2396 return ret;
2397 }
2398
2399 rbd_img_request_submit(child_img_req);
2400 return 0;
2401}
2402
2403static bool rbd_obj_handle_read(struct rbd_obj_request *obj_req)
2404{
2405 struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
2406 int ret;
2407
2408 if (obj_req->result == -ENOENT &&
Ilya Dryomov86bd7992018-02-06 19:26:33 +01002409 rbd_dev->parent_overlap && !obj_req->tried_parent) {
2410 /* reverse map this object extent onto the parent */
2411 ret = rbd_obj_calc_img_extents(obj_req, false);
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002412 if (ret) {
2413 obj_req->result = ret;
2414 return true;
2415 }
Ilya Dryomov86bd7992018-02-06 19:26:33 +01002416
2417 if (obj_req->num_img_extents) {
2418 obj_req->tried_parent = true;
2419 ret = rbd_obj_read_from_parent(obj_req);
2420 if (ret) {
2421 obj_req->result = ret;
2422 return true;
2423 }
2424 return false;
2425 }
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002426 }
Alex Elder02c74fb2013-05-06 17:40:33 -05002427
2428 /*
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002429 * -ENOENT means a hole in the image -- zero-fill the entire
2430 * length of the request. A short read also implies zero-fill
2431 * to the end of the request. In both cases we update xferred
2432 * count to indicate the whole request was satisfied.
Alex Elder02c74fb2013-05-06 17:40:33 -05002433 */
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002434 if (obj_req->result == -ENOENT ||
Ilya Dryomov43df3d32018-02-02 15:23:22 +01002435 (!obj_req->result && obj_req->xferred < obj_req->ex.oe_len)) {
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002436 rbd_assert(!obj_req->xferred || !obj_req->result);
2437 rbd_obj_zero_range(obj_req, obj_req->xferred,
Ilya Dryomov43df3d32018-02-02 15:23:22 +01002438 obj_req->ex.oe_len - obj_req->xferred);
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002439 obj_req->result = 0;
Ilya Dryomov43df3d32018-02-02 15:23:22 +01002440 obj_req->xferred = obj_req->ex.oe_len;
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002441 }
2442
2443 return true;
2444}
2445
2446/*
2447 * copyup_bvecs pages are never highmem pages
2448 */
2449static bool is_zero_bvecs(struct bio_vec *bvecs, u32 bytes)
2450{
2451 struct ceph_bvec_iter it = {
2452 .bvecs = bvecs,
2453 .iter = { .bi_size = bytes },
2454 };
2455
2456 ceph_bvec_iter_advance_step(&it, bytes, ({
2457 if (memchr_inv(page_address(bv.bv_page) + bv.bv_offset, 0,
2458 bv.bv_len))
2459 return false;
2460 }));
2461 return true;
2462}
2463
Ilya Dryomov3a482502019-02-28 10:49:12 +01002464#define MODS_ONLY U32_MAX
2465
Ilya Dryomov89a59c12019-02-28 14:20:28 +01002466static int rbd_obj_issue_copyup_empty_snapc(struct rbd_obj_request *obj_req,
2467 u32 bytes)
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002468{
Chengguang Xufe943d52018-04-12 12:04:55 +08002469 int ret;
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002470
2471 dout("%s obj_req %p bytes %u\n", __func__, obj_req, bytes);
2472 rbd_assert(obj_req->osd_req->r_ops[0].op == CEPH_OSD_OP_STAT);
Ilya Dryomov89a59c12019-02-28 14:20:28 +01002473 rbd_assert(bytes > 0 && bytes != MODS_ONLY);
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002474 rbd_osd_req_destroy(obj_req->osd_req);
2475
Ilya Dryomov89a59c12019-02-28 14:20:28 +01002476 obj_req->osd_req = __rbd_osd_req_create(obj_req, &rbd_empty_snapc, 1);
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002477 if (!obj_req->osd_req)
2478 return -ENOMEM;
2479
Ilya Dryomov24639ce562018-09-26 19:12:07 +02002480 ret = osd_req_op_cls_init(obj_req->osd_req, 0, "rbd", "copyup");
Chengguang Xufe943d52018-04-12 12:04:55 +08002481 if (ret)
2482 return ret;
2483
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002484 osd_req_op_cls_request_data_bvecs(obj_req->osd_req, 0,
Ilya Dryomov0010f702018-05-04 16:57:30 +02002485 obj_req->copyup_bvecs,
2486 obj_req->copyup_bvec_count,
2487 bytes);
Ilya Dryomov89a59c12019-02-28 14:20:28 +01002488 rbd_osd_req_format_write(obj_req);
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002489
Ilya Dryomov89a59c12019-02-28 14:20:28 +01002490 ret = ceph_osdc_alloc_messages(obj_req->osd_req, GFP_NOIO);
2491 if (ret)
2492 return ret;
2493
2494 rbd_obj_request_submit(obj_req);
2495 return 0;
2496}
2497
Ilya Dryomov3a482502019-02-28 10:49:12 +01002498static int rbd_obj_issue_copyup_ops(struct rbd_obj_request *obj_req, u32 bytes)
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002499{
Ilya Dryomov13488d52019-02-25 12:37:50 +01002500 struct rbd_img_request *img_req = obj_req->img_request;
Ilya Dryomov3a482502019-02-28 10:49:12 +01002501 unsigned int num_osd_ops = (bytes != MODS_ONLY);
2502 unsigned int which = 0;
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002503 int ret;
2504
2505 dout("%s obj_req %p bytes %u\n", __func__, obj_req, bytes);
Ilya Dryomov89a59c12019-02-28 14:20:28 +01002506 rbd_assert(obj_req->osd_req->r_ops[0].op == CEPH_OSD_OP_STAT ||
2507 obj_req->osd_req->r_ops[0].op == CEPH_OSD_OP_CALL);
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002508 rbd_osd_req_destroy(obj_req->osd_req);
2509
Ilya Dryomov13488d52019-02-25 12:37:50 +01002510 switch (img_req->op_type) {
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002511 case OBJ_OP_WRITE:
Ilya Dryomov13488d52019-02-25 12:37:50 +01002512 num_osd_ops += count_write_ops(obj_req);
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002513 break;
Ilya Dryomov13488d52019-02-25 12:37:50 +01002514 case OBJ_OP_ZEROOUT:
2515 num_osd_ops += count_zeroout_ops(obj_req);
2516 break;
2517 default:
2518 rbd_assert(0);
2519 }
2520
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002521 obj_req->osd_req = rbd_osd_req_create(obj_req, num_osd_ops);
2522 if (!obj_req->osd_req)
2523 return -ENOMEM;
2524
Ilya Dryomov3a482502019-02-28 10:49:12 +01002525 if (bytes != MODS_ONLY) {
2526 ret = osd_req_op_cls_init(obj_req->osd_req, which, "rbd",
2527 "copyup");
2528 if (ret)
2529 return ret;
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002530
Ilya Dryomov3a482502019-02-28 10:49:12 +01002531 osd_req_op_cls_request_data_bvecs(obj_req->osd_req, which++,
2532 obj_req->copyup_bvecs,
2533 obj_req->copyup_bvec_count,
2534 bytes);
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002535 }
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002536
Ilya Dryomov13488d52019-02-25 12:37:50 +01002537 switch (img_req->op_type) {
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002538 case OBJ_OP_WRITE:
Ilya Dryomov3a482502019-02-28 10:49:12 +01002539 __rbd_obj_setup_write(obj_req, which);
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002540 break;
Ilya Dryomov6484cbe2019-01-29 12:46:25 +01002541 case OBJ_OP_ZEROOUT:
Ilya Dryomov3a482502019-02-28 10:49:12 +01002542 __rbd_obj_setup_zeroout(obj_req, which);
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002543 break;
2544 default:
2545 rbd_assert(0);
2546 }
2547
Ilya Dryomov26f887e2018-10-15 16:11:37 +02002548 ret = ceph_osdc_alloc_messages(obj_req->osd_req, GFP_NOIO);
2549 if (ret)
2550 return ret;
2551
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002552 rbd_obj_request_submit(obj_req);
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002553 return 0;
2554}
2555
Ilya Dryomov3a482502019-02-28 10:49:12 +01002556static int rbd_obj_issue_copyup(struct rbd_obj_request *obj_req, u32 bytes)
2557{
2558 /*
2559 * Only send non-zero copyup data to save some I/O and network
2560 * bandwidth -- zero copyup data is equivalent to the object not
2561 * existing.
2562 */
2563 if (is_zero_bvecs(obj_req->copyup_bvecs, bytes)) {
2564 dout("%s obj_req %p detected zeroes\n", __func__, obj_req);
2565 bytes = 0;
2566 }
2567
Ilya Dryomov89a59c12019-02-28 14:20:28 +01002568 if (obj_req->img_request->snapc->num_snaps && bytes > 0) {
2569 /*
2570 * Send a copyup request with an empty snapshot context to
2571 * deep-copyup the object through all existing snapshots.
2572 * A second request with the current snapshot context will be
2573 * sent for the actual modification.
2574 */
2575 obj_req->write_state = RBD_OBJ_WRITE_COPYUP_EMPTY_SNAPC;
2576 return rbd_obj_issue_copyup_empty_snapc(obj_req, bytes);
2577 }
2578
Ilya Dryomov3a482502019-02-28 10:49:12 +01002579 obj_req->write_state = RBD_OBJ_WRITE_COPYUP_OPS;
2580 return rbd_obj_issue_copyup_ops(obj_req, bytes);
2581}
2582
Ilya Dryomov7e07efb2018-01-20 10:30:11 +01002583static int setup_copyup_bvecs(struct rbd_obj_request *obj_req, u64 obj_overlap)
2584{
2585 u32 i;
2586
2587 rbd_assert(!obj_req->copyup_bvecs);
2588 obj_req->copyup_bvec_count = calc_pages_for(0, obj_overlap);
2589 obj_req->copyup_bvecs = kcalloc(obj_req->copyup_bvec_count,
2590 sizeof(*obj_req->copyup_bvecs),
2591 GFP_NOIO);
2592 if (!obj_req->copyup_bvecs)
2593 return -ENOMEM;
2594
2595 for (i = 0; i < obj_req->copyup_bvec_count; i++) {
2596 unsigned int len = min(obj_overlap, (u64)PAGE_SIZE);
2597
2598 obj_req->copyup_bvecs[i].bv_page = alloc_page(GFP_NOIO);
2599 if (!obj_req->copyup_bvecs[i].bv_page)
2600 return -ENOMEM;
2601
2602 obj_req->copyup_bvecs[i].bv_offset = 0;
2603 obj_req->copyup_bvecs[i].bv_len = len;
2604 obj_overlap -= len;
2605 }
2606
2607 rbd_assert(!obj_overlap);
2608 return 0;
2609}
2610
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002611static int rbd_obj_handle_write_guard(struct rbd_obj_request *obj_req)
2612{
2613 struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002614 int ret;
2615
Ilya Dryomov86bd7992018-02-06 19:26:33 +01002616 rbd_assert(obj_req->num_img_extents);
2617 prune_extents(obj_req->img_extents, &obj_req->num_img_extents,
2618 rbd_dev->parent_overlap);
2619 if (!obj_req->num_img_extents) {
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002620 /*
2621 * The overlap has become 0 (most likely because the
Ilya Dryomov3a482502019-02-28 10:49:12 +01002622 * image has been flattened). Re-submit the original write
2623 * request -- pass MODS_ONLY since the copyup isn't needed
2624 * anymore.
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002625 */
Ilya Dryomov3a482502019-02-28 10:49:12 +01002626 obj_req->write_state = RBD_OBJ_WRITE_COPYUP_OPS;
2627 return rbd_obj_issue_copyup_ops(obj_req, MODS_ONLY);
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002628 }
2629
Ilya Dryomov86bd7992018-02-06 19:26:33 +01002630 ret = setup_copyup_bvecs(obj_req, rbd_obj_img_extents_bytes(obj_req));
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002631 if (ret)
2632 return ret;
2633
Ilya Dryomov3a482502019-02-28 10:49:12 +01002634 obj_req->write_state = RBD_OBJ_WRITE_READ_FROM_PARENT;
Ilya Dryomov86bd7992018-02-06 19:26:33 +01002635 return rbd_obj_read_from_parent(obj_req);
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002636}
2637
2638static bool rbd_obj_handle_write(struct rbd_obj_request *obj_req)
2639{
2640 int ret;
2641
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002642 switch (obj_req->write_state) {
2643 case RBD_OBJ_WRITE_GUARD:
2644 rbd_assert(!obj_req->xferred);
2645 if (obj_req->result == -ENOENT) {
2646 /*
2647 * The target object doesn't exist. Read the data for
2648 * the entire target object up to the overlap point (if
2649 * any) from the parent, so we can use it for a copyup.
2650 */
2651 ret = rbd_obj_handle_write_guard(obj_req);
2652 if (ret) {
2653 obj_req->result = ret;
2654 return true;
2655 }
2656 return false;
2657 }
2658 /* fall through */
2659 case RBD_OBJ_WRITE_FLAT:
Ilya Dryomov3a482502019-02-28 10:49:12 +01002660 case RBD_OBJ_WRITE_COPYUP_OPS:
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002661 if (!obj_req->result)
2662 /*
2663 * There is no such thing as a successful short
2664 * write -- indicate the whole request was satisfied.
2665 */
Ilya Dryomov43df3d32018-02-02 15:23:22 +01002666 obj_req->xferred = obj_req->ex.oe_len;
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002667 return true;
Ilya Dryomov3a482502019-02-28 10:49:12 +01002668 case RBD_OBJ_WRITE_READ_FROM_PARENT:
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002669 if (obj_req->result)
Ilya Dryomov3a482502019-02-28 10:49:12 +01002670 return true;
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002671
2672 rbd_assert(obj_req->xferred);
2673 ret = rbd_obj_issue_copyup(obj_req, obj_req->xferred);
2674 if (ret) {
2675 obj_req->result = ret;
Ilya Dryomov356889c2019-03-01 12:06:24 +01002676 obj_req->xferred = 0;
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002677 return true;
2678 }
2679 return false;
Ilya Dryomov89a59c12019-02-28 14:20:28 +01002680 case RBD_OBJ_WRITE_COPYUP_EMPTY_SNAPC:
2681 if (obj_req->result)
2682 return true;
2683
2684 obj_req->write_state = RBD_OBJ_WRITE_COPYUP_OPS;
2685 ret = rbd_obj_issue_copyup_ops(obj_req, MODS_ONLY);
2686 if (ret) {
2687 obj_req->result = ret;
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002688 return true;
2689 }
2690 return false;
2691 default:
Arnd Bergmannc6244b32018-04-04 14:53:39 +02002692 BUG();
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002693 }
2694}
2695
2696/*
2697 * Returns true if @obj_req is completed, or false otherwise.
2698 */
2699static bool __rbd_obj_handle_request(struct rbd_obj_request *obj_req)
2700{
Ilya Dryomov9bb02482018-01-30 17:52:10 +01002701 switch (obj_req->img_request->op_type) {
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002702 case OBJ_OP_READ:
2703 return rbd_obj_handle_read(obj_req);
2704 case OBJ_OP_WRITE:
2705 return rbd_obj_handle_write(obj_req);
2706 case OBJ_OP_DISCARD:
Ilya Dryomov6484cbe2019-01-29 12:46:25 +01002707 case OBJ_OP_ZEROOUT:
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002708 if (rbd_obj_handle_write(obj_req)) {
2709 /*
2710 * Hide -ENOENT from delete/truncate/zero -- discarding
2711 * a non-existent object is not a problem.
2712 */
2713 if (obj_req->result == -ENOENT) {
2714 obj_req->result = 0;
Ilya Dryomov43df3d32018-02-02 15:23:22 +01002715 obj_req->xferred = obj_req->ex.oe_len;
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002716 }
2717 return true;
2718 }
2719 return false;
2720 default:
Arnd Bergmannc6244b32018-04-04 14:53:39 +02002721 BUG();
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002722 }
2723}
2724
Ilya Dryomov7114eda2018-02-01 11:50:47 +01002725static void rbd_obj_end_request(struct rbd_obj_request *obj_req)
2726{
2727 struct rbd_img_request *img_req = obj_req->img_request;
2728
2729 rbd_assert((!obj_req->result &&
Ilya Dryomov43df3d32018-02-02 15:23:22 +01002730 obj_req->xferred == obj_req->ex.oe_len) ||
Ilya Dryomov7114eda2018-02-01 11:50:47 +01002731 (obj_req->result < 0 && !obj_req->xferred));
2732 if (!obj_req->result) {
2733 img_req->xferred += obj_req->xferred;
Ilya Dryomov980917f2016-09-12 18:59:42 +02002734 return;
Alex Elder02c74fb2013-05-06 17:40:33 -05002735 }
2736
Ilya Dryomov7114eda2018-02-01 11:50:47 +01002737 rbd_warn(img_req->rbd_dev,
2738 "%s at objno %llu %llu~%llu result %d xferred %llu",
Ilya Dryomov43df3d32018-02-02 15:23:22 +01002739 obj_op_name(img_req->op_type), obj_req->ex.oe_objno,
2740 obj_req->ex.oe_off, obj_req->ex.oe_len, obj_req->result,
Ilya Dryomov7114eda2018-02-01 11:50:47 +01002741 obj_req->xferred);
2742 if (!img_req->result) {
2743 img_req->result = obj_req->result;
2744 img_req->xferred = 0;
Alex Eldera9e8ba2c2013-04-21 00:32:07 -05002745 }
Alex Elder8b3e1a52013-01-24 16:13:36 -06002746}
2747
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002748static void rbd_img_end_child_request(struct rbd_img_request *img_req)
Alex Elder8b3e1a52013-01-24 16:13:36 -06002749{
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002750 struct rbd_obj_request *obj_req = img_req->obj_request;
Alex Elder8b3e1a52013-01-24 16:13:36 -06002751
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002752 rbd_assert(test_bit(IMG_REQ_CHILD, &img_req->flags));
Ilya Dryomov86bd7992018-02-06 19:26:33 +01002753 rbd_assert((!img_req->result &&
2754 img_req->xferred == rbd_obj_img_extents_bytes(obj_req)) ||
2755 (img_req->result < 0 && !img_req->xferred));
Alex Elder8b3e1a52013-01-24 16:13:36 -06002756
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002757 obj_req->result = img_req->result;
2758 obj_req->xferred = img_req->xferred;
2759 rbd_img_request_put(img_req);
Ilya Dryomov7114eda2018-02-01 11:50:47 +01002760}
Alex Elder8b3e1a52013-01-24 16:13:36 -06002761
Ilya Dryomov7114eda2018-02-01 11:50:47 +01002762static void rbd_img_end_request(struct rbd_img_request *img_req)
2763{
2764 rbd_assert(!test_bit(IMG_REQ_CHILD, &img_req->flags));
2765 rbd_assert((!img_req->result &&
2766 img_req->xferred == blk_rq_bytes(img_req->rq)) ||
2767 (img_req->result < 0 && !img_req->xferred));
Alex Elder8b3e1a52013-01-24 16:13:36 -06002768
Ilya Dryomov7114eda2018-02-01 11:50:47 +01002769 blk_mq_end_request(img_req->rq,
2770 errno_to_blk_status(img_req->result));
2771 rbd_img_request_put(img_req);
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002772}
Alex Elder8b3e1a52013-01-24 16:13:36 -06002773
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002774static void rbd_obj_handle_request(struct rbd_obj_request *obj_req)
2775{
Ilya Dryomov7114eda2018-02-01 11:50:47 +01002776 struct rbd_img_request *img_req;
2777
2778again:
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002779 if (!__rbd_obj_handle_request(obj_req))
2780 return;
2781
Ilya Dryomov7114eda2018-02-01 11:50:47 +01002782 img_req = obj_req->img_request;
2783 spin_lock(&img_req->completion_lock);
2784 rbd_obj_end_request(obj_req);
2785 rbd_assert(img_req->pending_count);
2786 if (--img_req->pending_count) {
2787 spin_unlock(&img_req->completion_lock);
2788 return;
2789 }
2790
2791 spin_unlock(&img_req->completion_lock);
2792 if (test_bit(IMG_REQ_CHILD, &img_req->flags)) {
2793 obj_req = img_req->obj_request;
2794 rbd_img_end_child_request(img_req);
2795 goto again;
2796 }
2797 rbd_img_end_request(img_req);
Alex Elder8b3e1a52013-01-24 16:13:36 -06002798}
2799
Ilya Dryomoved95b212016-08-12 16:40:02 +02002800static const struct rbd_client_id rbd_empty_cid;
2801
2802static bool rbd_cid_equal(const struct rbd_client_id *lhs,
2803 const struct rbd_client_id *rhs)
2804{
2805 return lhs->gid == rhs->gid && lhs->handle == rhs->handle;
2806}
2807
2808static struct rbd_client_id rbd_get_cid(struct rbd_device *rbd_dev)
2809{
2810 struct rbd_client_id cid;
2811
2812 mutex_lock(&rbd_dev->watch_mutex);
2813 cid.gid = ceph_client_gid(rbd_dev->rbd_client->client);
2814 cid.handle = rbd_dev->watch_cookie;
2815 mutex_unlock(&rbd_dev->watch_mutex);
2816 return cid;
2817}
2818
2819/*
2820 * lock_rwsem must be held for write
2821 */
2822static void rbd_set_owner_cid(struct rbd_device *rbd_dev,
2823 const struct rbd_client_id *cid)
2824{
2825 dout("%s rbd_dev %p %llu-%llu -> %llu-%llu\n", __func__, rbd_dev,
2826 rbd_dev->owner_cid.gid, rbd_dev->owner_cid.handle,
2827 cid->gid, cid->handle);
2828 rbd_dev->owner_cid = *cid; /* struct */
2829}
2830
2831static void format_lock_cookie(struct rbd_device *rbd_dev, char *buf)
2832{
2833 mutex_lock(&rbd_dev->watch_mutex);
2834 sprintf(buf, "%s %llu", RBD_LOCK_COOKIE_PREFIX, rbd_dev->watch_cookie);
2835 mutex_unlock(&rbd_dev->watch_mutex);
2836}
2837
Florian Margaineedd8ca82017-12-13 16:43:59 +01002838static void __rbd_lock(struct rbd_device *rbd_dev, const char *cookie)
2839{
2840 struct rbd_client_id cid = rbd_get_cid(rbd_dev);
2841
2842 strcpy(rbd_dev->lock_cookie, cookie);
2843 rbd_set_owner_cid(rbd_dev, &cid);
2844 queue_work(rbd_dev->task_wq, &rbd_dev->acquired_lock_work);
2845}
2846
Ilya Dryomoved95b212016-08-12 16:40:02 +02002847/*
2848 * lock_rwsem must be held for write
2849 */
2850static int rbd_lock(struct rbd_device *rbd_dev)
2851{
2852 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
Ilya Dryomoved95b212016-08-12 16:40:02 +02002853 char cookie[32];
2854 int ret;
2855
Ilya Dryomovcbbfb0f2017-04-13 12:17:38 +02002856 WARN_ON(__rbd_is_lock_owner(rbd_dev) ||
2857 rbd_dev->lock_cookie[0] != '\0');
Ilya Dryomoved95b212016-08-12 16:40:02 +02002858
2859 format_lock_cookie(rbd_dev, cookie);
2860 ret = ceph_cls_lock(osdc, &rbd_dev->header_oid, &rbd_dev->header_oloc,
2861 RBD_LOCK_NAME, CEPH_CLS_LOCK_EXCLUSIVE, cookie,
2862 RBD_LOCK_TAG, "", 0);
2863 if (ret)
2864 return ret;
2865
2866 rbd_dev->lock_state = RBD_LOCK_STATE_LOCKED;
Florian Margaineedd8ca82017-12-13 16:43:59 +01002867 __rbd_lock(rbd_dev, cookie);
Ilya Dryomoved95b212016-08-12 16:40:02 +02002868 return 0;
2869}
2870
2871/*
2872 * lock_rwsem must be held for write
2873 */
Ilya Dryomovbbead742017-04-13 12:17:38 +02002874static void rbd_unlock(struct rbd_device *rbd_dev)
Ilya Dryomoved95b212016-08-12 16:40:02 +02002875{
2876 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
Ilya Dryomoved95b212016-08-12 16:40:02 +02002877 int ret;
2878
Ilya Dryomovcbbfb0f2017-04-13 12:17:38 +02002879 WARN_ON(!__rbd_is_lock_owner(rbd_dev) ||
2880 rbd_dev->lock_cookie[0] == '\0');
Ilya Dryomoved95b212016-08-12 16:40:02 +02002881
Ilya Dryomoved95b212016-08-12 16:40:02 +02002882 ret = ceph_cls_unlock(osdc, &rbd_dev->header_oid, &rbd_dev->header_oloc,
Ilya Dryomovcbbfb0f2017-04-13 12:17:38 +02002883 RBD_LOCK_NAME, rbd_dev->lock_cookie);
Ilya Dryomovbbead742017-04-13 12:17:38 +02002884 if (ret && ret != -ENOENT)
2885 rbd_warn(rbd_dev, "failed to unlock: %d", ret);
Ilya Dryomoved95b212016-08-12 16:40:02 +02002886
Ilya Dryomovbbead742017-04-13 12:17:38 +02002887 /* treat errors as the image is unlocked */
2888 rbd_dev->lock_state = RBD_LOCK_STATE_UNLOCKED;
Ilya Dryomovcbbfb0f2017-04-13 12:17:38 +02002889 rbd_dev->lock_cookie[0] = '\0';
Ilya Dryomoved95b212016-08-12 16:40:02 +02002890 rbd_set_owner_cid(rbd_dev, &rbd_empty_cid);
2891 queue_work(rbd_dev->task_wq, &rbd_dev->released_lock_work);
Ilya Dryomoved95b212016-08-12 16:40:02 +02002892}
2893
2894static int __rbd_notify_op_lock(struct rbd_device *rbd_dev,
2895 enum rbd_notify_op notify_op,
2896 struct page ***preply_pages,
2897 size_t *preply_len)
2898{
2899 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
2900 struct rbd_client_id cid = rbd_get_cid(rbd_dev);
Kyle Spiers08a79102018-03-17 09:44:01 -07002901 char buf[4 + 8 + 8 + CEPH_ENCODING_START_BLK_LEN];
2902 int buf_size = sizeof(buf);
Ilya Dryomoved95b212016-08-12 16:40:02 +02002903 void *p = buf;
2904
2905 dout("%s rbd_dev %p notify_op %d\n", __func__, rbd_dev, notify_op);
2906
2907 /* encode *LockPayload NotifyMessage (op + ClientId) */
2908 ceph_start_encoding(&p, 2, 1, buf_size - CEPH_ENCODING_START_BLK_LEN);
2909 ceph_encode_32(&p, notify_op);
2910 ceph_encode_64(&p, cid.gid);
2911 ceph_encode_64(&p, cid.handle);
2912
2913 return ceph_osdc_notify(osdc, &rbd_dev->header_oid,
2914 &rbd_dev->header_oloc, buf, buf_size,
2915 RBD_NOTIFY_TIMEOUT, preply_pages, preply_len);
2916}
2917
2918static void rbd_notify_op_lock(struct rbd_device *rbd_dev,
2919 enum rbd_notify_op notify_op)
2920{
2921 struct page **reply_pages;
2922 size_t reply_len;
2923
2924 __rbd_notify_op_lock(rbd_dev, notify_op, &reply_pages, &reply_len);
2925 ceph_release_page_vector(reply_pages, calc_pages_for(0, reply_len));
2926}
2927
2928static void rbd_notify_acquired_lock(struct work_struct *work)
2929{
2930 struct rbd_device *rbd_dev = container_of(work, struct rbd_device,
2931 acquired_lock_work);
2932
2933 rbd_notify_op_lock(rbd_dev, RBD_NOTIFY_OP_ACQUIRED_LOCK);
2934}
2935
2936static void rbd_notify_released_lock(struct work_struct *work)
2937{
2938 struct rbd_device *rbd_dev = container_of(work, struct rbd_device,
2939 released_lock_work);
2940
2941 rbd_notify_op_lock(rbd_dev, RBD_NOTIFY_OP_RELEASED_LOCK);
2942}
2943
2944static int rbd_request_lock(struct rbd_device *rbd_dev)
2945{
2946 struct page **reply_pages;
2947 size_t reply_len;
2948 bool lock_owner_responded = false;
2949 int ret;
2950
2951 dout("%s rbd_dev %p\n", __func__, rbd_dev);
2952
2953 ret = __rbd_notify_op_lock(rbd_dev, RBD_NOTIFY_OP_REQUEST_LOCK,
2954 &reply_pages, &reply_len);
2955 if (ret && ret != -ETIMEDOUT) {
2956 rbd_warn(rbd_dev, "failed to request lock: %d", ret);
2957 goto out;
2958 }
2959
2960 if (reply_len > 0 && reply_len <= PAGE_SIZE) {
2961 void *p = page_address(reply_pages[0]);
2962 void *const end = p + reply_len;
2963 u32 n;
2964
2965 ceph_decode_32_safe(&p, end, n, e_inval); /* num_acks */
2966 while (n--) {
2967 u8 struct_v;
2968 u32 len;
2969
2970 ceph_decode_need(&p, end, 8 + 8, e_inval);
2971 p += 8 + 8; /* skip gid and cookie */
2972
2973 ceph_decode_32_safe(&p, end, len, e_inval);
2974 if (!len)
2975 continue;
2976
2977 if (lock_owner_responded) {
2978 rbd_warn(rbd_dev,
2979 "duplicate lock owners detected");
2980 ret = -EIO;
2981 goto out;
2982 }
2983
2984 lock_owner_responded = true;
2985 ret = ceph_start_decoding(&p, end, 1, "ResponseMessage",
2986 &struct_v, &len);
2987 if (ret) {
2988 rbd_warn(rbd_dev,
2989 "failed to decode ResponseMessage: %d",
2990 ret);
2991 goto e_inval;
2992 }
2993
2994 ret = ceph_decode_32(&p);
2995 }
2996 }
2997
2998 if (!lock_owner_responded) {
2999 rbd_warn(rbd_dev, "no lock owners detected");
3000 ret = -ETIMEDOUT;
3001 }
3002
3003out:
3004 ceph_release_page_vector(reply_pages, calc_pages_for(0, reply_len));
3005 return ret;
3006
3007e_inval:
3008 ret = -EINVAL;
3009 goto out;
3010}
3011
3012static void wake_requests(struct rbd_device *rbd_dev, bool wake_all)
3013{
3014 dout("%s rbd_dev %p wake_all %d\n", __func__, rbd_dev, wake_all);
3015
3016 cancel_delayed_work(&rbd_dev->lock_dwork);
3017 if (wake_all)
3018 wake_up_all(&rbd_dev->lock_waitq);
3019 else
3020 wake_up(&rbd_dev->lock_waitq);
3021}
3022
3023static int get_lock_owner_info(struct rbd_device *rbd_dev,
3024 struct ceph_locker **lockers, u32 *num_lockers)
3025{
3026 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
3027 u8 lock_type;
3028 char *lock_tag;
3029 int ret;
3030
3031 dout("%s rbd_dev %p\n", __func__, rbd_dev);
3032
3033 ret = ceph_cls_lock_info(osdc, &rbd_dev->header_oid,
3034 &rbd_dev->header_oloc, RBD_LOCK_NAME,
3035 &lock_type, &lock_tag, lockers, num_lockers);
3036 if (ret)
3037 return ret;
3038
3039 if (*num_lockers == 0) {
3040 dout("%s rbd_dev %p no lockers detected\n", __func__, rbd_dev);
3041 goto out;
3042 }
3043
3044 if (strcmp(lock_tag, RBD_LOCK_TAG)) {
3045 rbd_warn(rbd_dev, "locked by external mechanism, tag %s",
3046 lock_tag);
3047 ret = -EBUSY;
3048 goto out;
3049 }
3050
3051 if (lock_type == CEPH_CLS_LOCK_SHARED) {
3052 rbd_warn(rbd_dev, "shared lock type detected");
3053 ret = -EBUSY;
3054 goto out;
3055 }
3056
3057 if (strncmp((*lockers)[0].id.cookie, RBD_LOCK_COOKIE_PREFIX,
3058 strlen(RBD_LOCK_COOKIE_PREFIX))) {
3059 rbd_warn(rbd_dev, "locked by external mechanism, cookie %s",
3060 (*lockers)[0].id.cookie);
3061 ret = -EBUSY;
3062 goto out;
3063 }
3064
3065out:
3066 kfree(lock_tag);
3067 return ret;
3068}
3069
3070static int find_watcher(struct rbd_device *rbd_dev,
3071 const struct ceph_locker *locker)
3072{
3073 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
3074 struct ceph_watch_item *watchers;
3075 u32 num_watchers;
3076 u64 cookie;
3077 int i;
3078 int ret;
3079
3080 ret = ceph_osdc_list_watchers(osdc, &rbd_dev->header_oid,
3081 &rbd_dev->header_oloc, &watchers,
3082 &num_watchers);
3083 if (ret)
3084 return ret;
3085
3086 sscanf(locker->id.cookie, RBD_LOCK_COOKIE_PREFIX " %llu", &cookie);
3087 for (i = 0; i < num_watchers; i++) {
3088 if (!memcmp(&watchers[i].addr, &locker->info.addr,
3089 sizeof(locker->info.addr)) &&
3090 watchers[i].cookie == cookie) {
3091 struct rbd_client_id cid = {
3092 .gid = le64_to_cpu(watchers[i].name.num),
3093 .handle = cookie,
3094 };
3095
3096 dout("%s rbd_dev %p found cid %llu-%llu\n", __func__,
3097 rbd_dev, cid.gid, cid.handle);
3098 rbd_set_owner_cid(rbd_dev, &cid);
3099 ret = 1;
3100 goto out;
3101 }
3102 }
3103
3104 dout("%s rbd_dev %p no watchers\n", __func__, rbd_dev);
3105 ret = 0;
3106out:
3107 kfree(watchers);
3108 return ret;
3109}
3110
3111/*
3112 * lock_rwsem must be held for write
3113 */
3114static int rbd_try_lock(struct rbd_device *rbd_dev)
3115{
3116 struct ceph_client *client = rbd_dev->rbd_client->client;
3117 struct ceph_locker *lockers;
3118 u32 num_lockers;
3119 int ret;
3120
3121 for (;;) {
3122 ret = rbd_lock(rbd_dev);
3123 if (ret != -EBUSY)
3124 return ret;
3125
3126 /* determine if the current lock holder is still alive */
3127 ret = get_lock_owner_info(rbd_dev, &lockers, &num_lockers);
3128 if (ret)
3129 return ret;
3130
3131 if (num_lockers == 0)
3132 goto again;
3133
3134 ret = find_watcher(rbd_dev, lockers);
3135 if (ret) {
3136 if (ret > 0)
3137 ret = 0; /* have to request lock */
3138 goto out;
3139 }
3140
3141 rbd_warn(rbd_dev, "%s%llu seems dead, breaking lock",
3142 ENTITY_NAME(lockers[0].id.name));
3143
3144 ret = ceph_monc_blacklist_add(&client->monc,
3145 &lockers[0].info.addr);
3146 if (ret) {
3147 rbd_warn(rbd_dev, "blacklist of %s%llu failed: %d",
3148 ENTITY_NAME(lockers[0].id.name), ret);
3149 goto out;
3150 }
3151
3152 ret = ceph_cls_break_lock(&client->osdc, &rbd_dev->header_oid,
3153 &rbd_dev->header_oloc, RBD_LOCK_NAME,
3154 lockers[0].id.cookie,
3155 &lockers[0].id.name);
3156 if (ret && ret != -ENOENT)
3157 goto out;
3158
3159again:
3160 ceph_free_lockers(lockers, num_lockers);
3161 }
3162
3163out:
3164 ceph_free_lockers(lockers, num_lockers);
3165 return ret;
3166}
3167
3168/*
3169 * ret is set only if lock_state is RBD_LOCK_STATE_UNLOCKED
3170 */
3171static enum rbd_lock_state rbd_try_acquire_lock(struct rbd_device *rbd_dev,
3172 int *pret)
3173{
3174 enum rbd_lock_state lock_state;
3175
3176 down_read(&rbd_dev->lock_rwsem);
3177 dout("%s rbd_dev %p read lock_state %d\n", __func__, rbd_dev,
3178 rbd_dev->lock_state);
3179 if (__rbd_is_lock_owner(rbd_dev)) {
3180 lock_state = rbd_dev->lock_state;
3181 up_read(&rbd_dev->lock_rwsem);
3182 return lock_state;
3183 }
3184
3185 up_read(&rbd_dev->lock_rwsem);
3186 down_write(&rbd_dev->lock_rwsem);
3187 dout("%s rbd_dev %p write lock_state %d\n", __func__, rbd_dev,
3188 rbd_dev->lock_state);
3189 if (!__rbd_is_lock_owner(rbd_dev)) {
3190 *pret = rbd_try_lock(rbd_dev);
3191 if (*pret)
3192 rbd_warn(rbd_dev, "failed to acquire lock: %d", *pret);
3193 }
3194
3195 lock_state = rbd_dev->lock_state;
3196 up_write(&rbd_dev->lock_rwsem);
3197 return lock_state;
3198}
3199
3200static void rbd_acquire_lock(struct work_struct *work)
3201{
3202 struct rbd_device *rbd_dev = container_of(to_delayed_work(work),
3203 struct rbd_device, lock_dwork);
3204 enum rbd_lock_state lock_state;
Kefeng Wang37f13252017-07-13 15:46:35 +08003205 int ret = 0;
Ilya Dryomoved95b212016-08-12 16:40:02 +02003206
3207 dout("%s rbd_dev %p\n", __func__, rbd_dev);
3208again:
3209 lock_state = rbd_try_acquire_lock(rbd_dev, &ret);
3210 if (lock_state != RBD_LOCK_STATE_UNLOCKED || ret == -EBLACKLISTED) {
3211 if (lock_state == RBD_LOCK_STATE_LOCKED)
3212 wake_requests(rbd_dev, true);
3213 dout("%s rbd_dev %p lock_state %d ret %d - done\n", __func__,
3214 rbd_dev, lock_state, ret);
3215 return;
3216 }
3217
3218 ret = rbd_request_lock(rbd_dev);
3219 if (ret == -ETIMEDOUT) {
3220 goto again; /* treat this as a dead client */
Ilya Dryomove010dd02017-04-13 12:17:39 +02003221 } else if (ret == -EROFS) {
3222 rbd_warn(rbd_dev, "peer will not release lock");
3223 /*
3224 * If this is rbd_add_acquire_lock(), we want to fail
3225 * immediately -- reuse BLACKLISTED flag. Otherwise we
3226 * want to block.
3227 */
3228 if (!(rbd_dev->disk->flags & GENHD_FL_UP)) {
3229 set_bit(RBD_DEV_FLAG_BLACKLISTED, &rbd_dev->flags);
3230 /* wake "rbd map --exclusive" process */
3231 wake_requests(rbd_dev, false);
3232 }
Ilya Dryomoved95b212016-08-12 16:40:02 +02003233 } else if (ret < 0) {
3234 rbd_warn(rbd_dev, "error requesting lock: %d", ret);
3235 mod_delayed_work(rbd_dev->task_wq, &rbd_dev->lock_dwork,
3236 RBD_RETRY_DELAY);
3237 } else {
3238 /*
3239 * lock owner acked, but resend if we don't see them
3240 * release the lock
3241 */
3242 dout("%s rbd_dev %p requeueing lock_dwork\n", __func__,
3243 rbd_dev);
3244 mod_delayed_work(rbd_dev->task_wq, &rbd_dev->lock_dwork,
3245 msecs_to_jiffies(2 * RBD_NOTIFY_TIMEOUT * MSEC_PER_SEC));
3246 }
3247}
3248
3249/*
3250 * lock_rwsem must be held for write
3251 */
3252static bool rbd_release_lock(struct rbd_device *rbd_dev)
3253{
3254 dout("%s rbd_dev %p read lock_state %d\n", __func__, rbd_dev,
3255 rbd_dev->lock_state);
3256 if (rbd_dev->lock_state != RBD_LOCK_STATE_LOCKED)
3257 return false;
3258
3259 rbd_dev->lock_state = RBD_LOCK_STATE_RELEASING;
3260 downgrade_write(&rbd_dev->lock_rwsem);
3261 /*
3262 * Ensure that all in-flight IO is flushed.
3263 *
3264 * FIXME: ceph_osdc_sync() flushes the entire OSD client, which
3265 * may be shared with other devices.
3266 */
3267 ceph_osdc_sync(&rbd_dev->rbd_client->client->osdc);
3268 up_read(&rbd_dev->lock_rwsem);
3269
3270 down_write(&rbd_dev->lock_rwsem);
3271 dout("%s rbd_dev %p write lock_state %d\n", __func__, rbd_dev,
3272 rbd_dev->lock_state);
3273 if (rbd_dev->lock_state != RBD_LOCK_STATE_RELEASING)
3274 return false;
3275
Ilya Dryomovbbead742017-04-13 12:17:38 +02003276 rbd_unlock(rbd_dev);
3277 /*
3278 * Give others a chance to grab the lock - we would re-acquire
3279 * almost immediately if we got new IO during ceph_osdc_sync()
3280 * otherwise. We need to ack our own notifications, so this
3281 * lock_dwork will be requeued from rbd_wait_state_locked()
3282 * after wake_requests() in rbd_handle_released_lock().
3283 */
3284 cancel_delayed_work(&rbd_dev->lock_dwork);
Ilya Dryomoved95b212016-08-12 16:40:02 +02003285 return true;
3286}
3287
3288static void rbd_release_lock_work(struct work_struct *work)
3289{
3290 struct rbd_device *rbd_dev = container_of(work, struct rbd_device,
3291 unlock_work);
3292
3293 down_write(&rbd_dev->lock_rwsem);
3294 rbd_release_lock(rbd_dev);
3295 up_write(&rbd_dev->lock_rwsem);
3296}
3297
3298static void rbd_handle_acquired_lock(struct rbd_device *rbd_dev, u8 struct_v,
3299 void **p)
3300{
3301 struct rbd_client_id cid = { 0 };
3302
3303 if (struct_v >= 2) {
3304 cid.gid = ceph_decode_64(p);
3305 cid.handle = ceph_decode_64(p);
3306 }
3307
3308 dout("%s rbd_dev %p cid %llu-%llu\n", __func__, rbd_dev, cid.gid,
3309 cid.handle);
3310 if (!rbd_cid_equal(&cid, &rbd_empty_cid)) {
3311 down_write(&rbd_dev->lock_rwsem);
3312 if (rbd_cid_equal(&cid, &rbd_dev->owner_cid)) {
3313 /*
3314 * we already know that the remote client is
3315 * the owner
3316 */
3317 up_write(&rbd_dev->lock_rwsem);
3318 return;
3319 }
3320
3321 rbd_set_owner_cid(rbd_dev, &cid);
3322 downgrade_write(&rbd_dev->lock_rwsem);
3323 } else {
3324 down_read(&rbd_dev->lock_rwsem);
3325 }
3326
3327 if (!__rbd_is_lock_owner(rbd_dev))
3328 wake_requests(rbd_dev, false);
3329 up_read(&rbd_dev->lock_rwsem);
3330}
3331
3332static void rbd_handle_released_lock(struct rbd_device *rbd_dev, u8 struct_v,
3333 void **p)
3334{
3335 struct rbd_client_id cid = { 0 };
3336
3337 if (struct_v >= 2) {
3338 cid.gid = ceph_decode_64(p);
3339 cid.handle = ceph_decode_64(p);
3340 }
3341
3342 dout("%s rbd_dev %p cid %llu-%llu\n", __func__, rbd_dev, cid.gid,
3343 cid.handle);
3344 if (!rbd_cid_equal(&cid, &rbd_empty_cid)) {
3345 down_write(&rbd_dev->lock_rwsem);
3346 if (!rbd_cid_equal(&cid, &rbd_dev->owner_cid)) {
3347 dout("%s rbd_dev %p unexpected owner, cid %llu-%llu != owner_cid %llu-%llu\n",
3348 __func__, rbd_dev, cid.gid, cid.handle,
3349 rbd_dev->owner_cid.gid, rbd_dev->owner_cid.handle);
3350 up_write(&rbd_dev->lock_rwsem);
3351 return;
3352 }
3353
3354 rbd_set_owner_cid(rbd_dev, &rbd_empty_cid);
3355 downgrade_write(&rbd_dev->lock_rwsem);
3356 } else {
3357 down_read(&rbd_dev->lock_rwsem);
3358 }
3359
3360 if (!__rbd_is_lock_owner(rbd_dev))
3361 wake_requests(rbd_dev, false);
3362 up_read(&rbd_dev->lock_rwsem);
3363}
3364
Ilya Dryomov3b77faa2017-04-13 12:17:39 +02003365/*
3366 * Returns result for ResponseMessage to be encoded (<= 0), or 1 if no
3367 * ResponseMessage is needed.
3368 */
3369static int rbd_handle_request_lock(struct rbd_device *rbd_dev, u8 struct_v,
3370 void **p)
Ilya Dryomoved95b212016-08-12 16:40:02 +02003371{
3372 struct rbd_client_id my_cid = rbd_get_cid(rbd_dev);
3373 struct rbd_client_id cid = { 0 };
Ilya Dryomov3b77faa2017-04-13 12:17:39 +02003374 int result = 1;
Ilya Dryomoved95b212016-08-12 16:40:02 +02003375
3376 if (struct_v >= 2) {
3377 cid.gid = ceph_decode_64(p);
3378 cid.handle = ceph_decode_64(p);
3379 }
3380
3381 dout("%s rbd_dev %p cid %llu-%llu\n", __func__, rbd_dev, cid.gid,
3382 cid.handle);
3383 if (rbd_cid_equal(&cid, &my_cid))
Ilya Dryomov3b77faa2017-04-13 12:17:39 +02003384 return result;
Ilya Dryomoved95b212016-08-12 16:40:02 +02003385
3386 down_read(&rbd_dev->lock_rwsem);
Ilya Dryomov3b77faa2017-04-13 12:17:39 +02003387 if (__rbd_is_lock_owner(rbd_dev)) {
3388 if (rbd_dev->lock_state == RBD_LOCK_STATE_LOCKED &&
3389 rbd_cid_equal(&rbd_dev->owner_cid, &rbd_empty_cid))
3390 goto out_unlock;
3391
3392 /*
3393 * encode ResponseMessage(0) so the peer can detect
3394 * a missing owner
3395 */
3396 result = 0;
3397
3398 if (rbd_dev->lock_state == RBD_LOCK_STATE_LOCKED) {
Ilya Dryomove010dd02017-04-13 12:17:39 +02003399 if (!rbd_dev->opts->exclusive) {
3400 dout("%s rbd_dev %p queueing unlock_work\n",
3401 __func__, rbd_dev);
3402 queue_work(rbd_dev->task_wq,
3403 &rbd_dev->unlock_work);
3404 } else {
3405 /* refuse to release the lock */
3406 result = -EROFS;
3407 }
Ilya Dryomoved95b212016-08-12 16:40:02 +02003408 }
3409 }
Ilya Dryomov3b77faa2017-04-13 12:17:39 +02003410
3411out_unlock:
Ilya Dryomoved95b212016-08-12 16:40:02 +02003412 up_read(&rbd_dev->lock_rwsem);
Ilya Dryomov3b77faa2017-04-13 12:17:39 +02003413 return result;
Ilya Dryomoved95b212016-08-12 16:40:02 +02003414}
3415
3416static void __rbd_acknowledge_notify(struct rbd_device *rbd_dev,
3417 u64 notify_id, u64 cookie, s32 *result)
3418{
3419 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
Kyle Spiers08a79102018-03-17 09:44:01 -07003420 char buf[4 + CEPH_ENCODING_START_BLK_LEN];
3421 int buf_size = sizeof(buf);
Ilya Dryomoved95b212016-08-12 16:40:02 +02003422 int ret;
3423
3424 if (result) {
3425 void *p = buf;
3426
3427 /* encode ResponseMessage */
3428 ceph_start_encoding(&p, 1, 1,
3429 buf_size - CEPH_ENCODING_START_BLK_LEN);
3430 ceph_encode_32(&p, *result);
3431 } else {
3432 buf_size = 0;
3433 }
3434
3435 ret = ceph_osdc_notify_ack(osdc, &rbd_dev->header_oid,
3436 &rbd_dev->header_oloc, notify_id, cookie,
3437 buf, buf_size);
3438 if (ret)
3439 rbd_warn(rbd_dev, "acknowledge_notify failed: %d", ret);
3440}
3441
3442static void rbd_acknowledge_notify(struct rbd_device *rbd_dev, u64 notify_id,
3443 u64 cookie)
3444{
3445 dout("%s rbd_dev %p\n", __func__, rbd_dev);
3446 __rbd_acknowledge_notify(rbd_dev, notify_id, cookie, NULL);
3447}
3448
3449static void rbd_acknowledge_notify_result(struct rbd_device *rbd_dev,
3450 u64 notify_id, u64 cookie, s32 result)
3451{
3452 dout("%s rbd_dev %p result %d\n", __func__, rbd_dev, result);
3453 __rbd_acknowledge_notify(rbd_dev, notify_id, cookie, &result);
3454}
Ilya Dryomov922dab62016-05-26 01:15:02 +02003455
3456static void rbd_watch_cb(void *arg, u64 notify_id, u64 cookie,
3457 u64 notifier_id, void *data, size_t data_len)
Alex Elderb8d70032012-11-30 17:53:04 -06003458{
Ilya Dryomov922dab62016-05-26 01:15:02 +02003459 struct rbd_device *rbd_dev = arg;
Ilya Dryomoved95b212016-08-12 16:40:02 +02003460 void *p = data;
3461 void *const end = p + data_len;
Ilya Dryomovd4c22692016-09-06 11:15:48 +02003462 u8 struct_v = 0;
Ilya Dryomoved95b212016-08-12 16:40:02 +02003463 u32 len;
3464 u32 notify_op;
Alex Elderb8d70032012-11-30 17:53:04 -06003465 int ret;
3466
Ilya Dryomoved95b212016-08-12 16:40:02 +02003467 dout("%s rbd_dev %p cookie %llu notify_id %llu data_len %zu\n",
3468 __func__, rbd_dev, cookie, notify_id, data_len);
3469 if (data_len) {
3470 ret = ceph_start_decoding(&p, end, 1, "NotifyMessage",
3471 &struct_v, &len);
3472 if (ret) {
3473 rbd_warn(rbd_dev, "failed to decode NotifyMessage: %d",
3474 ret);
3475 return;
3476 }
Ilya Dryomov52bb1f92014-07-23 17:11:20 +04003477
Ilya Dryomoved95b212016-08-12 16:40:02 +02003478 notify_op = ceph_decode_32(&p);
3479 } else {
3480 /* legacy notification for header updates */
3481 notify_op = RBD_NOTIFY_OP_HEADER_UPDATE;
3482 len = 0;
3483 }
Alex Elderb8d70032012-11-30 17:53:04 -06003484
Ilya Dryomoved95b212016-08-12 16:40:02 +02003485 dout("%s rbd_dev %p notify_op %u\n", __func__, rbd_dev, notify_op);
3486 switch (notify_op) {
3487 case RBD_NOTIFY_OP_ACQUIRED_LOCK:
3488 rbd_handle_acquired_lock(rbd_dev, struct_v, &p);
3489 rbd_acknowledge_notify(rbd_dev, notify_id, cookie);
3490 break;
3491 case RBD_NOTIFY_OP_RELEASED_LOCK:
3492 rbd_handle_released_lock(rbd_dev, struct_v, &p);
3493 rbd_acknowledge_notify(rbd_dev, notify_id, cookie);
3494 break;
3495 case RBD_NOTIFY_OP_REQUEST_LOCK:
Ilya Dryomov3b77faa2017-04-13 12:17:39 +02003496 ret = rbd_handle_request_lock(rbd_dev, struct_v, &p);
3497 if (ret <= 0)
Ilya Dryomoved95b212016-08-12 16:40:02 +02003498 rbd_acknowledge_notify_result(rbd_dev, notify_id,
Ilya Dryomov3b77faa2017-04-13 12:17:39 +02003499 cookie, ret);
Ilya Dryomoved95b212016-08-12 16:40:02 +02003500 else
3501 rbd_acknowledge_notify(rbd_dev, notify_id, cookie);
3502 break;
3503 case RBD_NOTIFY_OP_HEADER_UPDATE:
3504 ret = rbd_dev_refresh(rbd_dev);
3505 if (ret)
3506 rbd_warn(rbd_dev, "refresh failed: %d", ret);
3507
3508 rbd_acknowledge_notify(rbd_dev, notify_id, cookie);
3509 break;
3510 default:
3511 if (rbd_is_lock_owner(rbd_dev))
3512 rbd_acknowledge_notify_result(rbd_dev, notify_id,
3513 cookie, -EOPNOTSUPP);
3514 else
3515 rbd_acknowledge_notify(rbd_dev, notify_id, cookie);
3516 break;
3517 }
Alex Elderb8d70032012-11-30 17:53:04 -06003518}
3519
Ilya Dryomov99d16942016-08-12 16:11:41 +02003520static void __rbd_unregister_watch(struct rbd_device *rbd_dev);
3521
Ilya Dryomov922dab62016-05-26 01:15:02 +02003522static void rbd_watch_errcb(void *arg, u64 cookie, int err)
Ilya Dryomovbb040aa2014-06-19 11:38:14 +04003523{
Ilya Dryomov922dab62016-05-26 01:15:02 +02003524 struct rbd_device *rbd_dev = arg;
Ilya Dryomovbb040aa2014-06-19 11:38:14 +04003525
Ilya Dryomov922dab62016-05-26 01:15:02 +02003526 rbd_warn(rbd_dev, "encountered watch error: %d", err);
Ilya Dryomovbb040aa2014-06-19 11:38:14 +04003527
Ilya Dryomoved95b212016-08-12 16:40:02 +02003528 down_write(&rbd_dev->lock_rwsem);
3529 rbd_set_owner_cid(rbd_dev, &rbd_empty_cid);
3530 up_write(&rbd_dev->lock_rwsem);
Ilya Dryomovbb040aa2014-06-19 11:38:14 +04003531
Ilya Dryomov99d16942016-08-12 16:11:41 +02003532 mutex_lock(&rbd_dev->watch_mutex);
3533 if (rbd_dev->watch_state == RBD_WATCH_STATE_REGISTERED) {
3534 __rbd_unregister_watch(rbd_dev);
3535 rbd_dev->watch_state = RBD_WATCH_STATE_ERROR;
Ilya Dryomovbb040aa2014-06-19 11:38:14 +04003536
Ilya Dryomov99d16942016-08-12 16:11:41 +02003537 queue_delayed_work(rbd_dev->task_wq, &rbd_dev->watch_dwork, 0);
Ilya Dryomovbb040aa2014-06-19 11:38:14 +04003538 }
Ilya Dryomov99d16942016-08-12 16:11:41 +02003539 mutex_unlock(&rbd_dev->watch_mutex);
Ilya Dryomovbb040aa2014-06-19 11:38:14 +04003540}
3541
3542/*
Ilya Dryomov99d16942016-08-12 16:11:41 +02003543 * watch_mutex must be locked
Alex Elder9969ebc2013-01-18 12:31:10 -06003544 */
Ilya Dryomov99d16942016-08-12 16:11:41 +02003545static int __rbd_register_watch(struct rbd_device *rbd_dev)
Alex Elder9969ebc2013-01-18 12:31:10 -06003546{
3547 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
Ilya Dryomov922dab62016-05-26 01:15:02 +02003548 struct ceph_osd_linger_request *handle;
Alex Elder9969ebc2013-01-18 12:31:10 -06003549
Ilya Dryomov922dab62016-05-26 01:15:02 +02003550 rbd_assert(!rbd_dev->watch_handle);
Ilya Dryomov99d16942016-08-12 16:11:41 +02003551 dout("%s rbd_dev %p\n", __func__, rbd_dev);
Alex Elder9969ebc2013-01-18 12:31:10 -06003552
Ilya Dryomov922dab62016-05-26 01:15:02 +02003553 handle = ceph_osdc_watch(osdc, &rbd_dev->header_oid,
3554 &rbd_dev->header_oloc, rbd_watch_cb,
3555 rbd_watch_errcb, rbd_dev);
3556 if (IS_ERR(handle))
3557 return PTR_ERR(handle);
Alex Elder9969ebc2013-01-18 12:31:10 -06003558
Ilya Dryomov922dab62016-05-26 01:15:02 +02003559 rbd_dev->watch_handle = handle;
Ilya Dryomovb30a01f2014-05-22 19:28:52 +04003560 return 0;
Alex Elder9969ebc2013-01-18 12:31:10 -06003561}
3562
Ilya Dryomov99d16942016-08-12 16:11:41 +02003563/*
3564 * watch_mutex must be locked
3565 */
3566static void __rbd_unregister_watch(struct rbd_device *rbd_dev)
Ilya Dryomovfca27062013-12-16 18:02:40 +02003567{
Ilya Dryomov922dab62016-05-26 01:15:02 +02003568 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
3569 int ret;
Ilya Dryomovb30a01f2014-05-22 19:28:52 +04003570
Ilya Dryomov99d16942016-08-12 16:11:41 +02003571 rbd_assert(rbd_dev->watch_handle);
3572 dout("%s rbd_dev %p\n", __func__, rbd_dev);
Ilya Dryomovb30a01f2014-05-22 19:28:52 +04003573
Ilya Dryomov922dab62016-05-26 01:15:02 +02003574 ret = ceph_osdc_unwatch(osdc, rbd_dev->watch_handle);
3575 if (ret)
3576 rbd_warn(rbd_dev, "failed to unwatch: %d", ret);
Ilya Dryomovb30a01f2014-05-22 19:28:52 +04003577
Ilya Dryomov922dab62016-05-26 01:15:02 +02003578 rbd_dev->watch_handle = NULL;
Ilya Dryomovc525f032016-04-28 16:07:26 +02003579}
3580
Ilya Dryomov99d16942016-08-12 16:11:41 +02003581static int rbd_register_watch(struct rbd_device *rbd_dev)
Ilya Dryomovc525f032016-04-28 16:07:26 +02003582{
Ilya Dryomov99d16942016-08-12 16:11:41 +02003583 int ret;
Ilya Dryomov811c6682016-04-15 16:22:16 +02003584
Ilya Dryomov99d16942016-08-12 16:11:41 +02003585 mutex_lock(&rbd_dev->watch_mutex);
3586 rbd_assert(rbd_dev->watch_state == RBD_WATCH_STATE_UNREGISTERED);
3587 ret = __rbd_register_watch(rbd_dev);
3588 if (ret)
3589 goto out;
3590
3591 rbd_dev->watch_state = RBD_WATCH_STATE_REGISTERED;
3592 rbd_dev->watch_cookie = rbd_dev->watch_handle->linger_id;
3593
3594out:
3595 mutex_unlock(&rbd_dev->watch_mutex);
3596 return ret;
3597}
3598
3599static void cancel_tasks_sync(struct rbd_device *rbd_dev)
3600{
3601 dout("%s rbd_dev %p\n", __func__, rbd_dev);
3602
Ilya Dryomoved95b212016-08-12 16:40:02 +02003603 cancel_work_sync(&rbd_dev->acquired_lock_work);
3604 cancel_work_sync(&rbd_dev->released_lock_work);
3605 cancel_delayed_work_sync(&rbd_dev->lock_dwork);
3606 cancel_work_sync(&rbd_dev->unlock_work);
Ilya Dryomov99d16942016-08-12 16:11:41 +02003607}
3608
3609static void rbd_unregister_watch(struct rbd_device *rbd_dev)
3610{
Ilya Dryomoved95b212016-08-12 16:40:02 +02003611 WARN_ON(waitqueue_active(&rbd_dev->lock_waitq));
Ilya Dryomov99d16942016-08-12 16:11:41 +02003612 cancel_tasks_sync(rbd_dev);
3613
3614 mutex_lock(&rbd_dev->watch_mutex);
3615 if (rbd_dev->watch_state == RBD_WATCH_STATE_REGISTERED)
3616 __rbd_unregister_watch(rbd_dev);
3617 rbd_dev->watch_state = RBD_WATCH_STATE_UNREGISTERED;
3618 mutex_unlock(&rbd_dev->watch_mutex);
3619
Dongsheng Yang23edca82018-06-04 06:24:37 -04003620 cancel_delayed_work_sync(&rbd_dev->watch_dwork);
Ilya Dryomov811c6682016-04-15 16:22:16 +02003621 ceph_osdc_flush_notifies(&rbd_dev->rbd_client->client->osdc);
Ilya Dryomovfca27062013-12-16 18:02:40 +02003622}
3623
Ilya Dryomov14bb2112017-04-13 12:17:38 +02003624/*
3625 * lock_rwsem must be held for write
3626 */
3627static void rbd_reacquire_lock(struct rbd_device *rbd_dev)
3628{
3629 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
3630 char cookie[32];
3631 int ret;
3632
3633 WARN_ON(rbd_dev->lock_state != RBD_LOCK_STATE_LOCKED);
3634
3635 format_lock_cookie(rbd_dev, cookie);
3636 ret = ceph_cls_set_cookie(osdc, &rbd_dev->header_oid,
3637 &rbd_dev->header_oloc, RBD_LOCK_NAME,
3638 CEPH_CLS_LOCK_EXCLUSIVE, rbd_dev->lock_cookie,
3639 RBD_LOCK_TAG, cookie);
3640 if (ret) {
3641 if (ret != -EOPNOTSUPP)
3642 rbd_warn(rbd_dev, "failed to update lock cookie: %d",
3643 ret);
3644
3645 /*
3646 * Lock cookie cannot be updated on older OSDs, so do
3647 * a manual release and queue an acquire.
3648 */
3649 if (rbd_release_lock(rbd_dev))
3650 queue_delayed_work(rbd_dev->task_wq,
3651 &rbd_dev->lock_dwork, 0);
3652 } else {
Florian Margaineedd8ca82017-12-13 16:43:59 +01003653 __rbd_lock(rbd_dev, cookie);
Ilya Dryomov14bb2112017-04-13 12:17:38 +02003654 }
3655}
3656
Ilya Dryomov99d16942016-08-12 16:11:41 +02003657static void rbd_reregister_watch(struct work_struct *work)
3658{
3659 struct rbd_device *rbd_dev = container_of(to_delayed_work(work),
3660 struct rbd_device, watch_dwork);
3661 int ret;
3662
3663 dout("%s rbd_dev %p\n", __func__, rbd_dev);
3664
3665 mutex_lock(&rbd_dev->watch_mutex);
Ilya Dryomov87c0fde2016-09-29 13:41:05 +02003666 if (rbd_dev->watch_state != RBD_WATCH_STATE_ERROR) {
3667 mutex_unlock(&rbd_dev->watch_mutex);
Ilya Dryomov14bb2112017-04-13 12:17:38 +02003668 return;
Ilya Dryomov87c0fde2016-09-29 13:41:05 +02003669 }
Ilya Dryomov99d16942016-08-12 16:11:41 +02003670
3671 ret = __rbd_register_watch(rbd_dev);
3672 if (ret) {
3673 rbd_warn(rbd_dev, "failed to reregister watch: %d", ret);
Ilya Dryomov4d736442016-09-29 14:23:12 +02003674 if (ret == -EBLACKLISTED || ret == -ENOENT) {
Ilya Dryomov87c0fde2016-09-29 13:41:05 +02003675 set_bit(RBD_DEV_FLAG_BLACKLISTED, &rbd_dev->flags);
Ilya Dryomov14bb2112017-04-13 12:17:38 +02003676 wake_requests(rbd_dev, true);
Ilya Dryomov87c0fde2016-09-29 13:41:05 +02003677 } else {
Ilya Dryomov99d16942016-08-12 16:11:41 +02003678 queue_delayed_work(rbd_dev->task_wq,
3679 &rbd_dev->watch_dwork,
3680 RBD_RETRY_DELAY);
Ilya Dryomov87c0fde2016-09-29 13:41:05 +02003681 }
3682 mutex_unlock(&rbd_dev->watch_mutex);
Ilya Dryomov14bb2112017-04-13 12:17:38 +02003683 return;
Ilya Dryomov99d16942016-08-12 16:11:41 +02003684 }
3685
3686 rbd_dev->watch_state = RBD_WATCH_STATE_REGISTERED;
3687 rbd_dev->watch_cookie = rbd_dev->watch_handle->linger_id;
3688 mutex_unlock(&rbd_dev->watch_mutex);
3689
Ilya Dryomov14bb2112017-04-13 12:17:38 +02003690 down_write(&rbd_dev->lock_rwsem);
3691 if (rbd_dev->lock_state == RBD_LOCK_STATE_LOCKED)
3692 rbd_reacquire_lock(rbd_dev);
3693 up_write(&rbd_dev->lock_rwsem);
3694
Ilya Dryomov99d16942016-08-12 16:11:41 +02003695 ret = rbd_dev_refresh(rbd_dev);
3696 if (ret)
Colin Ian Kingf6870cc2018-03-19 13:33:10 +00003697 rbd_warn(rbd_dev, "reregistration refresh failed: %d", ret);
Ilya Dryomov99d16942016-08-12 16:11:41 +02003698}
3699
Alex Elder36be9a72013-01-19 00:30:28 -06003700/*
Alex Elderf40eb342013-04-25 15:09:42 -05003701 * Synchronous osd object method call. Returns the number of bytes
3702 * returned in the outbound buffer, or a negative error code.
Alex Elder36be9a72013-01-19 00:30:28 -06003703 */
3704static int rbd_obj_method_sync(struct rbd_device *rbd_dev,
Ilya Dryomovecd4a682017-01-25 18:16:21 +01003705 struct ceph_object_id *oid,
3706 struct ceph_object_locator *oloc,
Alex Elder36be9a72013-01-19 00:30:28 -06003707 const char *method_name,
Alex Elder41579762013-04-21 12:14:45 -05003708 const void *outbound,
Alex Elder36be9a72013-01-19 00:30:28 -06003709 size_t outbound_size,
Alex Elder41579762013-04-21 12:14:45 -05003710 void *inbound,
Alex Eldere2a58ee2013-04-30 00:44:33 -05003711 size_t inbound_size)
Alex Elder36be9a72013-01-19 00:30:28 -06003712{
Ilya Dryomovecd4a682017-01-25 18:16:21 +01003713 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
3714 struct page *req_page = NULL;
3715 struct page *reply_page;
Alex Elder36be9a72013-01-19 00:30:28 -06003716 int ret;
3717
3718 /*
Alex Elder6010a452013-04-05 01:27:11 -05003719 * Method calls are ultimately read operations. The result
3720 * should placed into the inbound buffer provided. They
3721 * also supply outbound data--parameters for the object
3722 * method. Currently if this is present it will be a
3723 * snapshot id.
Alex Elder36be9a72013-01-19 00:30:28 -06003724 */
Ilya Dryomovecd4a682017-01-25 18:16:21 +01003725 if (outbound) {
3726 if (outbound_size > PAGE_SIZE)
3727 return -E2BIG;
Alex Elder36be9a72013-01-19 00:30:28 -06003728
Ilya Dryomovecd4a682017-01-25 18:16:21 +01003729 req_page = alloc_page(GFP_KERNEL);
3730 if (!req_page)
3731 return -ENOMEM;
Alex Elder36be9a72013-01-19 00:30:28 -06003732
Ilya Dryomovecd4a682017-01-25 18:16:21 +01003733 memcpy(page_address(req_page), outbound, outbound_size);
Alex Elder04017e22013-04-05 14:46:02 -05003734 }
Alex Elder430c28c2013-04-03 21:32:51 -05003735
Ilya Dryomovecd4a682017-01-25 18:16:21 +01003736 reply_page = alloc_page(GFP_KERNEL);
3737 if (!reply_page) {
3738 if (req_page)
3739 __free_page(req_page);
3740 return -ENOMEM;
3741 }
Alex Elder36be9a72013-01-19 00:30:28 -06003742
Ilya Dryomovecd4a682017-01-25 18:16:21 +01003743 ret = ceph_osdc_call(osdc, oid, oloc, RBD_DRV_NAME, method_name,
3744 CEPH_OSD_FLAG_READ, req_page, outbound_size,
3745 reply_page, &inbound_size);
3746 if (!ret) {
3747 memcpy(inbound, page_address(reply_page), inbound_size);
3748 ret = inbound_size;
3749 }
Alex Elder57385b52013-04-21 12:14:45 -05003750
Ilya Dryomovecd4a682017-01-25 18:16:21 +01003751 if (req_page)
3752 __free_page(req_page);
3753 __free_page(reply_page);
Alex Elder36be9a72013-01-19 00:30:28 -06003754 return ret;
3755}
3756
Ilya Dryomoved95b212016-08-12 16:40:02 +02003757/*
3758 * lock_rwsem must be held for read
3759 */
Ilya Dryomov2f18d462018-04-04 10:15:38 +02003760static int rbd_wait_state_locked(struct rbd_device *rbd_dev, bool may_acquire)
Ilya Dryomoved95b212016-08-12 16:40:02 +02003761{
3762 DEFINE_WAIT(wait);
Dongsheng Yang34f55d02018-03-26 10:22:55 -04003763 unsigned long timeout;
Ilya Dryomov2f18d462018-04-04 10:15:38 +02003764 int ret = 0;
3765
3766 if (test_bit(RBD_DEV_FLAG_BLACKLISTED, &rbd_dev->flags))
3767 return -EBLACKLISTED;
3768
3769 if (rbd_dev->lock_state == RBD_LOCK_STATE_LOCKED)
3770 return 0;
3771
3772 if (!may_acquire) {
3773 rbd_warn(rbd_dev, "exclusive lock required");
3774 return -EROFS;
3775 }
Ilya Dryomoved95b212016-08-12 16:40:02 +02003776
3777 do {
3778 /*
3779 * Note the use of mod_delayed_work() in rbd_acquire_lock()
3780 * and cancel_delayed_work() in wake_requests().
3781 */
3782 dout("%s rbd_dev %p queueing lock_dwork\n", __func__, rbd_dev);
3783 queue_delayed_work(rbd_dev->task_wq, &rbd_dev->lock_dwork, 0);
3784 prepare_to_wait_exclusive(&rbd_dev->lock_waitq, &wait,
3785 TASK_UNINTERRUPTIBLE);
3786 up_read(&rbd_dev->lock_rwsem);
Dongsheng Yang34f55d02018-03-26 10:22:55 -04003787 timeout = schedule_timeout(ceph_timeout_jiffies(
3788 rbd_dev->opts->lock_timeout));
Ilya Dryomoved95b212016-08-12 16:40:02 +02003789 down_read(&rbd_dev->lock_rwsem);
Ilya Dryomov2f18d462018-04-04 10:15:38 +02003790 if (test_bit(RBD_DEV_FLAG_BLACKLISTED, &rbd_dev->flags)) {
3791 ret = -EBLACKLISTED;
3792 break;
3793 }
Dongsheng Yang34f55d02018-03-26 10:22:55 -04003794 if (!timeout) {
3795 rbd_warn(rbd_dev, "timed out waiting for lock");
3796 ret = -ETIMEDOUT;
3797 break;
3798 }
Ilya Dryomov2f18d462018-04-04 10:15:38 +02003799 } while (rbd_dev->lock_state != RBD_LOCK_STATE_LOCKED);
Ilya Dryomov87c0fde2016-09-29 13:41:05 +02003800
Ilya Dryomoved95b212016-08-12 16:40:02 +02003801 finish_wait(&rbd_dev->lock_waitq, &wait);
Ilya Dryomov2f18d462018-04-04 10:15:38 +02003802 return ret;
Ilya Dryomoved95b212016-08-12 16:40:02 +02003803}
3804
Christoph Hellwig7ad18af2015-01-13 17:20:04 +01003805static void rbd_queue_workfn(struct work_struct *work)
Ilya Dryomovbc1ecc62014-08-04 18:04:39 +04003806{
Christoph Hellwig7ad18af2015-01-13 17:20:04 +01003807 struct request *rq = blk_mq_rq_from_pdu(work);
3808 struct rbd_device *rbd_dev = rq->q->queuedata;
Ilya Dryomovbc1ecc62014-08-04 18:04:39 +04003809 struct rbd_img_request *img_request;
Josh Durgin4e752f02014-04-08 11:12:11 -07003810 struct ceph_snap_context *snapc = NULL;
Ilya Dryomovbc1ecc62014-08-04 18:04:39 +04003811 u64 offset = (u64)blk_rq_pos(rq) << SECTOR_SHIFT;
3812 u64 length = blk_rq_bytes(rq);
Guangliang Zhao6d2940c2014-03-13 11:21:35 +08003813 enum obj_operation_type op_type;
Josh Durgin4e752f02014-04-08 11:12:11 -07003814 u64 mapping_size;
Ilya Dryomov80de1912016-09-20 14:23:17 +02003815 bool must_be_locked;
Ilya Dryomovbc1ecc62014-08-04 18:04:39 +04003816 int result;
3817
Christoph Hellwigaebf5262017-01-31 16:57:31 +01003818 switch (req_op(rq)) {
3819 case REQ_OP_DISCARD:
3820 op_type = OBJ_OP_DISCARD;
3821 break;
Ilya Dryomov6484cbe2019-01-29 12:46:25 +01003822 case REQ_OP_WRITE_ZEROES:
3823 op_type = OBJ_OP_ZEROOUT;
3824 break;
Christoph Hellwigaebf5262017-01-31 16:57:31 +01003825 case REQ_OP_WRITE:
3826 op_type = OBJ_OP_WRITE;
3827 break;
3828 case REQ_OP_READ:
3829 op_type = OBJ_OP_READ;
3830 break;
3831 default:
3832 dout("%s: non-fs request type %d\n", __func__, req_op(rq));
Christoph Hellwig7ad18af2015-01-13 17:20:04 +01003833 result = -EIO;
3834 goto err;
3835 }
3836
Ilya Dryomovbc1ecc62014-08-04 18:04:39 +04003837 /* Ignore/skip any zero-length requests */
3838
3839 if (!length) {
3840 dout("%s: zero-length request\n", __func__);
3841 result = 0;
3842 goto err_rq;
3843 }
3844
Ilya Dryomov9568c932017-10-12 12:35:19 +02003845 rbd_assert(op_type == OBJ_OP_READ ||
3846 rbd_dev->spec->snap_id == CEPH_NOSNAP);
Ilya Dryomovbc1ecc62014-08-04 18:04:39 +04003847
3848 /*
3849 * Quit early if the mapped snapshot no longer exists. It's
3850 * still possible the snapshot will have disappeared by the
3851 * time our request arrives at the osd, but there's no sense in
3852 * sending it if we already know.
3853 */
3854 if (!test_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags)) {
3855 dout("request for non-existent snapshot");
3856 rbd_assert(rbd_dev->spec->snap_id != CEPH_NOSNAP);
3857 result = -ENXIO;
3858 goto err_rq;
3859 }
3860
3861 if (offset && length > U64_MAX - offset + 1) {
3862 rbd_warn(rbd_dev, "bad request range (%llu~%llu)", offset,
3863 length);
3864 result = -EINVAL;
3865 goto err_rq; /* Shouldn't happen */
3866 }
3867
Christoph Hellwig7ad18af2015-01-13 17:20:04 +01003868 blk_mq_start_request(rq);
3869
Josh Durgin4e752f02014-04-08 11:12:11 -07003870 down_read(&rbd_dev->header_rwsem);
3871 mapping_size = rbd_dev->mapping.size;
Guangliang Zhao6d2940c2014-03-13 11:21:35 +08003872 if (op_type != OBJ_OP_READ) {
Josh Durgin4e752f02014-04-08 11:12:11 -07003873 snapc = rbd_dev->header.snapc;
3874 ceph_get_snap_context(snapc);
3875 }
3876 up_read(&rbd_dev->header_rwsem);
3877
3878 if (offset + length > mapping_size) {
Ilya Dryomovbc1ecc62014-08-04 18:04:39 +04003879 rbd_warn(rbd_dev, "beyond EOD (%llu~%llu > %llu)", offset,
Josh Durgin4e752f02014-04-08 11:12:11 -07003880 length, mapping_size);
Ilya Dryomovbc1ecc62014-08-04 18:04:39 +04003881 result = -EIO;
3882 goto err_rq;
3883 }
3884
Ilya Dryomovf9bebd52017-04-13 12:17:39 +02003885 must_be_locked =
3886 (rbd_dev->header.features & RBD_FEATURE_EXCLUSIVE_LOCK) &&
3887 (op_type != OBJ_OP_READ || rbd_dev->opts->lock_on_read);
Ilya Dryomoved95b212016-08-12 16:40:02 +02003888 if (must_be_locked) {
3889 down_read(&rbd_dev->lock_rwsem);
Ilya Dryomov2f18d462018-04-04 10:15:38 +02003890 result = rbd_wait_state_locked(rbd_dev,
3891 !rbd_dev->opts->exclusive);
3892 if (result)
Ilya Dryomov87c0fde2016-09-29 13:41:05 +02003893 goto err_unlock;
Ilya Dryomoved95b212016-08-12 16:40:02 +02003894 }
3895
Ilya Dryomovdfd98752018-02-06 19:26:35 +01003896 img_request = rbd_img_request_create(rbd_dev, op_type, snapc);
Ilya Dryomovbc1ecc62014-08-04 18:04:39 +04003897 if (!img_request) {
3898 result = -ENOMEM;
Ilya Dryomoved95b212016-08-12 16:40:02 +02003899 goto err_unlock;
Ilya Dryomovbc1ecc62014-08-04 18:04:39 +04003900 }
3901 img_request->rq = rq;
Ilya Dryomov70b16db2015-11-27 19:23:24 +01003902 snapc = NULL; /* img_request consumes a ref */
Ilya Dryomovbc1ecc62014-08-04 18:04:39 +04003903
Ilya Dryomov6484cbe2019-01-29 12:46:25 +01003904 if (op_type == OBJ_OP_DISCARD || op_type == OBJ_OP_ZEROOUT)
Ilya Dryomov5a237812018-02-06 19:26:34 +01003905 result = rbd_img_fill_nodata(img_request, offset, length);
Guangliang Zhao90e98c52014-04-01 22:22:16 +08003906 else
Ilya Dryomov5a237812018-02-06 19:26:34 +01003907 result = rbd_img_fill_from_bio(img_request, offset, length,
3908 rq->bio);
Ilya Dryomov0c93e1b2019-01-30 15:14:48 +01003909 if (result || !img_request->pending_count)
Ilya Dryomovbc1ecc62014-08-04 18:04:39 +04003910 goto err_img_request;
3911
Ilya Dryomovefbd1a12018-01-30 17:52:11 +01003912 rbd_img_request_submit(img_request);
Ilya Dryomoved95b212016-08-12 16:40:02 +02003913 if (must_be_locked)
3914 up_read(&rbd_dev->lock_rwsem);
Ilya Dryomovbc1ecc62014-08-04 18:04:39 +04003915 return;
3916
3917err_img_request:
3918 rbd_img_request_put(img_request);
Ilya Dryomoved95b212016-08-12 16:40:02 +02003919err_unlock:
3920 if (must_be_locked)
3921 up_read(&rbd_dev->lock_rwsem);
Ilya Dryomovbc1ecc62014-08-04 18:04:39 +04003922err_rq:
3923 if (result)
3924 rbd_warn(rbd_dev, "%s %llx at %llx result %d",
Guangliang Zhao6d2940c2014-03-13 11:21:35 +08003925 obj_op_name(op_type), length, offset, result);
SF Markus Elfringe96a6502014-11-02 15:20:59 +01003926 ceph_put_snap_context(snapc);
Christoph Hellwig7ad18af2015-01-13 17:20:04 +01003927err:
Christoph Hellwig2a842ac2017-06-03 09:38:04 +02003928 blk_mq_end_request(rq, errno_to_blk_status(result));
Ilya Dryomovbc1ecc62014-08-04 18:04:39 +04003929}
3930
Christoph Hellwigfc17b652017-06-03 09:38:05 +02003931static blk_status_t rbd_queue_rq(struct blk_mq_hw_ctx *hctx,
Christoph Hellwig7ad18af2015-01-13 17:20:04 +01003932 const struct blk_mq_queue_data *bd)
Ilya Dryomovbc1ecc62014-08-04 18:04:39 +04003933{
Christoph Hellwig7ad18af2015-01-13 17:20:04 +01003934 struct request *rq = bd->rq;
3935 struct work_struct *work = blk_mq_rq_to_pdu(rq);
Ilya Dryomovbc1ecc62014-08-04 18:04:39 +04003936
Christoph Hellwig7ad18af2015-01-13 17:20:04 +01003937 queue_work(rbd_wq, work);
Christoph Hellwigfc17b652017-06-03 09:38:05 +02003938 return BLK_STS_OK;
Alex Elderbf0d5f502012-11-22 00:00:08 -06003939}
3940
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003941static void rbd_free_disk(struct rbd_device *rbd_dev)
3942{
Ilya Dryomov5769ed02017-04-13 12:17:38 +02003943 blk_cleanup_queue(rbd_dev->disk->queue);
3944 blk_mq_free_tag_set(&rbd_dev->tag_set);
3945 put_disk(rbd_dev->disk);
Alex Eldera0cab922013-04-25 23:15:08 -05003946 rbd_dev->disk = NULL;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003947}
3948
Alex Elder788e2df2013-01-17 12:25:27 -06003949static int rbd_obj_read_sync(struct rbd_device *rbd_dev,
Ilya Dryomovfe5478e2017-01-25 18:16:21 +01003950 struct ceph_object_id *oid,
3951 struct ceph_object_locator *oloc,
3952 void *buf, int buf_len)
Alex Elder788e2df2013-01-17 12:25:27 -06003953
3954{
Ilya Dryomovfe5478e2017-01-25 18:16:21 +01003955 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
3956 struct ceph_osd_request *req;
3957 struct page **pages;
3958 int num_pages = calc_pages_for(0, buf_len);
Alex Elder788e2df2013-01-17 12:25:27 -06003959 int ret;
3960
Ilya Dryomovfe5478e2017-01-25 18:16:21 +01003961 req = ceph_osdc_alloc_request(osdc, NULL, 1, false, GFP_KERNEL);
3962 if (!req)
3963 return -ENOMEM;
Alex Elder788e2df2013-01-17 12:25:27 -06003964
Ilya Dryomovfe5478e2017-01-25 18:16:21 +01003965 ceph_oid_copy(&req->r_base_oid, oid);
3966 ceph_oloc_copy(&req->r_base_oloc, oloc);
3967 req->r_flags = CEPH_OSD_FLAG_READ;
Alex Elder788e2df2013-01-17 12:25:27 -06003968
Ilya Dryomovfe5478e2017-01-25 18:16:21 +01003969 pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL);
3970 if (IS_ERR(pages)) {
3971 ret = PTR_ERR(pages);
3972 goto out_req;
3973 }
Alex Elder1ceae7e2013-02-06 13:11:38 -06003974
Ilya Dryomovfe5478e2017-01-25 18:16:21 +01003975 osd_req_op_extent_init(req, 0, CEPH_OSD_OP_READ, 0, buf_len, 0, 0);
3976 osd_req_op_extent_osd_data_pages(req, 0, pages, buf_len, 0, false,
3977 true);
Alex Elder788e2df2013-01-17 12:25:27 -06003978
Ilya Dryomov26f887e2018-10-15 16:11:37 +02003979 ret = ceph_osdc_alloc_messages(req, GFP_KERNEL);
3980 if (ret)
3981 goto out_req;
3982
Ilya Dryomovfe5478e2017-01-25 18:16:21 +01003983 ceph_osdc_start_request(osdc, req, false);
3984 ret = ceph_osdc_wait_request(osdc, req);
3985 if (ret >= 0)
3986 ceph_copy_from_page_vector(pages, buf, 0, ret);
3987
3988out_req:
3989 ceph_osdc_put_request(req);
Alex Elder788e2df2013-01-17 12:25:27 -06003990 return ret;
3991}
3992
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003993/*
Alex Elder662518b2013-05-06 09:51:29 -05003994 * Read the complete header for the given rbd device. On successful
3995 * return, the rbd_dev->header field will contain up-to-date
3996 * information about the image.
Alex Elder4156d992012-08-02 11:29:46 -05003997 */
Alex Elder99a41eb2013-05-06 09:51:30 -05003998static int rbd_dev_v1_header_info(struct rbd_device *rbd_dev)
Alex Elder4156d992012-08-02 11:29:46 -05003999{
4000 struct rbd_image_header_ondisk *ondisk = NULL;
4001 u32 snap_count = 0;
4002 u64 names_size = 0;
4003 u32 want_count;
4004 int ret;
4005
4006 /*
4007 * The complete header will include an array of its 64-bit
4008 * snapshot ids, followed by the names of those snapshots as
4009 * a contiguous block of NUL-terminated strings. Note that
4010 * the number of snapshots could change by the time we read
4011 * it in, in which case we re-read it.
4012 */
4013 do {
4014 size_t size;
4015
4016 kfree(ondisk);
4017
4018 size = sizeof (*ondisk);
4019 size += snap_count * sizeof (struct rbd_image_snap_ondisk);
4020 size += names_size;
4021 ondisk = kmalloc(size, GFP_KERNEL);
4022 if (!ondisk)
Alex Elder662518b2013-05-06 09:51:29 -05004023 return -ENOMEM;
Alex Elder4156d992012-08-02 11:29:46 -05004024
Ilya Dryomovfe5478e2017-01-25 18:16:21 +01004025 ret = rbd_obj_read_sync(rbd_dev, &rbd_dev->header_oid,
4026 &rbd_dev->header_oloc, ondisk, size);
Alex Elder4156d992012-08-02 11:29:46 -05004027 if (ret < 0)
Alex Elder662518b2013-05-06 09:51:29 -05004028 goto out;
Alex Elderc0cd10db2013-04-26 09:43:47 -05004029 if ((size_t)ret < size) {
Alex Elder4156d992012-08-02 11:29:46 -05004030 ret = -ENXIO;
Alex Elder06ecc6c2012-11-01 10:17:15 -05004031 rbd_warn(rbd_dev, "short header read (want %zd got %d)",
4032 size, ret);
Alex Elder662518b2013-05-06 09:51:29 -05004033 goto out;
Alex Elder4156d992012-08-02 11:29:46 -05004034 }
4035 if (!rbd_dev_ondisk_valid(ondisk)) {
4036 ret = -ENXIO;
Alex Elder06ecc6c2012-11-01 10:17:15 -05004037 rbd_warn(rbd_dev, "invalid header");
Alex Elder662518b2013-05-06 09:51:29 -05004038 goto out;
Alex Elder4156d992012-08-02 11:29:46 -05004039 }
4040
4041 names_size = le64_to_cpu(ondisk->snap_names_len);
4042 want_count = snap_count;
4043 snap_count = le32_to_cpu(ondisk->snap_count);
4044 } while (snap_count != want_count);
4045
Alex Elder662518b2013-05-06 09:51:29 -05004046 ret = rbd_header_from_disk(rbd_dev, ondisk);
4047out:
Alex Elder4156d992012-08-02 11:29:46 -05004048 kfree(ondisk);
4049
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004050 return ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004051}
4052
Alex Elder15228ed2013-05-01 12:43:03 -05004053/*
4054 * Clear the rbd device's EXISTS flag if the snapshot it's mapped to
4055 * has disappeared from the (just updated) snapshot context.
4056 */
4057static void rbd_exists_validate(struct rbd_device *rbd_dev)
4058{
4059 u64 snap_id;
4060
4061 if (!test_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags))
4062 return;
4063
4064 snap_id = rbd_dev->spec->snap_id;
4065 if (snap_id == CEPH_NOSNAP)
4066 return;
4067
4068 if (rbd_dev_snap_index(rbd_dev, snap_id) == BAD_SNAP_INDEX)
4069 clear_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags);
4070}
4071
Josh Durgin98752012013-08-29 17:26:31 -07004072static void rbd_dev_update_size(struct rbd_device *rbd_dev)
4073{
4074 sector_t size;
Josh Durgin98752012013-08-29 17:26:31 -07004075
4076 /*
Ilya Dryomov811c6682016-04-15 16:22:16 +02004077 * If EXISTS is not set, rbd_dev->disk may be NULL, so don't
4078 * try to update its size. If REMOVING is set, updating size
4079 * is just useless work since the device can't be opened.
Josh Durgin98752012013-08-29 17:26:31 -07004080 */
Ilya Dryomov811c6682016-04-15 16:22:16 +02004081 if (test_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags) &&
4082 !test_bit(RBD_DEV_FLAG_REMOVING, &rbd_dev->flags)) {
Josh Durgin98752012013-08-29 17:26:31 -07004083 size = (sector_t)rbd_dev->mapping.size / SECTOR_SIZE;
4084 dout("setting size to %llu sectors", (unsigned long long)size);
4085 set_capacity(rbd_dev->disk, size);
4086 revalidate_disk(rbd_dev->disk);
4087 }
4088}
4089
Alex Eldercc4a38bd2013-04-30 00:44:33 -05004090static int rbd_dev_refresh(struct rbd_device *rbd_dev)
Alex Elder1fe5e992012-07-25 09:32:41 -05004091{
Alex Eldere627db02013-05-06 07:40:30 -05004092 u64 mapping_size;
Alex Elder1fe5e992012-07-25 09:32:41 -05004093 int ret;
4094
Alex Eldercfbf6372013-05-31 17:40:45 -05004095 down_write(&rbd_dev->header_rwsem);
Alex Elder3b5cf2a2013-05-29 11:18:59 -05004096 mapping_size = rbd_dev->mapping.size;
Ilya Dryomova720ae02014-07-23 17:11:19 +04004097
4098 ret = rbd_dev_header_info(rbd_dev);
Ilya Dryomov52bb1f92014-07-23 17:11:20 +04004099 if (ret)
Ilya Dryomov73e39e42015-01-08 20:18:22 +03004100 goto out;
Alex Elder15228ed2013-05-01 12:43:03 -05004101
Ilya Dryomove8f59b52014-07-24 10:42:13 +04004102 /*
4103 * If there is a parent, see if it has disappeared due to the
4104 * mapped image getting flattened.
4105 */
4106 if (rbd_dev->parent) {
4107 ret = rbd_dev_v2_parent_info(rbd_dev);
4108 if (ret)
Ilya Dryomov73e39e42015-01-08 20:18:22 +03004109 goto out;
Ilya Dryomove8f59b52014-07-24 10:42:13 +04004110 }
4111
Ilya Dryomov5ff11082014-07-23 17:11:21 +04004112 if (rbd_dev->spec->snap_id == CEPH_NOSNAP) {
Ilya Dryomov73e39e42015-01-08 20:18:22 +03004113 rbd_dev->mapping.size = rbd_dev->header.image_size;
Ilya Dryomov5ff11082014-07-23 17:11:21 +04004114 } else {
4115 /* validate mapped snapshot's EXISTS flag */
4116 rbd_exists_validate(rbd_dev);
4117 }
Alex Elder15228ed2013-05-01 12:43:03 -05004118
Ilya Dryomov73e39e42015-01-08 20:18:22 +03004119out:
Alex Eldercfbf6372013-05-31 17:40:45 -05004120 up_write(&rbd_dev->header_rwsem);
Ilya Dryomov73e39e42015-01-08 20:18:22 +03004121 if (!ret && mapping_size != rbd_dev->mapping.size)
Josh Durgin98752012013-08-29 17:26:31 -07004122 rbd_dev_update_size(rbd_dev);
Alex Elder1fe5e992012-07-25 09:32:41 -05004123
Ilya Dryomov73e39e42015-01-08 20:18:22 +03004124 return ret;
Alex Elder1fe5e992012-07-25 09:32:41 -05004125}
4126
Christoph Hellwigd6296d392017-05-01 10:19:08 -06004127static int rbd_init_request(struct blk_mq_tag_set *set, struct request *rq,
4128 unsigned int hctx_idx, unsigned int numa_node)
Christoph Hellwig7ad18af2015-01-13 17:20:04 +01004129{
4130 struct work_struct *work = blk_mq_rq_to_pdu(rq);
4131
4132 INIT_WORK(work, rbd_queue_workfn);
4133 return 0;
4134}
4135
Eric Biggersf363b082017-03-30 13:39:16 -07004136static const struct blk_mq_ops rbd_mq_ops = {
Christoph Hellwig7ad18af2015-01-13 17:20:04 +01004137 .queue_rq = rbd_queue_rq,
Christoph Hellwig7ad18af2015-01-13 17:20:04 +01004138 .init_request = rbd_init_request,
4139};
4140
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004141static int rbd_init_disk(struct rbd_device *rbd_dev)
4142{
4143 struct gendisk *disk;
4144 struct request_queue *q;
Ilya Dryomov420efbd2018-04-16 09:32:18 +02004145 unsigned int objset_bytes =
4146 rbd_dev->layout.object_size * rbd_dev->layout.stripe_count;
Christoph Hellwig7ad18af2015-01-13 17:20:04 +01004147 int err;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004148
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004149 /* create gendisk info */
Ilya Dryomov7e513d42013-12-16 19:26:32 +02004150 disk = alloc_disk(single_major ?
4151 (1 << RBD_SINGLE_MAJOR_PART_SHIFT) :
4152 RBD_MINORS_PER_MAJOR);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004153 if (!disk)
Alex Elder1fcdb8a2012-08-29 17:11:06 -05004154 return -ENOMEM;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004155
Alex Elderf0f8cef2012-01-29 13:57:44 -06004156 snprintf(disk->disk_name, sizeof(disk->disk_name), RBD_DRV_NAME "%d",
Alex Elderde71a292012-07-03 16:01:19 -05004157 rbd_dev->dev_id);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004158 disk->major = rbd_dev->major;
Ilya Dryomovdd82fff2013-12-13 15:28:57 +02004159 disk->first_minor = rbd_dev->minor;
Ilya Dryomov7e513d42013-12-16 19:26:32 +02004160 if (single_major)
4161 disk->flags |= GENHD_FL_EXT_DEVT;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004162 disk->fops = &rbd_bd_ops;
4163 disk->private_data = rbd_dev;
4164
Christoph Hellwig7ad18af2015-01-13 17:20:04 +01004165 memset(&rbd_dev->tag_set, 0, sizeof(rbd_dev->tag_set));
4166 rbd_dev->tag_set.ops = &rbd_mq_ops;
Ilya Dryomovb5584182015-06-23 16:21:19 +03004167 rbd_dev->tag_set.queue_depth = rbd_dev->opts->queue_depth;
Christoph Hellwig7ad18af2015-01-13 17:20:04 +01004168 rbd_dev->tag_set.numa_node = NUMA_NO_NODE;
Ming Lei56d18f62019-02-15 19:13:24 +08004169 rbd_dev->tag_set.flags = BLK_MQ_F_SHOULD_MERGE;
Christoph Hellwig7ad18af2015-01-13 17:20:04 +01004170 rbd_dev->tag_set.nr_hw_queues = 1;
4171 rbd_dev->tag_set.cmd_size = sizeof(struct work_struct);
4172
4173 err = blk_mq_alloc_tag_set(&rbd_dev->tag_set);
4174 if (err)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004175 goto out_disk;
Josh Durgin029bcbd2011-07-22 11:35:23 -07004176
Christoph Hellwig7ad18af2015-01-13 17:20:04 +01004177 q = blk_mq_init_queue(&rbd_dev->tag_set);
4178 if (IS_ERR(q)) {
4179 err = PTR_ERR(q);
4180 goto out_tag_set;
4181 }
4182
Bart Van Assche8b904b52018-03-07 17:10:10 -08004183 blk_queue_flag_set(QUEUE_FLAG_NONROT, q);
Ilya Dryomovd8a2c892015-03-24 16:15:17 +03004184 /* QUEUE_FLAG_ADD_RANDOM is off by default for blk-mq */
Alex Elder593a9e72012-02-07 12:03:37 -06004185
Ilya Dryomov420efbd2018-04-16 09:32:18 +02004186 blk_queue_max_hw_sectors(q, objset_bytes >> SECTOR_SHIFT);
Ilya Dryomov0d9fde42015-10-07 16:09:35 +02004187 q->limits.max_sectors = queue_max_hw_sectors(q);
Ilya Dryomov21acdf42017-12-21 15:35:11 +01004188 blk_queue_max_segments(q, USHRT_MAX);
Ilya Dryomov24f1df62018-01-12 17:22:10 +01004189 blk_queue_max_segment_size(q, UINT_MAX);
Ilya Dryomov16d80c52019-03-15 14:50:04 +01004190 blk_queue_io_min(q, rbd_dev->opts->alloc_size);
4191 blk_queue_io_opt(q, rbd_dev->opts->alloc_size);
Josh Durgin029bcbd2011-07-22 11:35:23 -07004192
Ilya Dryomovd9360542018-03-23 06:14:47 +01004193 if (rbd_dev->opts->trim) {
4194 blk_queue_flag_set(QUEUE_FLAG_DISCARD, q);
Ilya Dryomov16d80c52019-03-15 14:50:04 +01004195 q->limits.discard_granularity = rbd_dev->opts->alloc_size;
Ilya Dryomovd9360542018-03-23 06:14:47 +01004196 blk_queue_max_discard_sectors(q, objset_bytes >> SECTOR_SHIFT);
4197 blk_queue_max_write_zeroes_sectors(q, objset_bytes >> SECTOR_SHIFT);
4198 }
Guangliang Zhao90e98c52014-04-01 22:22:16 +08004199
Ronny Hegewaldbae818e2015-10-15 18:50:46 +00004200 if (!ceph_test_opt(rbd_dev->rbd_client->client, NOCRC))
Jan Karadc3b17c2017-02-02 15:56:50 +01004201 q->backing_dev_info->capabilities |= BDI_CAP_STABLE_WRITES;
Ronny Hegewaldbae818e2015-10-15 18:50:46 +00004202
Ilya Dryomov5769ed02017-04-13 12:17:38 +02004203 /*
4204 * disk_release() expects a queue ref from add_disk() and will
4205 * put it. Hold an extra ref until add_disk() is called.
4206 */
4207 WARN_ON(!blk_get_queue(q));
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004208 disk->queue = q;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004209 q->queuedata = rbd_dev;
4210
4211 rbd_dev->disk = disk;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004212
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004213 return 0;
Christoph Hellwig7ad18af2015-01-13 17:20:04 +01004214out_tag_set:
4215 blk_mq_free_tag_set(&rbd_dev->tag_set);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004216out_disk:
4217 put_disk(disk);
Christoph Hellwig7ad18af2015-01-13 17:20:04 +01004218 return err;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004219}
4220
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004221/*
4222 sysfs
4223*/
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004224
Alex Elder593a9e72012-02-07 12:03:37 -06004225static struct rbd_device *dev_to_rbd_dev(struct device *dev)
4226{
4227 return container_of(dev, struct rbd_device, dev);
4228}
4229
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004230static ssize_t rbd_size_show(struct device *dev,
4231 struct device_attribute *attr, char *buf)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004232{
Alex Elder593a9e72012-02-07 12:03:37 -06004233 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004234
Alex Elderfc71d832013-04-26 15:44:36 -05004235 return sprintf(buf, "%llu\n",
4236 (unsigned long long)rbd_dev->mapping.size);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004237}
4238
Alex Elder34b13182012-07-13 20:35:12 -05004239/*
4240 * Note this shows the features for whatever's mapped, which is not
4241 * necessarily the base image.
4242 */
4243static ssize_t rbd_features_show(struct device *dev,
4244 struct device_attribute *attr, char *buf)
4245{
4246 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
4247
4248 return sprintf(buf, "0x%016llx\n",
Alex Elderfc71d832013-04-26 15:44:36 -05004249 (unsigned long long)rbd_dev->mapping.features);
Alex Elder34b13182012-07-13 20:35:12 -05004250}
4251
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004252static ssize_t rbd_major_show(struct device *dev,
4253 struct device_attribute *attr, char *buf)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004254{
Alex Elder593a9e72012-02-07 12:03:37 -06004255 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004256
Alex Elderfc71d832013-04-26 15:44:36 -05004257 if (rbd_dev->major)
4258 return sprintf(buf, "%d\n", rbd_dev->major);
4259
4260 return sprintf(buf, "(none)\n");
Ilya Dryomovdd82fff2013-12-13 15:28:57 +02004261}
Alex Elderfc71d832013-04-26 15:44:36 -05004262
Ilya Dryomovdd82fff2013-12-13 15:28:57 +02004263static ssize_t rbd_minor_show(struct device *dev,
4264 struct device_attribute *attr, char *buf)
4265{
4266 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
4267
4268 return sprintf(buf, "%d\n", rbd_dev->minor);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004269}
4270
Ilya Dryomov005a07bf2016-08-18 18:38:43 +02004271static ssize_t rbd_client_addr_show(struct device *dev,
4272 struct device_attribute *attr, char *buf)
4273{
4274 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
4275 struct ceph_entity_addr *client_addr =
4276 ceph_client_addr(rbd_dev->rbd_client->client);
4277
4278 return sprintf(buf, "%pISpc/%u\n", &client_addr->in_addr,
4279 le32_to_cpu(client_addr->nonce));
4280}
4281
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004282static ssize_t rbd_client_id_show(struct device *dev,
4283 struct device_attribute *attr, char *buf)
4284{
Alex Elder593a9e72012-02-07 12:03:37 -06004285 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004286
Alex Elder1dbb4392012-01-24 10:08:37 -06004287 return sprintf(buf, "client%lld\n",
Ilya Dryomov033268a2016-08-12 14:59:58 +02004288 ceph_client_gid(rbd_dev->rbd_client->client));
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004289}
4290
Mike Christie267fb902016-08-18 18:38:43 +02004291static ssize_t rbd_cluster_fsid_show(struct device *dev,
4292 struct device_attribute *attr, char *buf)
4293{
4294 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
4295
4296 return sprintf(buf, "%pU\n", &rbd_dev->rbd_client->client->fsid);
4297}
4298
Mike Christie0d6d1e9c2016-08-18 18:38:45 +02004299static ssize_t rbd_config_info_show(struct device *dev,
4300 struct device_attribute *attr, char *buf)
4301{
4302 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
4303
4304 return sprintf(buf, "%s\n", rbd_dev->config_info);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004305}
4306
4307static ssize_t rbd_pool_show(struct device *dev,
4308 struct device_attribute *attr, char *buf)
4309{
Alex Elder593a9e72012-02-07 12:03:37 -06004310 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004311
Alex Elder0d7dbfc2012-10-25 23:34:41 -05004312 return sprintf(buf, "%s\n", rbd_dev->spec->pool_name);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004313}
4314
Alex Elder9bb2f332012-07-12 10:46:35 -05004315static ssize_t rbd_pool_id_show(struct device *dev,
4316 struct device_attribute *attr, char *buf)
4317{
4318 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
4319
Alex Elder0d7dbfc2012-10-25 23:34:41 -05004320 return sprintf(buf, "%llu\n",
Alex Elderfc71d832013-04-26 15:44:36 -05004321 (unsigned long long) rbd_dev->spec->pool_id);
Alex Elder9bb2f332012-07-12 10:46:35 -05004322}
4323
Ilya Dryomovb26c0472018-07-03 15:28:43 +02004324static ssize_t rbd_pool_ns_show(struct device *dev,
4325 struct device_attribute *attr, char *buf)
4326{
4327 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
4328
4329 return sprintf(buf, "%s\n", rbd_dev->spec->pool_ns ?: "");
4330}
4331
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004332static ssize_t rbd_name_show(struct device *dev,
4333 struct device_attribute *attr, char *buf)
4334{
Alex Elder593a9e72012-02-07 12:03:37 -06004335 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004336
Alex Eldera92ffdf2012-10-30 19:40:33 -05004337 if (rbd_dev->spec->image_name)
4338 return sprintf(buf, "%s\n", rbd_dev->spec->image_name);
4339
4340 return sprintf(buf, "(unknown)\n");
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004341}
4342
Alex Elder589d30e2012-07-10 20:30:11 -05004343static ssize_t rbd_image_id_show(struct device *dev,
4344 struct device_attribute *attr, char *buf)
4345{
4346 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
4347
Alex Elder0d7dbfc2012-10-25 23:34:41 -05004348 return sprintf(buf, "%s\n", rbd_dev->spec->image_id);
Alex Elder589d30e2012-07-10 20:30:11 -05004349}
4350
Alex Elder34b13182012-07-13 20:35:12 -05004351/*
4352 * Shows the name of the currently-mapped snapshot (or
4353 * RBD_SNAP_HEAD_NAME for the base image).
4354 */
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004355static ssize_t rbd_snap_show(struct device *dev,
4356 struct device_attribute *attr,
4357 char *buf)
4358{
Alex Elder593a9e72012-02-07 12:03:37 -06004359 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004360
Alex Elder0d7dbfc2012-10-25 23:34:41 -05004361 return sprintf(buf, "%s\n", rbd_dev->spec->snap_name);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004362}
4363
Mike Christie92a58672016-08-18 18:38:44 +02004364static ssize_t rbd_snap_id_show(struct device *dev,
4365 struct device_attribute *attr, char *buf)
4366{
4367 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
4368
4369 return sprintf(buf, "%llu\n", rbd_dev->spec->snap_id);
4370}
4371
Alex Elder86b00e02012-10-25 23:34:42 -05004372/*
Ilya Dryomovff961282014-07-22 21:53:07 +04004373 * For a v2 image, shows the chain of parent images, separated by empty
4374 * lines. For v1 images or if there is no parent, shows "(no parent
4375 * image)".
Alex Elder86b00e02012-10-25 23:34:42 -05004376 */
4377static ssize_t rbd_parent_show(struct device *dev,
Ilya Dryomovff961282014-07-22 21:53:07 +04004378 struct device_attribute *attr,
4379 char *buf)
Alex Elder86b00e02012-10-25 23:34:42 -05004380{
4381 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Ilya Dryomovff961282014-07-22 21:53:07 +04004382 ssize_t count = 0;
Alex Elder86b00e02012-10-25 23:34:42 -05004383
Ilya Dryomovff961282014-07-22 21:53:07 +04004384 if (!rbd_dev->parent)
Alex Elder86b00e02012-10-25 23:34:42 -05004385 return sprintf(buf, "(no parent image)\n");
4386
Ilya Dryomovff961282014-07-22 21:53:07 +04004387 for ( ; rbd_dev->parent; rbd_dev = rbd_dev->parent) {
4388 struct rbd_spec *spec = rbd_dev->parent_spec;
Alex Elder86b00e02012-10-25 23:34:42 -05004389
Ilya Dryomovff961282014-07-22 21:53:07 +04004390 count += sprintf(&buf[count], "%s"
4391 "pool_id %llu\npool_name %s\n"
Ilya Dryomove92c0ea2018-08-22 17:26:10 +02004392 "pool_ns %s\n"
Ilya Dryomovff961282014-07-22 21:53:07 +04004393 "image_id %s\nimage_name %s\n"
4394 "snap_id %llu\nsnap_name %s\n"
4395 "overlap %llu\n",
4396 !count ? "" : "\n", /* first? */
4397 spec->pool_id, spec->pool_name,
Ilya Dryomove92c0ea2018-08-22 17:26:10 +02004398 spec->pool_ns ?: "",
Ilya Dryomovff961282014-07-22 21:53:07 +04004399 spec->image_id, spec->image_name ?: "(unknown)",
4400 spec->snap_id, spec->snap_name,
4401 rbd_dev->parent_overlap);
4402 }
Alex Elder86b00e02012-10-25 23:34:42 -05004403
Ilya Dryomovff961282014-07-22 21:53:07 +04004404 return count;
Alex Elder86b00e02012-10-25 23:34:42 -05004405}
4406
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004407static ssize_t rbd_image_refresh(struct device *dev,
4408 struct device_attribute *attr,
4409 const char *buf,
4410 size_t size)
4411{
Alex Elder593a9e72012-02-07 12:03:37 -06004412 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Alex Elderb8136232012-07-25 09:32:41 -05004413 int ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004414
Alex Eldercc4a38bd2013-04-30 00:44:33 -05004415 ret = rbd_dev_refresh(rbd_dev);
Alex Eldere627db02013-05-06 07:40:30 -05004416 if (ret)
Ilya Dryomov52bb1f92014-07-23 17:11:20 +04004417 return ret;
Alex Elderb8136232012-07-25 09:32:41 -05004418
Ilya Dryomov52bb1f92014-07-23 17:11:20 +04004419 return size;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004420}
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004421
Joe Perches5657a812018-05-24 13:38:59 -06004422static DEVICE_ATTR(size, 0444, rbd_size_show, NULL);
4423static DEVICE_ATTR(features, 0444, rbd_features_show, NULL);
4424static DEVICE_ATTR(major, 0444, rbd_major_show, NULL);
4425static DEVICE_ATTR(minor, 0444, rbd_minor_show, NULL);
4426static DEVICE_ATTR(client_addr, 0444, rbd_client_addr_show, NULL);
4427static DEVICE_ATTR(client_id, 0444, rbd_client_id_show, NULL);
4428static DEVICE_ATTR(cluster_fsid, 0444, rbd_cluster_fsid_show, NULL);
4429static DEVICE_ATTR(config_info, 0400, rbd_config_info_show, NULL);
4430static DEVICE_ATTR(pool, 0444, rbd_pool_show, NULL);
4431static DEVICE_ATTR(pool_id, 0444, rbd_pool_id_show, NULL);
Ilya Dryomovb26c0472018-07-03 15:28:43 +02004432static DEVICE_ATTR(pool_ns, 0444, rbd_pool_ns_show, NULL);
Joe Perches5657a812018-05-24 13:38:59 -06004433static DEVICE_ATTR(name, 0444, rbd_name_show, NULL);
4434static DEVICE_ATTR(image_id, 0444, rbd_image_id_show, NULL);
4435static DEVICE_ATTR(refresh, 0200, NULL, rbd_image_refresh);
4436static DEVICE_ATTR(current_snap, 0444, rbd_snap_show, NULL);
4437static DEVICE_ATTR(snap_id, 0444, rbd_snap_id_show, NULL);
4438static DEVICE_ATTR(parent, 0444, rbd_parent_show, NULL);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004439
4440static struct attribute *rbd_attrs[] = {
4441 &dev_attr_size.attr,
Alex Elder34b13182012-07-13 20:35:12 -05004442 &dev_attr_features.attr,
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004443 &dev_attr_major.attr,
Ilya Dryomovdd82fff2013-12-13 15:28:57 +02004444 &dev_attr_minor.attr,
Ilya Dryomov005a07bf2016-08-18 18:38:43 +02004445 &dev_attr_client_addr.attr,
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004446 &dev_attr_client_id.attr,
Mike Christie267fb902016-08-18 18:38:43 +02004447 &dev_attr_cluster_fsid.attr,
Mike Christie0d6d1e9c2016-08-18 18:38:45 +02004448 &dev_attr_config_info.attr,
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004449 &dev_attr_pool.attr,
Alex Elder9bb2f332012-07-12 10:46:35 -05004450 &dev_attr_pool_id.attr,
Ilya Dryomovb26c0472018-07-03 15:28:43 +02004451 &dev_attr_pool_ns.attr,
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004452 &dev_attr_name.attr,
Alex Elder589d30e2012-07-10 20:30:11 -05004453 &dev_attr_image_id.attr,
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004454 &dev_attr_current_snap.attr,
Mike Christie92a58672016-08-18 18:38:44 +02004455 &dev_attr_snap_id.attr,
Alex Elder86b00e02012-10-25 23:34:42 -05004456 &dev_attr_parent.attr,
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004457 &dev_attr_refresh.attr,
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004458 NULL
4459};
4460
4461static struct attribute_group rbd_attr_group = {
4462 .attrs = rbd_attrs,
4463};
4464
4465static const struct attribute_group *rbd_attr_groups[] = {
4466 &rbd_attr_group,
4467 NULL
4468};
4469
Ilya Dryomov6cac4692015-10-16 20:11:25 +02004470static void rbd_dev_release(struct device *dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004471
Bhumika Goyalb9942bc2017-02-11 12:14:38 +05304472static const struct device_type rbd_device_type = {
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004473 .name = "rbd",
4474 .groups = rbd_attr_groups,
Ilya Dryomov6cac4692015-10-16 20:11:25 +02004475 .release = rbd_dev_release,
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004476};
4477
Alex Elder8b8fb992012-10-26 17:25:24 -05004478static struct rbd_spec *rbd_spec_get(struct rbd_spec *spec)
4479{
4480 kref_get(&spec->kref);
4481
4482 return spec;
4483}
4484
4485static void rbd_spec_free(struct kref *kref);
4486static void rbd_spec_put(struct rbd_spec *spec)
4487{
4488 if (spec)
4489 kref_put(&spec->kref, rbd_spec_free);
4490}
4491
4492static struct rbd_spec *rbd_spec_alloc(void)
4493{
4494 struct rbd_spec *spec;
4495
4496 spec = kzalloc(sizeof (*spec), GFP_KERNEL);
4497 if (!spec)
4498 return NULL;
Ilya Dryomov04077592014-07-23 17:11:20 +04004499
4500 spec->pool_id = CEPH_NOPOOL;
4501 spec->snap_id = CEPH_NOSNAP;
Alex Elder8b8fb992012-10-26 17:25:24 -05004502 kref_init(&spec->kref);
4503
Alex Elder8b8fb992012-10-26 17:25:24 -05004504 return spec;
4505}
4506
4507static void rbd_spec_free(struct kref *kref)
4508{
4509 struct rbd_spec *spec = container_of(kref, struct rbd_spec, kref);
4510
4511 kfree(spec->pool_name);
Ilya Dryomovb26c0472018-07-03 15:28:43 +02004512 kfree(spec->pool_ns);
Alex Elder8b8fb992012-10-26 17:25:24 -05004513 kfree(spec->image_id);
4514 kfree(spec->image_name);
4515 kfree(spec->snap_name);
4516 kfree(spec);
4517}
4518
Ilya Dryomov1643dfa2016-08-12 15:45:52 +02004519static void rbd_dev_free(struct rbd_device *rbd_dev)
Ilya Dryomovdd5ac322015-10-16 17:09:24 +02004520{
Ilya Dryomov99d16942016-08-12 16:11:41 +02004521 WARN_ON(rbd_dev->watch_state != RBD_WATCH_STATE_UNREGISTERED);
Ilya Dryomoved95b212016-08-12 16:40:02 +02004522 WARN_ON(rbd_dev->lock_state != RBD_LOCK_STATE_UNLOCKED);
Ilya Dryomovdd5ac322015-10-16 17:09:24 +02004523
Ilya Dryomovc41d13a2016-04-29 20:01:25 +02004524 ceph_oid_destroy(&rbd_dev->header_oid);
Ilya Dryomov6b6dddb2016-08-05 16:15:38 +02004525 ceph_oloc_destroy(&rbd_dev->header_oloc);
Mike Christie0d6d1e9c2016-08-18 18:38:45 +02004526 kfree(rbd_dev->config_info);
Ilya Dryomovc41d13a2016-04-29 20:01:25 +02004527
Ilya Dryomovdd5ac322015-10-16 17:09:24 +02004528 rbd_put_client(rbd_dev->rbd_client);
4529 rbd_spec_put(rbd_dev->spec);
4530 kfree(rbd_dev->opts);
4531 kfree(rbd_dev);
Ilya Dryomov1643dfa2016-08-12 15:45:52 +02004532}
4533
4534static void rbd_dev_release(struct device *dev)
4535{
4536 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
4537 bool need_put = !!rbd_dev->opts;
4538
4539 if (need_put) {
4540 destroy_workqueue(rbd_dev->task_wq);
4541 ida_simple_remove(&rbd_dev_id_ida, rbd_dev->dev_id);
4542 }
4543
4544 rbd_dev_free(rbd_dev);
Ilya Dryomovdd5ac322015-10-16 17:09:24 +02004545
4546 /*
4547 * This is racy, but way better than putting module outside of
4548 * the release callback. The race window is pretty small, so
4549 * doing something similar to dm (dm-builtin.c) is overkill.
4550 */
4551 if (need_put)
4552 module_put(THIS_MODULE);
4553}
4554
Ilya Dryomov1643dfa2016-08-12 15:45:52 +02004555static struct rbd_device *__rbd_dev_create(struct rbd_client *rbdc,
4556 struct rbd_spec *spec)
Alex Elderc53d5892012-10-25 23:34:42 -05004557{
4558 struct rbd_device *rbd_dev;
4559
Ilya Dryomov1643dfa2016-08-12 15:45:52 +02004560 rbd_dev = kzalloc(sizeof(*rbd_dev), GFP_KERNEL);
Alex Elderc53d5892012-10-25 23:34:42 -05004561 if (!rbd_dev)
4562 return NULL;
4563
4564 spin_lock_init(&rbd_dev->lock);
4565 INIT_LIST_HEAD(&rbd_dev->node);
Alex Elderc53d5892012-10-25 23:34:42 -05004566 init_rwsem(&rbd_dev->header_rwsem);
4567
Ilya Dryomov7e973322017-01-25 18:16:22 +01004568 rbd_dev->header.data_pool_id = CEPH_NOPOOL;
Ilya Dryomovc41d13a2016-04-29 20:01:25 +02004569 ceph_oid_init(&rbd_dev->header_oid);
Ilya Dryomov431a02c2017-01-25 18:16:21 +01004570 rbd_dev->header_oloc.pool = spec->pool_id;
Ilya Dryomovb26c0472018-07-03 15:28:43 +02004571 if (spec->pool_ns) {
4572 WARN_ON(!*spec->pool_ns);
4573 rbd_dev->header_oloc.pool_ns =
4574 ceph_find_or_create_string(spec->pool_ns,
4575 strlen(spec->pool_ns));
4576 }
Ilya Dryomovc41d13a2016-04-29 20:01:25 +02004577
Ilya Dryomov99d16942016-08-12 16:11:41 +02004578 mutex_init(&rbd_dev->watch_mutex);
4579 rbd_dev->watch_state = RBD_WATCH_STATE_UNREGISTERED;
4580 INIT_DELAYED_WORK(&rbd_dev->watch_dwork, rbd_reregister_watch);
4581
Ilya Dryomoved95b212016-08-12 16:40:02 +02004582 init_rwsem(&rbd_dev->lock_rwsem);
4583 rbd_dev->lock_state = RBD_LOCK_STATE_UNLOCKED;
4584 INIT_WORK(&rbd_dev->acquired_lock_work, rbd_notify_acquired_lock);
4585 INIT_WORK(&rbd_dev->released_lock_work, rbd_notify_released_lock);
4586 INIT_DELAYED_WORK(&rbd_dev->lock_dwork, rbd_acquire_lock);
4587 INIT_WORK(&rbd_dev->unlock_work, rbd_release_lock_work);
4588 init_waitqueue_head(&rbd_dev->lock_waitq);
4589
Ilya Dryomovdd5ac322015-10-16 17:09:24 +02004590 rbd_dev->dev.bus = &rbd_bus_type;
4591 rbd_dev->dev.type = &rbd_device_type;
4592 rbd_dev->dev.parent = &rbd_root_dev;
Ilya Dryomovdd5ac322015-10-16 17:09:24 +02004593 device_initialize(&rbd_dev->dev);
4594
Alex Elderc53d5892012-10-25 23:34:42 -05004595 rbd_dev->rbd_client = rbdc;
Ilya Dryomovd1475432015-06-22 13:24:48 +03004596 rbd_dev->spec = spec;
Alex Elder0903e872012-11-14 12:25:19 -06004597
Alex Elderc53d5892012-10-25 23:34:42 -05004598 return rbd_dev;
4599}
4600
Ilya Dryomov1643dfa2016-08-12 15:45:52 +02004601/*
4602 * Create a mapping rbd_dev.
4603 */
4604static struct rbd_device *rbd_dev_create(struct rbd_client *rbdc,
4605 struct rbd_spec *spec,
4606 struct rbd_options *opts)
4607{
4608 struct rbd_device *rbd_dev;
4609
4610 rbd_dev = __rbd_dev_create(rbdc, spec);
4611 if (!rbd_dev)
4612 return NULL;
4613
4614 rbd_dev->opts = opts;
4615
4616 /* get an id and fill in device name */
4617 rbd_dev->dev_id = ida_simple_get(&rbd_dev_id_ida, 0,
4618 minor_to_rbd_dev_id(1 << MINORBITS),
4619 GFP_KERNEL);
4620 if (rbd_dev->dev_id < 0)
4621 goto fail_rbd_dev;
4622
4623 sprintf(rbd_dev->name, RBD_DRV_NAME "%d", rbd_dev->dev_id);
4624 rbd_dev->task_wq = alloc_ordered_workqueue("%s-tasks", WQ_MEM_RECLAIM,
4625 rbd_dev->name);
4626 if (!rbd_dev->task_wq)
4627 goto fail_dev_id;
4628
4629 /* we have a ref from do_rbd_add() */
4630 __module_get(THIS_MODULE);
4631
4632 dout("%s rbd_dev %p dev_id %d\n", __func__, rbd_dev, rbd_dev->dev_id);
4633 return rbd_dev;
4634
4635fail_dev_id:
4636 ida_simple_remove(&rbd_dev_id_ida, rbd_dev->dev_id);
4637fail_rbd_dev:
4638 rbd_dev_free(rbd_dev);
4639 return NULL;
4640}
4641
Alex Elderc53d5892012-10-25 23:34:42 -05004642static void rbd_dev_destroy(struct rbd_device *rbd_dev)
4643{
Ilya Dryomovdd5ac322015-10-16 17:09:24 +02004644 if (rbd_dev)
4645 put_device(&rbd_dev->dev);
Alex Elderc53d5892012-10-25 23:34:42 -05004646}
4647
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004648/*
Alex Elder9d475de2012-07-03 16:01:19 -05004649 * Get the size and object order for an image snapshot, or if
4650 * snap_id is CEPH_NOSNAP, gets this information for the base
4651 * image.
4652 */
4653static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id,
4654 u8 *order, u64 *snap_size)
4655{
4656 __le64 snapid = cpu_to_le64(snap_id);
4657 int ret;
4658 struct {
4659 u8 order;
4660 __le64 size;
4661 } __attribute__ ((packed)) size_buf = { 0 };
4662
Ilya Dryomovecd4a682017-01-25 18:16:21 +01004663 ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
4664 &rbd_dev->header_oloc, "get_size",
4665 &snapid, sizeof(snapid),
4666 &size_buf, sizeof(size_buf));
Alex Elder36be9a72013-01-19 00:30:28 -06004667 dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
Alex Elder9d475de2012-07-03 16:01:19 -05004668 if (ret < 0)
4669 return ret;
Alex Elder57385b52013-04-21 12:14:45 -05004670 if (ret < sizeof (size_buf))
4671 return -ERANGE;
Alex Elder9d475de2012-07-03 16:01:19 -05004672
Josh Durginc3545572013-08-28 17:08:10 -07004673 if (order) {
Alex Elderc86f86e2013-04-25 15:09:41 -05004674 *order = size_buf.order;
Josh Durginc3545572013-08-28 17:08:10 -07004675 dout(" order %u", (unsigned int)*order);
4676 }
Alex Elder9d475de2012-07-03 16:01:19 -05004677 *snap_size = le64_to_cpu(size_buf.size);
4678
Josh Durginc3545572013-08-28 17:08:10 -07004679 dout(" snap_id 0x%016llx snap_size = %llu\n",
4680 (unsigned long long)snap_id,
Alex Elder57385b52013-04-21 12:14:45 -05004681 (unsigned long long)*snap_size);
Alex Elder9d475de2012-07-03 16:01:19 -05004682
4683 return 0;
4684}
4685
4686static int rbd_dev_v2_image_size(struct rbd_device *rbd_dev)
4687{
4688 return _rbd_dev_v2_snap_size(rbd_dev, CEPH_NOSNAP,
4689 &rbd_dev->header.obj_order,
4690 &rbd_dev->header.image_size);
4691}
4692
Alex Elder1e130192012-07-03 16:01:19 -05004693static int rbd_dev_v2_object_prefix(struct rbd_device *rbd_dev)
4694{
4695 void *reply_buf;
4696 int ret;
4697 void *p;
4698
4699 reply_buf = kzalloc(RBD_OBJ_PREFIX_LEN_MAX, GFP_KERNEL);
4700 if (!reply_buf)
4701 return -ENOMEM;
4702
Ilya Dryomovecd4a682017-01-25 18:16:21 +01004703 ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
4704 &rbd_dev->header_oloc, "get_object_prefix",
4705 NULL, 0, reply_buf, RBD_OBJ_PREFIX_LEN_MAX);
Alex Elder36be9a72013-01-19 00:30:28 -06004706 dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
Alex Elder1e130192012-07-03 16:01:19 -05004707 if (ret < 0)
4708 goto out;
4709
4710 p = reply_buf;
4711 rbd_dev->header.object_prefix = ceph_extract_encoded_string(&p,
Alex Elder57385b52013-04-21 12:14:45 -05004712 p + ret, NULL, GFP_NOIO);
4713 ret = 0;
Alex Elder1e130192012-07-03 16:01:19 -05004714
4715 if (IS_ERR(rbd_dev->header.object_prefix)) {
4716 ret = PTR_ERR(rbd_dev->header.object_prefix);
4717 rbd_dev->header.object_prefix = NULL;
4718 } else {
4719 dout(" object_prefix = %s\n", rbd_dev->header.object_prefix);
4720 }
Alex Elder1e130192012-07-03 16:01:19 -05004721out:
4722 kfree(reply_buf);
4723
4724 return ret;
4725}
4726
Alex Elderb1b54022012-07-03 16:01:19 -05004727static int _rbd_dev_v2_snap_features(struct rbd_device *rbd_dev, u64 snap_id,
4728 u64 *snap_features)
4729{
4730 __le64 snapid = cpu_to_le64(snap_id);
4731 struct {
4732 __le64 features;
4733 __le64 incompat;
Alex Elder41579762013-04-21 12:14:45 -05004734 } __attribute__ ((packed)) features_buf = { 0 };
Ilya Dryomovd3767f02016-04-13 14:15:50 +02004735 u64 unsup;
Alex Elderb1b54022012-07-03 16:01:19 -05004736 int ret;
4737
Ilya Dryomovecd4a682017-01-25 18:16:21 +01004738 ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
4739 &rbd_dev->header_oloc, "get_features",
4740 &snapid, sizeof(snapid),
4741 &features_buf, sizeof(features_buf));
Alex Elder36be9a72013-01-19 00:30:28 -06004742 dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
Alex Elderb1b54022012-07-03 16:01:19 -05004743 if (ret < 0)
4744 return ret;
Alex Elder57385b52013-04-21 12:14:45 -05004745 if (ret < sizeof (features_buf))
4746 return -ERANGE;
Alex Elderd8891402012-10-09 13:50:17 -07004747
Ilya Dryomovd3767f02016-04-13 14:15:50 +02004748 unsup = le64_to_cpu(features_buf.incompat) & ~RBD_FEATURES_SUPPORTED;
4749 if (unsup) {
4750 rbd_warn(rbd_dev, "image uses unsupported features: 0x%llx",
4751 unsup);
Alex Elderb8f5c6e2012-11-01 08:39:26 -05004752 return -ENXIO;
Ilya Dryomovd3767f02016-04-13 14:15:50 +02004753 }
Alex Elderd8891402012-10-09 13:50:17 -07004754
Alex Elderb1b54022012-07-03 16:01:19 -05004755 *snap_features = le64_to_cpu(features_buf.features);
4756
4757 dout(" snap_id 0x%016llx features = 0x%016llx incompat = 0x%016llx\n",
Alex Elder57385b52013-04-21 12:14:45 -05004758 (unsigned long long)snap_id,
4759 (unsigned long long)*snap_features,
4760 (unsigned long long)le64_to_cpu(features_buf.incompat));
Alex Elderb1b54022012-07-03 16:01:19 -05004761
4762 return 0;
4763}
4764
4765static int rbd_dev_v2_features(struct rbd_device *rbd_dev)
4766{
4767 return _rbd_dev_v2_snap_features(rbd_dev, CEPH_NOSNAP,
4768 &rbd_dev->header.features);
4769}
4770
Ilya Dryomoveb3b2d62018-08-22 17:11:27 +02004771struct parent_image_info {
4772 u64 pool_id;
Ilya Dryomove92c0ea2018-08-22 17:26:10 +02004773 const char *pool_ns;
Ilya Dryomoveb3b2d62018-08-22 17:11:27 +02004774 const char *image_id;
4775 u64 snap_id;
4776
Ilya Dryomove92c0ea2018-08-22 17:26:10 +02004777 bool has_overlap;
Ilya Dryomoveb3b2d62018-08-22 17:11:27 +02004778 u64 overlap;
4779};
4780
4781/*
4782 * The caller is responsible for @pii.
4783 */
Ilya Dryomove92c0ea2018-08-22 17:26:10 +02004784static int decode_parent_image_spec(void **p, void *end,
4785 struct parent_image_info *pii)
4786{
4787 u8 struct_v;
4788 u32 struct_len;
4789 int ret;
4790
4791 ret = ceph_start_decoding(p, end, 1, "ParentImageSpec",
4792 &struct_v, &struct_len);
4793 if (ret)
4794 return ret;
4795
4796 ceph_decode_64_safe(p, end, pii->pool_id, e_inval);
4797 pii->pool_ns = ceph_extract_encoded_string(p, end, NULL, GFP_KERNEL);
4798 if (IS_ERR(pii->pool_ns)) {
4799 ret = PTR_ERR(pii->pool_ns);
4800 pii->pool_ns = NULL;
4801 return ret;
4802 }
4803 pii->image_id = ceph_extract_encoded_string(p, end, NULL, GFP_KERNEL);
4804 if (IS_ERR(pii->image_id)) {
4805 ret = PTR_ERR(pii->image_id);
4806 pii->image_id = NULL;
4807 return ret;
4808 }
4809 ceph_decode_64_safe(p, end, pii->snap_id, e_inval);
4810 return 0;
4811
4812e_inval:
4813 return -EINVAL;
4814}
4815
4816static int __get_parent_info(struct rbd_device *rbd_dev,
4817 struct page *req_page,
4818 struct page *reply_page,
4819 struct parent_image_info *pii)
4820{
4821 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
4822 size_t reply_len = PAGE_SIZE;
4823 void *p, *end;
4824 int ret;
4825
4826 ret = ceph_osdc_call(osdc, &rbd_dev->header_oid, &rbd_dev->header_oloc,
4827 "rbd", "parent_get", CEPH_OSD_FLAG_READ,
4828 req_page, sizeof(u64), reply_page, &reply_len);
4829 if (ret)
4830 return ret == -EOPNOTSUPP ? 1 : ret;
4831
4832 p = page_address(reply_page);
4833 end = p + reply_len;
4834 ret = decode_parent_image_spec(&p, end, pii);
4835 if (ret)
4836 return ret;
4837
4838 ret = ceph_osdc_call(osdc, &rbd_dev->header_oid, &rbd_dev->header_oloc,
4839 "rbd", "parent_overlap_get", CEPH_OSD_FLAG_READ,
4840 req_page, sizeof(u64), reply_page, &reply_len);
4841 if (ret)
4842 return ret;
4843
4844 p = page_address(reply_page);
4845 end = p + reply_len;
4846 ceph_decode_8_safe(&p, end, pii->has_overlap, e_inval);
4847 if (pii->has_overlap)
4848 ceph_decode_64_safe(&p, end, pii->overlap, e_inval);
4849
4850 return 0;
4851
4852e_inval:
4853 return -EINVAL;
4854}
4855
4856/*
4857 * The caller is responsible for @pii.
4858 */
Ilya Dryomoveb3b2d62018-08-22 17:11:27 +02004859static int __get_parent_info_legacy(struct rbd_device *rbd_dev,
4860 struct page *req_page,
4861 struct page *reply_page,
4862 struct parent_image_info *pii)
4863{
4864 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
4865 size_t reply_len = PAGE_SIZE;
4866 void *p, *end;
4867 int ret;
4868
4869 ret = ceph_osdc_call(osdc, &rbd_dev->header_oid, &rbd_dev->header_oloc,
4870 "rbd", "get_parent", CEPH_OSD_FLAG_READ,
4871 req_page, sizeof(u64), reply_page, &reply_len);
4872 if (ret)
4873 return ret;
4874
4875 p = page_address(reply_page);
4876 end = p + reply_len;
4877 ceph_decode_64_safe(&p, end, pii->pool_id, e_inval);
4878 pii->image_id = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL);
4879 if (IS_ERR(pii->image_id)) {
4880 ret = PTR_ERR(pii->image_id);
4881 pii->image_id = NULL;
4882 return ret;
4883 }
4884 ceph_decode_64_safe(&p, end, pii->snap_id, e_inval);
Ilya Dryomove92c0ea2018-08-22 17:26:10 +02004885 pii->has_overlap = true;
Ilya Dryomoveb3b2d62018-08-22 17:11:27 +02004886 ceph_decode_64_safe(&p, end, pii->overlap, e_inval);
4887
4888 return 0;
4889
4890e_inval:
4891 return -EINVAL;
4892}
4893
4894static int get_parent_info(struct rbd_device *rbd_dev,
4895 struct parent_image_info *pii)
4896{
4897 struct page *req_page, *reply_page;
4898 void *p;
4899 int ret;
4900
4901 req_page = alloc_page(GFP_KERNEL);
4902 if (!req_page)
4903 return -ENOMEM;
4904
4905 reply_page = alloc_page(GFP_KERNEL);
4906 if (!reply_page) {
4907 __free_page(req_page);
4908 return -ENOMEM;
4909 }
4910
4911 p = page_address(req_page);
4912 ceph_encode_64(&p, rbd_dev->spec->snap_id);
Ilya Dryomove92c0ea2018-08-22 17:26:10 +02004913 ret = __get_parent_info(rbd_dev, req_page, reply_page, pii);
4914 if (ret > 0)
4915 ret = __get_parent_info_legacy(rbd_dev, req_page, reply_page,
4916 pii);
Ilya Dryomoveb3b2d62018-08-22 17:11:27 +02004917
4918 __free_page(req_page);
4919 __free_page(reply_page);
4920 return ret;
4921}
4922
Alex Elder86b00e02012-10-25 23:34:42 -05004923static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev)
4924{
4925 struct rbd_spec *parent_spec;
Ilya Dryomoveb3b2d62018-08-22 17:11:27 +02004926 struct parent_image_info pii = { 0 };
Alex Elder86b00e02012-10-25 23:34:42 -05004927 int ret;
4928
4929 parent_spec = rbd_spec_alloc();
4930 if (!parent_spec)
4931 return -ENOMEM;
4932
Ilya Dryomoveb3b2d62018-08-22 17:11:27 +02004933 ret = get_parent_info(rbd_dev, &pii);
4934 if (ret)
Alex Elder86b00e02012-10-25 23:34:42 -05004935 goto out_err;
4936
Ilya Dryomove92c0ea2018-08-22 17:26:10 +02004937 dout("%s pool_id %llu pool_ns %s image_id %s snap_id %llu has_overlap %d overlap %llu\n",
4938 __func__, pii.pool_id, pii.pool_ns, pii.image_id, pii.snap_id,
4939 pii.has_overlap, pii.overlap);
Ilya Dryomoveb3b2d62018-08-22 17:11:27 +02004940
Ilya Dryomove92c0ea2018-08-22 17:26:10 +02004941 if (pii.pool_id == CEPH_NOPOOL || !pii.has_overlap) {
Alex Elder392a9da2013-05-06 17:40:33 -05004942 /*
4943 * Either the parent never existed, or we have
4944 * record of it but the image got flattened so it no
4945 * longer has a parent. When the parent of a
4946 * layered image disappears we immediately set the
4947 * overlap to 0. The effect of this is that all new
4948 * requests will be treated as if the image had no
4949 * parent.
Ilya Dryomove92c0ea2018-08-22 17:26:10 +02004950 *
4951 * If !pii.has_overlap, the parent image spec is not
4952 * applicable. It's there to avoid duplication in each
4953 * snapshot record.
Alex Elder392a9da2013-05-06 17:40:33 -05004954 */
4955 if (rbd_dev->parent_overlap) {
4956 rbd_dev->parent_overlap = 0;
Alex Elder392a9da2013-05-06 17:40:33 -05004957 rbd_dev_parent_put(rbd_dev);
4958 pr_info("%s: clone image has been flattened\n",
4959 rbd_dev->disk->disk_name);
4960 }
4961
Alex Elder86b00e02012-10-25 23:34:42 -05004962 goto out; /* No parent? No problem. */
Alex Elder392a9da2013-05-06 17:40:33 -05004963 }
Alex Elder86b00e02012-10-25 23:34:42 -05004964
Alex Elder0903e872012-11-14 12:25:19 -06004965 /* The ceph file layout needs to fit pool id in 32 bits */
4966
4967 ret = -EIO;
Ilya Dryomoveb3b2d62018-08-22 17:11:27 +02004968 if (pii.pool_id > (u64)U32_MAX) {
Ilya Dryomov9584d502014-07-11 12:11:20 +04004969 rbd_warn(NULL, "parent pool id too large (%llu > %u)",
Ilya Dryomoveb3b2d62018-08-22 17:11:27 +02004970 (unsigned long long)pii.pool_id, U32_MAX);
Alex Elder57385b52013-04-21 12:14:45 -05004971 goto out_err;
Alex Elderc0cd10db2013-04-26 09:43:47 -05004972 }
Alex Elder0903e872012-11-14 12:25:19 -06004973
Alex Elder3b5cf2a2013-05-29 11:18:59 -05004974 /*
4975 * The parent won't change (except when the clone is
4976 * flattened, already handled that). So we only need to
4977 * record the parent spec we have not already done so.
4978 */
4979 if (!rbd_dev->parent_spec) {
Ilya Dryomoveb3b2d62018-08-22 17:11:27 +02004980 parent_spec->pool_id = pii.pool_id;
Ilya Dryomove92c0ea2018-08-22 17:26:10 +02004981 if (pii.pool_ns && *pii.pool_ns) {
4982 parent_spec->pool_ns = pii.pool_ns;
4983 pii.pool_ns = NULL;
4984 }
Ilya Dryomoveb3b2d62018-08-22 17:11:27 +02004985 parent_spec->image_id = pii.image_id;
4986 pii.image_id = NULL;
4987 parent_spec->snap_id = pii.snap_id;
Ilya Dryomovb26c0472018-07-03 15:28:43 +02004988
Alex Elder70cf49c2013-05-06 17:40:33 -05004989 rbd_dev->parent_spec = parent_spec;
4990 parent_spec = NULL; /* rbd_dev now owns this */
Alex Elder3b5cf2a2013-05-29 11:18:59 -05004991 }
4992
4993 /*
Ilya Dryomovcf32bd92015-01-19 22:57:39 +03004994 * We always update the parent overlap. If it's zero we issue
4995 * a warning, as we will proceed as if there was no parent.
Alex Elder3b5cf2a2013-05-29 11:18:59 -05004996 */
Ilya Dryomoveb3b2d62018-08-22 17:11:27 +02004997 if (!pii.overlap) {
Alex Elder3b5cf2a2013-05-29 11:18:59 -05004998 if (parent_spec) {
Ilya Dryomovcf32bd92015-01-19 22:57:39 +03004999 /* refresh, careful to warn just once */
5000 if (rbd_dev->parent_overlap)
5001 rbd_warn(rbd_dev,
5002 "clone now standalone (overlap became 0)");
Alex Elder3b5cf2a2013-05-29 11:18:59 -05005003 } else {
Ilya Dryomovcf32bd92015-01-19 22:57:39 +03005004 /* initial probe */
5005 rbd_warn(rbd_dev, "clone is standalone (overlap 0)");
Alex Elder3b5cf2a2013-05-29 11:18:59 -05005006 }
Alex Elder70cf49c2013-05-06 17:40:33 -05005007 }
Ilya Dryomoveb3b2d62018-08-22 17:11:27 +02005008 rbd_dev->parent_overlap = pii.overlap;
Ilya Dryomovcf32bd92015-01-19 22:57:39 +03005009
Alex Elder86b00e02012-10-25 23:34:42 -05005010out:
5011 ret = 0;
5012out_err:
Ilya Dryomove92c0ea2018-08-22 17:26:10 +02005013 kfree(pii.pool_ns);
Ilya Dryomoveb3b2d62018-08-22 17:11:27 +02005014 kfree(pii.image_id);
Alex Elder86b00e02012-10-25 23:34:42 -05005015 rbd_spec_put(parent_spec);
Alex Elder86b00e02012-10-25 23:34:42 -05005016 return ret;
5017}
5018
Alex Eldercc070d52013-04-21 12:14:45 -05005019static int rbd_dev_v2_striping_info(struct rbd_device *rbd_dev)
5020{
5021 struct {
5022 __le64 stripe_unit;
5023 __le64 stripe_count;
5024 } __attribute__ ((packed)) striping_info_buf = { 0 };
5025 size_t size = sizeof (striping_info_buf);
5026 void *p;
Alex Eldercc070d52013-04-21 12:14:45 -05005027 int ret;
5028
Ilya Dryomovecd4a682017-01-25 18:16:21 +01005029 ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
5030 &rbd_dev->header_oloc, "get_stripe_unit_count",
5031 NULL, 0, &striping_info_buf, size);
Alex Eldercc070d52013-04-21 12:14:45 -05005032 dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
5033 if (ret < 0)
5034 return ret;
5035 if (ret < size)
5036 return -ERANGE;
5037
Alex Eldercc070d52013-04-21 12:14:45 -05005038 p = &striping_info_buf;
Ilya Dryomovb1331852018-02-07 12:09:12 +01005039 rbd_dev->header.stripe_unit = ceph_decode_64(&p);
5040 rbd_dev->header.stripe_count = ceph_decode_64(&p);
Alex Eldercc070d52013-04-21 12:14:45 -05005041 return 0;
5042}
5043
Ilya Dryomov7e973322017-01-25 18:16:22 +01005044static int rbd_dev_v2_data_pool(struct rbd_device *rbd_dev)
5045{
5046 __le64 data_pool_id;
5047 int ret;
5048
5049 ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
5050 &rbd_dev->header_oloc, "get_data_pool",
5051 NULL, 0, &data_pool_id, sizeof(data_pool_id));
5052 if (ret < 0)
5053 return ret;
5054 if (ret < sizeof(data_pool_id))
5055 return -EBADMSG;
5056
5057 rbd_dev->header.data_pool_id = le64_to_cpu(data_pool_id);
5058 WARN_ON(rbd_dev->header.data_pool_id == CEPH_NOPOOL);
5059 return 0;
5060}
5061
Alex Elder9e15b772012-10-30 19:40:33 -05005062static char *rbd_dev_image_name(struct rbd_device *rbd_dev)
5063{
Ilya Dryomovecd4a682017-01-25 18:16:21 +01005064 CEPH_DEFINE_OID_ONSTACK(oid);
Alex Elder9e15b772012-10-30 19:40:33 -05005065 size_t image_id_size;
5066 char *image_id;
5067 void *p;
5068 void *end;
5069 size_t size;
5070 void *reply_buf = NULL;
5071 size_t len = 0;
5072 char *image_name = NULL;
5073 int ret;
5074
5075 rbd_assert(!rbd_dev->spec->image_name);
5076
Alex Elder69e7a022012-11-01 08:39:26 -05005077 len = strlen(rbd_dev->spec->image_id);
5078 image_id_size = sizeof (__le32) + len;
Alex Elder9e15b772012-10-30 19:40:33 -05005079 image_id = kmalloc(image_id_size, GFP_KERNEL);
5080 if (!image_id)
5081 return NULL;
5082
5083 p = image_id;
Alex Elder41579762013-04-21 12:14:45 -05005084 end = image_id + image_id_size;
Alex Elder57385b52013-04-21 12:14:45 -05005085 ceph_encode_string(&p, end, rbd_dev->spec->image_id, (u32)len);
Alex Elder9e15b772012-10-30 19:40:33 -05005086
5087 size = sizeof (__le32) + RBD_IMAGE_NAME_LEN_MAX;
5088 reply_buf = kmalloc(size, GFP_KERNEL);
5089 if (!reply_buf)
5090 goto out;
5091
Ilya Dryomovecd4a682017-01-25 18:16:21 +01005092 ceph_oid_printf(&oid, "%s", RBD_DIRECTORY);
5093 ret = rbd_obj_method_sync(rbd_dev, &oid, &rbd_dev->header_oloc,
5094 "dir_get_name", image_id, image_id_size,
5095 reply_buf, size);
Alex Elder9e15b772012-10-30 19:40:33 -05005096 if (ret < 0)
5097 goto out;
5098 p = reply_buf;
Alex Elderf40eb342013-04-25 15:09:42 -05005099 end = reply_buf + ret;
5100
Alex Elder9e15b772012-10-30 19:40:33 -05005101 image_name = ceph_extract_encoded_string(&p, end, &len, GFP_KERNEL);
5102 if (IS_ERR(image_name))
5103 image_name = NULL;
5104 else
5105 dout("%s: name is %s len is %zd\n", __func__, image_name, len);
5106out:
5107 kfree(reply_buf);
5108 kfree(image_id);
5109
5110 return image_name;
5111}
5112
Alex Elder2ad3d712013-04-30 00:44:33 -05005113static u64 rbd_v1_snap_id_by_name(struct rbd_device *rbd_dev, const char *name)
5114{
5115 struct ceph_snap_context *snapc = rbd_dev->header.snapc;
5116 const char *snap_name;
5117 u32 which = 0;
5118
5119 /* Skip over names until we find the one we are looking for */
5120
5121 snap_name = rbd_dev->header.snap_names;
5122 while (which < snapc->num_snaps) {
5123 if (!strcmp(name, snap_name))
5124 return snapc->snaps[which];
5125 snap_name += strlen(snap_name) + 1;
5126 which++;
5127 }
5128 return CEPH_NOSNAP;
5129}
5130
5131static u64 rbd_v2_snap_id_by_name(struct rbd_device *rbd_dev, const char *name)
5132{
5133 struct ceph_snap_context *snapc = rbd_dev->header.snapc;
5134 u32 which;
5135 bool found = false;
5136 u64 snap_id;
5137
5138 for (which = 0; !found && which < snapc->num_snaps; which++) {
5139 const char *snap_name;
5140
5141 snap_id = snapc->snaps[which];
5142 snap_name = rbd_dev_v2_snap_name(rbd_dev, snap_id);
Josh Durginefadc982013-08-29 19:16:42 -07005143 if (IS_ERR(snap_name)) {
5144 /* ignore no-longer existing snapshots */
5145 if (PTR_ERR(snap_name) == -ENOENT)
5146 continue;
5147 else
5148 break;
5149 }
Alex Elder2ad3d712013-04-30 00:44:33 -05005150 found = !strcmp(name, snap_name);
5151 kfree(snap_name);
5152 }
5153 return found ? snap_id : CEPH_NOSNAP;
5154}
5155
5156/*
5157 * Assumes name is never RBD_SNAP_HEAD_NAME; returns CEPH_NOSNAP if
5158 * no snapshot by that name is found, or if an error occurs.
5159 */
5160static u64 rbd_snap_id_by_name(struct rbd_device *rbd_dev, const char *name)
5161{
5162 if (rbd_dev->image_format == 1)
5163 return rbd_v1_snap_id_by_name(rbd_dev, name);
5164
5165 return rbd_v2_snap_id_by_name(rbd_dev, name);
5166}
5167
Alex Elder9e15b772012-10-30 19:40:33 -05005168/*
Ilya Dryomov04077592014-07-23 17:11:20 +04005169 * An image being mapped will have everything but the snap id.
Alex Elder9e15b772012-10-30 19:40:33 -05005170 */
Ilya Dryomov04077592014-07-23 17:11:20 +04005171static int rbd_spec_fill_snap_id(struct rbd_device *rbd_dev)
5172{
5173 struct rbd_spec *spec = rbd_dev->spec;
5174
5175 rbd_assert(spec->pool_id != CEPH_NOPOOL && spec->pool_name);
5176 rbd_assert(spec->image_id && spec->image_name);
5177 rbd_assert(spec->snap_name);
5178
5179 if (strcmp(spec->snap_name, RBD_SNAP_HEAD_NAME)) {
5180 u64 snap_id;
5181
5182 snap_id = rbd_snap_id_by_name(rbd_dev, spec->snap_name);
5183 if (snap_id == CEPH_NOSNAP)
5184 return -ENOENT;
5185
5186 spec->snap_id = snap_id;
5187 } else {
5188 spec->snap_id = CEPH_NOSNAP;
5189 }
5190
5191 return 0;
5192}
5193
5194/*
5195 * A parent image will have all ids but none of the names.
5196 *
5197 * All names in an rbd spec are dynamically allocated. It's OK if we
5198 * can't figure out the name for an image id.
5199 */
5200static int rbd_spec_fill_names(struct rbd_device *rbd_dev)
Alex Elder9e15b772012-10-30 19:40:33 -05005201{
Alex Elder2e9f7f12013-04-26 09:43:48 -05005202 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
5203 struct rbd_spec *spec = rbd_dev->spec;
5204 const char *pool_name;
5205 const char *image_name;
5206 const char *snap_name;
Alex Elder9e15b772012-10-30 19:40:33 -05005207 int ret;
5208
Ilya Dryomov04077592014-07-23 17:11:20 +04005209 rbd_assert(spec->pool_id != CEPH_NOPOOL);
5210 rbd_assert(spec->image_id);
5211 rbd_assert(spec->snap_id != CEPH_NOSNAP);
Alex Elder9e15b772012-10-30 19:40:33 -05005212
Alex Elder2e9f7f12013-04-26 09:43:48 -05005213 /* Get the pool name; we have to make our own copy of this */
Alex Elder9e15b772012-10-30 19:40:33 -05005214
Alex Elder2e9f7f12013-04-26 09:43:48 -05005215 pool_name = ceph_pg_pool_name_by_id(osdc->osdmap, spec->pool_id);
5216 if (!pool_name) {
5217 rbd_warn(rbd_dev, "no pool with id %llu", spec->pool_id);
Alex Elder935dc892012-11-01 10:17:15 -05005218 return -EIO;
5219 }
Alex Elder2e9f7f12013-04-26 09:43:48 -05005220 pool_name = kstrdup(pool_name, GFP_KERNEL);
5221 if (!pool_name)
Alex Elder9e15b772012-10-30 19:40:33 -05005222 return -ENOMEM;
5223
5224 /* Fetch the image name; tolerate failure here */
5225
Alex Elder2e9f7f12013-04-26 09:43:48 -05005226 image_name = rbd_dev_image_name(rbd_dev);
5227 if (!image_name)
Alex Elder06ecc6c2012-11-01 10:17:15 -05005228 rbd_warn(rbd_dev, "unable to get image name");
Alex Elder9e15b772012-10-30 19:40:33 -05005229
Ilya Dryomov04077592014-07-23 17:11:20 +04005230 /* Fetch the snapshot name */
Alex Elder9e15b772012-10-30 19:40:33 -05005231
Alex Elder2e9f7f12013-04-26 09:43:48 -05005232 snap_name = rbd_snap_name(rbd_dev, spec->snap_id);
Josh Durginda6a6b62013-09-04 17:57:31 -07005233 if (IS_ERR(snap_name)) {
5234 ret = PTR_ERR(snap_name);
Alex Elder9e15b772012-10-30 19:40:33 -05005235 goto out_err;
Alex Elder2e9f7f12013-04-26 09:43:48 -05005236 }
5237
5238 spec->pool_name = pool_name;
5239 spec->image_name = image_name;
5240 spec->snap_name = snap_name;
Alex Elder9e15b772012-10-30 19:40:33 -05005241
5242 return 0;
Ilya Dryomov04077592014-07-23 17:11:20 +04005243
Alex Elder9e15b772012-10-30 19:40:33 -05005244out_err:
Alex Elder2e9f7f12013-04-26 09:43:48 -05005245 kfree(image_name);
5246 kfree(pool_name);
Alex Elder9e15b772012-10-30 19:40:33 -05005247 return ret;
5248}
5249
Alex Eldercc4a38bd2013-04-30 00:44:33 -05005250static int rbd_dev_v2_snap_context(struct rbd_device *rbd_dev)
Alex Elder35d489f2012-07-03 16:01:19 -05005251{
5252 size_t size;
5253 int ret;
5254 void *reply_buf;
5255 void *p;
5256 void *end;
5257 u64 seq;
5258 u32 snap_count;
5259 struct ceph_snap_context *snapc;
5260 u32 i;
5261
5262 /*
5263 * We'll need room for the seq value (maximum snapshot id),
5264 * snapshot count, and array of that many snapshot ids.
5265 * For now we have a fixed upper limit on the number we're
5266 * prepared to receive.
5267 */
5268 size = sizeof (__le64) + sizeof (__le32) +
5269 RBD_MAX_SNAP_COUNT * sizeof (__le64);
5270 reply_buf = kzalloc(size, GFP_KERNEL);
5271 if (!reply_buf)
5272 return -ENOMEM;
5273
Ilya Dryomovecd4a682017-01-25 18:16:21 +01005274 ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
5275 &rbd_dev->header_oloc, "get_snapcontext",
5276 NULL, 0, reply_buf, size);
Alex Elder36be9a72013-01-19 00:30:28 -06005277 dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
Alex Elder35d489f2012-07-03 16:01:19 -05005278 if (ret < 0)
5279 goto out;
5280
Alex Elder35d489f2012-07-03 16:01:19 -05005281 p = reply_buf;
Alex Elder57385b52013-04-21 12:14:45 -05005282 end = reply_buf + ret;
5283 ret = -ERANGE;
Alex Elder35d489f2012-07-03 16:01:19 -05005284 ceph_decode_64_safe(&p, end, seq, out);
5285 ceph_decode_32_safe(&p, end, snap_count, out);
5286
5287 /*
5288 * Make sure the reported number of snapshot ids wouldn't go
5289 * beyond the end of our buffer. But before checking that,
5290 * make sure the computed size of the snapshot context we
5291 * allocate is representable in a size_t.
5292 */
5293 if (snap_count > (SIZE_MAX - sizeof (struct ceph_snap_context))
5294 / sizeof (u64)) {
5295 ret = -EINVAL;
5296 goto out;
5297 }
5298 if (!ceph_has_room(&p, end, snap_count * sizeof (__le64)))
5299 goto out;
Alex Elder468521c2013-04-26 09:43:47 -05005300 ret = 0;
Alex Elder35d489f2012-07-03 16:01:19 -05005301
Alex Elder812164f82013-04-30 00:44:32 -05005302 snapc = ceph_create_snap_context(snap_count, GFP_KERNEL);
Alex Elder35d489f2012-07-03 16:01:19 -05005303 if (!snapc) {
5304 ret = -ENOMEM;
5305 goto out;
5306 }
Alex Elder35d489f2012-07-03 16:01:19 -05005307 snapc->seq = seq;
Alex Elder35d489f2012-07-03 16:01:19 -05005308 for (i = 0; i < snap_count; i++)
5309 snapc->snaps[i] = ceph_decode_64(&p);
5310
Alex Elder49ece552013-05-06 08:37:00 -05005311 ceph_put_snap_context(rbd_dev->header.snapc);
Alex Elder35d489f2012-07-03 16:01:19 -05005312 rbd_dev->header.snapc = snapc;
5313
5314 dout(" snap context seq = %llu, snap_count = %u\n",
Alex Elder57385b52013-04-21 12:14:45 -05005315 (unsigned long long)seq, (unsigned int)snap_count);
Alex Elder35d489f2012-07-03 16:01:19 -05005316out:
5317 kfree(reply_buf);
5318
Alex Elder57385b52013-04-21 12:14:45 -05005319 return ret;
Alex Elder35d489f2012-07-03 16:01:19 -05005320}
5321
Alex Elder54cac612013-04-30 00:44:33 -05005322static const char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev,
5323 u64 snap_id)
Alex Elderb8b1e2d2012-07-03 16:01:19 -05005324{
5325 size_t size;
5326 void *reply_buf;
Alex Elder54cac612013-04-30 00:44:33 -05005327 __le64 snapid;
Alex Elderb8b1e2d2012-07-03 16:01:19 -05005328 int ret;
5329 void *p;
5330 void *end;
Alex Elderb8b1e2d2012-07-03 16:01:19 -05005331 char *snap_name;
5332
5333 size = sizeof (__le32) + RBD_MAX_SNAP_NAME_LEN;
5334 reply_buf = kmalloc(size, GFP_KERNEL);
5335 if (!reply_buf)
5336 return ERR_PTR(-ENOMEM);
5337
Alex Elder54cac612013-04-30 00:44:33 -05005338 snapid = cpu_to_le64(snap_id);
Ilya Dryomovecd4a682017-01-25 18:16:21 +01005339 ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
5340 &rbd_dev->header_oloc, "get_snapshot_name",
5341 &snapid, sizeof(snapid), reply_buf, size);
Alex Elder36be9a72013-01-19 00:30:28 -06005342 dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
Alex Elderf40eb342013-04-25 15:09:42 -05005343 if (ret < 0) {
5344 snap_name = ERR_PTR(ret);
Alex Elderb8b1e2d2012-07-03 16:01:19 -05005345 goto out;
Alex Elderf40eb342013-04-25 15:09:42 -05005346 }
Alex Elderb8b1e2d2012-07-03 16:01:19 -05005347
5348 p = reply_buf;
Alex Elderf40eb342013-04-25 15:09:42 -05005349 end = reply_buf + ret;
Alex Eldere5c35532012-10-25 23:34:41 -05005350 snap_name = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL);
Alex Elderf40eb342013-04-25 15:09:42 -05005351 if (IS_ERR(snap_name))
Alex Elderb8b1e2d2012-07-03 16:01:19 -05005352 goto out;
Alex Elderb8b1e2d2012-07-03 16:01:19 -05005353
Alex Elderf40eb342013-04-25 15:09:42 -05005354 dout(" snap_id 0x%016llx snap_name = %s\n",
Alex Elder54cac612013-04-30 00:44:33 -05005355 (unsigned long long)snap_id, snap_name);
Alex Elderb8b1e2d2012-07-03 16:01:19 -05005356out:
5357 kfree(reply_buf);
5358
Alex Elderf40eb342013-04-25 15:09:42 -05005359 return snap_name;
Alex Elderb8b1e2d2012-07-03 16:01:19 -05005360}
5361
Alex Elder2df3fac2013-05-06 09:51:30 -05005362static int rbd_dev_v2_header_info(struct rbd_device *rbd_dev)
Alex Elder117973f2012-08-31 17:29:55 -05005363{
Alex Elder2df3fac2013-05-06 09:51:30 -05005364 bool first_time = rbd_dev->header.object_prefix == NULL;
Alex Elder117973f2012-08-31 17:29:55 -05005365 int ret;
Alex Elder117973f2012-08-31 17:29:55 -05005366
Josh Durgin1617e402013-06-12 14:43:10 -07005367 ret = rbd_dev_v2_image_size(rbd_dev);
5368 if (ret)
Alex Eldercfbf6372013-05-31 17:40:45 -05005369 return ret;
Josh Durgin1617e402013-06-12 14:43:10 -07005370
Alex Elder2df3fac2013-05-06 09:51:30 -05005371 if (first_time) {
5372 ret = rbd_dev_v2_header_onetime(rbd_dev);
5373 if (ret)
Alex Eldercfbf6372013-05-31 17:40:45 -05005374 return ret;
Alex Elder2df3fac2013-05-06 09:51:30 -05005375 }
5376
Alex Eldercc4a38bd2013-04-30 00:44:33 -05005377 ret = rbd_dev_v2_snap_context(rbd_dev);
Ilya Dryomovd194cd12015-08-31 18:22:10 +03005378 if (ret && first_time) {
5379 kfree(rbd_dev->header.object_prefix);
5380 rbd_dev->header.object_prefix = NULL;
5381 }
Alex Elder117973f2012-08-31 17:29:55 -05005382
5383 return ret;
5384}
5385
Ilya Dryomova720ae02014-07-23 17:11:19 +04005386static int rbd_dev_header_info(struct rbd_device *rbd_dev)
5387{
5388 rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
5389
5390 if (rbd_dev->image_format == 1)
5391 return rbd_dev_v1_header_info(rbd_dev);
5392
5393 return rbd_dev_v2_header_info(rbd_dev);
5394}
5395
/*
 * Advances *buf past any leading white space so that it points at
 * the first non-space character (or at the terminating '\0' if there
 * is none), and returns the length of the token -- the run of
 * non-white space characters -- that starts there.  *buf must be
 * terminated with '\0'.
 */
static inline size_t next_token(const char **buf)
{
	/*
	 * The characters for which isspace() is nonzero in the "C"
	 * and "POSIX" locales.
	 */
	static const char whitespace[] = " \f\n\r\t\v";
	const char *start = *buf + strspn(*buf, whitespace);

	*buf = start;			/* Start of the token */

	return strcspn(start, whitespace);	/* Token length */
}
5414
5415/*
Alex Elderea3352f2012-07-09 21:04:23 -05005416 * Finds the next token in *buf, dynamically allocates a buffer big
5417 * enough to hold a copy of it, and copies the token into the new
5418 * buffer. The copy is guaranteed to be terminated with '\0'. Note
5419 * that a duplicate buffer is created even for a zero-length token.
5420 *
5421 * Returns a pointer to the newly-allocated duplicate, or a null
5422 * pointer if memory for the duplicate was not available. If
5423 * the lenp argument is a non-null pointer, the length of the token
5424 * (not including the '\0') is returned in *lenp.
5425 *
5426 * If successful, the *buf pointer will be updated to point beyond
5427 * the end of the found token.
5428 *
5429 * Note: uses GFP_KERNEL for allocation.
5430 */
5431static inline char *dup_token(const char **buf, size_t *lenp)
5432{
5433 char *dup;
5434 size_t len;
5435
5436 len = next_token(buf);
Alex Elder4caf35f2012-11-01 08:39:27 -05005437 dup = kmemdup(*buf, len + 1, GFP_KERNEL);
Alex Elderea3352f2012-07-09 21:04:23 -05005438 if (!dup)
5439 return NULL;
Alex Elderea3352f2012-07-09 21:04:23 -05005440 *(dup + len) = '\0';
5441 *buf += len;
5442
5443 if (lenp)
5444 *lenp = len;
5445
5446 return dup;
5447}
5448
5449/*
Alex Elder859c31d2012-10-25 23:34:42 -05005450 * Parse the options provided for an "rbd add" (i.e., rbd image
5451 * mapping) request. These arrive via a write to /sys/bus/rbd/add,
5452 * and the data written is passed here via a NUL-terminated buffer.
5453 * Returns 0 if successful or an error code otherwise.
Alex Elderd22f76e2012-07-12 10:46:35 -05005454 *
Alex Elder859c31d2012-10-25 23:34:42 -05005455 * The information extracted from these options is recorded in
5456 * the other parameters which return dynamically-allocated
5457 * structures:
5458 * ceph_opts
5459 * The address of a pointer that will refer to a ceph options
5460 * structure. Caller must release the returned pointer using
5461 * ceph_destroy_options() when it is no longer needed.
5462 * rbd_opts
5463 * Address of an rbd options pointer. Fully initialized by
5464 * this function; caller must release with kfree().
5465 * spec
5466 * Address of an rbd image specification pointer. Fully
5467 * initialized by this function based on parsed options.
5468 * Caller must release with rbd_spec_put().
5469 *
5470 * The options passed take this form:
5471 * <mon_addrs> <options> <pool_name> <image_name> [<snap_id>]
5472 * where:
5473 * <mon_addrs>
5474 * A comma-separated list of one or more monitor addresses.
5475 * A monitor address is an ip address, optionally followed
5476 * by a port number (separated by a colon).
5477 * I.e.: ip1[:port1][,ip2[:port2]...]
5478 * <options>
5479 * A comma-separated list of ceph and/or rbd options.
5480 * <pool_name>
5481 * The name of the rados pool containing the rbd image.
5482 * <image_name>
5483 * The name of the image in that pool to map.
5484 * <snap_id>
5485 * An optional snapshot id. If provided, the mapping will
5486 * present data from the image at the time that snapshot was
5487 * created. The image head is used if no snapshot id is
5488 * provided. Snapshot mappings are always read-only.
Alex Eldera725f65e2012-02-02 08:13:30 -06005489 */
Alex Elder859c31d2012-10-25 23:34:42 -05005490static int rbd_add_parse_args(const char *buf,
Alex Elderdc79b112012-10-25 23:34:41 -05005491 struct ceph_options **ceph_opts,
Alex Elder859c31d2012-10-25 23:34:42 -05005492 struct rbd_options **opts,
5493 struct rbd_spec **rbd_spec)
Alex Eldera725f65e2012-02-02 08:13:30 -06005494{
Alex Elderd22f76e2012-07-12 10:46:35 -05005495 size_t len;
Alex Elder859c31d2012-10-25 23:34:42 -05005496 char *options;
Alex Elder0ddebc02012-10-25 23:34:41 -05005497 const char *mon_addrs;
Alex Elderecb4dc222013-04-26 09:43:47 -05005498 char *snap_name;
Alex Elder0ddebc02012-10-25 23:34:41 -05005499 size_t mon_addrs_size;
Ilya Dryomovc3001562018-07-03 15:28:43 +02005500 struct parse_rbd_opts_ctx pctx = { 0 };
Alex Elder859c31d2012-10-25 23:34:42 -05005501 struct ceph_options *copts;
Alex Elderdc79b112012-10-25 23:34:41 -05005502 int ret;
Alex Eldere28fff262012-02-02 08:13:30 -06005503
5504 /* The first four tokens are required */
5505
Alex Elder7ef32142012-02-02 08:13:30 -06005506 len = next_token(&buf);
Alex Elder4fb5d6712012-11-01 10:17:15 -05005507 if (!len) {
5508 rbd_warn(NULL, "no monitor address(es) provided");
5509 return -EINVAL;
5510 }
Alex Elder0ddebc02012-10-25 23:34:41 -05005511 mon_addrs = buf;
Alex Elderf28e5652012-10-25 23:34:41 -05005512 mon_addrs_size = len + 1;
Alex Elder7ef32142012-02-02 08:13:30 -06005513 buf += len;
Alex Eldera725f65e2012-02-02 08:13:30 -06005514
Alex Elderdc79b112012-10-25 23:34:41 -05005515 ret = -EINVAL;
Alex Elderf28e5652012-10-25 23:34:41 -05005516 options = dup_token(&buf, NULL);
5517 if (!options)
Alex Elderdc79b112012-10-25 23:34:41 -05005518 return -ENOMEM;
Alex Elder4fb5d6712012-11-01 10:17:15 -05005519 if (!*options) {
5520 rbd_warn(NULL, "no options provided");
5521 goto out_err;
5522 }
Alex Eldera725f65e2012-02-02 08:13:30 -06005523
Ilya Dryomovc3001562018-07-03 15:28:43 +02005524 pctx.spec = rbd_spec_alloc();
5525 if (!pctx.spec)
Alex Elderf28e5652012-10-25 23:34:41 -05005526 goto out_mem;
Alex Elder859c31d2012-10-25 23:34:42 -05005527
Ilya Dryomovc3001562018-07-03 15:28:43 +02005528 pctx.spec->pool_name = dup_token(&buf, NULL);
5529 if (!pctx.spec->pool_name)
Alex Elder859c31d2012-10-25 23:34:42 -05005530 goto out_mem;
Ilya Dryomovc3001562018-07-03 15:28:43 +02005531 if (!*pctx.spec->pool_name) {
Alex Elder4fb5d6712012-11-01 10:17:15 -05005532 rbd_warn(NULL, "no pool name provided");
5533 goto out_err;
5534 }
Alex Eldere28fff262012-02-02 08:13:30 -06005535
Ilya Dryomovc3001562018-07-03 15:28:43 +02005536 pctx.spec->image_name = dup_token(&buf, NULL);
5537 if (!pctx.spec->image_name)
Alex Elderf28e5652012-10-25 23:34:41 -05005538 goto out_mem;
Ilya Dryomovc3001562018-07-03 15:28:43 +02005539 if (!*pctx.spec->image_name) {
Alex Elder4fb5d6712012-11-01 10:17:15 -05005540 rbd_warn(NULL, "no image name provided");
5541 goto out_err;
5542 }
Alex Eldere28fff262012-02-02 08:13:30 -06005543
Alex Elderf28e5652012-10-25 23:34:41 -05005544 /*
5545 * Snapshot name is optional; default is to use "-"
5546 * (indicating the head/no snapshot).
5547 */
Alex Elder3feeb8942012-08-31 17:29:52 -05005548 len = next_token(&buf);
Alex Elder820a5f32012-07-09 21:04:24 -05005549 if (!len) {
Alex Elder3feeb8942012-08-31 17:29:52 -05005550 buf = RBD_SNAP_HEAD_NAME; /* No snapshot supplied */
5551 len = sizeof (RBD_SNAP_HEAD_NAME) - 1;
Alex Elderf28e5652012-10-25 23:34:41 -05005552 } else if (len > RBD_MAX_SNAP_NAME_LEN) {
Alex Elderdc79b112012-10-25 23:34:41 -05005553 ret = -ENAMETOOLONG;
Alex Elderf28e5652012-10-25 23:34:41 -05005554 goto out_err;
Alex Elder849b4262012-07-09 21:04:24 -05005555 }
Alex Elderecb4dc222013-04-26 09:43:47 -05005556 snap_name = kmemdup(buf, len + 1, GFP_KERNEL);
5557 if (!snap_name)
Alex Elderf28e5652012-10-25 23:34:41 -05005558 goto out_mem;
Alex Elderecb4dc222013-04-26 09:43:47 -05005559 *(snap_name + len) = '\0';
Ilya Dryomovc3001562018-07-03 15:28:43 +02005560 pctx.spec->snap_name = snap_name;
Alex Eldere5c35532012-10-25 23:34:41 -05005561
Alex Elder0ddebc02012-10-25 23:34:41 -05005562 /* Initialize all rbd options to the defaults */
Alex Eldere28fff262012-02-02 08:13:30 -06005563
Ilya Dryomovc3001562018-07-03 15:28:43 +02005564 pctx.opts = kzalloc(sizeof(*pctx.opts), GFP_KERNEL);
5565 if (!pctx.opts)
Alex Elder4e9afeb2012-10-25 23:34:41 -05005566 goto out_mem;
5567
Ilya Dryomovc3001562018-07-03 15:28:43 +02005568 pctx.opts->read_only = RBD_READ_ONLY_DEFAULT;
5569 pctx.opts->queue_depth = RBD_QUEUE_DEPTH_DEFAULT;
Ilya Dryomov0c93e1b2019-01-30 15:14:48 +01005570 pctx.opts->alloc_size = RBD_ALLOC_SIZE_DEFAULT;
Ilya Dryomovc3001562018-07-03 15:28:43 +02005571 pctx.opts->lock_timeout = RBD_LOCK_TIMEOUT_DEFAULT;
5572 pctx.opts->lock_on_read = RBD_LOCK_ON_READ_DEFAULT;
5573 pctx.opts->exclusive = RBD_EXCLUSIVE_DEFAULT;
5574 pctx.opts->trim = RBD_TRIM_DEFAULT;
Alex Elderd22f76e2012-07-12 10:46:35 -05005575
Alex Elder859c31d2012-10-25 23:34:42 -05005576 copts = ceph_parse_options(options, mon_addrs,
Ilya Dryomovc3001562018-07-03 15:28:43 +02005577 mon_addrs + mon_addrs_size - 1,
5578 parse_rbd_opts_token, &pctx);
Alex Elder859c31d2012-10-25 23:34:42 -05005579 if (IS_ERR(copts)) {
5580 ret = PTR_ERR(copts);
Alex Elderdc79b112012-10-25 23:34:41 -05005581 goto out_err;
5582 }
Alex Elder859c31d2012-10-25 23:34:42 -05005583 kfree(options);
5584
5585 *ceph_opts = copts;
Ilya Dryomovc3001562018-07-03 15:28:43 +02005586 *opts = pctx.opts;
5587 *rbd_spec = pctx.spec;
Alex Elder0ddebc02012-10-25 23:34:41 -05005588
Alex Elderdc79b112012-10-25 23:34:41 -05005589 return 0;
Alex Elderf28e5652012-10-25 23:34:41 -05005590out_mem:
Alex Elderdc79b112012-10-25 23:34:41 -05005591 ret = -ENOMEM;
Alex Elderd22f76e2012-07-12 10:46:35 -05005592out_err:
Ilya Dryomovc3001562018-07-03 15:28:43 +02005593 kfree(pctx.opts);
5594 rbd_spec_put(pctx.spec);
Alex Elderf28e5652012-10-25 23:34:41 -05005595 kfree(options);
Alex Elderd22f76e2012-07-12 10:46:35 -05005596
Alex Elderdc79b112012-10-25 23:34:41 -05005597 return ret;
Alex Eldera725f65e2012-02-02 08:13:30 -06005598}
5599
Ilya Dryomove010dd02017-04-13 12:17:39 +02005600static void rbd_dev_image_unlock(struct rbd_device *rbd_dev)
5601{
5602 down_write(&rbd_dev->lock_rwsem);
5603 if (__rbd_is_lock_owner(rbd_dev))
5604 rbd_unlock(rbd_dev);
5605 up_write(&rbd_dev->lock_rwsem);
5606}
5607
5608static int rbd_add_acquire_lock(struct rbd_device *rbd_dev)
5609{
Ilya Dryomov2f18d462018-04-04 10:15:38 +02005610 int ret;
5611
Ilya Dryomove010dd02017-04-13 12:17:39 +02005612 if (!(rbd_dev->header.features & RBD_FEATURE_EXCLUSIVE_LOCK)) {
5613 rbd_warn(rbd_dev, "exclusive-lock feature is not enabled");
5614 return -EINVAL;
5615 }
5616
5617 /* FIXME: "rbd map --exclusive" should be in interruptible */
5618 down_read(&rbd_dev->lock_rwsem);
Ilya Dryomov2f18d462018-04-04 10:15:38 +02005619 ret = rbd_wait_state_locked(rbd_dev, true);
Ilya Dryomove010dd02017-04-13 12:17:39 +02005620 up_read(&rbd_dev->lock_rwsem);
Ilya Dryomov2f18d462018-04-04 10:15:38 +02005621 if (ret) {
Ilya Dryomove010dd02017-04-13 12:17:39 +02005622 rbd_warn(rbd_dev, "failed to acquire exclusive lock");
5623 return -EROFS;
5624 }
5625
5626 return 0;
5627}
5628
Ilya Dryomov30ba1f02014-05-13 11:19:27 +04005629/*
Alex Elder589d30e2012-07-10 20:30:11 -05005630 * An rbd format 2 image has a unique identifier, distinct from the
5631 * name given to it by the user. Internally, that identifier is
5632 * what's used to specify the names of objects related to the image.
5633 *
5634 * A special "rbd id" object is used to map an rbd image name to its
5635 * id. If that object doesn't exist, then there is no v2 rbd image
5636 * with the supplied name.
5637 *
5638 * This function will record the given rbd_dev's image_id field if
5639 * it can be determined, and in that case will return 0. If any
5640 * errors occur a negative errno will be returned and the rbd_dev's
5641 * image_id field will be unchanged (and should be NULL).
5642 */
5643static int rbd_dev_image_id(struct rbd_device *rbd_dev)
5644{
5645 int ret;
5646 size_t size;
Ilya Dryomovecd4a682017-01-25 18:16:21 +01005647 CEPH_DEFINE_OID_ONSTACK(oid);
Alex Elder589d30e2012-07-10 20:30:11 -05005648 void *response;
Alex Elderc0fba362013-04-25 23:15:08 -05005649 char *image_id;
Alex Elder2f82ee52012-10-30 19:40:33 -05005650
Alex Elder589d30e2012-07-10 20:30:11 -05005651 /*
Alex Elder2c0d0a12012-10-30 19:40:33 -05005652 * When probing a parent image, the image id is already
5653 * known (and the image name likely is not). There's no
Alex Elderc0fba362013-04-25 23:15:08 -05005654 * need to fetch the image id again in this case. We
5655 * do still need to set the image format though.
Alex Elder2c0d0a12012-10-30 19:40:33 -05005656 */
Alex Elderc0fba362013-04-25 23:15:08 -05005657 if (rbd_dev->spec->image_id) {
5658 rbd_dev->image_format = *rbd_dev->spec->image_id ? 2 : 1;
5659
Alex Elder2c0d0a12012-10-30 19:40:33 -05005660 return 0;
Alex Elderc0fba362013-04-25 23:15:08 -05005661 }
Alex Elder2c0d0a12012-10-30 19:40:33 -05005662
5663 /*
Alex Elder589d30e2012-07-10 20:30:11 -05005664 * First, see if the format 2 image id file exists, and if
5665 * so, get the image's persistent id from it.
5666 */
Ilya Dryomovecd4a682017-01-25 18:16:21 +01005667 ret = ceph_oid_aprintf(&oid, GFP_KERNEL, "%s%s", RBD_ID_PREFIX,
5668 rbd_dev->spec->image_name);
5669 if (ret)
5670 return ret;
5671
5672 dout("rbd id object name is %s\n", oid.name);
Alex Elder589d30e2012-07-10 20:30:11 -05005673
5674 /* Response will be an encoded string, which includes a length */
5675
5676 size = sizeof (__le32) + RBD_IMAGE_ID_LEN_MAX;
5677 response = kzalloc(size, GFP_NOIO);
5678 if (!response) {
5679 ret = -ENOMEM;
5680 goto out;
5681 }
5682
Alex Elderc0fba362013-04-25 23:15:08 -05005683 /* If it doesn't exist we'll assume it's a format 1 image */
5684
Ilya Dryomovecd4a682017-01-25 18:16:21 +01005685 ret = rbd_obj_method_sync(rbd_dev, &oid, &rbd_dev->header_oloc,
5686 "get_id", NULL, 0,
5687 response, RBD_IMAGE_ID_LEN_MAX);
Alex Elder36be9a72013-01-19 00:30:28 -06005688 dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
Alex Elderc0fba362013-04-25 23:15:08 -05005689 if (ret == -ENOENT) {
5690 image_id = kstrdup("", GFP_KERNEL);
5691 ret = image_id ? 0 : -ENOMEM;
5692 if (!ret)
5693 rbd_dev->image_format = 1;
Ilya Dryomov7dd440c2014-09-11 18:49:18 +04005694 } else if (ret >= 0) {
Alex Elderc0fba362013-04-25 23:15:08 -05005695 void *p = response;
Alex Elder589d30e2012-07-10 20:30:11 -05005696
Alex Elderc0fba362013-04-25 23:15:08 -05005697 image_id = ceph_extract_encoded_string(&p, p + ret,
Alex Elder979ed482012-11-01 08:39:26 -05005698 NULL, GFP_NOIO);
Duan Jiong461f7582014-04-11 16:38:12 +08005699 ret = PTR_ERR_OR_ZERO(image_id);
Alex Elderc0fba362013-04-25 23:15:08 -05005700 if (!ret)
5701 rbd_dev->image_format = 2;
Alex Elderc0fba362013-04-25 23:15:08 -05005702 }
5703
5704 if (!ret) {
5705 rbd_dev->spec->image_id = image_id;
5706 dout("image_id is %s\n", image_id);
Alex Elder589d30e2012-07-10 20:30:11 -05005707 }
5708out:
5709 kfree(response);
Ilya Dryomovecd4a682017-01-25 18:16:21 +01005710 ceph_oid_destroy(&oid);
Alex Elder589d30e2012-07-10 20:30:11 -05005711 return ret;
5712}
5713
Alex Elder3abef3b2013-05-13 20:35:37 -05005714/*
5715 * Undo whatever state changes are made by v1 or v2 header info
5716 * call.
5717 */
Alex Elder6fd48b32013-04-28 23:32:34 -05005718static void rbd_dev_unprobe(struct rbd_device *rbd_dev)
5719{
5720 struct rbd_image_header *header;
5721
Ilya Dryomove69b8d42015-01-19 12:06:14 +03005722 rbd_dev_parent_put(rbd_dev);
Alex Elder6fd48b32013-04-28 23:32:34 -05005723
5724 /* Free dynamic fields from the header, then zero it out */
5725
5726 header = &rbd_dev->header;
Alex Elder812164f82013-04-30 00:44:32 -05005727 ceph_put_snap_context(header->snapc);
Alex Elder6fd48b32013-04-28 23:32:34 -05005728 kfree(header->snap_sizes);
5729 kfree(header->snap_names);
5730 kfree(header->object_prefix);
5731 memset(header, 0, sizeof (*header));
5732}
5733
Alex Elder2df3fac2013-05-06 09:51:30 -05005734static int rbd_dev_v2_header_onetime(struct rbd_device *rbd_dev)
Alex Eldera30b71b2012-07-10 20:30:11 -05005735{
5736 int ret;
Alex Eldera30b71b2012-07-10 20:30:11 -05005737
Alex Elder1e130192012-07-03 16:01:19 -05005738 ret = rbd_dev_v2_object_prefix(rbd_dev);
Alex Elder57385b52013-04-21 12:14:45 -05005739 if (ret)
Alex Elder1e130192012-07-03 16:01:19 -05005740 goto out_err;
Alex Elderb1b54022012-07-03 16:01:19 -05005741
Alex Elder2df3fac2013-05-06 09:51:30 -05005742 /*
5743 * Get the and check features for the image. Currently the
5744 * features are assumed to never change.
5745 */
Alex Elderb1b54022012-07-03 16:01:19 -05005746 ret = rbd_dev_v2_features(rbd_dev);
Alex Elder57385b52013-04-21 12:14:45 -05005747 if (ret)
Alex Elderb1b54022012-07-03 16:01:19 -05005748 goto out_err;
Alex Elder35d489f2012-07-03 16:01:19 -05005749
Alex Eldercc070d52013-04-21 12:14:45 -05005750 /* If the image supports fancy striping, get its parameters */
5751
5752 if (rbd_dev->header.features & RBD_FEATURE_STRIPINGV2) {
5753 ret = rbd_dev_v2_striping_info(rbd_dev);
5754 if (ret < 0)
5755 goto out_err;
5756 }
Alex Eldera30b71b2012-07-10 20:30:11 -05005757
Ilya Dryomov7e973322017-01-25 18:16:22 +01005758 if (rbd_dev->header.features & RBD_FEATURE_DATA_POOL) {
5759 ret = rbd_dev_v2_data_pool(rbd_dev);
5760 if (ret)
5761 goto out_err;
5762 }
5763
Ilya Dryomov263423f2017-01-25 18:16:22 +01005764 rbd_init_layout(rbd_dev);
Alex Elder35152972012-08-31 17:29:55 -05005765 return 0;
Ilya Dryomov263423f2017-01-25 18:16:22 +01005766
Alex Elder9d475de2012-07-03 16:01:19 -05005767out_err:
Alex Elder642a2532013-05-06 17:40:33 -05005768 rbd_dev->header.features = 0;
Alex Elder1e130192012-07-03 16:01:19 -05005769 kfree(rbd_dev->header.object_prefix);
5770 rbd_dev->header.object_prefix = NULL;
Alex Elder9d475de2012-07-03 16:01:19 -05005771 return ret;
Alex Eldera30b71b2012-07-10 20:30:11 -05005772}
5773
Ilya Dryomov6d69bb532015-10-11 19:38:00 +02005774/*
5775 * @depth is rbd_dev_image_probe() -> rbd_dev_probe_parent() ->
5776 * rbd_dev_image_probe() recursion depth, which means it's also the
5777 * length of the already discovered part of the parent chain.
5778 */
5779static int rbd_dev_probe_parent(struct rbd_device *rbd_dev, int depth)
Alex Elder83a06262012-10-30 15:47:17 -05005780{
Alex Elder2f82ee52012-10-30 19:40:33 -05005781 struct rbd_device *parent = NULL;
Alex Elder124afba2013-04-26 15:44:36 -05005782 int ret;
5783
5784 if (!rbd_dev->parent_spec)
5785 return 0;
Alex Elder124afba2013-04-26 15:44:36 -05005786
Ilya Dryomov6d69bb532015-10-11 19:38:00 +02005787 if (++depth > RBD_MAX_PARENT_CHAIN_LEN) {
5788 pr_info("parent chain is too long (%d)\n", depth);
5789 ret = -EINVAL;
5790 goto out_err;
5791 }
5792
Ilya Dryomov1643dfa2016-08-12 15:45:52 +02005793 parent = __rbd_dev_create(rbd_dev->rbd_client, rbd_dev->parent_spec);
Ilya Dryomov1f2c6652015-10-11 19:38:00 +02005794 if (!parent) {
5795 ret = -ENOMEM;
Alex Elder124afba2013-04-26 15:44:36 -05005796 goto out_err;
Ilya Dryomov1f2c6652015-10-11 19:38:00 +02005797 }
5798
5799 /*
5800 * Images related by parent/child relationships always share
5801 * rbd_client and spec/parent_spec, so bump their refcounts.
5802 */
5803 __rbd_get_client(rbd_dev->rbd_client);
5804 rbd_spec_get(rbd_dev->parent_spec);
Alex Elder124afba2013-04-26 15:44:36 -05005805
Ilya Dryomov6d69bb532015-10-11 19:38:00 +02005806 ret = rbd_dev_image_probe(parent, depth);
Alex Elder124afba2013-04-26 15:44:36 -05005807 if (ret < 0)
5808 goto out_err;
Ilya Dryomov1f2c6652015-10-11 19:38:00 +02005809
Alex Elder124afba2013-04-26 15:44:36 -05005810 rbd_dev->parent = parent;
Alex Eldera2acd002013-05-08 22:50:04 -05005811 atomic_set(&rbd_dev->parent_ref, 1);
Alex Elder124afba2013-04-26 15:44:36 -05005812 return 0;
Alex Elder124afba2013-04-26 15:44:36 -05005813
Ilya Dryomov1f2c6652015-10-11 19:38:00 +02005814out_err:
5815 rbd_dev_unparent(rbd_dev);
Markus Elfring1761b222015-11-23 20:16:45 +01005816 rbd_dev_destroy(parent);
Alex Elder124afba2013-04-26 15:44:36 -05005817 return ret;
5818}
5819
Ilya Dryomov5769ed02017-04-13 12:17:38 +02005820static void rbd_dev_device_release(struct rbd_device *rbd_dev)
5821{
5822 clear_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags);
5823 rbd_dev_mapping_clear(rbd_dev);
5824 rbd_free_disk(rbd_dev);
5825 if (!single_major)
5826 unregister_blkdev(rbd_dev->major, rbd_dev->name);
5827}
5828
Ilya Dryomov811c6682016-04-15 16:22:16 +02005829/*
5830 * rbd_dev->header_rwsem must be locked for write and will be unlocked
5831 * upon return.
5832 */
Alex Elder200a6a82013-04-28 23:32:34 -05005833static int rbd_dev_device_setup(struct rbd_device *rbd_dev)
Alex Elder124afba2013-04-26 15:44:36 -05005834{
Alex Elder83a06262012-10-30 15:47:17 -05005835 int ret;
Alex Elder83a06262012-10-30 15:47:17 -05005836
Ilya Dryomov9b60e702013-12-13 15:28:57 +02005837 /* Record our major and minor device numbers. */
Alex Elder83a06262012-10-30 15:47:17 -05005838
Ilya Dryomov9b60e702013-12-13 15:28:57 +02005839 if (!single_major) {
5840 ret = register_blkdev(0, rbd_dev->name);
5841 if (ret < 0)
Ilya Dryomov1643dfa2016-08-12 15:45:52 +02005842 goto err_out_unlock;
Ilya Dryomov9b60e702013-12-13 15:28:57 +02005843
5844 rbd_dev->major = ret;
5845 rbd_dev->minor = 0;
5846 } else {
5847 rbd_dev->major = rbd_major;
5848 rbd_dev->minor = rbd_dev_id_to_minor(rbd_dev->dev_id);
5849 }
Alex Elder83a06262012-10-30 15:47:17 -05005850
5851 /* Set up the blkdev mapping. */
5852
5853 ret = rbd_init_disk(rbd_dev);
5854 if (ret)
5855 goto err_out_blkdev;
5856
Alex Elderf35a4de2013-05-06 09:51:29 -05005857 ret = rbd_dev_mapping_set(rbd_dev);
Alex Elder83a06262012-10-30 15:47:17 -05005858 if (ret)
5859 goto err_out_disk;
Ilya Dryomovbc1ecc62014-08-04 18:04:39 +04005860
Alex Elderf35a4de2013-05-06 09:51:29 -05005861 set_capacity(rbd_dev->disk, rbd_dev->mapping.size / SECTOR_SIZE);
Ilya Dryomov9568c932017-10-12 12:35:19 +02005862 set_disk_ro(rbd_dev->disk, rbd_dev->opts->read_only);
Alex Elderf35a4de2013-05-06 09:51:29 -05005863
Ilya Dryomov5769ed02017-04-13 12:17:38 +02005864 ret = dev_set_name(&rbd_dev->dev, "%d", rbd_dev->dev_id);
Alex Elderf35a4de2013-05-06 09:51:29 -05005865 if (ret)
Ilya Dryomovf5ee37b2014-10-09 17:06:01 +04005866 goto err_out_mapping;
Alex Elder83a06262012-10-30 15:47:17 -05005867
Alex Elder129b79d2013-04-26 15:44:36 -05005868 set_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags);
Ilya Dryomov811c6682016-04-15 16:22:16 +02005869 up_write(&rbd_dev->header_rwsem);
Ilya Dryomov5769ed02017-04-13 12:17:38 +02005870 return 0;
Alex Elder2f82ee52012-10-30 19:40:33 -05005871
Alex Elderf35a4de2013-05-06 09:51:29 -05005872err_out_mapping:
5873 rbd_dev_mapping_clear(rbd_dev);
Alex Elder83a06262012-10-30 15:47:17 -05005874err_out_disk:
5875 rbd_free_disk(rbd_dev);
5876err_out_blkdev:
Ilya Dryomov9b60e702013-12-13 15:28:57 +02005877 if (!single_major)
5878 unregister_blkdev(rbd_dev->major, rbd_dev->name);
Ilya Dryomov811c6682016-04-15 16:22:16 +02005879err_out_unlock:
5880 up_write(&rbd_dev->header_rwsem);
Alex Elder83a06262012-10-30 15:47:17 -05005881 return ret;
5882}
5883
Alex Elder332bb122013-04-27 09:59:30 -05005884static int rbd_dev_header_name(struct rbd_device *rbd_dev)
5885{
5886 struct rbd_spec *spec = rbd_dev->spec;
Ilya Dryomovc41d13a2016-04-29 20:01:25 +02005887 int ret;
Alex Elder332bb122013-04-27 09:59:30 -05005888
5889 /* Record the header object name for this rbd image. */
5890
5891 rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
Alex Elder332bb122013-04-27 09:59:30 -05005892 if (rbd_dev->image_format == 1)
Ilya Dryomovc41d13a2016-04-29 20:01:25 +02005893 ret = ceph_oid_aprintf(&rbd_dev->header_oid, GFP_KERNEL, "%s%s",
5894 spec->image_name, RBD_SUFFIX);
Alex Elder332bb122013-04-27 09:59:30 -05005895 else
Ilya Dryomovc41d13a2016-04-29 20:01:25 +02005896 ret = ceph_oid_aprintf(&rbd_dev->header_oid, GFP_KERNEL, "%s%s",
5897 RBD_HEADER_PREFIX, spec->image_id);
Alex Elder332bb122013-04-27 09:59:30 -05005898
Ilya Dryomovc41d13a2016-04-29 20:01:25 +02005899 return ret;
Alex Elder332bb122013-04-27 09:59:30 -05005900}
5901
Alex Elder200a6a82013-04-28 23:32:34 -05005902static void rbd_dev_image_release(struct rbd_device *rbd_dev)
5903{
Alex Elder6fd48b32013-04-28 23:32:34 -05005904 rbd_dev_unprobe(rbd_dev);
Ilya Dryomovfd22aef2017-04-13 12:17:37 +02005905 if (rbd_dev->opts)
5906 rbd_unregister_watch(rbd_dev);
Alex Elder6fd48b32013-04-28 23:32:34 -05005907 rbd_dev->image_format = 0;
5908 kfree(rbd_dev->spec->image_id);
5909 rbd_dev->spec->image_id = NULL;
Alex Elder200a6a82013-04-28 23:32:34 -05005910}
5911
Alex Eldera30b71b2012-07-10 20:30:11 -05005912/*
5913 * Probe for the existence of the header object for the given rbd
Alex Elder1f3ef782013-05-06 17:40:33 -05005914 * device. If this image is the one being mapped (i.e., not a
5915 * parent), initiate a watch on its header object before using that
5916 * object to get detailed information about the rbd image.
Alex Eldera30b71b2012-07-10 20:30:11 -05005917 */
Ilya Dryomov6d69bb532015-10-11 19:38:00 +02005918static int rbd_dev_image_probe(struct rbd_device *rbd_dev, int depth)
Alex Eldera30b71b2012-07-10 20:30:11 -05005919{
5920 int ret;
5921
5922 /*
Alex Elder3abef3b2013-05-13 20:35:37 -05005923 * Get the id from the image id object. Unless there's an
5924 * error, rbd_dev->spec->image_id will be filled in with
5925 * a dynamically-allocated string, and rbd_dev->image_format
5926 * will be set to either 1 or 2.
Alex Eldera30b71b2012-07-10 20:30:11 -05005927 */
5928 ret = rbd_dev_image_id(rbd_dev);
5929 if (ret)
Alex Elderc0fba362013-04-25 23:15:08 -05005930 return ret;
Alex Elderc0fba362013-04-25 23:15:08 -05005931
Alex Elder332bb122013-04-27 09:59:30 -05005932 ret = rbd_dev_header_name(rbd_dev);
5933 if (ret)
5934 goto err_out_format;
5935
Ilya Dryomov6d69bb532015-10-11 19:38:00 +02005936 if (!depth) {
Ilya Dryomov99d16942016-08-12 16:11:41 +02005937 ret = rbd_register_watch(rbd_dev);
Ilya Dryomov1fe48022015-03-05 10:47:22 +03005938 if (ret) {
5939 if (ret == -ENOENT)
Ilya Dryomovb26c0472018-07-03 15:28:43 +02005940 pr_info("image %s/%s%s%s does not exist\n",
Ilya Dryomov1fe48022015-03-05 10:47:22 +03005941 rbd_dev->spec->pool_name,
Ilya Dryomovb26c0472018-07-03 15:28:43 +02005942 rbd_dev->spec->pool_ns ?: "",
5943 rbd_dev->spec->pool_ns ? "/" : "",
Ilya Dryomov1fe48022015-03-05 10:47:22 +03005944 rbd_dev->spec->image_name);
Ilya Dryomovc41d13a2016-04-29 20:01:25 +02005945 goto err_out_format;
Ilya Dryomov1fe48022015-03-05 10:47:22 +03005946 }
Alex Elder1f3ef782013-05-06 17:40:33 -05005947 }
Alex Elderb644de22013-04-27 09:59:31 -05005948
Ilya Dryomova720ae02014-07-23 17:11:19 +04005949 ret = rbd_dev_header_info(rbd_dev);
Alex Elder5655c4d2013-04-25 23:15:08 -05005950 if (ret)
Alex Elderb644de22013-04-27 09:59:31 -05005951 goto err_out_watch;
Alex Elder83a06262012-10-30 15:47:17 -05005952
Ilya Dryomov04077592014-07-23 17:11:20 +04005953 /*
5954 * If this image is the one being mapped, we have pool name and
5955 * id, image name and id, and snap name - need to fill snap id.
5956 * Otherwise this is a parent image, identified by pool, image
5957 * and snap ids - need to fill in names for those ids.
5958 */
Ilya Dryomov6d69bb532015-10-11 19:38:00 +02005959 if (!depth)
Ilya Dryomov04077592014-07-23 17:11:20 +04005960 ret = rbd_spec_fill_snap_id(rbd_dev);
5961 else
5962 ret = rbd_spec_fill_names(rbd_dev);
Ilya Dryomov1fe48022015-03-05 10:47:22 +03005963 if (ret) {
5964 if (ret == -ENOENT)
Ilya Dryomovb26c0472018-07-03 15:28:43 +02005965 pr_info("snap %s/%s%s%s@%s does not exist\n",
Ilya Dryomov1fe48022015-03-05 10:47:22 +03005966 rbd_dev->spec->pool_name,
Ilya Dryomovb26c0472018-07-03 15:28:43 +02005967 rbd_dev->spec->pool_ns ?: "",
5968 rbd_dev->spec->pool_ns ? "/" : "",
Ilya Dryomov1fe48022015-03-05 10:47:22 +03005969 rbd_dev->spec->image_name,
5970 rbd_dev->spec->snap_name);
Alex Elder33dca392013-04-30 00:44:33 -05005971 goto err_out_probe;
Ilya Dryomov1fe48022015-03-05 10:47:22 +03005972 }
Alex Elder9bb81c92013-04-27 09:59:30 -05005973
Ilya Dryomove8f59b52014-07-24 10:42:13 +04005974 if (rbd_dev->header.features & RBD_FEATURE_LAYERING) {
5975 ret = rbd_dev_v2_parent_info(rbd_dev);
5976 if (ret)
5977 goto err_out_probe;
Ilya Dryomove8f59b52014-07-24 10:42:13 +04005978 }
5979
Ilya Dryomov6d69bb532015-10-11 19:38:00 +02005980 ret = rbd_dev_probe_parent(rbd_dev, depth);
Alex Elder30d60ba2013-05-06 09:51:30 -05005981 if (ret)
5982 goto err_out_probe;
Alex Elder83a06262012-10-30 15:47:17 -05005983
Alex Elder30d60ba2013-05-06 09:51:30 -05005984 dout("discovered format %u image, header name is %s\n",
Ilya Dryomovc41d13a2016-04-29 20:01:25 +02005985 rbd_dev->image_format, rbd_dev->header_oid.name);
Alex Elder30d60ba2013-05-06 09:51:30 -05005986 return 0;
Ilya Dryomove8f59b52014-07-24 10:42:13 +04005987
Alex Elder6fd48b32013-04-28 23:32:34 -05005988err_out_probe:
5989 rbd_dev_unprobe(rbd_dev);
Alex Elderb644de22013-04-27 09:59:31 -05005990err_out_watch:
Ilya Dryomov6d69bb532015-10-11 19:38:00 +02005991 if (!depth)
Ilya Dryomov99d16942016-08-12 16:11:41 +02005992 rbd_unregister_watch(rbd_dev);
Alex Elder332bb122013-04-27 09:59:30 -05005993err_out_format:
5994 rbd_dev->image_format = 0;
Alex Elder5655c4d2013-04-25 23:15:08 -05005995 kfree(rbd_dev->spec->image_id);
5996 rbd_dev->spec->image_id = NULL;
Alex Elder5655c4d2013-04-25 23:15:08 -05005997 return ret;
Alex Eldera30b71b2012-07-10 20:30:11 -05005998}
5999
Ilya Dryomov9b60e702013-12-13 15:28:57 +02006000static ssize_t do_rbd_add(struct bus_type *bus,
6001 const char *buf,
6002 size_t count)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07006003{
Alex Eldercb8627c2012-07-09 21:04:23 -05006004 struct rbd_device *rbd_dev = NULL;
Alex Elderdc79b112012-10-25 23:34:41 -05006005 struct ceph_options *ceph_opts = NULL;
Alex Elder4e9afeb2012-10-25 23:34:41 -05006006 struct rbd_options *rbd_opts = NULL;
Alex Elder859c31d2012-10-25 23:34:42 -05006007 struct rbd_spec *spec = NULL;
Alex Elder9d3997f2012-10-25 23:34:42 -05006008 struct rbd_client *rbdc;
Ilya Dryomovb51c83c2015-10-15 15:38:57 +02006009 int rc;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07006010
6011 if (!try_module_get(THIS_MODULE))
6012 return -ENODEV;
6013
Alex Eldera725f65e2012-02-02 08:13:30 -06006014 /* parse add command */
Alex Elder859c31d2012-10-25 23:34:42 -05006015 rc = rbd_add_parse_args(buf, &ceph_opts, &rbd_opts, &spec);
Alex Elderdc79b112012-10-25 23:34:41 -05006016 if (rc < 0)
Ilya Dryomovdd5ac322015-10-16 17:09:24 +02006017 goto out;
Alex Eldera725f65e2012-02-02 08:13:30 -06006018
Alex Elder9d3997f2012-10-25 23:34:42 -05006019 rbdc = rbd_get_client(ceph_opts);
6020 if (IS_ERR(rbdc)) {
6021 rc = PTR_ERR(rbdc);
Alex Elder0ddebc02012-10-25 23:34:41 -05006022 goto err_out_args;
Alex Elder9d3997f2012-10-25 23:34:42 -05006023 }
Yehuda Sadeh602adf42010-08-12 16:11:25 -07006024
Yehuda Sadeh602adf42010-08-12 16:11:25 -07006025 /* pick the pool */
Ilya Dryomovdd435852018-02-22 13:43:24 +01006026 rc = ceph_pg_poolid_by_name(rbdc->client->osdc.osdmap, spec->pool_name);
Ilya Dryomov1fe48022015-03-05 10:47:22 +03006027 if (rc < 0) {
6028 if (rc == -ENOENT)
6029 pr_info("pool %s does not exist\n", spec->pool_name);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07006030 goto err_out_client;
Ilya Dryomov1fe48022015-03-05 10:47:22 +03006031 }
Alex Elderc0cd10db2013-04-26 09:43:47 -05006032 spec->pool_id = (u64)rc;
Alex Elder859c31d2012-10-25 23:34:42 -05006033
Ilya Dryomovd1475432015-06-22 13:24:48 +03006034 rbd_dev = rbd_dev_create(rbdc, spec, rbd_opts);
Ilya Dryomovb51c83c2015-10-15 15:38:57 +02006035 if (!rbd_dev) {
6036 rc = -ENOMEM;
Alex Elderbd4ba652012-10-25 23:34:42 -05006037 goto err_out_client;
Ilya Dryomovb51c83c2015-10-15 15:38:57 +02006038 }
Alex Elderc53d5892012-10-25 23:34:42 -05006039 rbdc = NULL; /* rbd_dev now owns this */
6040 spec = NULL; /* rbd_dev now owns this */
Ilya Dryomovd1475432015-06-22 13:24:48 +03006041 rbd_opts = NULL; /* rbd_dev now owns this */
Yehuda Sadeh602adf42010-08-12 16:11:25 -07006042
Mike Christie0d6d1e9c2016-08-18 18:38:45 +02006043 rbd_dev->config_info = kstrdup(buf, GFP_KERNEL);
6044 if (!rbd_dev->config_info) {
6045 rc = -ENOMEM;
6046 goto err_out_rbd_dev;
6047 }
6048
Ilya Dryomov811c6682016-04-15 16:22:16 +02006049 down_write(&rbd_dev->header_rwsem);
Ilya Dryomov6d69bb532015-10-11 19:38:00 +02006050 rc = rbd_dev_image_probe(rbd_dev, 0);
Mike Christie0d6d1e9c2016-08-18 18:38:45 +02006051 if (rc < 0) {
6052 up_write(&rbd_dev->header_rwsem);
Alex Elderc53d5892012-10-25 23:34:42 -05006053 goto err_out_rbd_dev;
Mike Christie0d6d1e9c2016-08-18 18:38:45 +02006054 }
Alex Elder05fd6f62012-08-29 17:11:07 -05006055
Alex Elder7ce4eef2013-05-06 17:40:33 -05006056 /* If we are mapping a snapshot it must be marked read-only */
Alex Elder7ce4eef2013-05-06 17:40:33 -05006057 if (rbd_dev->spec->snap_id != CEPH_NOSNAP)
Ilya Dryomov9568c932017-10-12 12:35:19 +02006058 rbd_dev->opts->read_only = true;
Alex Elder7ce4eef2013-05-06 17:40:33 -05006059
Ilya Dryomov0c93e1b2019-01-30 15:14:48 +01006060 if (rbd_dev->opts->alloc_size > rbd_dev->layout.object_size) {
6061 rbd_warn(rbd_dev, "alloc_size adjusted to %u",
6062 rbd_dev->layout.object_size);
6063 rbd_dev->opts->alloc_size = rbd_dev->layout.object_size;
6064 }
6065
Alex Elderb536f692013-04-28 23:32:34 -05006066 rc = rbd_dev_device_setup(rbd_dev);
Ilya Dryomovfd22aef2017-04-13 12:17:37 +02006067 if (rc)
Ilya Dryomov8b679ec2017-04-13 12:17:37 +02006068 goto err_out_image_probe;
Alex Elderb536f692013-04-28 23:32:34 -05006069
Ilya Dryomove010dd02017-04-13 12:17:39 +02006070 if (rbd_dev->opts->exclusive) {
6071 rc = rbd_add_acquire_lock(rbd_dev);
6072 if (rc)
6073 goto err_out_device_setup;
Alex Elderb536f692013-04-28 23:32:34 -05006074 }
6075
Ilya Dryomov5769ed02017-04-13 12:17:38 +02006076 /* Everything's ready. Announce the disk to the world. */
6077
6078 rc = device_add(&rbd_dev->dev);
6079 if (rc)
Ilya Dryomove010dd02017-04-13 12:17:39 +02006080 goto err_out_image_lock;
Ilya Dryomov5769ed02017-04-13 12:17:38 +02006081
6082 add_disk(rbd_dev->disk);
6083 /* see rbd_init_disk() */
6084 blk_put_queue(rbd_dev->disk->queue);
6085
6086 spin_lock(&rbd_dev_list_lock);
6087 list_add_tail(&rbd_dev->node, &rbd_dev_list);
6088 spin_unlock(&rbd_dev_list_lock);
6089
6090 pr_info("%s: capacity %llu features 0x%llx\n", rbd_dev->disk->disk_name,
6091 (unsigned long long)get_capacity(rbd_dev->disk) << SECTOR_SHIFT,
6092 rbd_dev->header.features);
Ilya Dryomovdd5ac322015-10-16 17:09:24 +02006093 rc = count;
6094out:
6095 module_put(THIS_MODULE);
6096 return rc;
Alex Elder3abef3b2013-05-13 20:35:37 -05006097
Ilya Dryomove010dd02017-04-13 12:17:39 +02006098err_out_image_lock:
6099 rbd_dev_image_unlock(rbd_dev);
Ilya Dryomov5769ed02017-04-13 12:17:38 +02006100err_out_device_setup:
6101 rbd_dev_device_release(rbd_dev);
Ilya Dryomov8b679ec2017-04-13 12:17:37 +02006102err_out_image_probe:
6103 rbd_dev_image_release(rbd_dev);
Alex Elderc53d5892012-10-25 23:34:42 -05006104err_out_rbd_dev:
6105 rbd_dev_destroy(rbd_dev);
Alex Elderbd4ba652012-10-25 23:34:42 -05006106err_out_client:
Alex Elder9d3997f2012-10-25 23:34:42 -05006107 rbd_put_client(rbdc);
Alex Elder0ddebc02012-10-25 23:34:41 -05006108err_out_args:
Alex Elder859c31d2012-10-25 23:34:42 -05006109 rbd_spec_put(spec);
Ilya Dryomovd1475432015-06-22 13:24:48 +03006110 kfree(rbd_opts);
Ilya Dryomovdd5ac322015-10-16 17:09:24 +02006111 goto out;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07006112}
6113
Greg Kroah-Hartman7e9586b2018-12-21 08:54:38 +01006114static ssize_t add_store(struct bus_type *bus, const char *buf, size_t count)
Ilya Dryomov9b60e702013-12-13 15:28:57 +02006115{
6116 if (single_major)
6117 return -EINVAL;
6118
6119 return do_rbd_add(bus, buf, count);
6120}
6121
Greg Kroah-Hartman7e9586b2018-12-21 08:54:38 +01006122static ssize_t add_single_major_store(struct bus_type *bus, const char *buf,
6123 size_t count)
Ilya Dryomov9b60e702013-12-13 15:28:57 +02006124{
6125 return do_rbd_add(bus, buf, count);
6126}
6127
Alex Elder05a46af2013-04-26 15:44:36 -05006128static void rbd_dev_remove_parent(struct rbd_device *rbd_dev)
6129{
Alex Elderad945fc2013-04-26 15:44:36 -05006130 while (rbd_dev->parent) {
Alex Elder05a46af2013-04-26 15:44:36 -05006131 struct rbd_device *first = rbd_dev;
6132 struct rbd_device *second = first->parent;
6133 struct rbd_device *third;
6134
6135 /*
6136 * Follow to the parent with no grandparent and
6137 * remove it.
6138 */
6139 while (second && (third = second->parent)) {
6140 first = second;
6141 second = third;
6142 }
Alex Elderad945fc2013-04-26 15:44:36 -05006143 rbd_assert(second);
Alex Elder8ad42cd2013-04-28 23:32:34 -05006144 rbd_dev_image_release(second);
Ilya Dryomov8b679ec2017-04-13 12:17:37 +02006145 rbd_dev_destroy(second);
Alex Elderad945fc2013-04-26 15:44:36 -05006146 first->parent = NULL;
6147 first->parent_overlap = 0;
6148
6149 rbd_assert(first->parent_spec);
Alex Elder05a46af2013-04-26 15:44:36 -05006150 rbd_spec_put(first->parent_spec);
6151 first->parent_spec = NULL;
Alex Elder05a46af2013-04-26 15:44:36 -05006152 }
6153}
6154
Ilya Dryomov9b60e702013-12-13 15:28:57 +02006155static ssize_t do_rbd_remove(struct bus_type *bus,
6156 const char *buf,
6157 size_t count)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07006158{
6159 struct rbd_device *rbd_dev = NULL;
Alex Elder751cc0e2013-05-31 15:17:01 -05006160 struct list_head *tmp;
6161 int dev_id;
Mike Christie0276dca2016-08-18 18:38:45 +02006162 char opt_buf[6];
Mike Christie0276dca2016-08-18 18:38:45 +02006163 bool force = false;
Alex Elder0d8189e2013-04-27 09:59:30 -05006164 int ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07006165
Mike Christie0276dca2016-08-18 18:38:45 +02006166 dev_id = -1;
6167 opt_buf[0] = '\0';
6168 sscanf(buf, "%d %5s", &dev_id, opt_buf);
6169 if (dev_id < 0) {
6170 pr_err("dev_id out of range\n");
Yehuda Sadeh602adf42010-08-12 16:11:25 -07006171 return -EINVAL;
Mike Christie0276dca2016-08-18 18:38:45 +02006172 }
6173 if (opt_buf[0] != '\0') {
6174 if (!strcmp(opt_buf, "force")) {
6175 force = true;
6176 } else {
6177 pr_err("bad remove option at '%s'\n", opt_buf);
6178 return -EINVAL;
6179 }
6180 }
Yehuda Sadeh602adf42010-08-12 16:11:25 -07006181
Alex Elder751cc0e2013-05-31 15:17:01 -05006182 ret = -ENOENT;
6183 spin_lock(&rbd_dev_list_lock);
6184 list_for_each(tmp, &rbd_dev_list) {
6185 rbd_dev = list_entry(tmp, struct rbd_device, node);
6186 if (rbd_dev->dev_id == dev_id) {
6187 ret = 0;
6188 break;
6189 }
Yehuda Sadeh602adf42010-08-12 16:11:25 -07006190 }
Alex Elder751cc0e2013-05-31 15:17:01 -05006191 if (!ret) {
6192 spin_lock_irq(&rbd_dev->lock);
Mike Christie0276dca2016-08-18 18:38:45 +02006193 if (rbd_dev->open_count && !force)
Alex Elder751cc0e2013-05-31 15:17:01 -05006194 ret = -EBUSY;
Ilya Dryomov85f5a4d2019-01-08 19:47:38 +01006195 else if (test_and_set_bit(RBD_DEV_FLAG_REMOVING,
6196 &rbd_dev->flags))
6197 ret = -EINPROGRESS;
Alex Elder751cc0e2013-05-31 15:17:01 -05006198 spin_unlock_irq(&rbd_dev->lock);
6199 }
6200 spin_unlock(&rbd_dev_list_lock);
Ilya Dryomov85f5a4d2019-01-08 19:47:38 +01006201 if (ret)
Alex Elder1ba0f1e2013-05-31 15:17:01 -05006202 return ret;
Alex Elder751cc0e2013-05-31 15:17:01 -05006203
Mike Christie0276dca2016-08-18 18:38:45 +02006204 if (force) {
6205 /*
6206 * Prevent new IO from being queued and wait for existing
6207 * IO to complete/fail.
6208 */
6209 blk_mq_freeze_queue(rbd_dev->disk->queue);
6210 blk_set_queue_dying(rbd_dev->disk->queue);
6211 }
6212
Ilya Dryomov5769ed02017-04-13 12:17:38 +02006213 del_gendisk(rbd_dev->disk);
6214 spin_lock(&rbd_dev_list_lock);
6215 list_del_init(&rbd_dev->node);
6216 spin_unlock(&rbd_dev_list_lock);
6217 device_del(&rbd_dev->dev);
Ilya Dryomovfca27062013-12-16 18:02:40 +02006218
Ilya Dryomove010dd02017-04-13 12:17:39 +02006219 rbd_dev_image_unlock(rbd_dev);
Ilya Dryomovdd5ac322015-10-16 17:09:24 +02006220 rbd_dev_device_release(rbd_dev);
Alex Elder8ad42cd2013-04-28 23:32:34 -05006221 rbd_dev_image_release(rbd_dev);
Ilya Dryomov8b679ec2017-04-13 12:17:37 +02006222 rbd_dev_destroy(rbd_dev);
Alex Elder1ba0f1e2013-05-31 15:17:01 -05006223 return count;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07006224}
6225
Greg Kroah-Hartman7e9586b2018-12-21 08:54:38 +01006226static ssize_t remove_store(struct bus_type *bus, const char *buf, size_t count)
Ilya Dryomov9b60e702013-12-13 15:28:57 +02006227{
6228 if (single_major)
6229 return -EINVAL;
6230
6231 return do_rbd_remove(bus, buf, count);
6232}
6233
Greg Kroah-Hartman7e9586b2018-12-21 08:54:38 +01006234static ssize_t remove_single_major_store(struct bus_type *bus, const char *buf,
6235 size_t count)
Ilya Dryomov9b60e702013-12-13 15:28:57 +02006236{
6237 return do_rbd_remove(bus, buf, count);
6238}
6239
Yehuda Sadeh602adf42010-08-12 16:11:25 -07006240/*
6241 * create control files in sysfs
Yehuda Sadehdfc56062010-11-19 14:51:04 -08006242 * /sys/bus/rbd/...
Yehuda Sadeh602adf42010-08-12 16:11:25 -07006243 */
Chengguang Xu7d8dc532018-08-12 23:06:54 +08006244static int __init rbd_sysfs_init(void)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07006245{
Yehuda Sadehdfc56062010-11-19 14:51:04 -08006246 int ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07006247
Alex Elderfed4c142012-02-07 12:03:36 -06006248 ret = device_register(&rbd_root_dev);
Alex Elder21079782012-01-24 10:08:36 -06006249 if (ret < 0)
Yehuda Sadehdfc56062010-11-19 14:51:04 -08006250 return ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07006251
Alex Elderfed4c142012-02-07 12:03:36 -06006252 ret = bus_register(&rbd_bus_type);
6253 if (ret < 0)
6254 device_unregister(&rbd_root_dev);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07006255
Yehuda Sadeh602adf42010-08-12 16:11:25 -07006256 return ret;
6257}
6258
Chengguang Xu7d8dc532018-08-12 23:06:54 +08006259static void __exit rbd_sysfs_cleanup(void)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07006260{
Yehuda Sadehdfc56062010-11-19 14:51:04 -08006261 bus_unregister(&rbd_bus_type);
Alex Elderfed4c142012-02-07 12:03:36 -06006262 device_unregister(&rbd_root_dev);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07006263}
6264
Chengguang Xu7d8dc532018-08-12 23:06:54 +08006265static int __init rbd_slab_init(void)
Alex Elder1c2a9df2013-05-01 12:43:03 -05006266{
6267 rbd_assert(!rbd_img_request_cache);
Geliang Tang03d94402016-03-13 15:17:32 +08006268 rbd_img_request_cache = KMEM_CACHE(rbd_img_request, 0);
Alex Elder868311b2013-05-01 12:43:03 -05006269 if (!rbd_img_request_cache)
6270 return -ENOMEM;
6271
6272 rbd_assert(!rbd_obj_request_cache);
Geliang Tang03d94402016-03-13 15:17:32 +08006273 rbd_obj_request_cache = KMEM_CACHE(rbd_obj_request, 0);
Alex Elder78c2a442013-05-01 12:43:04 -05006274 if (!rbd_obj_request_cache)
6275 goto out_err;
6276
Ilya Dryomov6c696d82017-01-25 18:16:23 +01006277 return 0;
Alex Elder1c2a9df2013-05-01 12:43:03 -05006278
Ilya Dryomov6c696d82017-01-25 18:16:23 +01006279out_err:
Alex Elder868311b2013-05-01 12:43:03 -05006280 kmem_cache_destroy(rbd_img_request_cache);
6281 rbd_img_request_cache = NULL;
Alex Elder1c2a9df2013-05-01 12:43:03 -05006282 return -ENOMEM;
6283}
6284
6285static void rbd_slab_exit(void)
6286{
Alex Elder868311b2013-05-01 12:43:03 -05006287 rbd_assert(rbd_obj_request_cache);
6288 kmem_cache_destroy(rbd_obj_request_cache);
6289 rbd_obj_request_cache = NULL;
6290
Alex Elder1c2a9df2013-05-01 12:43:03 -05006291 rbd_assert(rbd_img_request_cache);
6292 kmem_cache_destroy(rbd_img_request_cache);
6293 rbd_img_request_cache = NULL;
6294}
6295
Alex Eldercc344fa2013-02-19 12:25:56 -06006296static int __init rbd_init(void)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07006297{
6298 int rc;
6299
Alex Elder1e32d342013-01-30 11:13:33 -06006300 if (!libceph_compatible(NULL)) {
6301 rbd_warn(NULL, "libceph incompatibility (quitting)");
Alex Elder1e32d342013-01-30 11:13:33 -06006302 return -EINVAL;
6303 }
Ilya Dryomove1b4d962013-12-13 15:28:57 +02006304
Alex Elder1c2a9df2013-05-01 12:43:03 -05006305 rc = rbd_slab_init();
Yehuda Sadeh602adf42010-08-12 16:11:25 -07006306 if (rc)
6307 return rc;
Ilya Dryomove1b4d962013-12-13 15:28:57 +02006308
Ilya Dryomovf5ee37b2014-10-09 17:06:01 +04006309 /*
6310 * The number of active work items is limited by the number of
Ilya Dryomovf77303b2015-04-22 18:28:13 +03006311 * rbd devices * queue depth, so leave @max_active at default.
Ilya Dryomovf5ee37b2014-10-09 17:06:01 +04006312 */
6313 rbd_wq = alloc_workqueue(RBD_DRV_NAME, WQ_MEM_RECLAIM, 0);
6314 if (!rbd_wq) {
6315 rc = -ENOMEM;
6316 goto err_out_slab;
6317 }
6318
Ilya Dryomov9b60e702013-12-13 15:28:57 +02006319 if (single_major) {
6320 rbd_major = register_blkdev(0, RBD_DRV_NAME);
6321 if (rbd_major < 0) {
6322 rc = rbd_major;
Ilya Dryomovf5ee37b2014-10-09 17:06:01 +04006323 goto err_out_wq;
Ilya Dryomov9b60e702013-12-13 15:28:57 +02006324 }
6325 }
6326
Alex Elder1c2a9df2013-05-01 12:43:03 -05006327 rc = rbd_sysfs_init();
6328 if (rc)
Ilya Dryomov9b60e702013-12-13 15:28:57 +02006329 goto err_out_blkdev;
Alex Elder1c2a9df2013-05-01 12:43:03 -05006330
Ilya Dryomov9b60e702013-12-13 15:28:57 +02006331 if (single_major)
6332 pr_info("loaded (major %d)\n", rbd_major);
6333 else
6334 pr_info("loaded\n");
6335
Ilya Dryomove1b4d962013-12-13 15:28:57 +02006336 return 0;
6337
Ilya Dryomov9b60e702013-12-13 15:28:57 +02006338err_out_blkdev:
6339 if (single_major)
6340 unregister_blkdev(rbd_major, RBD_DRV_NAME);
Ilya Dryomovf5ee37b2014-10-09 17:06:01 +04006341err_out_wq:
6342 destroy_workqueue(rbd_wq);
Ilya Dryomove1b4d962013-12-13 15:28:57 +02006343err_out_slab:
6344 rbd_slab_exit();
Alex Elder1c2a9df2013-05-01 12:43:03 -05006345 return rc;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07006346}
6347
Alex Eldercc344fa2013-02-19 12:25:56 -06006348static void __exit rbd_exit(void)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07006349{
Ilya Dryomovffe312c2014-05-20 15:46:04 +04006350 ida_destroy(&rbd_dev_id_ida);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07006351 rbd_sysfs_cleanup();
Ilya Dryomov9b60e702013-12-13 15:28:57 +02006352 if (single_major)
6353 unregister_blkdev(rbd_major, RBD_DRV_NAME);
Ilya Dryomovf5ee37b2014-10-09 17:06:01 +04006354 destroy_workqueue(rbd_wq);
Alex Elder1c2a9df2013-05-01 12:43:03 -05006355 rbd_slab_exit();
Yehuda Sadeh602adf42010-08-12 16:11:25 -07006356}
6357
/* Module entry and exit points. */
module_init(rbd_init);
module_exit(rbd_exit);

MODULE_AUTHOR("Alex Elder <elder@inktank.com>");
MODULE_AUTHOR("Sage Weil <sage@newdream.net>");
MODULE_AUTHOR("Yehuda Sadeh <yehuda@hq.newdream.net>");
/* following authorship retained from original osdblk.c */
MODULE_AUTHOR("Jeff Garzik <jeff@garzik.org>");

MODULE_DESCRIPTION("RADOS Block Device (RBD) driver");
MODULE_LICENSE("GPL");