
/*
   rbd.c -- Export ceph rados objects as a Linux block device


   based on drivers/block/osdblk.c:

   Copyright 2009 Red Hat, Inc.

   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; see the file COPYING.  If not, write to
   the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.



   For usage instructions, please refer to:

		 Documentation/ABI/testing/sysfs-bus-rbd

 */

#include <linux/ceph/libceph.h>
#include <linux/ceph/osd_client.h>
#include <linux/ceph/mon_client.h>
#include <linux/ceph/cls_lock_client.h>
#include <linux/ceph/striper.h>
#include <linux/ceph/decode.h>
#include <linux/parser.h>
#include <linux/bsearch.h>

#include <linux/kernel.h>
#include <linux/device.h>
#include <linux/module.h>
#include <linux/blk-mq.h>
#include <linux/fs.h>
#include <linux/blkdev.h>
#include <linux/slab.h>
#include <linux/idr.h>
#include <linux/workqueue.h>

#include "rbd_types.h"

#define RBD_DEBUG	/* Activate rbd_assert() calls */

/*
 * Increment the given counter and return its updated value.
 * If the counter is already 0 it will not be incremented, and
 * 0 is returned.  If the counter is already at its maximum value,
 * -EINVAL is returned without updating it.
 */
static int atomic_inc_return_safe(atomic_t *v)
{
	unsigned int counter;

	counter = (unsigned int)atomic_fetch_add_unless(v, 1, 0);
	if (counter <= (unsigned int)INT_MAX)
		return (int)counter;

	atomic_dec(v);

	return -EINVAL;
}

/* Decrement the counter.  Return the resulting value, or -EINVAL */
static int atomic_dec_return_safe(atomic_t *v)
{
	int counter;

	counter = atomic_dec_return(v);
	if (counter >= 0)
		return counter;

	atomic_inc(v);

	return -EINVAL;
}
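
/*
 * Illustrative sketch (not part of the driver): these helpers suit
 * reference counts that must stay pinned once they drop to 0, such as
 * rbd_dev->parent_ref below.  A caller might use them like:
 *
 *	if (atomic_inc_return_safe(&rbd_dev->parent_ref) > 0) {
 *		// reference successfully taken
 *	} else {
 *		// counter was 0 (or saturated) -- parent has gone away
 *	}
 */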

#define RBD_DRV_NAME "rbd"

#define RBD_MINORS_PER_MAJOR		256
#define RBD_SINGLE_MAJOR_PART_SHIFT	4

#define RBD_MAX_PARENT_CHAIN_LEN	16

#define RBD_SNAP_DEV_NAME_PREFIX	"snap_"
#define RBD_MAX_SNAP_NAME_LEN	\
			(NAME_MAX - (sizeof (RBD_SNAP_DEV_NAME_PREFIX) - 1))

#define RBD_MAX_SNAP_COUNT	510	/* allows max snapc to fit in 4KB */

#define RBD_SNAP_HEAD_NAME	"-"

#define	BAD_SNAP_INDEX	U32_MAX	/* invalid index into snap array */

/* This allows a single page to hold an image name sent by OSD */
#define RBD_IMAGE_NAME_LEN_MAX	(PAGE_SIZE - sizeof (__le32) - 1)
#define RBD_IMAGE_ID_LEN_MAX	64

#define RBD_OBJ_PREFIX_LEN_MAX	64

#define RBD_NOTIFY_TIMEOUT	5	/* seconds */
#define RBD_RETRY_DELAY		msecs_to_jiffies(1000)

/* Feature bits */

#define RBD_FEATURE_LAYERING		(1ULL<<0)
#define RBD_FEATURE_STRIPINGV2		(1ULL<<1)
#define RBD_FEATURE_EXCLUSIVE_LOCK	(1ULL<<2)
#define RBD_FEATURE_DATA_POOL		(1ULL<<7)
#define RBD_FEATURE_OPERATIONS		(1ULL<<8)

#define RBD_FEATURES_ALL	(RBD_FEATURE_LAYERING |		\
				 RBD_FEATURE_STRIPINGV2 |	\
				 RBD_FEATURE_EXCLUSIVE_LOCK |	\
				 RBD_FEATURE_DATA_POOL |	\
				 RBD_FEATURE_OPERATIONS)

/* Features supported by this (client software) implementation. */

#define RBD_FEATURES_SUPPORTED	(RBD_FEATURES_ALL)
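
/*
 * Worked example (informational): with the bits above, RBD_FEATURES_ALL
 * evaluates to 0x1 | 0x2 | 0x4 | 0x80 | 0x100 == 0x187, so an image
 * using any feature bit outside that mask cannot be mapped by this
 * driver.
 */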
Alex Elderd8891402012-10-09 13:50:17 -0700130
Alex Elder81a89792012-02-02 08:13:30 -0600131/*
132 * An RBD device name will be "rbd#", where the "rbd" comes from
133 * RBD_DRV_NAME above, and # is a unique integer identifier.
Alex Elder81a89792012-02-02 08:13:30 -0600134 */
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700135#define DEV_NAME_LEN 32
136
137/*
138 * block device image metadata (in-memory version)
139 */
140struct rbd_image_header {
Alex Elderf35a4de2013-05-06 09:51:29 -0500141 /* These six fields never change for a given rbd image */
Alex Elder849b4262012-07-09 21:04:24 -0500142 char *object_prefix;
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700143 __u8 obj_order;
Alex Elderf35a4de2013-05-06 09:51:29 -0500144 u64 stripe_unit;
145 u64 stripe_count;
Ilya Dryomov7e973322017-01-25 18:16:22 +0100146 s64 data_pool_id;
Alex Elderf35a4de2013-05-06 09:51:29 -0500147 u64 features; /* Might be changeable someday? */
Yehuda Sadeh602adf42010-08-12 16:11:25 -0700148
Alex Elderf84344f2012-08-31 17:29:51 -0500149 /* The remaining fields need to be updated occasionally */
150 u64 image_size;
151 struct ceph_snap_context *snapc;
Alex Elderf35a4de2013-05-06 09:51:29 -0500152 char *snap_names; /* format 1 only */
153 u64 *snap_sizes; /* format 1 only */
Yehuda Sadeh59c2be12011-03-21 15:10:11 -0700154};
155
/*
 * An rbd image specification.
 *
 * The tuple (pool_id, image_id, snap_id) is sufficient to uniquely
 * identify an image.  Each rbd_dev structure includes a pointer to
 * an rbd_spec structure that encapsulates this identity.
 *
 * Each of the id's in an rbd_spec has an associated name.  For a
 * user-mapped image, the names are supplied and the id's associated
 * with them are looked up.  For a layered image, a parent image is
 * defined by the tuple, and the names are looked up.
 *
 * An rbd_dev structure contains a parent_spec pointer which is
 * non-null if the image it represents is a child in a layered
 * image.  This pointer will refer to the rbd_spec structure used
 * by the parent rbd_dev for its own identity (i.e., the structure
 * is shared between the parent and child).
 *
 * Since these structures are populated once, during the discovery
 * phase of image construction, they are effectively immutable so
 * we make no effort to synchronize access to them.
 *
 * Note that code herein does not assume the image name is known (it
 * could be a null pointer).
 */
struct rbd_spec {
	u64		pool_id;
	const char	*pool_name;
	const char	*pool_ns;	/* NULL if default, never "" */

	const char	*image_id;
	const char	*image_name;

	u64		snap_id;
	const char	*snap_name;

	struct kref	kref;
};
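
/*
 * Example (illustrative only): mapping the image "foo" in pool "rbd"
 * at its HEAD would yield a spec along the lines of
 * { .pool_name = "rbd", .image_name = "foo", .snap_id = CEPH_NOSNAP,
 *   .snap_name = "-" }, with pool_id/image_id filled in by lookup.
 */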

/*
 * an instance of the client.  multiple devices may share an rbd client.
 */
struct rbd_client {
	struct ceph_client	*client;
	struct kref		kref;
	struct list_head	node;
};

struct rbd_img_request;

enum obj_request_type {
	OBJ_REQUEST_NODATA = 1,
	OBJ_REQUEST_BIO,	/* pointer into provided bio (list) */
	OBJ_REQUEST_BVECS,	/* pointer into provided bio_vec array */
	OBJ_REQUEST_OWN_BVECS,	/* private bio_vec array, doesn't own pages */
};

enum obj_operation_type {
	OBJ_OP_READ = 1,
	OBJ_OP_WRITE,
	OBJ_OP_DISCARD,
	OBJ_OP_ZEROOUT,
};

/*
 * Writes go through the following state machine to deal with
 * layering:
 *
 *                       need copyup
 * RBD_OBJ_WRITE_GUARD ---------------> RBD_OBJ_WRITE_COPYUP
 *            |     ^                              |
 *            v     \------------------------------/
 *          done
 *            ^
 *            |
 * RBD_OBJ_WRITE_FLAT
 *
 * Writes start in RBD_OBJ_WRITE_GUARD or _FLAT, depending on whether
 * there is a parent or not.
 */
enum rbd_obj_write_state {
	RBD_OBJ_WRITE_FLAT = 1,
	RBD_OBJ_WRITE_GUARD,
	RBD_OBJ_WRITE_COPYUP,
};
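
/*
 * Informal trace of the common paths (illustrative, not exhaustive):
 * a guarded write whose target object does not exist yet fails with
 * -ENOENT and moves GUARD -> COPYUP, where the covered range is read
 * from the parent and resubmitted together with the new data; a write
 * to an image with no parent completes directly from FLAT.
 */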

struct rbd_obj_request {
	struct ceph_object_extent ex;
	union {
		bool			tried_parent;	/* for reads */
		enum rbd_obj_write_state write_state;	/* for writes */
	};

	struct rbd_img_request	*img_request;
	struct ceph_file_extent	*img_extents;
	u32			num_img_extents;

	union {
		struct ceph_bio_iter	bio_pos;
		struct {
			struct ceph_bvec_iter	bvec_pos;
			u32			bvec_count;
			u32			bvec_idx;
		};
	};
	struct bio_vec		*copyup_bvecs;
	u32			copyup_bvec_count;

	struct ceph_osd_request	*osd_req;

	u64			xferred;	/* bytes transferred */
	int			result;

	struct kref		kref;
};

enum img_req_flags {
	IMG_REQ_CHILD,		/* initiator: block = 0, child image = 1 */
	IMG_REQ_LAYERED,	/* ENOENT handling: normal = 0, layered = 1 */
};

struct rbd_img_request {
	struct rbd_device	*rbd_dev;
	enum obj_operation_type	op_type;
	enum obj_request_type	data_type;
	unsigned long		flags;
	union {
		u64			snap_id;	/* for reads */
		struct ceph_snap_context *snapc;	/* for writes */
	};
	union {
		struct request		*rq;		/* block request */
		struct rbd_obj_request	*obj_request;	/* obj req initiator */
	};
	spinlock_t		completion_lock;
	u64			xferred;/* aggregate bytes transferred */
	int			result;	/* first nonzero obj_request result */

	struct list_head	object_extents;	/* obj_req.ex structs */
	u32			pending_count;

	struct kref		kref;
};

#define for_each_obj_request(ireq, oreq) \
	list_for_each_entry(oreq, &(ireq)->object_extents, ex.oe_item)
#define for_each_obj_request_safe(ireq, oreq, n) \
	list_for_each_entry_safe(oreq, n, &(ireq)->object_extents, ex.oe_item)

enum rbd_watch_state {
	RBD_WATCH_STATE_UNREGISTERED,
	RBD_WATCH_STATE_REGISTERED,
	RBD_WATCH_STATE_ERROR,
};

enum rbd_lock_state {
	RBD_LOCK_STATE_UNLOCKED,
	RBD_LOCK_STATE_LOCKED,
	RBD_LOCK_STATE_RELEASING,
};

/* WatchNotify::ClientId */
struct rbd_client_id {
	u64 gid;
	u64 handle;
};

struct rbd_mapping {
	u64			size;
	u64			features;
};

/*
 * a single device
 */
struct rbd_device {
	int			dev_id;		/* blkdev unique id */

	int			major;		/* blkdev assigned major */
	int			minor;
	struct gendisk		*disk;		/* blkdev's gendisk and rq */

	u32			image_format;	/* Either 1 or 2 */
	struct rbd_client	*rbd_client;

	char			name[DEV_NAME_LEN]; /* blkdev name, e.g. rbd3 */

	spinlock_t		lock;		/* queue, flags, open_count */

	struct rbd_image_header	header;
	unsigned long		flags;		/* possibly lock protected */
	struct rbd_spec		*spec;
	struct rbd_options	*opts;
	char			*config_info;	/* add{,_single_major} string */

	struct ceph_object_id	header_oid;
	struct ceph_object_locator header_oloc;

	struct ceph_file_layout	layout;		/* used for all rbd requests */

	struct mutex		watch_mutex;
	enum rbd_watch_state	watch_state;
	struct ceph_osd_linger_request *watch_handle;
	u64			watch_cookie;
	struct delayed_work	watch_dwork;

	struct rw_semaphore	lock_rwsem;
	enum rbd_lock_state	lock_state;
	char			lock_cookie[32];
	struct rbd_client_id	owner_cid;
	struct work_struct	acquired_lock_work;
	struct work_struct	released_lock_work;
	struct delayed_work	lock_dwork;
	struct work_struct	unlock_work;
	wait_queue_head_t	lock_waitq;

	struct workqueue_struct	*task_wq;

	struct rbd_spec		*parent_spec;
	u64			parent_overlap;
	atomic_t		parent_ref;
	struct rbd_device	*parent;

	/* Block layer tags. */
	struct blk_mq_tag_set	tag_set;

	/* protects updating the header */
	struct rw_semaphore	header_rwsem;

	struct rbd_mapping	mapping;

	struct list_head	node;

	/* sysfs related */
	struct device		dev;
	unsigned long		open_count;	/* protected by lock */
};

/*
 * Flag bits for rbd_dev->flags:
 * - REMOVING (which is coupled with rbd_dev->open_count) is protected
 *   by rbd_dev->lock
 * - BLACKLISTED is protected by rbd_dev->lock_rwsem
 */
enum rbd_dev_flags {
	RBD_DEV_FLAG_EXISTS,		/* mapped snapshot has not been deleted */
	RBD_DEV_FLAG_REMOVING,		/* this mapping is being removed */
	RBD_DEV_FLAG_BLACKLISTED,	/* our ceph_client is blacklisted */
};

static DEFINE_MUTEX(client_mutex);	/* Serialize client creation */

static LIST_HEAD(rbd_dev_list);		/* devices */
static DEFINE_SPINLOCK(rbd_dev_list_lock);

static LIST_HEAD(rbd_client_list);	/* clients */
static DEFINE_SPINLOCK(rbd_client_list_lock);

/* Slab caches for frequently-allocated structures */

static struct kmem_cache	*rbd_img_request_cache;
static struct kmem_cache	*rbd_obj_request_cache;

static int rbd_major;
static DEFINE_IDA(rbd_dev_id_ida);

static struct workqueue_struct *rbd_wq;

/*
 * single-major requires >= 0.75 version of userspace rbd utility.
 */
static bool single_major = true;
module_param(single_major, bool, 0444);
MODULE_PARM_DESC(single_major, "Use a single major number for all rbd devices (default: true)");

static ssize_t rbd_add(struct bus_type *bus, const char *buf,
		       size_t count);
static ssize_t rbd_remove(struct bus_type *bus, const char *buf,
			  size_t count);
static ssize_t rbd_add_single_major(struct bus_type *bus, const char *buf,
				    size_t count);
static ssize_t rbd_remove_single_major(struct bus_type *bus, const char *buf,
				       size_t count);
static int rbd_dev_image_probe(struct rbd_device *rbd_dev, int depth);

static int rbd_dev_id_to_minor(int dev_id)
{
	return dev_id << RBD_SINGLE_MAJOR_PART_SHIFT;
}

static int minor_to_rbd_dev_id(int minor)
{
	return minor >> RBD_SINGLE_MAJOR_PART_SHIFT;
}
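
/*
 * Worked example (informational): with RBD_SINGLE_MAJOR_PART_SHIFT == 4,
 * each device owns 16 minors, so dev_id 3 maps to minor 48 (partitions
 * use minors 49-63) and any minor in 48-63 maps back to dev_id 3.
 */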

static bool __rbd_is_lock_owner(struct rbd_device *rbd_dev)
{
	return rbd_dev->lock_state == RBD_LOCK_STATE_LOCKED ||
	       rbd_dev->lock_state == RBD_LOCK_STATE_RELEASING;
}

static bool rbd_is_lock_owner(struct rbd_device *rbd_dev)
{
	bool is_lock_owner;

	down_read(&rbd_dev->lock_rwsem);
	is_lock_owner = __rbd_is_lock_owner(rbd_dev);
	up_read(&rbd_dev->lock_rwsem);
	return is_lock_owner;
}

static ssize_t rbd_supported_features_show(struct bus_type *bus, char *buf)
{
	return sprintf(buf, "0x%llx\n", RBD_FEATURES_SUPPORTED);
}
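
/*
 * Usage sketch (userspace, illustrative): the mask is exported via
 * sysfs, e.g. "cat /sys/bus/rbd/supported_features" prints the
 * RBD_FEATURES_SUPPORTED value in hex.
 */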

static BUS_ATTR(add, 0200, NULL, rbd_add);
static BUS_ATTR(remove, 0200, NULL, rbd_remove);
static BUS_ATTR(add_single_major, 0200, NULL, rbd_add_single_major);
static BUS_ATTR(remove_single_major, 0200, NULL, rbd_remove_single_major);
static BUS_ATTR(supported_features, 0444, rbd_supported_features_show, NULL);

static struct attribute *rbd_bus_attrs[] = {
	&bus_attr_add.attr,
	&bus_attr_remove.attr,
	&bus_attr_add_single_major.attr,
	&bus_attr_remove_single_major.attr,
	&bus_attr_supported_features.attr,
	NULL,
};

static umode_t rbd_bus_is_visible(struct kobject *kobj,
				  struct attribute *attr, int index)
{
	if (!single_major &&
	    (attr == &bus_attr_add_single_major.attr ||
	     attr == &bus_attr_remove_single_major.attr))
		return 0;

	return attr->mode;
}

static const struct attribute_group rbd_bus_group = {
	.attrs = rbd_bus_attrs,
	.is_visible = rbd_bus_is_visible,
};
__ATTRIBUTE_GROUPS(rbd_bus);

static struct bus_type rbd_bus_type = {
	.name		= "rbd",
	.bus_groups	= rbd_bus_groups,
};

static void rbd_root_dev_release(struct device *dev)
{
}

static struct device rbd_root_dev = {
	.init_name =    "rbd",
	.release =      rbd_root_dev_release,
};

static __printf(2, 3)
void rbd_warn(struct rbd_device *rbd_dev, const char *fmt, ...)
{
	struct va_format vaf;
	va_list args;

	va_start(args, fmt);
	vaf.fmt = fmt;
	vaf.va = &args;

	if (!rbd_dev)
		printk(KERN_WARNING "%s: %pV\n", RBD_DRV_NAME, &vaf);
	else if (rbd_dev->disk)
		printk(KERN_WARNING "%s: %s: %pV\n",
			RBD_DRV_NAME, rbd_dev->disk->disk_name, &vaf);
	else if (rbd_dev->spec && rbd_dev->spec->image_name)
		printk(KERN_WARNING "%s: image %s: %pV\n",
			RBD_DRV_NAME, rbd_dev->spec->image_name, &vaf);
	else if (rbd_dev->spec && rbd_dev->spec->image_id)
		printk(KERN_WARNING "%s: id %s: %pV\n",
			RBD_DRV_NAME, rbd_dev->spec->image_id, &vaf);
	else	/* punt */
		printk(KERN_WARNING "%s: rbd_dev %p: %pV\n",
			RBD_DRV_NAME, rbd_dev, &vaf);
	va_end(args);
}
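
/*
 * Example output (illustrative): once a disk exists, a call such as
 * rbd_warn(rbd_dev, "no lock owners detected") is logged as
 * "rbd: rbd0: no lock owners detected"; before the disk is set up, the
 * image name or id is used as the prefix instead.
 */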

#ifdef RBD_DEBUG
#define rbd_assert(expr)						\
		if (unlikely(!(expr))) {				\
			printk(KERN_ERR "\nAssertion failure in %s() "	\
						"at line %d:\n\n"	\
					"\trbd_assert(%s);\n\n",	\
					__func__, __LINE__, #expr);	\
			BUG();						\
		}
#else /* !RBD_DEBUG */
# define rbd_assert(expr)	((void) 0)
#endif /* !RBD_DEBUG */

static void rbd_dev_remove_parent(struct rbd_device *rbd_dev);

static int rbd_dev_refresh(struct rbd_device *rbd_dev);
static int rbd_dev_v2_header_onetime(struct rbd_device *rbd_dev);
static int rbd_dev_header_info(struct rbd_device *rbd_dev);
static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev);
static const char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev,
					u64 snap_id);
static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id,
				 u8 *order, u64 *snap_size);
static int _rbd_dev_v2_snap_features(struct rbd_device *rbd_dev, u64 snap_id,
				     u64 *snap_features);

static int rbd_open(struct block_device *bdev, fmode_t mode)
{
	struct rbd_device *rbd_dev = bdev->bd_disk->private_data;
	bool removing = false;

	spin_lock_irq(&rbd_dev->lock);
	if (test_bit(RBD_DEV_FLAG_REMOVING, &rbd_dev->flags))
		removing = true;
	else
		rbd_dev->open_count++;
	spin_unlock_irq(&rbd_dev->lock);
	if (removing)
		return -ENOENT;

	(void) get_device(&rbd_dev->dev);

	return 0;
}

static void rbd_release(struct gendisk *disk, fmode_t mode)
{
	struct rbd_device *rbd_dev = disk->private_data;
	unsigned long open_count_before;

	spin_lock_irq(&rbd_dev->lock);
	open_count_before = rbd_dev->open_count--;
	spin_unlock_irq(&rbd_dev->lock);
	rbd_assert(open_count_before > 0);

	put_device(&rbd_dev->dev);
}

static int rbd_ioctl_set_ro(struct rbd_device *rbd_dev, unsigned long arg)
{
	int ro;

	if (get_user(ro, (int __user *)arg))
		return -EFAULT;

	/* Snapshots can't be marked read-write */
	if (rbd_dev->spec->snap_id != CEPH_NOSNAP && !ro)
		return -EROFS;

	/* Let blkdev_roset() handle it */
	return -ENOTTY;
}

static int rbd_ioctl(struct block_device *bdev, fmode_t mode,
			unsigned int cmd, unsigned long arg)
{
	struct rbd_device *rbd_dev = bdev->bd_disk->private_data;
	int ret;

	switch (cmd) {
	case BLKROSET:
		ret = rbd_ioctl_set_ro(rbd_dev, arg);
		break;
	default:
		ret = -ENOTTY;
	}

	return ret;
}

#ifdef CONFIG_COMPAT
static int rbd_compat_ioctl(struct block_device *bdev, fmode_t mode,
				unsigned int cmd, unsigned long arg)
{
	return rbd_ioctl(bdev, mode, cmd, arg);
}
#endif /* CONFIG_COMPAT */

static const struct block_device_operations rbd_bd_ops = {
	.owner			= THIS_MODULE,
	.open			= rbd_open,
	.release		= rbd_release,
	.ioctl			= rbd_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl		= rbd_compat_ioctl,
#endif
};

/*
 * Initialize an rbd client instance.  Success or not, this function
 * consumes ceph_opts.  Caller holds client_mutex.
 */
static struct rbd_client *rbd_client_create(struct ceph_options *ceph_opts)
{
	struct rbd_client *rbdc;
	int ret = -ENOMEM;

	dout("%s:\n", __func__);
	rbdc = kmalloc(sizeof(struct rbd_client), GFP_KERNEL);
	if (!rbdc)
		goto out_opt;

	kref_init(&rbdc->kref);
	INIT_LIST_HEAD(&rbdc->node);

	rbdc->client = ceph_create_client(ceph_opts, rbdc);
	if (IS_ERR(rbdc->client))
		goto out_rbdc;
	ceph_opts = NULL; /* Now rbdc->client is responsible for ceph_opts */

	ret = ceph_open_session(rbdc->client);
	if (ret < 0)
		goto out_client;

	spin_lock(&rbd_client_list_lock);
	list_add_tail(&rbdc->node, &rbd_client_list);
	spin_unlock(&rbd_client_list_lock);

	dout("%s: rbdc %p\n", __func__, rbdc);

	return rbdc;
out_client:
	ceph_destroy_client(rbdc->client);
out_rbdc:
	kfree(rbdc);
out_opt:
	if (ceph_opts)
		ceph_destroy_options(ceph_opts);
	dout("%s: error %d\n", __func__, ret);

	return ERR_PTR(ret);
}

static struct rbd_client *__rbd_get_client(struct rbd_client *rbdc)
{
	kref_get(&rbdc->kref);

	return rbdc;
}

/*
 * Find a ceph client with specific addr and configuration.  If
 * found, bump its reference count.
 */
static struct rbd_client *rbd_client_find(struct ceph_options *ceph_opts)
{
	struct rbd_client *client_node;
	bool found = false;

	if (ceph_opts->flags & CEPH_OPT_NOSHARE)
		return NULL;

	spin_lock(&rbd_client_list_lock);
	list_for_each_entry(client_node, &rbd_client_list, node) {
		if (!ceph_compare_options(ceph_opts, client_node->client)) {
			__rbd_get_client(client_node);

			found = true;
			break;
		}
	}
	spin_unlock(&rbd_client_list_lock);

	return found ? client_node : NULL;
}

/*
 * (Per device) rbd map options
 */
enum {
	Opt_queue_depth,
	Opt_lock_timeout,
	Opt_last_int,
	/* int args above */
	Opt_pool_ns,
	Opt_last_string,
	/* string args above */
	Opt_read_only,
	Opt_read_write,
	Opt_lock_on_read,
	Opt_exclusive,
	Opt_notrim,
	Opt_err
};

static match_table_t rbd_opts_tokens = {
	{Opt_queue_depth, "queue_depth=%d"},
	{Opt_lock_timeout, "lock_timeout=%d"},
	/* int args above */
	{Opt_pool_ns, "_pool_ns=%s"},
	/* string args above */
	{Opt_read_only, "read_only"},
	{Opt_read_only, "ro"},		/* Alternate spelling */
	{Opt_read_write, "read_write"},
	{Opt_read_write, "rw"},		/* Alternate spelling */
	{Opt_lock_on_read, "lock_on_read"},
	{Opt_exclusive, "exclusive"},
	{Opt_notrim, "notrim"},
	{Opt_err, NULL}
};

struct rbd_options {
	int	queue_depth;
	unsigned long	lock_timeout;
	bool	read_only;
	bool	lock_on_read;
	bool	exclusive;
	bool	trim;
};

#define RBD_QUEUE_DEPTH_DEFAULT	BLKDEV_MAX_RQ
#define RBD_LOCK_TIMEOUT_DEFAULT 0  /* no timeout */
#define RBD_READ_ONLY_DEFAULT	false
#define RBD_LOCK_ON_READ_DEFAULT false
#define RBD_EXCLUSIVE_DEFAULT	false
#define RBD_TRIM_DEFAULT	true

struct parse_rbd_opts_ctx {
	struct rbd_spec		*spec;
	struct rbd_options	*opts;
};

static int parse_rbd_opts_token(char *c, void *private)
{
	struct parse_rbd_opts_ctx *pctx = private;
	substring_t argstr[MAX_OPT_ARGS];
	int token, intval, ret;

	token = match_token(c, rbd_opts_tokens, argstr);
	if (token < Opt_last_int) {
		ret = match_int(&argstr[0], &intval);
		if (ret < 0) {
			pr_err("bad option arg (not int) at '%s'\n", c);
			return ret;
		}
		dout("got int token %d val %d\n", token, intval);
	} else if (token > Opt_last_int && token < Opt_last_string) {
		dout("got string token %d val %s\n", token, argstr[0].from);
	} else {
		dout("got token %d\n", token);
	}

	switch (token) {
	case Opt_queue_depth:
		if (intval < 1) {
			pr_err("queue_depth out of range\n");
			return -EINVAL;
		}
		pctx->opts->queue_depth = intval;
		break;
	case Opt_lock_timeout:
		/* 0 is "wait forever" (i.e. infinite timeout) */
		if (intval < 0 || intval > INT_MAX / 1000) {
			pr_err("lock_timeout out of range\n");
			return -EINVAL;
		}
		pctx->opts->lock_timeout = msecs_to_jiffies(intval * 1000);
		break;
	case Opt_pool_ns:
		kfree(pctx->spec->pool_ns);
		pctx->spec->pool_ns = match_strdup(argstr);
		if (!pctx->spec->pool_ns)
			return -ENOMEM;
		break;
	case Opt_read_only:
		pctx->opts->read_only = true;
		break;
	case Opt_read_write:
		pctx->opts->read_only = false;
		break;
	case Opt_lock_on_read:
		pctx->opts->lock_on_read = true;
		break;
	case Opt_exclusive:
		pctx->opts->exclusive = true;
		break;
	case Opt_notrim:
		pctx->opts->trim = false;
		break;
	default:
		/* libceph prints "bad option" msg */
		return -EINVAL;
	}

	return 0;
}
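
/*
 * Example (illustrative): a map request along the lines of
 *	rbd map mypool/myimage -o queue_depth=128,lock_on_read,notrim
 * hands each comma-separated token to this function; queue_depth goes
 * through the integer branch above, the other two are plain flag
 * tokens.
 */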

static char* obj_op_name(enum obj_operation_type op_type)
{
	switch (op_type) {
	case OBJ_OP_READ:
		return "read";
	case OBJ_OP_WRITE:
		return "write";
	case OBJ_OP_DISCARD:
		return "discard";
	case OBJ_OP_ZEROOUT:
		return "zeroout";
	default:
		return "???";
	}
}

/*
 * Destroy ceph client
 *
 * rbd_client_list_lock is taken here; the caller must not hold it.
 */
static void rbd_client_release(struct kref *kref)
{
	struct rbd_client *rbdc = container_of(kref, struct rbd_client, kref);

	dout("%s: rbdc %p\n", __func__, rbdc);
	spin_lock(&rbd_client_list_lock);
	list_del(&rbdc->node);
	spin_unlock(&rbd_client_list_lock);

	ceph_destroy_client(rbdc->client);
	kfree(rbdc);
}

/*
 * Drop reference to ceph client node. If it's not referenced anymore, release
 * it.
 */
static void rbd_put_client(struct rbd_client *rbdc)
{
	if (rbdc)
		kref_put(&rbdc->kref, rbd_client_release);
}

static int wait_for_latest_osdmap(struct ceph_client *client)
{
	u64 newest_epoch;
	int ret;

	ret = ceph_monc_get_version(&client->monc, "osdmap", &newest_epoch);
	if (ret)
		return ret;

	if (client->osdc.osdmap->epoch >= newest_epoch)
		return 0;

	ceph_osdc_maybe_request_map(&client->osdc);
	return ceph_monc_wait_osdmap(&client->monc, newest_epoch,
				     client->options->mount_timeout);
}

/*
 * Get a ceph client with specific addr and configuration, if one does
 * not exist create it.  Either way, ceph_opts is consumed by this
 * function.
 */
static struct rbd_client *rbd_get_client(struct ceph_options *ceph_opts)
{
	struct rbd_client *rbdc;
	int ret;

	mutex_lock_nested(&client_mutex, SINGLE_DEPTH_NESTING);
	rbdc = rbd_client_find(ceph_opts);
	if (rbdc) {
		ceph_destroy_options(ceph_opts);

		/*
		 * Using an existing client.  Make sure ->pg_pools is up to
		 * date before we look up the pool id in do_rbd_add().
		 */
		ret = wait_for_latest_osdmap(rbdc->client);
		if (ret) {
			rbd_warn(NULL, "failed to get latest osdmap: %d", ret);
			rbd_put_client(rbdc);
			rbdc = ERR_PTR(ret);
		}
	} else {
		rbdc = rbd_client_create(ceph_opts);
	}
	mutex_unlock(&client_mutex);

	return rbdc;
}

static bool rbd_image_format_valid(u32 image_format)
{
	return image_format == 1 || image_format == 2;
}

static bool rbd_dev_ondisk_valid(struct rbd_image_header_ondisk *ondisk)
{
	size_t size;
	u32 snap_count;

	/* The header has to start with the magic rbd header text */
	if (memcmp(&ondisk->text, RBD_HEADER_TEXT, sizeof (RBD_HEADER_TEXT)))
		return false;

	/* The bio layer requires at least sector-sized I/O */

	if (ondisk->options.order < SECTOR_SHIFT)
		return false;

	/* If we use u64 in a few spots we may be able to loosen this */

	if (ondisk->options.order > 8 * sizeof (int) - 1)
		return false;

	/*
	 * The size of a snapshot header has to fit in a size_t, and
	 * that limits the number of snapshots.
	 */
	snap_count = le32_to_cpu(ondisk->snap_count);
	size = SIZE_MAX - sizeof (struct ceph_snap_context);
	if (snap_count > size / sizeof (__le64))
		return false;

	/*
	 * Not only that, but the size of the entire snapshot
	 * header must also be representable in a size_t.
	 */
	size -= snap_count * sizeof (__le64);
	if ((u64) size < le64_to_cpu(ondisk->snap_names_len))
		return false;

	return true;
}
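
/*
 * Example bounds (informational): with SECTOR_SHIFT == 9 and a 32-bit
 * int, the object order must lie in [9, 31]; the rbd default of
 * order 22 (4 MiB objects) passes comfortably.
 */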

/*
 * returns the size of an object in the image
 */
static u32 rbd_obj_bytes(struct rbd_image_header *header)
{
	return 1U << header->obj_order;
}

static void rbd_init_layout(struct rbd_device *rbd_dev)
{
	if (rbd_dev->header.stripe_unit == 0 ||
	    rbd_dev->header.stripe_count == 0) {
		rbd_dev->header.stripe_unit = rbd_obj_bytes(&rbd_dev->header);
		rbd_dev->header.stripe_count = 1;
	}

	rbd_dev->layout.stripe_unit = rbd_dev->header.stripe_unit;
	rbd_dev->layout.stripe_count = rbd_dev->header.stripe_count;
	rbd_dev->layout.object_size = rbd_obj_bytes(&rbd_dev->header);
	rbd_dev->layout.pool_id = rbd_dev->header.data_pool_id == CEPH_NOPOOL ?
			  rbd_dev->spec->pool_id : rbd_dev->header.data_pool_id;
	RCU_INIT_POINTER(rbd_dev->layout.pool_ns, NULL);
}
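
/*
 * Example (informational): for an image created without fancy striping
 * and the default order of 22, this yields stripe_unit == object_size
 * == 4 MiB and stripe_count == 1, i.e. one backing object per 4 MiB of
 * image.
 */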

/*
 * Fill an rbd image header with information from the given format 1
 * on-disk header.
 */
static int rbd_header_from_disk(struct rbd_device *rbd_dev,
				 struct rbd_image_header_ondisk *ondisk)
{
	struct rbd_image_header *header = &rbd_dev->header;
	bool first_time = header->object_prefix == NULL;
	struct ceph_snap_context *snapc;
	char *object_prefix = NULL;
	char *snap_names = NULL;
	u64 *snap_sizes = NULL;
	u32 snap_count;
	int ret = -ENOMEM;
	u32 i;

	/* Allocate this now to avoid having to handle failure below */

	if (first_time) {
		object_prefix = kstrndup(ondisk->object_prefix,
					 sizeof(ondisk->object_prefix),
					 GFP_KERNEL);
		if (!object_prefix)
			return -ENOMEM;
	}

	/* Allocate the snapshot context and fill it in */

	snap_count = le32_to_cpu(ondisk->snap_count);
	snapc = ceph_create_snap_context(snap_count, GFP_KERNEL);
	if (!snapc)
		goto out_err;
	snapc->seq = le64_to_cpu(ondisk->snap_seq);
	if (snap_count) {
		struct rbd_image_snap_ondisk *snaps;
		u64 snap_names_len = le64_to_cpu(ondisk->snap_names_len);

		/* We'll keep a copy of the snapshot names... */

		if (snap_names_len > (u64)SIZE_MAX)
			goto out_2big;
		snap_names = kmalloc(snap_names_len, GFP_KERNEL);
		if (!snap_names)
			goto out_err;

		/* ...as well as the array of their sizes. */
		snap_sizes = kmalloc_array(snap_count,
					   sizeof(*header->snap_sizes),
					   GFP_KERNEL);
		if (!snap_sizes)
			goto out_err;

		/*
		 * Copy the names, and fill in each snapshot's id
		 * and size.
		 *
		 * Note that rbd_dev_v1_header_info() guarantees the
		 * ondisk buffer we're working with has
		 * snap_names_len bytes beyond the end of the
		 * snapshot id array, so this memcpy() is safe.
		 */
		memcpy(snap_names, &ondisk->snaps[snap_count], snap_names_len);
		snaps = ondisk->snaps;
		for (i = 0; i < snap_count; i++) {
			snapc->snaps[i] = le64_to_cpu(snaps[i].id);
			snap_sizes[i] = le64_to_cpu(snaps[i].image_size);
		}
	}

	/* We won't fail any more, fill in the header */

	if (first_time) {
		header->object_prefix = object_prefix;
		header->obj_order = ondisk->options.order;
		rbd_init_layout(rbd_dev);
	} else {
		ceph_put_snap_context(header->snapc);
		kfree(header->snap_names);
		kfree(header->snap_sizes);
	}

	/* The remaining fields always get updated (when we refresh) */

	header->image_size = le64_to_cpu(ondisk->image_size);
	header->snapc = snapc;
	header->snap_names = snap_names;
	header->snap_sizes = snap_sizes;

	return 0;
out_2big:
	ret = -EIO;
out_err:
	kfree(snap_sizes);
	kfree(snap_names);
	ceph_put_snap_context(snapc);
	kfree(object_prefix);

	return ret;
}

static const char *_rbd_dev_v1_snap_name(struct rbd_device *rbd_dev, u32 which)
{
	const char *snap_name;

	rbd_assert(which < rbd_dev->header.snapc->num_snaps);

	/* Skip over names until we find the one we are looking for */

	snap_name = rbd_dev->header.snap_names;
	while (which--)
		snap_name += strlen(snap_name) + 1;

	return kstrdup(snap_name, GFP_KERNEL);
}

/*
 * Snapshot id comparison function for use with qsort()/bsearch().
 * Note that result is for snapshots in *descending* order.
 */
static int snapid_compare_reverse(const void *s1, const void *s2)
{
	u64 snap_id1 = *(u64 *)s1;
	u64 snap_id2 = *(u64 *)s2;

	if (snap_id1 < snap_id2)
		return 1;
	return snap_id1 == snap_id2 ? 0 : -1;
}

/*
 * Search a snapshot context to see if the given snapshot id is
 * present.
 *
 * Returns the position of the snapshot id in the array if it's found,
 * or BAD_SNAP_INDEX otherwise.
 *
 * Note: The snapshot array is kept sorted (by the osd) in
 * reverse order, highest snapshot id first.
 */
static u32 rbd_dev_snap_index(struct rbd_device *rbd_dev, u64 snap_id)
{
	struct ceph_snap_context *snapc = rbd_dev->header.snapc;
	u64 *found;

	found = bsearch(&snap_id, &snapc->snaps, snapc->num_snaps,
				sizeof (snap_id), snapid_compare_reverse);

	return found ? (u32)(found - &snapc->snaps[0]) : BAD_SNAP_INDEX;
}
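
/*
 * Worked example (informational): with snapc->snaps == { 12, 7, 3 },
 * looking up snap_id 7 returns index 1, while snap_id 5 returns
 * BAD_SNAP_INDEX because it is not present in the descending array.
 */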

static const char *rbd_dev_v1_snap_name(struct rbd_device *rbd_dev,
					u64 snap_id)
{
	u32 which;
	const char *snap_name;

	which = rbd_dev_snap_index(rbd_dev, snap_id);
	if (which == BAD_SNAP_INDEX)
		return ERR_PTR(-ENOENT);

	snap_name = _rbd_dev_v1_snap_name(rbd_dev, which);
	return snap_name ? snap_name : ERR_PTR(-ENOMEM);
}

static const char *rbd_snap_name(struct rbd_device *rbd_dev, u64 snap_id)
{
	if (snap_id == CEPH_NOSNAP)
		return RBD_SNAP_HEAD_NAME;

	rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
	if (rbd_dev->image_format == 1)
		return rbd_dev_v1_snap_name(rbd_dev, snap_id);

	return rbd_dev_v2_snap_name(rbd_dev, snap_id);
}

static int rbd_snap_size(struct rbd_device *rbd_dev, u64 snap_id,
				u64 *snap_size)
{
	rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
	if (snap_id == CEPH_NOSNAP) {
		*snap_size = rbd_dev->header.image_size;
	} else if (rbd_dev->image_format == 1) {
		u32 which;

		which = rbd_dev_snap_index(rbd_dev, snap_id);
		if (which == BAD_SNAP_INDEX)
			return -ENOENT;

		*snap_size = rbd_dev->header.snap_sizes[which];
	} else {
		u64 size = 0;
		int ret;

		ret = _rbd_dev_v2_snap_size(rbd_dev, snap_id, NULL, &size);
		if (ret)
			return ret;

		*snap_size = size;
	}
	return 0;
}

static int rbd_snap_features(struct rbd_device *rbd_dev, u64 snap_id,
			u64 *snap_features)
{
	rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
	if (snap_id == CEPH_NOSNAP) {
		*snap_features = rbd_dev->header.features;
	} else if (rbd_dev->image_format == 1) {
		*snap_features = 0;	/* No features for format 1 */
	} else {
		u64 features = 0;
		int ret;

		ret = _rbd_dev_v2_snap_features(rbd_dev, snap_id, &features);
		if (ret)
			return ret;

		*snap_features = features;
	}
	return 0;
}

Alex Elderd1cf5782013-04-27 09:59:30 -05001239static int rbd_dev_mapping_set(struct rbd_device *rbd_dev)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001240{
Alex Elder8f4b7d92013-05-06 07:40:30 -05001241 u64 snap_id = rbd_dev->spec->snap_id;
Alex Elder2ad3d712013-04-30 00:44:33 -05001242 u64 size = 0;
1243 u64 features = 0;
1244 int ret;
Alex Elder8b0241f2013-04-25 23:15:08 -05001245
Alex Elder2ad3d712013-04-30 00:44:33 -05001246 ret = rbd_snap_size(rbd_dev, snap_id, &size);
1247 if (ret)
1248 return ret;
1249 ret = rbd_snap_features(rbd_dev, snap_id, &features);
1250 if (ret)
1251 return ret;
1252
1253 rbd_dev->mapping.size = size;
1254 rbd_dev->mapping.features = features;
1255
Alex Elder8b0241f2013-04-25 23:15:08 -05001256 return 0;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001257}
1258
Alex Elderd1cf5782013-04-27 09:59:30 -05001259static void rbd_dev_mapping_clear(struct rbd_device *rbd_dev)
1260{
1261 rbd_dev->mapping.size = 0;
1262 rbd_dev->mapping.features = 0;
Alex Elder200a6a82013-04-28 23:32:34 -05001263}
1264
Ilya Dryomov5359a172018-01-20 10:30:10 +01001265static void zero_bvec(struct bio_vec *bv)
Alex Elder65ccfe22012-08-09 10:33:26 -07001266{
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001267 void *buf;
Ilya Dryomov5359a172018-01-20 10:30:10 +01001268 unsigned long flags;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001269
Ilya Dryomov5359a172018-01-20 10:30:10 +01001270 buf = bvec_kmap_irq(bv, &flags);
1271 memset(buf, 0, bv->bv_len);
1272 flush_dcache_page(bv->bv_page);
1273 bvec_kunmap_irq(buf, &flags);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001274}
1275
Ilya Dryomov5359a172018-01-20 10:30:10 +01001276static void zero_bios(struct ceph_bio_iter *bio_pos, u32 off, u32 bytes)
Alex Elderb9434c52013-04-19 15:34:50 -05001277{
Ilya Dryomov5359a172018-01-20 10:30:10 +01001278 struct ceph_bio_iter it = *bio_pos;
Alex Elderb9434c52013-04-19 15:34:50 -05001279
Ilya Dryomov5359a172018-01-20 10:30:10 +01001280 ceph_bio_iter_advance(&it, off);
1281 ceph_bio_iter_advance_step(&it, bytes, ({
1282 zero_bvec(&bv);
1283 }));
Alex Elderb9434c52013-04-19 15:34:50 -05001284}
1285
Ilya Dryomov7e07efb2018-01-20 10:30:11 +01001286static void zero_bvecs(struct ceph_bvec_iter *bvec_pos, u32 off, u32 bytes)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001287{
Ilya Dryomov7e07efb2018-01-20 10:30:11 +01001288 struct ceph_bvec_iter it = *bvec_pos;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001289
Ilya Dryomov7e07efb2018-01-20 10:30:11 +01001290 ceph_bvec_iter_advance(&it, off);
1291 ceph_bvec_iter_advance_step(&it, bytes, ({
1292 zero_bvec(&bv);
1293 }));
Alex Elderf7760da2012-10-20 22:17:27 -05001294}
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001295
Alex Elderf7760da2012-10-20 22:17:27 -05001296/*
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001297 * Zero a range in @obj_req data buffer defined by a bio (list) or
Ilya Dryomovafb97882018-02-06 19:26:35 +01001298 * (private) bio_vec array.
Alex Elderf7760da2012-10-20 22:17:27 -05001299 *
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001300 * @off is relative to the start of the data buffer.
Alex Elderf7760da2012-10-20 22:17:27 -05001301 */
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001302static void rbd_obj_zero_range(struct rbd_obj_request *obj_req, u32 off,
1303 u32 bytes)
Alex Elderf7760da2012-10-20 22:17:27 -05001304{
Ilya Dryomovecc633c2018-02-01 11:50:47 +01001305 switch (obj_req->img_request->data_type) {
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001306 case OBJ_REQUEST_BIO:
1307 zero_bios(&obj_req->bio_pos, off, bytes);
1308 break;
1309 case OBJ_REQUEST_BVECS:
Ilya Dryomovafb97882018-02-06 19:26:35 +01001310 case OBJ_REQUEST_OWN_BVECS:
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001311 zero_bvecs(&obj_req->bvec_pos, off, bytes);
1312 break;
1313 default:
1314 rbd_assert(0);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001315 }
Alex Elderbf0d5f502012-11-22 00:00:08 -06001316}
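
/*
 * Illustrative example (not part of the driver): a 1M object read that
 * hits a hole fails with -ENOENT and xferred == 0; the read completion
 * path further down then calls rbd_obj_zero_range(obj_req, 0, 1M) so
 * the caller sees zeroes for the entire range.
 */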
1317
1318static void rbd_obj_request_destroy(struct kref *kref);
1319static void rbd_obj_request_put(struct rbd_obj_request *obj_request)
1320{
1321 rbd_assert(obj_request != NULL);
Alex Elder37206ee2013-02-20 17:32:08 -06001322 dout("%s: obj %p (was %d)\n", __func__, obj_request,
Peter Zijlstra2c935bc2016-11-14 17:29:48 +01001323 kref_read(&obj_request->kref));
Alex Elderbf0d5f502012-11-22 00:00:08 -06001324 kref_put(&obj_request->kref, rbd_obj_request_destroy);
1325}
1326
Alex Elder0f2d5be2014-04-26 14:21:44 +04001327static void rbd_img_request_get(struct rbd_img_request *img_request)
1328{
1329 dout("%s: img %p (was %d)\n", __func__, img_request,
Peter Zijlstra2c935bc2016-11-14 17:29:48 +01001330 kref_read(&img_request->kref));
Alex Elder0f2d5be2014-04-26 14:21:44 +04001331 kref_get(&img_request->kref);
1332}
1333
Alex Elderbf0d5f502012-11-22 00:00:08 -06001334static void rbd_img_request_destroy(struct kref *kref);
1335static void rbd_img_request_put(struct rbd_img_request *img_request)
1336{
1337 rbd_assert(img_request != NULL);
Alex Elder37206ee2013-02-20 17:32:08 -06001338 dout("%s: img %p (was %d)\n", __func__, img_request,
Peter Zijlstra2c935bc2016-11-14 17:29:48 +01001339 kref_read(&img_request->kref));
Ilya Dryomove93aca02018-02-06 19:26:35 +01001340 kref_put(&img_request->kref, rbd_img_request_destroy);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001341}
1342
1343static inline void rbd_img_obj_request_add(struct rbd_img_request *img_request,
1344 struct rbd_obj_request *obj_request)
1345{
Alex Elder25dcf952013-01-25 17:08:55 -06001346 rbd_assert(obj_request->img_request == NULL);
1347
Alex Elderb155e862013-04-15 14:50:37 -05001348 /* Image request now owns object's original reference */
Alex Elderbf0d5f502012-11-22 00:00:08 -06001349 obj_request->img_request = img_request;
Ilya Dryomov7114eda2018-02-01 11:50:47 +01001350 img_request->pending_count++;
Ilya Dryomov15961b42018-02-01 11:50:47 +01001351 dout("%s: img %p obj %p\n", __func__, img_request, obj_request);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001352}
1353
1354static inline void rbd_img_obj_request_del(struct rbd_img_request *img_request,
1355 struct rbd_obj_request *obj_request)
1356{
Ilya Dryomov15961b42018-02-01 11:50:47 +01001357 dout("%s: img %p obj %p\n", __func__, img_request, obj_request);
Ilya Dryomov43df3d32018-02-02 15:23:22 +01001358 list_del(&obj_request->ex.oe_item);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001359 rbd_assert(obj_request->img_request == img_request);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001360 rbd_obj_request_put(obj_request);
1361}
1362
Ilya Dryomov980917f2016-09-12 18:59:42 +02001363static void rbd_obj_request_submit(struct rbd_obj_request *obj_request)
Alex Elderbf0d5f502012-11-22 00:00:08 -06001364{
Ilya Dryomov980917f2016-09-12 18:59:42 +02001365 struct ceph_osd_request *osd_req = obj_request->osd_req;
1366
Ilya Dryomova90bb0c2017-01-25 18:16:23 +01001367 dout("%s %p object_no %016llx %llu~%llu osd_req %p\n", __func__,
Ilya Dryomov43df3d32018-02-02 15:23:22 +01001368 obj_request, obj_request->ex.oe_objno, obj_request->ex.oe_off,
1369 obj_request->ex.oe_len, osd_req);
Ilya Dryomov980917f2016-09-12 18:59:42 +02001370 ceph_osdc_start_request(osd_req->r_osdc, osd_req, false);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001371}
1372
Alex Elder0c425242013-02-08 09:55:49 -06001373/*
1374 * The default/initial value for all image request flags is 0. Each
1375 * is conditionally set to 1 at image request initialization time
1376 * and currently never changes thereafter.
1377 */
Alex Elderd0b2e942013-01-24 16:13:36 -06001378static void img_request_layered_set(struct rbd_img_request *img_request)
1379{
1380 set_bit(IMG_REQ_LAYERED, &img_request->flags);
1381 smp_mb();
1382}
1383
Alex Eldera2acd002013-05-08 22:50:04 -05001384static void img_request_layered_clear(struct rbd_img_request *img_request)
1385{
1386 clear_bit(IMG_REQ_LAYERED, &img_request->flags);
1387 smp_mb();
1388}
1389
Alex Elderd0b2e942013-01-24 16:13:36 -06001390static bool img_request_layered_test(struct rbd_img_request *img_request)
1391{
1392 smp_mb();
1393 return test_bit(IMG_REQ_LAYERED, &img_request->flags) != 0;
1394}
1395
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001396static bool rbd_obj_is_entire(struct rbd_obj_request *obj_req)
Josh Durgin3b434a2a2014-04-04 17:32:15 -07001397{
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001398 struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
1399
Ilya Dryomov43df3d32018-02-02 15:23:22 +01001400 return !obj_req->ex.oe_off &&
1401 obj_req->ex.oe_len == rbd_dev->layout.object_size;
Josh Durgin3b434a2a2014-04-04 17:32:15 -07001402}
1403
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001404static bool rbd_obj_is_tail(struct rbd_obj_request *obj_req)
Alex Elder6e2a4502013-03-27 09:16:30 -05001405{
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001406 struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
Alex Elderb9434c52013-04-19 15:34:50 -05001407
Ilya Dryomov43df3d32018-02-02 15:23:22 +01001408 return obj_req->ex.oe_off + obj_req->ex.oe_len ==
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001409 rbd_dev->layout.object_size;
1410}
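
/*
 * Illustrative example (not part of the driver): with the default 4M
 * object size, a 0~4M object request is "entire", a 3M~1M request is a
 * "tail" (it ends exactly at the object boundary), and a 1M~1M request
 * is neither.
 */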
1411
Ilya Dryomov86bd7992018-02-06 19:26:33 +01001412static u64 rbd_obj_img_extents_bytes(struct rbd_obj_request *obj_req)
1413{
1414 return ceph_file_extents_bytes(obj_req->img_extents,
1415 obj_req->num_img_extents);
1416}
1417
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001418static bool rbd_img_is_write(struct rbd_img_request *img_req)
1419{
Ilya Dryomov9bb02482018-01-30 17:52:10 +01001420 switch (img_req->op_type) {
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001421 case OBJ_OP_READ:
1422 return false;
1423 case OBJ_OP_WRITE:
1424 case OBJ_OP_DISCARD:
Ilya Dryomov6484cbe2019-01-29 12:46:25 +01001425 case OBJ_OP_ZEROOUT:
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001426 return true;
1427 default:
Arnd Bergmannc6244b32018-04-04 14:53:39 +02001428 BUG();
Alex Elder6e2a4502013-03-27 09:16:30 -05001429 }
Alex Elder6e2a4502013-03-27 09:16:30 -05001430}
1431
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001432static void rbd_obj_handle_request(struct rbd_obj_request *obj_req);
Ilya Dryomov27617132015-07-16 17:36:11 +03001433
Ilya Dryomov85e084f2016-04-28 16:07:24 +02001434static void rbd_osd_req_callback(struct ceph_osd_request *osd_req)
Alex Elderbf0d5f502012-11-22 00:00:08 -06001435{
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001436 struct rbd_obj_request *obj_req = osd_req->r_priv;
Alex Elderbf0d5f502012-11-22 00:00:08 -06001437
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001438 dout("%s osd_req %p result %d for obj_req %p\n", __func__, osd_req,
1439 osd_req->r_result, obj_req);
1440 rbd_assert(osd_req == obj_req->osd_req);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001441
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001442 obj_req->result = osd_req->r_result < 0 ? osd_req->r_result : 0;
1443 if (!obj_req->result && !rbd_img_is_write(obj_req->img_request))
1444 obj_req->xferred = osd_req->r_result;
1445 else
1446 /*
1447 * Writes aren't allowed to return a data payload. In some
1448 * guarded write cases (e.g. stat + zero on an empty object)
1449 * a stat response makes it through, but we don't care.
1450 */
1451 obj_req->xferred = 0;
Alex Elderbf0d5f502012-11-22 00:00:08 -06001452
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001453 rbd_obj_handle_request(obj_req);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001454}
1455
Alex Elder9d4df012013-04-19 15:34:50 -05001456static void rbd_osd_req_format_read(struct rbd_obj_request *obj_request)
Alex Elder430c28c2013-04-03 21:32:51 -05001457{
Alex Elder8c042b02013-04-03 01:28:58 -05001458 struct ceph_osd_request *osd_req = obj_request->osd_req;
Alex Elder430c28c2013-04-03 21:32:51 -05001459
Ilya Dryomova162b302018-01-30 17:52:10 +01001460 osd_req->r_flags = CEPH_OSD_FLAG_READ;
Ilya Dryomov7c848832016-09-15 17:56:39 +02001461 osd_req->r_snapid = obj_request->img_request->snap_id;
Alex Elder9d4df012013-04-19 15:34:50 -05001462}
1463
1464static void rbd_osd_req_format_write(struct rbd_obj_request *obj_request)
1465{
Alex Elder9d4df012013-04-19 15:34:50 -05001466 struct ceph_osd_request *osd_req = obj_request->osd_req;
Alex Elder9d4df012013-04-19 15:34:50 -05001467
Ilya Dryomova162b302018-01-30 17:52:10 +01001468 osd_req->r_flags = CEPH_OSD_FLAG_WRITE;
Arnd Bergmannfac02dd2018-07-13 22:18:37 +02001469 ktime_get_real_ts64(&osd_req->r_mtime);
Ilya Dryomov43df3d32018-02-02 15:23:22 +01001470 osd_req->r_data_offset = obj_request->ex.oe_off;
Alex Elder430c28c2013-04-03 21:32:51 -05001471}
1472
Ilya Dryomovbc812072017-01-25 18:16:23 +01001473static struct ceph_osd_request *
Ilya Dryomova162b302018-01-30 17:52:10 +01001474rbd_osd_req_create(struct rbd_obj_request *obj_req, unsigned int num_ops)
Ilya Dryomovbc812072017-01-25 18:16:23 +01001475{
Ilya Dryomova162b302018-01-30 17:52:10 +01001476 struct rbd_img_request *img_req = obj_req->img_request;
1477 struct rbd_device *rbd_dev = img_req->rbd_dev;
Ilya Dryomovbc812072017-01-25 18:16:23 +01001478 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
1479 struct ceph_osd_request *req;
Ilya Dryomova90bb0c2017-01-25 18:16:23 +01001480 const char *name_format = rbd_dev->image_format == 1 ?
1481 RBD_V1_DATA_FORMAT : RBD_V2_DATA_FORMAT;
Ilya Dryomovbc812072017-01-25 18:16:23 +01001482
Ilya Dryomova162b302018-01-30 17:52:10 +01001483 req = ceph_osdc_alloc_request(osdc,
1484 (rbd_img_is_write(img_req) ? img_req->snapc : NULL),
1485 num_ops, false, GFP_NOIO);
Ilya Dryomovbc812072017-01-25 18:16:23 +01001486 if (!req)
1487 return NULL;
1488
Ilya Dryomovbc812072017-01-25 18:16:23 +01001489 req->r_callback = rbd_osd_req_callback;
Ilya Dryomova162b302018-01-30 17:52:10 +01001490 req->r_priv = obj_req;
Ilya Dryomovbc812072017-01-25 18:16:23 +01001491
Ilya Dryomovb26c0472018-07-03 15:28:43 +02001492 /*
1493	 * Data objects may be stored in a separate pool, but they always use
1494	 * the same namespace there as the header object does in its pool.
1495 */
1496 ceph_oloc_copy(&req->r_base_oloc, &rbd_dev->header_oloc);
Ilya Dryomovbc812072017-01-25 18:16:23 +01001497 req->r_base_oloc.pool = rbd_dev->layout.pool_id;
Ilya Dryomovb26c0472018-07-03 15:28:43 +02001498
Ilya Dryomova90bb0c2017-01-25 18:16:23 +01001499 if (ceph_oid_aprintf(&req->r_base_oid, GFP_NOIO, name_format,
Ilya Dryomov43df3d32018-02-02 15:23:22 +01001500 rbd_dev->header.object_prefix, obj_req->ex.oe_objno))
Ilya Dryomovbc812072017-01-25 18:16:23 +01001501 goto err_req;
1502
Ilya Dryomovbc812072017-01-25 18:16:23 +01001503 return req;
1504
1505err_req:
1506 ceph_osdc_put_request(req);
1507 return NULL;
1508}
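
/*
 * Illustrative example (hypothetical names): for a format 2 image with
 * object_prefix "rbd_data.10056b8b4567", object number 5 maps to the
 * OSD object name
 *
 *	"rbd_data.10056b8b4567.0000000000000005"
 *
 * (a 16-hex-digit suffix, per RBD_V2_DATA_FORMAT), while format 1
 * images use a 12-digit suffix, e.g. "rb.0.100a.74b0dc51.000000000005".
 */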
1509
Alex Elderbf0d5f502012-11-22 00:00:08 -06001510static void rbd_osd_req_destroy(struct ceph_osd_request *osd_req)
1511{
1512 ceph_osdc_put_request(osd_req);
1513}
1514
Ilya Dryomovecc633c2018-02-01 11:50:47 +01001515static struct rbd_obj_request *rbd_obj_request_create(void)
Alex Elderbf0d5f502012-11-22 00:00:08 -06001516{
1517 struct rbd_obj_request *obj_request;
Alex Elderbf0d5f502012-11-22 00:00:08 -06001518
Ilya Dryomov5a60e872015-06-24 17:24:33 +03001519 obj_request = kmem_cache_zalloc(rbd_obj_request_cache, GFP_NOIO);
Ilya Dryomov6c696d82017-01-25 18:16:23 +01001520 if (!obj_request)
Alex Elderf907ad52013-05-01 12:43:03 -05001521 return NULL;
Alex Elderf907ad52013-05-01 12:43:03 -05001522
Ilya Dryomov43df3d32018-02-02 15:23:22 +01001523 ceph_object_extent_init(&obj_request->ex);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001524 kref_init(&obj_request->kref);
1525
Ilya Dryomov67e2b652017-01-25 18:16:22 +01001526 dout("%s %p\n", __func__, obj_request);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001527 return obj_request;
1528}
1529
1530static void rbd_obj_request_destroy(struct kref *kref)
1531{
1532 struct rbd_obj_request *obj_request;
Ilya Dryomov7e07efb2018-01-20 10:30:11 +01001533 u32 i;
Alex Elderbf0d5f502012-11-22 00:00:08 -06001534
1535 obj_request = container_of(kref, struct rbd_obj_request, kref);
1536
Alex Elder37206ee2013-02-20 17:32:08 -06001537 dout("%s: obj %p\n", __func__, obj_request);
1538
Alex Elderbf0d5f502012-11-22 00:00:08 -06001539 if (obj_request->osd_req)
1540 rbd_osd_req_destroy(obj_request->osd_req);
1541
Ilya Dryomovecc633c2018-02-01 11:50:47 +01001542 switch (obj_request->img_request->data_type) {
Alex Elder9969ebc2013-01-18 12:31:10 -06001543 case OBJ_REQUEST_NODATA:
Alex Elderbf0d5f502012-11-22 00:00:08 -06001544 case OBJ_REQUEST_BIO:
Ilya Dryomov7e07efb2018-01-20 10:30:11 +01001545 case OBJ_REQUEST_BVECS:
Ilya Dryomov5359a172018-01-20 10:30:10 +01001546 break; /* Nothing to do */
Ilya Dryomovafb97882018-02-06 19:26:35 +01001547 case OBJ_REQUEST_OWN_BVECS:
1548 kfree(obj_request->bvec_pos.bvecs);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001549 break;
Ilya Dryomov7e07efb2018-01-20 10:30:11 +01001550 default:
1551 rbd_assert(0);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001552 }
1553
Ilya Dryomov86bd7992018-02-06 19:26:33 +01001554 kfree(obj_request->img_extents);
Ilya Dryomov7e07efb2018-01-20 10:30:11 +01001555 if (obj_request->copyup_bvecs) {
1556 for (i = 0; i < obj_request->copyup_bvec_count; i++) {
1557 if (obj_request->copyup_bvecs[i].bv_page)
1558 __free_page(obj_request->copyup_bvecs[i].bv_page);
1559 }
1560 kfree(obj_request->copyup_bvecs);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001561 }
1562
Alex Elder868311b2013-05-01 12:43:03 -05001563 kmem_cache_free(rbd_obj_request_cache, obj_request);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001564}
1565
Alex Elderfb65d2282013-05-08 22:50:04 -05001566/* It's OK to call this for a device with no parent */
1567
1568static void rbd_spec_put(struct rbd_spec *spec);
1569static void rbd_dev_unparent(struct rbd_device *rbd_dev)
1570{
1571 rbd_dev_remove_parent(rbd_dev);
1572 rbd_spec_put(rbd_dev->parent_spec);
1573 rbd_dev->parent_spec = NULL;
1574 rbd_dev->parent_overlap = 0;
1575}
1576
Alex Elderbf0d5f502012-11-22 00:00:08 -06001577/*
Alex Eldera2acd002013-05-08 22:50:04 -05001578 * Parent image reference counting is used to determine when an
1579 * image's parent fields can be safely torn down--after there are no
1580 * more in-flight requests to the parent image. When the last
1581 * reference is dropped, cleaning them up is safe.
1582 */
1583static void rbd_dev_parent_put(struct rbd_device *rbd_dev)
1584{
1585 int counter;
1586
1587 if (!rbd_dev->parent_spec)
1588 return;
1589
1590 counter = atomic_dec_return_safe(&rbd_dev->parent_ref);
1591 if (counter > 0)
1592 return;
1593
1594 /* Last reference; clean up parent data structures */
1595
1596 if (!counter)
1597 rbd_dev_unparent(rbd_dev);
1598 else
Ilya Dryomov9584d502014-07-11 12:11:20 +04001599 rbd_warn(rbd_dev, "parent reference underflow");
Alex Eldera2acd002013-05-08 22:50:04 -05001600}
1601
1602/*
1603 * If an image has a non-zero parent overlap, get a reference to its
1604 * parent.
1605 *
1606 * Returns true if the rbd device has a parent with a non-zero
1607 * overlap and a reference for it was successfully taken, or
1608 * false otherwise.
1609 */
1610static bool rbd_dev_parent_get(struct rbd_device *rbd_dev)
1611{
Ilya Dryomovae43e9d2015-01-19 18:13:43 +03001612 int counter = 0;
Alex Eldera2acd002013-05-08 22:50:04 -05001613
1614 if (!rbd_dev->parent_spec)
1615 return false;
1616
Ilya Dryomovae43e9d2015-01-19 18:13:43 +03001617 down_read(&rbd_dev->header_rwsem);
1618 if (rbd_dev->parent_overlap)
1619 counter = atomic_inc_return_safe(&rbd_dev->parent_ref);
1620 up_read(&rbd_dev->header_rwsem);
Alex Eldera2acd002013-05-08 22:50:04 -05001621
1622 if (counter < 0)
Ilya Dryomov9584d502014-07-11 12:11:20 +04001623 rbd_warn(rbd_dev, "parent reference overflow");
Alex Eldera2acd002013-05-08 22:50:04 -05001624
Ilya Dryomovae43e9d2015-01-19 18:13:43 +03001625 return counter > 0;
Alex Eldera2acd002013-05-08 22:50:04 -05001626}
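
/*
 * Usage sketch: a parent reference is held for as long as requests may
 * be issued to the parent image.  rbd_img_request_create() below takes
 * it:
 *
 *	if (rbd_dev_parent_get(rbd_dev))
 *		img_request_layered_set(img_request);
 *
 * and rbd_img_request_destroy() drops it again via
 * rbd_dev_parent_put() once the layered flag is cleared.
 */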
1627
Alex Elderbf0d5f502012-11-22 00:00:08 -06001628/*
1629 * Caller is responsible for filling in the list of object requests
1630 * that comprises the image request, and the Linux request pointer
1631 * (if there is one).
1632 */
Alex Eldercc344fa2013-02-19 12:25:56 -06001633static struct rbd_img_request *rbd_img_request_create(
1634 struct rbd_device *rbd_dev,
Guangliang Zhao6d2940c2014-03-13 11:21:35 +08001635 enum obj_operation_type op_type,
Josh Durgin4e752f02014-04-08 11:12:11 -07001636 struct ceph_snap_context *snapc)
Alex Elderbf0d5f502012-11-22 00:00:08 -06001637{
1638 struct rbd_img_request *img_request;
Alex Elderbf0d5f502012-11-22 00:00:08 -06001639
Ilya Dryomova0c58952018-01-22 16:03:06 +01001640 img_request = kmem_cache_zalloc(rbd_img_request_cache, GFP_NOIO);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001641 if (!img_request)
1642 return NULL;
1643
Alex Elderbf0d5f502012-11-22 00:00:08 -06001644 img_request->rbd_dev = rbd_dev;
Ilya Dryomov9bb02482018-01-30 17:52:10 +01001645 img_request->op_type = op_type;
Ilya Dryomov9bb02482018-01-30 17:52:10 +01001646 if (!rbd_img_is_write(img_request))
Alex Elderbf0d5f502012-11-22 00:00:08 -06001647 img_request->snap_id = rbd_dev->spec->snap_id;
Ilya Dryomov9bb02482018-01-30 17:52:10 +01001648 else
1649 img_request->snapc = snapc;
1650
Alex Eldera2acd002013-05-08 22:50:04 -05001651 if (rbd_dev_parent_get(rbd_dev))
Alex Elderd0b2e942013-01-24 16:13:36 -06001652 img_request_layered_set(img_request);
Ilya Dryomova0c58952018-01-22 16:03:06 +01001653
Alex Elderbf0d5f502012-11-22 00:00:08 -06001654 spin_lock_init(&img_request->completion_lock);
Ilya Dryomov43df3d32018-02-02 15:23:22 +01001655 INIT_LIST_HEAD(&img_request->object_extents);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001656 kref_init(&img_request->kref);
1657
Ilya Dryomovdfd98752018-02-06 19:26:35 +01001658 dout("%s: rbd_dev %p %s -> img %p\n", __func__, rbd_dev,
1659 obj_op_name(op_type), img_request);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001660 return img_request;
1661}
1662
1663static void rbd_img_request_destroy(struct kref *kref)
1664{
1665 struct rbd_img_request *img_request;
1666 struct rbd_obj_request *obj_request;
1667 struct rbd_obj_request *next_obj_request;
1668
1669 img_request = container_of(kref, struct rbd_img_request, kref);
1670
Alex Elder37206ee2013-02-20 17:32:08 -06001671 dout("%s: img %p\n", __func__, img_request);
1672
Alex Elderbf0d5f502012-11-22 00:00:08 -06001673 for_each_obj_request_safe(img_request, obj_request, next_obj_request)
1674 rbd_img_obj_request_del(img_request, obj_request);
1675
Alex Eldera2acd002013-05-08 22:50:04 -05001676 if (img_request_layered_test(img_request)) {
1677 img_request_layered_clear(img_request);
1678 rbd_dev_parent_put(img_request->rbd_dev);
1679 }
1680
Ilya Dryomov9bb02482018-01-30 17:52:10 +01001681 if (rbd_img_is_write(img_request))
Alex Elder812164f82013-04-30 00:44:32 -05001682 ceph_put_snap_context(img_request->snapc);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001683
Alex Elder1c2a9df2013-05-01 12:43:03 -05001684 kmem_cache_free(rbd_img_request_cache, img_request);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001685}
1686
Ilya Dryomov86bd7992018-02-06 19:26:33 +01001687static void prune_extents(struct ceph_file_extent *img_extents,
1688 u32 *num_img_extents, u64 overlap)
Alex Eldere93f3152013-05-08 22:50:04 -05001689{
Ilya Dryomov86bd7992018-02-06 19:26:33 +01001690 u32 cnt = *num_img_extents;
Alex Eldere93f3152013-05-08 22:50:04 -05001691
Ilya Dryomov86bd7992018-02-06 19:26:33 +01001692 /* drop extents completely beyond the overlap */
1693 while (cnt && img_extents[cnt - 1].fe_off >= overlap)
1694 cnt--;
Alex Eldere93f3152013-05-08 22:50:04 -05001695
Ilya Dryomov86bd7992018-02-06 19:26:33 +01001696 if (cnt) {
1697 struct ceph_file_extent *ex = &img_extents[cnt - 1];
Alex Eldere93f3152013-05-08 22:50:04 -05001698
Ilya Dryomov86bd7992018-02-06 19:26:33 +01001699 /* trim final overlapping extent */
1700 if (ex->fe_off + ex->fe_len > overlap)
1701 ex->fe_len = overlap - ex->fe_off;
Alex Elder12178572013-02-08 09:55:49 -06001702 }
1703
Ilya Dryomov86bd7992018-02-06 19:26:33 +01001704 *num_img_extents = cnt;
Alex Elder21692382013-04-05 01:27:12 -05001705}
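
/*
 * Illustrative example (not part of the driver): with overlap = 4096
 * and img_extents[] = { {0, 1024}, {3072, 2048}, {8192, 512} }, the
 * last extent lies entirely beyond the overlap and is dropped, the
 * middle one is trimmed to {3072, 1024} so it ends at the overlap
 * point, and *num_img_extents becomes 2.
 */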
1706
Alex Elderf1a47392013-04-19 15:34:50 -05001707/*
Ilya Dryomov86bd7992018-02-06 19:26:33 +01001708 * Determine the byte range(s) covered by either just the object extent
1709 * or the entire object in the parent image.
Josh Durgin3b434a2a2014-04-04 17:32:15 -07001710 */
Ilya Dryomov86bd7992018-02-06 19:26:33 +01001711static int rbd_obj_calc_img_extents(struct rbd_obj_request *obj_req,
1712 bool entire)
Josh Durgin3b434a2a2014-04-04 17:32:15 -07001713{
Ilya Dryomov86bd7992018-02-06 19:26:33 +01001714 struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
Alex Elderc5b5ef62013-02-11 12:33:24 -06001715 int ret;
1716
Ilya Dryomov86bd7992018-02-06 19:26:33 +01001717 if (!rbd_dev->parent_overlap)
1718 return 0;
1719
1720 ret = ceph_extent_to_file(&rbd_dev->layout, obj_req->ex.oe_objno,
1721 entire ? 0 : obj_req->ex.oe_off,
1722 entire ? rbd_dev->layout.object_size :
1723 obj_req->ex.oe_len,
1724 &obj_req->img_extents,
1725 &obj_req->num_img_extents);
1726 if (ret)
1727 return ret;
1728
1729 prune_extents(obj_req->img_extents, &obj_req->num_img_extents,
1730 rbd_dev->parent_overlap);
1731 return 0;
1732}
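
/*
 * Illustrative example (not part of the driver): with 4M objects and a
 * plain layout, entire == true for object number 2 yields the single
 * image extent 8M~4M.  With parent_overlap == 10M it is pruned to
 * 8M~2M; with parent_overlap <= 8M no extents remain and there is
 * nothing to read from the parent.
 */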
1733
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001734static void rbd_osd_req_setup_data(struct rbd_obj_request *obj_req, u32 which)
1735{
Ilya Dryomovecc633c2018-02-01 11:50:47 +01001736 switch (obj_req->img_request->data_type) {
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001737 case OBJ_REQUEST_BIO:
1738 osd_req_op_extent_osd_data_bio(obj_req->osd_req, which,
1739 &obj_req->bio_pos,
Ilya Dryomov43df3d32018-02-02 15:23:22 +01001740 obj_req->ex.oe_len);
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001741 break;
1742 case OBJ_REQUEST_BVECS:
Ilya Dryomovafb97882018-02-06 19:26:35 +01001743 case OBJ_REQUEST_OWN_BVECS:
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001744 rbd_assert(obj_req->bvec_pos.iter.bi_size ==
Ilya Dryomov43df3d32018-02-02 15:23:22 +01001745 obj_req->ex.oe_len);
Ilya Dryomovafb97882018-02-06 19:26:35 +01001746 rbd_assert(obj_req->bvec_idx == obj_req->bvec_count);
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001747 osd_req_op_extent_osd_data_bvec_pos(obj_req->osd_req, which,
1748 &obj_req->bvec_pos);
1749 break;
1750 default:
1751 rbd_assert(0);
1752 }
1753}
1754
1755static int rbd_obj_setup_read(struct rbd_obj_request *obj_req)
1756{
Ilya Dryomova162b302018-01-30 17:52:10 +01001757 obj_req->osd_req = rbd_osd_req_create(obj_req, 1);
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001758 if (!obj_req->osd_req)
Ilya Dryomov710214e2016-09-15 17:53:32 +02001759 return -ENOMEM;
1760
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001761 osd_req_op_extent_init(obj_req->osd_req, 0, CEPH_OSD_OP_READ,
Ilya Dryomov43df3d32018-02-02 15:23:22 +01001762 obj_req->ex.oe_off, obj_req->ex.oe_len, 0, 0);
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001763 rbd_osd_req_setup_data(obj_req, 0);
Ilya Dryomova90bb0c2017-01-25 18:16:23 +01001764
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001765 rbd_osd_req_format_read(obj_req);
1766 return 0;
1767}
1768
1769static int __rbd_obj_setup_stat(struct rbd_obj_request *obj_req,
1770 unsigned int which)
1771{
1772 struct page **pages;
Ilya Dryomov710214e2016-09-15 17:53:32 +02001773
Alex Elderc5b5ef62013-02-11 12:33:24 -06001774 /*
1775 * The response data for a STAT call consists of:
1776 * le64 length;
1777 * struct {
1778 * le32 tv_sec;
1779 * le32 tv_nsec;
1780 * } mtime;
1781 */
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001782 pages = ceph_alloc_page_vector(1, GFP_NOIO);
1783 if (IS_ERR(pages))
1784 return PTR_ERR(pages);
Alex Elderc5b5ef62013-02-11 12:33:24 -06001785
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001786 osd_req_op_init(obj_req->osd_req, which, CEPH_OSD_OP_STAT, 0);
1787 osd_req_op_raw_data_in_pages(obj_req->osd_req, which, pages,
1788 8 + sizeof(struct ceph_timespec),
1789 0, false, true);
Ilya Dryomov980917f2016-09-12 18:59:42 +02001790 return 0;
Alex Elderc5b5ef62013-02-11 12:33:24 -06001791}
1792
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001793static void __rbd_obj_setup_write(struct rbd_obj_request *obj_req,
1794 unsigned int which)
Alex Elderb454e362013-04-19 15:34:50 -05001795{
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001796 struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
1797 u16 opcode;
Alex Elderb454e362013-04-19 15:34:50 -05001798
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001799 osd_req_op_alloc_hint_init(obj_req->osd_req, which++,
1800 rbd_dev->layout.object_size,
1801 rbd_dev->layout.object_size);
Alex Elderb454e362013-04-19 15:34:50 -05001802
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001803 if (rbd_obj_is_entire(obj_req))
1804 opcode = CEPH_OSD_OP_WRITEFULL;
1805 else
1806 opcode = CEPH_OSD_OP_WRITE;
Ilya Dryomov70d045f2014-09-12 16:02:01 +04001807
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001808 osd_req_op_extent_init(obj_req->osd_req, which, opcode,
Ilya Dryomov43df3d32018-02-02 15:23:22 +01001809 obj_req->ex.oe_off, obj_req->ex.oe_len, 0, 0);
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001810 rbd_osd_req_setup_data(obj_req, which++);
Ilya Dryomov70d045f2014-09-12 16:02:01 +04001811
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001812 rbd_assert(which == obj_req->osd_req->r_num_ops);
1813 rbd_osd_req_format_write(obj_req);
Ilya Dryomov70d045f2014-09-12 16:02:01 +04001814}
1815
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001816static int rbd_obj_setup_write(struct rbd_obj_request *obj_req)
Ilya Dryomov70d045f2014-09-12 16:02:01 +04001817{
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001818 unsigned int num_osd_ops, which = 0;
1819 int ret;
Ilya Dryomov058aa992016-09-12 14:44:45 +02001820
Ilya Dryomov86bd7992018-02-06 19:26:33 +01001821 /* reverse map the entire object onto the parent */
1822 ret = rbd_obj_calc_img_extents(obj_req, true);
1823 if (ret)
1824 return ret;
1825
1826 if (obj_req->num_img_extents) {
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001827 obj_req->write_state = RBD_OBJ_WRITE_GUARD;
1828 num_osd_ops = 3; /* stat + setallochint + write/writefull */
1829 } else {
1830 obj_req->write_state = RBD_OBJ_WRITE_FLAT;
1831 num_osd_ops = 2; /* setallochint + write/writefull */
1832 }
1833
Ilya Dryomova162b302018-01-30 17:52:10 +01001834 obj_req->osd_req = rbd_osd_req_create(obj_req, num_osd_ops);
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001835 if (!obj_req->osd_req)
1836 return -ENOMEM;
1837
Ilya Dryomov86bd7992018-02-06 19:26:33 +01001838 if (obj_req->num_img_extents) {
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001839 ret = __rbd_obj_setup_stat(obj_req, which++);
1840 if (ret)
1841 return ret;
1842 }
1843
1844 __rbd_obj_setup_write(obj_req, which);
1845 return 0;
1846}
1847
Ilya Dryomov6484cbe2019-01-29 12:46:25 +01001848static u16 truncate_or_zero_opcode(struct rbd_obj_request *obj_req)
1849{
1850 return rbd_obj_is_tail(obj_req) ? CEPH_OSD_OP_TRUNCATE :
1851 CEPH_OSD_OP_ZERO;
1852}
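
/*
 * Illustrative example (not part of the driver): with 4M objects, a
 * discard of 3M~1M ends at the object boundary and becomes
 * CEPH_OSD_OP_TRUNCATE at 3M, while a discard of 1M~1M must preserve
 * the bytes past 2M and becomes CEPH_OSD_OP_ZERO.
 */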
1853
1854static int rbd_obj_setup_discard(struct rbd_obj_request *obj_req)
1855{
1856 int ret;
1857
1858 /* reverse map the entire object onto the parent */
1859 ret = rbd_obj_calc_img_extents(obj_req, true);
1860 if (ret)
1861 return ret;
1862
1863 obj_req->osd_req = rbd_osd_req_create(obj_req, 1);
1864 if (!obj_req->osd_req)
1865 return -ENOMEM;
1866
1867 if (rbd_obj_is_entire(obj_req) && !obj_req->num_img_extents) {
1868 osd_req_op_init(obj_req->osd_req, 0, CEPH_OSD_OP_DELETE, 0);
1869 } else {
1870 osd_req_op_extent_init(obj_req->osd_req, 0,
1871 truncate_or_zero_opcode(obj_req),
1872 obj_req->ex.oe_off, obj_req->ex.oe_len,
1873 0, 0);
1874 }
1875
1876 obj_req->write_state = RBD_OBJ_WRITE_FLAT;
1877 rbd_osd_req_format_write(obj_req);
1878 return 0;
1879}
1880
1881static void __rbd_obj_setup_zeroout(struct rbd_obj_request *obj_req,
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001882 unsigned int which)
1883{
1884 u16 opcode;
1885
1886 if (rbd_obj_is_entire(obj_req)) {
Ilya Dryomov86bd7992018-02-06 19:26:33 +01001887 if (obj_req->num_img_extents) {
Ilya Dryomov2bb1e562018-02-06 19:26:34 +01001888 osd_req_op_init(obj_req->osd_req, which++,
1889 CEPH_OSD_OP_CREATE, 0);
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001890 opcode = CEPH_OSD_OP_TRUNCATE;
1891 } else {
1892 osd_req_op_init(obj_req->osd_req, which++,
1893 CEPH_OSD_OP_DELETE, 0);
1894 opcode = 0;
1895 }
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001896 } else {
Ilya Dryomov6484cbe2019-01-29 12:46:25 +01001897 opcode = truncate_or_zero_opcode(obj_req);
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001898 }
1899
1900 if (opcode)
1901 osd_req_op_extent_init(obj_req->osd_req, which++, opcode,
Ilya Dryomov43df3d32018-02-02 15:23:22 +01001902 obj_req->ex.oe_off, obj_req->ex.oe_len,
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001903 0, 0);
1904
1905 rbd_assert(which == obj_req->osd_req->r_num_ops);
1906 rbd_osd_req_format_write(obj_req);
1907}
1908
Ilya Dryomov6484cbe2019-01-29 12:46:25 +01001909static int rbd_obj_setup_zeroout(struct rbd_obj_request *obj_req)
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001910{
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001911 unsigned int num_osd_ops, which = 0;
1912 int ret;
1913
Ilya Dryomov86bd7992018-02-06 19:26:33 +01001914 /* reverse map the entire object onto the parent */
1915 ret = rbd_obj_calc_img_extents(obj_req, true);
1916 if (ret)
1917 return ret;
1918
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001919 if (rbd_obj_is_entire(obj_req)) {
1920 obj_req->write_state = RBD_OBJ_WRITE_FLAT;
Ilya Dryomov2bb1e562018-02-06 19:26:34 +01001921 if (obj_req->num_img_extents)
1922 num_osd_ops = 2; /* create + truncate */
1923 else
1924 num_osd_ops = 1; /* delete */
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001925 } else {
Ilya Dryomov86bd7992018-02-06 19:26:33 +01001926 if (obj_req->num_img_extents) {
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001927 obj_req->write_state = RBD_OBJ_WRITE_GUARD;
1928 num_osd_ops = 2; /* stat + truncate/zero */
1929 } else {
1930 obj_req->write_state = RBD_OBJ_WRITE_FLAT;
1931 num_osd_ops = 1; /* truncate/zero */
1932 }
1933 }
1934
Ilya Dryomova162b302018-01-30 17:52:10 +01001935 obj_req->osd_req = rbd_osd_req_create(obj_req, num_osd_ops);
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001936 if (!obj_req->osd_req)
1937 return -ENOMEM;
1938
Ilya Dryomov86bd7992018-02-06 19:26:33 +01001939 if (!rbd_obj_is_entire(obj_req) && obj_req->num_img_extents) {
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001940 ret = __rbd_obj_setup_stat(obj_req, which++);
1941 if (ret)
1942 return ret;
1943 }
1944
Ilya Dryomov6484cbe2019-01-29 12:46:25 +01001945 __rbd_obj_setup_zeroout(obj_req, which);
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001946 return 0;
1947}
1948
1949/*
1950 * For each object request in @img_req, allocate an OSD request, add
1951 * individual OSD ops and prepare them for submission. The number of
1952 * OSD ops depends on op_type and the overlap point (if any).
1953 */
1954static int __rbd_img_fill_request(struct rbd_img_request *img_req)
1955{
1956 struct rbd_obj_request *obj_req;
1957 int ret;
1958
1959 for_each_obj_request(img_req, obj_req) {
Ilya Dryomov9bb02482018-01-30 17:52:10 +01001960 switch (img_req->op_type) {
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001961 case OBJ_OP_READ:
1962 ret = rbd_obj_setup_read(obj_req);
1963 break;
1964 case OBJ_OP_WRITE:
1965 ret = rbd_obj_setup_write(obj_req);
1966 break;
1967 case OBJ_OP_DISCARD:
1968 ret = rbd_obj_setup_discard(obj_req);
1969 break;
Ilya Dryomov6484cbe2019-01-29 12:46:25 +01001970 case OBJ_OP_ZEROOUT:
1971 ret = rbd_obj_setup_zeroout(obj_req);
1972 break;
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001973 default:
1974 rbd_assert(0);
1975 }
1976 if (ret)
1977 return ret;
Ilya Dryomov26f887e2018-10-15 16:11:37 +02001978
1979 ret = ceph_osdc_alloc_messages(obj_req->osd_req, GFP_NOIO);
1980 if (ret)
1981 return ret;
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001982 }
1983
1984 return 0;
1985}
1986
Ilya Dryomov5a237812018-02-06 19:26:34 +01001987union rbd_img_fill_iter {
1988 struct ceph_bio_iter bio_iter;
1989 struct ceph_bvec_iter bvec_iter;
1990};
1991
1992struct rbd_img_fill_ctx {
1993 enum obj_request_type pos_type;
1994 union rbd_img_fill_iter *pos;
1995 union rbd_img_fill_iter iter;
1996 ceph_object_extent_fn_t set_pos_fn;
Ilya Dryomovafb97882018-02-06 19:26:35 +01001997 ceph_object_extent_fn_t count_fn;
1998 ceph_object_extent_fn_t copy_fn;
Ilya Dryomov5a237812018-02-06 19:26:34 +01001999};
2000
2001static struct ceph_object_extent *alloc_object_extent(void *arg)
2002{
2003 struct rbd_img_request *img_req = arg;
2004 struct rbd_obj_request *obj_req;
2005
2006 obj_req = rbd_obj_request_create();
2007 if (!obj_req)
2008 return NULL;
2009
2010 rbd_img_obj_request_add(img_req, obj_req);
2011 return &obj_req->ex;
2012}
2013
2014/*
Ilya Dryomovafb97882018-02-06 19:26:35 +01002015 * While su != os && sc == 1 is technically not fancy (it's the same
2016 * layout as su == os && sc == 1), we can't use the nocopy path for it
2017 * because ->set_pos_fn() should be called only once per object.
2018 * ceph_file_to_extents() invokes action_fn once per stripe unit, so
2019 * treat su != os && sc == 1 as fancy.
Ilya Dryomov5a237812018-02-06 19:26:34 +01002020 */
Ilya Dryomovafb97882018-02-06 19:26:35 +01002021static bool rbd_layout_is_fancy(struct ceph_file_layout *l)
2022{
2023 return l->stripe_unit != l->object_size;
2024}
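
/*
 * Illustrative example (not part of the driver): stripe_unit == 4M
 * with object_size == 4M is the traditional layout and takes the
 * nocopy path, while stripe_unit == 1M with object_size == 4M is
 * fancy: a single bio_vec may then straddle stripe unit boundaries,
 * which is what the count and copy passes below deal with.
 */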
2025
2026static int rbd_img_fill_request_nocopy(struct rbd_img_request *img_req,
2027 struct ceph_file_extent *img_extents,
2028 u32 num_img_extents,
2029 struct rbd_img_fill_ctx *fctx)
Ilya Dryomov5a237812018-02-06 19:26:34 +01002030{
2031 u32 i;
2032 int ret;
2033
2034 img_req->data_type = fctx->pos_type;
2035
2036 /*
2037 * Create object requests and set each object request's starting
2038 * position in the provided bio (list) or bio_vec array.
2039 */
2040 fctx->iter = *fctx->pos;
2041 for (i = 0; i < num_img_extents; i++) {
2042 ret = ceph_file_to_extents(&img_req->rbd_dev->layout,
2043 img_extents[i].fe_off,
2044 img_extents[i].fe_len,
2045 &img_req->object_extents,
2046 alloc_object_extent, img_req,
2047 fctx->set_pos_fn, &fctx->iter);
2048 if (ret)
2049 return ret;
2050 }
2051
2052 return __rbd_img_fill_request(img_req);
2053}
2054
Ilya Dryomovafb97882018-02-06 19:26:35 +01002055/*
2056 * Map a list of image extents to a list of object extents, create the
2057 * corresponding object requests (normally each to a different object,
2058 * but not always) and add them to @img_req. For each object request,
2059 * set up its data descriptor to point to the corresponding chunk(s) of
2060 * @fctx->pos data buffer.
2061 *
2062 * Because ceph_file_to_extents() will merge adjacent object extents
2063 * together, each object request's data descriptor may point to multiple
2064 * different chunks of @fctx->pos data buffer.
2065 *
2066 * @fctx->pos data buffer is assumed to be large enough.
2067 */
2068static int rbd_img_fill_request(struct rbd_img_request *img_req,
2069 struct ceph_file_extent *img_extents,
2070 u32 num_img_extents,
2071 struct rbd_img_fill_ctx *fctx)
2072{
2073 struct rbd_device *rbd_dev = img_req->rbd_dev;
2074 struct rbd_obj_request *obj_req;
2075 u32 i;
2076 int ret;
2077
2078 if (fctx->pos_type == OBJ_REQUEST_NODATA ||
2079 !rbd_layout_is_fancy(&rbd_dev->layout))
2080 return rbd_img_fill_request_nocopy(img_req, img_extents,
2081 num_img_extents, fctx);
2082
2083 img_req->data_type = OBJ_REQUEST_OWN_BVECS;
2084
2085 /*
2086 * Create object requests and determine ->bvec_count for each object
2087 * request. Note that ->bvec_count sum over all object requests may
2088 * be greater than the number of bio_vecs in the provided bio (list)
2089 * or bio_vec array because when mapped, those bio_vecs can straddle
2090 * stripe unit boundaries.
2091 */
2092 fctx->iter = *fctx->pos;
2093 for (i = 0; i < num_img_extents; i++) {
2094 ret = ceph_file_to_extents(&rbd_dev->layout,
2095 img_extents[i].fe_off,
2096 img_extents[i].fe_len,
2097 &img_req->object_extents,
2098 alloc_object_extent, img_req,
2099 fctx->count_fn, &fctx->iter);
2100 if (ret)
2101 return ret;
2102 }
2103
2104 for_each_obj_request(img_req, obj_req) {
2105 obj_req->bvec_pos.bvecs = kmalloc_array(obj_req->bvec_count,
2106 sizeof(*obj_req->bvec_pos.bvecs),
2107 GFP_NOIO);
2108 if (!obj_req->bvec_pos.bvecs)
2109 return -ENOMEM;
Alex Elderb454e362013-04-19 15:34:50 -05002110 }
2111
2112 /*
Ilya Dryomovafb97882018-02-06 19:26:35 +01002113 * Fill in each object request's private bio_vec array, splitting and
2114 * rearranging the provided bio_vecs in stripe unit chunks as needed.
Alex Elderb454e362013-04-19 15:34:50 -05002115 */
Ilya Dryomovafb97882018-02-06 19:26:35 +01002116 fctx->iter = *fctx->pos;
2117 for (i = 0; i < num_img_extents; i++) {
2118 ret = ceph_iterate_extents(&rbd_dev->layout,
2119 img_extents[i].fe_off,
2120 img_extents[i].fe_len,
2121 &img_req->object_extents,
2122 fctx->copy_fn, &fctx->iter);
2123 if (ret)
2124 return ret;
2125 }
Alex Elder3d7efd12013-04-19 15:34:50 -05002126
Ilya Dryomovafb97882018-02-06 19:26:35 +01002127 return __rbd_img_fill_request(img_req);
Alex Elderb454e362013-04-19 15:34:50 -05002128}
2129
Ilya Dryomov5a237812018-02-06 19:26:34 +01002130static int rbd_img_fill_nodata(struct rbd_img_request *img_req,
2131 u64 off, u64 len)
2132{
2133 struct ceph_file_extent ex = { off, len };
2134 union rbd_img_fill_iter dummy;
2135 struct rbd_img_fill_ctx fctx = {
2136 .pos_type = OBJ_REQUEST_NODATA,
2137 .pos = &dummy,
2138 };
2139
2140 return rbd_img_fill_request(img_req, &ex, 1, &fctx);
2141}
2142
2143static void set_bio_pos(struct ceph_object_extent *ex, u32 bytes, void *arg)
2144{
2145 struct rbd_obj_request *obj_req =
2146 container_of(ex, struct rbd_obj_request, ex);
2147 struct ceph_bio_iter *it = arg;
2148
2149 dout("%s objno %llu bytes %u\n", __func__, ex->oe_objno, bytes);
2150 obj_req->bio_pos = *it;
2151 ceph_bio_iter_advance(it, bytes);
2152}
2153
Ilya Dryomovafb97882018-02-06 19:26:35 +01002154static void count_bio_bvecs(struct ceph_object_extent *ex, u32 bytes, void *arg)
2155{
2156 struct rbd_obj_request *obj_req =
2157 container_of(ex, struct rbd_obj_request, ex);
2158 struct ceph_bio_iter *it = arg;
2159
2160 dout("%s objno %llu bytes %u\n", __func__, ex->oe_objno, bytes);
2161 ceph_bio_iter_advance_step(it, bytes, ({
2162 obj_req->bvec_count++;
2163 }));
2165}
2166
2167static void copy_bio_bvecs(struct ceph_object_extent *ex, u32 bytes, void *arg)
2168{
2169 struct rbd_obj_request *obj_req =
2170 container_of(ex, struct rbd_obj_request, ex);
2171 struct ceph_bio_iter *it = arg;
2172
2173 dout("%s objno %llu bytes %u\n", __func__, ex->oe_objno, bytes);
2174 ceph_bio_iter_advance_step(it, bytes, ({
2175 obj_req->bvec_pos.bvecs[obj_req->bvec_idx++] = bv;
2176 obj_req->bvec_pos.iter.bi_size += bv.bv_len;
2177 }));
2178}
2179
Ilya Dryomov5a237812018-02-06 19:26:34 +01002180static int __rbd_img_fill_from_bio(struct rbd_img_request *img_req,
2181 struct ceph_file_extent *img_extents,
2182 u32 num_img_extents,
2183 struct ceph_bio_iter *bio_pos)
2184{
2185 struct rbd_img_fill_ctx fctx = {
2186 .pos_type = OBJ_REQUEST_BIO,
2187 .pos = (union rbd_img_fill_iter *)bio_pos,
2188 .set_pos_fn = set_bio_pos,
Ilya Dryomovafb97882018-02-06 19:26:35 +01002189 .count_fn = count_bio_bvecs,
2190 .copy_fn = copy_bio_bvecs,
Ilya Dryomov5a237812018-02-06 19:26:34 +01002191 };
2192
2193 return rbd_img_fill_request(img_req, img_extents, num_img_extents,
2194 &fctx);
2195}
2196
2197static int rbd_img_fill_from_bio(struct rbd_img_request *img_req,
2198 u64 off, u64 len, struct bio *bio)
2199{
2200 struct ceph_file_extent ex = { off, len };
2201 struct ceph_bio_iter it = { .bio = bio, .iter = bio->bi_iter };
2202
2203 return __rbd_img_fill_from_bio(img_req, &ex, 1, &it);
2204}
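
/*
 * Usage sketch (simplified from the request path further down in this
 * file): a block layer read or write is mapped in a single call, e.g.
 *
 *	ret = rbd_img_fill_from_bio(img_request, offset, length,
 *				    rq->bio);
 *
 * whereas discard/zeroout requests carry no data and use
 * rbd_img_fill_nodata(img_request, offset, length) instead.
 */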
2205
2206static void set_bvec_pos(struct ceph_object_extent *ex, u32 bytes, void *arg)
2207{
2208 struct rbd_obj_request *obj_req =
2209 container_of(ex, struct rbd_obj_request, ex);
2210 struct ceph_bvec_iter *it = arg;
2211
2212 obj_req->bvec_pos = *it;
2213 ceph_bvec_iter_shorten(&obj_req->bvec_pos, bytes);
2214 ceph_bvec_iter_advance(it, bytes);
2215}
2216
Ilya Dryomovafb97882018-02-06 19:26:35 +01002217static void count_bvecs(struct ceph_object_extent *ex, u32 bytes, void *arg)
2218{
2219 struct rbd_obj_request *obj_req =
2220 container_of(ex, struct rbd_obj_request, ex);
2221 struct ceph_bvec_iter *it = arg;
2222
2223 ceph_bvec_iter_advance_step(it, bytes, ({
2224 obj_req->bvec_count++;
2225 }));
2226}
2227
2228static void copy_bvecs(struct ceph_object_extent *ex, u32 bytes, void *arg)
2229{
2230 struct rbd_obj_request *obj_req =
2231 container_of(ex, struct rbd_obj_request, ex);
2232 struct ceph_bvec_iter *it = arg;
2233
2234 ceph_bvec_iter_advance_step(it, bytes, ({
2235 obj_req->bvec_pos.bvecs[obj_req->bvec_idx++] = bv;
2236 obj_req->bvec_pos.iter.bi_size += bv.bv_len;
2237 }));
2238}
2239
Ilya Dryomov5a237812018-02-06 19:26:34 +01002240static int __rbd_img_fill_from_bvecs(struct rbd_img_request *img_req,
2241 struct ceph_file_extent *img_extents,
2242 u32 num_img_extents,
2243 struct ceph_bvec_iter *bvec_pos)
2244{
2245 struct rbd_img_fill_ctx fctx = {
2246 .pos_type = OBJ_REQUEST_BVECS,
2247 .pos = (union rbd_img_fill_iter *)bvec_pos,
2248 .set_pos_fn = set_bvec_pos,
Ilya Dryomovafb97882018-02-06 19:26:35 +01002249 .count_fn = count_bvecs,
2250 .copy_fn = copy_bvecs,
Ilya Dryomov5a237812018-02-06 19:26:34 +01002251 };
2252
2253 return rbd_img_fill_request(img_req, img_extents, num_img_extents,
2254 &fctx);
2255}
2256
2257static int rbd_img_fill_from_bvecs(struct rbd_img_request *img_req,
2258 struct ceph_file_extent *img_extents,
2259 u32 num_img_extents,
2260 struct bio_vec *bvecs)
2261{
2262 struct ceph_bvec_iter it = {
2263 .bvecs = bvecs,
2264 .iter = { .bi_size = ceph_file_extents_bytes(img_extents,
2265 num_img_extents) },
2266 };
2267
2268 return __rbd_img_fill_from_bvecs(img_req, img_extents, num_img_extents,
2269 &it);
2270}
2271
Ilya Dryomovefbd1a12018-01-30 17:52:11 +01002272static void rbd_img_request_submit(struct rbd_img_request *img_request)
Alex Elderbf0d5f502012-11-22 00:00:08 -06002273{
Alex Elderbf0d5f502012-11-22 00:00:08 -06002274 struct rbd_obj_request *obj_request;
2275
Alex Elder37206ee2013-02-20 17:32:08 -06002276 dout("%s: img %p\n", __func__, img_request);
Alex Elderbf0d5f502012-11-22 00:00:08 -06002277
Ilya Dryomov663ae2c2016-05-16 13:18:57 +02002278 rbd_img_request_get(img_request);
Ilya Dryomovefbd1a12018-01-30 17:52:11 +01002279 for_each_obj_request(img_request, obj_request)
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002280 rbd_obj_request_submit(obj_request);
Alex Elderbf0d5f502012-11-22 00:00:08 -06002281
Ilya Dryomov663ae2c2016-05-16 13:18:57 +02002282 rbd_img_request_put(img_request);
Alex Elderbf0d5f502012-11-22 00:00:08 -06002283}
2284
Ilya Dryomov86bd7992018-02-06 19:26:33 +01002285static int rbd_obj_read_from_parent(struct rbd_obj_request *obj_req)
Alex Elder8b3e1a52013-01-24 16:13:36 -06002286{
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002287 struct rbd_img_request *img_req = obj_req->img_request;
2288 struct rbd_img_request *child_img_req;
2289 int ret;
Alex Elder8b3e1a52013-01-24 16:13:36 -06002290
Ilya Dryomove93aca02018-02-06 19:26:35 +01002291 child_img_req = rbd_img_request_create(img_req->rbd_dev->parent,
2292 OBJ_OP_READ, NULL);
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002293 if (!child_img_req)
2294 return -ENOMEM;
Alex Elder8b3e1a52013-01-24 16:13:36 -06002295
Ilya Dryomove93aca02018-02-06 19:26:35 +01002296 __set_bit(IMG_REQ_CHILD, &child_img_req->flags);
2297 child_img_req->obj_request = obj_req;
Alex Elder02c74fb2013-05-06 17:40:33 -05002298
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002299 if (!rbd_img_is_write(img_req)) {
Ilya Dryomovecc633c2018-02-01 11:50:47 +01002300 switch (img_req->data_type) {
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002301 case OBJ_REQUEST_BIO:
Ilya Dryomov5a237812018-02-06 19:26:34 +01002302 ret = __rbd_img_fill_from_bio(child_img_req,
2303 obj_req->img_extents,
2304 obj_req->num_img_extents,
2305 &obj_req->bio_pos);
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002306 break;
2307 case OBJ_REQUEST_BVECS:
Ilya Dryomovafb97882018-02-06 19:26:35 +01002308 case OBJ_REQUEST_OWN_BVECS:
Ilya Dryomov5a237812018-02-06 19:26:34 +01002309 ret = __rbd_img_fill_from_bvecs(child_img_req,
2310 obj_req->img_extents,
2311 obj_req->num_img_extents,
2312 &obj_req->bvec_pos);
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002313 break;
2314 default:
2315 rbd_assert(0);
2316 }
2317 } else {
Ilya Dryomov5a237812018-02-06 19:26:34 +01002318 ret = rbd_img_fill_from_bvecs(child_img_req,
2319 obj_req->img_extents,
2320 obj_req->num_img_extents,
2321 obj_req->copyup_bvecs);
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002322 }
2323 if (ret) {
2324 rbd_img_request_put(child_img_req);
2325 return ret;
2326 }
2327
2328 rbd_img_request_submit(child_img_req);
2329 return 0;
2330}
2331
2332static bool rbd_obj_handle_read(struct rbd_obj_request *obj_req)
2333{
2334 struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
2335 int ret;
2336
2337 if (obj_req->result == -ENOENT &&
Ilya Dryomov86bd7992018-02-06 19:26:33 +01002338 rbd_dev->parent_overlap && !obj_req->tried_parent) {
2339 /* reverse map this object extent onto the parent */
2340 ret = rbd_obj_calc_img_extents(obj_req, false);
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002341 if (ret) {
2342 obj_req->result = ret;
2343 return true;
2344 }
Ilya Dryomov86bd7992018-02-06 19:26:33 +01002345
2346 if (obj_req->num_img_extents) {
2347 obj_req->tried_parent = true;
2348 ret = rbd_obj_read_from_parent(obj_req);
2349 if (ret) {
2350 obj_req->result = ret;
2351 return true;
2352 }
2353 return false;
2354 }
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002355 }
Alex Elder02c74fb2013-05-06 17:40:33 -05002356
2357 /*
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002358 * -ENOENT means a hole in the image -- zero-fill the entire
2359 * length of the request. A short read also implies zero-fill
2360 * to the end of the request. In both cases we update xferred
2361 * count to indicate the whole request was satisfied.
Alex Elder02c74fb2013-05-06 17:40:33 -05002362 */
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002363 if (obj_req->result == -ENOENT ||
Ilya Dryomov43df3d32018-02-02 15:23:22 +01002364 (!obj_req->result && obj_req->xferred < obj_req->ex.oe_len)) {
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002365 rbd_assert(!obj_req->xferred || !obj_req->result);
2366 rbd_obj_zero_range(obj_req, obj_req->xferred,
Ilya Dryomov43df3d32018-02-02 15:23:22 +01002367 obj_req->ex.oe_len - obj_req->xferred);
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002368 obj_req->result = 0;
Ilya Dryomov43df3d32018-02-02 15:23:22 +01002369 obj_req->xferred = obj_req->ex.oe_len;
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002370 }
2371
2372 return true;
2373}
2374
2375/*
2376 * copyup_bvecs pages are never highmem pages
2377 */
2378static bool is_zero_bvecs(struct bio_vec *bvecs, u32 bytes)
2379{
2380 struct ceph_bvec_iter it = {
2381 .bvecs = bvecs,
2382 .iter = { .bi_size = bytes },
2383 };
2384
2385 ceph_bvec_iter_advance_step(&it, bytes, ({
2386 if (memchr_inv(page_address(bv.bv_page) + bv.bv_offset, 0,
2387 bv.bv_len))
2388 return false;
2389 }));
2390 return true;
2391}
2392
2393static int rbd_obj_issue_copyup(struct rbd_obj_request *obj_req, u32 bytes)
2394{
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002395 unsigned int num_osd_ops = obj_req->osd_req->r_num_ops;
Chengguang Xufe943d52018-04-12 12:04:55 +08002396 int ret;
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002397
2398 dout("%s obj_req %p bytes %u\n", __func__, obj_req, bytes);
2399 rbd_assert(obj_req->osd_req->r_ops[0].op == CEPH_OSD_OP_STAT);
2400 rbd_osd_req_destroy(obj_req->osd_req);
2401
2402 /*
2403 * Create a copyup request with the same number of OSD ops as
2404 * the original request. The original request was stat + op(s),
2405 * the new copyup request will be copyup + the same op(s).
2406 */
Ilya Dryomova162b302018-01-30 17:52:10 +01002407 obj_req->osd_req = rbd_osd_req_create(obj_req, num_osd_ops);
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002408 if (!obj_req->osd_req)
2409 return -ENOMEM;
2410
Ilya Dryomov24639ce562018-09-26 19:12:07 +02002411 ret = osd_req_op_cls_init(obj_req->osd_req, 0, "rbd", "copyup");
Chengguang Xufe943d52018-04-12 12:04:55 +08002412 if (ret)
2413 return ret;
2414
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002415 /*
2416 * Only send non-zero copyup data to save some I/O and network
2417 * bandwidth -- zero copyup data is equivalent to the object not
2418 * existing.
2419 */
2420 if (is_zero_bvecs(obj_req->copyup_bvecs, bytes)) {
2421 dout("%s obj_req %p detected zeroes\n", __func__, obj_req);
2422 bytes = 0;
2423 }
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002424 osd_req_op_cls_request_data_bvecs(obj_req->osd_req, 0,
Ilya Dryomov0010f702018-05-04 16:57:30 +02002425 obj_req->copyup_bvecs,
2426 obj_req->copyup_bvec_count,
2427 bytes);
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002428
Ilya Dryomov9bb02482018-01-30 17:52:10 +01002429 switch (obj_req->img_request->op_type) {
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002430 case OBJ_OP_WRITE:
2431 __rbd_obj_setup_write(obj_req, 1);
2432 break;
Ilya Dryomov6484cbe2019-01-29 12:46:25 +01002433 case OBJ_OP_ZEROOUT:
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002434 rbd_assert(!rbd_obj_is_entire(obj_req));
Ilya Dryomov6484cbe2019-01-29 12:46:25 +01002435 __rbd_obj_setup_zeroout(obj_req, 1);
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002436 break;
2437 default:
2438 rbd_assert(0);
2439 }
2440
Ilya Dryomov26f887e2018-10-15 16:11:37 +02002441 ret = ceph_osdc_alloc_messages(obj_req->osd_req, GFP_NOIO);
2442 if (ret)
2443 return ret;
2444
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002445 rbd_obj_request_submit(obj_req);
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002446 return 0;
2447}
2448
Ilya Dryomov7e07efb2018-01-20 10:30:11 +01002449static int setup_copyup_bvecs(struct rbd_obj_request *obj_req, u64 obj_overlap)
2450{
2451 u32 i;
2452
2453 rbd_assert(!obj_req->copyup_bvecs);
2454 obj_req->copyup_bvec_count = calc_pages_for(0, obj_overlap);
2455 obj_req->copyup_bvecs = kcalloc(obj_req->copyup_bvec_count,
2456 sizeof(*obj_req->copyup_bvecs),
2457 GFP_NOIO);
2458 if (!obj_req->copyup_bvecs)
2459 return -ENOMEM;
2460
2461 for (i = 0; i < obj_req->copyup_bvec_count; i++) {
2462 unsigned int len = min(obj_overlap, (u64)PAGE_SIZE);
2463
2464 obj_req->copyup_bvecs[i].bv_page = alloc_page(GFP_NOIO);
2465 if (!obj_req->copyup_bvecs[i].bv_page)
2466 return -ENOMEM;
2467
2468 obj_req->copyup_bvecs[i].bv_offset = 0;
2469 obj_req->copyup_bvecs[i].bv_len = len;
2470 obj_overlap -= len;
2471 }
2472
2473 rbd_assert(!obj_overlap);
2474 return 0;
2475}
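
/*
 * Illustrative example (not part of the driver): with 4K pages, an
 * object overlap of 10000 bytes gives copyup_bvec_count ==
 * calc_pages_for(0, 10000) == 3, with bio_vecs of lengths 4096, 4096
 * and 1808; the last page is only partially used.
 */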
2476
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002477static int rbd_obj_handle_write_guard(struct rbd_obj_request *obj_req)
2478{
2479 struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002480 int ret;
2481
Ilya Dryomov86bd7992018-02-06 19:26:33 +01002482 rbd_assert(obj_req->num_img_extents);
2483 prune_extents(obj_req->img_extents, &obj_req->num_img_extents,
2484 rbd_dev->parent_overlap);
2485 if (!obj_req->num_img_extents) {
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002486 /*
2487 * The overlap has become 0 (most likely because the
2488 * image has been flattened). Use rbd_obj_issue_copyup()
2489 * to re-submit the original write request -- the copyup
2490 * operation itself will be a no-op, since someone must
2491 * have populated the child object while we weren't
2492 * looking. Move to WRITE_FLAT state as we'll be done
2493 * with the operation once the null copyup completes.
2494 */
2495 obj_req->write_state = RBD_OBJ_WRITE_FLAT;
2496 return rbd_obj_issue_copyup(obj_req, 0);
2497 }
2498
Ilya Dryomov86bd7992018-02-06 19:26:33 +01002499 ret = setup_copyup_bvecs(obj_req, rbd_obj_img_extents_bytes(obj_req));
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002500 if (ret)
2501 return ret;
2502
2503 obj_req->write_state = RBD_OBJ_WRITE_COPYUP;
Ilya Dryomov86bd7992018-02-06 19:26:33 +01002504 return rbd_obj_read_from_parent(obj_req);
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002505}
2506
static bool rbd_obj_handle_write(struct rbd_obj_request *obj_req)
{
	int ret;

again:
	switch (obj_req->write_state) {
	case RBD_OBJ_WRITE_GUARD:
		rbd_assert(!obj_req->xferred);
		if (obj_req->result == -ENOENT) {
			/*
			 * The target object doesn't exist.  Read the data for
			 * the entire target object up to the overlap point (if
			 * any) from the parent, so we can use it for a copyup.
			 */
			ret = rbd_obj_handle_write_guard(obj_req);
			if (ret) {
				obj_req->result = ret;
				return true;
			}
			return false;
		}
		/* fall through */
	case RBD_OBJ_WRITE_FLAT:
		if (!obj_req->result)
			/*
			 * There is no such thing as a successful short
			 * write -- indicate the whole request was satisfied.
			 */
			obj_req->xferred = obj_req->ex.oe_len;
		return true;
	case RBD_OBJ_WRITE_COPYUP:
		obj_req->write_state = RBD_OBJ_WRITE_GUARD;
		if (obj_req->result)
			goto again;

		rbd_assert(obj_req->xferred);
		ret = rbd_obj_issue_copyup(obj_req, obj_req->xferred);
		if (ret) {
			obj_req->result = ret;
			return true;
		}
		return false;
	default:
		BUG();
	}
}

/*
 * Returns true if @obj_req is completed, or false otherwise.
 */
static bool __rbd_obj_handle_request(struct rbd_obj_request *obj_req)
{
	switch (obj_req->img_request->op_type) {
	case OBJ_OP_READ:
		return rbd_obj_handle_read(obj_req);
	case OBJ_OP_WRITE:
		return rbd_obj_handle_write(obj_req);
	case OBJ_OP_DISCARD:
	case OBJ_OP_ZEROOUT:
		if (rbd_obj_handle_write(obj_req)) {
			/*
			 * Hide -ENOENT from delete/truncate/zero -- discarding
			 * a non-existent object is not a problem.
			 */
			if (obj_req->result == -ENOENT) {
				obj_req->result = 0;
				obj_req->xferred = obj_req->ex.oe_len;
			}
			return true;
		}
		return false;
	default:
		BUG();
	}
}

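/*
 * Fold a completed object request into its image request: account the
 * transferred bytes on success, record the first error otherwise.
 * Called under completion_lock.
 */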
static void rbd_obj_end_request(struct rbd_obj_request *obj_req)
{
	struct rbd_img_request *img_req = obj_req->img_request;

	rbd_assert((!obj_req->result &&
		    obj_req->xferred == obj_req->ex.oe_len) ||
		   (obj_req->result < 0 && !obj_req->xferred));
	if (!obj_req->result) {
		img_req->xferred += obj_req->xferred;
		return;
	}

	rbd_warn(img_req->rbd_dev,
		 "%s at objno %llu %llu~%llu result %d xferred %llu",
		 obj_op_name(img_req->op_type), obj_req->ex.oe_objno,
		 obj_req->ex.oe_off, obj_req->ex.oe_len, obj_req->result,
		 obj_req->xferred);
	if (!img_req->result) {
		img_req->result = obj_req->result;
		img_req->xferred = 0;
	}
}

static void rbd_img_end_child_request(struct rbd_img_request *img_req)
{
	struct rbd_obj_request *obj_req = img_req->obj_request;

	rbd_assert(test_bit(IMG_REQ_CHILD, &img_req->flags));
	rbd_assert((!img_req->result &&
		    img_req->xferred == rbd_obj_img_extents_bytes(obj_req)) ||
		   (img_req->result < 0 && !img_req->xferred));

	obj_req->result = img_req->result;
	obj_req->xferred = img_req->xferred;
	rbd_img_request_put(img_req);
}

static void rbd_img_end_request(struct rbd_img_request *img_req)
{
	rbd_assert(!test_bit(IMG_REQ_CHILD, &img_req->flags));
	rbd_assert((!img_req->result &&
		    img_req->xferred == blk_rq_bytes(img_req->rq)) ||
		   (img_req->result < 0 && !img_req->xferred));

	blk_mq_end_request(img_req->rq,
			   errno_to_blk_status(img_req->result));
	rbd_img_request_put(img_req);
}

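/*
 * Completion entry point for an object request.  When the last pending
 * object request of an image request finishes, the image request is
 * ended -- for a child (parent read) image request, handling of the
 * originating object request is then resumed.
 */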
static void rbd_obj_handle_request(struct rbd_obj_request *obj_req)
{
	struct rbd_img_request *img_req;

again:
	if (!__rbd_obj_handle_request(obj_req))
		return;

	img_req = obj_req->img_request;
	spin_lock(&img_req->completion_lock);
	rbd_obj_end_request(obj_req);
	rbd_assert(img_req->pending_count);
	if (--img_req->pending_count) {
		spin_unlock(&img_req->completion_lock);
		return;
	}

	spin_unlock(&img_req->completion_lock);
	if (test_bit(IMG_REQ_CHILD, &img_req->flags)) {
		obj_req = img_req->obj_request;
		rbd_img_end_child_request(img_req);
		goto again;
	}
	rbd_img_end_request(img_req);
}

static const struct rbd_client_id rbd_empty_cid;

static bool rbd_cid_equal(const struct rbd_client_id *lhs,
			  const struct rbd_client_id *rhs)
{
	return lhs->gid == rhs->gid && lhs->handle == rhs->handle;
}

static struct rbd_client_id rbd_get_cid(struct rbd_device *rbd_dev)
{
	struct rbd_client_id cid;

	mutex_lock(&rbd_dev->watch_mutex);
	cid.gid = ceph_client_gid(rbd_dev->rbd_client->client);
	cid.handle = rbd_dev->watch_cookie;
	mutex_unlock(&rbd_dev->watch_mutex);
	return cid;
}

/*
 * lock_rwsem must be held for write
 */
static void rbd_set_owner_cid(struct rbd_device *rbd_dev,
			      const struct rbd_client_id *cid)
{
	dout("%s rbd_dev %p %llu-%llu -> %llu-%llu\n", __func__, rbd_dev,
	     rbd_dev->owner_cid.gid, rbd_dev->owner_cid.handle,
	     cid->gid, cid->handle);
	rbd_dev->owner_cid = *cid; /* struct */
}

static void format_lock_cookie(struct rbd_device *rbd_dev, char *buf)
{
	mutex_lock(&rbd_dev->watch_mutex);
	sprintf(buf, "%s %llu", RBD_LOCK_COOKIE_PREFIX, rbd_dev->watch_cookie);
	mutex_unlock(&rbd_dev->watch_mutex);
}

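/*
 * Record that we own the exclusive lock (cookie and owner id) and kick
 * off the ACQUIRED_LOCK notification.  Callers hold lock_rwsem for
 * write.
 */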
static void __rbd_lock(struct rbd_device *rbd_dev, const char *cookie)
{
	struct rbd_client_id cid = rbd_get_cid(rbd_dev);

	strcpy(rbd_dev->lock_cookie, cookie);
	rbd_set_owner_cid(rbd_dev, &cid);
	queue_work(rbd_dev->task_wq, &rbd_dev->acquired_lock_work);
}

/*
 * lock_rwsem must be held for write
 */
static int rbd_lock(struct rbd_device *rbd_dev)
{
	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
	char cookie[32];
	int ret;

	WARN_ON(__rbd_is_lock_owner(rbd_dev) ||
		rbd_dev->lock_cookie[0] != '\0');

	format_lock_cookie(rbd_dev, cookie);
	ret = ceph_cls_lock(osdc, &rbd_dev->header_oid, &rbd_dev->header_oloc,
			    RBD_LOCK_NAME, CEPH_CLS_LOCK_EXCLUSIVE, cookie,
			    RBD_LOCK_TAG, "", 0);
	if (ret)
		return ret;

	rbd_dev->lock_state = RBD_LOCK_STATE_LOCKED;
	__rbd_lock(rbd_dev, cookie);
	return 0;
}

/*
 * lock_rwsem must be held for write
 */
static void rbd_unlock(struct rbd_device *rbd_dev)
{
	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
	int ret;

	WARN_ON(!__rbd_is_lock_owner(rbd_dev) ||
		rbd_dev->lock_cookie[0] == '\0');

	ret = ceph_cls_unlock(osdc, &rbd_dev->header_oid, &rbd_dev->header_oloc,
			      RBD_LOCK_NAME, rbd_dev->lock_cookie);
	if (ret && ret != -ENOENT)
		rbd_warn(rbd_dev, "failed to unlock: %d", ret);

	/* treat errors as the image is unlocked */
	rbd_dev->lock_state = RBD_LOCK_STATE_UNLOCKED;
	rbd_dev->lock_cookie[0] = '\0';
	rbd_set_owner_cid(rbd_dev, &rbd_empty_cid);
	queue_work(rbd_dev->task_wq, &rbd_dev->released_lock_work);
}

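/*
 * Send a NotifyMessage (notify_op + our client id) on the header
 * object, optionally collecting the acknowledgements from the other
 * watchers.
 */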
static int __rbd_notify_op_lock(struct rbd_device *rbd_dev,
				enum rbd_notify_op notify_op,
				struct page ***preply_pages,
				size_t *preply_len)
{
	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
	struct rbd_client_id cid = rbd_get_cid(rbd_dev);
	char buf[4 + 8 + 8 + CEPH_ENCODING_START_BLK_LEN];
	int buf_size = sizeof(buf);
	void *p = buf;

	dout("%s rbd_dev %p notify_op %d\n", __func__, rbd_dev, notify_op);

	/* encode *LockPayload NotifyMessage (op + ClientId) */
	ceph_start_encoding(&p, 2, 1, buf_size - CEPH_ENCODING_START_BLK_LEN);
	ceph_encode_32(&p, notify_op);
	ceph_encode_64(&p, cid.gid);
	ceph_encode_64(&p, cid.handle);

	return ceph_osdc_notify(osdc, &rbd_dev->header_oid,
				&rbd_dev->header_oloc, buf, buf_size,
				RBD_NOTIFY_TIMEOUT, preply_pages, preply_len);
}

static void rbd_notify_op_lock(struct rbd_device *rbd_dev,
			       enum rbd_notify_op notify_op)
{
	struct page **reply_pages;
	size_t reply_len;

	__rbd_notify_op_lock(rbd_dev, notify_op, &reply_pages, &reply_len);
	ceph_release_page_vector(reply_pages, calc_pages_for(0, reply_len));
}

static void rbd_notify_acquired_lock(struct work_struct *work)
{
	struct rbd_device *rbd_dev = container_of(work, struct rbd_device,
						  acquired_lock_work);

	rbd_notify_op_lock(rbd_dev, RBD_NOTIFY_OP_ACQUIRED_LOCK);
}

static void rbd_notify_released_lock(struct work_struct *work)
{
	struct rbd_device *rbd_dev = container_of(work, struct rbd_device,
						  released_lock_work);

	rbd_notify_op_lock(rbd_dev, RBD_NOTIFY_OP_RELEASED_LOCK);
}

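/*
 * Ask the current lock owner to release the lock.  Returns the owner's
 * ResponseMessage result (0 or an error such as -EROFS), -ETIMEDOUT if
 * no owner responded or -EIO if more than one owner responded.
 */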
static int rbd_request_lock(struct rbd_device *rbd_dev)
{
	struct page **reply_pages;
	size_t reply_len;
	bool lock_owner_responded = false;
	int ret;

	dout("%s rbd_dev %p\n", __func__, rbd_dev);

	ret = __rbd_notify_op_lock(rbd_dev, RBD_NOTIFY_OP_REQUEST_LOCK,
				   &reply_pages, &reply_len);
	if (ret && ret != -ETIMEDOUT) {
		rbd_warn(rbd_dev, "failed to request lock: %d", ret);
		goto out;
	}

	if (reply_len > 0 && reply_len <= PAGE_SIZE) {
		void *p = page_address(reply_pages[0]);
		void *const end = p + reply_len;
		u32 n;

		ceph_decode_32_safe(&p, end, n, e_inval); /* num_acks */
		while (n--) {
			u8 struct_v;
			u32 len;

			ceph_decode_need(&p, end, 8 + 8, e_inval);
			p += 8 + 8; /* skip gid and cookie */

			ceph_decode_32_safe(&p, end, len, e_inval);
			if (!len)
				continue;

			if (lock_owner_responded) {
				rbd_warn(rbd_dev,
					 "duplicate lock owners detected");
				ret = -EIO;
				goto out;
			}

			lock_owner_responded = true;
			ret = ceph_start_decoding(&p, end, 1, "ResponseMessage",
						  &struct_v, &len);
			if (ret) {
				rbd_warn(rbd_dev,
					 "failed to decode ResponseMessage: %d",
					 ret);
				goto e_inval;
			}

			ret = ceph_decode_32(&p);
		}
	}

	if (!lock_owner_responded) {
		rbd_warn(rbd_dev, "no lock owners detected");
		ret = -ETIMEDOUT;
	}

out:
	ceph_release_page_vector(reply_pages, calc_pages_for(0, reply_len));
	return ret;

e_inval:
	ret = -EINVAL;
	goto out;
}

static void wake_requests(struct rbd_device *rbd_dev, bool wake_all)
{
	dout("%s rbd_dev %p wake_all %d\n", __func__, rbd_dev, wake_all);

	cancel_delayed_work(&rbd_dev->lock_dwork);
	if (wake_all)
		wake_up_all(&rbd_dev->lock_waitq);
	else
		wake_up(&rbd_dev->lock_waitq);
}

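/*
 * Look up the current owner(s) of the exclusive lock.  Returns -EBUSY
 * if the lock is held in a form we can't deal with: a foreign tag or
 * cookie, or a shared lock.
 */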
static int get_lock_owner_info(struct rbd_device *rbd_dev,
			       struct ceph_locker **lockers, u32 *num_lockers)
{
	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
	u8 lock_type;
	char *lock_tag;
	int ret;

	dout("%s rbd_dev %p\n", __func__, rbd_dev);

	ret = ceph_cls_lock_info(osdc, &rbd_dev->header_oid,
				 &rbd_dev->header_oloc, RBD_LOCK_NAME,
				 &lock_type, &lock_tag, lockers, num_lockers);
	if (ret)
		return ret;

	if (*num_lockers == 0) {
		dout("%s rbd_dev %p no lockers detected\n", __func__, rbd_dev);
		goto out;
	}

	if (strcmp(lock_tag, RBD_LOCK_TAG)) {
		rbd_warn(rbd_dev, "locked by external mechanism, tag %s",
			 lock_tag);
		ret = -EBUSY;
		goto out;
	}

	if (lock_type == CEPH_CLS_LOCK_SHARED) {
		rbd_warn(rbd_dev, "shared lock type detected");
		ret = -EBUSY;
		goto out;
	}

	if (strncmp((*lockers)[0].id.cookie, RBD_LOCK_COOKIE_PREFIX,
		    strlen(RBD_LOCK_COOKIE_PREFIX))) {
		rbd_warn(rbd_dev, "locked by external mechanism, cookie %s",
			 (*lockers)[0].id.cookie);
		ret = -EBUSY;
		goto out;
	}

out:
	kfree(lock_tag);
	return ret;
}

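/*
 * Determine whether the locker is still alive, i.e. whether it is
 * still watching the header object.  Returns 1 (and records the
 * owner's client id) if a matching watch is found, 0 if not.
 */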
static int find_watcher(struct rbd_device *rbd_dev,
			const struct ceph_locker *locker)
{
	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
	struct ceph_watch_item *watchers;
	u32 num_watchers;
	u64 cookie;
	int i;
	int ret;

	ret = ceph_osdc_list_watchers(osdc, &rbd_dev->header_oid,
				      &rbd_dev->header_oloc, &watchers,
				      &num_watchers);
	if (ret)
		return ret;

	sscanf(locker->id.cookie, RBD_LOCK_COOKIE_PREFIX " %llu", &cookie);
	for (i = 0; i < num_watchers; i++) {
		if (!memcmp(&watchers[i].addr, &locker->info.addr,
			    sizeof(locker->info.addr)) &&
		    watchers[i].cookie == cookie) {
			struct rbd_client_id cid = {
				.gid = le64_to_cpu(watchers[i].name.num),
				.handle = cookie,
			};

			dout("%s rbd_dev %p found cid %llu-%llu\n", __func__,
			     rbd_dev, cid.gid, cid.handle);
			rbd_set_owner_cid(rbd_dev, &cid);
			ret = 1;
			goto out;
		}
	}

	dout("%s rbd_dev %p no watchers\n", __func__, rbd_dev);
	ret = 0;
out:
	kfree(watchers);
	return ret;
}

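/*
 * Attempt to acquire the exclusive lock, breaking it if the current
 * owner appears to be dead (not watching the header object anymore).
 * A dead owner is blacklisted first so that it can't interfere if it
 * comes back.
 */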
2969/*
2970 * lock_rwsem must be held for write
2971 */
2972static int rbd_try_lock(struct rbd_device *rbd_dev)
2973{
2974 struct ceph_client *client = rbd_dev->rbd_client->client;
2975 struct ceph_locker *lockers;
2976 u32 num_lockers;
2977 int ret;
2978
2979 for (;;) {
2980 ret = rbd_lock(rbd_dev);
2981 if (ret != -EBUSY)
2982 return ret;
2983
2984 /* determine if the current lock holder is still alive */
2985 ret = get_lock_owner_info(rbd_dev, &lockers, &num_lockers);
2986 if (ret)
2987 return ret;
2988
2989 if (num_lockers == 0)
2990 goto again;
2991
2992 ret = find_watcher(rbd_dev, lockers);
2993 if (ret) {
2994 if (ret > 0)
2995 ret = 0; /* have to request lock */
2996 goto out;
2997 }
2998
2999 rbd_warn(rbd_dev, "%s%llu seems dead, breaking lock",
3000 ENTITY_NAME(lockers[0].id.name));
3001
3002 ret = ceph_monc_blacklist_add(&client->monc,
3003 &lockers[0].info.addr);
3004 if (ret) {
3005 rbd_warn(rbd_dev, "blacklist of %s%llu failed: %d",
3006 ENTITY_NAME(lockers[0].id.name), ret);
3007 goto out;
3008 }
3009
3010 ret = ceph_cls_break_lock(&client->osdc, &rbd_dev->header_oid,
3011 &rbd_dev->header_oloc, RBD_LOCK_NAME,
3012 lockers[0].id.cookie,
3013 &lockers[0].id.name);
3014 if (ret && ret != -ENOENT)
3015 goto out;
3016
3017again:
3018 ceph_free_lockers(lockers, num_lockers);
3019 }
3020
3021out:
3022 ceph_free_lockers(lockers, num_lockers);
3023 return ret;
3024}
3025
/*
 * ret is set only if lock_state is RBD_LOCK_STATE_UNLOCKED
 */
static enum rbd_lock_state rbd_try_acquire_lock(struct rbd_device *rbd_dev,
						int *pret)
{
	enum rbd_lock_state lock_state;

	down_read(&rbd_dev->lock_rwsem);
	dout("%s rbd_dev %p read lock_state %d\n", __func__, rbd_dev,
	     rbd_dev->lock_state);
	if (__rbd_is_lock_owner(rbd_dev)) {
		lock_state = rbd_dev->lock_state;
		up_read(&rbd_dev->lock_rwsem);
		return lock_state;
	}

	up_read(&rbd_dev->lock_rwsem);
	down_write(&rbd_dev->lock_rwsem);
	dout("%s rbd_dev %p write lock_state %d\n", __func__, rbd_dev,
	     rbd_dev->lock_state);
	if (!__rbd_is_lock_owner(rbd_dev)) {
		*pret = rbd_try_lock(rbd_dev);
		if (*pret)
			rbd_warn(rbd_dev, "failed to acquire lock: %d", *pret);
	}

	lock_state = rbd_dev->lock_state;
	up_write(&rbd_dev->lock_rwsem);
	return lock_state;
}

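/*
 * Delayed work that tries to acquire the exclusive lock, asking the
 * current owner to release it if necessary.  It is rescheduled until
 * the lock is acquired or further waiting becomes pointless.
 */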
static void rbd_acquire_lock(struct work_struct *work)
{
	struct rbd_device *rbd_dev = container_of(to_delayed_work(work),
					    struct rbd_device, lock_dwork);
	enum rbd_lock_state lock_state;
	int ret = 0;

	dout("%s rbd_dev %p\n", __func__, rbd_dev);
again:
	lock_state = rbd_try_acquire_lock(rbd_dev, &ret);
	if (lock_state != RBD_LOCK_STATE_UNLOCKED || ret == -EBLACKLISTED) {
		if (lock_state == RBD_LOCK_STATE_LOCKED)
			wake_requests(rbd_dev, true);
		dout("%s rbd_dev %p lock_state %d ret %d - done\n", __func__,
		     rbd_dev, lock_state, ret);
		return;
	}

	ret = rbd_request_lock(rbd_dev);
	if (ret == -ETIMEDOUT) {
		goto again; /* treat this as a dead client */
	} else if (ret == -EROFS) {
		rbd_warn(rbd_dev, "peer will not release lock");
		/*
		 * If this is rbd_add_acquire_lock(), we want to fail
		 * immediately -- reuse BLACKLISTED flag.  Otherwise we
		 * want to block.
		 */
		if (!(rbd_dev->disk->flags & GENHD_FL_UP)) {
			set_bit(RBD_DEV_FLAG_BLACKLISTED, &rbd_dev->flags);
			/* wake "rbd map --exclusive" process */
			wake_requests(rbd_dev, false);
		}
	} else if (ret < 0) {
		rbd_warn(rbd_dev, "error requesting lock: %d", ret);
		mod_delayed_work(rbd_dev->task_wq, &rbd_dev->lock_dwork,
				 RBD_RETRY_DELAY);
	} else {
		/*
		 * lock owner acked, but resend if we don't see them
		 * release the lock
		 */
		dout("%s rbd_dev %p requeueing lock_dwork\n", __func__,
		     rbd_dev);
		mod_delayed_work(rbd_dev->task_wq, &rbd_dev->lock_dwork,
		    msecs_to_jiffies(2 * RBD_NOTIFY_TIMEOUT * MSEC_PER_SEC));
	}
}

/*
 * lock_rwsem must be held for write
 */
static bool rbd_release_lock(struct rbd_device *rbd_dev)
{
	dout("%s rbd_dev %p read lock_state %d\n", __func__, rbd_dev,
	     rbd_dev->lock_state);
	if (rbd_dev->lock_state != RBD_LOCK_STATE_LOCKED)
		return false;

	rbd_dev->lock_state = RBD_LOCK_STATE_RELEASING;
	downgrade_write(&rbd_dev->lock_rwsem);
	/*
	 * Ensure that all in-flight IO is flushed.
	 *
	 * FIXME: ceph_osdc_sync() flushes the entire OSD client, which
	 * may be shared with other devices.
	 */
	ceph_osdc_sync(&rbd_dev->rbd_client->client->osdc);
	up_read(&rbd_dev->lock_rwsem);

	down_write(&rbd_dev->lock_rwsem);
	dout("%s rbd_dev %p write lock_state %d\n", __func__, rbd_dev,
	     rbd_dev->lock_state);
	if (rbd_dev->lock_state != RBD_LOCK_STATE_RELEASING)
		return false;

	rbd_unlock(rbd_dev);
	/*
	 * Give others a chance to grab the lock - we would re-acquire
	 * almost immediately if we got new IO during ceph_osdc_sync()
	 * otherwise.  We need to ack our own notifications, so this
	 * lock_dwork will be requeued from rbd_wait_state_locked()
	 * after wake_requests() in rbd_handle_released_lock().
	 */
	cancel_delayed_work(&rbd_dev->lock_dwork);
	return true;
}

static void rbd_release_lock_work(struct work_struct *work)
{
	struct rbd_device *rbd_dev = container_of(work, struct rbd_device,
						  unlock_work);

	down_write(&rbd_dev->lock_rwsem);
	rbd_release_lock(rbd_dev);
	up_write(&rbd_dev->lock_rwsem);
}

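/*
 * Handle an ACQUIRED_LOCK notification from the new lock owner: record
 * its client id and wake anyone waiting on the lock state.
 */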
static void rbd_handle_acquired_lock(struct rbd_device *rbd_dev, u8 struct_v,
				     void **p)
{
	struct rbd_client_id cid = { 0 };

	if (struct_v >= 2) {
		cid.gid = ceph_decode_64(p);
		cid.handle = ceph_decode_64(p);
	}

	dout("%s rbd_dev %p cid %llu-%llu\n", __func__, rbd_dev, cid.gid,
	     cid.handle);
	if (!rbd_cid_equal(&cid, &rbd_empty_cid)) {
		down_write(&rbd_dev->lock_rwsem);
		if (rbd_cid_equal(&cid, &rbd_dev->owner_cid)) {
			/*
			 * we already know that the remote client is
			 * the owner
			 */
			up_write(&rbd_dev->lock_rwsem);
			return;
		}

		rbd_set_owner_cid(rbd_dev, &cid);
		downgrade_write(&rbd_dev->lock_rwsem);
	} else {
		down_read(&rbd_dev->lock_rwsem);
	}

	if (!__rbd_is_lock_owner(rbd_dev))
		wake_requests(rbd_dev, false);
	up_read(&rbd_dev->lock_rwsem);
}

static void rbd_handle_released_lock(struct rbd_device *rbd_dev, u8 struct_v,
				     void **p)
{
	struct rbd_client_id cid = { 0 };

	if (struct_v >= 2) {
		cid.gid = ceph_decode_64(p);
		cid.handle = ceph_decode_64(p);
	}

	dout("%s rbd_dev %p cid %llu-%llu\n", __func__, rbd_dev, cid.gid,
	     cid.handle);
	if (!rbd_cid_equal(&cid, &rbd_empty_cid)) {
		down_write(&rbd_dev->lock_rwsem);
		if (!rbd_cid_equal(&cid, &rbd_dev->owner_cid)) {
			dout("%s rbd_dev %p unexpected owner, cid %llu-%llu != owner_cid %llu-%llu\n",
			     __func__, rbd_dev, cid.gid, cid.handle,
			     rbd_dev->owner_cid.gid, rbd_dev->owner_cid.handle);
			up_write(&rbd_dev->lock_rwsem);
			return;
		}

		rbd_set_owner_cid(rbd_dev, &rbd_empty_cid);
		downgrade_write(&rbd_dev->lock_rwsem);
	} else {
		down_read(&rbd_dev->lock_rwsem);
	}

	if (!__rbd_is_lock_owner(rbd_dev))
		wake_requests(rbd_dev, false);
	up_read(&rbd_dev->lock_rwsem);
}

/*
 * Returns result for ResponseMessage to be encoded (<= 0), or 1 if no
 * ResponseMessage is needed.
 */
static int rbd_handle_request_lock(struct rbd_device *rbd_dev, u8 struct_v,
				   void **p)
{
	struct rbd_client_id my_cid = rbd_get_cid(rbd_dev);
	struct rbd_client_id cid = { 0 };
	int result = 1;

	if (struct_v >= 2) {
		cid.gid = ceph_decode_64(p);
		cid.handle = ceph_decode_64(p);
	}

	dout("%s rbd_dev %p cid %llu-%llu\n", __func__, rbd_dev, cid.gid,
	     cid.handle);
	if (rbd_cid_equal(&cid, &my_cid))
		return result;

	down_read(&rbd_dev->lock_rwsem);
	if (__rbd_is_lock_owner(rbd_dev)) {
		if (rbd_dev->lock_state == RBD_LOCK_STATE_LOCKED &&
		    rbd_cid_equal(&rbd_dev->owner_cid, &rbd_empty_cid))
			goto out_unlock;

		/*
		 * encode ResponseMessage(0) so the peer can detect
		 * a missing owner
		 */
		result = 0;

		if (rbd_dev->lock_state == RBD_LOCK_STATE_LOCKED) {
			if (!rbd_dev->opts->exclusive) {
				dout("%s rbd_dev %p queueing unlock_work\n",
				     __func__, rbd_dev);
				queue_work(rbd_dev->task_wq,
					   &rbd_dev->unlock_work);
			} else {
				/* refuse to release the lock */
				result = -EROFS;
			}
		}
	}

out_unlock:
	up_read(&rbd_dev->lock_rwsem);
	return result;
}

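/*
 * Ack a notification, optionally encoding a ResponseMessage result
 * into the ack payload.
 */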
static void __rbd_acknowledge_notify(struct rbd_device *rbd_dev,
				     u64 notify_id, u64 cookie, s32 *result)
{
	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
	char buf[4 + CEPH_ENCODING_START_BLK_LEN];
	int buf_size = sizeof(buf);
	int ret;

	if (result) {
		void *p = buf;

		/* encode ResponseMessage */
		ceph_start_encoding(&p, 1, 1,
				    buf_size - CEPH_ENCODING_START_BLK_LEN);
		ceph_encode_32(&p, *result);
	} else {
		buf_size = 0;
	}

	ret = ceph_osdc_notify_ack(osdc, &rbd_dev->header_oid,
				   &rbd_dev->header_oloc, notify_id, cookie,
				   buf, buf_size);
	if (ret)
		rbd_warn(rbd_dev, "acknowledge_notify failed: %d", ret);
}

static void rbd_acknowledge_notify(struct rbd_device *rbd_dev, u64 notify_id,
				   u64 cookie)
{
	dout("%s rbd_dev %p\n", __func__, rbd_dev);
	__rbd_acknowledge_notify(rbd_dev, notify_id, cookie, NULL);
}

static void rbd_acknowledge_notify_result(struct rbd_device *rbd_dev,
					  u64 notify_id, u64 cookie, s32 result)
{
	dout("%s rbd_dev %p result %d\n", __func__, rbd_dev, result);
	__rbd_acknowledge_notify(rbd_dev, notify_id, cookie, &result);
}

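/*
 * Watch/notify callback for the header object: decode the incoming
 * NotifyMessage and dispatch on the notify op (exclusive lock traffic
 * or a header update).
 */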
static void rbd_watch_cb(void *arg, u64 notify_id, u64 cookie,
			 u64 notifier_id, void *data, size_t data_len)
{
	struct rbd_device *rbd_dev = arg;
	void *p = data;
	void *const end = p + data_len;
	u8 struct_v = 0;
	u32 len;
	u32 notify_op;
	int ret;

	dout("%s rbd_dev %p cookie %llu notify_id %llu data_len %zu\n",
	     __func__, rbd_dev, cookie, notify_id, data_len);
	if (data_len) {
		ret = ceph_start_decoding(&p, end, 1, "NotifyMessage",
					  &struct_v, &len);
		if (ret) {
			rbd_warn(rbd_dev, "failed to decode NotifyMessage: %d",
				 ret);
			return;
		}

		notify_op = ceph_decode_32(&p);
	} else {
		/* legacy notification for header updates */
		notify_op = RBD_NOTIFY_OP_HEADER_UPDATE;
		len = 0;
	}

	dout("%s rbd_dev %p notify_op %u\n", __func__, rbd_dev, notify_op);
	switch (notify_op) {
	case RBD_NOTIFY_OP_ACQUIRED_LOCK:
		rbd_handle_acquired_lock(rbd_dev, struct_v, &p);
		rbd_acknowledge_notify(rbd_dev, notify_id, cookie);
		break;
	case RBD_NOTIFY_OP_RELEASED_LOCK:
		rbd_handle_released_lock(rbd_dev, struct_v, &p);
		rbd_acknowledge_notify(rbd_dev, notify_id, cookie);
		break;
	case RBD_NOTIFY_OP_REQUEST_LOCK:
		ret = rbd_handle_request_lock(rbd_dev, struct_v, &p);
		if (ret <= 0)
			rbd_acknowledge_notify_result(rbd_dev, notify_id,
						      cookie, ret);
		else
			rbd_acknowledge_notify(rbd_dev, notify_id, cookie);
		break;
	case RBD_NOTIFY_OP_HEADER_UPDATE:
		ret = rbd_dev_refresh(rbd_dev);
		if (ret)
			rbd_warn(rbd_dev, "refresh failed: %d", ret);

		rbd_acknowledge_notify(rbd_dev, notify_id, cookie);
		break;
	default:
		if (rbd_is_lock_owner(rbd_dev))
			rbd_acknowledge_notify_result(rbd_dev, notify_id,
						      cookie, -EOPNOTSUPP);
		else
			rbd_acknowledge_notify(rbd_dev, notify_id, cookie);
		break;
	}
}

static void __rbd_unregister_watch(struct rbd_device *rbd_dev);

static void rbd_watch_errcb(void *arg, u64 cookie, int err)
{
	struct rbd_device *rbd_dev = arg;

	rbd_warn(rbd_dev, "encountered watch error: %d", err);

	down_write(&rbd_dev->lock_rwsem);
	rbd_set_owner_cid(rbd_dev, &rbd_empty_cid);
	up_write(&rbd_dev->lock_rwsem);

	mutex_lock(&rbd_dev->watch_mutex);
	if (rbd_dev->watch_state == RBD_WATCH_STATE_REGISTERED) {
		__rbd_unregister_watch(rbd_dev);
		rbd_dev->watch_state = RBD_WATCH_STATE_ERROR;

		queue_delayed_work(rbd_dev->task_wq, &rbd_dev->watch_dwork, 0);
	}
	mutex_unlock(&rbd_dev->watch_mutex);
}

/*
 * watch_mutex must be locked
 */
static int __rbd_register_watch(struct rbd_device *rbd_dev)
{
	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
	struct ceph_osd_linger_request *handle;

	rbd_assert(!rbd_dev->watch_handle);
	dout("%s rbd_dev %p\n", __func__, rbd_dev);

	handle = ceph_osdc_watch(osdc, &rbd_dev->header_oid,
				 &rbd_dev->header_oloc, rbd_watch_cb,
				 rbd_watch_errcb, rbd_dev);
	if (IS_ERR(handle))
		return PTR_ERR(handle);

	rbd_dev->watch_handle = handle;
	return 0;
}

/*
 * watch_mutex must be locked
 */
static void __rbd_unregister_watch(struct rbd_device *rbd_dev)
{
	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
	int ret;

	rbd_assert(rbd_dev->watch_handle);
	dout("%s rbd_dev %p\n", __func__, rbd_dev);

	ret = ceph_osdc_unwatch(osdc, rbd_dev->watch_handle);
	if (ret)
		rbd_warn(rbd_dev, "failed to unwatch: %d", ret);

	rbd_dev->watch_handle = NULL;
}

static int rbd_register_watch(struct rbd_device *rbd_dev)
{
	int ret;

	mutex_lock(&rbd_dev->watch_mutex);
	rbd_assert(rbd_dev->watch_state == RBD_WATCH_STATE_UNREGISTERED);
	ret = __rbd_register_watch(rbd_dev);
	if (ret)
		goto out;

	rbd_dev->watch_state = RBD_WATCH_STATE_REGISTERED;
	rbd_dev->watch_cookie = rbd_dev->watch_handle->linger_id;

out:
	mutex_unlock(&rbd_dev->watch_mutex);
	return ret;
}

static void cancel_tasks_sync(struct rbd_device *rbd_dev)
{
	dout("%s rbd_dev %p\n", __func__, rbd_dev);

	cancel_work_sync(&rbd_dev->acquired_lock_work);
	cancel_work_sync(&rbd_dev->released_lock_work);
	cancel_delayed_work_sync(&rbd_dev->lock_dwork);
	cancel_work_sync(&rbd_dev->unlock_work);
}

static void rbd_unregister_watch(struct rbd_device *rbd_dev)
{
	WARN_ON(waitqueue_active(&rbd_dev->lock_waitq));
	cancel_tasks_sync(rbd_dev);

	mutex_lock(&rbd_dev->watch_mutex);
	if (rbd_dev->watch_state == RBD_WATCH_STATE_REGISTERED)
		__rbd_unregister_watch(rbd_dev);
	rbd_dev->watch_state = RBD_WATCH_STATE_UNREGISTERED;
	mutex_unlock(&rbd_dev->watch_mutex);

	cancel_delayed_work_sync(&rbd_dev->watch_dwork);
	ceph_osdc_flush_notifies(&rbd_dev->rbd_client->client->osdc);
}

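/*
 * Update the lock cookie after the watch has been re-established (the
 * cookie embeds the watch id).  On OSDs that can't update a lock
 * cookie, fall back to a full release and delayed re-acquisition.
 */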
/*
 * lock_rwsem must be held for write
 */
static void rbd_reacquire_lock(struct rbd_device *rbd_dev)
{
	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
	char cookie[32];
	int ret;

	WARN_ON(rbd_dev->lock_state != RBD_LOCK_STATE_LOCKED);

	format_lock_cookie(rbd_dev, cookie);
	ret = ceph_cls_set_cookie(osdc, &rbd_dev->header_oid,
				  &rbd_dev->header_oloc, RBD_LOCK_NAME,
				  CEPH_CLS_LOCK_EXCLUSIVE, rbd_dev->lock_cookie,
				  RBD_LOCK_TAG, cookie);
	if (ret) {
		if (ret != -EOPNOTSUPP)
			rbd_warn(rbd_dev, "failed to update lock cookie: %d",
				 ret);

		/*
		 * Lock cookie cannot be updated on older OSDs, so do
		 * a manual release and queue an acquire.
		 */
		if (rbd_release_lock(rbd_dev))
			queue_delayed_work(rbd_dev->task_wq,
					   &rbd_dev->lock_dwork, 0);
	} else {
		__rbd_lock(rbd_dev, cookie);
	}
}

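/*
 * Delayed work that re-establishes the watch after an error, reacquires
 * the exclusive lock if we held it and refreshes the header.
 */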
static void rbd_reregister_watch(struct work_struct *work)
{
	struct rbd_device *rbd_dev = container_of(to_delayed_work(work),
					    struct rbd_device, watch_dwork);
	int ret;

	dout("%s rbd_dev %p\n", __func__, rbd_dev);

	mutex_lock(&rbd_dev->watch_mutex);
	if (rbd_dev->watch_state != RBD_WATCH_STATE_ERROR) {
		mutex_unlock(&rbd_dev->watch_mutex);
		return;
	}

	ret = __rbd_register_watch(rbd_dev);
	if (ret) {
		rbd_warn(rbd_dev, "failed to reregister watch: %d", ret);
		if (ret == -EBLACKLISTED || ret == -ENOENT) {
			set_bit(RBD_DEV_FLAG_BLACKLISTED, &rbd_dev->flags);
			wake_requests(rbd_dev, true);
		} else {
			queue_delayed_work(rbd_dev->task_wq,
					   &rbd_dev->watch_dwork,
					   RBD_RETRY_DELAY);
		}
		mutex_unlock(&rbd_dev->watch_mutex);
		return;
	}

	rbd_dev->watch_state = RBD_WATCH_STATE_REGISTERED;
	rbd_dev->watch_cookie = rbd_dev->watch_handle->linger_id;
	mutex_unlock(&rbd_dev->watch_mutex);

	down_write(&rbd_dev->lock_rwsem);
	if (rbd_dev->lock_state == RBD_LOCK_STATE_LOCKED)
		rbd_reacquire_lock(rbd_dev);
	up_write(&rbd_dev->lock_rwsem);

	ret = rbd_dev_refresh(rbd_dev);
	if (ret)
		rbd_warn(rbd_dev, "reregistration refresh failed: %d", ret);
}

/*
 * Synchronous osd object method call.  Returns the number of bytes
 * returned in the inbound buffer, or a negative error code.
 */
static int rbd_obj_method_sync(struct rbd_device *rbd_dev,
			       struct ceph_object_id *oid,
			       struct ceph_object_locator *oloc,
			       const char *method_name,
			       const void *outbound,
			       size_t outbound_size,
			       void *inbound,
			       size_t inbound_size)
{
	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
	struct page *req_page = NULL;
	struct page *reply_page;
	int ret;

	/*
	 * Method calls are ultimately read operations.  The result
	 * should be placed into the inbound buffer provided.  They
	 * also supply outbound data -- parameters for the object
	 * method.  Currently if this is present it will be a
	 * snapshot id.
	 */
	if (outbound) {
		if (outbound_size > PAGE_SIZE)
			return -E2BIG;

		req_page = alloc_page(GFP_KERNEL);
		if (!req_page)
			return -ENOMEM;

		memcpy(page_address(req_page), outbound, outbound_size);
	}

	reply_page = alloc_page(GFP_KERNEL);
	if (!reply_page) {
		if (req_page)
			__free_page(req_page);
		return -ENOMEM;
	}

	ret = ceph_osdc_call(osdc, oid, oloc, RBD_DRV_NAME, method_name,
			     CEPH_OSD_FLAG_READ, req_page, outbound_size,
			     reply_page, &inbound_size);
	if (!ret) {
		memcpy(inbound, page_address(reply_page), inbound_size);
		ret = inbound_size;
	}

	if (req_page)
		__free_page(req_page);
	__free_page(reply_page);
	return ret;
}

/*
 * lock_rwsem must be held for read
 */
static int rbd_wait_state_locked(struct rbd_device *rbd_dev, bool may_acquire)
{
	DEFINE_WAIT(wait);
	unsigned long timeout;
	int ret = 0;

	if (test_bit(RBD_DEV_FLAG_BLACKLISTED, &rbd_dev->flags))
		return -EBLACKLISTED;

	if (rbd_dev->lock_state == RBD_LOCK_STATE_LOCKED)
		return 0;

	if (!may_acquire) {
		rbd_warn(rbd_dev, "exclusive lock required");
		return -EROFS;
	}

	do {
		/*
		 * Note the use of mod_delayed_work() in rbd_acquire_lock()
		 * and cancel_delayed_work() in wake_requests().
		 */
		dout("%s rbd_dev %p queueing lock_dwork\n", __func__, rbd_dev);
		queue_delayed_work(rbd_dev->task_wq, &rbd_dev->lock_dwork, 0);
		prepare_to_wait_exclusive(&rbd_dev->lock_waitq, &wait,
					  TASK_UNINTERRUPTIBLE);
		up_read(&rbd_dev->lock_rwsem);
		timeout = schedule_timeout(ceph_timeout_jiffies(
						rbd_dev->opts->lock_timeout));
		down_read(&rbd_dev->lock_rwsem);
		if (test_bit(RBD_DEV_FLAG_BLACKLISTED, &rbd_dev->flags)) {
			ret = -EBLACKLISTED;
			break;
		}
		if (!timeout) {
			rbd_warn(rbd_dev, "timed out waiting for lock");
			ret = -ETIMEDOUT;
			break;
		}
	} while (rbd_dev->lock_state != RBD_LOCK_STATE_LOCKED);

	finish_wait(&rbd_dev->lock_waitq, &wait);
	return ret;
}

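/*
 * Service a block layer request: validate it, take the exclusive lock
 * if needed, build an image request out of it and submit it.
 */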
static void rbd_queue_workfn(struct work_struct *work)
{
	struct request *rq = blk_mq_rq_from_pdu(work);
	struct rbd_device *rbd_dev = rq->q->queuedata;
	struct rbd_img_request *img_request;
	struct ceph_snap_context *snapc = NULL;
	u64 offset = (u64)blk_rq_pos(rq) << SECTOR_SHIFT;
	u64 length = blk_rq_bytes(rq);
	enum obj_operation_type op_type;
	u64 mapping_size;
	bool must_be_locked;
	int result;

	switch (req_op(rq)) {
	case REQ_OP_DISCARD:
		op_type = OBJ_OP_DISCARD;
		break;
	case REQ_OP_WRITE_ZEROES:
		op_type = OBJ_OP_ZEROOUT;
		break;
	case REQ_OP_WRITE:
		op_type = OBJ_OP_WRITE;
		break;
	case REQ_OP_READ:
		op_type = OBJ_OP_READ;
		break;
	default:
		dout("%s: non-fs request type %d\n", __func__, req_op(rq));
		result = -EIO;
		goto err;
	}

	/* Ignore/skip any zero-length requests */

	if (!length) {
		dout("%s: zero-length request\n", __func__);
		result = 0;
		goto err_rq;
	}

	rbd_assert(op_type == OBJ_OP_READ ||
		   rbd_dev->spec->snap_id == CEPH_NOSNAP);

	/*
	 * Quit early if the mapped snapshot no longer exists.  It's
	 * still possible the snapshot will have disappeared by the
	 * time our request arrives at the osd, but there's no sense in
	 * sending it if we already know.
	 */
	if (!test_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags)) {
		dout("request for non-existent snapshot");
		rbd_assert(rbd_dev->spec->snap_id != CEPH_NOSNAP);
		result = -ENXIO;
		goto err_rq;
	}

	if (offset && length > U64_MAX - offset + 1) {
		rbd_warn(rbd_dev, "bad request range (%llu~%llu)", offset,
			 length);
		result = -EINVAL;
		goto err_rq;	/* Shouldn't happen */
	}

	blk_mq_start_request(rq);

	down_read(&rbd_dev->header_rwsem);
	mapping_size = rbd_dev->mapping.size;
	if (op_type != OBJ_OP_READ) {
		snapc = rbd_dev->header.snapc;
		ceph_get_snap_context(snapc);
	}
	up_read(&rbd_dev->header_rwsem);

	if (offset + length > mapping_size) {
		rbd_warn(rbd_dev, "beyond EOD (%llu~%llu > %llu)", offset,
			 length, mapping_size);
		result = -EIO;
		goto err_rq;
	}

	must_be_locked =
	    (rbd_dev->header.features & RBD_FEATURE_EXCLUSIVE_LOCK) &&
	    (op_type != OBJ_OP_READ || rbd_dev->opts->lock_on_read);
	if (must_be_locked) {
		down_read(&rbd_dev->lock_rwsem);
		result = rbd_wait_state_locked(rbd_dev,
					       !rbd_dev->opts->exclusive);
		if (result)
			goto err_unlock;
	}

	img_request = rbd_img_request_create(rbd_dev, op_type, snapc);
	if (!img_request) {
		result = -ENOMEM;
		goto err_unlock;
	}
	img_request->rq = rq;
	snapc = NULL; /* img_request consumes a ref */

	if (op_type == OBJ_OP_DISCARD || op_type == OBJ_OP_ZEROOUT)
		result = rbd_img_fill_nodata(img_request, offset, length);
	else
		result = rbd_img_fill_from_bio(img_request, offset, length,
					       rq->bio);
	if (result)
		goto err_img_request;

	rbd_img_request_submit(img_request);
	if (must_be_locked)
		up_read(&rbd_dev->lock_rwsem);
	return;

err_img_request:
	rbd_img_request_put(img_request);
err_unlock:
	if (must_be_locked)
		up_read(&rbd_dev->lock_rwsem);
err_rq:
	if (result)
		rbd_warn(rbd_dev, "%s %llx at %llx result %d",
			 obj_op_name(op_type), length, offset, result);
	ceph_put_snap_context(snapc);
err:
	blk_mq_end_request(rq, errno_to_blk_status(result));
}

static blk_status_t rbd_queue_rq(struct blk_mq_hw_ctx *hctx,
		const struct blk_mq_queue_data *bd)
{
	struct request *rq = bd->rq;
	struct work_struct *work = blk_mq_rq_to_pdu(rq);

	queue_work(rbd_wq, work);
	return BLK_STS_OK;
}

static void rbd_free_disk(struct rbd_device *rbd_dev)
{
	blk_cleanup_queue(rbd_dev->disk->queue);
	blk_mq_free_tag_set(&rbd_dev->tag_set);
	put_disk(rbd_dev->disk);
	rbd_dev->disk = NULL;
}

static int rbd_obj_read_sync(struct rbd_device *rbd_dev,
			     struct ceph_object_id *oid,
			     struct ceph_object_locator *oloc,
			     void *buf, int buf_len)
{
	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
	struct ceph_osd_request *req;
	struct page **pages;
	int num_pages = calc_pages_for(0, buf_len);
	int ret;

	req = ceph_osdc_alloc_request(osdc, NULL, 1, false, GFP_KERNEL);
	if (!req)
		return -ENOMEM;

	ceph_oid_copy(&req->r_base_oid, oid);
	ceph_oloc_copy(&req->r_base_oloc, oloc);
	req->r_flags = CEPH_OSD_FLAG_READ;

	pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL);
	if (IS_ERR(pages)) {
		ret = PTR_ERR(pages);
		goto out_req;
	}

	osd_req_op_extent_init(req, 0, CEPH_OSD_OP_READ, 0, buf_len, 0, 0);
	osd_req_op_extent_osd_data_pages(req, 0, pages, buf_len, 0, false,
					 true);

	ret = ceph_osdc_alloc_messages(req, GFP_KERNEL);
	if (ret)
		goto out_req;

	ceph_osdc_start_request(osdc, req, false);
	ret = ceph_osdc_wait_request(osdc, req);
	if (ret >= 0)
		ceph_copy_from_page_vector(pages, buf, 0, ret);

out_req:
	ceph_osdc_put_request(req);
	return ret;
}

/*
 * Read the complete header for the given rbd device.  On successful
 * return, the rbd_dev->header field will contain up-to-date
 * information about the image.
 */
static int rbd_dev_v1_header_info(struct rbd_device *rbd_dev)
{
	struct rbd_image_header_ondisk *ondisk = NULL;
	u32 snap_count = 0;
	u64 names_size = 0;
	u32 want_count;
	int ret;

	/*
	 * The complete header will include an array of its 64-bit
	 * snapshot ids, followed by the names of those snapshots as
	 * a contiguous block of NUL-terminated strings.  Note that
	 * the number of snapshots could change by the time we read
	 * it in, in which case we re-read it.
	 */
	do {
		size_t size;

		kfree(ondisk);

		size = sizeof (*ondisk);
		size += snap_count * sizeof (struct rbd_image_snap_ondisk);
		size += names_size;
		ondisk = kmalloc(size, GFP_KERNEL);
		if (!ondisk)
			return -ENOMEM;

		ret = rbd_obj_read_sync(rbd_dev, &rbd_dev->header_oid,
					&rbd_dev->header_oloc, ondisk, size);
		if (ret < 0)
			goto out;
		if ((size_t)ret < size) {
			ret = -ENXIO;
			rbd_warn(rbd_dev, "short header read (want %zd got %d)",
				 size, ret);
			goto out;
		}
		if (!rbd_dev_ondisk_valid(ondisk)) {
			ret = -ENXIO;
			rbd_warn(rbd_dev, "invalid header");
			goto out;
		}

		names_size = le64_to_cpu(ondisk->snap_names_len);
		want_count = snap_count;
		snap_count = le32_to_cpu(ondisk->snap_count);
	} while (snap_count != want_count);

	ret = rbd_header_from_disk(rbd_dev, ondisk);
out:
	kfree(ondisk);

	return ret;
}

/*
 * Clear the rbd device's EXISTS flag if the snapshot it's mapped to
 * has disappeared from the (just updated) snapshot context.
 */
static void rbd_exists_validate(struct rbd_device *rbd_dev)
{
	u64 snap_id;

	if (!test_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags))
		return;

	snap_id = rbd_dev->spec->snap_id;
	if (snap_id == CEPH_NOSNAP)
		return;

	if (rbd_dev_snap_index(rbd_dev, snap_id) == BAD_SNAP_INDEX)
		clear_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags);
}

static void rbd_dev_update_size(struct rbd_device *rbd_dev)
{
	sector_t size;

	/*
	 * If EXISTS is not set, rbd_dev->disk may be NULL, so don't
	 * try to update its size.  If REMOVING is set, updating size
	 * is just useless work since the device can't be opened.
	 */
	if (test_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags) &&
	    !test_bit(RBD_DEV_FLAG_REMOVING, &rbd_dev->flags)) {
		size = (sector_t)rbd_dev->mapping.size / SECTOR_SIZE;
		dout("setting size to %llu sectors", (unsigned long long)size);
		set_capacity(rbd_dev->disk, size);
		revalidate_disk(rbd_dev->disk);
	}
}

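/*
 * Re-read the image header and react to any changes: an image resize,
 * a vanished parent (the image was flattened) or a mapped snapshot
 * that no longer exists.
 */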
static int rbd_dev_refresh(struct rbd_device *rbd_dev)
{
	u64 mapping_size;
	int ret;

	down_write(&rbd_dev->header_rwsem);
	mapping_size = rbd_dev->mapping.size;

	ret = rbd_dev_header_info(rbd_dev);
	if (ret)
		goto out;

	/*
	 * If there is a parent, see if it has disappeared due to the
	 * mapped image getting flattened.
	 */
	if (rbd_dev->parent) {
		ret = rbd_dev_v2_parent_info(rbd_dev);
		if (ret)
			goto out;
	}

	if (rbd_dev->spec->snap_id == CEPH_NOSNAP) {
		rbd_dev->mapping.size = rbd_dev->header.image_size;
	} else {
		/* validate mapped snapshot's EXISTS flag */
		rbd_exists_validate(rbd_dev);
	}

out:
	up_write(&rbd_dev->header_rwsem);
	if (!ret && mapping_size != rbd_dev->mapping.size)
		rbd_dev_update_size(rbd_dev);

	return ret;
}

Christoph Hellwigd6296d392017-05-01 10:19:08 -06003985static int rbd_init_request(struct blk_mq_tag_set *set, struct request *rq,
3986 unsigned int hctx_idx, unsigned int numa_node)
Christoph Hellwig7ad18af2015-01-13 17:20:04 +01003987{
3988 struct work_struct *work = blk_mq_rq_to_pdu(rq);
3989
3990 INIT_WORK(work, rbd_queue_workfn);
3991 return 0;
3992}
3993
Eric Biggersf363b082017-03-30 13:39:16 -07003994static const struct blk_mq_ops rbd_mq_ops = {
Christoph Hellwig7ad18af2015-01-13 17:20:04 +01003995 .queue_rq = rbd_queue_rq,
Christoph Hellwig7ad18af2015-01-13 17:20:04 +01003996 .init_request = rbd_init_request,
3997};
3998
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003999static int rbd_init_disk(struct rbd_device *rbd_dev)
4000{
4001 struct gendisk *disk;
4002 struct request_queue *q;
Ilya Dryomov420efbd2018-04-16 09:32:18 +02004003 unsigned int objset_bytes =
4004 rbd_dev->layout.object_size * rbd_dev->layout.stripe_count;
Christoph Hellwig7ad18af2015-01-13 17:20:04 +01004005 int err;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004006
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004007 /* create gendisk info */
Ilya Dryomov7e513d42013-12-16 19:26:32 +02004008 disk = alloc_disk(single_major ?
4009 (1 << RBD_SINGLE_MAJOR_PART_SHIFT) :
4010 RBD_MINORS_PER_MAJOR);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004011 if (!disk)
Alex Elder1fcdb8a2012-08-29 17:11:06 -05004012 return -ENOMEM;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004013
Alex Elderf0f8cef2012-01-29 13:57:44 -06004014 snprintf(disk->disk_name, sizeof(disk->disk_name), RBD_DRV_NAME "%d",
Alex Elderde71a292012-07-03 16:01:19 -05004015 rbd_dev->dev_id);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004016 disk->major = rbd_dev->major;
Ilya Dryomovdd82fff2013-12-13 15:28:57 +02004017 disk->first_minor = rbd_dev->minor;
Ilya Dryomov7e513d42013-12-16 19:26:32 +02004018 if (single_major)
4019 disk->flags |= GENHD_FL_EXT_DEVT;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004020 disk->fops = &rbd_bd_ops;
4021 disk->private_data = rbd_dev;
4022
Christoph Hellwig7ad18af2015-01-13 17:20:04 +01004023 memset(&rbd_dev->tag_set, 0, sizeof(rbd_dev->tag_set));
4024 rbd_dev->tag_set.ops = &rbd_mq_ops;
Ilya Dryomovb5584182015-06-23 16:21:19 +03004025 rbd_dev->tag_set.queue_depth = rbd_dev->opts->queue_depth;
Christoph Hellwig7ad18af2015-01-13 17:20:04 +01004026 rbd_dev->tag_set.numa_node = NUMA_NO_NODE;
Ilya Dryomovb5584182015-06-23 16:21:19 +03004027 rbd_dev->tag_set.flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_SG_MERGE;
Christoph Hellwig7ad18af2015-01-13 17:20:04 +01004028 rbd_dev->tag_set.nr_hw_queues = 1;
4029 rbd_dev->tag_set.cmd_size = sizeof(struct work_struct);
4030
4031 err = blk_mq_alloc_tag_set(&rbd_dev->tag_set);
4032 if (err)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004033 goto out_disk;
Josh Durgin029bcbd2011-07-22 11:35:23 -07004034
Christoph Hellwig7ad18af2015-01-13 17:20:04 +01004035 q = blk_mq_init_queue(&rbd_dev->tag_set);
4036 if (IS_ERR(q)) {
4037 err = PTR_ERR(q);
4038 goto out_tag_set;
4039 }
4040
Bart Van Assche8b904b52018-03-07 17:10:10 -08004041 blk_queue_flag_set(QUEUE_FLAG_NONROT, q);
Ilya Dryomovd8a2c892015-03-24 16:15:17 +03004042 /* QUEUE_FLAG_ADD_RANDOM is off by default for blk-mq */
Alex Elder593a9e72012-02-07 12:03:37 -06004043
Ilya Dryomov420efbd2018-04-16 09:32:18 +02004044 blk_queue_max_hw_sectors(q, objset_bytes >> SECTOR_SHIFT);
Ilya Dryomov0d9fde42015-10-07 16:09:35 +02004045 q->limits.max_sectors = queue_max_hw_sectors(q);
Ilya Dryomov21acdf42017-12-21 15:35:11 +01004046 blk_queue_max_segments(q, USHRT_MAX);
Ilya Dryomov24f1df62018-01-12 17:22:10 +01004047 blk_queue_max_segment_size(q, UINT_MAX);
Ilya Dryomov420efbd2018-04-16 09:32:18 +02004048 blk_queue_io_min(q, objset_bytes);
4049 blk_queue_io_opt(q, objset_bytes);
Josh Durgin029bcbd2011-07-22 11:35:23 -07004050
Ilya Dryomovd9360542018-03-23 06:14:47 +01004051 if (rbd_dev->opts->trim) {
4052 blk_queue_flag_set(QUEUE_FLAG_DISCARD, q);
4053 q->limits.discard_granularity = objset_bytes;
4054 blk_queue_max_discard_sectors(q, objset_bytes >> SECTOR_SHIFT);
4055 blk_queue_max_write_zeroes_sectors(q, objset_bytes >> SECTOR_SHIFT);
4056 }
Guangliang Zhao90e98c52014-04-01 22:22:16 +08004057
Ronny Hegewaldbae818e2015-10-15 18:50:46 +00004058 if (!ceph_test_opt(rbd_dev->rbd_client->client, NOCRC))
Jan Karadc3b17c2017-02-02 15:56:50 +01004059 q->backing_dev_info->capabilities |= BDI_CAP_STABLE_WRITES;
Ronny Hegewaldbae818e2015-10-15 18:50:46 +00004060
Ilya Dryomov5769ed02017-04-13 12:17:38 +02004061 /*
4062 * disk_release() expects a queue ref from add_disk() and will
4063 * put it. Hold an extra ref until add_disk() is called.
4064 */
4065 WARN_ON(!blk_get_queue(q));
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004066 disk->queue = q;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004067 q->queuedata = rbd_dev;
4068
4069 rbd_dev->disk = disk;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004070
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004071 return 0;
Christoph Hellwig7ad18af2015-01-13 17:20:04 +01004072out_tag_set:
4073 blk_mq_free_tag_set(&rbd_dev->tag_set);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004074out_disk:
4075 put_disk(disk);
Christoph Hellwig7ad18af2015-01-13 17:20:04 +01004076 return err;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004077}
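
/*
 * Illustrative note: with the default 4 MiB objects (order 22) and
 * stripe_count 1, objset_bytes above works out to 4 MiB, so the queue
 * advertises 4 MiB io_min/io_opt and, when trim is enabled, a 4 MiB
 * discard granularity.
 */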

/*
  sysfs
*/

static struct rbd_device *dev_to_rbd_dev(struct device *dev)
{
	return container_of(dev, struct rbd_device, dev);
}

static ssize_t rbd_size_show(struct device *dev,
			     struct device_attribute *attr, char *buf)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);

	return sprintf(buf, "%llu\n",
		(unsigned long long)rbd_dev->mapping.size);
}

/*
 * Note this shows the features for whatever's mapped, which is not
 * necessarily the base image.
 */
static ssize_t rbd_features_show(struct device *dev,
				 struct device_attribute *attr, char *buf)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);

	return sprintf(buf, "0x%016llx\n",
			(unsigned long long)rbd_dev->mapping.features);
}

static ssize_t rbd_major_show(struct device *dev,
			      struct device_attribute *attr, char *buf)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);

	if (rbd_dev->major)
		return sprintf(buf, "%d\n", rbd_dev->major);

	return sprintf(buf, "(none)\n");
}

static ssize_t rbd_minor_show(struct device *dev,
			      struct device_attribute *attr, char *buf)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);

	return sprintf(buf, "%d\n", rbd_dev->minor);
}

static ssize_t rbd_client_addr_show(struct device *dev,
				    struct device_attribute *attr, char *buf)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
	struct ceph_entity_addr *client_addr =
	    ceph_client_addr(rbd_dev->rbd_client->client);

	return sprintf(buf, "%pISpc/%u\n", &client_addr->in_addr,
		       le32_to_cpu(client_addr->nonce));
}

static ssize_t rbd_client_id_show(struct device *dev,
				  struct device_attribute *attr, char *buf)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);

	return sprintf(buf, "client%lld\n",
		       ceph_client_gid(rbd_dev->rbd_client->client));
}

static ssize_t rbd_cluster_fsid_show(struct device *dev,
				     struct device_attribute *attr, char *buf)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);

	return sprintf(buf, "%pU\n", &rbd_dev->rbd_client->client->fsid);
}

static ssize_t rbd_config_info_show(struct device *dev,
				    struct device_attribute *attr, char *buf)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);

	return sprintf(buf, "%s\n", rbd_dev->config_info);
}

static ssize_t rbd_pool_show(struct device *dev,
			     struct device_attribute *attr, char *buf)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);

	return sprintf(buf, "%s\n", rbd_dev->spec->pool_name);
}

static ssize_t rbd_pool_id_show(struct device *dev,
				struct device_attribute *attr, char *buf)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);

	return sprintf(buf, "%llu\n",
			(unsigned long long) rbd_dev->spec->pool_id);
}

static ssize_t rbd_pool_ns_show(struct device *dev,
				struct device_attribute *attr, char *buf)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);

	return sprintf(buf, "%s\n", rbd_dev->spec->pool_ns ?: "");
}

static ssize_t rbd_name_show(struct device *dev,
			     struct device_attribute *attr, char *buf)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);

	if (rbd_dev->spec->image_name)
		return sprintf(buf, "%s\n", rbd_dev->spec->image_name);

	return sprintf(buf, "(unknown)\n");
}

static ssize_t rbd_image_id_show(struct device *dev,
				 struct device_attribute *attr, char *buf)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);

	return sprintf(buf, "%s\n", rbd_dev->spec->image_id);
}

/*
 * Shows the name of the currently-mapped snapshot (or
 * RBD_SNAP_HEAD_NAME for the base image).
 */
static ssize_t rbd_snap_show(struct device *dev,
			     struct device_attribute *attr,
			     char *buf)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);

	return sprintf(buf, "%s\n", rbd_dev->spec->snap_name);
}

static ssize_t rbd_snap_id_show(struct device *dev,
				struct device_attribute *attr, char *buf)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);

	return sprintf(buf, "%llu\n", rbd_dev->spec->snap_id);
}

/*
 * For a v2 image, shows the chain of parent images, separated by empty
 * lines.  For v1 images or if there is no parent, shows "(no parent
 * image)".
 */
static ssize_t rbd_parent_show(struct device *dev,
			       struct device_attribute *attr,
			       char *buf)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
	ssize_t count = 0;

	if (!rbd_dev->parent)
		return sprintf(buf, "(no parent image)\n");

	for ( ; rbd_dev->parent; rbd_dev = rbd_dev->parent) {
		struct rbd_spec *spec = rbd_dev->parent_spec;

		count += sprintf(&buf[count], "%s"
			    "pool_id %llu\npool_name %s\n"
			    "pool_ns %s\n"
			    "image_id %s\nimage_name %s\n"
			    "snap_id %llu\nsnap_name %s\n"
			    "overlap %llu\n",
			    !count ? "" : "\n", /* first? */
			    spec->pool_id, spec->pool_name,
			    spec->pool_ns ?: "",
			    spec->image_id, spec->image_name ?: "(unknown)",
			    spec->snap_id, spec->snap_name,
			    rbd_dev->parent_overlap);
	}

	return count;
}
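
/*
 * Illustrative example (all values hypothetical): for a clone one
 * level deep, reading the "parent" attribute yields a record like
 *
 *   pool_id 2
 *   pool_name rbd
 *   pool_ns
 *   image_id 86696876b2d2
 *   image_name parent-image
 *   snap_id 4
 *   snap_name base-snap
 *   overlap 1073741824
 *
 * with one such record per ancestor, separated by empty lines.
 */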

static ssize_t rbd_image_refresh(struct device *dev,
				 struct device_attribute *attr,
				 const char *buf,
				 size_t size)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
	int ret;

	ret = rbd_dev_refresh(rbd_dev);
	if (ret)
		return ret;

	return size;
}

static DEVICE_ATTR(size, 0444, rbd_size_show, NULL);
static DEVICE_ATTR(features, 0444, rbd_features_show, NULL);
static DEVICE_ATTR(major, 0444, rbd_major_show, NULL);
static DEVICE_ATTR(minor, 0444, rbd_minor_show, NULL);
static DEVICE_ATTR(client_addr, 0444, rbd_client_addr_show, NULL);
static DEVICE_ATTR(client_id, 0444, rbd_client_id_show, NULL);
static DEVICE_ATTR(cluster_fsid, 0444, rbd_cluster_fsid_show, NULL);
static DEVICE_ATTR(config_info, 0400, rbd_config_info_show, NULL);
static DEVICE_ATTR(pool, 0444, rbd_pool_show, NULL);
static DEVICE_ATTR(pool_id, 0444, rbd_pool_id_show, NULL);
static DEVICE_ATTR(pool_ns, 0444, rbd_pool_ns_show, NULL);
static DEVICE_ATTR(name, 0444, rbd_name_show, NULL);
static DEVICE_ATTR(image_id, 0444, rbd_image_id_show, NULL);
static DEVICE_ATTR(refresh, 0200, NULL, rbd_image_refresh);
static DEVICE_ATTR(current_snap, 0444, rbd_snap_show, NULL);
static DEVICE_ATTR(snap_id, 0444, rbd_snap_id_show, NULL);
static DEVICE_ATTR(parent, 0444, rbd_parent_show, NULL);

static struct attribute *rbd_attrs[] = {
	&dev_attr_size.attr,
	&dev_attr_features.attr,
	&dev_attr_major.attr,
	&dev_attr_minor.attr,
	&dev_attr_client_addr.attr,
	&dev_attr_client_id.attr,
	&dev_attr_cluster_fsid.attr,
	&dev_attr_config_info.attr,
	&dev_attr_pool.attr,
	&dev_attr_pool_id.attr,
	&dev_attr_pool_ns.attr,
	&dev_attr_name.attr,
	&dev_attr_image_id.attr,
	&dev_attr_current_snap.attr,
	&dev_attr_snap_id.attr,
	&dev_attr_parent.attr,
	&dev_attr_refresh.attr,
	NULL
};

static struct attribute_group rbd_attr_group = {
	.attrs = rbd_attrs,
};

static const struct attribute_group *rbd_attr_groups[] = {
	&rbd_attr_group,
	NULL
};
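
/*
 * Illustrative usage (device id and values are hypothetical): once an
 * image is mapped, the attributes above appear under
 * /sys/bus/rbd/devices/<id>/, e.g.
 *
 *   $ cat /sys/bus/rbd/devices/0/pool
 *   rbd
 *   $ cat /sys/bus/rbd/devices/0/current_snap
 *   -
 *   $ echo 1 > /sys/bus/rbd/devices/0/refresh	# re-read the header
 */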

static void rbd_dev_release(struct device *dev);

static const struct device_type rbd_device_type = {
	.name		= "rbd",
	.groups		= rbd_attr_groups,
	.release	= rbd_dev_release,
};

static struct rbd_spec *rbd_spec_get(struct rbd_spec *spec)
{
	kref_get(&spec->kref);

	return spec;
}

static void rbd_spec_free(struct kref *kref);
static void rbd_spec_put(struct rbd_spec *spec)
{
	if (spec)
		kref_put(&spec->kref, rbd_spec_free);
}

static struct rbd_spec *rbd_spec_alloc(void)
{
	struct rbd_spec *spec;

	spec = kzalloc(sizeof (*spec), GFP_KERNEL);
	if (!spec)
		return NULL;

	spec->pool_id = CEPH_NOPOOL;
	spec->snap_id = CEPH_NOSNAP;
	kref_init(&spec->kref);

	return spec;
}

static void rbd_spec_free(struct kref *kref)
{
	struct rbd_spec *spec = container_of(kref, struct rbd_spec, kref);

	kfree(spec->pool_name);
	kfree(spec->pool_ns);
	kfree(spec->image_id);
	kfree(spec->image_name);
	kfree(spec->snap_name);
	kfree(spec);
}

static void rbd_dev_free(struct rbd_device *rbd_dev)
{
	WARN_ON(rbd_dev->watch_state != RBD_WATCH_STATE_UNREGISTERED);
	WARN_ON(rbd_dev->lock_state != RBD_LOCK_STATE_UNLOCKED);

	ceph_oid_destroy(&rbd_dev->header_oid);
	ceph_oloc_destroy(&rbd_dev->header_oloc);
	kfree(rbd_dev->config_info);

	rbd_put_client(rbd_dev->rbd_client);
	rbd_spec_put(rbd_dev->spec);
	kfree(rbd_dev->opts);
	kfree(rbd_dev);
}

static void rbd_dev_release(struct device *dev)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
	bool need_put = !!rbd_dev->opts;

	if (need_put) {
		destroy_workqueue(rbd_dev->task_wq);
		ida_simple_remove(&rbd_dev_id_ida, rbd_dev->dev_id);
	}

	rbd_dev_free(rbd_dev);

	/*
	 * This is racy, but way better than putting the module ref
	 * outside of the release callback.  The race window is pretty
	 * small, so doing something similar to dm (dm-builtin.c) is
	 * overkill.
	 */
	if (need_put)
		module_put(THIS_MODULE);
}

static struct rbd_device *__rbd_dev_create(struct rbd_client *rbdc,
					   struct rbd_spec *spec)
{
	struct rbd_device *rbd_dev;

	rbd_dev = kzalloc(sizeof(*rbd_dev), GFP_KERNEL);
	if (!rbd_dev)
		return NULL;

	spin_lock_init(&rbd_dev->lock);
	INIT_LIST_HEAD(&rbd_dev->node);
	init_rwsem(&rbd_dev->header_rwsem);

	rbd_dev->header.data_pool_id = CEPH_NOPOOL;
	ceph_oid_init(&rbd_dev->header_oid);
	rbd_dev->header_oloc.pool = spec->pool_id;
	if (spec->pool_ns) {
		WARN_ON(!*spec->pool_ns);
		rbd_dev->header_oloc.pool_ns =
		    ceph_find_or_create_string(spec->pool_ns,
					       strlen(spec->pool_ns));
	}

	mutex_init(&rbd_dev->watch_mutex);
	rbd_dev->watch_state = RBD_WATCH_STATE_UNREGISTERED;
	INIT_DELAYED_WORK(&rbd_dev->watch_dwork, rbd_reregister_watch);

	init_rwsem(&rbd_dev->lock_rwsem);
	rbd_dev->lock_state = RBD_LOCK_STATE_UNLOCKED;
	INIT_WORK(&rbd_dev->acquired_lock_work, rbd_notify_acquired_lock);
	INIT_WORK(&rbd_dev->released_lock_work, rbd_notify_released_lock);
	INIT_DELAYED_WORK(&rbd_dev->lock_dwork, rbd_acquire_lock);
	INIT_WORK(&rbd_dev->unlock_work, rbd_release_lock_work);
	init_waitqueue_head(&rbd_dev->lock_waitq);

	rbd_dev->dev.bus = &rbd_bus_type;
	rbd_dev->dev.type = &rbd_device_type;
	rbd_dev->dev.parent = &rbd_root_dev;
	device_initialize(&rbd_dev->dev);

	rbd_dev->rbd_client = rbdc;
	rbd_dev->spec = spec;

	return rbd_dev;
}

/*
 * Create a mapping rbd_dev.
 */
static struct rbd_device *rbd_dev_create(struct rbd_client *rbdc,
					 struct rbd_spec *spec,
					 struct rbd_options *opts)
{
	struct rbd_device *rbd_dev;

	rbd_dev = __rbd_dev_create(rbdc, spec);
	if (!rbd_dev)
		return NULL;

	rbd_dev->opts = opts;

	/* get an id and fill in device name */
	rbd_dev->dev_id = ida_simple_get(&rbd_dev_id_ida, 0,
					 minor_to_rbd_dev_id(1 << MINORBITS),
					 GFP_KERNEL);
	if (rbd_dev->dev_id < 0)
		goto fail_rbd_dev;

	sprintf(rbd_dev->name, RBD_DRV_NAME "%d", rbd_dev->dev_id);
	rbd_dev->task_wq = alloc_ordered_workqueue("%s-tasks", WQ_MEM_RECLAIM,
						   rbd_dev->name);
	if (!rbd_dev->task_wq)
		goto fail_dev_id;

	/* we have a ref from do_rbd_add() */
	__module_get(THIS_MODULE);

	dout("%s rbd_dev %p dev_id %d\n", __func__, rbd_dev, rbd_dev->dev_id);
	return rbd_dev;

fail_dev_id:
	ida_simple_remove(&rbd_dev_id_ida, rbd_dev->dev_id);
fail_rbd_dev:
	rbd_dev_free(rbd_dev);
	return NULL;
}

static void rbd_dev_destroy(struct rbd_device *rbd_dev)
{
	if (rbd_dev)
		put_device(&rbd_dev->dev);
}

/*
 * Get the size and object order for an image snapshot, or if
 * snap_id is CEPH_NOSNAP, gets this information for the base
 * image.
 */
static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id,
				 u8 *order, u64 *snap_size)
{
	__le64 snapid = cpu_to_le64(snap_id);
	int ret;
	struct {
		u8 order;
		__le64 size;
	} __attribute__ ((packed)) size_buf = { 0 };

	ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
				  &rbd_dev->header_oloc, "get_size",
				  &snapid, sizeof(snapid),
				  &size_buf, sizeof(size_buf));
	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
	if (ret < 0)
		return ret;
	if (ret < sizeof (size_buf))
		return -ERANGE;

	if (order) {
		*order = size_buf.order;
		dout(" order %u", (unsigned int)*order);
	}
	*snap_size = le64_to_cpu(size_buf.size);

	dout(" snap_id 0x%016llx snap_size = %llu\n",
		(unsigned long long)snap_id,
		(unsigned long long)*snap_size);

	return 0;
}

static int rbd_dev_v2_image_size(struct rbd_device *rbd_dev)
{
	return _rbd_dev_v2_snap_size(rbd_dev, CEPH_NOSNAP,
					&rbd_dev->header.obj_order,
					&rbd_dev->header.image_size);
}

static int rbd_dev_v2_object_prefix(struct rbd_device *rbd_dev)
{
	void *reply_buf;
	int ret;
	void *p;

	reply_buf = kzalloc(RBD_OBJ_PREFIX_LEN_MAX, GFP_KERNEL);
	if (!reply_buf)
		return -ENOMEM;

	ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
				  &rbd_dev->header_oloc, "get_object_prefix",
				  NULL, 0, reply_buf, RBD_OBJ_PREFIX_LEN_MAX);
	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
	if (ret < 0)
		goto out;

	p = reply_buf;
	rbd_dev->header.object_prefix = ceph_extract_encoded_string(&p,
						p + ret, NULL, GFP_NOIO);
	ret = 0;

	if (IS_ERR(rbd_dev->header.object_prefix)) {
		ret = PTR_ERR(rbd_dev->header.object_prefix);
		rbd_dev->header.object_prefix = NULL;
	} else {
		dout(" object_prefix = %s\n", rbd_dev->header.object_prefix);
	}
out:
	kfree(reply_buf);

	return ret;
}

static int _rbd_dev_v2_snap_features(struct rbd_device *rbd_dev, u64 snap_id,
				     u64 *snap_features)
{
	__le64 snapid = cpu_to_le64(snap_id);
	struct {
		__le64 features;
		__le64 incompat;
	} __attribute__ ((packed)) features_buf = { 0 };
	u64 unsup;
	int ret;

	ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
				  &rbd_dev->header_oloc, "get_features",
				  &snapid, sizeof(snapid),
				  &features_buf, sizeof(features_buf));
	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
	if (ret < 0)
		return ret;
	if (ret < sizeof (features_buf))
		return -ERANGE;

	unsup = le64_to_cpu(features_buf.incompat) & ~RBD_FEATURES_SUPPORTED;
	if (unsup) {
		rbd_warn(rbd_dev, "image uses unsupported features: 0x%llx",
			 unsup);
		return -ENXIO;
	}

	*snap_features = le64_to_cpu(features_buf.features);

	dout(" snap_id 0x%016llx features = 0x%016llx incompat = 0x%016llx\n",
		(unsigned long long)snap_id,
		(unsigned long long)*snap_features,
		(unsigned long long)le64_to_cpu(features_buf.incompat));

	return 0;
}

static int rbd_dev_v2_features(struct rbd_device *rbd_dev)
{
	return _rbd_dev_v2_snap_features(rbd_dev, CEPH_NOSNAP,
						&rbd_dev->header.features);
}

struct parent_image_info {
	u64		pool_id;
	const char	*pool_ns;
	const char	*image_id;
	u64		snap_id;

	bool		has_overlap;
	u64		overlap;
};

/*
 * The caller is responsible for @pii.
 */
static int decode_parent_image_spec(void **p, void *end,
				    struct parent_image_info *pii)
{
	u8 struct_v;
	u32 struct_len;
	int ret;

	ret = ceph_start_decoding(p, end, 1, "ParentImageSpec",
				  &struct_v, &struct_len);
	if (ret)
		return ret;

	ceph_decode_64_safe(p, end, pii->pool_id, e_inval);
	pii->pool_ns = ceph_extract_encoded_string(p, end, NULL, GFP_KERNEL);
	if (IS_ERR(pii->pool_ns)) {
		ret = PTR_ERR(pii->pool_ns);
		pii->pool_ns = NULL;
		return ret;
	}
	pii->image_id = ceph_extract_encoded_string(p, end, NULL, GFP_KERNEL);
	if (IS_ERR(pii->image_id)) {
		ret = PTR_ERR(pii->image_id);
		pii->image_id = NULL;
		return ret;
	}
	ceph_decode_64_safe(p, end, pii->snap_id, e_inval);
	return 0;

e_inval:
	return -EINVAL;
}

static int __get_parent_info(struct rbd_device *rbd_dev,
			     struct page *req_page,
			     struct page *reply_page,
			     struct parent_image_info *pii)
{
	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
	size_t reply_len = PAGE_SIZE;
	void *p, *end;
	int ret;

	ret = ceph_osdc_call(osdc, &rbd_dev->header_oid, &rbd_dev->header_oloc,
			     "rbd", "parent_get", CEPH_OSD_FLAG_READ,
			     req_page, sizeof(u64), reply_page, &reply_len);
	if (ret)
		return ret == -EOPNOTSUPP ? 1 : ret;

	p = page_address(reply_page);
	end = p + reply_len;
	ret = decode_parent_image_spec(&p, end, pii);
	if (ret)
		return ret;

	ret = ceph_osdc_call(osdc, &rbd_dev->header_oid, &rbd_dev->header_oloc,
			     "rbd", "parent_overlap_get", CEPH_OSD_FLAG_READ,
			     req_page, sizeof(u64), reply_page, &reply_len);
	if (ret)
		return ret;

	p = page_address(reply_page);
	end = p + reply_len;
	ceph_decode_8_safe(&p, end, pii->has_overlap, e_inval);
	if (pii->has_overlap)
		ceph_decode_64_safe(&p, end, pii->overlap, e_inval);

	return 0;

e_inval:
	return -EINVAL;
}

/*
 * The caller is responsible for @pii.
 */
static int __get_parent_info_legacy(struct rbd_device *rbd_dev,
				    struct page *req_page,
				    struct page *reply_page,
				    struct parent_image_info *pii)
{
	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
	size_t reply_len = PAGE_SIZE;
	void *p, *end;
	int ret;

	ret = ceph_osdc_call(osdc, &rbd_dev->header_oid, &rbd_dev->header_oloc,
			     "rbd", "get_parent", CEPH_OSD_FLAG_READ,
			     req_page, sizeof(u64), reply_page, &reply_len);
	if (ret)
		return ret;

	p = page_address(reply_page);
	end = p + reply_len;
	ceph_decode_64_safe(&p, end, pii->pool_id, e_inval);
	pii->image_id = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL);
	if (IS_ERR(pii->image_id)) {
		ret = PTR_ERR(pii->image_id);
		pii->image_id = NULL;
		return ret;
	}
	ceph_decode_64_safe(&p, end, pii->snap_id, e_inval);
	pii->has_overlap = true;
	ceph_decode_64_safe(&p, end, pii->overlap, e_inval);

	return 0;

e_inval:
	return -EINVAL;
}
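
/*
 * Fetch the parent image spec and overlap for rbd_dev's snapshot.
 * The "parent_get"/"parent_overlap_get" class methods are tried first;
 * if the OSD doesn't support them (__get_parent_info() returns 1 on
 * -EOPNOTSUPP), fall back to the legacy "get_parent" method.
 */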

static int get_parent_info(struct rbd_device *rbd_dev,
			   struct parent_image_info *pii)
{
	struct page *req_page, *reply_page;
	void *p;
	int ret;

	req_page = alloc_page(GFP_KERNEL);
	if (!req_page)
		return -ENOMEM;

	reply_page = alloc_page(GFP_KERNEL);
	if (!reply_page) {
		__free_page(req_page);
		return -ENOMEM;
	}

	p = page_address(req_page);
	ceph_encode_64(&p, rbd_dev->spec->snap_id);
	ret = __get_parent_info(rbd_dev, req_page, reply_page, pii);
	if (ret > 0)
		ret = __get_parent_info_legacy(rbd_dev, req_page, reply_page,
					       pii);

	__free_page(req_page);
	__free_page(reply_page);
	return ret;
}

static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev)
{
	struct rbd_spec *parent_spec;
	struct parent_image_info pii = { 0 };
	int ret;

	parent_spec = rbd_spec_alloc();
	if (!parent_spec)
		return -ENOMEM;

	ret = get_parent_info(rbd_dev, &pii);
	if (ret)
		goto out_err;

	dout("%s pool_id %llu pool_ns %s image_id %s snap_id %llu has_overlap %d overlap %llu\n",
	     __func__, pii.pool_id, pii.pool_ns, pii.image_id, pii.snap_id,
	     pii.has_overlap, pii.overlap);

	if (pii.pool_id == CEPH_NOPOOL || !pii.has_overlap) {
		/*
		 * Either the parent never existed, or we have
		 * record of it but the image got flattened so it no
		 * longer has a parent.  When the parent of a
		 * layered image disappears we immediately set the
		 * overlap to 0.  The effect of this is that all new
		 * requests will be treated as if the image had no
		 * parent.
		 *
		 * If !pii.has_overlap, the parent image spec is not
		 * applicable.  It's there to avoid duplication in each
		 * snapshot record.
		 */
		if (rbd_dev->parent_overlap) {
			rbd_dev->parent_overlap = 0;
			rbd_dev_parent_put(rbd_dev);
			pr_info("%s: clone image has been flattened\n",
				rbd_dev->disk->disk_name);
		}

		goto out;	/* No parent?  No problem. */
	}

	/* The ceph file layout needs to fit pool id in 32 bits */

	ret = -EIO;
	if (pii.pool_id > (u64)U32_MAX) {
		rbd_warn(NULL, "parent pool id too large (%llu > %u)",
			 (unsigned long long)pii.pool_id, U32_MAX);
		goto out_err;
	}

	/*
	 * The parent won't change (except when the clone is
	 * flattened, which was already handled above).  So we only
	 * need to record the parent spec if we have not already
	 * done so.
	 */
	if (!rbd_dev->parent_spec) {
		parent_spec->pool_id = pii.pool_id;
		if (pii.pool_ns && *pii.pool_ns) {
			parent_spec->pool_ns = pii.pool_ns;
			pii.pool_ns = NULL;
		}
		parent_spec->image_id = pii.image_id;
		pii.image_id = NULL;
		parent_spec->snap_id = pii.snap_id;

		rbd_dev->parent_spec = parent_spec;
		parent_spec = NULL;	/* rbd_dev now owns this */
	}

	/*
	 * We always update the parent overlap.  If it's zero we issue
	 * a warning, as we will proceed as if there was no parent.
	 */
	if (!pii.overlap) {
		if (parent_spec) {
			/* refresh, careful to warn just once */
			if (rbd_dev->parent_overlap)
				rbd_warn(rbd_dev,
				    "clone now standalone (overlap became 0)");
		} else {
			/* initial probe */
			rbd_warn(rbd_dev, "clone is standalone (overlap 0)");
		}
	}
	rbd_dev->parent_overlap = pii.overlap;

out:
	ret = 0;
out_err:
	kfree(pii.pool_ns);
	kfree(pii.image_id);
	rbd_spec_put(parent_spec);
	return ret;
}

static int rbd_dev_v2_striping_info(struct rbd_device *rbd_dev)
{
	struct {
		__le64 stripe_unit;
		__le64 stripe_count;
	} __attribute__ ((packed)) striping_info_buf = { 0 };
	size_t size = sizeof (striping_info_buf);
	void *p;
	int ret;

	ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
				&rbd_dev->header_oloc, "get_stripe_unit_count",
				NULL, 0, &striping_info_buf, size);
	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
	if (ret < 0)
		return ret;
	if (ret < size)
		return -ERANGE;

	p = &striping_info_buf;
	rbd_dev->header.stripe_unit = ceph_decode_64(&p);
	rbd_dev->header.stripe_count = ceph_decode_64(&p);
	return 0;
}

static int rbd_dev_v2_data_pool(struct rbd_device *rbd_dev)
{
	__le64 data_pool_id;
	int ret;

	ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
				  &rbd_dev->header_oloc, "get_data_pool",
				  NULL, 0, &data_pool_id, sizeof(data_pool_id));
	if (ret < 0)
		return ret;
	if (ret < sizeof(data_pool_id))
		return -EBADMSG;

	rbd_dev->header.data_pool_id = le64_to_cpu(data_pool_id);
	WARN_ON(rbd_dev->header.data_pool_id == CEPH_NOPOOL);
	return 0;
}

static char *rbd_dev_image_name(struct rbd_device *rbd_dev)
{
	CEPH_DEFINE_OID_ONSTACK(oid);
	size_t image_id_size;
	char *image_id;
	void *p;
	void *end;
	size_t size;
	void *reply_buf = NULL;
	size_t len = 0;
	char *image_name = NULL;
	int ret;

	rbd_assert(!rbd_dev->spec->image_name);

	len = strlen(rbd_dev->spec->image_id);
	image_id_size = sizeof (__le32) + len;
	image_id = kmalloc(image_id_size, GFP_KERNEL);
	if (!image_id)
		return NULL;

	p = image_id;
	end = image_id + image_id_size;
	ceph_encode_string(&p, end, rbd_dev->spec->image_id, (u32)len);

	size = sizeof (__le32) + RBD_IMAGE_NAME_LEN_MAX;
	reply_buf = kmalloc(size, GFP_KERNEL);
	if (!reply_buf)
		goto out;

	ceph_oid_printf(&oid, "%s", RBD_DIRECTORY);
	ret = rbd_obj_method_sync(rbd_dev, &oid, &rbd_dev->header_oloc,
				  "dir_get_name", image_id, image_id_size,
				  reply_buf, size);
	if (ret < 0)
		goto out;
	p = reply_buf;
	end = reply_buf + ret;

	image_name = ceph_extract_encoded_string(&p, end, &len, GFP_KERNEL);
	if (IS_ERR(image_name))
		image_name = NULL;
	else
		dout("%s: name is %s len is %zd\n", __func__, image_name, len);
out:
	kfree(reply_buf);
	kfree(image_id);

	return image_name;
}

static u64 rbd_v1_snap_id_by_name(struct rbd_device *rbd_dev, const char *name)
{
	struct ceph_snap_context *snapc = rbd_dev->header.snapc;
	const char *snap_name;
	u32 which = 0;

	/* Skip over names until we find the one we are looking for */

	snap_name = rbd_dev->header.snap_names;
	while (which < snapc->num_snaps) {
		if (!strcmp(name, snap_name))
			return snapc->snaps[which];
		snap_name += strlen(snap_name) + 1;
		which++;
	}
	return CEPH_NOSNAP;
}

static u64 rbd_v2_snap_id_by_name(struct rbd_device *rbd_dev, const char *name)
{
	struct ceph_snap_context *snapc = rbd_dev->header.snapc;
	u32 which;
	bool found = false;
	u64 snap_id;

	for (which = 0; !found && which < snapc->num_snaps; which++) {
		const char *snap_name;

		snap_id = snapc->snaps[which];
		snap_name = rbd_dev_v2_snap_name(rbd_dev, snap_id);
		if (IS_ERR(snap_name)) {
			/* ignore no-longer existing snapshots */
			if (PTR_ERR(snap_name) == -ENOENT)
				continue;
			else
				break;
		}
		found = !strcmp(name, snap_name);
		kfree(snap_name);
	}
	return found ? snap_id : CEPH_NOSNAP;
}

/*
 * Assumes name is never RBD_SNAP_HEAD_NAME; returns CEPH_NOSNAP if
 * no snapshot by that name is found, or if an error occurs.
 */
static u64 rbd_snap_id_by_name(struct rbd_device *rbd_dev, const char *name)
{
	if (rbd_dev->image_format == 1)
		return rbd_v1_snap_id_by_name(rbd_dev, name);

	return rbd_v2_snap_id_by_name(rbd_dev, name);
}

/*
 * An image being mapped will have everything but the snap id.
 */
static int rbd_spec_fill_snap_id(struct rbd_device *rbd_dev)
{
	struct rbd_spec *spec = rbd_dev->spec;

	rbd_assert(spec->pool_id != CEPH_NOPOOL && spec->pool_name);
	rbd_assert(spec->image_id && spec->image_name);
	rbd_assert(spec->snap_name);

	if (strcmp(spec->snap_name, RBD_SNAP_HEAD_NAME)) {
		u64 snap_id;

		snap_id = rbd_snap_id_by_name(rbd_dev, spec->snap_name);
		if (snap_id == CEPH_NOSNAP)
			return -ENOENT;

		spec->snap_id = snap_id;
	} else {
		spec->snap_id = CEPH_NOSNAP;
	}

	return 0;
}

/*
 * A parent image will have all ids but none of the names.
 *
 * All names in an rbd spec are dynamically allocated.  It's OK if we
 * can't figure out the name for an image id.
 */
static int rbd_spec_fill_names(struct rbd_device *rbd_dev)
{
	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
	struct rbd_spec *spec = rbd_dev->spec;
	const char *pool_name;
	const char *image_name;
	const char *snap_name;
	int ret;

	rbd_assert(spec->pool_id != CEPH_NOPOOL);
	rbd_assert(spec->image_id);
	rbd_assert(spec->snap_id != CEPH_NOSNAP);

	/* Get the pool name; we have to make our own copy of this */

	pool_name = ceph_pg_pool_name_by_id(osdc->osdmap, spec->pool_id);
	if (!pool_name) {
		rbd_warn(rbd_dev, "no pool with id %llu", spec->pool_id);
		return -EIO;
	}
	pool_name = kstrdup(pool_name, GFP_KERNEL);
	if (!pool_name)
		return -ENOMEM;

	/* Fetch the image name; tolerate failure here */

	image_name = rbd_dev_image_name(rbd_dev);
	if (!image_name)
		rbd_warn(rbd_dev, "unable to get image name");

	/* Fetch the snapshot name */

	snap_name = rbd_snap_name(rbd_dev, spec->snap_id);
	if (IS_ERR(snap_name)) {
		ret = PTR_ERR(snap_name);
		goto out_err;
	}

	spec->pool_name = pool_name;
	spec->image_name = image_name;
	spec->snap_name = snap_name;

	return 0;

out_err:
	kfree(image_name);
	kfree(pool_name);
	return ret;
}

static int rbd_dev_v2_snap_context(struct rbd_device *rbd_dev)
{
	size_t size;
	int ret;
	void *reply_buf;
	void *p;
	void *end;
	u64 seq;
	u32 snap_count;
	struct ceph_snap_context *snapc;
	u32 i;

	/*
	 * We'll need room for the seq value (maximum snapshot id),
	 * snapshot count, and array of that many snapshot ids.
	 * For now we have a fixed upper limit on the number we're
	 * prepared to receive.
	 */
	size = sizeof (__le64) + sizeof (__le32) +
			RBD_MAX_SNAP_COUNT * sizeof (__le64);
	reply_buf = kzalloc(size, GFP_KERNEL);
	if (!reply_buf)
		return -ENOMEM;

	ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
				  &rbd_dev->header_oloc, "get_snapcontext",
				  NULL, 0, reply_buf, size);
	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
	if (ret < 0)
		goto out;

	p = reply_buf;
	end = reply_buf + ret;
	ret = -ERANGE;
	ceph_decode_64_safe(&p, end, seq, out);
	ceph_decode_32_safe(&p, end, snap_count, out);

	/*
	 * Make sure the reported number of snapshot ids wouldn't go
	 * beyond the end of our buffer.  But before checking that,
	 * make sure the computed size of the snapshot context we
	 * allocate is representable in a size_t.
	 */
	if (snap_count > (SIZE_MAX - sizeof (struct ceph_snap_context))
				 / sizeof (u64)) {
		ret = -EINVAL;
		goto out;
	}
	if (!ceph_has_room(&p, end, snap_count * sizeof (__le64)))
		goto out;
	ret = 0;

	snapc = ceph_create_snap_context(snap_count, GFP_KERNEL);
	if (!snapc) {
		ret = -ENOMEM;
		goto out;
	}
	snapc->seq = seq;
	for (i = 0; i < snap_count; i++)
		snapc->snaps[i] = ceph_decode_64(&p);

	ceph_put_snap_context(rbd_dev->header.snapc);
	rbd_dev->header.snapc = snapc;

	dout("  snap context seq = %llu, snap_count = %u\n",
		(unsigned long long)seq, (unsigned int)snap_count);
out:
	kfree(reply_buf);

	return ret;
}
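
/*
 * For reference, the get_snapcontext reply decoded above is laid out as
 *
 *   __le64 seq                   highest snapshot id ever issued
 *   __le32 snap_count            number of snapshot ids that follow
 *   __le64 snaps[snap_count]     the snapshot ids themselves
 *
 * hence the fixed sizing of reply_buf.
 */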

static const char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev,
					u64 snap_id)
{
	size_t size;
	void *reply_buf;
	__le64 snapid;
	int ret;
	void *p;
	void *end;
	char *snap_name;

	size = sizeof (__le32) + RBD_MAX_SNAP_NAME_LEN;
	reply_buf = kmalloc(size, GFP_KERNEL);
	if (!reply_buf)
		return ERR_PTR(-ENOMEM);

	snapid = cpu_to_le64(snap_id);
	ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
				  &rbd_dev->header_oloc, "get_snapshot_name",
				  &snapid, sizeof(snapid), reply_buf, size);
	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
	if (ret < 0) {
		snap_name = ERR_PTR(ret);
		goto out;
	}

	p = reply_buf;
	end = reply_buf + ret;
	snap_name = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL);
	if (IS_ERR(snap_name))
		goto out;

	dout(" snap_id 0x%016llx snap_name = %s\n",
		(unsigned long long)snap_id, snap_name);
out:
	kfree(reply_buf);

	return snap_name;
}

static int rbd_dev_v2_header_info(struct rbd_device *rbd_dev)
{
	bool first_time = rbd_dev->header.object_prefix == NULL;
	int ret;

	ret = rbd_dev_v2_image_size(rbd_dev);
	if (ret)
		return ret;

	if (first_time) {
		ret = rbd_dev_v2_header_onetime(rbd_dev);
		if (ret)
			return ret;
	}

	ret = rbd_dev_v2_snap_context(rbd_dev);
	if (ret && first_time) {
		kfree(rbd_dev->header.object_prefix);
		rbd_dev->header.object_prefix = NULL;
	}

	return ret;
}

static int rbd_dev_header_info(struct rbd_device *rbd_dev)
{
	rbd_assert(rbd_image_format_valid(rbd_dev->image_format));

	if (rbd_dev->image_format == 1)
		return rbd_dev_v1_header_info(rbd_dev);

	return rbd_dev_v2_header_info(rbd_dev);
}

/*
 * Skips over white space at *buf, and updates *buf to point to the
 * first found non-space character (if any).  Returns the length of
 * the token (string of non-white space characters) found.  Note
 * that *buf must be terminated with '\0'.
 */
static inline size_t next_token(const char **buf)
{
	/*
	 * These are the characters that produce nonzero for
	 * isspace() in the "C" and "POSIX" locales.
	 */
	const char *spaces = " \f\n\r\t\v";

	*buf += strspn(*buf, spaces);	/* Find start of token */

	return strcspn(*buf, spaces);	/* Return token length */
}
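
/*
 * Illustrative example: with *buf pointing at "  rbd foo", next_token()
 * advances *buf to "rbd foo" and returns 3, the length of "rbd".
 */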

/*
 * Finds the next token in *buf, dynamically allocates a buffer big
 * enough to hold a copy of it, and copies the token into the new
 * buffer.  The copy is guaranteed to be terminated with '\0'.  Note
 * that a duplicate buffer is created even for a zero-length token.
 *
 * Returns a pointer to the newly-allocated duplicate, or a null
 * pointer if memory for the duplicate was not available.  If
 * the lenp argument is a non-null pointer, the length of the token
 * (not including the '\0') is returned in *lenp.
 *
 * If successful, the *buf pointer will be updated to point beyond
 * the end of the found token.
 *
 * Note: uses GFP_KERNEL for allocation.
 */
static inline char *dup_token(const char **buf, size_t *lenp)
{
	char *dup;
	size_t len;

	len = next_token(buf);
	dup = kmemdup(*buf, len + 1, GFP_KERNEL);
	if (!dup)
		return NULL;
	*(dup + len) = '\0';
	*buf += len;

	if (lenp)
		*lenp = len;

	return dup;
}

/*
 * Parse the options provided for an "rbd add" (i.e., rbd image
 * mapping) request.  These arrive via a write to /sys/bus/rbd/add,
 * and the data written is passed here via a NUL-terminated buffer.
 * Returns 0 if successful or an error code otherwise.
 *
 * The information extracted from these options is recorded in
 * the other parameters which return dynamically-allocated
 * structures:
 *  ceph_opts
 *      The address of a pointer that will refer to a ceph options
 *      structure.  Caller must release the returned pointer using
 *      ceph_destroy_options() when it is no longer needed.
 *  rbd_opts
 *      Address of an rbd options pointer.  Fully initialized by
 *      this function; caller must release with kfree().
 *  spec
 *      Address of an rbd image specification pointer.  Fully
 *      initialized by this function based on parsed options.
 *      Caller must release with rbd_spec_put().
 *
 * The options passed take this form:
 *  <mon_addrs> <options> <pool_name> <image_name> [<snap_id>]
 * where:
 *  <mon_addrs>
 *      A comma-separated list of one or more monitor addresses.
 *      A monitor address is an ip address, optionally followed
 *      by a port number (separated by a colon).
 *        I.e.:  ip1[:port1][,ip2[:port2]...]
 *  <options>
 *      A comma-separated list of ceph and/or rbd options.
 *  <pool_name>
 *      The name of the rados pool containing the rbd image.
 *  <image_name>
 *      The name of the image in that pool to map.
 *  <snap_id>
 *      An optional snapshot id.  If provided, the mapping will
 *      present data from the image at the time that snapshot was
 *      created.  The image head is used if no snapshot id is
 *      provided.  Snapshot mappings are always read-only.
Alex Eldera725f65e2012-02-02 08:13:30 -06005347 */
static int rbd_add_parse_args(const char *buf,
			      struct ceph_options **ceph_opts,
			      struct rbd_options **opts,
			      struct rbd_spec **rbd_spec)
{
	size_t len;
	char *options;
	const char *mon_addrs;
	char *snap_name;
	size_t mon_addrs_size;
	struct parse_rbd_opts_ctx pctx = { 0 };
	struct ceph_options *copts;
	int ret;

	/* The first four tokens are required */

	len = next_token(&buf);
	if (!len) {
		rbd_warn(NULL, "no monitor address(es) provided");
		return -EINVAL;
	}
	mon_addrs = buf;
	mon_addrs_size = len + 1;
	buf += len;

	ret = -EINVAL;
	options = dup_token(&buf, NULL);
	if (!options)
		return -ENOMEM;
	if (!*options) {
		rbd_warn(NULL, "no options provided");
		goto out_err;
	}

	pctx.spec = rbd_spec_alloc();
	if (!pctx.spec)
		goto out_mem;

	pctx.spec->pool_name = dup_token(&buf, NULL);
	if (!pctx.spec->pool_name)
		goto out_mem;
	if (!*pctx.spec->pool_name) {
		rbd_warn(NULL, "no pool name provided");
		goto out_err;
	}

	pctx.spec->image_name = dup_token(&buf, NULL);
	if (!pctx.spec->image_name)
		goto out_mem;
	if (!*pctx.spec->image_name) {
		rbd_warn(NULL, "no image name provided");
		goto out_err;
	}

	/*
	 * Snapshot name is optional; default is to use "-"
	 * (indicating the head/no snapshot).
	 */
	len = next_token(&buf);
	if (!len) {
		buf = RBD_SNAP_HEAD_NAME; /* No snapshot supplied */
		len = sizeof (RBD_SNAP_HEAD_NAME) - 1;
	} else if (len > RBD_MAX_SNAP_NAME_LEN) {
		ret = -ENAMETOOLONG;
		goto out_err;
	}
	snap_name = kmemdup(buf, len + 1, GFP_KERNEL);
	if (!snap_name)
		goto out_mem;
	*(snap_name + len) = '\0';
	pctx.spec->snap_name = snap_name;

	/* Initialize all rbd options to the defaults */

	pctx.opts = kzalloc(sizeof(*pctx.opts), GFP_KERNEL);
	if (!pctx.opts)
		goto out_mem;

	pctx.opts->read_only = RBD_READ_ONLY_DEFAULT;
	pctx.opts->queue_depth = RBD_QUEUE_DEPTH_DEFAULT;
	pctx.opts->lock_timeout = RBD_LOCK_TIMEOUT_DEFAULT;
	pctx.opts->lock_on_read = RBD_LOCK_ON_READ_DEFAULT;
	pctx.opts->exclusive = RBD_EXCLUSIVE_DEFAULT;
	pctx.opts->trim = RBD_TRIM_DEFAULT;

	copts = ceph_parse_options(options, mon_addrs,
				   mon_addrs + mon_addrs_size - 1,
				   parse_rbd_opts_token, &pctx);
	if (IS_ERR(copts)) {
		ret = PTR_ERR(copts);
		goto out_err;
	}
	kfree(options);

	*ceph_opts = copts;
	*opts = pctx.opts;
	*rbd_spec = pctx.spec;

	return 0;
out_mem:
	ret = -ENOMEM;
out_err:
	kfree(pctx.opts);
	rbd_spec_put(pctx.spec);
	kfree(options);

	return ret;
}
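
/*
 * For illustration only (all values made up): a mapping request
 * written to /sys/bus/rbd/add might look like
 *
 *	$ echo "1.2.3.4:6789 name=admin,secret=AQBvbDtd rbd myimage -" \
 *		> /sys/bus/rbd/add
 *
 * which rbd_add_parse_args() would split into the monitor address
 * "1.2.3.4:6789", the option string "name=admin,secret=AQBvbDtd",
 * pool "rbd", image "myimage", and the default head snapshot ("-").
 */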

static void rbd_dev_image_unlock(struct rbd_device *rbd_dev)
{
	down_write(&rbd_dev->lock_rwsem);
	if (__rbd_is_lock_owner(rbd_dev))
		rbd_unlock(rbd_dev);
	up_write(&rbd_dev->lock_rwsem);
}

static int rbd_add_acquire_lock(struct rbd_device *rbd_dev)
{
	int ret;

	if (!(rbd_dev->header.features & RBD_FEATURE_EXCLUSIVE_LOCK)) {
		rbd_warn(rbd_dev, "exclusive-lock feature is not enabled");
		return -EINVAL;
	}

	/* FIXME: "rbd map --exclusive" should be interruptible */
	down_read(&rbd_dev->lock_rwsem);
	ret = rbd_wait_state_locked(rbd_dev, true);
	up_read(&rbd_dev->lock_rwsem);
	if (ret) {
		rbd_warn(rbd_dev, "failed to acquire exclusive lock");
		return -EROFS;
	}

	return 0;
}
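
/*
 * Illustrative only (made-up values): passing the "exclusive" rbd
 * option when mapping, e.g.
 *
 *	$ echo "1.2.3.4:6789 name=admin,exclusive rbd myimage -" \
 *		> /sys/bus/rbd/add
 *
 * makes do_rbd_add() call rbd_add_acquire_lock(), so the map fails
 * with -EINVAL if the image lacks the exclusive-lock feature, or
 * -EROFS if the lock cannot be acquired, rather than proceeding
 * without the lock.
 */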

/*
 * An rbd format 2 image has a unique identifier, distinct from the
 * name given to it by the user.  Internally, that identifier is
 * what's used to specify the names of objects related to the image.
 *
 * A special "rbd id" object is used to map an rbd image name to its
 * id.  If that object doesn't exist, then there is no v2 rbd image
 * with the supplied name.
 *
 * This function will fill in the given rbd_dev's image_id field if
 * the id can be determined, and in that case will return 0.  If any
 * errors occur a negative errno will be returned and the rbd_dev's
 * image_id field will be unchanged (and should be NULL).
 */
static int rbd_dev_image_id(struct rbd_device *rbd_dev)
{
	int ret;
	size_t size;
	CEPH_DEFINE_OID_ONSTACK(oid);
	void *response;
	char *image_id;

	/*
	 * When probing a parent image, the image id is already
	 * known (and the image name likely is not).  There's no
	 * need to fetch the image id again in this case.  We
	 * do still need to set the image format though.
	 */
	if (rbd_dev->spec->image_id) {
		rbd_dev->image_format = *rbd_dev->spec->image_id ? 2 : 1;

		return 0;
	}

	/*
	 * First, see if the format 2 image id file exists, and if
	 * so, get the image's persistent id from it.
	 */
	ret = ceph_oid_aprintf(&oid, GFP_KERNEL, "%s%s", RBD_ID_PREFIX,
			       rbd_dev->spec->image_name);
	if (ret)
		return ret;

	dout("rbd id object name is %s\n", oid.name);

	/* Response will be an encoded string, which includes a length */

	size = sizeof (__le32) + RBD_IMAGE_ID_LEN_MAX;
	response = kzalloc(size, GFP_NOIO);
	if (!response) {
		ret = -ENOMEM;
		goto out;
	}

	/* If it doesn't exist we'll assume it's a format 1 image */

	ret = rbd_obj_method_sync(rbd_dev, &oid, &rbd_dev->header_oloc,
				  "get_id", NULL, 0,
				  response, RBD_IMAGE_ID_LEN_MAX);
	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
	if (ret == -ENOENT) {
		image_id = kstrdup("", GFP_KERNEL);
		ret = image_id ? 0 : -ENOMEM;
		if (!ret)
			rbd_dev->image_format = 1;
	} else if (ret >= 0) {
		void *p = response;

		image_id = ceph_extract_encoded_string(&p, p + ret,
						       NULL, GFP_NOIO);
		ret = PTR_ERR_OR_ZERO(image_id);
		if (!ret)
			rbd_dev->image_format = 2;
	}

	if (!ret) {
		rbd_dev->spec->image_id = image_id;
		dout("image_id is %s\n", image_id);
	}
out:
	kfree(response);
	ceph_oid_destroy(&oid);
	return ret;
}
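
/*
 * Illustrative only (id value made up): assuming the usual
 * RBD_ID_PREFIX of "rbd_id.", probing a format 2 image named
 * "myimage" reads the object "rbd_id.myimage" and calls its
 * "get_id" class method, yielding something like
 *
 *	image_id = "1018e2a1d9c4"
 *
 * A format 1 image has no id object, so the -ENOENT branch above
 * records an empty image_id and sets image_format to 1 instead.
 */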

/*
 * Undo whatever state changes are made by a v1 or v2 header info
 * call.
 */
static void rbd_dev_unprobe(struct rbd_device *rbd_dev)
{
	struct rbd_image_header *header;

	rbd_dev_parent_put(rbd_dev);

	/* Free dynamic fields from the header, then zero it out */

	header = &rbd_dev->header;
	ceph_put_snap_context(header->snapc);
	kfree(header->snap_sizes);
	kfree(header->snap_names);
	kfree(header->object_prefix);
	memset(header, 0, sizeof (*header));
}

static int rbd_dev_v2_header_onetime(struct rbd_device *rbd_dev)
{
	int ret;

	ret = rbd_dev_v2_object_prefix(rbd_dev);
	if (ret)
		goto out_err;

	/*
	 * Get and check the features for the image.  Currently the
	 * features are assumed to never change.
	 */
	ret = rbd_dev_v2_features(rbd_dev);
	if (ret)
		goto out_err;

	/* If the image supports fancy striping, get its parameters */

	if (rbd_dev->header.features & RBD_FEATURE_STRIPINGV2) {
		ret = rbd_dev_v2_striping_info(rbd_dev);
		if (ret < 0)
			goto out_err;
	}

	if (rbd_dev->header.features & RBD_FEATURE_DATA_POOL) {
		ret = rbd_dev_v2_data_pool(rbd_dev);
		if (ret)
			goto out_err;
	}

	rbd_init_layout(rbd_dev);
	return 0;

out_err:
	rbd_dev->header.features = 0;
	kfree(rbd_dev->header.object_prefix);
	rbd_dev->header.object_prefix = NULL;
	return ret;
}

/*
 * @depth is rbd_dev_image_probe() -> rbd_dev_probe_parent() ->
 * rbd_dev_image_probe() recursion depth, which means it's also the
 * length of the already discovered part of the parent chain.
 */
static int rbd_dev_probe_parent(struct rbd_device *rbd_dev, int depth)
{
	struct rbd_device *parent = NULL;
	int ret;

	if (!rbd_dev->parent_spec)
		return 0;

	if (++depth > RBD_MAX_PARENT_CHAIN_LEN) {
		pr_info("parent chain is too long (%d)\n", depth);
		ret = -EINVAL;
		goto out_err;
	}

	parent = __rbd_dev_create(rbd_dev->rbd_client, rbd_dev->parent_spec);
	if (!parent) {
		ret = -ENOMEM;
		goto out_err;
	}

	/*
	 * Images related by parent/child relationships always share
	 * rbd_client and spec/parent_spec, so bump their refcounts.
	 */
	__rbd_get_client(rbd_dev->rbd_client);
	rbd_spec_get(rbd_dev->parent_spec);

	ret = rbd_dev_image_probe(parent, depth);
	if (ret < 0)
		goto out_err;

	rbd_dev->parent = parent;
	atomic_set(&rbd_dev->parent_ref, 1);
	return 0;

out_err:
	rbd_dev_unparent(rbd_dev);
	rbd_dev_destroy(parent);
	return ret;
}
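
/*
 * Illustrative only: for a chain of clones such as
 *
 *	base <- clone1 <- clone2 (the image being mapped)
 *
 * rbd_dev_image_probe() recurses through rbd_dev_probe_parent()
 * twice, with depth 1 and then 2, and bails out with -EINVAL only
 * once depth exceeds RBD_MAX_PARENT_CHAIN_LEN.
 */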

static void rbd_dev_device_release(struct rbd_device *rbd_dev)
{
	clear_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags);
	rbd_dev_mapping_clear(rbd_dev);
	rbd_free_disk(rbd_dev);
	if (!single_major)
		unregister_blkdev(rbd_dev->major, rbd_dev->name);
}

/*
 * rbd_dev->header_rwsem must be locked for write and will be unlocked
 * upon return.
 */
static int rbd_dev_device_setup(struct rbd_device *rbd_dev)
{
	int ret;

	/* Record our major and minor device numbers. */

	if (!single_major) {
		ret = register_blkdev(0, rbd_dev->name);
		if (ret < 0)
			goto err_out_unlock;

		rbd_dev->major = ret;
		rbd_dev->minor = 0;
	} else {
		rbd_dev->major = rbd_major;
		rbd_dev->minor = rbd_dev_id_to_minor(rbd_dev->dev_id);
	}

	/* Set up the blkdev mapping. */

	ret = rbd_init_disk(rbd_dev);
	if (ret)
		goto err_out_blkdev;

	ret = rbd_dev_mapping_set(rbd_dev);
	if (ret)
		goto err_out_disk;

	set_capacity(rbd_dev->disk, rbd_dev->mapping.size / SECTOR_SIZE);
	set_disk_ro(rbd_dev->disk, rbd_dev->opts->read_only);

	ret = dev_set_name(&rbd_dev->dev, "%d", rbd_dev->dev_id);
	if (ret)
		goto err_out_mapping;

	set_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags);
	up_write(&rbd_dev->header_rwsem);
	return 0;

err_out_mapping:
	rbd_dev_mapping_clear(rbd_dev);
err_out_disk:
	rbd_free_disk(rbd_dev);
err_out_blkdev:
	if (!single_major)
		unregister_blkdev(rbd_dev->major, rbd_dev->name);
err_out_unlock:
	up_write(&rbd_dev->header_rwsem);
	return ret;
}

static int rbd_dev_header_name(struct rbd_device *rbd_dev)
{
	struct rbd_spec *spec = rbd_dev->spec;
	int ret;

	/* Record the header object name for this rbd image. */

	rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
	if (rbd_dev->image_format == 1)
		ret = ceph_oid_aprintf(&rbd_dev->header_oid, GFP_KERNEL, "%s%s",
				       spec->image_name, RBD_SUFFIX);
	else
		ret = ceph_oid_aprintf(&rbd_dev->header_oid, GFP_KERNEL, "%s%s",
				       RBD_HEADER_PREFIX, spec->image_id);

	return ret;
}
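
/*
 * Illustrative only: assuming the usual definitions of RBD_SUFFIX
 * (".rbd") and RBD_HEADER_PREFIX ("rbd_header."), an image named
 * "myimage" with (made-up) id "1018e2a1d9c4" gets the header object
 *
 *	myimage.rbd			- format 1
 *	rbd_header.1018e2a1d9c4		- format 2
 *
 * i.e. format 1 headers are named after the image, format 2 headers
 * after its id.
 */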

static void rbd_dev_image_release(struct rbd_device *rbd_dev)
{
	rbd_dev_unprobe(rbd_dev);
	if (rbd_dev->opts)
		rbd_unregister_watch(rbd_dev);
	rbd_dev->image_format = 0;
	kfree(rbd_dev->spec->image_id);
	rbd_dev->spec->image_id = NULL;
}

/*
 * Probe for the existence of the header object for the given rbd
 * device.  If this image is the one being mapped (i.e., not a
 * parent), initiate a watch on its header object before using that
 * object to get detailed information about the rbd image.
 */
static int rbd_dev_image_probe(struct rbd_device *rbd_dev, int depth)
{
	int ret;

	/*
	 * Get the id from the image id object.  Unless there's an
	 * error, rbd_dev->spec->image_id will be filled in with
	 * a dynamically-allocated string, and rbd_dev->image_format
	 * will be set to either 1 or 2.
	 */
	ret = rbd_dev_image_id(rbd_dev);
	if (ret)
		return ret;

	ret = rbd_dev_header_name(rbd_dev);
	if (ret)
		goto err_out_format;

	if (!depth) {
		ret = rbd_register_watch(rbd_dev);
		if (ret) {
			if (ret == -ENOENT)
				pr_info("image %s/%s%s%s does not exist\n",
					rbd_dev->spec->pool_name,
					rbd_dev->spec->pool_ns ?: "",
					rbd_dev->spec->pool_ns ? "/" : "",
					rbd_dev->spec->image_name);
			goto err_out_format;
		}
	}

	ret = rbd_dev_header_info(rbd_dev);
	if (ret)
		goto err_out_watch;

	/*
	 * If this image is the one being mapped, we have pool name and
	 * id, image name and id, and snap name - need to fill snap id.
	 * Otherwise this is a parent image, identified by pool, image
	 * and snap ids - need to fill in names for those ids.
	 */
	if (!depth)
		ret = rbd_spec_fill_snap_id(rbd_dev);
	else
		ret = rbd_spec_fill_names(rbd_dev);
	if (ret) {
		if (ret == -ENOENT)
			pr_info("snap %s/%s%s%s@%s does not exist\n",
				rbd_dev->spec->pool_name,
				rbd_dev->spec->pool_ns ?: "",
				rbd_dev->spec->pool_ns ? "/" : "",
				rbd_dev->spec->image_name,
				rbd_dev->spec->snap_name);
		goto err_out_probe;
	}

	if (rbd_dev->header.features & RBD_FEATURE_LAYERING) {
		ret = rbd_dev_v2_parent_info(rbd_dev);
		if (ret)
			goto err_out_probe;

		/*
		 * Need to warn users if this image is the one being
		 * mapped and has a parent.
		 */
		if (!depth && rbd_dev->parent_spec)
			rbd_warn(rbd_dev,
				 "WARNING: kernel layering is EXPERIMENTAL!");
	}

	ret = rbd_dev_probe_parent(rbd_dev, depth);
	if (ret)
		goto err_out_probe;

	dout("discovered format %u image, header name is %s\n",
	     rbd_dev->image_format, rbd_dev->header_oid.name);
	return 0;

err_out_probe:
	rbd_dev_unprobe(rbd_dev);
err_out_watch:
	if (!depth)
		rbd_unregister_watch(rbd_dev);
err_out_format:
	rbd_dev->image_format = 0;
	kfree(rbd_dev->spec->image_id);
	rbd_dev->spec->image_id = NULL;
	return ret;
}

static ssize_t do_rbd_add(struct bus_type *bus,
			  const char *buf,
			  size_t count)
{
	struct rbd_device *rbd_dev = NULL;
	struct ceph_options *ceph_opts = NULL;
	struct rbd_options *rbd_opts = NULL;
	struct rbd_spec *spec = NULL;
	struct rbd_client *rbdc;
	int rc;

	if (!try_module_get(THIS_MODULE))
		return -ENODEV;

	/* parse add command */
	rc = rbd_add_parse_args(buf, &ceph_opts, &rbd_opts, &spec);
	if (rc < 0)
		goto out;

	rbdc = rbd_get_client(ceph_opts);
	if (IS_ERR(rbdc)) {
		rc = PTR_ERR(rbdc);
		goto err_out_args;
	}

	/* pick the pool */
	rc = ceph_pg_poolid_by_name(rbdc->client->osdc.osdmap, spec->pool_name);
	if (rc < 0) {
		if (rc == -ENOENT)
			pr_info("pool %s does not exist\n", spec->pool_name);
		goto err_out_client;
	}
	spec->pool_id = (u64)rc;

	rbd_dev = rbd_dev_create(rbdc, spec, rbd_opts);
	if (!rbd_dev) {
		rc = -ENOMEM;
		goto err_out_client;
	}
	rbdc = NULL;		/* rbd_dev now owns this */
	spec = NULL;		/* rbd_dev now owns this */
	rbd_opts = NULL;	/* rbd_dev now owns this */

	rbd_dev->config_info = kstrdup(buf, GFP_KERNEL);
	if (!rbd_dev->config_info) {
		rc = -ENOMEM;
		goto err_out_rbd_dev;
	}

	down_write(&rbd_dev->header_rwsem);
	rc = rbd_dev_image_probe(rbd_dev, 0);
	if (rc < 0) {
		up_write(&rbd_dev->header_rwsem);
		goto err_out_rbd_dev;
	}

	/* If we are mapping a snapshot it must be marked read-only */
	if (rbd_dev->spec->snap_id != CEPH_NOSNAP)
		rbd_dev->opts->read_only = true;

	rc = rbd_dev_device_setup(rbd_dev);
	if (rc)
		goto err_out_image_probe;

	if (rbd_dev->opts->exclusive) {
		rc = rbd_add_acquire_lock(rbd_dev);
		if (rc)
			goto err_out_device_setup;
	}

	/* Everything's ready.  Announce the disk to the world. */

	rc = device_add(&rbd_dev->dev);
	if (rc)
		goto err_out_image_lock;

	add_disk(rbd_dev->disk);
	/* see rbd_init_disk() */
	blk_put_queue(rbd_dev->disk->queue);

	spin_lock(&rbd_dev_list_lock);
	list_add_tail(&rbd_dev->node, &rbd_dev_list);
	spin_unlock(&rbd_dev_list_lock);

	pr_info("%s: capacity %llu features 0x%llx\n", rbd_dev->disk->disk_name,
		(unsigned long long)get_capacity(rbd_dev->disk) << SECTOR_SHIFT,
		rbd_dev->header.features);
	rc = count;
out:
	module_put(THIS_MODULE);
	return rc;

err_out_image_lock:
	rbd_dev_image_unlock(rbd_dev);
err_out_device_setup:
	rbd_dev_device_release(rbd_dev);
err_out_image_probe:
	rbd_dev_image_release(rbd_dev);
err_out_rbd_dev:
	rbd_dev_destroy(rbd_dev);
err_out_client:
	rbd_put_client(rbdc);
err_out_args:
	rbd_spec_put(spec);
	kfree(rbd_opts);
	goto out;
}

static ssize_t rbd_add(struct bus_type *bus,
		       const char *buf,
		       size_t count)
{
	if (single_major)
		return -EINVAL;

	return do_rbd_add(bus, buf, count);
}

static ssize_t rbd_add_single_major(struct bus_type *bus,
				    const char *buf,
				    size_t count)
{
	return do_rbd_add(bus, buf, count);
}
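
/*
 * Illustrative only: with single_major=Y all images share one major
 * number and, assuming the usual rbd_dev_id_to_minor() definition
 * (dev_id << RBD_SINGLE_MAJOR_PART_SHIFT), the device with dev_id 2
 * gets minor 2 << 4 = 32, leaving minors 33-47 for its partitions
 * rbd2p1..rbd2p15.  Without single_major each device registers its
 * own major and starts at minor 0.
 */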

static void rbd_dev_remove_parent(struct rbd_device *rbd_dev)
{
	while (rbd_dev->parent) {
		struct rbd_device *first = rbd_dev;
		struct rbd_device *second = first->parent;
		struct rbd_device *third;

		/*
		 * Follow the chain down to the parent with no
		 * grandparent and remove it.
		 */
		while (second && (third = second->parent)) {
			first = second;
			second = third;
		}
		rbd_assert(second);
		rbd_dev_image_release(second);
		rbd_dev_destroy(second);
		first->parent = NULL;
		first->parent_overlap = 0;

		rbd_assert(first->parent_spec);
		rbd_spec_put(first->parent_spec);
		first->parent_spec = NULL;
	}
}

static ssize_t do_rbd_remove(struct bus_type *bus,
			     const char *buf,
			     size_t count)
{
	struct rbd_device *rbd_dev = NULL;
	struct list_head *tmp;
	int dev_id;
	char opt_buf[6];
	bool force = false;
	int ret;

	dev_id = -1;
	opt_buf[0] = '\0';
	sscanf(buf, "%d %5s", &dev_id, opt_buf);
	if (dev_id < 0) {
		pr_err("dev_id out of range\n");
		return -EINVAL;
	}
	if (opt_buf[0] != '\0') {
		if (!strcmp(opt_buf, "force")) {
			force = true;
		} else {
			pr_err("bad remove option at '%s'\n", opt_buf);
			return -EINVAL;
		}
	}

	ret = -ENOENT;
	spin_lock(&rbd_dev_list_lock);
	list_for_each(tmp, &rbd_dev_list) {
		rbd_dev = list_entry(tmp, struct rbd_device, node);
		if (rbd_dev->dev_id == dev_id) {
			ret = 0;
			break;
		}
	}
	if (!ret) {
		spin_lock_irq(&rbd_dev->lock);
		if (rbd_dev->open_count && !force)
			ret = -EBUSY;
		else if (test_and_set_bit(RBD_DEV_FLAG_REMOVING,
					  &rbd_dev->flags))
			ret = -EINPROGRESS;
		spin_unlock_irq(&rbd_dev->lock);
	}
	spin_unlock(&rbd_dev_list_lock);
	if (ret)
		return ret;

	if (force) {
		/*
		 * Prevent new IO from being queued and wait for existing
		 * IO to complete/fail.
		 */
		blk_mq_freeze_queue(rbd_dev->disk->queue);
		blk_set_queue_dying(rbd_dev->disk->queue);
	}

	del_gendisk(rbd_dev->disk);
	spin_lock(&rbd_dev_list_lock);
	list_del_init(&rbd_dev->node);
	spin_unlock(&rbd_dev_list_lock);
	device_del(&rbd_dev->dev);

	rbd_dev_image_unlock(rbd_dev);
	rbd_dev_device_release(rbd_dev);
	rbd_dev_image_release(rbd_dev);
	rbd_dev_destroy(rbd_dev);
	return count;
}
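
/*
 * Illustrative only: a mapped image is unmapped by writing its dev_id
 * (and optionally "force") to the bus attribute, e.g.
 *
 *	$ echo "0" > /sys/bus/rbd/remove
 *	$ echo "0 force" > /sys/bus/rbd/remove
 *
 * The plain form fails with -EBUSY while the device is open; "force"
 * freezes the queue and marks it dying so outstanding IO fails fast.
 */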

static ssize_t rbd_remove(struct bus_type *bus,
			  const char *buf,
			  size_t count)
{
	if (single_major)
		return -EINVAL;

	return do_rbd_remove(bus, buf, count);
}

static ssize_t rbd_remove_single_major(struct bus_type *bus,
				       const char *buf,
				       size_t count)
{
	return do_rbd_remove(bus, buf, count);
}

/*
 * create control files in sysfs
 * /sys/bus/rbd/...
 */
static int __init rbd_sysfs_init(void)
{
	int ret;

	ret = device_register(&rbd_root_dev);
	if (ret < 0)
		return ret;

	ret = bus_register(&rbd_bus_type);
	if (ret < 0)
		device_unregister(&rbd_root_dev);

	return ret;
}

static void __exit rbd_sysfs_cleanup(void)
{
	bus_unregister(&rbd_bus_type);
	device_unregister(&rbd_root_dev);
}
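
/*
 * Illustrative only: once the bus is registered, userspace drives the
 * driver through bus attributes such as
 *
 *	/sys/bus/rbd/add	(or add_single_major)
 *	/sys/bus/rbd/remove	(or remove_single_major)
 *
 * matching the rbd_add*() and rbd_remove*() handlers above.
 */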

static int __init rbd_slab_init(void)
{
	rbd_assert(!rbd_img_request_cache);
	rbd_img_request_cache = KMEM_CACHE(rbd_img_request, 0);
	if (!rbd_img_request_cache)
		return -ENOMEM;

	rbd_assert(!rbd_obj_request_cache);
	rbd_obj_request_cache = KMEM_CACHE(rbd_obj_request, 0);
	if (!rbd_obj_request_cache)
		goto out_err;

	return 0;

out_err:
	kmem_cache_destroy(rbd_img_request_cache);
	rbd_img_request_cache = NULL;
	return -ENOMEM;
}

static void rbd_slab_exit(void)
{
	rbd_assert(rbd_obj_request_cache);
	kmem_cache_destroy(rbd_obj_request_cache);
	rbd_obj_request_cache = NULL;

	rbd_assert(rbd_img_request_cache);
	kmem_cache_destroy(rbd_img_request_cache);
	rbd_img_request_cache = NULL;
}

static int __init rbd_init(void)
{
	int rc;

	if (!libceph_compatible(NULL)) {
		rbd_warn(NULL, "libceph incompatibility (quitting)");
		return -EINVAL;
	}

	rc = rbd_slab_init();
	if (rc)
		return rc;

	/*
	 * The number of active work items is limited by the number of
	 * rbd devices * queue depth, so leave @max_active at default.
	 */
	rbd_wq = alloc_workqueue(RBD_DRV_NAME, WQ_MEM_RECLAIM, 0);
	if (!rbd_wq) {
		rc = -ENOMEM;
		goto err_out_slab;
	}

	if (single_major) {
		rbd_major = register_blkdev(0, RBD_DRV_NAME);
		if (rbd_major < 0) {
			rc = rbd_major;
			goto err_out_wq;
		}
	}

	rc = rbd_sysfs_init();
	if (rc)
		goto err_out_blkdev;

	if (single_major)
		pr_info("loaded (major %d)\n", rbd_major);
	else
		pr_info("loaded\n");

	return 0;

err_out_blkdev:
	if (single_major)
		unregister_blkdev(rbd_major, RBD_DRV_NAME);
err_out_wq:
	destroy_workqueue(rbd_wq);
err_out_slab:
	rbd_slab_exit();
	return rc;
}

static void __exit rbd_exit(void)
{
	ida_destroy(&rbd_dev_id_ida);
	rbd_sysfs_cleanup();
	if (single_major)
		unregister_blkdev(rbd_major, RBD_DRV_NAME);
	destroy_workqueue(rbd_wq);
	rbd_slab_exit();
}

module_init(rbd_init);
module_exit(rbd_exit);

MODULE_AUTHOR("Alex Elder <elder@inktank.com>");
MODULE_AUTHOR("Sage Weil <sage@newdream.net>");
MODULE_AUTHOR("Yehuda Sadeh <yehuda@hq.newdream.net>");
/* following authorship retained from original osdblk.c */
MODULE_AUTHOR("Jeff Garzik <jeff@garzik.org>");

MODULE_DESCRIPTION("RADOS Block Device (RBD) driver");
MODULE_LICENSE("GPL");