/*
   rbd.c -- Export ceph rados objects as a Linux block device


   based on drivers/block/osdblk.c:

   Copyright 2009 Red Hat, Inc.

   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; see the file COPYING.  If not, write to
   the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.



   For usage instructions, please refer to:

		Documentation/ABI/testing/sysfs-bus-rbd

 */

#include <linux/ceph/libceph.h>
#include <linux/ceph/osd_client.h>
#include <linux/ceph/mon_client.h>
#include <linux/ceph/cls_lock_client.h>
#include <linux/ceph/striper.h>
#include <linux/ceph/decode.h>
#include <linux/parser.h>
#include <linux/bsearch.h>

#include <linux/kernel.h>
#include <linux/device.h>
#include <linux/module.h>
#include <linux/blk-mq.h>
#include <linux/fs.h>
#include <linux/blkdev.h>
#include <linux/slab.h>
#include <linux/idr.h>
#include <linux/workqueue.h>

#include "rbd_types.h"

#define RBD_DEBUG	/* Activate rbd_assert() calls */

/*
 * Increment the given counter and return its updated value.
 * If the counter is already 0 it will not be incremented.
 * If the counter is already at its maximum value returns
 * -EINVAL without updating it.
 */
static int atomic_inc_return_safe(atomic_t *v)
{
	unsigned int counter;

	counter = (unsigned int)atomic_fetch_add_unless(v, 1, 0);
	if (counter <= (unsigned int)INT_MAX)
		return (int)counter;

	atomic_dec(v);

	return -EINVAL;
}

/* Decrement the counter.  Return the resulting value, or -EINVAL */
static int atomic_dec_return_safe(atomic_t *v)
{
	int counter;

	counter = atomic_dec_return(v);
	if (counter >= 0)
		return counter;

	atomic_inc(v);

	return -EINVAL;
}
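
/*
 * These helpers implement a saturating reference count (used below
 * for rbd_dev->parent_ref): once the count has dropped to zero it
 * stays zero, so a stale reference can never resurrect it.  For
 * example, with the counter at 0, atomic_inc_return_safe() leaves it
 * at 0 and returns 0; with the counter at 3 it returns 4.
 */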

#define RBD_DRV_NAME "rbd"

#define RBD_MINORS_PER_MAJOR		256
#define RBD_SINGLE_MAJOR_PART_SHIFT	4

#define RBD_MAX_PARENT_CHAIN_LEN	16

#define RBD_SNAP_DEV_NAME_PREFIX	"snap_"
#define RBD_MAX_SNAP_NAME_LEN	\
			(NAME_MAX - (sizeof (RBD_SNAP_DEV_NAME_PREFIX) - 1))

#define RBD_MAX_SNAP_COUNT	510	/* allows max snapc to fit in 4KB */

#define RBD_SNAP_HEAD_NAME	"-"

#define	BAD_SNAP_INDEX	U32_MAX		/* invalid index into snap array */

/* This allows a single page to hold an image name sent by OSD */
#define RBD_IMAGE_NAME_LEN_MAX	(PAGE_SIZE - sizeof (__le32) - 1)
#define RBD_IMAGE_ID_LEN_MAX	64

#define RBD_OBJ_PREFIX_LEN_MAX	64

#define RBD_NOTIFY_TIMEOUT	5	/* seconds */
#define RBD_RETRY_DELAY		msecs_to_jiffies(1000)

/* Feature bits */

#define RBD_FEATURE_LAYERING		(1ULL<<0)
#define RBD_FEATURE_STRIPINGV2		(1ULL<<1)
#define RBD_FEATURE_EXCLUSIVE_LOCK	(1ULL<<2)
#define RBD_FEATURE_DATA_POOL		(1ULL<<7)
#define RBD_FEATURE_OPERATIONS		(1ULL<<8)

#define RBD_FEATURES_ALL	(RBD_FEATURE_LAYERING |		\
				 RBD_FEATURE_STRIPINGV2 |	\
				 RBD_FEATURE_EXCLUSIVE_LOCK |	\
				 RBD_FEATURE_DATA_POOL |	\
				 RBD_FEATURE_OPERATIONS)

/* Features supported by this (client software) implementation. */

#define RBD_FEATURES_SUPPORTED	(RBD_FEATURES_ALL)
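
/*
 * Mapping an image whose feature bits fall outside
 * RBD_FEATURES_SUPPORTED is refused.  For example, an image with only
 * layering and exclusive-lock (0x5) maps fine, while one with the
 * journaling bit (1ULL<<6, handled only by userspace librbd at the
 * time of writing) does not.
 */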

/*
 * An RBD device name will be "rbd#", where the "rbd" comes from
 * RBD_DRV_NAME above, and # is a unique integer identifier.
 */
#define DEV_NAME_LEN		32

/*
 * block device image metadata (in-memory version)
 */
struct rbd_image_header {
	/* These six fields never change for a given rbd image */
	char *object_prefix;
	__u8 obj_order;
	u64 stripe_unit;
	u64 stripe_count;
	s64 data_pool_id;
	u64 features;		/* Might be changeable someday? */

	/* The remaining fields need to be updated occasionally */
	u64 image_size;
	struct ceph_snap_context *snapc;
	char *snap_names;	/* format 1 only */
	u64 *snap_sizes;	/* format 1 only */
};

/*
 * An rbd image specification.
 *
 * The tuple (pool_id, image_id, snap_id) is sufficient to uniquely
 * identify an image.  Each rbd_dev structure includes a pointer to
 * an rbd_spec structure that encapsulates this identity.
 *
 * Each of the id's in an rbd_spec has an associated name.  For a
 * user-mapped image, the names are supplied and the id's associated
 * with them are looked up.  For a layered image, a parent image is
 * defined by the tuple, and the names are looked up.
 *
 * An rbd_dev structure contains a parent_spec pointer which is
 * non-null if the image it represents is a child in a layered
 * image.  This pointer will refer to the rbd_spec structure used
 * by the parent rbd_dev for its own identity (i.e., the structure
 * is shared between the parent and child).
 *
 * Since these structures are populated once, during the discovery
 * phase of image construction, they are effectively immutable so
 * we make no effort to synchronize access to them.
 *
 * Note that code herein does not assume the image name is known (it
 * could be a null pointer).
 */
struct rbd_spec {
	u64		pool_id;
	const char	*pool_name;
	const char	*pool_ns;	/* NULL if default, never "" */

	const char	*image_id;
	const char	*image_name;

	u64		snap_id;
	const char	*snap_name;

	struct kref	kref;
};
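
/*
 * For example (names are placeholders): mapping "mypool/myimage@mysnap"
 * yields a spec whose pool_name, image_name and snap_name come from
 * the user, while pool_id, image_id and snap_id are filled in by
 * lookups against the cluster during image discovery.
 */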

/*
 * an instance of the client.  multiple devices may share an rbd client.
 */
struct rbd_client {
	struct ceph_client	*client;
	struct kref		kref;
	struct list_head	node;
};

struct rbd_img_request;

enum obj_request_type {
	OBJ_REQUEST_NODATA = 1,
	OBJ_REQUEST_BIO,	/* pointer into provided bio (list) */
	OBJ_REQUEST_BVECS,	/* pointer into provided bio_vec array */
	OBJ_REQUEST_OWN_BVECS,	/* private bio_vec array, doesn't own pages */
};

enum obj_operation_type {
	OBJ_OP_READ = 1,
	OBJ_OP_WRITE,
	OBJ_OP_DISCARD,
};

/*
 * Writes go through the following state machine to deal with
 * layering:
 *
 *                       need copyup
 * RBD_OBJ_WRITE_GUARD ---------------> RBD_OBJ_WRITE_COPYUP
 *        |     ^                              |
 *        v     \------------------------------/
 *      done
 *        ^
 *        |
 * RBD_OBJ_WRITE_FLAT
 *
 * Writes start in RBD_OBJ_WRITE_GUARD or _FLAT, depending on whether
 * there is a parent or not.
 */
enum rbd_obj_write_state {
	RBD_OBJ_WRITE_FLAT = 1,
	RBD_OBJ_WRITE_GUARD,
	RBD_OBJ_WRITE_COPYUP,
};

struct rbd_obj_request {
	struct ceph_object_extent ex;
	union {
		bool			tried_parent;	/* for reads */
		enum rbd_obj_write_state write_state;	/* for writes */
	};

	struct rbd_img_request	*img_request;
	struct ceph_file_extent	*img_extents;
	u32			num_img_extents;

	union {
		struct ceph_bio_iter	bio_pos;
		struct {
			struct ceph_bvec_iter	bvec_pos;
			u32			bvec_count;
			u32			bvec_idx;
		};
	};
	struct bio_vec		*copyup_bvecs;
	u32			copyup_bvec_count;

	struct ceph_osd_request	*osd_req;

	u64			xferred;	/* bytes transferred */
	int			result;

	struct kref		kref;
};

enum img_req_flags {
	IMG_REQ_CHILD,		/* initiator: block = 0, child image = 1 */
	IMG_REQ_LAYERED,	/* ENOENT handling: normal = 0, layered = 1 */
};

struct rbd_img_request {
	struct rbd_device	*rbd_dev;
	enum obj_operation_type	op_type;
	enum obj_request_type	data_type;
	unsigned long		flags;
	union {
		u64			snap_id;	/* for reads */
		struct ceph_snap_context *snapc;	/* for writes */
	};
	union {
		struct request		*rq;		/* block request */
		struct rbd_obj_request	*obj_request;	/* obj req initiator */
	};
	spinlock_t		completion_lock;
	u64			xferred;	/* aggregate bytes transferred */
	int			result;		/* first nonzero obj_request result */

	struct list_head	object_extents;	/* obj_req.ex structs */
	u32			obj_request_count;
	u32			pending_count;

	struct kref		kref;
};

#define for_each_obj_request(ireq, oreq) \
	list_for_each_entry(oreq, &(ireq)->object_extents, ex.oe_item)
#define for_each_obj_request_safe(ireq, oreq, n) \
	list_for_each_entry_safe(oreq, n, &(ireq)->object_extents, ex.oe_item)
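
/*
 * Typical use (sketch): walk the object requests backing an image
 * request, e.g.
 *
 *	struct rbd_obj_request *obj_req;
 *
 *	for_each_obj_request(img_req, obj_req)
 *		rbd_obj_request_submit(obj_req);
 *
 * The _safe variant is required when entries may be unlinked from
 * object_extents during the walk.
 */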

enum rbd_watch_state {
	RBD_WATCH_STATE_UNREGISTERED,
	RBD_WATCH_STATE_REGISTERED,
	RBD_WATCH_STATE_ERROR,
};

enum rbd_lock_state {
	RBD_LOCK_STATE_UNLOCKED,
	RBD_LOCK_STATE_LOCKED,
	RBD_LOCK_STATE_RELEASING,
};

/* WatchNotify::ClientId */
struct rbd_client_id {
	u64 gid;
	u64 handle;
};

struct rbd_mapping {
	u64			size;
	u64			features;
};

/*
 * a single device
 */
struct rbd_device {
	int			dev_id;		/* blkdev unique id */

	int			major;		/* blkdev assigned major */
	int			minor;
	struct gendisk		*disk;		/* blkdev's gendisk and rq */

	u32			image_format;	/* Either 1 or 2 */
	struct rbd_client	*rbd_client;

	char			name[DEV_NAME_LEN]; /* blkdev name, e.g. rbd3 */

	spinlock_t		lock;		/* queue, flags, open_count */

	struct rbd_image_header	header;
	unsigned long		flags;		/* possibly lock protected */
	struct rbd_spec		*spec;
	struct rbd_options	*opts;
	char			*config_info;	/* add{,_single_major} string */

	struct ceph_object_id	header_oid;
	struct ceph_object_locator header_oloc;

	struct ceph_file_layout	layout;		/* used for all rbd requests */

	struct mutex		watch_mutex;
	enum rbd_watch_state	watch_state;
	struct ceph_osd_linger_request *watch_handle;
	u64			watch_cookie;
	struct delayed_work	watch_dwork;

	struct rw_semaphore	lock_rwsem;
	enum rbd_lock_state	lock_state;
	char			lock_cookie[32];
	struct rbd_client_id	owner_cid;
	struct work_struct	acquired_lock_work;
	struct work_struct	released_lock_work;
	struct delayed_work	lock_dwork;
	struct work_struct	unlock_work;
	wait_queue_head_t	lock_waitq;

	struct workqueue_struct	*task_wq;

	struct rbd_spec		*parent_spec;
	u64			parent_overlap;
	atomic_t		parent_ref;
	struct rbd_device	*parent;

	/* Block layer tags. */
	struct blk_mq_tag_set	tag_set;

	/* protects updating the header */
	struct rw_semaphore	header_rwsem;

	struct rbd_mapping	mapping;

	struct list_head	node;

	/* sysfs related */
	struct device		dev;
	unsigned long		open_count;	/* protected by lock */
};

/*
 * Flag bits for rbd_dev->flags:
 * - REMOVING (which is coupled with rbd_dev->open_count) is protected
 *   by rbd_dev->lock
 * - BLACKLISTED is protected by rbd_dev->lock_rwsem
 */
enum rbd_dev_flags {
	RBD_DEV_FLAG_EXISTS,	/* mapped snapshot has not been deleted */
	RBD_DEV_FLAG_REMOVING,	/* this mapping is being removed */
	RBD_DEV_FLAG_BLACKLISTED, /* our ceph_client is blacklisted */
};

static DEFINE_MUTEX(client_mutex);	/* Serialize client creation */

static LIST_HEAD(rbd_dev_list);		/* devices */
static DEFINE_SPINLOCK(rbd_dev_list_lock);

static LIST_HEAD(rbd_client_list);	/* clients */
static DEFINE_SPINLOCK(rbd_client_list_lock);

/* Slab caches for frequently-allocated structures */

static struct kmem_cache	*rbd_img_request_cache;
static struct kmem_cache	*rbd_obj_request_cache;

static int rbd_major;
static DEFINE_IDA(rbd_dev_id_ida);

static struct workqueue_struct *rbd_wq;

/*
 * single-major requires >= 0.75 version of userspace rbd utility.
 */
static bool single_major = true;
module_param(single_major, bool, 0444);
MODULE_PARM_DESC(single_major, "Use a single major number for all rbd devices (default: true)");

static ssize_t rbd_add(struct bus_type *bus, const char *buf,
		       size_t count);
static ssize_t rbd_remove(struct bus_type *bus, const char *buf,
			  size_t count);
static ssize_t rbd_add_single_major(struct bus_type *bus, const char *buf,
				    size_t count);
static ssize_t rbd_remove_single_major(struct bus_type *bus, const char *buf,
				       size_t count);
static int rbd_dev_image_probe(struct rbd_device *rbd_dev, int depth);

static int rbd_dev_id_to_minor(int dev_id)
{
	return dev_id << RBD_SINGLE_MAJOR_PART_SHIFT;
}

static int minor_to_rbd_dev_id(int minor)
{
	return minor >> RBD_SINGLE_MAJOR_PART_SHIFT;
}
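
/*
 * Worked example: with RBD_SINGLE_MAJOR_PART_SHIFT == 4, each device
 * owns 16 minors in single-major mode -- dev_id 0 maps to minor 0,
 * dev_id 1 to minor 16, dev_id 3 to minor 48 -- leaving room for 15
 * partitions per mapped image.
 */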

static bool __rbd_is_lock_owner(struct rbd_device *rbd_dev)
{
	return rbd_dev->lock_state == RBD_LOCK_STATE_LOCKED ||
	       rbd_dev->lock_state == RBD_LOCK_STATE_RELEASING;
}

static bool rbd_is_lock_owner(struct rbd_device *rbd_dev)
{
	bool is_lock_owner;

	down_read(&rbd_dev->lock_rwsem);
	is_lock_owner = __rbd_is_lock_owner(rbd_dev);
	up_read(&rbd_dev->lock_rwsem);
	return is_lock_owner;
}

static ssize_t rbd_supported_features_show(struct bus_type *bus, char *buf)
{
	return sprintf(buf, "0x%llx\n", RBD_FEATURES_SUPPORTED);
}

static BUS_ATTR(add, 0200, NULL, rbd_add);
static BUS_ATTR(remove, 0200, NULL, rbd_remove);
static BUS_ATTR(add_single_major, 0200, NULL, rbd_add_single_major);
static BUS_ATTR(remove_single_major, 0200, NULL, rbd_remove_single_major);
static BUS_ATTR(supported_features, 0444, rbd_supported_features_show, NULL);

static struct attribute *rbd_bus_attrs[] = {
	&bus_attr_add.attr,
	&bus_attr_remove.attr,
	&bus_attr_add_single_major.attr,
	&bus_attr_remove_single_major.attr,
	&bus_attr_supported_features.attr,
	NULL,
};

static umode_t rbd_bus_is_visible(struct kobject *kobj,
				  struct attribute *attr, int index)
{
	if (!single_major &&
	    (attr == &bus_attr_add_single_major.attr ||
	     attr == &bus_attr_remove_single_major.attr))
		return 0;

	return attr->mode;
}

static const struct attribute_group rbd_bus_group = {
	.attrs = rbd_bus_attrs,
	.is_visible = rbd_bus_is_visible,
};
__ATTRIBUTE_GROUPS(rbd_bus);
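
/*
 * These bus attributes form the control interface documented in
 * Documentation/ABI/testing/sysfs-bus-rbd.  Illustrative use (monitor
 * address and secret are placeholders):
 *
 *	echo "1.2.3.4:6789 name=admin,secret=AQBvbDtb... rbd myimage -" \
 *		> /sys/bus/rbd/add
 */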

static struct bus_type rbd_bus_type = {
	.name		= "rbd",
	.bus_groups	= rbd_bus_groups,
};

static void rbd_root_dev_release(struct device *dev)
{
}

static struct device rbd_root_dev = {
	.init_name =    "rbd",
	.release =      rbd_root_dev_release,
};

static __printf(2, 3)
void rbd_warn(struct rbd_device *rbd_dev, const char *fmt, ...)
{
	struct va_format vaf;
	va_list args;

	va_start(args, fmt);
	vaf.fmt = fmt;
	vaf.va = &args;

	if (!rbd_dev)
		printk(KERN_WARNING "%s: %pV\n", RBD_DRV_NAME, &vaf);
	else if (rbd_dev->disk)
		printk(KERN_WARNING "%s: %s: %pV\n",
			RBD_DRV_NAME, rbd_dev->disk->disk_name, &vaf);
	else if (rbd_dev->spec && rbd_dev->spec->image_name)
		printk(KERN_WARNING "%s: image %s: %pV\n",
			RBD_DRV_NAME, rbd_dev->spec->image_name, &vaf);
	else if (rbd_dev->spec && rbd_dev->spec->image_id)
		printk(KERN_WARNING "%s: id %s: %pV\n",
			RBD_DRV_NAME, rbd_dev->spec->image_id, &vaf);
	else	/* punt */
		printk(KERN_WARNING "%s: rbd_dev %p: %pV\n",
			RBD_DRV_NAME, rbd_dev, &vaf);
	va_end(args);
}

#ifdef RBD_DEBUG
#define rbd_assert(expr)						\
		if (unlikely(!(expr))) {				\
			printk(KERN_ERR "\nAssertion failure in %s() "	\
						"at line %d:\n\n"	\
					"\trbd_assert(%s);\n\n",	\
					__func__, __LINE__, #expr);	\
			BUG();						\
		}
#else /* !RBD_DEBUG */
#  define rbd_assert(expr)	((void) 0)
#endif /* !RBD_DEBUG */

static void rbd_dev_remove_parent(struct rbd_device *rbd_dev);

static int rbd_dev_refresh(struct rbd_device *rbd_dev);
static int rbd_dev_v2_header_onetime(struct rbd_device *rbd_dev);
static int rbd_dev_header_info(struct rbd_device *rbd_dev);
static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev);
static const char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev,
					u64 snap_id);
static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id,
		u8 *order, u64 *snap_size);
static int _rbd_dev_v2_snap_features(struct rbd_device *rbd_dev, u64 snap_id,
		u64 *snap_features);

static int rbd_open(struct block_device *bdev, fmode_t mode)
{
	struct rbd_device *rbd_dev = bdev->bd_disk->private_data;
	bool removing = false;

	spin_lock_irq(&rbd_dev->lock);
	if (test_bit(RBD_DEV_FLAG_REMOVING, &rbd_dev->flags))
		removing = true;
	else
		rbd_dev->open_count++;
	spin_unlock_irq(&rbd_dev->lock);
	if (removing)
		return -ENOENT;

	(void) get_device(&rbd_dev->dev);

	return 0;
}

static void rbd_release(struct gendisk *disk, fmode_t mode)
{
	struct rbd_device *rbd_dev = disk->private_data;
	unsigned long open_count_before;

	spin_lock_irq(&rbd_dev->lock);
	open_count_before = rbd_dev->open_count--;
	spin_unlock_irq(&rbd_dev->lock);
	rbd_assert(open_count_before > 0);

	put_device(&rbd_dev->dev);
}

static int rbd_ioctl_set_ro(struct rbd_device *rbd_dev, unsigned long arg)
{
	int ro;

	if (get_user(ro, (int __user *)arg))
		return -EFAULT;

	/* Snapshots can't be marked read-write */
	if (rbd_dev->spec->snap_id != CEPH_NOSNAP && !ro)
		return -EROFS;

	/* Let blkdev_roset() handle it */
	return -ENOTTY;
}

static int rbd_ioctl(struct block_device *bdev, fmode_t mode,
			unsigned int cmd, unsigned long arg)
{
	struct rbd_device *rbd_dev = bdev->bd_disk->private_data;
	int ret;

	switch (cmd) {
	case BLKROSET:
		ret = rbd_ioctl_set_ro(rbd_dev, arg);
		break;
	default:
		ret = -ENOTTY;
	}

	return ret;
}

#ifdef CONFIG_COMPAT
static int rbd_compat_ioctl(struct block_device *bdev, fmode_t mode,
				unsigned int cmd, unsigned long arg)
{
	return rbd_ioctl(bdev, mode, cmd, arg);
}
#endif /* CONFIG_COMPAT */

static const struct block_device_operations rbd_bd_ops = {
	.owner			= THIS_MODULE,
	.open			= rbd_open,
	.release		= rbd_release,
	.ioctl			= rbd_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl		= rbd_compat_ioctl,
#endif
};

/*
 * Initialize an rbd client instance.  Success or not, this function
 * consumes ceph_opts.  Caller holds client_mutex.
 */
static struct rbd_client *rbd_client_create(struct ceph_options *ceph_opts)
{
	struct rbd_client *rbdc;
	int ret = -ENOMEM;

	dout("%s:\n", __func__);
	rbdc = kmalloc(sizeof(struct rbd_client), GFP_KERNEL);
	if (!rbdc)
		goto out_opt;

	kref_init(&rbdc->kref);
	INIT_LIST_HEAD(&rbdc->node);

	rbdc->client = ceph_create_client(ceph_opts, rbdc);
	if (IS_ERR(rbdc->client))
		goto out_rbdc;
	ceph_opts = NULL; /* Now rbdc->client is responsible for ceph_opts */

	ret = ceph_open_session(rbdc->client);
	if (ret < 0)
		goto out_client;

	spin_lock(&rbd_client_list_lock);
	list_add_tail(&rbdc->node, &rbd_client_list);
	spin_unlock(&rbd_client_list_lock);

	dout("%s: rbdc %p\n", __func__, rbdc);

	return rbdc;
out_client:
	ceph_destroy_client(rbdc->client);
out_rbdc:
	kfree(rbdc);
out_opt:
	if (ceph_opts)
		ceph_destroy_options(ceph_opts);
	dout("%s: error %d\n", __func__, ret);

	return ERR_PTR(ret);
}

static struct rbd_client *__rbd_get_client(struct rbd_client *rbdc)
{
	kref_get(&rbdc->kref);

	return rbdc;
}

/*
 * Find a ceph client with specific addr and configuration.  If
 * found, bump its reference count.
 */
static struct rbd_client *rbd_client_find(struct ceph_options *ceph_opts)
{
	struct rbd_client *client_node;
	bool found = false;

	if (ceph_opts->flags & CEPH_OPT_NOSHARE)
		return NULL;

	spin_lock(&rbd_client_list_lock);
	list_for_each_entry(client_node, &rbd_client_list, node) {
		if (!ceph_compare_options(ceph_opts, client_node->client)) {
			__rbd_get_client(client_node);

			found = true;
			break;
		}
	}
	spin_unlock(&rbd_client_list_lock);

	return found ? client_node : NULL;
}

/*
 * (Per device) rbd map options
 */
enum {
	Opt_queue_depth,
	Opt_lock_timeout,
	Opt_last_int,
	/* int args above */
	Opt_pool_ns,
	Opt_last_string,
	/* string args above */
	Opt_read_only,
	Opt_read_write,
	Opt_lock_on_read,
	Opt_exclusive,
	Opt_notrim,
	Opt_err
};

static match_table_t rbd_opts_tokens = {
	{Opt_queue_depth, "queue_depth=%d"},
	{Opt_lock_timeout, "lock_timeout=%d"},
	/* int args above */
	{Opt_pool_ns, "_pool_ns=%s"},
	/* string args above */
	{Opt_read_only, "read_only"},
	{Opt_read_only, "ro"},		/* Alternate spelling */
	{Opt_read_write, "read_write"},
	{Opt_read_write, "rw"},		/* Alternate spelling */
	{Opt_lock_on_read, "lock_on_read"},
	{Opt_exclusive, "exclusive"},
	{Opt_notrim, "notrim"},
	{Opt_err, NULL}
};

struct rbd_options {
	int	queue_depth;
	unsigned long	lock_timeout;
	bool	read_only;
	bool	lock_on_read;
	bool	exclusive;
	bool	trim;
};

#define RBD_QUEUE_DEPTH_DEFAULT	BLKDEV_MAX_RQ
#define RBD_LOCK_TIMEOUT_DEFAULT 0  /* no timeout */
#define RBD_READ_ONLY_DEFAULT	false
#define RBD_LOCK_ON_READ_DEFAULT false
#define RBD_EXCLUSIVE_DEFAULT	false
#define RBD_TRIM_DEFAULT	true

struct parse_rbd_opts_ctx {
	struct rbd_spec		*spec;
	struct rbd_options	*opts;
};

static int parse_rbd_opts_token(char *c, void *private)
{
	struct parse_rbd_opts_ctx *pctx = private;
	substring_t argstr[MAX_OPT_ARGS];
	int token, intval, ret;

	token = match_token(c, rbd_opts_tokens, argstr);
	if (token < Opt_last_int) {
		ret = match_int(&argstr[0], &intval);
		if (ret < 0) {
			pr_err("bad option arg (not int) at '%s'\n", c);
			return ret;
		}
		dout("got int token %d val %d\n", token, intval);
	} else if (token > Opt_last_int && token < Opt_last_string) {
		dout("got string token %d val %s\n", token, argstr[0].from);
	} else {
		dout("got token %d\n", token);
	}

	switch (token) {
	case Opt_queue_depth:
		if (intval < 1) {
			pr_err("queue_depth out of range\n");
			return -EINVAL;
		}
		pctx->opts->queue_depth = intval;
		break;
	case Opt_lock_timeout:
		/* 0 is "wait forever" (i.e. infinite timeout) */
		if (intval < 0 || intval > INT_MAX / 1000) {
			pr_err("lock_timeout out of range\n");
			return -EINVAL;
		}
		pctx->opts->lock_timeout = msecs_to_jiffies(intval * 1000);
		break;
	case Opt_pool_ns:
		kfree(pctx->spec->pool_ns);
		pctx->spec->pool_ns = match_strdup(argstr);
		if (!pctx->spec->pool_ns)
			return -ENOMEM;
		break;
	case Opt_read_only:
		pctx->opts->read_only = true;
		break;
	case Opt_read_write:
		pctx->opts->read_only = false;
		break;
	case Opt_lock_on_read:
		pctx->opts->lock_on_read = true;
		break;
	case Opt_exclusive:
		pctx->opts->exclusive = true;
		break;
	case Opt_notrim:
		pctx->opts->trim = false;
		break;
	default:
		/* libceph prints "bad option" msg */
		return -EINVAL;
	}

	return 0;
}

static char* obj_op_name(enum obj_operation_type op_type)
{
	switch (op_type) {
	case OBJ_OP_READ:
		return "read";
	case OBJ_OP_WRITE:
		return "write";
	case OBJ_OP_DISCARD:
		return "discard";
	default:
		return "???";
	}
}

/*
 * Destroy ceph client
 *
 * Takes rbd_client_list_lock itself, so the caller must not hold it.
 */
static void rbd_client_release(struct kref *kref)
{
	struct rbd_client *rbdc = container_of(kref, struct rbd_client, kref);

	dout("%s: rbdc %p\n", __func__, rbdc);
	spin_lock(&rbd_client_list_lock);
	list_del(&rbdc->node);
	spin_unlock(&rbd_client_list_lock);

	ceph_destroy_client(rbdc->client);
	kfree(rbdc);
}

/*
 * Drop reference to ceph client node. If it's not referenced anymore, release
 * it.
 */
static void rbd_put_client(struct rbd_client *rbdc)
{
	if (rbdc)
		kref_put(&rbdc->kref, rbd_client_release);
}

static int wait_for_latest_osdmap(struct ceph_client *client)
{
	u64 newest_epoch;
	int ret;

	ret = ceph_monc_get_version(&client->monc, "osdmap", &newest_epoch);
	if (ret)
		return ret;

	if (client->osdc.osdmap->epoch >= newest_epoch)
		return 0;

	ceph_osdc_maybe_request_map(&client->osdc);
	return ceph_monc_wait_osdmap(&client->monc, newest_epoch,
				     client->options->mount_timeout);
}

/*
 * Get a ceph client with specific addr and configuration, if one does
 * not exist create it.  Either way, ceph_opts is consumed by this
 * function.
 */
static struct rbd_client *rbd_get_client(struct ceph_options *ceph_opts)
{
	struct rbd_client *rbdc;
	int ret;

	mutex_lock_nested(&client_mutex, SINGLE_DEPTH_NESTING);
	rbdc = rbd_client_find(ceph_opts);
	if (rbdc) {
		ceph_destroy_options(ceph_opts);

		/*
		 * Using an existing client.  Make sure ->pg_pools is up to
		 * date before we look up the pool id in do_rbd_add().
		 */
		ret = wait_for_latest_osdmap(rbdc->client);
		if (ret) {
			rbd_warn(NULL, "failed to get latest osdmap: %d", ret);
			rbd_put_client(rbdc);
			rbdc = ERR_PTR(ret);
		}
	} else {
		rbdc = rbd_client_create(ceph_opts);
	}
	mutex_unlock(&client_mutex);

	return rbdc;
}

static bool rbd_image_format_valid(u32 image_format)
{
	return image_format == 1 || image_format == 2;
}

static bool rbd_dev_ondisk_valid(struct rbd_image_header_ondisk *ondisk)
{
	size_t size;
	u32 snap_count;

	/* The header has to start with the magic rbd header text */
	if (memcmp(&ondisk->text, RBD_HEADER_TEXT, sizeof (RBD_HEADER_TEXT)))
		return false;

	/* The bio layer requires at least sector-sized I/O */

	if (ondisk->options.order < SECTOR_SHIFT)
		return false;

	/* If we use u64 in a few spots we may be able to loosen this */

	if (ondisk->options.order > 8 * sizeof (int) - 1)
		return false;

	/*
	 * The size of a snapshot header has to fit in a size_t, and
	 * that limits the number of snapshots.
	 */
	snap_count = le32_to_cpu(ondisk->snap_count);
	size = SIZE_MAX - sizeof (struct ceph_snap_context);
	if (snap_count > size / sizeof (__le64))
		return false;

	/*
	 * Not only that, but the size of the entire snapshot
	 * header must also be representable in a size_t.
	 */
	size -= snap_count * sizeof (__le64);
	if ((u64) size < le64_to_cpu(ondisk->snap_names_len))
		return false;

	return true;
}
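
/*
 * The order checks above bound format 1 object sizes to the range
 * 512 bytes (order 9, SECTOR_SHIFT) through 2 GiB (order 31); an
 * image created with default settings uses order 22, i.e. 4 MiB
 * objects.
 */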

/*
 * returns the size of an object in the image
 */
static u32 rbd_obj_bytes(struct rbd_image_header *header)
{
	return 1U << header->obj_order;
}

static void rbd_init_layout(struct rbd_device *rbd_dev)
{
	if (rbd_dev->header.stripe_unit == 0 ||
	    rbd_dev->header.stripe_count == 0) {
		rbd_dev->header.stripe_unit = rbd_obj_bytes(&rbd_dev->header);
		rbd_dev->header.stripe_count = 1;
	}

	rbd_dev->layout.stripe_unit = rbd_dev->header.stripe_unit;
	rbd_dev->layout.stripe_count = rbd_dev->header.stripe_count;
	rbd_dev->layout.object_size = rbd_obj_bytes(&rbd_dev->header);
	rbd_dev->layout.pool_id = rbd_dev->header.data_pool_id == CEPH_NOPOOL ?
			  rbd_dev->spec->pool_id : rbd_dev->header.data_pool_id;
	RCU_INIT_POINTER(rbd_dev->layout.pool_ns, NULL);
}
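
/*
 * Example: for a plain image (no STRIPINGV2 striping parameters, no
 * separate data pool) with order 22, this yields stripe_unit ==
 * object_size == 4 MiB and stripe_count == 1, with I/O directed at
 * the pool the image metadata lives in.
 */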

/*
 * Fill an rbd image header with information from the given format 1
 * on-disk header.
 */
static int rbd_header_from_disk(struct rbd_device *rbd_dev,
				 struct rbd_image_header_ondisk *ondisk)
{
	struct rbd_image_header *header = &rbd_dev->header;
	bool first_time = header->object_prefix == NULL;
	struct ceph_snap_context *snapc;
	char *object_prefix = NULL;
	char *snap_names = NULL;
	u64 *snap_sizes = NULL;
	u32 snap_count;
	int ret = -ENOMEM;
	u32 i;

	/* Allocate this now to avoid having to handle failure below */

	if (first_time) {
		object_prefix = kstrndup(ondisk->object_prefix,
					 sizeof(ondisk->object_prefix),
					 GFP_KERNEL);
		if (!object_prefix)
			return -ENOMEM;
	}

	/* Allocate the snapshot context and fill it in */

	snap_count = le32_to_cpu(ondisk->snap_count);
	snapc = ceph_create_snap_context(snap_count, GFP_KERNEL);
	if (!snapc)
		goto out_err;
	snapc->seq = le64_to_cpu(ondisk->snap_seq);
	if (snap_count) {
		struct rbd_image_snap_ondisk *snaps;
		u64 snap_names_len = le64_to_cpu(ondisk->snap_names_len);

		/* We'll keep a copy of the snapshot names... */

		if (snap_names_len > (u64)SIZE_MAX)
			goto out_2big;
		snap_names = kmalloc(snap_names_len, GFP_KERNEL);
		if (!snap_names)
			goto out_err;

		/* ...as well as the array of their sizes. */
		snap_sizes = kmalloc_array(snap_count,
					   sizeof(*header->snap_sizes),
					   GFP_KERNEL);
		if (!snap_sizes)
			goto out_err;

		/*
		 * Copy the names, and fill in each snapshot's id
		 * and size.
		 *
		 * Note that rbd_dev_v1_header_info() guarantees the
		 * ondisk buffer we're working with has
		 * snap_names_len bytes beyond the end of the
		 * snapshot id array, so this memcpy() is safe.
		 */
		memcpy(snap_names, &ondisk->snaps[snap_count], snap_names_len);
		snaps = ondisk->snaps;
		for (i = 0; i < snap_count; i++) {
			snapc->snaps[i] = le64_to_cpu(snaps[i].id);
			snap_sizes[i] = le64_to_cpu(snaps[i].image_size);
		}
	}

	/* We won't fail any more, fill in the header */

	if (first_time) {
		header->object_prefix = object_prefix;
		header->obj_order = ondisk->options.order;
		rbd_init_layout(rbd_dev);
	} else {
		ceph_put_snap_context(header->snapc);
		kfree(header->snap_names);
		kfree(header->snap_sizes);
	}

	/* The remaining fields always get updated (when we refresh) */

	header->image_size = le64_to_cpu(ondisk->image_size);
	header->snapc = snapc;
	header->snap_names = snap_names;
	header->snap_sizes = snap_sizes;

	return 0;
out_2big:
	ret = -EIO;
out_err:
	kfree(snap_sizes);
	kfree(snap_names);
	ceph_put_snap_context(snapc);
	kfree(object_prefix);

	return ret;
}

static const char *_rbd_dev_v1_snap_name(struct rbd_device *rbd_dev, u32 which)
{
	const char *snap_name;

	rbd_assert(which < rbd_dev->header.snapc->num_snaps);

	/* Skip over names until we find the one we are looking for */

	snap_name = rbd_dev->header.snap_names;
	while (which--)
		snap_name += strlen(snap_name) + 1;

	return kstrdup(snap_name, GFP_KERNEL);
}
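
/*
 * snap_names is a buffer of NUL-terminated names laid out end to end
 * in the same order as the snapshot id array, so e.g. given
 * "one\0two\0three\0", which == 2 skips two names and returns a copy
 * of "three".
 */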

/*
 * Snapshot id comparison function for use with qsort()/bsearch().
 * Note that result is for snapshots in *descending* order.
 */
static int snapid_compare_reverse(const void *s1, const void *s2)
{
	u64 snap_id1 = *(u64 *)s1;
	u64 snap_id2 = *(u64 *)s2;

	if (snap_id1 < snap_id2)
		return 1;
	return snap_id1 == snap_id2 ? 0 : -1;
}
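
/*
 * Worked example: with snapc->snaps holding { 12, 10, 7 } (descending,
 * as the osd keeps them), a bsearch() for snap_id 10 using this
 * comparator lands on index 1, while a search for 11 finds nothing.
 */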

/*
 * Search a snapshot context to see if the given snapshot id is
 * present.
 *
 * Returns the position of the snapshot id in the array if it's found,
 * or BAD_SNAP_INDEX otherwise.
 *
 * Note: The snapshot array is kept sorted (by the osd) in
 * reverse order, highest snapshot id first.
 */
static u32 rbd_dev_snap_index(struct rbd_device *rbd_dev, u64 snap_id)
{
	struct ceph_snap_context *snapc = rbd_dev->header.snapc;
	u64 *found;

	found = bsearch(&snap_id, &snapc->snaps, snapc->num_snaps,
				sizeof (snap_id), snapid_compare_reverse);

	return found ? (u32)(found - &snapc->snaps[0]) : BAD_SNAP_INDEX;
}

static const char *rbd_dev_v1_snap_name(struct rbd_device *rbd_dev,
					u64 snap_id)
{
	u32 which;
	const char *snap_name;

	which = rbd_dev_snap_index(rbd_dev, snap_id);
	if (which == BAD_SNAP_INDEX)
		return ERR_PTR(-ENOENT);

	snap_name = _rbd_dev_v1_snap_name(rbd_dev, which);
	return snap_name ? snap_name : ERR_PTR(-ENOMEM);
}

static const char *rbd_snap_name(struct rbd_device *rbd_dev, u64 snap_id)
{
	if (snap_id == CEPH_NOSNAP)
		return RBD_SNAP_HEAD_NAME;

	rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
	if (rbd_dev->image_format == 1)
		return rbd_dev_v1_snap_name(rbd_dev, snap_id);

	return rbd_dev_v2_snap_name(rbd_dev, snap_id);
}

static int rbd_snap_size(struct rbd_device *rbd_dev, u64 snap_id,
				u64 *snap_size)
{
	rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
	if (snap_id == CEPH_NOSNAP) {
		*snap_size = rbd_dev->header.image_size;
	} else if (rbd_dev->image_format == 1) {
		u32 which;

		which = rbd_dev_snap_index(rbd_dev, snap_id);
		if (which == BAD_SNAP_INDEX)
			return -ENOENT;

		*snap_size = rbd_dev->header.snap_sizes[which];
	} else {
		u64 size = 0;
		int ret;

		ret = _rbd_dev_v2_snap_size(rbd_dev, snap_id, NULL, &size);
		if (ret)
			return ret;

		*snap_size = size;
	}
	return 0;
}

static int rbd_snap_features(struct rbd_device *rbd_dev, u64 snap_id,
			u64 *snap_features)
{
	rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
	if (snap_id == CEPH_NOSNAP) {
		*snap_features = rbd_dev->header.features;
	} else if (rbd_dev->image_format == 1) {
		*snap_features = 0;	/* No features for format 1 */
	} else {
		u64 features = 0;
		int ret;

		ret = _rbd_dev_v2_snap_features(rbd_dev, snap_id, &features);
		if (ret)
			return ret;

		*snap_features = features;
	}
	return 0;
}

static int rbd_dev_mapping_set(struct rbd_device *rbd_dev)
{
	u64 snap_id = rbd_dev->spec->snap_id;
	u64 size = 0;
	u64 features = 0;
	int ret;

	ret = rbd_snap_size(rbd_dev, snap_id, &size);
	if (ret)
		return ret;
	ret = rbd_snap_features(rbd_dev, snap_id, &features);
	if (ret)
		return ret;

	rbd_dev->mapping.size = size;
	rbd_dev->mapping.features = features;

	return 0;
}

static void rbd_dev_mapping_clear(struct rbd_device *rbd_dev)
{
	rbd_dev->mapping.size = 0;
	rbd_dev->mapping.features = 0;
}
1262
Ilya Dryomov5359a172018-01-20 10:30:10 +01001263static void zero_bvec(struct bio_vec *bv)
Alex Elder65ccfe22012-08-09 10:33:26 -07001264{
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001265 void *buf;
Ilya Dryomov5359a172018-01-20 10:30:10 +01001266 unsigned long flags;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001267
Ilya Dryomov5359a172018-01-20 10:30:10 +01001268 buf = bvec_kmap_irq(bv, &flags);
1269 memset(buf, 0, bv->bv_len);
1270 flush_dcache_page(bv->bv_page);
1271 bvec_kunmap_irq(buf, &flags);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001272}
1273
Ilya Dryomov5359a172018-01-20 10:30:10 +01001274static void zero_bios(struct ceph_bio_iter *bio_pos, u32 off, u32 bytes)
Alex Elderb9434c52013-04-19 15:34:50 -05001275{
Ilya Dryomov5359a172018-01-20 10:30:10 +01001276 struct ceph_bio_iter it = *bio_pos;
Alex Elderb9434c52013-04-19 15:34:50 -05001277
Ilya Dryomov5359a172018-01-20 10:30:10 +01001278 ceph_bio_iter_advance(&it, off);
1279 ceph_bio_iter_advance_step(&it, bytes, ({
1280 zero_bvec(&bv);
1281 }));
Alex Elderb9434c52013-04-19 15:34:50 -05001282}
1283
Ilya Dryomov7e07efb2018-01-20 10:30:11 +01001284static void zero_bvecs(struct ceph_bvec_iter *bvec_pos, u32 off, u32 bytes)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001285{
Ilya Dryomov7e07efb2018-01-20 10:30:11 +01001286 struct ceph_bvec_iter it = *bvec_pos;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001287
Ilya Dryomov7e07efb2018-01-20 10:30:11 +01001288 ceph_bvec_iter_advance(&it, off);
1289 ceph_bvec_iter_advance_step(&it, bytes, ({
1290 zero_bvec(&bv);
1291 }));
Alex Elderf7760da2012-10-20 22:17:27 -05001292}
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001293
Alex Elderf7760da2012-10-20 22:17:27 -05001294/*
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001295 * Zero a range in @obj_req's data buffer defined by a bio (list) or
Ilya Dryomovafb97882018-02-06 19:26:35 +01001296 * (private) bio_vec array.
Alex Elderf7760da2012-10-20 22:17:27 -05001297 *
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001298 * @off is relative to the start of the data buffer.
Alex Elderf7760da2012-10-20 22:17:27 -05001299 */
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001300static void rbd_obj_zero_range(struct rbd_obj_request *obj_req, u32 off,
1301 u32 bytes)
Alex Elderf7760da2012-10-20 22:17:27 -05001302{
Ilya Dryomovecc633c2018-02-01 11:50:47 +01001303 switch (obj_req->img_request->data_type) {
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001304 case OBJ_REQUEST_BIO:
1305 zero_bios(&obj_req->bio_pos, off, bytes);
1306 break;
1307 case OBJ_REQUEST_BVECS:
Ilya Dryomovafb97882018-02-06 19:26:35 +01001308 case OBJ_REQUEST_OWN_BVECS:
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001309 zero_bvecs(&obj_req->bvec_pos, off, bytes);
1310 break;
1311 default:
1312 rbd_assert(0);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001313 }
Alex Elderbf0d5f502012-11-22 00:00:08 -06001314}
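
/*
 * Usage sketch (hypothetical values): zeroing bytes [512, 1536) of a
 * bio-backed object request is
 *
 *	rbd_obj_zero_range(obj_req, 512, 1024);
 *
 * which advances a private copy of obj_req->bio_pos by 512 bytes and
 * then memsets every bio_vec it steps across via zero_bvec().
 */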
1315
1316static void rbd_obj_request_destroy(struct kref *kref);
1317static void rbd_obj_request_put(struct rbd_obj_request *obj_request)
1318{
1319 rbd_assert(obj_request != NULL);
Alex Elder37206ee2013-02-20 17:32:08 -06001320 dout("%s: obj %p (was %d)\n", __func__, obj_request,
Peter Zijlstra2c935bc2016-11-14 17:29:48 +01001321 kref_read(&obj_request->kref));
Alex Elderbf0d5f502012-11-22 00:00:08 -06001322 kref_put(&obj_request->kref, rbd_obj_request_destroy);
1323}
1324
Alex Elder0f2d5be2014-04-26 14:21:44 +04001325static void rbd_img_request_get(struct rbd_img_request *img_request)
1326{
1327 dout("%s: img %p (was %d)\n", __func__, img_request,
Peter Zijlstra2c935bc2016-11-14 17:29:48 +01001328 kref_read(&img_request->kref));
Alex Elder0f2d5be2014-04-26 14:21:44 +04001329 kref_get(&img_request->kref);
1330}
1331
Alex Elderbf0d5f502012-11-22 00:00:08 -06001332static void rbd_img_request_destroy(struct kref *kref);
1333static void rbd_img_request_put(struct rbd_img_request *img_request)
1334{
1335 rbd_assert(img_request != NULL);
Alex Elder37206ee2013-02-20 17:32:08 -06001336 dout("%s: img %p (was %d)\n", __func__, img_request,
Peter Zijlstra2c935bc2016-11-14 17:29:48 +01001337 kref_read(&img_request->kref));
Ilya Dryomove93aca02018-02-06 19:26:35 +01001338 kref_put(&img_request->kref, rbd_img_request_destroy);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001339}
1340
1341static inline void rbd_img_obj_request_add(struct rbd_img_request *img_request,
1342 struct rbd_obj_request *obj_request)
1343{
Alex Elder25dcf952013-01-25 17:08:55 -06001344 rbd_assert(obj_request->img_request == NULL);
1345
Alex Elderb155e862013-04-15 14:50:37 -05001346 /* Image request now owns object's original reference */
Alex Elderbf0d5f502012-11-22 00:00:08 -06001347 obj_request->img_request = img_request;
Alex Elder25dcf952013-01-25 17:08:55 -06001348 img_request->obj_request_count++;
Ilya Dryomov7114eda2018-02-01 11:50:47 +01001349 img_request->pending_count++;
Ilya Dryomov15961b42018-02-01 11:50:47 +01001350 dout("%s: img %p obj %p\n", __func__, img_request, obj_request);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001351}
1352
1353static inline void rbd_img_obj_request_del(struct rbd_img_request *img_request,
1354 struct rbd_obj_request *obj_request)
1355{
Ilya Dryomov15961b42018-02-01 11:50:47 +01001356 dout("%s: img %p obj %p\n", __func__, img_request, obj_request);
Ilya Dryomov43df3d32018-02-02 15:23:22 +01001357 list_del(&obj_request->ex.oe_item);
Alex Elder25dcf952013-01-25 17:08:55 -06001358 rbd_assert(img_request->obj_request_count > 0);
1359 img_request->obj_request_count--;
Alex Elderbf0d5f502012-11-22 00:00:08 -06001360 rbd_assert(obj_request->img_request == img_request);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001361 rbd_obj_request_put(obj_request);
1362}
1363
Ilya Dryomov980917f2016-09-12 18:59:42 +02001364static void rbd_obj_request_submit(struct rbd_obj_request *obj_request)
Alex Elderbf0d5f502012-11-22 00:00:08 -06001365{
Ilya Dryomov980917f2016-09-12 18:59:42 +02001366 struct ceph_osd_request *osd_req = obj_request->osd_req;
1367
Ilya Dryomova90bb0c2017-01-25 18:16:23 +01001368 dout("%s %p object_no %016llx %llu~%llu osd_req %p\n", __func__,
Ilya Dryomov43df3d32018-02-02 15:23:22 +01001369 obj_request, obj_request->ex.oe_objno, obj_request->ex.oe_off,
1370 obj_request->ex.oe_len, osd_req);
Ilya Dryomov980917f2016-09-12 18:59:42 +02001371 ceph_osdc_start_request(osd_req->r_osdc, osd_req, false);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001372}
1373
Alex Elder0c425242013-02-08 09:55:49 -06001374/*
1375 * The default/initial value for all image request flags is 0. Each
1376 * is conditionally set to 1 at image request initialization time
 1377 * and currently never changes thereafter.
1378 */
Alex Elderd0b2e942013-01-24 16:13:36 -06001379static void img_request_layered_set(struct rbd_img_request *img_request)
1380{
1381 set_bit(IMG_REQ_LAYERED, &img_request->flags);
1382 smp_mb();
1383}
1384
Alex Eldera2acd002013-05-08 22:50:04 -05001385static void img_request_layered_clear(struct rbd_img_request *img_request)
1386{
1387 clear_bit(IMG_REQ_LAYERED, &img_request->flags);
1388 smp_mb();
1389}
1390
Alex Elderd0b2e942013-01-24 16:13:36 -06001391static bool img_request_layered_test(struct rbd_img_request *img_request)
1392{
1393 smp_mb();
1394 return test_bit(IMG_REQ_LAYERED, &img_request->flags) != 0;
1395}
1396
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001397static bool rbd_obj_is_entire(struct rbd_obj_request *obj_req)
Josh Durgin3b434a2a2014-04-04 17:32:15 -07001398{
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001399 struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
1400
Ilya Dryomov43df3d32018-02-02 15:23:22 +01001401 return !obj_req->ex.oe_off &&
1402 obj_req->ex.oe_len == rbd_dev->layout.object_size;
Josh Durgin3b434a2a2014-04-04 17:32:15 -07001403}
1404
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001405static bool rbd_obj_is_tail(struct rbd_obj_request *obj_req)
Alex Elder6e2a4502013-03-27 09:16:30 -05001406{
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001407 struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
Alex Elderb9434c52013-04-19 15:34:50 -05001408
Ilya Dryomov43df3d32018-02-02 15:23:22 +01001409 return obj_req->ex.oe_off + obj_req->ex.oe_len ==
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001410 rbd_dev->layout.object_size;
1411}
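
/*
 * Example (assuming the default 4M object size): an object extent
 * 0~4M is "entire", 3M~1M is a tail, and 0~1M is neither.
 */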
1412
Ilya Dryomov86bd7992018-02-06 19:26:33 +01001413static u64 rbd_obj_img_extents_bytes(struct rbd_obj_request *obj_req)
1414{
1415 return ceph_file_extents_bytes(obj_req->img_extents,
1416 obj_req->num_img_extents);
1417}
1418
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001419static bool rbd_img_is_write(struct rbd_img_request *img_req)
1420{
Ilya Dryomov9bb02482018-01-30 17:52:10 +01001421 switch (img_req->op_type) {
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001422 case OBJ_OP_READ:
1423 return false;
1424 case OBJ_OP_WRITE:
1425 case OBJ_OP_DISCARD:
1426 return true;
1427 default:
Arnd Bergmannc6244b32018-04-04 14:53:39 +02001428 BUG();
Alex Elder6e2a4502013-03-27 09:16:30 -05001429 }
Alex Elder6e2a4502013-03-27 09:16:30 -05001430}
1431
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001432static void rbd_obj_handle_request(struct rbd_obj_request *obj_req);
Ilya Dryomov27617132015-07-16 17:36:11 +03001433
Ilya Dryomov85e084f2016-04-28 16:07:24 +02001434static void rbd_osd_req_callback(struct ceph_osd_request *osd_req)
Alex Elderbf0d5f502012-11-22 00:00:08 -06001435{
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001436 struct rbd_obj_request *obj_req = osd_req->r_priv;
Alex Elderbf0d5f502012-11-22 00:00:08 -06001437
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001438 dout("%s osd_req %p result %d for obj_req %p\n", __func__, osd_req,
1439 osd_req->r_result, obj_req);
1440 rbd_assert(osd_req == obj_req->osd_req);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001441
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001442 obj_req->result = osd_req->r_result < 0 ? osd_req->r_result : 0;
1443 if (!obj_req->result && !rbd_img_is_write(obj_req->img_request))
1444 obj_req->xferred = osd_req->r_result;
1445 else
1446 /*
1447 * Writes aren't allowed to return a data payload. In some
1448 * guarded write cases (e.g. stat + zero on an empty object)
1449 * a stat response makes it through, but we don't care.
1450 */
1451 obj_req->xferred = 0;
Alex Elderbf0d5f502012-11-22 00:00:08 -06001452
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001453 rbd_obj_handle_request(obj_req);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001454}
1455
Alex Elder9d4df012013-04-19 15:34:50 -05001456static void rbd_osd_req_format_read(struct rbd_obj_request *obj_request)
Alex Elder430c28c2013-04-03 21:32:51 -05001457{
Alex Elder8c042b02013-04-03 01:28:58 -05001458 struct ceph_osd_request *osd_req = obj_request->osd_req;
Alex Elder430c28c2013-04-03 21:32:51 -05001459
Ilya Dryomova162b302018-01-30 17:52:10 +01001460 osd_req->r_flags = CEPH_OSD_FLAG_READ;
Ilya Dryomov7c848832016-09-15 17:56:39 +02001461 osd_req->r_snapid = obj_request->img_request->snap_id;
Alex Elder9d4df012013-04-19 15:34:50 -05001462}
1463
1464static void rbd_osd_req_format_write(struct rbd_obj_request *obj_request)
1465{
Alex Elder9d4df012013-04-19 15:34:50 -05001466 struct ceph_osd_request *osd_req = obj_request->osd_req;
Alex Elder9d4df012013-04-19 15:34:50 -05001467
Ilya Dryomova162b302018-01-30 17:52:10 +01001468 osd_req->r_flags = CEPH_OSD_FLAG_WRITE;
Arnd Bergmannfac02dd2018-07-13 22:18:37 +02001469 ktime_get_real_ts64(&osd_req->r_mtime);
Ilya Dryomov43df3d32018-02-02 15:23:22 +01001470 osd_req->r_data_offset = obj_request->ex.oe_off;
Alex Elder430c28c2013-04-03 21:32:51 -05001471}
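
/*
 * Note the asymmetry above: reads carry the mapped snapshot id
 * (r_snapid), while writes carry an mtime and the data offset; the
 * write snap context is attached earlier, at allocation time, by
 * rbd_osd_req_create() below.
 */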
1472
Ilya Dryomovbc812072017-01-25 18:16:23 +01001473static struct ceph_osd_request *
Ilya Dryomova162b302018-01-30 17:52:10 +01001474rbd_osd_req_create(struct rbd_obj_request *obj_req, unsigned int num_ops)
Ilya Dryomovbc812072017-01-25 18:16:23 +01001475{
Ilya Dryomova162b302018-01-30 17:52:10 +01001476 struct rbd_img_request *img_req = obj_req->img_request;
1477 struct rbd_device *rbd_dev = img_req->rbd_dev;
Ilya Dryomovbc812072017-01-25 18:16:23 +01001478 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
1479 struct ceph_osd_request *req;
Ilya Dryomova90bb0c2017-01-25 18:16:23 +01001480 const char *name_format = rbd_dev->image_format == 1 ?
1481 RBD_V1_DATA_FORMAT : RBD_V2_DATA_FORMAT;
Ilya Dryomovbc812072017-01-25 18:16:23 +01001482
Ilya Dryomova162b302018-01-30 17:52:10 +01001483 req = ceph_osdc_alloc_request(osdc,
1484 (rbd_img_is_write(img_req) ? img_req->snapc : NULL),
1485 num_ops, false, GFP_NOIO);
Ilya Dryomovbc812072017-01-25 18:16:23 +01001486 if (!req)
1487 return NULL;
1488
Ilya Dryomovbc812072017-01-25 18:16:23 +01001489 req->r_callback = rbd_osd_req_callback;
Ilya Dryomova162b302018-01-30 17:52:10 +01001490 req->r_priv = obj_req;
Ilya Dryomovbc812072017-01-25 18:16:23 +01001491
Ilya Dryomovb26c0472018-07-03 15:28:43 +02001492 /*
 1493	 * Data objects may be stored in a separate pool, but they always
 1494	 * live in the same namespace as the header object does in its pool.
1495 */
1496 ceph_oloc_copy(&req->r_base_oloc, &rbd_dev->header_oloc);
Ilya Dryomovbc812072017-01-25 18:16:23 +01001497 req->r_base_oloc.pool = rbd_dev->layout.pool_id;
Ilya Dryomovb26c0472018-07-03 15:28:43 +02001498
Ilya Dryomova90bb0c2017-01-25 18:16:23 +01001499 if (ceph_oid_aprintf(&req->r_base_oid, GFP_NOIO, name_format,
Ilya Dryomov43df3d32018-02-02 15:23:22 +01001500 rbd_dev->header.object_prefix, obj_req->ex.oe_objno))
Ilya Dryomovbc812072017-01-25 18:16:23 +01001501 goto err_req;
1502
1503 if (ceph_osdc_alloc_messages(req, GFP_NOIO))
1504 goto err_req;
1505
1506 return req;
1507
1508err_req:
1509 ceph_osdc_put_request(req);
1510 return NULL;
1511}
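
/*
 * For reference (names from rbd_types.h, sketch only): the data
 * object name produced above is "<object_prefix>.%012llx" for format
 * 1 images and "<object_prefix>.%016llx" for format 2, so object
 * number 5 of a format 2 image comes out like
 * "rbd_data.<id>.0000000000000005".
 */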
1512
Alex Elderbf0d5f502012-11-22 00:00:08 -06001513static void rbd_osd_req_destroy(struct ceph_osd_request *osd_req)
1514{
1515 ceph_osdc_put_request(osd_req);
1516}
1517
Ilya Dryomovecc633c2018-02-01 11:50:47 +01001518static struct rbd_obj_request *rbd_obj_request_create(void)
Alex Elderbf0d5f502012-11-22 00:00:08 -06001519{
1520 struct rbd_obj_request *obj_request;
Alex Elderbf0d5f502012-11-22 00:00:08 -06001521
Ilya Dryomov5a60e872015-06-24 17:24:33 +03001522 obj_request = kmem_cache_zalloc(rbd_obj_request_cache, GFP_NOIO);
Ilya Dryomov6c696d82017-01-25 18:16:23 +01001523 if (!obj_request)
Alex Elderf907ad52013-05-01 12:43:03 -05001524 return NULL;
Alex Elderf907ad52013-05-01 12:43:03 -05001525
Ilya Dryomov43df3d32018-02-02 15:23:22 +01001526 ceph_object_extent_init(&obj_request->ex);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001527 kref_init(&obj_request->kref);
1528
Ilya Dryomov67e2b652017-01-25 18:16:22 +01001529 dout("%s %p\n", __func__, obj_request);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001530 return obj_request;
1531}
1532
1533static void rbd_obj_request_destroy(struct kref *kref)
1534{
1535 struct rbd_obj_request *obj_request;
Ilya Dryomov7e07efb2018-01-20 10:30:11 +01001536 u32 i;
Alex Elderbf0d5f502012-11-22 00:00:08 -06001537
1538 obj_request = container_of(kref, struct rbd_obj_request, kref);
1539
Alex Elder37206ee2013-02-20 17:32:08 -06001540 dout("%s: obj %p\n", __func__, obj_request);
1541
Alex Elderbf0d5f502012-11-22 00:00:08 -06001542 if (obj_request->osd_req)
1543 rbd_osd_req_destroy(obj_request->osd_req);
1544
Ilya Dryomovecc633c2018-02-01 11:50:47 +01001545 switch (obj_request->img_request->data_type) {
Alex Elder9969ebc2013-01-18 12:31:10 -06001546 case OBJ_REQUEST_NODATA:
Alex Elderbf0d5f502012-11-22 00:00:08 -06001547 case OBJ_REQUEST_BIO:
Ilya Dryomov7e07efb2018-01-20 10:30:11 +01001548 case OBJ_REQUEST_BVECS:
Ilya Dryomov5359a172018-01-20 10:30:10 +01001549 break; /* Nothing to do */
Ilya Dryomovafb97882018-02-06 19:26:35 +01001550 case OBJ_REQUEST_OWN_BVECS:
1551 kfree(obj_request->bvec_pos.bvecs);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001552 break;
Ilya Dryomov7e07efb2018-01-20 10:30:11 +01001553 default:
1554 rbd_assert(0);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001555 }
1556
Ilya Dryomov86bd7992018-02-06 19:26:33 +01001557 kfree(obj_request->img_extents);
Ilya Dryomov7e07efb2018-01-20 10:30:11 +01001558 if (obj_request->copyup_bvecs) {
1559 for (i = 0; i < obj_request->copyup_bvec_count; i++) {
1560 if (obj_request->copyup_bvecs[i].bv_page)
1561 __free_page(obj_request->copyup_bvecs[i].bv_page);
1562 }
1563 kfree(obj_request->copyup_bvecs);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001564 }
1565
Alex Elder868311b2013-05-01 12:43:03 -05001566 kmem_cache_free(rbd_obj_request_cache, obj_request);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001567}
1568
Alex Elderfb65d2282013-05-08 22:50:04 -05001569/* It's OK to call this for a device with no parent */
1570
1571static void rbd_spec_put(struct rbd_spec *spec);
1572static void rbd_dev_unparent(struct rbd_device *rbd_dev)
1573{
1574 rbd_dev_remove_parent(rbd_dev);
1575 rbd_spec_put(rbd_dev->parent_spec);
1576 rbd_dev->parent_spec = NULL;
1577 rbd_dev->parent_overlap = 0;
1578}
1579
Alex Elderbf0d5f502012-11-22 00:00:08 -06001580/*
Alex Eldera2acd002013-05-08 22:50:04 -05001581 * Parent image reference counting is used to determine when an
1582 * image's parent fields can be safely torn down--after there are no
1583 * more in-flight requests to the parent image. When the last
1584 * reference is dropped, cleaning them up is safe.
1585 */
1586static void rbd_dev_parent_put(struct rbd_device *rbd_dev)
1587{
1588 int counter;
1589
1590 if (!rbd_dev->parent_spec)
1591 return;
1592
1593 counter = atomic_dec_return_safe(&rbd_dev->parent_ref);
1594 if (counter > 0)
1595 return;
1596
1597 /* Last reference; clean up parent data structures */
1598
1599 if (!counter)
1600 rbd_dev_unparent(rbd_dev);
1601 else
Ilya Dryomov9584d502014-07-11 12:11:20 +04001602 rbd_warn(rbd_dev, "parent reference underflow");
Alex Eldera2acd002013-05-08 22:50:04 -05001603}
1604
1605/*
1606 * If an image has a non-zero parent overlap, get a reference to its
1607 * parent.
1608 *
1609 * Returns true if the rbd device has a parent with a non-zero
1610 * overlap and a reference for it was successfully taken, or
1611 * false otherwise.
1612 */
1613static bool rbd_dev_parent_get(struct rbd_device *rbd_dev)
1614{
Ilya Dryomovae43e9d2015-01-19 18:13:43 +03001615 int counter = 0;
Alex Eldera2acd002013-05-08 22:50:04 -05001616
1617 if (!rbd_dev->parent_spec)
1618 return false;
1619
Ilya Dryomovae43e9d2015-01-19 18:13:43 +03001620 down_read(&rbd_dev->header_rwsem);
1621 if (rbd_dev->parent_overlap)
1622 counter = atomic_inc_return_safe(&rbd_dev->parent_ref);
1623 up_read(&rbd_dev->header_rwsem);
Alex Eldera2acd002013-05-08 22:50:04 -05001624
1625 if (counter < 0)
Ilya Dryomov9584d502014-07-11 12:11:20 +04001626 rbd_warn(rbd_dev, "parent reference overflow");
Alex Eldera2acd002013-05-08 22:50:04 -05001627
Ilya Dryomovae43e9d2015-01-19 18:13:43 +03001628 return counter > 0;
Alex Eldera2acd002013-05-08 22:50:04 -05001629}
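
/*
 * Typical pairing, as done by the image request code below (sketch):
 *
 *	if (rbd_dev_parent_get(rbd_dev))
 *		img_request_layered_set(img_request);
 *	...
 *	if (img_request_layered_test(img_request)) {
 *		img_request_layered_clear(img_request);
 *		rbd_dev_parent_put(img_request->rbd_dev);
 *	}
 */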
1630
Alex Elderbf0d5f502012-11-22 00:00:08 -06001631/*
1632 * Caller is responsible for filling in the list of object requests
1633 * that comprises the image request, and the Linux request pointer
1634 * (if there is one).
1635 */
Alex Eldercc344fa2013-02-19 12:25:56 -06001636static struct rbd_img_request *rbd_img_request_create(
1637 struct rbd_device *rbd_dev,
Guangliang Zhao6d2940c2014-03-13 11:21:35 +08001638 enum obj_operation_type op_type,
Josh Durgin4e752f02014-04-08 11:12:11 -07001639 struct ceph_snap_context *snapc)
Alex Elderbf0d5f502012-11-22 00:00:08 -06001640{
1641 struct rbd_img_request *img_request;
Alex Elderbf0d5f502012-11-22 00:00:08 -06001642
Ilya Dryomova0c58952018-01-22 16:03:06 +01001643 img_request = kmem_cache_zalloc(rbd_img_request_cache, GFP_NOIO);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001644 if (!img_request)
1645 return NULL;
1646
Alex Elderbf0d5f502012-11-22 00:00:08 -06001647 img_request->rbd_dev = rbd_dev;
Ilya Dryomov9bb02482018-01-30 17:52:10 +01001648 img_request->op_type = op_type;
Ilya Dryomov9bb02482018-01-30 17:52:10 +01001649 if (!rbd_img_is_write(img_request))
Alex Elderbf0d5f502012-11-22 00:00:08 -06001650 img_request->snap_id = rbd_dev->spec->snap_id;
Ilya Dryomov9bb02482018-01-30 17:52:10 +01001651 else
1652 img_request->snapc = snapc;
1653
Alex Eldera2acd002013-05-08 22:50:04 -05001654 if (rbd_dev_parent_get(rbd_dev))
Alex Elderd0b2e942013-01-24 16:13:36 -06001655 img_request_layered_set(img_request);
Ilya Dryomova0c58952018-01-22 16:03:06 +01001656
Alex Elderbf0d5f502012-11-22 00:00:08 -06001657 spin_lock_init(&img_request->completion_lock);
Ilya Dryomov43df3d32018-02-02 15:23:22 +01001658 INIT_LIST_HEAD(&img_request->object_extents);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001659 kref_init(&img_request->kref);
1660
Ilya Dryomovdfd98752018-02-06 19:26:35 +01001661 dout("%s: rbd_dev %p %s -> img %p\n", __func__, rbd_dev,
1662 obj_op_name(op_type), img_request);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001663 return img_request;
1664}
1665
1666static void rbd_img_request_destroy(struct kref *kref)
1667{
1668 struct rbd_img_request *img_request;
1669 struct rbd_obj_request *obj_request;
1670 struct rbd_obj_request *next_obj_request;
1671
1672 img_request = container_of(kref, struct rbd_img_request, kref);
1673
Alex Elder37206ee2013-02-20 17:32:08 -06001674 dout("%s: img %p\n", __func__, img_request);
1675
Alex Elderbf0d5f502012-11-22 00:00:08 -06001676 for_each_obj_request_safe(img_request, obj_request, next_obj_request)
1677 rbd_img_obj_request_del(img_request, obj_request);
Alex Elder25dcf952013-01-25 17:08:55 -06001678 rbd_assert(img_request->obj_request_count == 0);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001679
Alex Eldera2acd002013-05-08 22:50:04 -05001680 if (img_request_layered_test(img_request)) {
1681 img_request_layered_clear(img_request);
1682 rbd_dev_parent_put(img_request->rbd_dev);
1683 }
1684
Ilya Dryomov9bb02482018-01-30 17:52:10 +01001685 if (rbd_img_is_write(img_request))
Alex Elder812164f82013-04-30 00:44:32 -05001686 ceph_put_snap_context(img_request->snapc);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001687
Alex Elder1c2a9df2013-05-01 12:43:03 -05001688 kmem_cache_free(rbd_img_request_cache, img_request);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001689}
1690
Ilya Dryomov86bd7992018-02-06 19:26:33 +01001691static void prune_extents(struct ceph_file_extent *img_extents,
1692 u32 *num_img_extents, u64 overlap)
Alex Eldere93f3152013-05-08 22:50:04 -05001693{
Ilya Dryomov86bd7992018-02-06 19:26:33 +01001694 u32 cnt = *num_img_extents;
Alex Eldere93f3152013-05-08 22:50:04 -05001695
Ilya Dryomov86bd7992018-02-06 19:26:33 +01001696 /* drop extents completely beyond the overlap */
1697 while (cnt && img_extents[cnt - 1].fe_off >= overlap)
1698 cnt--;
Alex Eldere93f3152013-05-08 22:50:04 -05001699
Ilya Dryomov86bd7992018-02-06 19:26:33 +01001700 if (cnt) {
1701 struct ceph_file_extent *ex = &img_extents[cnt - 1];
Alex Eldere93f3152013-05-08 22:50:04 -05001702
Ilya Dryomov86bd7992018-02-06 19:26:33 +01001703 /* trim final overlapping extent */
1704 if (ex->fe_off + ex->fe_len > overlap)
1705 ex->fe_len = overlap - ex->fe_off;
Alex Elder12178572013-02-08 09:55:49 -06001706 }
1707
Ilya Dryomov86bd7992018-02-06 19:26:33 +01001708 *num_img_extents = cnt;
Alex Elder21692382013-04-05 01:27:12 -05001709}
1710
Alex Elderf1a47392013-04-19 15:34:50 -05001711/*
Ilya Dryomov86bd7992018-02-06 19:26:33 +01001712 * Determine the byte range(s) covered by either just the object extent
1713 * or the entire object in the parent image.
Josh Durgin3b434a2a2014-04-04 17:32:15 -07001714 */
Ilya Dryomov86bd7992018-02-06 19:26:33 +01001715static int rbd_obj_calc_img_extents(struct rbd_obj_request *obj_req,
1716 bool entire)
Josh Durgin3b434a2a2014-04-04 17:32:15 -07001717{
Ilya Dryomov86bd7992018-02-06 19:26:33 +01001718 struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
Alex Elderc5b5ef62013-02-11 12:33:24 -06001719 int ret;
1720
Ilya Dryomov86bd7992018-02-06 19:26:33 +01001721 if (!rbd_dev->parent_overlap)
1722 return 0;
1723
1724 ret = ceph_extent_to_file(&rbd_dev->layout, obj_req->ex.oe_objno,
1725 entire ? 0 : obj_req->ex.oe_off,
1726 entire ? rbd_dev->layout.object_size :
1727 obj_req->ex.oe_len,
1728 &obj_req->img_extents,
1729 &obj_req->num_img_extents);
1730 if (ret)
1731 return ret;
1732
1733 prune_extents(obj_req->img_extents, &obj_req->num_img_extents,
1734 rbd_dev->parent_overlap);
1735 return 0;
1736}
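
/*
 * Worked example (default striping, 4M objects): for objno 3,
 * entire=true reverse maps to image extent 12M~4M; pruning against
 * parent_overlap then trims it to 12M~1M for a 13M overlap and drops
 * it entirely for a 12M overlap.
 */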
1737
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001738static void rbd_osd_req_setup_data(struct rbd_obj_request *obj_req, u32 which)
1739{
Ilya Dryomovecc633c2018-02-01 11:50:47 +01001740 switch (obj_req->img_request->data_type) {
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001741 case OBJ_REQUEST_BIO:
1742 osd_req_op_extent_osd_data_bio(obj_req->osd_req, which,
1743 &obj_req->bio_pos,
Ilya Dryomov43df3d32018-02-02 15:23:22 +01001744 obj_req->ex.oe_len);
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001745 break;
1746 case OBJ_REQUEST_BVECS:
Ilya Dryomovafb97882018-02-06 19:26:35 +01001747 case OBJ_REQUEST_OWN_BVECS:
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001748 rbd_assert(obj_req->bvec_pos.iter.bi_size ==
Ilya Dryomov43df3d32018-02-02 15:23:22 +01001749 obj_req->ex.oe_len);
Ilya Dryomovafb97882018-02-06 19:26:35 +01001750 rbd_assert(obj_req->bvec_idx == obj_req->bvec_count);
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001751 osd_req_op_extent_osd_data_bvec_pos(obj_req->osd_req, which,
1752 &obj_req->bvec_pos);
1753 break;
1754 default:
1755 rbd_assert(0);
1756 }
1757}
1758
1759static int rbd_obj_setup_read(struct rbd_obj_request *obj_req)
1760{
Ilya Dryomova162b302018-01-30 17:52:10 +01001761 obj_req->osd_req = rbd_osd_req_create(obj_req, 1);
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001762 if (!obj_req->osd_req)
Ilya Dryomov710214e2016-09-15 17:53:32 +02001763 return -ENOMEM;
1764
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001765 osd_req_op_extent_init(obj_req->osd_req, 0, CEPH_OSD_OP_READ,
Ilya Dryomov43df3d32018-02-02 15:23:22 +01001766 obj_req->ex.oe_off, obj_req->ex.oe_len, 0, 0);
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001767 rbd_osd_req_setup_data(obj_req, 0);
Ilya Dryomova90bb0c2017-01-25 18:16:23 +01001768
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001769 rbd_osd_req_format_read(obj_req);
1770 return 0;
1771}
1772
1773static int __rbd_obj_setup_stat(struct rbd_obj_request *obj_req,
1774 unsigned int which)
1775{
1776 struct page **pages;
Ilya Dryomov710214e2016-09-15 17:53:32 +02001777
Alex Elderc5b5ef62013-02-11 12:33:24 -06001778 /*
1779 * The response data for a STAT call consists of:
1780 * le64 length;
1781 * struct {
1782 * le32 tv_sec;
1783 * le32 tv_nsec;
1784 * } mtime;
1785 */
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001786 pages = ceph_alloc_page_vector(1, GFP_NOIO);
1787 if (IS_ERR(pages))
1788 return PTR_ERR(pages);
Alex Elderc5b5ef62013-02-11 12:33:24 -06001789
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001790 osd_req_op_init(obj_req->osd_req, which, CEPH_OSD_OP_STAT, 0);
1791 osd_req_op_raw_data_in_pages(obj_req->osd_req, which, pages,
1792 8 + sizeof(struct ceph_timespec),
1793 0, false, true);
Ilya Dryomov980917f2016-09-12 18:59:42 +02001794 return 0;
Alex Elderc5b5ef62013-02-11 12:33:24 -06001795}
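
/*
 * That reply buffer is 8 + sizeof(struct ceph_timespec) == 16 bytes;
 * its contents are never parsed -- only the stat op's return code
 * (0 vs -ENOENT) matters for the guard.
 */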
1796
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001797static void __rbd_obj_setup_write(struct rbd_obj_request *obj_req,
1798 unsigned int which)
Alex Elderb454e362013-04-19 15:34:50 -05001799{
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001800 struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
1801 u16 opcode;
Alex Elderb454e362013-04-19 15:34:50 -05001802
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001803 osd_req_op_alloc_hint_init(obj_req->osd_req, which++,
1804 rbd_dev->layout.object_size,
1805 rbd_dev->layout.object_size);
Alex Elderb454e362013-04-19 15:34:50 -05001806
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001807 if (rbd_obj_is_entire(obj_req))
1808 opcode = CEPH_OSD_OP_WRITEFULL;
1809 else
1810 opcode = CEPH_OSD_OP_WRITE;
Ilya Dryomov70d045f2014-09-12 16:02:01 +04001811
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001812 osd_req_op_extent_init(obj_req->osd_req, which, opcode,
Ilya Dryomov43df3d32018-02-02 15:23:22 +01001813 obj_req->ex.oe_off, obj_req->ex.oe_len, 0, 0);
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001814 rbd_osd_req_setup_data(obj_req, which++);
Ilya Dryomov70d045f2014-09-12 16:02:01 +04001815
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001816 rbd_assert(which == obj_req->osd_req->r_num_ops);
1817 rbd_osd_req_format_write(obj_req);
Ilya Dryomov70d045f2014-09-12 16:02:01 +04001818}
1819
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001820static int rbd_obj_setup_write(struct rbd_obj_request *obj_req)
Ilya Dryomov70d045f2014-09-12 16:02:01 +04001821{
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001822 unsigned int num_osd_ops, which = 0;
1823 int ret;
Ilya Dryomov058aa992016-09-12 14:44:45 +02001824
Ilya Dryomov86bd7992018-02-06 19:26:33 +01001825 /* reverse map the entire object onto the parent */
1826 ret = rbd_obj_calc_img_extents(obj_req, true);
1827 if (ret)
1828 return ret;
1829
1830 if (obj_req->num_img_extents) {
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001831 obj_req->write_state = RBD_OBJ_WRITE_GUARD;
1832 num_osd_ops = 3; /* stat + setallochint + write/writefull */
1833 } else {
1834 obj_req->write_state = RBD_OBJ_WRITE_FLAT;
1835 num_osd_ops = 2; /* setallochint + write/writefull */
1836 }
1837
Ilya Dryomova162b302018-01-30 17:52:10 +01001838 obj_req->osd_req = rbd_osd_req_create(obj_req, num_osd_ops);
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001839 if (!obj_req->osd_req)
1840 return -ENOMEM;
1841
Ilya Dryomov86bd7992018-02-06 19:26:33 +01001842 if (obj_req->num_img_extents) {
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001843 ret = __rbd_obj_setup_stat(obj_req, which++);
1844 if (ret)
1845 return ret;
1846 }
1847
1848 __rbd_obj_setup_write(obj_req, which);
1849 return 0;
1850}
1851
1852static void __rbd_obj_setup_discard(struct rbd_obj_request *obj_req,
1853 unsigned int which)
1854{
1855 u16 opcode;
1856
1857 if (rbd_obj_is_entire(obj_req)) {
Ilya Dryomov86bd7992018-02-06 19:26:33 +01001858 if (obj_req->num_img_extents) {
Ilya Dryomov2bb1e562018-02-06 19:26:34 +01001859 osd_req_op_init(obj_req->osd_req, which++,
1860 CEPH_OSD_OP_CREATE, 0);
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001861 opcode = CEPH_OSD_OP_TRUNCATE;
1862 } else {
1863 osd_req_op_init(obj_req->osd_req, which++,
1864 CEPH_OSD_OP_DELETE, 0);
1865 opcode = 0;
1866 }
1867 } else if (rbd_obj_is_tail(obj_req)) {
1868 opcode = CEPH_OSD_OP_TRUNCATE;
1869 } else {
1870 opcode = CEPH_OSD_OP_ZERO;
1871 }
1872
1873 if (opcode)
1874 osd_req_op_extent_init(obj_req->osd_req, which++, opcode,
Ilya Dryomov43df3d32018-02-02 15:23:22 +01001875 obj_req->ex.oe_off, obj_req->ex.oe_len,
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001876 0, 0);
1877
1878 rbd_assert(which == obj_req->osd_req->r_num_ops);
1879 rbd_osd_req_format_write(obj_req);
1880}
1881
1882static int rbd_obj_setup_discard(struct rbd_obj_request *obj_req)
1883{
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001884 unsigned int num_osd_ops, which = 0;
1885 int ret;
1886
Ilya Dryomov86bd7992018-02-06 19:26:33 +01001887 /* reverse map the entire object onto the parent */
1888 ret = rbd_obj_calc_img_extents(obj_req, true);
1889 if (ret)
1890 return ret;
1891
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001892 if (rbd_obj_is_entire(obj_req)) {
1893 obj_req->write_state = RBD_OBJ_WRITE_FLAT;
Ilya Dryomov2bb1e562018-02-06 19:26:34 +01001894 if (obj_req->num_img_extents)
1895 num_osd_ops = 2; /* create + truncate */
1896 else
1897 num_osd_ops = 1; /* delete */
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001898 } else {
Ilya Dryomov86bd7992018-02-06 19:26:33 +01001899 if (obj_req->num_img_extents) {
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001900 obj_req->write_state = RBD_OBJ_WRITE_GUARD;
1901 num_osd_ops = 2; /* stat + truncate/zero */
1902 } else {
1903 obj_req->write_state = RBD_OBJ_WRITE_FLAT;
1904 num_osd_ops = 1; /* truncate/zero */
1905 }
1906 }
1907
Ilya Dryomova162b302018-01-30 17:52:10 +01001908 obj_req->osd_req = rbd_osd_req_create(obj_req, num_osd_ops);
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001909 if (!obj_req->osd_req)
1910 return -ENOMEM;
1911
Ilya Dryomov86bd7992018-02-06 19:26:33 +01001912 if (!rbd_obj_is_entire(obj_req) && obj_req->num_img_extents) {
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001913 ret = __rbd_obj_setup_stat(obj_req, which++);
1914 if (ret)
1915 return ret;
1916 }
1917
1918 __rbd_obj_setup_discard(obj_req, which);
1919 return 0;
1920}
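
/*
 * Discard cases, summarizing the two functions above:
 *
 *	entire object, has parent data:  create + truncate
 *	entire object, no parent data:   delete
 *	tail of object:                  [stat +] truncate
 *	anywhere else:                   [stat +] zero
 *
 * where the stat guard is added only when a copyup may be needed
 * (obj_req->num_img_extents != 0).
 */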
1921
1922/*
1923 * For each object request in @img_req, allocate an OSD request, add
1924 * individual OSD ops and prepare them for submission. The number of
1925 * OSD ops depends on op_type and the overlap point (if any).
1926 */
1927static int __rbd_img_fill_request(struct rbd_img_request *img_req)
1928{
1929 struct rbd_obj_request *obj_req;
1930 int ret;
1931
1932 for_each_obj_request(img_req, obj_req) {
Ilya Dryomov9bb02482018-01-30 17:52:10 +01001933 switch (img_req->op_type) {
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001934 case OBJ_OP_READ:
1935 ret = rbd_obj_setup_read(obj_req);
1936 break;
1937 case OBJ_OP_WRITE:
1938 ret = rbd_obj_setup_write(obj_req);
1939 break;
1940 case OBJ_OP_DISCARD:
1941 ret = rbd_obj_setup_discard(obj_req);
1942 break;
1943 default:
1944 rbd_assert(0);
1945 }
1946 if (ret)
1947 return ret;
1948 }
1949
1950 return 0;
1951}
1952
Ilya Dryomov5a237812018-02-06 19:26:34 +01001953union rbd_img_fill_iter {
1954 struct ceph_bio_iter bio_iter;
1955 struct ceph_bvec_iter bvec_iter;
1956};
1957
1958struct rbd_img_fill_ctx {
1959 enum obj_request_type pos_type;
1960 union rbd_img_fill_iter *pos;
1961 union rbd_img_fill_iter iter;
1962 ceph_object_extent_fn_t set_pos_fn;
Ilya Dryomovafb97882018-02-06 19:26:35 +01001963 ceph_object_extent_fn_t count_fn;
1964 ceph_object_extent_fn_t copy_fn;
Ilya Dryomov5a237812018-02-06 19:26:34 +01001965};
1966
1967static struct ceph_object_extent *alloc_object_extent(void *arg)
1968{
1969 struct rbd_img_request *img_req = arg;
1970 struct rbd_obj_request *obj_req;
1971
1972 obj_req = rbd_obj_request_create();
1973 if (!obj_req)
1974 return NULL;
1975
1976 rbd_img_obj_request_add(img_req, obj_req);
1977 return &obj_req->ex;
1978}
1979
1980/*
Ilya Dryomovafb97882018-02-06 19:26:35 +01001981 * While su != os && sc == 1 is technically not fancy (it's the same
1982 * layout as su == os && sc == 1), we can't use the nocopy path for it
1983 * because ->set_pos_fn() should be called only once per object.
1984 * ceph_file_to_extents() invokes action_fn once per stripe unit, so
1985 * treat su != os && sc == 1 as fancy.
Ilya Dryomov5a237812018-02-06 19:26:34 +01001986 */
Ilya Dryomovafb97882018-02-06 19:26:35 +01001987static bool rbd_layout_is_fancy(struct ceph_file_layout *l)
1988{
1989 return l->stripe_unit != l->object_size;
1990}
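
/*
 * Example (hypothetical layouts): su = os = 4M, sc = 1 is the default
 * non-fancy layout; su = 64K with os = 4M is fancy and forces the
 * bio_vec copying path below.
 */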
1991
1992static int rbd_img_fill_request_nocopy(struct rbd_img_request *img_req,
1993 struct ceph_file_extent *img_extents,
1994 u32 num_img_extents,
1995 struct rbd_img_fill_ctx *fctx)
Ilya Dryomov5a237812018-02-06 19:26:34 +01001996{
1997 u32 i;
1998 int ret;
1999
2000 img_req->data_type = fctx->pos_type;
2001
2002 /*
2003 * Create object requests and set each object request's starting
2004 * position in the provided bio (list) or bio_vec array.
2005 */
2006 fctx->iter = *fctx->pos;
2007 for (i = 0; i < num_img_extents; i++) {
2008 ret = ceph_file_to_extents(&img_req->rbd_dev->layout,
2009 img_extents[i].fe_off,
2010 img_extents[i].fe_len,
2011 &img_req->object_extents,
2012 alloc_object_extent, img_req,
2013 fctx->set_pos_fn, &fctx->iter);
2014 if (ret)
2015 return ret;
2016 }
2017
2018 return __rbd_img_fill_request(img_req);
2019}
2020
Ilya Dryomovafb97882018-02-06 19:26:35 +01002021/*
2022 * Map a list of image extents to a list of object extents, create the
2023 * corresponding object requests (normally each to a different object,
2024 * but not always) and add them to @img_req. For each object request,
2025 * set up its data descriptor to point to the corresponding chunk(s) of
2026 * @fctx->pos data buffer.
2027 *
2028 * Because ceph_file_to_extents() will merge adjacent object extents
2029 * together, each object request's data descriptor may point to multiple
2030 * different chunks of @fctx->pos data buffer.
2031 *
2032 * @fctx->pos data buffer is assumed to be large enough.
2033 */
2034static int rbd_img_fill_request(struct rbd_img_request *img_req,
2035 struct ceph_file_extent *img_extents,
2036 u32 num_img_extents,
2037 struct rbd_img_fill_ctx *fctx)
2038{
2039 struct rbd_device *rbd_dev = img_req->rbd_dev;
2040 struct rbd_obj_request *obj_req;
2041 u32 i;
2042 int ret;
2043
2044 if (fctx->pos_type == OBJ_REQUEST_NODATA ||
2045 !rbd_layout_is_fancy(&rbd_dev->layout))
2046 return rbd_img_fill_request_nocopy(img_req, img_extents,
2047 num_img_extents, fctx);
2048
2049 img_req->data_type = OBJ_REQUEST_OWN_BVECS;
2050
2051 /*
2052 * Create object requests and determine ->bvec_count for each object
2053 * request. Note that ->bvec_count sum over all object requests may
2054 * be greater than the number of bio_vecs in the provided bio (list)
2055 * or bio_vec array because when mapped, those bio_vecs can straddle
2056 * stripe unit boundaries.
2057 */
2058 fctx->iter = *fctx->pos;
2059 for (i = 0; i < num_img_extents; i++) {
2060 ret = ceph_file_to_extents(&rbd_dev->layout,
2061 img_extents[i].fe_off,
2062 img_extents[i].fe_len,
2063 &img_req->object_extents,
2064 alloc_object_extent, img_req,
2065 fctx->count_fn, &fctx->iter);
2066 if (ret)
2067 return ret;
2068 }
2069
2070 for_each_obj_request(img_req, obj_req) {
2071 obj_req->bvec_pos.bvecs = kmalloc_array(obj_req->bvec_count,
2072 sizeof(*obj_req->bvec_pos.bvecs),
2073 GFP_NOIO);
2074 if (!obj_req->bvec_pos.bvecs)
2075 return -ENOMEM;
Alex Elderb454e362013-04-19 15:34:50 -05002076 }
2077
2078 /*
Ilya Dryomovafb97882018-02-06 19:26:35 +01002079 * Fill in each object request's private bio_vec array, splitting and
2080 * rearranging the provided bio_vecs in stripe unit chunks as needed.
Alex Elderb454e362013-04-19 15:34:50 -05002081 */
Ilya Dryomovafb97882018-02-06 19:26:35 +01002082 fctx->iter = *fctx->pos;
2083 for (i = 0; i < num_img_extents; i++) {
2084 ret = ceph_iterate_extents(&rbd_dev->layout,
2085 img_extents[i].fe_off,
2086 img_extents[i].fe_len,
2087 &img_req->object_extents,
2088 fctx->copy_fn, &fctx->iter);
2089 if (ret)
2090 return ret;
2091 }
Alex Elder3d7efd12013-04-19 15:34:50 -05002092
Ilya Dryomovafb97882018-02-06 19:26:35 +01002093 return __rbd_img_fill_request(img_req);
Alex Elderb454e362013-04-19 15:34:50 -05002094}
2095
Ilya Dryomov5a237812018-02-06 19:26:34 +01002096static int rbd_img_fill_nodata(struct rbd_img_request *img_req,
2097 u64 off, u64 len)
2098{
2099 struct ceph_file_extent ex = { off, len };
2100 union rbd_img_fill_iter dummy;
2101 struct rbd_img_fill_ctx fctx = {
2102 .pos_type = OBJ_REQUEST_NODATA,
2103 .pos = &dummy,
2104 };
2105
2106 return rbd_img_fill_request(img_req, &ex, 1, &fctx);
2107}
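
/*
 * Used for discards: there is no data buffer, so a dummy iter is
 * enough. OBJ_REQUEST_NODATA also short-circuits the fancy-layout
 * check in rbd_img_fill_request(), so the nocopy path is always
 * taken.
 */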
2108
2109static void set_bio_pos(struct ceph_object_extent *ex, u32 bytes, void *arg)
2110{
2111 struct rbd_obj_request *obj_req =
2112 container_of(ex, struct rbd_obj_request, ex);
2113 struct ceph_bio_iter *it = arg;
2114
2115 dout("%s objno %llu bytes %u\n", __func__, ex->oe_objno, bytes);
2116 obj_req->bio_pos = *it;
2117 ceph_bio_iter_advance(it, bytes);
2118}
2119
Ilya Dryomovafb97882018-02-06 19:26:35 +01002120static void count_bio_bvecs(struct ceph_object_extent *ex, u32 bytes, void *arg)
2121{
2122 struct rbd_obj_request *obj_req =
2123 container_of(ex, struct rbd_obj_request, ex);
2124 struct ceph_bio_iter *it = arg;
2125
2126 dout("%s objno %llu bytes %u\n", __func__, ex->oe_objno, bytes);
2127 ceph_bio_iter_advance_step(it, bytes, ({
2128 obj_req->bvec_count++;
2129 }));
 2131}
2132
2133static void copy_bio_bvecs(struct ceph_object_extent *ex, u32 bytes, void *arg)
2134{
2135 struct rbd_obj_request *obj_req =
2136 container_of(ex, struct rbd_obj_request, ex);
2137 struct ceph_bio_iter *it = arg;
2138
2139 dout("%s objno %llu bytes %u\n", __func__, ex->oe_objno, bytes);
2140 ceph_bio_iter_advance_step(it, bytes, ({
2141 obj_req->bvec_pos.bvecs[obj_req->bvec_idx++] = bv;
2142 obj_req->bvec_pos.iter.bi_size += bv.bv_len;
2143 }));
2144}
2145
Ilya Dryomov5a237812018-02-06 19:26:34 +01002146static int __rbd_img_fill_from_bio(struct rbd_img_request *img_req,
2147 struct ceph_file_extent *img_extents,
2148 u32 num_img_extents,
2149 struct ceph_bio_iter *bio_pos)
2150{
2151 struct rbd_img_fill_ctx fctx = {
2152 .pos_type = OBJ_REQUEST_BIO,
2153 .pos = (union rbd_img_fill_iter *)bio_pos,
2154 .set_pos_fn = set_bio_pos,
Ilya Dryomovafb97882018-02-06 19:26:35 +01002155 .count_fn = count_bio_bvecs,
2156 .copy_fn = copy_bio_bvecs,
Ilya Dryomov5a237812018-02-06 19:26:34 +01002157 };
2158
2159 return rbd_img_fill_request(img_req, img_extents, num_img_extents,
2160 &fctx);
2161}
2162
2163static int rbd_img_fill_from_bio(struct rbd_img_request *img_req,
2164 u64 off, u64 len, struct bio *bio)
2165{
2166 struct ceph_file_extent ex = { off, len };
2167 struct ceph_bio_iter it = { .bio = bio, .iter = bio->bi_iter };
2168
2169 return __rbd_img_fill_from_bio(img_req, &ex, 1, &it);
2170}
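
/*
 * Sketch of the block layer caller (see rbd_queue_workfn() further
 * down, assuming a struct request *rq):
 *
 *	rbd_img_fill_from_bio(img_request,
 *			      (u64)blk_rq_pos(rq) << SECTOR_SHIFT,
 *			      blk_rq_bytes(rq), rq->bio);
 */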
2171
2172static void set_bvec_pos(struct ceph_object_extent *ex, u32 bytes, void *arg)
2173{
2174 struct rbd_obj_request *obj_req =
2175 container_of(ex, struct rbd_obj_request, ex);
2176 struct ceph_bvec_iter *it = arg;
2177
2178 obj_req->bvec_pos = *it;
2179 ceph_bvec_iter_shorten(&obj_req->bvec_pos, bytes);
2180 ceph_bvec_iter_advance(it, bytes);
2181}
2182
Ilya Dryomovafb97882018-02-06 19:26:35 +01002183static void count_bvecs(struct ceph_object_extent *ex, u32 bytes, void *arg)
2184{
2185 struct rbd_obj_request *obj_req =
2186 container_of(ex, struct rbd_obj_request, ex);
2187 struct ceph_bvec_iter *it = arg;
2188
2189 ceph_bvec_iter_advance_step(it, bytes, ({
2190 obj_req->bvec_count++;
2191 }));
2192}
2193
2194static void copy_bvecs(struct ceph_object_extent *ex, u32 bytes, void *arg)
2195{
2196 struct rbd_obj_request *obj_req =
2197 container_of(ex, struct rbd_obj_request, ex);
2198 struct ceph_bvec_iter *it = arg;
2199
2200 ceph_bvec_iter_advance_step(it, bytes, ({
2201 obj_req->bvec_pos.bvecs[obj_req->bvec_idx++] = bv;
2202 obj_req->bvec_pos.iter.bi_size += bv.bv_len;
2203 }));
2204}
2205
Ilya Dryomov5a237812018-02-06 19:26:34 +01002206static int __rbd_img_fill_from_bvecs(struct rbd_img_request *img_req,
2207 struct ceph_file_extent *img_extents,
2208 u32 num_img_extents,
2209 struct ceph_bvec_iter *bvec_pos)
2210{
2211 struct rbd_img_fill_ctx fctx = {
2212 .pos_type = OBJ_REQUEST_BVECS,
2213 .pos = (union rbd_img_fill_iter *)bvec_pos,
2214 .set_pos_fn = set_bvec_pos,
Ilya Dryomovafb97882018-02-06 19:26:35 +01002215 .count_fn = count_bvecs,
2216 .copy_fn = copy_bvecs,
Ilya Dryomov5a237812018-02-06 19:26:34 +01002217 };
2218
2219 return rbd_img_fill_request(img_req, img_extents, num_img_extents,
2220 &fctx);
2221}
2222
2223static int rbd_img_fill_from_bvecs(struct rbd_img_request *img_req,
2224 struct ceph_file_extent *img_extents,
2225 u32 num_img_extents,
2226 struct bio_vec *bvecs)
2227{
2228 struct ceph_bvec_iter it = {
2229 .bvecs = bvecs,
2230 .iter = { .bi_size = ceph_file_extents_bytes(img_extents,
2231 num_img_extents) },
2232 };
2233
2234 return __rbd_img_fill_from_bvecs(img_req, img_extents, num_img_extents,
2235 &it);
2236}
2237
Ilya Dryomovefbd1a12018-01-30 17:52:11 +01002238static void rbd_img_request_submit(struct rbd_img_request *img_request)
Alex Elderbf0d5f502012-11-22 00:00:08 -06002239{
Alex Elderbf0d5f502012-11-22 00:00:08 -06002240 struct rbd_obj_request *obj_request;
2241
Alex Elder37206ee2013-02-20 17:32:08 -06002242 dout("%s: img %p\n", __func__, img_request);
Alex Elderbf0d5f502012-11-22 00:00:08 -06002243
Ilya Dryomov663ae2c2016-05-16 13:18:57 +02002244 rbd_img_request_get(img_request);
Ilya Dryomovefbd1a12018-01-30 17:52:11 +01002245 for_each_obj_request(img_request, obj_request)
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002246 rbd_obj_request_submit(obj_request);
Alex Elderbf0d5f502012-11-22 00:00:08 -06002247
Ilya Dryomov663ae2c2016-05-16 13:18:57 +02002248 rbd_img_request_put(img_request);
Alex Elderbf0d5f502012-11-22 00:00:08 -06002249}
2250
Ilya Dryomov86bd7992018-02-06 19:26:33 +01002251static int rbd_obj_read_from_parent(struct rbd_obj_request *obj_req)
Alex Elder8b3e1a52013-01-24 16:13:36 -06002252{
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002253 struct rbd_img_request *img_req = obj_req->img_request;
2254 struct rbd_img_request *child_img_req;
2255 int ret;
Alex Elder8b3e1a52013-01-24 16:13:36 -06002256
Ilya Dryomove93aca02018-02-06 19:26:35 +01002257 child_img_req = rbd_img_request_create(img_req->rbd_dev->parent,
2258 OBJ_OP_READ, NULL);
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002259 if (!child_img_req)
2260 return -ENOMEM;
Alex Elder8b3e1a52013-01-24 16:13:36 -06002261
Ilya Dryomove93aca02018-02-06 19:26:35 +01002262 __set_bit(IMG_REQ_CHILD, &child_img_req->flags);
2263 child_img_req->obj_request = obj_req;
Alex Elder02c74fb2013-05-06 17:40:33 -05002264
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002265 if (!rbd_img_is_write(img_req)) {
Ilya Dryomovecc633c2018-02-01 11:50:47 +01002266 switch (img_req->data_type) {
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002267 case OBJ_REQUEST_BIO:
Ilya Dryomov5a237812018-02-06 19:26:34 +01002268 ret = __rbd_img_fill_from_bio(child_img_req,
2269 obj_req->img_extents,
2270 obj_req->num_img_extents,
2271 &obj_req->bio_pos);
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002272 break;
2273 case OBJ_REQUEST_BVECS:
Ilya Dryomovafb97882018-02-06 19:26:35 +01002274 case OBJ_REQUEST_OWN_BVECS:
Ilya Dryomov5a237812018-02-06 19:26:34 +01002275 ret = __rbd_img_fill_from_bvecs(child_img_req,
2276 obj_req->img_extents,
2277 obj_req->num_img_extents,
2278 &obj_req->bvec_pos);
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002279 break;
2280 default:
2281 rbd_assert(0);
2282 }
2283 } else {
Ilya Dryomov5a237812018-02-06 19:26:34 +01002284 ret = rbd_img_fill_from_bvecs(child_img_req,
2285 obj_req->img_extents,
2286 obj_req->num_img_extents,
2287 obj_req->copyup_bvecs);
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002288 }
2289 if (ret) {
2290 rbd_img_request_put(child_img_req);
2291 return ret;
2292 }
2293
2294 rbd_img_request_submit(child_img_req);
2295 return 0;
2296}
2297
2298static bool rbd_obj_handle_read(struct rbd_obj_request *obj_req)
2299{
2300 struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
2301 int ret;
2302
2303 if (obj_req->result == -ENOENT &&
Ilya Dryomov86bd7992018-02-06 19:26:33 +01002304 rbd_dev->parent_overlap && !obj_req->tried_parent) {
2305 /* reverse map this object extent onto the parent */
2306 ret = rbd_obj_calc_img_extents(obj_req, false);
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002307 if (ret) {
2308 obj_req->result = ret;
2309 return true;
2310 }
Ilya Dryomov86bd7992018-02-06 19:26:33 +01002311
2312 if (obj_req->num_img_extents) {
2313 obj_req->tried_parent = true;
2314 ret = rbd_obj_read_from_parent(obj_req);
2315 if (ret) {
2316 obj_req->result = ret;
2317 return true;
2318 }
2319 return false;
2320 }
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002321 }
Alex Elder02c74fb2013-05-06 17:40:33 -05002322
2323 /*
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002324 * -ENOENT means a hole in the image -- zero-fill the entire
2325 * length of the request. A short read also implies zero-fill
2326 * to the end of the request. In both cases we update xferred
2327 * count to indicate the whole request was satisfied.
Alex Elder02c74fb2013-05-06 17:40:33 -05002328 */
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002329 if (obj_req->result == -ENOENT ||
Ilya Dryomov43df3d32018-02-02 15:23:22 +01002330 (!obj_req->result && obj_req->xferred < obj_req->ex.oe_len)) {
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002331 rbd_assert(!obj_req->xferred || !obj_req->result);
2332 rbd_obj_zero_range(obj_req, obj_req->xferred,
Ilya Dryomov43df3d32018-02-02 15:23:22 +01002333 obj_req->ex.oe_len - obj_req->xferred);
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002334 obj_req->result = 0;
Ilya Dryomov43df3d32018-02-02 15:23:22 +01002335 obj_req->xferred = obj_req->ex.oe_len;
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002336 }
2337
2338 return true;
2339}
2340
2341/*
2342 * copyup_bvecs pages are never highmem pages
2343 */
2344static bool is_zero_bvecs(struct bio_vec *bvecs, u32 bytes)
2345{
2346 struct ceph_bvec_iter it = {
2347 .bvecs = bvecs,
2348 .iter = { .bi_size = bytes },
2349 };
2350
2351 ceph_bvec_iter_advance_step(&it, bytes, ({
2352 if (memchr_inv(page_address(bv.bv_page) + bv.bv_offset, 0,
2353 bv.bv_len))
2354 return false;
2355 }));
2356 return true;
2357}
2358
2359static int rbd_obj_issue_copyup(struct rbd_obj_request *obj_req, u32 bytes)
2360{
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002361 unsigned int num_osd_ops = obj_req->osd_req->r_num_ops;
Chengguang Xufe943d52018-04-12 12:04:55 +08002362 int ret;
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002363
2364 dout("%s obj_req %p bytes %u\n", __func__, obj_req, bytes);
2365 rbd_assert(obj_req->osd_req->r_ops[0].op == CEPH_OSD_OP_STAT);
2366 rbd_osd_req_destroy(obj_req->osd_req);
2367
2368 /*
2369 * Create a copyup request with the same number of OSD ops as
 2370	 * the original request. The original request was stat + op(s);
 2371	 * the new copyup request will be copyup + the same op(s).
2372 */
Ilya Dryomova162b302018-01-30 17:52:10 +01002373 obj_req->osd_req = rbd_osd_req_create(obj_req, num_osd_ops);
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002374 if (!obj_req->osd_req)
2375 return -ENOMEM;
2376
Chengguang Xufe943d52018-04-12 12:04:55 +08002377 ret = osd_req_op_cls_init(obj_req->osd_req, 0, CEPH_OSD_OP_CALL, "rbd",
2378 "copyup");
2379 if (ret)
2380 return ret;
2381
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002382 /*
2383 * Only send non-zero copyup data to save some I/O and network
2384 * bandwidth -- zero copyup data is equivalent to the object not
2385 * existing.
2386 */
2387 if (is_zero_bvecs(obj_req->copyup_bvecs, bytes)) {
2388 dout("%s obj_req %p detected zeroes\n", __func__, obj_req);
2389 bytes = 0;
2390 }
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002391 osd_req_op_cls_request_data_bvecs(obj_req->osd_req, 0,
Ilya Dryomov0010f702018-05-04 16:57:30 +02002392 obj_req->copyup_bvecs,
2393 obj_req->copyup_bvec_count,
2394 bytes);
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002395
Ilya Dryomov9bb02482018-01-30 17:52:10 +01002396 switch (obj_req->img_request->op_type) {
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002397 case OBJ_OP_WRITE:
2398 __rbd_obj_setup_write(obj_req, 1);
2399 break;
2400 case OBJ_OP_DISCARD:
2401 rbd_assert(!rbd_obj_is_entire(obj_req));
2402 __rbd_obj_setup_discard(obj_req, 1);
2403 break;
2404 default:
2405 rbd_assert(0);
2406 }
2407
2408 rbd_obj_request_submit(obj_req);
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002409 return 0;
2410}
2411
Ilya Dryomov7e07efb2018-01-20 10:30:11 +01002412static int setup_copyup_bvecs(struct rbd_obj_request *obj_req, u64 obj_overlap)
2413{
2414 u32 i;
2415
2416 rbd_assert(!obj_req->copyup_bvecs);
2417 obj_req->copyup_bvec_count = calc_pages_for(0, obj_overlap);
2418 obj_req->copyup_bvecs = kcalloc(obj_req->copyup_bvec_count,
2419 sizeof(*obj_req->copyup_bvecs),
2420 GFP_NOIO);
2421 if (!obj_req->copyup_bvecs)
2422 return -ENOMEM;
2423
2424 for (i = 0; i < obj_req->copyup_bvec_count; i++) {
2425 unsigned int len = min(obj_overlap, (u64)PAGE_SIZE);
2426
2427 obj_req->copyup_bvecs[i].bv_page = alloc_page(GFP_NOIO);
2428 if (!obj_req->copyup_bvecs[i].bv_page)
2429 return -ENOMEM;
2430
2431 obj_req->copyup_bvecs[i].bv_offset = 0;
2432 obj_req->copyup_bvecs[i].bv_len = len;
2433 obj_overlap -= len;
2434 }
2435
2436 rbd_assert(!obj_overlap);
2437 return 0;
2438}
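
/*
 * Worked example (assuming 4K pages): obj_overlap = 9000 yields
 * copyup_bvec_count = 3 with bv_len values 4096, 4096 and 808, i.e.
 * whole pages covering exactly the overlap.
 */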
2439
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002440static int rbd_obj_handle_write_guard(struct rbd_obj_request *obj_req)
2441{
2442 struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002443 int ret;
2444
Ilya Dryomov86bd7992018-02-06 19:26:33 +01002445 rbd_assert(obj_req->num_img_extents);
2446 prune_extents(obj_req->img_extents, &obj_req->num_img_extents,
2447 rbd_dev->parent_overlap);
2448 if (!obj_req->num_img_extents) {
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002449 /*
2450 * The overlap has become 0 (most likely because the
2451 * image has been flattened). Use rbd_obj_issue_copyup()
2452 * to re-submit the original write request -- the copyup
2453 * operation itself will be a no-op, since someone must
2454 * have populated the child object while we weren't
2455 * looking. Move to WRITE_FLAT state as we'll be done
2456 * with the operation once the null copyup completes.
2457 */
2458 obj_req->write_state = RBD_OBJ_WRITE_FLAT;
2459 return rbd_obj_issue_copyup(obj_req, 0);
2460 }
2461
Ilya Dryomov86bd7992018-02-06 19:26:33 +01002462 ret = setup_copyup_bvecs(obj_req, rbd_obj_img_extents_bytes(obj_req));
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002463 if (ret)
2464 return ret;
2465
2466 obj_req->write_state = RBD_OBJ_WRITE_COPYUP;
Ilya Dryomov86bd7992018-02-06 19:26:33 +01002467 return rbd_obj_read_from_parent(obj_req);
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002468}
2469
2470static bool rbd_obj_handle_write(struct rbd_obj_request *obj_req)
2471{
2472 int ret;
2473
2474again:
2475 switch (obj_req->write_state) {
2476 case RBD_OBJ_WRITE_GUARD:
2477 rbd_assert(!obj_req->xferred);
2478 if (obj_req->result == -ENOENT) {
2479 /*
2480 * The target object doesn't exist. Read the data for
2481 * the entire target object up to the overlap point (if
2482 * any) from the parent, so we can use it for a copyup.
2483 */
2484 ret = rbd_obj_handle_write_guard(obj_req);
2485 if (ret) {
2486 obj_req->result = ret;
2487 return true;
2488 }
2489 return false;
2490 }
2491 /* fall through */
2492 case RBD_OBJ_WRITE_FLAT:
2493 if (!obj_req->result)
2494 /*
2495 * There is no such thing as a successful short
2496 * write -- indicate the whole request was satisfied.
2497 */
Ilya Dryomov43df3d32018-02-02 15:23:22 +01002498 obj_req->xferred = obj_req->ex.oe_len;
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002499 return true;
2500 case RBD_OBJ_WRITE_COPYUP:
2501 obj_req->write_state = RBD_OBJ_WRITE_GUARD;
2502 if (obj_req->result)
2503 goto again;
2504
2505 rbd_assert(obj_req->xferred);
2506 ret = rbd_obj_issue_copyup(obj_req, obj_req->xferred);
2507 if (ret) {
2508 obj_req->result = ret;
2509 return true;
2510 }
2511 return false;
2512 default:
Arnd Bergmannc6244b32018-04-04 14:53:39 +02002513 BUG();
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002514 }
2515}
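
/*
 * Write states, as driven above:
 *
 *	FLAT:   plain write; any completion finishes the request.
 *	GUARD:  -ENOENT from the stat guard means the target object
 *		does not exist yet; read the parent data, then switch
 *		to COPYUP.
 *	COPYUP: the parent read finished; resubmit as copyup + the
 *		original op(s) and return to GUARD for the final
 *		completion.
 */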
2516
2517/*
2518 * Returns true if @obj_req is completed, or false otherwise.
2519 */
2520static bool __rbd_obj_handle_request(struct rbd_obj_request *obj_req)
2521{
Ilya Dryomov9bb02482018-01-30 17:52:10 +01002522 switch (obj_req->img_request->op_type) {
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002523 case OBJ_OP_READ:
2524 return rbd_obj_handle_read(obj_req);
2525 case OBJ_OP_WRITE:
2526 return rbd_obj_handle_write(obj_req);
2527 case OBJ_OP_DISCARD:
2528 if (rbd_obj_handle_write(obj_req)) {
2529 /*
2530 * Hide -ENOENT from delete/truncate/zero -- discarding
2531 * a non-existent object is not a problem.
2532 */
2533 if (obj_req->result == -ENOENT) {
2534 obj_req->result = 0;
Ilya Dryomov43df3d32018-02-02 15:23:22 +01002535 obj_req->xferred = obj_req->ex.oe_len;
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002536 }
2537 return true;
2538 }
2539 return false;
2540 default:
Arnd Bergmannc6244b32018-04-04 14:53:39 +02002541 BUG();
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002542 }
2543}
2544
Ilya Dryomov7114eda2018-02-01 11:50:47 +01002545static void rbd_obj_end_request(struct rbd_obj_request *obj_req)
2546{
2547 struct rbd_img_request *img_req = obj_req->img_request;
2548
2549 rbd_assert((!obj_req->result &&
Ilya Dryomov43df3d32018-02-02 15:23:22 +01002550 obj_req->xferred == obj_req->ex.oe_len) ||
Ilya Dryomov7114eda2018-02-01 11:50:47 +01002551 (obj_req->result < 0 && !obj_req->xferred));
2552 if (!obj_req->result) {
2553 img_req->xferred += obj_req->xferred;
Ilya Dryomov980917f2016-09-12 18:59:42 +02002554 return;
Alex Elder02c74fb2013-05-06 17:40:33 -05002555 }
2556
Ilya Dryomov7114eda2018-02-01 11:50:47 +01002557 rbd_warn(img_req->rbd_dev,
2558 "%s at objno %llu %llu~%llu result %d xferred %llu",
Ilya Dryomov43df3d32018-02-02 15:23:22 +01002559 obj_op_name(img_req->op_type), obj_req->ex.oe_objno,
2560 obj_req->ex.oe_off, obj_req->ex.oe_len, obj_req->result,
Ilya Dryomov7114eda2018-02-01 11:50:47 +01002561 obj_req->xferred);
2562 if (!img_req->result) {
2563 img_req->result = obj_req->result;
2564 img_req->xferred = 0;
Alex Eldera9e8ba2c2013-04-21 00:32:07 -05002565 }
Alex Elder8b3e1a52013-01-24 16:13:36 -06002566}
2567
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002568static void rbd_img_end_child_request(struct rbd_img_request *img_req)
Alex Elder8b3e1a52013-01-24 16:13:36 -06002569{
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002570 struct rbd_obj_request *obj_req = img_req->obj_request;
Alex Elder8b3e1a52013-01-24 16:13:36 -06002571
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002572 rbd_assert(test_bit(IMG_REQ_CHILD, &img_req->flags));
Ilya Dryomov86bd7992018-02-06 19:26:33 +01002573 rbd_assert((!img_req->result &&
2574 img_req->xferred == rbd_obj_img_extents_bytes(obj_req)) ||
2575 (img_req->result < 0 && !img_req->xferred));
Alex Elder8b3e1a52013-01-24 16:13:36 -06002576
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002577 obj_req->result = img_req->result;
2578 obj_req->xferred = img_req->xferred;
2579 rbd_img_request_put(img_req);
Ilya Dryomov7114eda2018-02-01 11:50:47 +01002580}
Alex Elder8b3e1a52013-01-24 16:13:36 -06002581
Ilya Dryomov7114eda2018-02-01 11:50:47 +01002582static void rbd_img_end_request(struct rbd_img_request *img_req)
2583{
2584 rbd_assert(!test_bit(IMG_REQ_CHILD, &img_req->flags));
2585 rbd_assert((!img_req->result &&
2586 img_req->xferred == blk_rq_bytes(img_req->rq)) ||
2587 (img_req->result < 0 && !img_req->xferred));
Alex Elder8b3e1a52013-01-24 16:13:36 -06002588
Ilya Dryomov7114eda2018-02-01 11:50:47 +01002589 blk_mq_end_request(img_req->rq,
2590 errno_to_blk_status(img_req->result));
2591 rbd_img_request_put(img_req);
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002592}
Alex Elder8b3e1a52013-01-24 16:13:36 -06002593
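/*
 * Completion flow, summarizing the helpers above: each finished
 * object request folds its result into its image request; when the
 * last pending object request completes, the image request is ended.
 * For a child (parent read) image request, the result is copied back
 * to the originating object request, which then re-enters the state
 * machine.
 */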
static void rbd_obj_handle_request(struct rbd_obj_request *obj_req)
{
	struct rbd_img_request *img_req;

again:
	if (!__rbd_obj_handle_request(obj_req))
		return;

	img_req = obj_req->img_request;
	spin_lock(&img_req->completion_lock);
	rbd_obj_end_request(obj_req);
	rbd_assert(img_req->pending_count);
	if (--img_req->pending_count) {
		spin_unlock(&img_req->completion_lock);
		return;
	}

	spin_unlock(&img_req->completion_lock);
	if (test_bit(IMG_REQ_CHILD, &img_req->flags)) {
		obj_req = img_req->obj_request;
		rbd_img_end_child_request(img_req);
		goto again;
	}
	rbd_img_end_request(img_req);
}

static const struct rbd_client_id rbd_empty_cid;

static bool rbd_cid_equal(const struct rbd_client_id *lhs,
			  const struct rbd_client_id *rhs)
{
	return lhs->gid == rhs->gid && lhs->handle == rhs->handle;
}

static struct rbd_client_id rbd_get_cid(struct rbd_device *rbd_dev)
{
	struct rbd_client_id cid;

	mutex_lock(&rbd_dev->watch_mutex);
	cid.gid = ceph_client_gid(rbd_dev->rbd_client->client);
	cid.handle = rbd_dev->watch_cookie;
	mutex_unlock(&rbd_dev->watch_mutex);
	return cid;
}

/*
 * lock_rwsem must be held for write
 */
static void rbd_set_owner_cid(struct rbd_device *rbd_dev,
			      const struct rbd_client_id *cid)
{
	dout("%s rbd_dev %p %llu-%llu -> %llu-%llu\n", __func__, rbd_dev,
	     rbd_dev->owner_cid.gid, rbd_dev->owner_cid.handle,
	     cid->gid, cid->handle);
	rbd_dev->owner_cid = *cid; /* struct */
}

static void format_lock_cookie(struct rbd_device *rbd_dev, char *buf)
{
	mutex_lock(&rbd_dev->watch_mutex);
	sprintf(buf, "%s %llu", RBD_LOCK_COOKIE_PREFIX, rbd_dev->watch_cookie);
	mutex_unlock(&rbd_dev->watch_mutex);
}

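/*
 * The lock cookie ties the exclusive lock to our watch: per
 * format_lock_cookie() above, it is the RBD_LOCK_COOKIE_PREFIX
 * string, a space, and the watch cookie in decimal.  find_watcher()
 * relies on this format to match a locker against the header
 * object's watchers.
 */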
static void __rbd_lock(struct rbd_device *rbd_dev, const char *cookie)
{
	struct rbd_client_id cid = rbd_get_cid(rbd_dev);

	strcpy(rbd_dev->lock_cookie, cookie);
	rbd_set_owner_cid(rbd_dev, &cid);
	queue_work(rbd_dev->task_wq, &rbd_dev->acquired_lock_work);
}

/*
 * lock_rwsem must be held for write
 */
static int rbd_lock(struct rbd_device *rbd_dev)
{
	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
	char cookie[32];
	int ret;

	WARN_ON(__rbd_is_lock_owner(rbd_dev) ||
		rbd_dev->lock_cookie[0] != '\0');

	format_lock_cookie(rbd_dev, cookie);
	ret = ceph_cls_lock(osdc, &rbd_dev->header_oid, &rbd_dev->header_oloc,
			    RBD_LOCK_NAME, CEPH_CLS_LOCK_EXCLUSIVE, cookie,
			    RBD_LOCK_TAG, "", 0);
	if (ret)
		return ret;

	rbd_dev->lock_state = RBD_LOCK_STATE_LOCKED;
	__rbd_lock(rbd_dev, cookie);
	return 0;
}

/*
 * lock_rwsem must be held for write
 */
static void rbd_unlock(struct rbd_device *rbd_dev)
{
	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
	int ret;

	WARN_ON(!__rbd_is_lock_owner(rbd_dev) ||
		rbd_dev->lock_cookie[0] == '\0');

	ret = ceph_cls_unlock(osdc, &rbd_dev->header_oid, &rbd_dev->header_oloc,
			      RBD_LOCK_NAME, rbd_dev->lock_cookie);
	if (ret && ret != -ENOENT)
		rbd_warn(rbd_dev, "failed to unlock: %d", ret);

	/* treat errors as if the image were unlocked */
	rbd_dev->lock_state = RBD_LOCK_STATE_UNLOCKED;
	rbd_dev->lock_cookie[0] = '\0';
	rbd_set_owner_cid(rbd_dev, &rbd_empty_cid);
	queue_work(rbd_dev->task_wq, &rbd_dev->released_lock_work);
}

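/*
 * Payload layout for lock notifications, matching the buf[] sizing
 * below: a ceph encoding header (version byte, compat byte and a
 * 32-bit length -- CEPH_ENCODING_START_BLK_LEN bytes in total),
 * a 32-bit notify_op, and the 64-bit gid and 64-bit handle that make
 * up the ClientId.
 */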
static int __rbd_notify_op_lock(struct rbd_device *rbd_dev,
				enum rbd_notify_op notify_op,
				struct page ***preply_pages,
				size_t *preply_len)
{
	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
	struct rbd_client_id cid = rbd_get_cid(rbd_dev);
	char buf[4 + 8 + 8 + CEPH_ENCODING_START_BLK_LEN];
	int buf_size = sizeof(buf);
	void *p = buf;

	dout("%s rbd_dev %p notify_op %d\n", __func__, rbd_dev, notify_op);

	/* encode *LockPayload NotifyMessage (op + ClientId) */
	ceph_start_encoding(&p, 2, 1, buf_size - CEPH_ENCODING_START_BLK_LEN);
	ceph_encode_32(&p, notify_op);
	ceph_encode_64(&p, cid.gid);
	ceph_encode_64(&p, cid.handle);

	return ceph_osdc_notify(osdc, &rbd_dev->header_oid,
				&rbd_dev->header_oloc, buf, buf_size,
				RBD_NOTIFY_TIMEOUT, preply_pages, preply_len);
}

static void rbd_notify_op_lock(struct rbd_device *rbd_dev,
			       enum rbd_notify_op notify_op)
{
	struct page **reply_pages;
	size_t reply_len;

	__rbd_notify_op_lock(rbd_dev, notify_op, &reply_pages, &reply_len);
	ceph_release_page_vector(reply_pages, calc_pages_for(0, reply_len));
}

static void rbd_notify_acquired_lock(struct work_struct *work)
{
	struct rbd_device *rbd_dev = container_of(work, struct rbd_device,
						  acquired_lock_work);

	rbd_notify_op_lock(rbd_dev, RBD_NOTIFY_OP_ACQUIRED_LOCK);
}

static void rbd_notify_released_lock(struct work_struct *work)
{
	struct rbd_device *rbd_dev = container_of(work, struct rbd_device,
						  released_lock_work);

	rbd_notify_op_lock(rbd_dev, RBD_NOTIFY_OP_RELEASED_LOCK);
}

static int rbd_request_lock(struct rbd_device *rbd_dev)
{
	struct page **reply_pages;
	size_t reply_len;
	bool lock_owner_responded = false;
	int ret;

	dout("%s rbd_dev %p\n", __func__, rbd_dev);

	ret = __rbd_notify_op_lock(rbd_dev, RBD_NOTIFY_OP_REQUEST_LOCK,
				   &reply_pages, &reply_len);
	if (ret && ret != -ETIMEDOUT) {
		rbd_warn(rbd_dev, "failed to request lock: %d", ret);
		goto out;
	}

	if (reply_len > 0 && reply_len <= PAGE_SIZE) {
		void *p = page_address(reply_pages[0]);
		void *const end = p + reply_len;
		u32 n;

		ceph_decode_32_safe(&p, end, n, e_inval); /* num_acks */
		while (n--) {
			u8 struct_v;
			u32 len;

			ceph_decode_need(&p, end, 8 + 8, e_inval);
			p += 8 + 8; /* skip gid and cookie */

			ceph_decode_32_safe(&p, end, len, e_inval);
			if (!len)
				continue;

			if (lock_owner_responded) {
				rbd_warn(rbd_dev,
					 "duplicate lock owners detected");
				ret = -EIO;
				goto out;
			}

			lock_owner_responded = true;
			ret = ceph_start_decoding(&p, end, 1, "ResponseMessage",
						  &struct_v, &len);
			if (ret) {
				rbd_warn(rbd_dev,
					 "failed to decode ResponseMessage: %d",
					 ret);
				goto e_inval;
			}

			ret = ceph_decode_32(&p);
		}
	}

	if (!lock_owner_responded) {
		rbd_warn(rbd_dev, "no lock owners detected");
		ret = -ETIMEDOUT;
	}

out:
	ceph_release_page_vector(reply_pages, calc_pages_for(0, reply_len));
	return ret;

e_inval:
	ret = -EINVAL;
	goto out;
}

static void wake_requests(struct rbd_device *rbd_dev, bool wake_all)
{
	dout("%s rbd_dev %p wake_all %d\n", __func__, rbd_dev, wake_all);

	cancel_delayed_work(&rbd_dev->lock_dwork);
	if (wake_all)
		wake_up_all(&rbd_dev->lock_waitq);
	else
		wake_up(&rbd_dev->lock_waitq);
}

static int get_lock_owner_info(struct rbd_device *rbd_dev,
			       struct ceph_locker **lockers, u32 *num_lockers)
{
	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
	u8 lock_type;
	char *lock_tag;
	int ret;

	dout("%s rbd_dev %p\n", __func__, rbd_dev);

	ret = ceph_cls_lock_info(osdc, &rbd_dev->header_oid,
				 &rbd_dev->header_oloc, RBD_LOCK_NAME,
				 &lock_type, &lock_tag, lockers, num_lockers);
	if (ret)
		return ret;

	if (*num_lockers == 0) {
		dout("%s rbd_dev %p no lockers detected\n", __func__, rbd_dev);
		goto out;
	}

	if (strcmp(lock_tag, RBD_LOCK_TAG)) {
		rbd_warn(rbd_dev, "locked by external mechanism, tag %s",
			 lock_tag);
		ret = -EBUSY;
		goto out;
	}

	if (lock_type == CEPH_CLS_LOCK_SHARED) {
		rbd_warn(rbd_dev, "shared lock type detected");
		ret = -EBUSY;
		goto out;
	}

	if (strncmp((*lockers)[0].id.cookie, RBD_LOCK_COOKIE_PREFIX,
		    strlen(RBD_LOCK_COOKIE_PREFIX))) {
		rbd_warn(rbd_dev, "locked by external mechanism, cookie %s",
			 (*lockers)[0].id.cookie);
		ret = -EBUSY;
		goto out;
	}

out:
	kfree(lock_tag);
	return ret;
}

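/*
 * Check whether the locker still has a watch established on the
 * header object.  The numeric part of the lock cookie must match the
 * watch cookie and the addresses must be identical.  Returns 1 if a
 * matching watcher is found, 0 if not, or a negative error.
 */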
static int find_watcher(struct rbd_device *rbd_dev,
			const struct ceph_locker *locker)
{
	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
	struct ceph_watch_item *watchers;
	u32 num_watchers;
	u64 cookie;
	int i;
	int ret;

	ret = ceph_osdc_list_watchers(osdc, &rbd_dev->header_oid,
				      &rbd_dev->header_oloc, &watchers,
				      &num_watchers);
	if (ret)
		return ret;

	sscanf(locker->id.cookie, RBD_LOCK_COOKIE_PREFIX " %llu", &cookie);
	for (i = 0; i < num_watchers; i++) {
		if (!memcmp(&watchers[i].addr, &locker->info.addr,
			    sizeof(locker->info.addr)) &&
		    watchers[i].cookie == cookie) {
			struct rbd_client_id cid = {
				.gid = le64_to_cpu(watchers[i].name.num),
				.handle = cookie,
			};

			dout("%s rbd_dev %p found cid %llu-%llu\n", __func__,
			     rbd_dev, cid.gid, cid.handle);
			rbd_set_owner_cid(rbd_dev, &cid);
			ret = 1;
			goto out;
		}
	}

	dout("%s rbd_dev %p no watchers\n", __func__, rbd_dev);
	ret = 0;
out:
	kfree(watchers);
	return ret;
}

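/*
 * Break-lock flow, in brief: if the current holder has no matching
 * watch it is assumed dead, so it is blacklisted and its lock broken
 * before we retry taking the lock ourselves.
 */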
/*
 * lock_rwsem must be held for write
 */
static int rbd_try_lock(struct rbd_device *rbd_dev)
{
	struct ceph_client *client = rbd_dev->rbd_client->client;
	struct ceph_locker *lockers;
	u32 num_lockers;
	int ret;

	for (;;) {
		ret = rbd_lock(rbd_dev);
		if (ret != -EBUSY)
			return ret;

		/* determine if the current lock holder is still alive */
		ret = get_lock_owner_info(rbd_dev, &lockers, &num_lockers);
		if (ret)
			return ret;

		if (num_lockers == 0)
			goto again;

		ret = find_watcher(rbd_dev, lockers);
		if (ret) {
			if (ret > 0)
				ret = 0; /* have to request lock */
			goto out;
		}

		rbd_warn(rbd_dev, "%s%llu seems dead, breaking lock",
			 ENTITY_NAME(lockers[0].id.name));

		ret = ceph_monc_blacklist_add(&client->monc,
					      &lockers[0].info.addr);
		if (ret) {
			rbd_warn(rbd_dev, "blacklist of %s%llu failed: %d",
				 ENTITY_NAME(lockers[0].id.name), ret);
			goto out;
		}

		ret = ceph_cls_break_lock(&client->osdc, &rbd_dev->header_oid,
					  &rbd_dev->header_oloc, RBD_LOCK_NAME,
					  lockers[0].id.cookie,
					  &lockers[0].id.name);
		if (ret && ret != -ENOENT)
			goto out;

again:
		ceph_free_lockers(lockers, num_lockers);
	}

out:
	ceph_free_lockers(lockers, num_lockers);
	return ret;
}

/*
 * ret is set only if lock_state is RBD_LOCK_STATE_UNLOCKED
 */
static enum rbd_lock_state rbd_try_acquire_lock(struct rbd_device *rbd_dev,
						int *pret)
{
	enum rbd_lock_state lock_state;

	down_read(&rbd_dev->lock_rwsem);
	dout("%s rbd_dev %p read lock_state %d\n", __func__, rbd_dev,
	     rbd_dev->lock_state);
	if (__rbd_is_lock_owner(rbd_dev)) {
		lock_state = rbd_dev->lock_state;
		up_read(&rbd_dev->lock_rwsem);
		return lock_state;
	}

	up_read(&rbd_dev->lock_rwsem);
	down_write(&rbd_dev->lock_rwsem);
	dout("%s rbd_dev %p write lock_state %d\n", __func__, rbd_dev,
	     rbd_dev->lock_state);
	if (!__rbd_is_lock_owner(rbd_dev)) {
		*pret = rbd_try_lock(rbd_dev);
		if (*pret)
			rbd_warn(rbd_dev, "failed to acquire lock: %d", *pret);
	}

	lock_state = rbd_dev->lock_state;
	up_write(&rbd_dev->lock_rwsem);
	return lock_state;
}

static void rbd_acquire_lock(struct work_struct *work)
{
	struct rbd_device *rbd_dev = container_of(to_delayed_work(work),
					    struct rbd_device, lock_dwork);
	enum rbd_lock_state lock_state;
	int ret = 0;

	dout("%s rbd_dev %p\n", __func__, rbd_dev);
again:
	lock_state = rbd_try_acquire_lock(rbd_dev, &ret);
	if (lock_state != RBD_LOCK_STATE_UNLOCKED || ret == -EBLACKLISTED) {
		if (lock_state == RBD_LOCK_STATE_LOCKED)
			wake_requests(rbd_dev, true);
		dout("%s rbd_dev %p lock_state %d ret %d - done\n", __func__,
		     rbd_dev, lock_state, ret);
		return;
	}

	ret = rbd_request_lock(rbd_dev);
	if (ret == -ETIMEDOUT) {
		goto again; /* treat this as a dead client */
	} else if (ret == -EROFS) {
		rbd_warn(rbd_dev, "peer will not release lock");
		/*
		 * If this is rbd_add_acquire_lock(), we want to fail
		 * immediately -- reuse BLACKLISTED flag. Otherwise we
		 * want to block.
		 */
		if (!(rbd_dev->disk->flags & GENHD_FL_UP)) {
			set_bit(RBD_DEV_FLAG_BLACKLISTED, &rbd_dev->flags);
			/* wake "rbd map --exclusive" process */
			wake_requests(rbd_dev, false);
		}
	} else if (ret < 0) {
		rbd_warn(rbd_dev, "error requesting lock: %d", ret);
		mod_delayed_work(rbd_dev->task_wq, &rbd_dev->lock_dwork,
				 RBD_RETRY_DELAY);
	} else {
		/*
		 * lock owner acked, but resend if we don't see them
		 * release the lock
		 */
		dout("%s rbd_dev %p requeueing lock_dwork\n", __func__,
		     rbd_dev);
		mod_delayed_work(rbd_dev->task_wq, &rbd_dev->lock_dwork,
		    msecs_to_jiffies(2 * RBD_NOTIFY_TIMEOUT * MSEC_PER_SEC));
	}
}

/*
 * lock_rwsem must be held for write
 */
static bool rbd_release_lock(struct rbd_device *rbd_dev)
{
	dout("%s rbd_dev %p read lock_state %d\n", __func__, rbd_dev,
	     rbd_dev->lock_state);
	if (rbd_dev->lock_state != RBD_LOCK_STATE_LOCKED)
		return false;

	rbd_dev->lock_state = RBD_LOCK_STATE_RELEASING;
	downgrade_write(&rbd_dev->lock_rwsem);
	/*
	 * Ensure that all in-flight IO is flushed.
	 *
	 * FIXME: ceph_osdc_sync() flushes the entire OSD client, which
	 * may be shared with other devices.
	 */
	ceph_osdc_sync(&rbd_dev->rbd_client->client->osdc);
	up_read(&rbd_dev->lock_rwsem);

	down_write(&rbd_dev->lock_rwsem);
	dout("%s rbd_dev %p write lock_state %d\n", __func__, rbd_dev,
	     rbd_dev->lock_state);
	if (rbd_dev->lock_state != RBD_LOCK_STATE_RELEASING)
		return false;

	rbd_unlock(rbd_dev);
	/*
	 * Give others a chance to grab the lock - we would re-acquire
	 * almost immediately if we got new IO during ceph_osdc_sync()
	 * otherwise. We need to ack our own notifications, so this
	 * lock_dwork will be requeued from rbd_wait_state_locked()
	 * after wake_requests() in rbd_handle_released_lock().
	 */
	cancel_delayed_work(&rbd_dev->lock_dwork);
	return true;
}

static void rbd_release_lock_work(struct work_struct *work)
{
	struct rbd_device *rbd_dev = container_of(work, struct rbd_device,
						  unlock_work);

	down_write(&rbd_dev->lock_rwsem);
	rbd_release_lock(rbd_dev);
	up_write(&rbd_dev->lock_rwsem);
}

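/*
 * Peer lock notifications update the cached owner cid: ACQUIRED_LOCK
 * records the new owner, RELEASED_LOCK clears it, and in both cases
 * waiters are woken so blocked IO can retry (see the handlers below).
 */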
static void rbd_handle_acquired_lock(struct rbd_device *rbd_dev, u8 struct_v,
				     void **p)
{
	struct rbd_client_id cid = { 0 };

	if (struct_v >= 2) {
		cid.gid = ceph_decode_64(p);
		cid.handle = ceph_decode_64(p);
	}

	dout("%s rbd_dev %p cid %llu-%llu\n", __func__, rbd_dev, cid.gid,
	     cid.handle);
	if (!rbd_cid_equal(&cid, &rbd_empty_cid)) {
		down_write(&rbd_dev->lock_rwsem);
		if (rbd_cid_equal(&cid, &rbd_dev->owner_cid)) {
			/*
			 * we already know that the remote client is
			 * the owner
			 */
			up_write(&rbd_dev->lock_rwsem);
			return;
		}

		rbd_set_owner_cid(rbd_dev, &cid);
		downgrade_write(&rbd_dev->lock_rwsem);
	} else {
		down_read(&rbd_dev->lock_rwsem);
	}

	if (!__rbd_is_lock_owner(rbd_dev))
		wake_requests(rbd_dev, false);
	up_read(&rbd_dev->lock_rwsem);
}

static void rbd_handle_released_lock(struct rbd_device *rbd_dev, u8 struct_v,
				     void **p)
{
	struct rbd_client_id cid = { 0 };

	if (struct_v >= 2) {
		cid.gid = ceph_decode_64(p);
		cid.handle = ceph_decode_64(p);
	}

	dout("%s rbd_dev %p cid %llu-%llu\n", __func__, rbd_dev, cid.gid,
	     cid.handle);
	if (!rbd_cid_equal(&cid, &rbd_empty_cid)) {
		down_write(&rbd_dev->lock_rwsem);
		if (!rbd_cid_equal(&cid, &rbd_dev->owner_cid)) {
			dout("%s rbd_dev %p unexpected owner, cid %llu-%llu != owner_cid %llu-%llu\n",
			     __func__, rbd_dev, cid.gid, cid.handle,
			     rbd_dev->owner_cid.gid, rbd_dev->owner_cid.handle);
			up_write(&rbd_dev->lock_rwsem);
			return;
		}

		rbd_set_owner_cid(rbd_dev, &rbd_empty_cid);
		downgrade_write(&rbd_dev->lock_rwsem);
	} else {
		down_read(&rbd_dev->lock_rwsem);
	}

	if (!__rbd_is_lock_owner(rbd_dev))
		wake_requests(rbd_dev, false);
	up_read(&rbd_dev->lock_rwsem);
}

/*
 * Returns result for ResponseMessage to be encoded (<= 0), or 1 if no
 * ResponseMessage is needed.
 */
static int rbd_handle_request_lock(struct rbd_device *rbd_dev, u8 struct_v,
				   void **p)
{
	struct rbd_client_id my_cid = rbd_get_cid(rbd_dev);
	struct rbd_client_id cid = { 0 };
	int result = 1;

	if (struct_v >= 2) {
		cid.gid = ceph_decode_64(p);
		cid.handle = ceph_decode_64(p);
	}

	dout("%s rbd_dev %p cid %llu-%llu\n", __func__, rbd_dev, cid.gid,
	     cid.handle);
	if (rbd_cid_equal(&cid, &my_cid))
		return result;

	down_read(&rbd_dev->lock_rwsem);
	if (__rbd_is_lock_owner(rbd_dev)) {
		if (rbd_dev->lock_state == RBD_LOCK_STATE_LOCKED &&
		    rbd_cid_equal(&rbd_dev->owner_cid, &rbd_empty_cid))
			goto out_unlock;

		/*
		 * encode ResponseMessage(0) so the peer can detect
		 * a missing owner
		 */
		result = 0;

		if (rbd_dev->lock_state == RBD_LOCK_STATE_LOCKED) {
			if (!rbd_dev->opts->exclusive) {
				dout("%s rbd_dev %p queueing unlock_work\n",
				     __func__, rbd_dev);
				queue_work(rbd_dev->task_wq,
					   &rbd_dev->unlock_work);
			} else {
				/* refuse to release the lock */
				result = -EROFS;
			}
		}
	}

out_unlock:
	up_read(&rbd_dev->lock_rwsem);
	return result;
}

static void __rbd_acknowledge_notify(struct rbd_device *rbd_dev,
				     u64 notify_id, u64 cookie, s32 *result)
{
	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
	char buf[4 + CEPH_ENCODING_START_BLK_LEN];
	int buf_size = sizeof(buf);
	int ret;

	if (result) {
		void *p = buf;

		/* encode ResponseMessage */
		ceph_start_encoding(&p, 1, 1,
				    buf_size - CEPH_ENCODING_START_BLK_LEN);
		ceph_encode_32(&p, *result);
	} else {
		buf_size = 0;
	}

	ret = ceph_osdc_notify_ack(osdc, &rbd_dev->header_oid,
				   &rbd_dev->header_oloc, notify_id, cookie,
				   buf, buf_size);
	if (ret)
		rbd_warn(rbd_dev, "acknowledge_notify failed: %d", ret);
}

static void rbd_acknowledge_notify(struct rbd_device *rbd_dev, u64 notify_id,
				   u64 cookie)
{
	dout("%s rbd_dev %p\n", __func__, rbd_dev);
	__rbd_acknowledge_notify(rbd_dev, notify_id, cookie, NULL);
}

static void rbd_acknowledge_notify_result(struct rbd_device *rbd_dev,
					  u64 notify_id, u64 cookie, s32 result)
{
	dout("%s rbd_dev %p result %d\n", __func__, rbd_dev, result);
	__rbd_acknowledge_notify(rbd_dev, notify_id, cookie, &result);
}

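/*
 * Watch callback: an empty payload is a legacy header-update
 * notification; otherwise the NotifyMessage header is decoded and
 * the notification is dispatched on notify_op.
 */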
static void rbd_watch_cb(void *arg, u64 notify_id, u64 cookie,
			 u64 notifier_id, void *data, size_t data_len)
{
	struct rbd_device *rbd_dev = arg;
	void *p = data;
	void *const end = p + data_len;
	u8 struct_v = 0;
	u32 len;
	u32 notify_op;
	int ret;

	dout("%s rbd_dev %p cookie %llu notify_id %llu data_len %zu\n",
	     __func__, rbd_dev, cookie, notify_id, data_len);
	if (data_len) {
		ret = ceph_start_decoding(&p, end, 1, "NotifyMessage",
					  &struct_v, &len);
		if (ret) {
			rbd_warn(rbd_dev, "failed to decode NotifyMessage: %d",
				 ret);
			return;
		}

		notify_op = ceph_decode_32(&p);
	} else {
		/* legacy notification for header updates */
		notify_op = RBD_NOTIFY_OP_HEADER_UPDATE;
		len = 0;
	}

	dout("%s rbd_dev %p notify_op %u\n", __func__, rbd_dev, notify_op);
	switch (notify_op) {
	case RBD_NOTIFY_OP_ACQUIRED_LOCK:
		rbd_handle_acquired_lock(rbd_dev, struct_v, &p);
		rbd_acknowledge_notify(rbd_dev, notify_id, cookie);
		break;
	case RBD_NOTIFY_OP_RELEASED_LOCK:
		rbd_handle_released_lock(rbd_dev, struct_v, &p);
		rbd_acknowledge_notify(rbd_dev, notify_id, cookie);
		break;
	case RBD_NOTIFY_OP_REQUEST_LOCK:
		ret = rbd_handle_request_lock(rbd_dev, struct_v, &p);
		if (ret <= 0)
			rbd_acknowledge_notify_result(rbd_dev, notify_id,
						      cookie, ret);
		else
			rbd_acknowledge_notify(rbd_dev, notify_id, cookie);
		break;
	case RBD_NOTIFY_OP_HEADER_UPDATE:
		ret = rbd_dev_refresh(rbd_dev);
		if (ret)
			rbd_warn(rbd_dev, "refresh failed: %d", ret);

		rbd_acknowledge_notify(rbd_dev, notify_id, cookie);
		break;
	default:
		if (rbd_is_lock_owner(rbd_dev))
			rbd_acknowledge_notify_result(rbd_dev, notify_id,
						      cookie, -EOPNOTSUPP);
		else
			rbd_acknowledge_notify(rbd_dev, notify_id, cookie);
		break;
	}
}

static void __rbd_unregister_watch(struct rbd_device *rbd_dev);

static void rbd_watch_errcb(void *arg, u64 cookie, int err)
{
	struct rbd_device *rbd_dev = arg;

	rbd_warn(rbd_dev, "encountered watch error: %d", err);

	down_write(&rbd_dev->lock_rwsem);
	rbd_set_owner_cid(rbd_dev, &rbd_empty_cid);
	up_write(&rbd_dev->lock_rwsem);

	mutex_lock(&rbd_dev->watch_mutex);
	if (rbd_dev->watch_state == RBD_WATCH_STATE_REGISTERED) {
		__rbd_unregister_watch(rbd_dev);
		rbd_dev->watch_state = RBD_WATCH_STATE_ERROR;

		queue_delayed_work(rbd_dev->task_wq, &rbd_dev->watch_dwork, 0);
	}
	mutex_unlock(&rbd_dev->watch_mutex);
}

/*
 * watch_mutex must be locked
 */
static int __rbd_register_watch(struct rbd_device *rbd_dev)
{
	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
	struct ceph_osd_linger_request *handle;

	rbd_assert(!rbd_dev->watch_handle);
	dout("%s rbd_dev %p\n", __func__, rbd_dev);

	handle = ceph_osdc_watch(osdc, &rbd_dev->header_oid,
				 &rbd_dev->header_oloc, rbd_watch_cb,
				 rbd_watch_errcb, rbd_dev);
	if (IS_ERR(handle))
		return PTR_ERR(handle);

	rbd_dev->watch_handle = handle;
	return 0;
}

/*
 * watch_mutex must be locked
 */
static void __rbd_unregister_watch(struct rbd_device *rbd_dev)
{
	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
	int ret;

	rbd_assert(rbd_dev->watch_handle);
	dout("%s rbd_dev %p\n", __func__, rbd_dev);

	ret = ceph_osdc_unwatch(osdc, rbd_dev->watch_handle);
	if (ret)
		rbd_warn(rbd_dev, "failed to unwatch: %d", ret);

	rbd_dev->watch_handle = NULL;
}

static int rbd_register_watch(struct rbd_device *rbd_dev)
{
	int ret;

	mutex_lock(&rbd_dev->watch_mutex);
	rbd_assert(rbd_dev->watch_state == RBD_WATCH_STATE_UNREGISTERED);
	ret = __rbd_register_watch(rbd_dev);
	if (ret)
		goto out;

	rbd_dev->watch_state = RBD_WATCH_STATE_REGISTERED;
	rbd_dev->watch_cookie = rbd_dev->watch_handle->linger_id;

out:
	mutex_unlock(&rbd_dev->watch_mutex);
	return ret;
}

static void cancel_tasks_sync(struct rbd_device *rbd_dev)
{
	dout("%s rbd_dev %p\n", __func__, rbd_dev);

	cancel_work_sync(&rbd_dev->acquired_lock_work);
	cancel_work_sync(&rbd_dev->released_lock_work);
	cancel_delayed_work_sync(&rbd_dev->lock_dwork);
	cancel_work_sync(&rbd_dev->unlock_work);
}

static void rbd_unregister_watch(struct rbd_device *rbd_dev)
{
	WARN_ON(waitqueue_active(&rbd_dev->lock_waitq));
	cancel_tasks_sync(rbd_dev);

	mutex_lock(&rbd_dev->watch_mutex);
	if (rbd_dev->watch_state == RBD_WATCH_STATE_REGISTERED)
		__rbd_unregister_watch(rbd_dev);
	rbd_dev->watch_state = RBD_WATCH_STATE_UNREGISTERED;
	mutex_unlock(&rbd_dev->watch_mutex);

	cancel_delayed_work_sync(&rbd_dev->watch_dwork);
	ceph_osdc_flush_notifies(&rbd_dev->rbd_client->client->osdc);
}

/*
 * lock_rwsem must be held for write
 */
static void rbd_reacquire_lock(struct rbd_device *rbd_dev)
{
	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
	char cookie[32];
	int ret;

	WARN_ON(rbd_dev->lock_state != RBD_LOCK_STATE_LOCKED);

	format_lock_cookie(rbd_dev, cookie);
	ret = ceph_cls_set_cookie(osdc, &rbd_dev->header_oid,
				  &rbd_dev->header_oloc, RBD_LOCK_NAME,
				  CEPH_CLS_LOCK_EXCLUSIVE, rbd_dev->lock_cookie,
				  RBD_LOCK_TAG, cookie);
	if (ret) {
		if (ret != -EOPNOTSUPP)
			rbd_warn(rbd_dev, "failed to update lock cookie: %d",
				 ret);

		/*
		 * Lock cookie cannot be updated on older OSDs, so do
		 * a manual release and queue an acquire.
		 */
		if (rbd_release_lock(rbd_dev))
			queue_delayed_work(rbd_dev->task_wq,
					   &rbd_dev->lock_dwork, 0);
	} else {
		__rbd_lock(rbd_dev, cookie);
	}
}

static void rbd_reregister_watch(struct work_struct *work)
{
	struct rbd_device *rbd_dev = container_of(to_delayed_work(work),
					    struct rbd_device, watch_dwork);
	int ret;

	dout("%s rbd_dev %p\n", __func__, rbd_dev);

	mutex_lock(&rbd_dev->watch_mutex);
	if (rbd_dev->watch_state != RBD_WATCH_STATE_ERROR) {
		mutex_unlock(&rbd_dev->watch_mutex);
		return;
	}

	ret = __rbd_register_watch(rbd_dev);
	if (ret) {
		rbd_warn(rbd_dev, "failed to reregister watch: %d", ret);
		if (ret == -EBLACKLISTED || ret == -ENOENT) {
			set_bit(RBD_DEV_FLAG_BLACKLISTED, &rbd_dev->flags);
			wake_requests(rbd_dev, true);
		} else {
			queue_delayed_work(rbd_dev->task_wq,
					   &rbd_dev->watch_dwork,
					   RBD_RETRY_DELAY);
		}
		mutex_unlock(&rbd_dev->watch_mutex);
		return;
	}

	rbd_dev->watch_state = RBD_WATCH_STATE_REGISTERED;
	rbd_dev->watch_cookie = rbd_dev->watch_handle->linger_id;
	mutex_unlock(&rbd_dev->watch_mutex);

	down_write(&rbd_dev->lock_rwsem);
	if (rbd_dev->lock_state == RBD_LOCK_STATE_LOCKED)
		rbd_reacquire_lock(rbd_dev);
	up_write(&rbd_dev->lock_rwsem);

	ret = rbd_dev_refresh(rbd_dev);
	if (ret)
		rbd_warn(rbd_dev, "reregistration refresh failed: %d", ret);
}

/*
 * Synchronous osd object method call. Returns the number of bytes
 * returned in the inbound buffer, or a negative error code.
 */
static int rbd_obj_method_sync(struct rbd_device *rbd_dev,
			       struct ceph_object_id *oid,
			       struct ceph_object_locator *oloc,
			       const char *method_name,
			       const void *outbound,
			       size_t outbound_size,
			       void *inbound,
			       size_t inbound_size)
{
	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
	struct page *req_page = NULL;
	struct page *reply_page;
	int ret;

	/*
	 * Method calls are ultimately read operations. The result
	 * should be placed into the inbound buffer provided. They
	 * also supply outbound data -- parameters for the object
	 * method. Currently if this is present it will be a
	 * snapshot id.
	 */
	if (outbound) {
		if (outbound_size > PAGE_SIZE)
			return -E2BIG;

		req_page = alloc_page(GFP_KERNEL);
		if (!req_page)
			return -ENOMEM;

		memcpy(page_address(req_page), outbound, outbound_size);
	}

	reply_page = alloc_page(GFP_KERNEL);
	if (!reply_page) {
		if (req_page)
			__free_page(req_page);
		return -ENOMEM;
	}

	ret = ceph_osdc_call(osdc, oid, oloc, RBD_DRV_NAME, method_name,
			     CEPH_OSD_FLAG_READ, req_page, outbound_size,
			     reply_page, &inbound_size);
	if (!ret) {
		memcpy(inbound, page_address(reply_page), inbound_size);
		ret = inbound_size;
	}

	if (req_page)
		__free_page(req_page);
	__free_page(reply_page);
	return ret;
}

/*
 * lock_rwsem must be held for read
 */
static int rbd_wait_state_locked(struct rbd_device *rbd_dev, bool may_acquire)
{
	DEFINE_WAIT(wait);
	unsigned long timeout;
	int ret = 0;

	if (test_bit(RBD_DEV_FLAG_BLACKLISTED, &rbd_dev->flags))
		return -EBLACKLISTED;

	if (rbd_dev->lock_state == RBD_LOCK_STATE_LOCKED)
		return 0;

	if (!may_acquire) {
		rbd_warn(rbd_dev, "exclusive lock required");
		return -EROFS;
	}

	do {
		/*
		 * Note the use of mod_delayed_work() in rbd_acquire_lock()
		 * and cancel_delayed_work() in wake_requests().
		 */
		dout("%s rbd_dev %p queueing lock_dwork\n", __func__, rbd_dev);
		queue_delayed_work(rbd_dev->task_wq, &rbd_dev->lock_dwork, 0);
		prepare_to_wait_exclusive(&rbd_dev->lock_waitq, &wait,
					  TASK_UNINTERRUPTIBLE);
		up_read(&rbd_dev->lock_rwsem);
		timeout = schedule_timeout(ceph_timeout_jiffies(
						rbd_dev->opts->lock_timeout));
		down_read(&rbd_dev->lock_rwsem);
		if (test_bit(RBD_DEV_FLAG_BLACKLISTED, &rbd_dev->flags)) {
			ret = -EBLACKLISTED;
			break;
		}
		if (!timeout) {
			rbd_warn(rbd_dev, "timed out waiting for lock");
			ret = -ETIMEDOUT;
			break;
		}
	} while (rbd_dev->lock_state != RBD_LOCK_STATE_LOCKED);

	finish_wait(&rbd_dev->lock_waitq, &wait);
	return ret;
}

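/*
 * Request path note: rbd_queue_rq() below only bounces each request
 * to a workqueue; the actual mapping to an image request and the
 * submission to the OSD client happen here, in process context,
 * since the submission path may block.
 */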
static void rbd_queue_workfn(struct work_struct *work)
{
	struct request *rq = blk_mq_rq_from_pdu(work);
	struct rbd_device *rbd_dev = rq->q->queuedata;
	struct rbd_img_request *img_request;
	struct ceph_snap_context *snapc = NULL;
	u64 offset = (u64)blk_rq_pos(rq) << SECTOR_SHIFT;
	u64 length = blk_rq_bytes(rq);
	enum obj_operation_type op_type;
	u64 mapping_size;
	bool must_be_locked;
	int result;

	switch (req_op(rq)) {
	case REQ_OP_DISCARD:
	case REQ_OP_WRITE_ZEROES:
		op_type = OBJ_OP_DISCARD;
		break;
	case REQ_OP_WRITE:
		op_type = OBJ_OP_WRITE;
		break;
	case REQ_OP_READ:
		op_type = OBJ_OP_READ;
		break;
	default:
		dout("%s: non-fs request type %d\n", __func__, req_op(rq));
		result = -EIO;
		goto err;
	}

	/* Ignore/skip any zero-length requests */

	if (!length) {
		dout("%s: zero-length request\n", __func__);
		result = 0;
		goto err_rq;
	}

	rbd_assert(op_type == OBJ_OP_READ ||
		   rbd_dev->spec->snap_id == CEPH_NOSNAP);

	/*
	 * Quit early if the mapped snapshot no longer exists. It's
	 * still possible the snapshot will have disappeared by the
	 * time our request arrives at the osd, but there's no sense in
	 * sending it if we already know.
	 */
	if (!test_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags)) {
		dout("request for non-existent snapshot");
		rbd_assert(rbd_dev->spec->snap_id != CEPH_NOSNAP);
		result = -ENXIO;
		goto err_rq;
	}

	if (offset && length > U64_MAX - offset + 1) {
		rbd_warn(rbd_dev, "bad request range (%llu~%llu)", offset,
			 length);
		result = -EINVAL;
		goto err_rq;	/* Shouldn't happen */
	}

	blk_mq_start_request(rq);

	down_read(&rbd_dev->header_rwsem);
	mapping_size = rbd_dev->mapping.size;
	if (op_type != OBJ_OP_READ) {
		snapc = rbd_dev->header.snapc;
		ceph_get_snap_context(snapc);
	}
	up_read(&rbd_dev->header_rwsem);

	if (offset + length > mapping_size) {
		rbd_warn(rbd_dev, "beyond EOD (%llu~%llu > %llu)", offset,
			 length, mapping_size);
		result = -EIO;
		goto err_rq;
	}

	must_be_locked =
	    (rbd_dev->header.features & RBD_FEATURE_EXCLUSIVE_LOCK) &&
	    (op_type != OBJ_OP_READ || rbd_dev->opts->lock_on_read);
	if (must_be_locked) {
		down_read(&rbd_dev->lock_rwsem);
		result = rbd_wait_state_locked(rbd_dev,
					       !rbd_dev->opts->exclusive);
		if (result)
			goto err_unlock;
	}

	img_request = rbd_img_request_create(rbd_dev, op_type, snapc);
	if (!img_request) {
		result = -ENOMEM;
		goto err_unlock;
	}
	img_request->rq = rq;
	snapc = NULL; /* img_request consumes a ref */

	if (op_type == OBJ_OP_DISCARD)
		result = rbd_img_fill_nodata(img_request, offset, length);
	else
		result = rbd_img_fill_from_bio(img_request, offset, length,
					       rq->bio);
	if (result)
		goto err_img_request;

	rbd_img_request_submit(img_request);
	if (must_be_locked)
		up_read(&rbd_dev->lock_rwsem);
	return;

err_img_request:
	rbd_img_request_put(img_request);
err_unlock:
	if (must_be_locked)
		up_read(&rbd_dev->lock_rwsem);
err_rq:
	if (result)
		rbd_warn(rbd_dev, "%s %llx at %llx result %d",
			 obj_op_name(op_type), length, offset, result);
	ceph_put_snap_context(snapc);
err:
	blk_mq_end_request(rq, errno_to_blk_status(result));
}

static blk_status_t rbd_queue_rq(struct blk_mq_hw_ctx *hctx,
		const struct blk_mq_queue_data *bd)
{
	struct request *rq = bd->rq;
	struct work_struct *work = blk_mq_rq_to_pdu(rq);

	queue_work(rbd_wq, work);
	return BLK_STS_OK;
}

static void rbd_free_disk(struct rbd_device *rbd_dev)
{
	blk_cleanup_queue(rbd_dev->disk->queue);
	blk_mq_free_tag_set(&rbd_dev->tag_set);
	put_disk(rbd_dev->disk);
	rbd_dev->disk = NULL;
}

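/*
 * Synchronous, buffer-based read of a single object's data; used for
 * small metadata reads such as the v1 image header below.
 */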
static int rbd_obj_read_sync(struct rbd_device *rbd_dev,
			     struct ceph_object_id *oid,
			     struct ceph_object_locator *oloc,
			     void *buf, int buf_len)
{
	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
	struct ceph_osd_request *req;
	struct page **pages;
	int num_pages = calc_pages_for(0, buf_len);
	int ret;

	req = ceph_osdc_alloc_request(osdc, NULL, 1, false, GFP_KERNEL);
	if (!req)
		return -ENOMEM;

	ceph_oid_copy(&req->r_base_oid, oid);
	ceph_oloc_copy(&req->r_base_oloc, oloc);
	req->r_flags = CEPH_OSD_FLAG_READ;

	ret = ceph_osdc_alloc_messages(req, GFP_KERNEL);
	if (ret)
		goto out_req;

	pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL);
	if (IS_ERR(pages)) {
		ret = PTR_ERR(pages);
		goto out_req;
	}

	osd_req_op_extent_init(req, 0, CEPH_OSD_OP_READ, 0, buf_len, 0, 0);
	osd_req_op_extent_osd_data_pages(req, 0, pages, buf_len, 0, false,
					 true);

	ceph_osdc_start_request(osdc, req, false);
	ret = ceph_osdc_wait_request(osdc, req);
	if (ret >= 0)
		ceph_copy_from_page_vector(pages, buf, 0, ret);

out_req:
	ceph_osdc_put_request(req);
	return ret;
}

/*
 * Read the complete header for the given rbd device. On successful
 * return, the rbd_dev->header field will contain up-to-date
 * information about the image.
 */
static int rbd_dev_v1_header_info(struct rbd_device *rbd_dev)
{
	struct rbd_image_header_ondisk *ondisk = NULL;
	u32 snap_count = 0;
	u64 names_size = 0;
	u32 want_count;
	int ret;

	/*
	 * The complete header will include an array of its 64-bit
	 * snapshot ids, followed by the names of those snapshots as
	 * a contiguous block of NUL-terminated strings. Note that
	 * the number of snapshots could change by the time we read
	 * it in, in which case we re-read it.
	 */
	do {
		size_t size;

		kfree(ondisk);

		size = sizeof (*ondisk);
		size += snap_count * sizeof (struct rbd_image_snap_ondisk);
		size += names_size;
		ondisk = kmalloc(size, GFP_KERNEL);
		if (!ondisk)
			return -ENOMEM;

		ret = rbd_obj_read_sync(rbd_dev, &rbd_dev->header_oid,
					&rbd_dev->header_oloc, ondisk, size);
		if (ret < 0)
			goto out;
		if ((size_t)ret < size) {
			ret = -ENXIO;
			rbd_warn(rbd_dev, "short header read (want %zd got %d)",
				 size, ret);
			goto out;
		}
		if (!rbd_dev_ondisk_valid(ondisk)) {
			ret = -ENXIO;
			rbd_warn(rbd_dev, "invalid header");
			goto out;
		}

		names_size = le64_to_cpu(ondisk->snap_names_len);
		want_count = snap_count;
		snap_count = le32_to_cpu(ondisk->snap_count);
	} while (snap_count != want_count);

	ret = rbd_header_from_disk(rbd_dev, ondisk);
out:
	kfree(ondisk);

	return ret;
}

/*
 * Clear the rbd device's EXISTS flag if the snapshot it's mapped to
 * has disappeared from the (just updated) snapshot context.
 */
static void rbd_exists_validate(struct rbd_device *rbd_dev)
{
	u64 snap_id;

	if (!test_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags))
		return;

	snap_id = rbd_dev->spec->snap_id;
	if (snap_id == CEPH_NOSNAP)
		return;

	if (rbd_dev_snap_index(rbd_dev, snap_id) == BAD_SNAP_INDEX)
		clear_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags);
}

static void rbd_dev_update_size(struct rbd_device *rbd_dev)
{
	sector_t size;

	/*
	 * If EXISTS is not set, rbd_dev->disk may be NULL, so don't
	 * try to update its size. If REMOVING is set, updating size
	 * is just useless work since the device can't be opened.
	 */
	if (test_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags) &&
	    !test_bit(RBD_DEV_FLAG_REMOVING, &rbd_dev->flags)) {
		size = (sector_t)rbd_dev->mapping.size / SECTOR_SIZE;
		dout("setting size to %llu sectors", (unsigned long long)size);
		set_capacity(rbd_dev->disk, size);
		revalidate_disk(rbd_dev->disk);
	}
}

static int rbd_dev_refresh(struct rbd_device *rbd_dev)
{
	u64 mapping_size;
	int ret;

	down_write(&rbd_dev->header_rwsem);
	mapping_size = rbd_dev->mapping.size;

	ret = rbd_dev_header_info(rbd_dev);
	if (ret)
		goto out;

	/*
	 * If there is a parent, see if it has disappeared due to the
	 * mapped image getting flattened.
	 */
	if (rbd_dev->parent) {
		ret = rbd_dev_v2_parent_info(rbd_dev);
		if (ret)
			goto out;
	}

	if (rbd_dev->spec->snap_id == CEPH_NOSNAP) {
		rbd_dev->mapping.size = rbd_dev->header.image_size;
	} else {
		/* validate mapped snapshot's EXISTS flag */
		rbd_exists_validate(rbd_dev);
	}

out:
	up_write(&rbd_dev->header_rwsem);
	if (!ret && mapping_size != rbd_dev->mapping.size)
		rbd_dev_update_size(rbd_dev);

	return ret;
}

static int rbd_init_request(struct blk_mq_tag_set *set, struct request *rq,
		unsigned int hctx_idx, unsigned int numa_node)
{
	struct work_struct *work = blk_mq_rq_to_pdu(rq);

	INIT_WORK(work, rbd_queue_workfn);
	return 0;
}

static const struct blk_mq_ops rbd_mq_ops = {
	.queue_rq	= rbd_queue_rq,
	.init_request	= rbd_init_request,
};

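/*
 * Note on the queue limits set up below: they are derived from the
 * object set size (object_size * stripe_count) so that requests line
 * up with RADOS objects where possible; discard granularity follows
 * the same boundary when trimming is enabled.
 */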
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003959static int rbd_init_disk(struct rbd_device *rbd_dev)
3960{
3961 struct gendisk *disk;
3962 struct request_queue *q;
Ilya Dryomov420efbd2018-04-16 09:32:18 +02003963 unsigned int objset_bytes =
3964 rbd_dev->layout.object_size * rbd_dev->layout.stripe_count;
Christoph Hellwig7ad18af2015-01-13 17:20:04 +01003965 int err;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003966
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003967 /* create gendisk info */
Ilya Dryomov7e513d42013-12-16 19:26:32 +02003968 disk = alloc_disk(single_major ?
3969 (1 << RBD_SINGLE_MAJOR_PART_SHIFT) :
3970 RBD_MINORS_PER_MAJOR);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003971 if (!disk)
Alex Elder1fcdb8a2012-08-29 17:11:06 -05003972 return -ENOMEM;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003973
Alex Elderf0f8cef2012-01-29 13:57:44 -06003974 snprintf(disk->disk_name, sizeof(disk->disk_name), RBD_DRV_NAME "%d",
Alex Elderde71a292012-07-03 16:01:19 -05003975 rbd_dev->dev_id);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003976 disk->major = rbd_dev->major;
Ilya Dryomovdd82fff2013-12-13 15:28:57 +02003977 disk->first_minor = rbd_dev->minor;
Ilya Dryomov7e513d42013-12-16 19:26:32 +02003978 if (single_major)
3979 disk->flags |= GENHD_FL_EXT_DEVT;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003980 disk->fops = &rbd_bd_ops;
3981 disk->private_data = rbd_dev;
3982
Christoph Hellwig7ad18af2015-01-13 17:20:04 +01003983 memset(&rbd_dev->tag_set, 0, sizeof(rbd_dev->tag_set));
3984 rbd_dev->tag_set.ops = &rbd_mq_ops;
Ilya Dryomovb5584182015-06-23 16:21:19 +03003985 rbd_dev->tag_set.queue_depth = rbd_dev->opts->queue_depth;
Christoph Hellwig7ad18af2015-01-13 17:20:04 +01003986 rbd_dev->tag_set.numa_node = NUMA_NO_NODE;
Ilya Dryomovb5584182015-06-23 16:21:19 +03003987 rbd_dev->tag_set.flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_SG_MERGE;
Christoph Hellwig7ad18af2015-01-13 17:20:04 +01003988 rbd_dev->tag_set.nr_hw_queues = 1;
3989 rbd_dev->tag_set.cmd_size = sizeof(struct work_struct);
3990
3991 err = blk_mq_alloc_tag_set(&rbd_dev->tag_set);
3992 if (err)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07003993 goto out_disk;
Josh Durgin029bcbd2011-07-22 11:35:23 -07003994
Christoph Hellwig7ad18af2015-01-13 17:20:04 +01003995 q = blk_mq_init_queue(&rbd_dev->tag_set);
3996 if (IS_ERR(q)) {
3997 err = PTR_ERR(q);
3998 goto out_tag_set;
3999 }
4000
Bart Van Assche8b904b52018-03-07 17:10:10 -08004001 blk_queue_flag_set(QUEUE_FLAG_NONROT, q);
Ilya Dryomovd8a2c892015-03-24 16:15:17 +03004002 /* QUEUE_FLAG_ADD_RANDOM is off by default for blk-mq */
Alex Elder593a9e72012-02-07 12:03:37 -06004003
Ilya Dryomov420efbd2018-04-16 09:32:18 +02004004 blk_queue_max_hw_sectors(q, objset_bytes >> SECTOR_SHIFT);
Ilya Dryomov0d9fde42015-10-07 16:09:35 +02004005 q->limits.max_sectors = queue_max_hw_sectors(q);
Ilya Dryomov21acdf42017-12-21 15:35:11 +01004006 blk_queue_max_segments(q, USHRT_MAX);
Ilya Dryomov24f1df62018-01-12 17:22:10 +01004007 blk_queue_max_segment_size(q, UINT_MAX);
Ilya Dryomov420efbd2018-04-16 09:32:18 +02004008 blk_queue_io_min(q, objset_bytes);
4009 blk_queue_io_opt(q, objset_bytes);
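	/*
	 * Worked example (assuming the common rbd defaults of 4 MiB
	 * objects and stripe_count 1, so objset_bytes = 4 MiB):
	 * max_hw_sectors becomes 4 MiB >> SECTOR_SHIFT = 8192 sectors,
	 * and io_min/io_opt both advertise 4 MiB so that the block
	 * layer prefers object-set-sized and -aligned I/O.
	 */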
Josh Durgin029bcbd2011-07-22 11:35:23 -07004010
Ilya Dryomovd9360542018-03-23 06:14:47 +01004011 if (rbd_dev->opts->trim) {
4012 blk_queue_flag_set(QUEUE_FLAG_DISCARD, q);
4013 q->limits.discard_granularity = objset_bytes;
4014 blk_queue_max_discard_sectors(q, objset_bytes >> SECTOR_SHIFT);
4015 blk_queue_max_write_zeroes_sectors(q, objset_bytes >> SECTOR_SHIFT);
4016 }
Guangliang Zhao90e98c52014-04-01 22:22:16 +08004017
Ronny Hegewaldbae818e2015-10-15 18:50:46 +00004018 if (!ceph_test_opt(rbd_dev->rbd_client->client, NOCRC))
Jan Karadc3b17c2017-02-02 15:56:50 +01004019 q->backing_dev_info->capabilities |= BDI_CAP_STABLE_WRITES;
Ronny Hegewaldbae818e2015-10-15 18:50:46 +00004020
Ilya Dryomov5769ed02017-04-13 12:17:38 +02004021 /*
4022 * disk_release() expects a queue ref from add_disk() and will
4023 * put it. Hold an extra ref until add_disk() is called.
4024 */
4025 WARN_ON(!blk_get_queue(q));
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004026 disk->queue = q;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004027 q->queuedata = rbd_dev;
4028
4029 rbd_dev->disk = disk;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004030
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004031 return 0;
Christoph Hellwig7ad18af2015-01-13 17:20:04 +01004032out_tag_set:
4033 blk_mq_free_tag_set(&rbd_dev->tag_set);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004034out_disk:
4035 put_disk(disk);
Christoph Hellwig7ad18af2015-01-13 17:20:04 +01004036 return err;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004037}
4038
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004039/*
4040 sysfs
4041*/
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004042
Alex Elder593a9e72012-02-07 12:03:37 -06004043static struct rbd_device *dev_to_rbd_dev(struct device *dev)
4044{
4045 return container_of(dev, struct rbd_device, dev);
4046}
4047
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004048static ssize_t rbd_size_show(struct device *dev,
4049 struct device_attribute *attr, char *buf)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004050{
Alex Elder593a9e72012-02-07 12:03:37 -06004051 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004052
Alex Elderfc71d832013-04-26 15:44:36 -05004053 return sprintf(buf, "%llu\n",
4054 (unsigned long long)rbd_dev->mapping.size);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004055}
4056
Alex Elder34b13182012-07-13 20:35:12 -05004057/*
4058 * Note this shows the features for whatever's mapped, which is not
4059 * necessarily the base image.
4060 */
4061static ssize_t rbd_features_show(struct device *dev,
4062 struct device_attribute *attr, char *buf)
4063{
4064 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
4065
4066 return sprintf(buf, "0x%016llx\n",
Alex Elderfc71d832013-04-26 15:44:36 -05004067 (unsigned long long)rbd_dev->mapping.features);
Alex Elder34b13182012-07-13 20:35:12 -05004068}
4069
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004070static ssize_t rbd_major_show(struct device *dev,
4071 struct device_attribute *attr, char *buf)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004072{
Alex Elder593a9e72012-02-07 12:03:37 -06004073 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004074
Alex Elderfc71d832013-04-26 15:44:36 -05004075 if (rbd_dev->major)
4076 return sprintf(buf, "%d\n", rbd_dev->major);
4077
4078 return sprintf(buf, "(none)\n");
Ilya Dryomovdd82fff2013-12-13 15:28:57 +02004079}
Alex Elderfc71d832013-04-26 15:44:36 -05004080
Ilya Dryomovdd82fff2013-12-13 15:28:57 +02004081static ssize_t rbd_minor_show(struct device *dev,
4082 struct device_attribute *attr, char *buf)
4083{
4084 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
4085
4086 return sprintf(buf, "%d\n", rbd_dev->minor);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004087}
4088
Ilya Dryomov005a07bf2016-08-18 18:38:43 +02004089static ssize_t rbd_client_addr_show(struct device *dev,
4090 struct device_attribute *attr, char *buf)
4091{
4092 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
4093 struct ceph_entity_addr *client_addr =
4094 ceph_client_addr(rbd_dev->rbd_client->client);
4095
4096 return sprintf(buf, "%pISpc/%u\n", &client_addr->in_addr,
4097 le32_to_cpu(client_addr->nonce));
4098}
4099
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004100static ssize_t rbd_client_id_show(struct device *dev,
4101 struct device_attribute *attr, char *buf)
4102{
Alex Elder593a9e72012-02-07 12:03:37 -06004103 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004104
Alex Elder1dbb4392012-01-24 10:08:37 -06004105 return sprintf(buf, "client%lld\n",
Ilya Dryomov033268a2016-08-12 14:59:58 +02004106 ceph_client_gid(rbd_dev->rbd_client->client));
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004107}
4108
Mike Christie267fb902016-08-18 18:38:43 +02004109static ssize_t rbd_cluster_fsid_show(struct device *dev,
4110 struct device_attribute *attr, char *buf)
4111{
4112 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
4113
4114 return sprintf(buf, "%pU\n", &rbd_dev->rbd_client->client->fsid);
4115}
4116
Mike Christie0d6d1e9c2016-08-18 18:38:45 +02004117static ssize_t rbd_config_info_show(struct device *dev,
4118 struct device_attribute *attr, char *buf)
4119{
4120 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
4121
4122 return sprintf(buf, "%s\n", rbd_dev->config_info);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004123}
4124
4125static ssize_t rbd_pool_show(struct device *dev,
4126 struct device_attribute *attr, char *buf)
4127{
Alex Elder593a9e72012-02-07 12:03:37 -06004128 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004129
Alex Elder0d7dbfc2012-10-25 23:34:41 -05004130 return sprintf(buf, "%s\n", rbd_dev->spec->pool_name);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004131}
4132
Alex Elder9bb2f332012-07-12 10:46:35 -05004133static ssize_t rbd_pool_id_show(struct device *dev,
4134 struct device_attribute *attr, char *buf)
4135{
4136 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
4137
Alex Elder0d7dbfc2012-10-25 23:34:41 -05004138 return sprintf(buf, "%llu\n",
Alex Elderfc71d832013-04-26 15:44:36 -05004139 (unsigned long long) rbd_dev->spec->pool_id);
Alex Elder9bb2f332012-07-12 10:46:35 -05004140}
4141
Ilya Dryomovb26c0472018-07-03 15:28:43 +02004142static ssize_t rbd_pool_ns_show(struct device *dev,
4143 struct device_attribute *attr, char *buf)
4144{
4145 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
4146
4147 return sprintf(buf, "%s\n", rbd_dev->spec->pool_ns ?: "");
4148}
4149
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004150static ssize_t rbd_name_show(struct device *dev,
4151 struct device_attribute *attr, char *buf)
4152{
Alex Elder593a9e72012-02-07 12:03:37 -06004153 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004154
Alex Eldera92ffdf2012-10-30 19:40:33 -05004155 if (rbd_dev->spec->image_name)
4156 return sprintf(buf, "%s\n", rbd_dev->spec->image_name);
4157
4158 return sprintf(buf, "(unknown)\n");
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004159}
4160
Alex Elder589d30e2012-07-10 20:30:11 -05004161static ssize_t rbd_image_id_show(struct device *dev,
4162 struct device_attribute *attr, char *buf)
4163{
4164 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
4165
Alex Elder0d7dbfc2012-10-25 23:34:41 -05004166 return sprintf(buf, "%s\n", rbd_dev->spec->image_id);
Alex Elder589d30e2012-07-10 20:30:11 -05004167}
4168
Alex Elder34b13182012-07-13 20:35:12 -05004169/*
4170 * Shows the name of the currently-mapped snapshot (or
4171 * RBD_SNAP_HEAD_NAME for the base image).
4172 */
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004173static ssize_t rbd_snap_show(struct device *dev,
4174 struct device_attribute *attr,
4175 char *buf)
4176{
Alex Elder593a9e72012-02-07 12:03:37 -06004177 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004178
Alex Elder0d7dbfc2012-10-25 23:34:41 -05004179 return sprintf(buf, "%s\n", rbd_dev->spec->snap_name);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004180}
4181
Mike Christie92a58672016-08-18 18:38:44 +02004182static ssize_t rbd_snap_id_show(struct device *dev,
4183 struct device_attribute *attr, char *buf)
4184{
4185 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
4186
4187 return sprintf(buf, "%llu\n", rbd_dev->spec->snap_id);
4188}
4189
Alex Elder86b00e02012-10-25 23:34:42 -05004190/*
Ilya Dryomovff961282014-07-22 21:53:07 +04004191 * For a v2 image, shows the chain of parent images, separated by empty
4192 * lines. For v1 images or if there is no parent, shows "(no parent
4193 * image)".
Alex Elder86b00e02012-10-25 23:34:42 -05004194 */
4195static ssize_t rbd_parent_show(struct device *dev,
Ilya Dryomovff961282014-07-22 21:53:07 +04004196 struct device_attribute *attr,
4197 char *buf)
Alex Elder86b00e02012-10-25 23:34:42 -05004198{
4199 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Ilya Dryomovff961282014-07-22 21:53:07 +04004200 ssize_t count = 0;
Alex Elder86b00e02012-10-25 23:34:42 -05004201
Ilya Dryomovff961282014-07-22 21:53:07 +04004202 if (!rbd_dev->parent)
Alex Elder86b00e02012-10-25 23:34:42 -05004203 return sprintf(buf, "(no parent image)\n");
4204
Ilya Dryomovff961282014-07-22 21:53:07 +04004205 for ( ; rbd_dev->parent; rbd_dev = rbd_dev->parent) {
4206 struct rbd_spec *spec = rbd_dev->parent_spec;
Alex Elder86b00e02012-10-25 23:34:42 -05004207
Ilya Dryomovff961282014-07-22 21:53:07 +04004208 count += sprintf(&buf[count], "%s"
4209 "pool_id %llu\npool_name %s\n"
4210 "image_id %s\nimage_name %s\n"
4211 "snap_id %llu\nsnap_name %s\n"
4212 "overlap %llu\n",
4213 !count ? "" : "\n", /* first? */
4214 spec->pool_id, spec->pool_name,
4215 spec->image_id, spec->image_name ?: "(unknown)",
4216 spec->snap_id, spec->snap_name,
4217 rbd_dev->parent_overlap);
4218 }
Alex Elder86b00e02012-10-25 23:34:42 -05004219
Ilya Dryomovff961282014-07-22 21:53:07 +04004220 return count;
Alex Elder86b00e02012-10-25 23:34:42 -05004221}
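/*
 * For illustration, a single-level clone might render as (hypothetical
 * values, produced by the sprintf() format above):
 *
 *	pool_id 2
 *	pool_name rbd
 *	image_id 1014b76b8b4567
 *	image_name parent-img
 *	snap_id 4
 *	snap_name base
 *	overlap 10737418240
 *
 * Each additional ancestor in the chain is separated by an empty line.
 */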
4222
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004223static ssize_t rbd_image_refresh(struct device *dev,
4224 struct device_attribute *attr,
4225 const char *buf,
4226 size_t size)
4227{
Alex Elder593a9e72012-02-07 12:03:37 -06004228 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Alex Elderb8136232012-07-25 09:32:41 -05004229 int ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004230
Alex Eldercc4a38bd2013-04-30 00:44:33 -05004231 ret = rbd_dev_refresh(rbd_dev);
Alex Eldere627db02013-05-06 07:40:30 -05004232 if (ret)
Ilya Dryomov52bb1f92014-07-23 17:11:20 +04004233 return ret;
Alex Elderb8136232012-07-25 09:32:41 -05004234
Ilya Dryomov52bb1f92014-07-23 17:11:20 +04004235 return size;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004236}
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004237
Joe Perches5657a812018-05-24 13:38:59 -06004238static DEVICE_ATTR(size, 0444, rbd_size_show, NULL);
4239static DEVICE_ATTR(features, 0444, rbd_features_show, NULL);
4240static DEVICE_ATTR(major, 0444, rbd_major_show, NULL);
4241static DEVICE_ATTR(minor, 0444, rbd_minor_show, NULL);
4242static DEVICE_ATTR(client_addr, 0444, rbd_client_addr_show, NULL);
4243static DEVICE_ATTR(client_id, 0444, rbd_client_id_show, NULL);
4244static DEVICE_ATTR(cluster_fsid, 0444, rbd_cluster_fsid_show, NULL);
4245static DEVICE_ATTR(config_info, 0400, rbd_config_info_show, NULL);
4246static DEVICE_ATTR(pool, 0444, rbd_pool_show, NULL);
4247static DEVICE_ATTR(pool_id, 0444, rbd_pool_id_show, NULL);
Ilya Dryomovb26c0472018-07-03 15:28:43 +02004248static DEVICE_ATTR(pool_ns, 0444, rbd_pool_ns_show, NULL);
Joe Perches5657a812018-05-24 13:38:59 -06004249static DEVICE_ATTR(name, 0444, rbd_name_show, NULL);
4250static DEVICE_ATTR(image_id, 0444, rbd_image_id_show, NULL);
4251static DEVICE_ATTR(refresh, 0200, NULL, rbd_image_refresh);
4252static DEVICE_ATTR(current_snap, 0444, rbd_snap_show, NULL);
4253static DEVICE_ATTR(snap_id, 0444, rbd_snap_id_show, NULL);
4254static DEVICE_ATTR(parent, 0444, rbd_parent_show, NULL);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004255
4256static struct attribute *rbd_attrs[] = {
4257 &dev_attr_size.attr,
Alex Elder34b13182012-07-13 20:35:12 -05004258 &dev_attr_features.attr,
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004259 &dev_attr_major.attr,
Ilya Dryomovdd82fff2013-12-13 15:28:57 +02004260 &dev_attr_minor.attr,
Ilya Dryomov005a07bf2016-08-18 18:38:43 +02004261 &dev_attr_client_addr.attr,
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004262 &dev_attr_client_id.attr,
Mike Christie267fb902016-08-18 18:38:43 +02004263 &dev_attr_cluster_fsid.attr,
Mike Christie0d6d1e9c2016-08-18 18:38:45 +02004264 &dev_attr_config_info.attr,
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004265 &dev_attr_pool.attr,
Alex Elder9bb2f332012-07-12 10:46:35 -05004266 &dev_attr_pool_id.attr,
Ilya Dryomovb26c0472018-07-03 15:28:43 +02004267 &dev_attr_pool_ns.attr,
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004268 &dev_attr_name.attr,
Alex Elder589d30e2012-07-10 20:30:11 -05004269 &dev_attr_image_id.attr,
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004270 &dev_attr_current_snap.attr,
Mike Christie92a58672016-08-18 18:38:44 +02004271 &dev_attr_snap_id.attr,
Alex Elder86b00e02012-10-25 23:34:42 -05004272 &dev_attr_parent.attr,
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004273 &dev_attr_refresh.attr,
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004274 NULL
4275};
4276
4277static struct attribute_group rbd_attr_group = {
4278 .attrs = rbd_attrs,
4279};
4280
4281static const struct attribute_group *rbd_attr_groups[] = {
4282 &rbd_attr_group,
4283 NULL
4284};
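/*
 * Taken together, these attributes appear under the rbd bus in sysfs;
 * for a hypothetical first mapped device (id 0) the layout looks like:
 *
 *	/sys/bus/rbd/devices/0/size
 *	/sys/bus/rbd/devices/0/pool
 *	/sys/bus/rbd/devices/0/current_snap
 *	/sys/bus/rbd/devices/0/parent
 *	...
 *
 * All attributes are world-readable (0444) except refresh, which is
 * write-only (0200), and config_info, which is root-readable (0400).
 */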
4285
Ilya Dryomov6cac4692015-10-16 20:11:25 +02004286static void rbd_dev_release(struct device *dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004287
Bhumika Goyalb9942bc2017-02-11 12:14:38 +05304288static const struct device_type rbd_device_type = {
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004289 .name = "rbd",
4290 .groups = rbd_attr_groups,
Ilya Dryomov6cac4692015-10-16 20:11:25 +02004291 .release = rbd_dev_release,
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004292};
4293
Alex Elder8b8fb992012-10-26 17:25:24 -05004294static struct rbd_spec *rbd_spec_get(struct rbd_spec *spec)
4295{
4296 kref_get(&spec->kref);
4297
4298 return spec;
4299}
4300
4301static void rbd_spec_free(struct kref *kref);
4302static void rbd_spec_put(struct rbd_spec *spec)
4303{
4304 if (spec)
4305 kref_put(&spec->kref, rbd_spec_free);
4306}
4307
4308static struct rbd_spec *rbd_spec_alloc(void)
4309{
4310 struct rbd_spec *spec;
4311
4312 spec = kzalloc(sizeof (*spec), GFP_KERNEL);
4313 if (!spec)
4314 return NULL;
Ilya Dryomov04077592014-07-23 17:11:20 +04004315
4316 spec->pool_id = CEPH_NOPOOL;
4317 spec->snap_id = CEPH_NOSNAP;
Alex Elder8b8fb992012-10-26 17:25:24 -05004318 kref_init(&spec->kref);
4319
Alex Elder8b8fb992012-10-26 17:25:24 -05004320 return spec;
4321}
4322
4323static void rbd_spec_free(struct kref *kref)
4324{
4325 struct rbd_spec *spec = container_of(kref, struct rbd_spec, kref);
4326
4327 kfree(spec->pool_name);
Ilya Dryomovb26c0472018-07-03 15:28:43 +02004328 kfree(spec->pool_ns);
Alex Elder8b8fb992012-10-26 17:25:24 -05004329 kfree(spec->image_id);
4330 kfree(spec->image_name);
4331 kfree(spec->snap_name);
4332 kfree(spec);
4333}
4334
Ilya Dryomov1643dfa2016-08-12 15:45:52 +02004335static void rbd_dev_free(struct rbd_device *rbd_dev)
Ilya Dryomovdd5ac322015-10-16 17:09:24 +02004336{
Ilya Dryomov99d16942016-08-12 16:11:41 +02004337 WARN_ON(rbd_dev->watch_state != RBD_WATCH_STATE_UNREGISTERED);
Ilya Dryomoved95b212016-08-12 16:40:02 +02004338 WARN_ON(rbd_dev->lock_state != RBD_LOCK_STATE_UNLOCKED);
Ilya Dryomovdd5ac322015-10-16 17:09:24 +02004339
Ilya Dryomovc41d13a2016-04-29 20:01:25 +02004340 ceph_oid_destroy(&rbd_dev->header_oid);
Ilya Dryomov6b6dddb2016-08-05 16:15:38 +02004341 ceph_oloc_destroy(&rbd_dev->header_oloc);
Mike Christie0d6d1e9c2016-08-18 18:38:45 +02004342 kfree(rbd_dev->config_info);
Ilya Dryomovc41d13a2016-04-29 20:01:25 +02004343
Ilya Dryomovdd5ac322015-10-16 17:09:24 +02004344 rbd_put_client(rbd_dev->rbd_client);
4345 rbd_spec_put(rbd_dev->spec);
4346 kfree(rbd_dev->opts);
4347 kfree(rbd_dev);
Ilya Dryomov1643dfa2016-08-12 15:45:52 +02004348}
4349
4350static void rbd_dev_release(struct device *dev)
4351{
4352 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
4353 bool need_put = !!rbd_dev->opts;
4354
4355 if (need_put) {
4356 destroy_workqueue(rbd_dev->task_wq);
4357 ida_simple_remove(&rbd_dev_id_ida, rbd_dev->dev_id);
4358 }
4359
4360 rbd_dev_free(rbd_dev);
Ilya Dryomovdd5ac322015-10-16 17:09:24 +02004361
4362 /*
4363	 * This is racy, but way better than dropping the module reference
4364	 * outside of the release callback. The race window is pretty small, so
4365 * doing something similar to dm (dm-builtin.c) is overkill.
4366 */
4367 if (need_put)
4368 module_put(THIS_MODULE);
4369}
4370
Ilya Dryomov1643dfa2016-08-12 15:45:52 +02004371static struct rbd_device *__rbd_dev_create(struct rbd_client *rbdc,
4372 struct rbd_spec *spec)
Alex Elderc53d5892012-10-25 23:34:42 -05004373{
4374 struct rbd_device *rbd_dev;
4375
Ilya Dryomov1643dfa2016-08-12 15:45:52 +02004376 rbd_dev = kzalloc(sizeof(*rbd_dev), GFP_KERNEL);
Alex Elderc53d5892012-10-25 23:34:42 -05004377 if (!rbd_dev)
4378 return NULL;
4379
4380 spin_lock_init(&rbd_dev->lock);
4381 INIT_LIST_HEAD(&rbd_dev->node);
Alex Elderc53d5892012-10-25 23:34:42 -05004382 init_rwsem(&rbd_dev->header_rwsem);
4383
Ilya Dryomov7e973322017-01-25 18:16:22 +01004384 rbd_dev->header.data_pool_id = CEPH_NOPOOL;
Ilya Dryomovc41d13a2016-04-29 20:01:25 +02004385 ceph_oid_init(&rbd_dev->header_oid);
Ilya Dryomov431a02c2017-01-25 18:16:21 +01004386 rbd_dev->header_oloc.pool = spec->pool_id;
Ilya Dryomovb26c0472018-07-03 15:28:43 +02004387 if (spec->pool_ns) {
4388 WARN_ON(!*spec->pool_ns);
4389 rbd_dev->header_oloc.pool_ns =
4390 ceph_find_or_create_string(spec->pool_ns,
4391 strlen(spec->pool_ns));
4392 }
Ilya Dryomovc41d13a2016-04-29 20:01:25 +02004393
Ilya Dryomov99d16942016-08-12 16:11:41 +02004394 mutex_init(&rbd_dev->watch_mutex);
4395 rbd_dev->watch_state = RBD_WATCH_STATE_UNREGISTERED;
4396 INIT_DELAYED_WORK(&rbd_dev->watch_dwork, rbd_reregister_watch);
4397
Ilya Dryomoved95b212016-08-12 16:40:02 +02004398 init_rwsem(&rbd_dev->lock_rwsem);
4399 rbd_dev->lock_state = RBD_LOCK_STATE_UNLOCKED;
4400 INIT_WORK(&rbd_dev->acquired_lock_work, rbd_notify_acquired_lock);
4401 INIT_WORK(&rbd_dev->released_lock_work, rbd_notify_released_lock);
4402 INIT_DELAYED_WORK(&rbd_dev->lock_dwork, rbd_acquire_lock);
4403 INIT_WORK(&rbd_dev->unlock_work, rbd_release_lock_work);
4404 init_waitqueue_head(&rbd_dev->lock_waitq);
4405
Ilya Dryomovdd5ac322015-10-16 17:09:24 +02004406 rbd_dev->dev.bus = &rbd_bus_type;
4407 rbd_dev->dev.type = &rbd_device_type;
4408 rbd_dev->dev.parent = &rbd_root_dev;
Ilya Dryomovdd5ac322015-10-16 17:09:24 +02004409 device_initialize(&rbd_dev->dev);
4410
Alex Elderc53d5892012-10-25 23:34:42 -05004411 rbd_dev->rbd_client = rbdc;
Ilya Dryomovd1475432015-06-22 13:24:48 +03004412 rbd_dev->spec = spec;
Alex Elder0903e872012-11-14 12:25:19 -06004413
Alex Elderc53d5892012-10-25 23:34:42 -05004414 return rbd_dev;
4415}
4416
Ilya Dryomov1643dfa2016-08-12 15:45:52 +02004417/*
4418 * Create an rbd_dev for a mapping.
4419 */
4420static struct rbd_device *rbd_dev_create(struct rbd_client *rbdc,
4421 struct rbd_spec *spec,
4422 struct rbd_options *opts)
4423{
4424 struct rbd_device *rbd_dev;
4425
4426 rbd_dev = __rbd_dev_create(rbdc, spec);
4427 if (!rbd_dev)
4428 return NULL;
4429
4430 rbd_dev->opts = opts;
4431
4432 /* get an id and fill in device name */
4433 rbd_dev->dev_id = ida_simple_get(&rbd_dev_id_ida, 0,
4434 minor_to_rbd_dev_id(1 << MINORBITS),
4435 GFP_KERNEL);
4436 if (rbd_dev->dev_id < 0)
4437 goto fail_rbd_dev;
4438
4439 sprintf(rbd_dev->name, RBD_DRV_NAME "%d", rbd_dev->dev_id);
4440 rbd_dev->task_wq = alloc_ordered_workqueue("%s-tasks", WQ_MEM_RECLAIM,
4441 rbd_dev->name);
4442 if (!rbd_dev->task_wq)
4443 goto fail_dev_id;
4444
4445 /* we have a ref from do_rbd_add() */
4446 __module_get(THIS_MODULE);
4447
4448 dout("%s rbd_dev %p dev_id %d\n", __func__, rbd_dev, rbd_dev->dev_id);
4449 return rbd_dev;
4450
4451fail_dev_id:
4452 ida_simple_remove(&rbd_dev_id_ida, rbd_dev->dev_id);
4453fail_rbd_dev:
4454 rbd_dev_free(rbd_dev);
4455 return NULL;
4456}
4457
Alex Elderc53d5892012-10-25 23:34:42 -05004458static void rbd_dev_destroy(struct rbd_device *rbd_dev)
4459{
Ilya Dryomovdd5ac322015-10-16 17:09:24 +02004460 if (rbd_dev)
4461 put_device(&rbd_dev->dev);
Alex Elderc53d5892012-10-25 23:34:42 -05004462}
4463
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004464/*
Alex Elder9d475de2012-07-03 16:01:19 -05004465 * Get the size and object order for an image snapshot, or if
4466 * snap_id is CEPH_NOSNAP, get this information for the base
4467 * image.
4468 */
4469static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id,
4470 u8 *order, u64 *snap_size)
4471{
4472 __le64 snapid = cpu_to_le64(snap_id);
4473 int ret;
4474 struct {
4475 u8 order;
4476 __le64 size;
4477 } __attribute__ ((packed)) size_buf = { 0 };
4478
Ilya Dryomovecd4a682017-01-25 18:16:21 +01004479 ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
4480 &rbd_dev->header_oloc, "get_size",
4481 &snapid, sizeof(snapid),
4482 &size_buf, sizeof(size_buf));
Alex Elder36be9a72013-01-19 00:30:28 -06004483 dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
Alex Elder9d475de2012-07-03 16:01:19 -05004484 if (ret < 0)
4485 return ret;
Alex Elder57385b52013-04-21 12:14:45 -05004486 if (ret < sizeof (size_buf))
4487 return -ERANGE;
Alex Elder9d475de2012-07-03 16:01:19 -05004488
Josh Durginc3545572013-08-28 17:08:10 -07004489 if (order) {
Alex Elderc86f86e2013-04-25 15:09:41 -05004490 *order = size_buf.order;
Josh Durginc3545572013-08-28 17:08:10 -07004491 dout(" order %u", (unsigned int)*order);
4492 }
Alex Elder9d475de2012-07-03 16:01:19 -05004493 *snap_size = le64_to_cpu(size_buf.size);
4494
Josh Durginc3545572013-08-28 17:08:10 -07004495 dout(" snap_id 0x%016llx snap_size = %llu\n",
4496 (unsigned long long)snap_id,
Alex Elder57385b52013-04-21 12:14:45 -05004497 (unsigned long long)*snap_size);
Alex Elder9d475de2012-07-03 16:01:19 -05004498
4499 return 0;
4500}
4501
4502static int rbd_dev_v2_image_size(struct rbd_device *rbd_dev)
4503{
4504 return _rbd_dev_v2_snap_size(rbd_dev, CEPH_NOSNAP,
4505 &rbd_dev->header.obj_order,
4506 &rbd_dev->header.image_size);
4507}
4508
Alex Elder1e130192012-07-03 16:01:19 -05004509static int rbd_dev_v2_object_prefix(struct rbd_device *rbd_dev)
4510{
4511 void *reply_buf;
4512 int ret;
4513 void *p;
4514
4515 reply_buf = kzalloc(RBD_OBJ_PREFIX_LEN_MAX, GFP_KERNEL);
4516 if (!reply_buf)
4517 return -ENOMEM;
4518
Ilya Dryomovecd4a682017-01-25 18:16:21 +01004519 ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
4520 &rbd_dev->header_oloc, "get_object_prefix",
4521 NULL, 0, reply_buf, RBD_OBJ_PREFIX_LEN_MAX);
Alex Elder36be9a72013-01-19 00:30:28 -06004522 dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
Alex Elder1e130192012-07-03 16:01:19 -05004523 if (ret < 0)
4524 goto out;
4525
4526 p = reply_buf;
4527 rbd_dev->header.object_prefix = ceph_extract_encoded_string(&p,
Alex Elder57385b52013-04-21 12:14:45 -05004528 p + ret, NULL, GFP_NOIO);
4529 ret = 0;
Alex Elder1e130192012-07-03 16:01:19 -05004530
4531 if (IS_ERR(rbd_dev->header.object_prefix)) {
4532 ret = PTR_ERR(rbd_dev->header.object_prefix);
4533 rbd_dev->header.object_prefix = NULL;
4534 } else {
4535 dout(" object_prefix = %s\n", rbd_dev->header.object_prefix);
4536 }
Alex Elder1e130192012-07-03 16:01:19 -05004537out:
4538 kfree(reply_buf);
4539
4540 return ret;
4541}
4542
Alex Elderb1b54022012-07-03 16:01:19 -05004543static int _rbd_dev_v2_snap_features(struct rbd_device *rbd_dev, u64 snap_id,
4544 u64 *snap_features)
4545{
4546 __le64 snapid = cpu_to_le64(snap_id);
4547 struct {
4548 __le64 features;
4549 __le64 incompat;
Alex Elder41579762013-04-21 12:14:45 -05004550 } __attribute__ ((packed)) features_buf = { 0 };
Ilya Dryomovd3767f02016-04-13 14:15:50 +02004551 u64 unsup;
Alex Elderb1b54022012-07-03 16:01:19 -05004552 int ret;
4553
Ilya Dryomovecd4a682017-01-25 18:16:21 +01004554 ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
4555 &rbd_dev->header_oloc, "get_features",
4556 &snapid, sizeof(snapid),
4557 &features_buf, sizeof(features_buf));
Alex Elder36be9a72013-01-19 00:30:28 -06004558 dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
Alex Elderb1b54022012-07-03 16:01:19 -05004559 if (ret < 0)
4560 return ret;
Alex Elder57385b52013-04-21 12:14:45 -05004561 if (ret < sizeof (features_buf))
4562 return -ERANGE;
Alex Elderd8891402012-10-09 13:50:17 -07004563
Ilya Dryomovd3767f02016-04-13 14:15:50 +02004564 unsup = le64_to_cpu(features_buf.incompat) & ~RBD_FEATURES_SUPPORTED;
4565 if (unsup) {
4566 rbd_warn(rbd_dev, "image uses unsupported features: 0x%llx",
4567 unsup);
Alex Elderb8f5c6e2012-11-01 08:39:26 -05004568 return -ENXIO;
Ilya Dryomovd3767f02016-04-13 14:15:50 +02004569 }
Alex Elderd8891402012-10-09 13:50:17 -07004570
Alex Elderb1b54022012-07-03 16:01:19 -05004571 *snap_features = le64_to_cpu(features_buf.features);
4572
4573 dout(" snap_id 0x%016llx features = 0x%016llx incompat = 0x%016llx\n",
Alex Elder57385b52013-04-21 12:14:45 -05004574 (unsigned long long)snap_id,
4575 (unsigned long long)*snap_features,
4576 (unsigned long long)le64_to_cpu(features_buf.incompat));
Alex Elderb1b54022012-07-03 16:01:19 -05004577
4578 return 0;
4579}
4580
4581static int rbd_dev_v2_features(struct rbd_device *rbd_dev)
4582{
4583 return _rbd_dev_v2_snap_features(rbd_dev, CEPH_NOSNAP,
4584 &rbd_dev->header.features);
4585}
4586
Ilya Dryomoveb3b2d62018-08-22 17:11:27 +02004587struct parent_image_info {
4588 u64 pool_id;
4589 const char *image_id;
4590 u64 snap_id;
4591
4592 u64 overlap;
4593};
4594
4595/*
4596 * The caller is responsible for freeing @pii (in particular pii.image_id).
4597 */
4598static int __get_parent_info_legacy(struct rbd_device *rbd_dev,
4599 struct page *req_page,
4600 struct page *reply_page,
4601 struct parent_image_info *pii)
4602{
4603 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
4604 size_t reply_len = PAGE_SIZE;
4605 void *p, *end;
4606 int ret;
4607
4608 ret = ceph_osdc_call(osdc, &rbd_dev->header_oid, &rbd_dev->header_oloc,
4609 "rbd", "get_parent", CEPH_OSD_FLAG_READ,
4610 req_page, sizeof(u64), reply_page, &reply_len);
4611 if (ret)
4612 return ret;
4613
4614 p = page_address(reply_page);
4615 end = p + reply_len;
4616 ceph_decode_64_safe(&p, end, pii->pool_id, e_inval);
4617 pii->image_id = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL);
4618 if (IS_ERR(pii->image_id)) {
4619 ret = PTR_ERR(pii->image_id);
4620 pii->image_id = NULL;
4621 return ret;
4622 }
4623 ceph_decode_64_safe(&p, end, pii->snap_id, e_inval);
4624 ceph_decode_64_safe(&p, end, pii->overlap, e_inval);
4625
4626 return 0;
4627
4628e_inval:
4629 return -EINVAL;
4630}
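/*
 * For reference, the "get_parent" reply decoded above has this layout
 * (inferred from the decode calls, in encoding order):
 *
 *	le64	pool_id
 *	string	image_id	(le32 length followed by the bytes)
 *	le64	snap_id
 *	le64	overlap
 */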
4631
4632static int get_parent_info(struct rbd_device *rbd_dev,
4633 struct parent_image_info *pii)
4634{
4635 struct page *req_page, *reply_page;
4636 void *p;
4637 int ret;
4638
4639 req_page = alloc_page(GFP_KERNEL);
4640 if (!req_page)
4641 return -ENOMEM;
4642
4643 reply_page = alloc_page(GFP_KERNEL);
4644 if (!reply_page) {
4645 __free_page(req_page);
4646 return -ENOMEM;
4647 }
4648
4649 p = page_address(req_page);
4650 ceph_encode_64(&p, rbd_dev->spec->snap_id);
4651 ret = __get_parent_info_legacy(rbd_dev, req_page, reply_page, pii);
4652
4653 __free_page(req_page);
4654 __free_page(reply_page);
4655 return ret;
4656}
4657
Alex Elder86b00e02012-10-25 23:34:42 -05004658static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev)
4659{
4660 struct rbd_spec *parent_spec;
Ilya Dryomoveb3b2d62018-08-22 17:11:27 +02004661 struct parent_image_info pii = { 0 };
Alex Elder86b00e02012-10-25 23:34:42 -05004662 int ret;
4663
4664 parent_spec = rbd_spec_alloc();
4665 if (!parent_spec)
4666 return -ENOMEM;
4667
Ilya Dryomoveb3b2d62018-08-22 17:11:27 +02004668 ret = get_parent_info(rbd_dev, &pii);
4669 if (ret)
Alex Elder86b00e02012-10-25 23:34:42 -05004670 goto out_err;
4671
Ilya Dryomoveb3b2d62018-08-22 17:11:27 +02004672 dout("%s pool_id %llu image_id %s snap_id %llu overlap %llu\n",
4673 __func__, pii.pool_id, pii.image_id, pii.snap_id, pii.overlap);
4674
4675 if (pii.pool_id == CEPH_NOPOOL) {
Alex Elder392a9da2013-05-06 17:40:33 -05004676 /*
4677		 * Either the parent never existed, or we have a
4678		 * record of it but the image got flattened so it no
4679 * longer has a parent. When the parent of a
4680 * layered image disappears we immediately set the
4681 * overlap to 0. The effect of this is that all new
4682 * requests will be treated as if the image had no
4683 * parent.
4684 */
4685 if (rbd_dev->parent_overlap) {
4686 rbd_dev->parent_overlap = 0;
Alex Elder392a9da2013-05-06 17:40:33 -05004687 rbd_dev_parent_put(rbd_dev);
4688 pr_info("%s: clone image has been flattened\n",
4689 rbd_dev->disk->disk_name);
4690 }
4691
Alex Elder86b00e02012-10-25 23:34:42 -05004692 goto out; /* No parent? No problem. */
Alex Elder392a9da2013-05-06 17:40:33 -05004693 }
Alex Elder86b00e02012-10-25 23:34:42 -05004694
Alex Elder0903e872012-11-14 12:25:19 -06004695 /* The ceph file layout needs to fit pool id in 32 bits */
4696
4697 ret = -EIO;
Ilya Dryomoveb3b2d62018-08-22 17:11:27 +02004698 if (pii.pool_id > (u64)U32_MAX) {
Ilya Dryomov9584d502014-07-11 12:11:20 +04004699 rbd_warn(NULL, "parent pool id too large (%llu > %u)",
Ilya Dryomoveb3b2d62018-08-22 17:11:27 +02004700 (unsigned long long)pii.pool_id, U32_MAX);
Alex Elder57385b52013-04-21 12:14:45 -05004701 goto out_err;
Alex Elderc0cd10db2013-04-26 09:43:47 -05004702 }
Alex Elder0903e872012-11-14 12:25:19 -06004703
Alex Elder3b5cf2a2013-05-29 11:18:59 -05004704 /*
4705	 * The parent won't change (except when the clone is
4706	 * flattened, which is handled above). So we only need to
4707	 * record the parent spec if we have not already done so.
4708 */
4709 if (!rbd_dev->parent_spec) {
Ilya Dryomoveb3b2d62018-08-22 17:11:27 +02004710 parent_spec->pool_id = pii.pool_id;
4711 parent_spec->image_id = pii.image_id;
4712 pii.image_id = NULL;
4713 parent_spec->snap_id = pii.snap_id;
Ilya Dryomovb26c0472018-07-03 15:28:43 +02004714
4715 /* TODO: support cloning across namespaces */
4716 if (rbd_dev->spec->pool_ns) {
4717 parent_spec->pool_ns = kstrdup(rbd_dev->spec->pool_ns,
4718 GFP_KERNEL);
4719 if (!parent_spec->pool_ns) {
4720 ret = -ENOMEM;
4721 goto out_err;
4722 }
4723 }
4724
Alex Elder70cf49c2013-05-06 17:40:33 -05004725 rbd_dev->parent_spec = parent_spec;
4726 parent_spec = NULL; /* rbd_dev now owns this */
Alex Elder3b5cf2a2013-05-29 11:18:59 -05004727 }
4728
4729 /*
Ilya Dryomovcf32bd92015-01-19 22:57:39 +03004730 * We always update the parent overlap. If it's zero we issue
4731 * a warning, as we will proceed as if there was no parent.
Alex Elder3b5cf2a2013-05-29 11:18:59 -05004732 */
Ilya Dryomoveb3b2d62018-08-22 17:11:27 +02004733 if (!pii.overlap) {
Alex Elder3b5cf2a2013-05-29 11:18:59 -05004734 if (parent_spec) {
Ilya Dryomovcf32bd92015-01-19 22:57:39 +03004735 /* refresh, careful to warn just once */
4736 if (rbd_dev->parent_overlap)
4737 rbd_warn(rbd_dev,
4738 "clone now standalone (overlap became 0)");
Alex Elder3b5cf2a2013-05-29 11:18:59 -05004739 } else {
Ilya Dryomovcf32bd92015-01-19 22:57:39 +03004740 /* initial probe */
4741 rbd_warn(rbd_dev, "clone is standalone (overlap 0)");
Alex Elder3b5cf2a2013-05-29 11:18:59 -05004742 }
Alex Elder70cf49c2013-05-06 17:40:33 -05004743 }
Ilya Dryomoveb3b2d62018-08-22 17:11:27 +02004744 rbd_dev->parent_overlap = pii.overlap;
Ilya Dryomovcf32bd92015-01-19 22:57:39 +03004745
Alex Elder86b00e02012-10-25 23:34:42 -05004746out:
4747 ret = 0;
4748out_err:
Ilya Dryomoveb3b2d62018-08-22 17:11:27 +02004749 kfree(pii.image_id);
Alex Elder86b00e02012-10-25 23:34:42 -05004750 rbd_spec_put(parent_spec);
Alex Elder86b00e02012-10-25 23:34:42 -05004751 return ret;
4752}
4753
Alex Eldercc070d52013-04-21 12:14:45 -05004754static int rbd_dev_v2_striping_info(struct rbd_device *rbd_dev)
4755{
4756 struct {
4757 __le64 stripe_unit;
4758 __le64 stripe_count;
4759 } __attribute__ ((packed)) striping_info_buf = { 0 };
4760 size_t size = sizeof (striping_info_buf);
4761 void *p;
Alex Eldercc070d52013-04-21 12:14:45 -05004762 int ret;
4763
Ilya Dryomovecd4a682017-01-25 18:16:21 +01004764 ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
4765 &rbd_dev->header_oloc, "get_stripe_unit_count",
4766 NULL, 0, &striping_info_buf, size);
Alex Eldercc070d52013-04-21 12:14:45 -05004767 dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
4768 if (ret < 0)
4769 return ret;
4770 if (ret < size)
4771 return -ERANGE;
4772
Alex Eldercc070d52013-04-21 12:14:45 -05004773 p = &striping_info_buf;
Ilya Dryomovb1331852018-02-07 12:09:12 +01004774 rbd_dev->header.stripe_unit = ceph_decode_64(&p);
4775 rbd_dev->header.stripe_count = ceph_decode_64(&p);
Alex Eldercc070d52013-04-21 12:14:45 -05004776 return 0;
4777}
4778
Ilya Dryomov7e973322017-01-25 18:16:22 +01004779static int rbd_dev_v2_data_pool(struct rbd_device *rbd_dev)
4780{
4781 __le64 data_pool_id;
4782 int ret;
4783
4784 ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
4785 &rbd_dev->header_oloc, "get_data_pool",
4786 NULL, 0, &data_pool_id, sizeof(data_pool_id));
4787 if (ret < 0)
4788 return ret;
4789 if (ret < sizeof(data_pool_id))
4790 return -EBADMSG;
4791
4792 rbd_dev->header.data_pool_id = le64_to_cpu(data_pool_id);
4793 WARN_ON(rbd_dev->header.data_pool_id == CEPH_NOPOOL);
4794 return 0;
4795}
4796
Alex Elder9e15b772012-10-30 19:40:33 -05004797static char *rbd_dev_image_name(struct rbd_device *rbd_dev)
4798{
Ilya Dryomovecd4a682017-01-25 18:16:21 +01004799 CEPH_DEFINE_OID_ONSTACK(oid);
Alex Elder9e15b772012-10-30 19:40:33 -05004800 size_t image_id_size;
4801 char *image_id;
4802 void *p;
4803 void *end;
4804 size_t size;
4805 void *reply_buf = NULL;
4806 size_t len = 0;
4807 char *image_name = NULL;
4808 int ret;
4809
4810 rbd_assert(!rbd_dev->spec->image_name);
4811
Alex Elder69e7a022012-11-01 08:39:26 -05004812 len = strlen(rbd_dev->spec->image_id);
4813 image_id_size = sizeof (__le32) + len;
Alex Elder9e15b772012-10-30 19:40:33 -05004814 image_id = kmalloc(image_id_size, GFP_KERNEL);
4815 if (!image_id)
4816 return NULL;
4817
4818 p = image_id;
Alex Elder41579762013-04-21 12:14:45 -05004819 end = image_id + image_id_size;
Alex Elder57385b52013-04-21 12:14:45 -05004820 ceph_encode_string(&p, end, rbd_dev->spec->image_id, (u32)len);
Alex Elder9e15b772012-10-30 19:40:33 -05004821
4822 size = sizeof (__le32) + RBD_IMAGE_NAME_LEN_MAX;
4823 reply_buf = kmalloc(size, GFP_KERNEL);
4824 if (!reply_buf)
4825 goto out;
4826
Ilya Dryomovecd4a682017-01-25 18:16:21 +01004827 ceph_oid_printf(&oid, "%s", RBD_DIRECTORY);
4828 ret = rbd_obj_method_sync(rbd_dev, &oid, &rbd_dev->header_oloc,
4829 "dir_get_name", image_id, image_id_size,
4830 reply_buf, size);
Alex Elder9e15b772012-10-30 19:40:33 -05004831 if (ret < 0)
4832 goto out;
4833 p = reply_buf;
Alex Elderf40eb342013-04-25 15:09:42 -05004834 end = reply_buf + ret;
4835
Alex Elder9e15b772012-10-30 19:40:33 -05004836 image_name = ceph_extract_encoded_string(&p, end, &len, GFP_KERNEL);
4837 if (IS_ERR(image_name))
4838 image_name = NULL;
4839 else
4840 dout("%s: name is %s len is %zd\n", __func__, image_name, len);
4841out:
4842 kfree(reply_buf);
4843 kfree(image_id);
4844
4845 return image_name;
4846}
4847
Alex Elder2ad3d712013-04-30 00:44:33 -05004848static u64 rbd_v1_snap_id_by_name(struct rbd_device *rbd_dev, const char *name)
4849{
4850 struct ceph_snap_context *snapc = rbd_dev->header.snapc;
4851 const char *snap_name;
4852 u32 which = 0;
4853
4854 /* Skip over names until we find the one we are looking for */
4855
4856 snap_name = rbd_dev->header.snap_names;
4857 while (which < snapc->num_snaps) {
4858 if (!strcmp(name, snap_name))
4859 return snapc->snaps[which];
4860 snap_name += strlen(snap_name) + 1;
4861 which++;
4862 }
4863 return CEPH_NOSNAP;
4864}
4865
4866static u64 rbd_v2_snap_id_by_name(struct rbd_device *rbd_dev, const char *name)
4867{
4868 struct ceph_snap_context *snapc = rbd_dev->header.snapc;
4869 u32 which;
4870 bool found = false;
4871 u64 snap_id;
4872
4873 for (which = 0; !found && which < snapc->num_snaps; which++) {
4874 const char *snap_name;
4875
4876 snap_id = snapc->snaps[which];
4877 snap_name = rbd_dev_v2_snap_name(rbd_dev, snap_id);
Josh Durginefadc982013-08-29 19:16:42 -07004878 if (IS_ERR(snap_name)) {
4879 /* ignore no-longer existing snapshots */
4880 if (PTR_ERR(snap_name) == -ENOENT)
4881 continue;
4882 else
4883 break;
4884 }
Alex Elder2ad3d712013-04-30 00:44:33 -05004885 found = !strcmp(name, snap_name);
4886 kfree(snap_name);
4887 }
4888 return found ? snap_id : CEPH_NOSNAP;
4889}
4890
4891/*
4892 * Assumes name is never RBD_SNAP_HEAD_NAME; returns CEPH_NOSNAP if
4893 * no snapshot by that name is found, or if an error occurs.
4894 */
4895static u64 rbd_snap_id_by_name(struct rbd_device *rbd_dev, const char *name)
4896{
4897 if (rbd_dev->image_format == 1)
4898 return rbd_v1_snap_id_by_name(rbd_dev, name);
4899
4900 return rbd_v2_snap_id_by_name(rbd_dev, name);
4901}
4902
Alex Elder9e15b772012-10-30 19:40:33 -05004903/*
Ilya Dryomov04077592014-07-23 17:11:20 +04004904 * An image being mapped will have everything but the snap id.
Alex Elder9e15b772012-10-30 19:40:33 -05004905 */
Ilya Dryomov04077592014-07-23 17:11:20 +04004906static int rbd_spec_fill_snap_id(struct rbd_device *rbd_dev)
4907{
4908 struct rbd_spec *spec = rbd_dev->spec;
4909
4910 rbd_assert(spec->pool_id != CEPH_NOPOOL && spec->pool_name);
4911 rbd_assert(spec->image_id && spec->image_name);
4912 rbd_assert(spec->snap_name);
4913
4914 if (strcmp(spec->snap_name, RBD_SNAP_HEAD_NAME)) {
4915 u64 snap_id;
4916
4917 snap_id = rbd_snap_id_by_name(rbd_dev, spec->snap_name);
4918 if (snap_id == CEPH_NOSNAP)
4919 return -ENOENT;
4920
4921 spec->snap_id = snap_id;
4922 } else {
4923 spec->snap_id = CEPH_NOSNAP;
4924 }
4925
4926 return 0;
4927}
4928
4929/*
4930 * A parent image will have all ids but none of the names.
4931 *
4932 * All names in an rbd spec are dynamically allocated. It's OK if we
4933 * can't figure out the name for an image id.
4934 */
4935static int rbd_spec_fill_names(struct rbd_device *rbd_dev)
Alex Elder9e15b772012-10-30 19:40:33 -05004936{
Alex Elder2e9f7f12013-04-26 09:43:48 -05004937 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
4938 struct rbd_spec *spec = rbd_dev->spec;
4939 const char *pool_name;
4940 const char *image_name;
4941 const char *snap_name;
Alex Elder9e15b772012-10-30 19:40:33 -05004942 int ret;
4943
Ilya Dryomov04077592014-07-23 17:11:20 +04004944 rbd_assert(spec->pool_id != CEPH_NOPOOL);
4945 rbd_assert(spec->image_id);
4946 rbd_assert(spec->snap_id != CEPH_NOSNAP);
Alex Elder9e15b772012-10-30 19:40:33 -05004947
Alex Elder2e9f7f12013-04-26 09:43:48 -05004948 /* Get the pool name; we have to make our own copy of this */
Alex Elder9e15b772012-10-30 19:40:33 -05004949
Alex Elder2e9f7f12013-04-26 09:43:48 -05004950 pool_name = ceph_pg_pool_name_by_id(osdc->osdmap, spec->pool_id);
4951 if (!pool_name) {
4952 rbd_warn(rbd_dev, "no pool with id %llu", spec->pool_id);
Alex Elder935dc892012-11-01 10:17:15 -05004953 return -EIO;
4954 }
Alex Elder2e9f7f12013-04-26 09:43:48 -05004955 pool_name = kstrdup(pool_name, GFP_KERNEL);
4956 if (!pool_name)
Alex Elder9e15b772012-10-30 19:40:33 -05004957 return -ENOMEM;
4958
4959 /* Fetch the image name; tolerate failure here */
4960
Alex Elder2e9f7f12013-04-26 09:43:48 -05004961 image_name = rbd_dev_image_name(rbd_dev);
4962 if (!image_name)
Alex Elder06ecc6c2012-11-01 10:17:15 -05004963 rbd_warn(rbd_dev, "unable to get image name");
Alex Elder9e15b772012-10-30 19:40:33 -05004964
Ilya Dryomov04077592014-07-23 17:11:20 +04004965 /* Fetch the snapshot name */
Alex Elder9e15b772012-10-30 19:40:33 -05004966
Alex Elder2e9f7f12013-04-26 09:43:48 -05004967 snap_name = rbd_snap_name(rbd_dev, spec->snap_id);
Josh Durginda6a6b62013-09-04 17:57:31 -07004968 if (IS_ERR(snap_name)) {
4969 ret = PTR_ERR(snap_name);
Alex Elder9e15b772012-10-30 19:40:33 -05004970 goto out_err;
Alex Elder2e9f7f12013-04-26 09:43:48 -05004971 }
4972
4973 spec->pool_name = pool_name;
4974 spec->image_name = image_name;
4975 spec->snap_name = snap_name;
Alex Elder9e15b772012-10-30 19:40:33 -05004976
4977 return 0;
Ilya Dryomov04077592014-07-23 17:11:20 +04004978
Alex Elder9e15b772012-10-30 19:40:33 -05004979out_err:
Alex Elder2e9f7f12013-04-26 09:43:48 -05004980 kfree(image_name);
4981 kfree(pool_name);
Alex Elder9e15b772012-10-30 19:40:33 -05004982 return ret;
4983}
4984
Alex Eldercc4a38bd2013-04-30 00:44:33 -05004985static int rbd_dev_v2_snap_context(struct rbd_device *rbd_dev)
Alex Elder35d489f2012-07-03 16:01:19 -05004986{
4987 size_t size;
4988 int ret;
4989 void *reply_buf;
4990 void *p;
4991 void *end;
4992 u64 seq;
4993 u32 snap_count;
4994 struct ceph_snap_context *snapc;
4995 u32 i;
4996
4997 /*
4998 * We'll need room for the seq value (maximum snapshot id),
4999	 * snapshot count, and an array of that many snapshot ids.
5000 * For now we have a fixed upper limit on the number we're
5001 * prepared to receive.
5002 */
5003 size = sizeof (__le64) + sizeof (__le32) +
5004 RBD_MAX_SNAP_COUNT * sizeof (__le64);
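	/*
	 * Assuming RBD_MAX_SNAP_COUNT is 510 (per rbd_types.h), this
	 * works out to 8 + 4 + 510 * 8 = 4092 bytes.
	 */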
5005 reply_buf = kzalloc(size, GFP_KERNEL);
5006 if (!reply_buf)
5007 return -ENOMEM;
5008
Ilya Dryomovecd4a682017-01-25 18:16:21 +01005009 ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
5010 &rbd_dev->header_oloc, "get_snapcontext",
5011 NULL, 0, reply_buf, size);
Alex Elder36be9a72013-01-19 00:30:28 -06005012 dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
Alex Elder35d489f2012-07-03 16:01:19 -05005013 if (ret < 0)
5014 goto out;
5015
Alex Elder35d489f2012-07-03 16:01:19 -05005016 p = reply_buf;
Alex Elder57385b52013-04-21 12:14:45 -05005017 end = reply_buf + ret;
5018 ret = -ERANGE;
Alex Elder35d489f2012-07-03 16:01:19 -05005019 ceph_decode_64_safe(&p, end, seq, out);
5020 ceph_decode_32_safe(&p, end, snap_count, out);
5021
5022 /*
5023 * Make sure the reported number of snapshot ids wouldn't go
5024 * beyond the end of our buffer. But before checking that,
5025 * make sure the computed size of the snapshot context we
5026 * allocate is representable in a size_t.
5027 */
5028 if (snap_count > (SIZE_MAX - sizeof (struct ceph_snap_context))
5029 / sizeof (u64)) {
5030 ret = -EINVAL;
5031 goto out;
5032 }
5033 if (!ceph_has_room(&p, end, snap_count * sizeof (__le64)))
5034 goto out;
Alex Elder468521c2013-04-26 09:43:47 -05005035 ret = 0;
Alex Elder35d489f2012-07-03 16:01:19 -05005036
Alex Elder812164f82013-04-30 00:44:32 -05005037 snapc = ceph_create_snap_context(snap_count, GFP_KERNEL);
Alex Elder35d489f2012-07-03 16:01:19 -05005038 if (!snapc) {
5039 ret = -ENOMEM;
5040 goto out;
5041 }
Alex Elder35d489f2012-07-03 16:01:19 -05005042 snapc->seq = seq;
Alex Elder35d489f2012-07-03 16:01:19 -05005043 for (i = 0; i < snap_count; i++)
5044 snapc->snaps[i] = ceph_decode_64(&p);
5045
Alex Elder49ece552013-05-06 08:37:00 -05005046 ceph_put_snap_context(rbd_dev->header.snapc);
Alex Elder35d489f2012-07-03 16:01:19 -05005047 rbd_dev->header.snapc = snapc;
5048
5049 dout(" snap context seq = %llu, snap_count = %u\n",
Alex Elder57385b52013-04-21 12:14:45 -05005050 (unsigned long long)seq, (unsigned int)snap_count);
Alex Elder35d489f2012-07-03 16:01:19 -05005051out:
5052 kfree(reply_buf);
5053
Alex Elder57385b52013-04-21 12:14:45 -05005054 return ret;
Alex Elder35d489f2012-07-03 16:01:19 -05005055}
5056
Alex Elder54cac612013-04-30 00:44:33 -05005057static const char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev,
5058 u64 snap_id)
Alex Elderb8b1e2d2012-07-03 16:01:19 -05005059{
5060 size_t size;
5061 void *reply_buf;
Alex Elder54cac612013-04-30 00:44:33 -05005062 __le64 snapid;
Alex Elderb8b1e2d2012-07-03 16:01:19 -05005063 int ret;
5064 void *p;
5065 void *end;
Alex Elderb8b1e2d2012-07-03 16:01:19 -05005066 char *snap_name;
5067
5068 size = sizeof (__le32) + RBD_MAX_SNAP_NAME_LEN;
5069 reply_buf = kmalloc(size, GFP_KERNEL);
5070 if (!reply_buf)
5071 return ERR_PTR(-ENOMEM);
5072
Alex Elder54cac612013-04-30 00:44:33 -05005073 snapid = cpu_to_le64(snap_id);
Ilya Dryomovecd4a682017-01-25 18:16:21 +01005074 ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
5075 &rbd_dev->header_oloc, "get_snapshot_name",
5076 &snapid, sizeof(snapid), reply_buf, size);
Alex Elder36be9a72013-01-19 00:30:28 -06005077 dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
Alex Elderf40eb342013-04-25 15:09:42 -05005078 if (ret < 0) {
5079 snap_name = ERR_PTR(ret);
Alex Elderb8b1e2d2012-07-03 16:01:19 -05005080 goto out;
Alex Elderf40eb342013-04-25 15:09:42 -05005081 }
Alex Elderb8b1e2d2012-07-03 16:01:19 -05005082
5083 p = reply_buf;
Alex Elderf40eb342013-04-25 15:09:42 -05005084 end = reply_buf + ret;
Alex Eldere5c35532012-10-25 23:34:41 -05005085 snap_name = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL);
Alex Elderf40eb342013-04-25 15:09:42 -05005086 if (IS_ERR(snap_name))
Alex Elderb8b1e2d2012-07-03 16:01:19 -05005087 goto out;
Alex Elderb8b1e2d2012-07-03 16:01:19 -05005088
Alex Elderf40eb342013-04-25 15:09:42 -05005089 dout(" snap_id 0x%016llx snap_name = %s\n",
Alex Elder54cac612013-04-30 00:44:33 -05005090 (unsigned long long)snap_id, snap_name);
Alex Elderb8b1e2d2012-07-03 16:01:19 -05005091out:
5092 kfree(reply_buf);
5093
Alex Elderf40eb342013-04-25 15:09:42 -05005094 return snap_name;
Alex Elderb8b1e2d2012-07-03 16:01:19 -05005095}
5096
Alex Elder2df3fac2013-05-06 09:51:30 -05005097static int rbd_dev_v2_header_info(struct rbd_device *rbd_dev)
Alex Elder117973f2012-08-31 17:29:55 -05005098{
Alex Elder2df3fac2013-05-06 09:51:30 -05005099 bool first_time = rbd_dev->header.object_prefix == NULL;
Alex Elder117973f2012-08-31 17:29:55 -05005100 int ret;
Alex Elder117973f2012-08-31 17:29:55 -05005101
Josh Durgin1617e402013-06-12 14:43:10 -07005102 ret = rbd_dev_v2_image_size(rbd_dev);
5103 if (ret)
Alex Eldercfbf6372013-05-31 17:40:45 -05005104 return ret;
Josh Durgin1617e402013-06-12 14:43:10 -07005105
Alex Elder2df3fac2013-05-06 09:51:30 -05005106 if (first_time) {
5107 ret = rbd_dev_v2_header_onetime(rbd_dev);
5108 if (ret)
Alex Eldercfbf6372013-05-31 17:40:45 -05005109 return ret;
Alex Elder2df3fac2013-05-06 09:51:30 -05005110 }
5111
Alex Eldercc4a38bd2013-04-30 00:44:33 -05005112 ret = rbd_dev_v2_snap_context(rbd_dev);
Ilya Dryomovd194cd12015-08-31 18:22:10 +03005113 if (ret && first_time) {
5114 kfree(rbd_dev->header.object_prefix);
5115 rbd_dev->header.object_prefix = NULL;
5116 }
Alex Elder117973f2012-08-31 17:29:55 -05005117
5118 return ret;
5119}
5120
Ilya Dryomova720ae02014-07-23 17:11:19 +04005121static int rbd_dev_header_info(struct rbd_device *rbd_dev)
5122{
5123 rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
5124
5125 if (rbd_dev->image_format == 1)
5126 return rbd_dev_v1_header_info(rbd_dev);
5127
5128 return rbd_dev_v2_header_info(rbd_dev);
5129}
5130
Alex Elder1ddbe942012-01-29 13:57:44 -06005131/*
Alex Eldere28fff262012-02-02 08:13:30 -06005132 * Skips over white space at *buf, and updates *buf to point to the
5133 * first found non-space character (if any). Returns the length of
Alex Elder593a9e72012-02-07 12:03:37 -06005134 * the token (string of non-white space characters) found. Note
5135 * that *buf must be terminated with '\0'.
Alex Eldere28fff262012-02-02 08:13:30 -06005136 */
5137static inline size_t next_token(const char **buf)
5138{
5139 /*
5140 * These are the characters that produce nonzero for
5141 * isspace() in the "C" and "POSIX" locales.
5142 */
5143 const char *spaces = " \f\n\r\t\v";
5144
5145 *buf += strspn(*buf, spaces); /* Find start of token */
5146
5147 return strcspn(*buf, spaces); /* Return token length */
5148}
5149
5150/*
Alex Elderea3352f2012-07-09 21:04:23 -05005151 * Finds the next token in *buf, dynamically allocates a buffer big
5152 * enough to hold a copy of it, and copies the token into the new
5153 * buffer. The copy is guaranteed to be terminated with '\0'. Note
5154 * that a duplicate buffer is created even for a zero-length token.
5155 *
5156 * Returns a pointer to the newly-allocated duplicate, or a null
5157 * pointer if memory for the duplicate was not available. If
5158 * the lenp argument is a non-null pointer, the length of the token
5159 * (not including the '\0') is returned in *lenp.
5160 *
5161 * If successful, the *buf pointer will be updated to point beyond
5162 * the end of the found token.
5163 *
5164 * Note: uses GFP_KERNEL for allocation.
5165 */
5166static inline char *dup_token(const char **buf, size_t *lenp)
5167{
5168 char *dup;
5169 size_t len;
5170
5171 len = next_token(buf);
Alex Elder4caf35f2012-11-01 08:39:27 -05005172 dup = kmemdup(*buf, len + 1, GFP_KERNEL);
Alex Elderea3352f2012-07-09 21:04:23 -05005173 if (!dup)
5174 return NULL;
Alex Elderea3352f2012-07-09 21:04:23 -05005175 *(dup + len) = '\0';
5176 *buf += len;
5177
5178 if (lenp)
5179 *lenp = len;
5180
5181 return dup;
5182}
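/*
 * A minimal illustration of the two helpers above, with hypothetical
 * buffer contents (not taken from the driver):
 *
 *	const char *buf = "  rbd myimage";
 *	next_token(&buf) returns 3 and leaves buf at "rbd myimage";
 *	dup_token(&buf, NULL) then returns a kmalloc'd "rbd" (which the
 *	caller must kfree()) and leaves buf at " myimage".
 */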
5183
5184/*
Alex Elder859c31d2012-10-25 23:34:42 -05005185 * Parse the options provided for an "rbd add" (i.e., rbd image
5186 * mapping) request. These arrive via a write to /sys/bus/rbd/add,
5187 * and the data written is passed here via a NUL-terminated buffer.
5188 * Returns 0 if successful or an error code otherwise.
Alex Elderd22f76e2012-07-12 10:46:35 -05005189 *
Alex Elder859c31d2012-10-25 23:34:42 -05005190 * The information extracted from these options is recorded in
5191 * the other parameters which return dynamically-allocated
5192 * structures:
5193 * ceph_opts
5194 * The address of a pointer that will refer to a ceph options
5195 * structure. Caller must release the returned pointer using
5196 * ceph_destroy_options() when it is no longer needed.
5197 * rbd_opts
5198 * Address of an rbd options pointer. Fully initialized by
5199 * this function; caller must release with kfree().
5200 * spec
5201 * Address of an rbd image specification pointer. Fully
5202 * initialized by this function based on parsed options.
5203 * Caller must release with rbd_spec_put().
5204 *
5205 * The options passed take this form:
5206 * <mon_addrs> <options> <pool_name> <image_name> [<snap_id>]
5207 * where:
5208 * <mon_addrs>
5209 * A comma-separated list of one or more monitor addresses.
5210 * A monitor address is an ip address, optionally followed
5211 * by a port number (separated by a colon).
5212 * I.e.: ip1[:port1][,ip2[:port2]...]
5213 * <options>
5214 * A comma-separated list of ceph and/or rbd options.
5215 * <pool_name>
5216 * The name of the rados pool containing the rbd image.
5217 * <image_name>
5218 * The name of the image in that pool to map.
5219 * <snap_id>
5220 * An optional snapshot id. If provided, the mapping will
5221 * present data from the image at the time that snapshot was
5222 * created. The image head is used if no snapshot id is
5223 * provided. Snapshot mappings are always read-only.
Alex Eldera725f65e2012-02-02 08:13:30 -06005224 */
Alex Elder859c31d2012-10-25 23:34:42 -05005225static int rbd_add_parse_args(const char *buf,
Alex Elderdc79b112012-10-25 23:34:41 -05005226 struct ceph_options **ceph_opts,
Alex Elder859c31d2012-10-25 23:34:42 -05005227 struct rbd_options **opts,
5228 struct rbd_spec **rbd_spec)
Alex Eldera725f65e2012-02-02 08:13:30 -06005229{
Alex Elderd22f76e2012-07-12 10:46:35 -05005230 size_t len;
Alex Elder859c31d2012-10-25 23:34:42 -05005231 char *options;
Alex Elder0ddebc02012-10-25 23:34:41 -05005232 const char *mon_addrs;
Alex Elderecb4dc22013-04-26 09:43:47 -05005233 char *snap_name;
Alex Elder0ddebc02012-10-25 23:34:41 -05005234 size_t mon_addrs_size;
Ilya Dryomovc3001562018-07-03 15:28:43 +02005235 struct parse_rbd_opts_ctx pctx = { 0 };
Alex Elder859c31d2012-10-25 23:34:42 -05005236 struct ceph_options *copts;
Alex Elderdc79b112012-10-25 23:34:41 -05005237 int ret;
Alex Eldere28fff262012-02-02 08:13:30 -06005238
5239 /* The first four tokens are required */
5240
Alex Elder7ef32142012-02-02 08:13:30 -06005241 len = next_token(&buf);
Alex Elder4fb5d6712012-11-01 10:17:15 -05005242 if (!len) {
5243 rbd_warn(NULL, "no monitor address(es) provided");
5244 return -EINVAL;
5245 }
Alex Elder0ddebc02012-10-25 23:34:41 -05005246 mon_addrs = buf;
Alex Elderf28e5652012-10-25 23:34:41 -05005247 mon_addrs_size = len + 1;
Alex Elder7ef32142012-02-02 08:13:30 -06005248 buf += len;
Alex Eldera725f65e2012-02-02 08:13:30 -06005249
Alex Elderdc79b112012-10-25 23:34:41 -05005250 ret = -EINVAL;
Alex Elderf28e5652012-10-25 23:34:41 -05005251 options = dup_token(&buf, NULL);
5252 if (!options)
Alex Elderdc79b112012-10-25 23:34:41 -05005253 return -ENOMEM;
Alex Elder4fb5d6712012-11-01 10:17:15 -05005254 if (!*options) {
5255 rbd_warn(NULL, "no options provided");
5256 goto out_err;
5257 }
Alex Eldera725f65e2012-02-02 08:13:30 -06005258
Ilya Dryomovc3001562018-07-03 15:28:43 +02005259 pctx.spec = rbd_spec_alloc();
5260 if (!pctx.spec)
Alex Elderf28e5652012-10-25 23:34:41 -05005261 goto out_mem;
Alex Elder859c31d2012-10-25 23:34:42 -05005262
Ilya Dryomovc3001562018-07-03 15:28:43 +02005263 pctx.spec->pool_name = dup_token(&buf, NULL);
5264 if (!pctx.spec->pool_name)
Alex Elder859c31d2012-10-25 23:34:42 -05005265 goto out_mem;
Ilya Dryomovc3001562018-07-03 15:28:43 +02005266 if (!*pctx.spec->pool_name) {
Alex Elder4fb5d6712012-11-01 10:17:15 -05005267 rbd_warn(NULL, "no pool name provided");
5268 goto out_err;
5269 }
Alex Eldere28fff262012-02-02 08:13:30 -06005270
Ilya Dryomovc3001562018-07-03 15:28:43 +02005271 pctx.spec->image_name = dup_token(&buf, NULL);
5272 if (!pctx.spec->image_name)
Alex Elderf28e5652012-10-25 23:34:41 -05005273 goto out_mem;
Ilya Dryomovc3001562018-07-03 15:28:43 +02005274 if (!*pctx.spec->image_name) {
Alex Elder4fb5d6712012-11-01 10:17:15 -05005275 rbd_warn(NULL, "no image name provided");
5276 goto out_err;
5277 }
Alex Eldere28fff262012-02-02 08:13:30 -06005278
Alex Elderf28e5652012-10-25 23:34:41 -05005279 /*
5280 * Snapshot name is optional; default is to use "-"
5281 * (indicating the head/no snapshot).
5282 */
Alex Elder3feeb8942012-08-31 17:29:52 -05005283 len = next_token(&buf);
Alex Elder820a5f32012-07-09 21:04:24 -05005284 if (!len) {
Alex Elder3feeb8942012-08-31 17:29:52 -05005285 buf = RBD_SNAP_HEAD_NAME; /* No snapshot supplied */
5286 len = sizeof (RBD_SNAP_HEAD_NAME) - 1;
Alex Elderf28e5652012-10-25 23:34:41 -05005287 } else if (len > RBD_MAX_SNAP_NAME_LEN) {
Alex Elderdc79b112012-10-25 23:34:41 -05005288 ret = -ENAMETOOLONG;
Alex Elderf28e5652012-10-25 23:34:41 -05005289 goto out_err;
Alex Elder849b4262012-07-09 21:04:24 -05005290 }
Alex Elderecb4dc22013-04-26 09:43:47 -05005291 snap_name = kmemdup(buf, len + 1, GFP_KERNEL);
5292 if (!snap_name)
Alex Elderf28e5652012-10-25 23:34:41 -05005293 goto out_mem;
Alex Elderecb4dc22013-04-26 09:43:47 -05005294 *(snap_name + len) = '\0';
Ilya Dryomovc3001562018-07-03 15:28:43 +02005295 pctx.spec->snap_name = snap_name;
Alex Eldere5c35532012-10-25 23:34:41 -05005296
Alex Elder0ddebc02012-10-25 23:34:41 -05005297 /* Initialize all rbd options to the defaults */
Alex Eldere28fff262012-02-02 08:13:30 -06005298
Ilya Dryomovc3001562018-07-03 15:28:43 +02005299 pctx.opts = kzalloc(sizeof(*pctx.opts), GFP_KERNEL);
5300 if (!pctx.opts)
Alex Elder4e9afeb2012-10-25 23:34:41 -05005301 goto out_mem;
5302
Ilya Dryomovc3001562018-07-03 15:28:43 +02005303 pctx.opts->read_only = RBD_READ_ONLY_DEFAULT;
5304 pctx.opts->queue_depth = RBD_QUEUE_DEPTH_DEFAULT;
5305 pctx.opts->lock_timeout = RBD_LOCK_TIMEOUT_DEFAULT;
5306 pctx.opts->lock_on_read = RBD_LOCK_ON_READ_DEFAULT;
5307 pctx.opts->exclusive = RBD_EXCLUSIVE_DEFAULT;
5308 pctx.opts->trim = RBD_TRIM_DEFAULT;
Alex Elderd22f76e2012-07-12 10:46:35 -05005309
Alex Elder859c31d2012-10-25 23:34:42 -05005310 copts = ceph_parse_options(options, mon_addrs,
Ilya Dryomovc3001562018-07-03 15:28:43 +02005311 mon_addrs + mon_addrs_size - 1,
5312 parse_rbd_opts_token, &pctx);
Alex Elder859c31d2012-10-25 23:34:42 -05005313 if (IS_ERR(copts)) {
5314 ret = PTR_ERR(copts);
Alex Elderdc79b112012-10-25 23:34:41 -05005315 goto out_err;
5316 }
Alex Elder859c31d2012-10-25 23:34:42 -05005317 kfree(options);
5318
5319 *ceph_opts = copts;
Ilya Dryomovc3001562018-07-03 15:28:43 +02005320 *opts = pctx.opts;
5321 *rbd_spec = pctx.spec;
Alex Elder0ddebc02012-10-25 23:34:41 -05005322
Alex Elderdc79b112012-10-25 23:34:41 -05005323 return 0;
Alex Elderf28e5652012-10-25 23:34:41 -05005324out_mem:
Alex Elderdc79b112012-10-25 23:34:41 -05005325 ret = -ENOMEM;
Alex Elderd22f76e2012-07-12 10:46:35 -05005326out_err:
Ilya Dryomovc3001562018-07-03 15:28:43 +02005327 kfree(pctx.opts);
5328 rbd_spec_put(pctx.spec);
Alex Elderf28e5652012-10-25 23:34:41 -05005329 kfree(options);
Alex Elderd22f76e2012-07-12 10:46:35 -05005330
Alex Elderdc79b112012-10-25 23:34:41 -05005331 return ret;
Alex Eldera725f65e2012-02-02 08:13:30 -06005332}
5333
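/*
 * For orientation, a sketch of the string rbd_add_parse_args() accepts
 * (the authoritative format is in Documentation/ABI/testing/sysfs-bus-rbd;
 * the addresses and names below are invented):
 *
 *   echo "1.2.3.4:6789,5.6.7.8:6789 name=admin,secret=<key> \
 *       rbd foo snap1" > /sys/bus/rbd/add
 *
 * i.e. "<mon_addrs> <options> <pool_name> <image_name> [<snap_name>]";
 * when the snapshot name is omitted it defaults to "-", denoting the
 * image head.
 */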
Ilya Dryomove010dd02017-04-13 12:17:39 +02005334static void rbd_dev_image_unlock(struct rbd_device *rbd_dev)
5335{
5336 down_write(&rbd_dev->lock_rwsem);
5337 if (__rbd_is_lock_owner(rbd_dev))
5338 rbd_unlock(rbd_dev);
5339 up_write(&rbd_dev->lock_rwsem);
5340}
5341
5342static int rbd_add_acquire_lock(struct rbd_device *rbd_dev)
5343{
Ilya Dryomov2f18d462018-04-04 10:15:38 +02005344 int ret;
5345
Ilya Dryomove010dd02017-04-13 12:17:39 +02005346 if (!(rbd_dev->header.features & RBD_FEATURE_EXCLUSIVE_LOCK)) {
5347 rbd_warn(rbd_dev, "exclusive-lock feature is not enabled");
5348 return -EINVAL;
5349 }
5350
5351 /* FIXME: "rbd map --exclusive" should be in interruptible */
5352 down_read(&rbd_dev->lock_rwsem);
Ilya Dryomov2f18d462018-04-04 10:15:38 +02005353 ret = rbd_wait_state_locked(rbd_dev, true);
Ilya Dryomove010dd02017-04-13 12:17:39 +02005354 up_read(&rbd_dev->lock_rwsem);
Ilya Dryomov2f18d462018-04-04 10:15:38 +02005355 if (ret) {
Ilya Dryomove010dd02017-04-13 12:17:39 +02005356 rbd_warn(rbd_dev, "failed to acquire exclusive lock");
5357 return -EROFS;
5358 }
5359
5360 return 0;
5361}
5362
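/*
 * Usage note (a sketch; the exact userspace spelling is assumed):
 * mapping with the "exclusive" option, e.g.
 *
 *   rbd map -o exclusive pool/image
 *
 * sets opts->exclusive, so the lock is acquired here at map time and
 * is not released on another client's request; a competing exclusive
 * mapping therefore fails with -EROFS rather than trading the lock
 * back and forth.
 */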
Ilya Dryomov30ba1f02014-05-13 11:19:27 +04005363/*
Alex Elder589d30e2012-07-10 20:30:11 -05005364 * An rbd format 2 image has a unique identifier, distinct from the
5365 * name given to it by the user. Internally, that identifier is
5366 * what's used to specify the names of objects related to the image.
5367 *
5368 * A special "rbd id" object is used to map an rbd image name to its
5369 * id. If that object doesn't exist, then there is no v2 rbd image
5370 * with the supplied name.
5371 *
5372 * This function will record the given rbd_dev's image_id field if
5373 * it can be determined, and in that case will return 0. If any
5374 * errors occur a negative errno will be returned and the rbd_dev's
5375 * image_id field will be unchanged (and should be NULL).
5376 */
5377static int rbd_dev_image_id(struct rbd_device *rbd_dev)
5378{
5379 int ret;
5380 size_t size;
Ilya Dryomovecd4a682017-01-25 18:16:21 +01005381 CEPH_DEFINE_OID_ONSTACK(oid);
Alex Elder589d30e2012-07-10 20:30:11 -05005382 void *response;
Alex Elderc0fba362013-04-25 23:15:08 -05005383 char *image_id;
Alex Elder2f82ee52012-10-30 19:40:33 -05005384
Alex Elder589d30e2012-07-10 20:30:11 -05005385 /*
Alex Elder2c0d0a12012-10-30 19:40:33 -05005386 * When probing a parent image, the image id is already
5387 * known (and the image name likely is not). There's no
Alex Elderc0fba362013-04-25 23:15:08 -05005388 * need to fetch the image id again in this case. We
5389 * do still need to set the image format though.
Alex Elder2c0d0a12012-10-30 19:40:33 -05005390 */
Alex Elderc0fba362013-04-25 23:15:08 -05005391 if (rbd_dev->spec->image_id) {
5392 rbd_dev->image_format = *rbd_dev->spec->image_id ? 2 : 1;
5393
Alex Elder2c0d0a12012-10-30 19:40:33 -05005394 return 0;
Alex Elderc0fba362013-04-25 23:15:08 -05005395 }
Alex Elder2c0d0a12012-10-30 19:40:33 -05005396
5397 /*
Alex Elder589d30e2012-07-10 20:30:11 -05005398	 * First, see if the format 2 image id object exists, and if
5399	 * so, get the image's persistent id from it.
5400 */
Ilya Dryomovecd4a682017-01-25 18:16:21 +01005401 ret = ceph_oid_aprintf(&oid, GFP_KERNEL, "%s%s", RBD_ID_PREFIX,
5402 rbd_dev->spec->image_name);
5403 if (ret)
5404 return ret;
5405
5406 dout("rbd id object name is %s\n", oid.name);
Alex Elder589d30e2012-07-10 20:30:11 -05005407
5408 /* Response will be an encoded string, which includes a length */
5409
5410 size = sizeof (__le32) + RBD_IMAGE_ID_LEN_MAX;
5411 response = kzalloc(size, GFP_NOIO);
5412 if (!response) {
5413 ret = -ENOMEM;
5414 goto out;
5415 }
5416
Alex Elderc0fba362013-04-25 23:15:08 -05005417 /* If it doesn't exist we'll assume it's a format 1 image */
5418
Ilya Dryomovecd4a682017-01-25 18:16:21 +01005419 ret = rbd_obj_method_sync(rbd_dev, &oid, &rbd_dev->header_oloc,
5420 "get_id", NULL, 0,
5421 response, RBD_IMAGE_ID_LEN_MAX);
Alex Elder36be9a72013-01-19 00:30:28 -06005422 dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
Alex Elderc0fba362013-04-25 23:15:08 -05005423 if (ret == -ENOENT) {
5424 image_id = kstrdup("", GFP_KERNEL);
5425 ret = image_id ? 0 : -ENOMEM;
5426 if (!ret)
5427 rbd_dev->image_format = 1;
Ilya Dryomov7dd440c2014-09-11 18:49:18 +04005428 } else if (ret >= 0) {
Alex Elderc0fba362013-04-25 23:15:08 -05005429 void *p = response;
Alex Elder589d30e2012-07-10 20:30:11 -05005430
Alex Elderc0fba362013-04-25 23:15:08 -05005431 image_id = ceph_extract_encoded_string(&p, p + ret,
Alex Elder979ed482012-11-01 08:39:26 -05005432 NULL, GFP_NOIO);
Duan Jiong461f7582014-04-11 16:38:12 +08005433 ret = PTR_ERR_OR_ZERO(image_id);
Alex Elderc0fba362013-04-25 23:15:08 -05005434 if (!ret)
5435 rbd_dev->image_format = 2;
Alex Elderc0fba362013-04-25 23:15:08 -05005436 }
5437
5438 if (!ret) {
5439 rbd_dev->spec->image_id = image_id;
5440 dout("image_id is %s\n", image_id);
Alex Elder589d30e2012-07-10 20:30:11 -05005441 }
5442out:
5443 kfree(response);
Ilya Dryomovecd4a682017-01-25 18:16:21 +01005444 ceph_oid_destroy(&oid);
Alex Elder589d30e2012-07-10 20:30:11 -05005445 return ret;
5446}
5447
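/*
 * Illustration (object names per rbd_types.h; the id value is made
 * up): for a format 2 image "foo", the id object is "rbd_id.foo" and
 * its "get_id" method returns something like "1028bc4567", which
 * becomes rbd_dev->spec->image_id; a format 1 image has no id object,
 * so the -ENOENT branch above records an empty id instead.
 */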
Alex Elder3abef3b2013-05-13 20:35:37 -05005448/*
5449 * Undo whatever state changes are made by v1 or v2 header info
5450 * call.
5451 */
Alex Elder6fd48b32013-04-28 23:32:34 -05005452static void rbd_dev_unprobe(struct rbd_device *rbd_dev)
5453{
5454 struct rbd_image_header *header;
5455
Ilya Dryomove69b8d42015-01-19 12:06:14 +03005456 rbd_dev_parent_put(rbd_dev);
Alex Elder6fd48b32013-04-28 23:32:34 -05005457
5458 /* Free dynamic fields from the header, then zero it out */
5459
5460 header = &rbd_dev->header;
Alex Elder812164f82013-04-30 00:44:32 -05005461 ceph_put_snap_context(header->snapc);
Alex Elder6fd48b32013-04-28 23:32:34 -05005462 kfree(header->snap_sizes);
5463 kfree(header->snap_names);
5464 kfree(header->object_prefix);
5465 memset(header, 0, sizeof (*header));
5466}
5467
Alex Elder2df3fac2013-05-06 09:51:30 -05005468static int rbd_dev_v2_header_onetime(struct rbd_device *rbd_dev)
Alex Eldera30b71b2012-07-10 20:30:11 -05005469{
5470 int ret;
Alex Eldera30b71b2012-07-10 20:30:11 -05005471
Alex Elder1e130192012-07-03 16:01:19 -05005472 ret = rbd_dev_v2_object_prefix(rbd_dev);
Alex Elder57385b52013-04-21 12:14:45 -05005473 if (ret)
Alex Elder1e130192012-07-03 16:01:19 -05005474 goto out_err;
Alex Elderb1b54022012-07-03 16:01:19 -05005475
Alex Elder2df3fac2013-05-06 09:51:30 -05005476 /*
5477	 * Get and check the features for the image. Currently the
5478 * features are assumed to never change.
5479 */
Alex Elderb1b54022012-07-03 16:01:19 -05005480 ret = rbd_dev_v2_features(rbd_dev);
Alex Elder57385b52013-04-21 12:14:45 -05005481 if (ret)
Alex Elderb1b54022012-07-03 16:01:19 -05005482 goto out_err;
Alex Elder35d489f2012-07-03 16:01:19 -05005483
Alex Eldercc070d52013-04-21 12:14:45 -05005484 /* If the image supports fancy striping, get its parameters */
5485
5486 if (rbd_dev->header.features & RBD_FEATURE_STRIPINGV2) {
5487 ret = rbd_dev_v2_striping_info(rbd_dev);
5488 if (ret < 0)
5489 goto out_err;
5490 }
Alex Eldera30b71b2012-07-10 20:30:11 -05005491
Ilya Dryomov7e973322017-01-25 18:16:22 +01005492 if (rbd_dev->header.features & RBD_FEATURE_DATA_POOL) {
5493 ret = rbd_dev_v2_data_pool(rbd_dev);
5494 if (ret)
5495 goto out_err;
5496 }
5497
Ilya Dryomov263423f2017-01-25 18:16:22 +01005498 rbd_init_layout(rbd_dev);
Alex Elder35152972012-08-31 17:29:55 -05005499 return 0;
Ilya Dryomov263423f2017-01-25 18:16:22 +01005500
Alex Elder9d475de2012-07-03 16:01:19 -05005501out_err:
Alex Elder642a2532013-05-06 17:40:33 -05005502 rbd_dev->header.features = 0;
Alex Elder1e130192012-07-03 16:01:19 -05005503 kfree(rbd_dev->header.object_prefix);
5504 rbd_dev->header.object_prefix = NULL;
Alex Elder9d475de2012-07-03 16:01:19 -05005505 return ret;
Alex Eldera30b71b2012-07-10 20:30:11 -05005506}
5507
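/*
 * The "onetime" in the name is deliberate: the object prefix,
 * features, striping parameters and data pool fetched above are read
 * only when the image is first probed; refreshes reread only mutable
 * state such as the size and the snapshot context.
 */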
Ilya Dryomov6d69bb532015-10-11 19:38:00 +02005508/*
5509 * @depth is rbd_dev_image_probe() -> rbd_dev_probe_parent() ->
5510 * rbd_dev_image_probe() recursion depth, which means it's also the
5511 * length of the already discovered part of the parent chain.
5512 */
5513static int rbd_dev_probe_parent(struct rbd_device *rbd_dev, int depth)
Alex Elder83a06262012-10-30 15:47:17 -05005514{
Alex Elder2f82ee52012-10-30 19:40:33 -05005515 struct rbd_device *parent = NULL;
Alex Elder124afba2013-04-26 15:44:36 -05005516 int ret;
5517
5518 if (!rbd_dev->parent_spec)
5519 return 0;
Alex Elder124afba2013-04-26 15:44:36 -05005520
Ilya Dryomov6d69bb532015-10-11 19:38:00 +02005521 if (++depth > RBD_MAX_PARENT_CHAIN_LEN) {
5522 pr_info("parent chain is too long (%d)\n", depth);
5523 ret = -EINVAL;
5524 goto out_err;
5525 }
5526
Ilya Dryomov1643dfa2016-08-12 15:45:52 +02005527 parent = __rbd_dev_create(rbd_dev->rbd_client, rbd_dev->parent_spec);
Ilya Dryomov1f2c6652015-10-11 19:38:00 +02005528 if (!parent) {
5529 ret = -ENOMEM;
Alex Elder124afba2013-04-26 15:44:36 -05005530 goto out_err;
Ilya Dryomov1f2c6652015-10-11 19:38:00 +02005531 }
5532
5533 /*
5534 * Images related by parent/child relationships always share
5535 * rbd_client and spec/parent_spec, so bump their refcounts.
5536 */
5537 __rbd_get_client(rbd_dev->rbd_client);
5538 rbd_spec_get(rbd_dev->parent_spec);
Alex Elder124afba2013-04-26 15:44:36 -05005539
Ilya Dryomov6d69bb532015-10-11 19:38:00 +02005540 ret = rbd_dev_image_probe(parent, depth);
Alex Elder124afba2013-04-26 15:44:36 -05005541 if (ret < 0)
5542 goto out_err;
Ilya Dryomov1f2c6652015-10-11 19:38:00 +02005543
Alex Elder124afba2013-04-26 15:44:36 -05005544 rbd_dev->parent = parent;
Alex Eldera2acd002013-05-08 22:50:04 -05005545 atomic_set(&rbd_dev->parent_ref, 1);
Alex Elder124afba2013-04-26 15:44:36 -05005546 return 0;
Alex Elder124afba2013-04-26 15:44:36 -05005547
Ilya Dryomov1f2c6652015-10-11 19:38:00 +02005548out_err:
5549 rbd_dev_unparent(rbd_dev);
Markus Elfring1761b222015-11-23 20:16:45 +01005550 rbd_dev_destroy(parent);
Alex Elder124afba2013-04-26 15:44:36 -05005551 return ret;
5552}
5553
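/*
 * Worked example (chain length limited by RBD_MAX_PARENT_CHAIN_LEN as
 * defined above): mapping a clone of a clone runs
 * rbd_dev_image_probe() at depth 0 for the mapped image, then via
 * this function at depths 1 and 2 for its parent and grandparent,
 * each level sharing the rbd_client and parent spec by reference.
 */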
Ilya Dryomov5769ed02017-04-13 12:17:38 +02005554static void rbd_dev_device_release(struct rbd_device *rbd_dev)
5555{
5556 clear_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags);
5557 rbd_dev_mapping_clear(rbd_dev);
5558 rbd_free_disk(rbd_dev);
5559 if (!single_major)
5560 unregister_blkdev(rbd_dev->major, rbd_dev->name);
5561}
5562
Ilya Dryomov811c6682016-04-15 16:22:16 +02005563/*
5564 * rbd_dev->header_rwsem must be locked for write and will be unlocked
5565 * upon return.
5566 */
Alex Elder200a6a82013-04-28 23:32:34 -05005567static int rbd_dev_device_setup(struct rbd_device *rbd_dev)
Alex Elder124afba2013-04-26 15:44:36 -05005568{
Alex Elder83a06262012-10-30 15:47:17 -05005569 int ret;
Alex Elder83a06262012-10-30 15:47:17 -05005570
Ilya Dryomov9b60e702013-12-13 15:28:57 +02005571 /* Record our major and minor device numbers. */
Alex Elder83a06262012-10-30 15:47:17 -05005572
Ilya Dryomov9b60e702013-12-13 15:28:57 +02005573 if (!single_major) {
5574 ret = register_blkdev(0, rbd_dev->name);
5575 if (ret < 0)
Ilya Dryomov1643dfa2016-08-12 15:45:52 +02005576 goto err_out_unlock;
Ilya Dryomov9b60e702013-12-13 15:28:57 +02005577
5578 rbd_dev->major = ret;
5579 rbd_dev->minor = 0;
5580 } else {
5581 rbd_dev->major = rbd_major;
5582 rbd_dev->minor = rbd_dev_id_to_minor(rbd_dev->dev_id);
5583 }
Alex Elder83a06262012-10-30 15:47:17 -05005584
5585 /* Set up the blkdev mapping. */
5586
5587 ret = rbd_init_disk(rbd_dev);
5588 if (ret)
5589 goto err_out_blkdev;
5590
Alex Elderf35a4de2013-05-06 09:51:29 -05005591 ret = rbd_dev_mapping_set(rbd_dev);
Alex Elder83a06262012-10-30 15:47:17 -05005592 if (ret)
5593 goto err_out_disk;
Ilya Dryomovbc1ecc62014-08-04 18:04:39 +04005594
Alex Elderf35a4de2013-05-06 09:51:29 -05005595 set_capacity(rbd_dev->disk, rbd_dev->mapping.size / SECTOR_SIZE);
Ilya Dryomov9568c932017-10-12 12:35:19 +02005596 set_disk_ro(rbd_dev->disk, rbd_dev->opts->read_only);
Alex Elderf35a4de2013-05-06 09:51:29 -05005597
Ilya Dryomov5769ed02017-04-13 12:17:38 +02005598 ret = dev_set_name(&rbd_dev->dev, "%d", rbd_dev->dev_id);
Alex Elderf35a4de2013-05-06 09:51:29 -05005599 if (ret)
Ilya Dryomovf5ee37b2014-10-09 17:06:01 +04005600 goto err_out_mapping;
Alex Elder83a06262012-10-30 15:47:17 -05005601
Alex Elder129b79d2013-04-26 15:44:36 -05005602 set_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags);
Ilya Dryomov811c6682016-04-15 16:22:16 +02005603 up_write(&rbd_dev->header_rwsem);
Ilya Dryomov5769ed02017-04-13 12:17:38 +02005604 return 0;
Alex Elder2f82ee52012-10-30 19:40:33 -05005605
Alex Elderf35a4de2013-05-06 09:51:29 -05005606err_out_mapping:
5607 rbd_dev_mapping_clear(rbd_dev);
Alex Elder83a06262012-10-30 15:47:17 -05005608err_out_disk:
5609 rbd_free_disk(rbd_dev);
5610err_out_blkdev:
Ilya Dryomov9b60e702013-12-13 15:28:57 +02005611 if (!single_major)
5612 unregister_blkdev(rbd_dev->major, rbd_dev->name);
Ilya Dryomov811c6682016-04-15 16:22:16 +02005613err_out_unlock:
5614 up_write(&rbd_dev->header_rwsem);
Alex Elder83a06262012-10-30 15:47:17 -05005615 return ret;
5616}
5617
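/*
 * Note on device numbers: without the single_major module parameter
 * each mapping registers its own dynamically allocated major and uses
 * minor 0, so the number of mappings is bounded by available majors;
 * with it, all mappings share rbd_major and are told apart by minors
 * derived from their dev_id.
 */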
Alex Elder332bb122013-04-27 09:59:30 -05005618static int rbd_dev_header_name(struct rbd_device *rbd_dev)
5619{
5620 struct rbd_spec *spec = rbd_dev->spec;
Ilya Dryomovc41d13a2016-04-29 20:01:25 +02005621 int ret;
Alex Elder332bb122013-04-27 09:59:30 -05005622
5623 /* Record the header object name for this rbd image. */
5624
5625 rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
Alex Elder332bb122013-04-27 09:59:30 -05005626 if (rbd_dev->image_format == 1)
Ilya Dryomovc41d13a2016-04-29 20:01:25 +02005627 ret = ceph_oid_aprintf(&rbd_dev->header_oid, GFP_KERNEL, "%s%s",
5628 spec->image_name, RBD_SUFFIX);
Alex Elder332bb122013-04-27 09:59:30 -05005629 else
Ilya Dryomovc41d13a2016-04-29 20:01:25 +02005630 ret = ceph_oid_aprintf(&rbd_dev->header_oid, GFP_KERNEL, "%s%s",
5631 RBD_HEADER_PREFIX, spec->image_id);
Alex Elder332bb122013-04-27 09:59:30 -05005632
Ilya Dryomovc41d13a2016-04-29 20:01:25 +02005633 return ret;
Alex Elder332bb122013-04-27 09:59:30 -05005634}
5635
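/*
 * Concretely (prefixes as defined in rbd_types.h; the id is made up):
 * a format 1 image "foo" gets header object "foo.rbd", while a
 * format 2 image with id "1028bc4567" gets header object
 * "rbd_header.1028bc4567".
 */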
Alex Elder200a6a82013-04-28 23:32:34 -05005636static void rbd_dev_image_release(struct rbd_device *rbd_dev)
5637{
Alex Elder6fd48b32013-04-28 23:32:34 -05005638 rbd_dev_unprobe(rbd_dev);
Ilya Dryomovfd22aef2017-04-13 12:17:37 +02005639 if (rbd_dev->opts)
5640 rbd_unregister_watch(rbd_dev);
Alex Elder6fd48b32013-04-28 23:32:34 -05005641 rbd_dev->image_format = 0;
5642 kfree(rbd_dev->spec->image_id);
5643 rbd_dev->spec->image_id = NULL;
Alex Elder200a6a82013-04-28 23:32:34 -05005644}
5645
Alex Eldera30b71b2012-07-10 20:30:11 -05005646/*
5647 * Probe for the existence of the header object for the given rbd
Alex Elder1f3ef782013-05-06 17:40:33 -05005648 * device. If this image is the one being mapped (i.e., not a
5649 * parent), initiate a watch on its header object before using that
5650 * object to get detailed information about the rbd image.
Alex Eldera30b71b2012-07-10 20:30:11 -05005651 */
Ilya Dryomov6d69bb532015-10-11 19:38:00 +02005652static int rbd_dev_image_probe(struct rbd_device *rbd_dev, int depth)
Alex Eldera30b71b2012-07-10 20:30:11 -05005653{
5654 int ret;
5655
5656 /*
Alex Elder3abef3b2013-05-13 20:35:37 -05005657 * Get the id from the image id object. Unless there's an
5658 * error, rbd_dev->spec->image_id will be filled in with
5659 * a dynamically-allocated string, and rbd_dev->image_format
5660 * will be set to either 1 or 2.
Alex Eldera30b71b2012-07-10 20:30:11 -05005661 */
5662 ret = rbd_dev_image_id(rbd_dev);
5663 if (ret)
Alex Elderc0fba362013-04-25 23:15:08 -05005664 return ret;
Alex Elderc0fba362013-04-25 23:15:08 -05005665
Alex Elder332bb122013-04-27 09:59:30 -05005666 ret = rbd_dev_header_name(rbd_dev);
5667 if (ret)
5668 goto err_out_format;
5669
Ilya Dryomov6d69bb532015-10-11 19:38:00 +02005670 if (!depth) {
Ilya Dryomov99d16942016-08-12 16:11:41 +02005671 ret = rbd_register_watch(rbd_dev);
Ilya Dryomov1fe48022015-03-05 10:47:22 +03005672 if (ret) {
5673 if (ret == -ENOENT)
Ilya Dryomovb26c0472018-07-03 15:28:43 +02005674 pr_info("image %s/%s%s%s does not exist\n",
Ilya Dryomov1fe48022015-03-05 10:47:22 +03005675 rbd_dev->spec->pool_name,
Ilya Dryomovb26c0472018-07-03 15:28:43 +02005676 rbd_dev->spec->pool_ns ?: "",
5677 rbd_dev->spec->pool_ns ? "/" : "",
Ilya Dryomov1fe48022015-03-05 10:47:22 +03005678 rbd_dev->spec->image_name);
Ilya Dryomovc41d13a2016-04-29 20:01:25 +02005679 goto err_out_format;
Ilya Dryomov1fe48022015-03-05 10:47:22 +03005680 }
Alex Elder1f3ef782013-05-06 17:40:33 -05005681 }
Alex Elderb644de22013-04-27 09:59:31 -05005682
Ilya Dryomova720ae02014-07-23 17:11:19 +04005683 ret = rbd_dev_header_info(rbd_dev);
Alex Elder5655c4d2013-04-25 23:15:08 -05005684 if (ret)
Alex Elderb644de22013-04-27 09:59:31 -05005685 goto err_out_watch;
Alex Elder83a06262012-10-30 15:47:17 -05005686
Ilya Dryomov04077592014-07-23 17:11:20 +04005687 /*
5688 * If this image is the one being mapped, we have pool name and
5689 * id, image name and id, and snap name - need to fill snap id.
5690 * Otherwise this is a parent image, identified by pool, image
5691 * and snap ids - need to fill in names for those ids.
5692 */
Ilya Dryomov6d69bb532015-10-11 19:38:00 +02005693 if (!depth)
Ilya Dryomov04077592014-07-23 17:11:20 +04005694 ret = rbd_spec_fill_snap_id(rbd_dev);
5695 else
5696 ret = rbd_spec_fill_names(rbd_dev);
Ilya Dryomov1fe48022015-03-05 10:47:22 +03005697 if (ret) {
5698 if (ret == -ENOENT)
Ilya Dryomovb26c0472018-07-03 15:28:43 +02005699 pr_info("snap %s/%s%s%s@%s does not exist\n",
Ilya Dryomov1fe48022015-03-05 10:47:22 +03005700 rbd_dev->spec->pool_name,
Ilya Dryomovb26c0472018-07-03 15:28:43 +02005701 rbd_dev->spec->pool_ns ?: "",
5702 rbd_dev->spec->pool_ns ? "/" : "",
Ilya Dryomov1fe48022015-03-05 10:47:22 +03005703 rbd_dev->spec->image_name,
5704 rbd_dev->spec->snap_name);
Alex Elder33dca392013-04-30 00:44:33 -05005705 goto err_out_probe;
Ilya Dryomov1fe48022015-03-05 10:47:22 +03005706 }
Alex Elder9bb81c92013-04-27 09:59:30 -05005707
Ilya Dryomove8f59b52014-07-24 10:42:13 +04005708 if (rbd_dev->header.features & RBD_FEATURE_LAYERING) {
5709 ret = rbd_dev_v2_parent_info(rbd_dev);
5710 if (ret)
5711 goto err_out_probe;
5712
5713 /*
5714 * Need to warn users if this image is the one being
5715 * mapped and has a parent.
5716 */
Ilya Dryomov6d69bb532015-10-11 19:38:00 +02005717 if (!depth && rbd_dev->parent_spec)
Ilya Dryomove8f59b52014-07-24 10:42:13 +04005718 rbd_warn(rbd_dev,
5719 "WARNING: kernel layering is EXPERIMENTAL!");
5720 }
5721
Ilya Dryomov6d69bb532015-10-11 19:38:00 +02005722 ret = rbd_dev_probe_parent(rbd_dev, depth);
Alex Elder30d60ba2013-05-06 09:51:30 -05005723 if (ret)
5724 goto err_out_probe;
Alex Elder83a06262012-10-30 15:47:17 -05005725
Alex Elder30d60ba2013-05-06 09:51:30 -05005726 dout("discovered format %u image, header name is %s\n",
Ilya Dryomovc41d13a2016-04-29 20:01:25 +02005727 rbd_dev->image_format, rbd_dev->header_oid.name);
Alex Elder30d60ba2013-05-06 09:51:30 -05005728 return 0;
Ilya Dryomove8f59b52014-07-24 10:42:13 +04005729
Alex Elder6fd48b32013-04-28 23:32:34 -05005730err_out_probe:
5731 rbd_dev_unprobe(rbd_dev);
Alex Elderb644de22013-04-27 09:59:31 -05005732err_out_watch:
Ilya Dryomov6d69bb532015-10-11 19:38:00 +02005733 if (!depth)
Ilya Dryomov99d16942016-08-12 16:11:41 +02005734 rbd_unregister_watch(rbd_dev);
Alex Elder332bb122013-04-27 09:59:30 -05005735err_out_format:
5736 rbd_dev->image_format = 0;
Alex Elder5655c4d2013-04-25 23:15:08 -05005737 kfree(rbd_dev->spec->image_id);
5738 rbd_dev->spec->image_id = NULL;
Alex Elder5655c4d2013-04-25 23:15:08 -05005739 return ret;
Alex Eldera30b71b2012-07-10 20:30:11 -05005740}
5741
Ilya Dryomov9b60e702013-12-13 15:28:57 +02005742static ssize_t do_rbd_add(struct bus_type *bus,
5743 const char *buf,
5744 size_t count)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07005745{
Alex Eldercb8627c2012-07-09 21:04:23 -05005746 struct rbd_device *rbd_dev = NULL;
Alex Elderdc79b112012-10-25 23:34:41 -05005747 struct ceph_options *ceph_opts = NULL;
Alex Elder4e9afeb2012-10-25 23:34:41 -05005748 struct rbd_options *rbd_opts = NULL;
Alex Elder859c31d2012-10-25 23:34:42 -05005749 struct rbd_spec *spec = NULL;
Alex Elder9d3997f2012-10-25 23:34:42 -05005750 struct rbd_client *rbdc;
Ilya Dryomovb51c83c2015-10-15 15:38:57 +02005751 int rc;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07005752
5753 if (!try_module_get(THIS_MODULE))
5754 return -ENODEV;
5755
Alex Eldera725f65e2012-02-02 08:13:30 -06005756 /* parse add command */
Alex Elder859c31d2012-10-25 23:34:42 -05005757 rc = rbd_add_parse_args(buf, &ceph_opts, &rbd_opts, &spec);
Alex Elderdc79b112012-10-25 23:34:41 -05005758 if (rc < 0)
Ilya Dryomovdd5ac322015-10-16 17:09:24 +02005759 goto out;
Alex Eldera725f65e2012-02-02 08:13:30 -06005760
Alex Elder9d3997f2012-10-25 23:34:42 -05005761 rbdc = rbd_get_client(ceph_opts);
5762 if (IS_ERR(rbdc)) {
5763 rc = PTR_ERR(rbdc);
Alex Elder0ddebc02012-10-25 23:34:41 -05005764 goto err_out_args;
Alex Elder9d3997f2012-10-25 23:34:42 -05005765 }
Yehuda Sadeh602adf42010-08-12 16:11:25 -07005766
Yehuda Sadeh602adf42010-08-12 16:11:25 -07005767 /* pick the pool */
Ilya Dryomovdd435852018-02-22 13:43:24 +01005768 rc = ceph_pg_poolid_by_name(rbdc->client->osdc.osdmap, spec->pool_name);
Ilya Dryomov1fe48022015-03-05 10:47:22 +03005769 if (rc < 0) {
5770 if (rc == -ENOENT)
5771 pr_info("pool %s does not exist\n", spec->pool_name);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07005772 goto err_out_client;
Ilya Dryomov1fe48022015-03-05 10:47:22 +03005773 }
Alex Elderc0cd10db2013-04-26 09:43:47 -05005774 spec->pool_id = (u64)rc;
Alex Elder859c31d2012-10-25 23:34:42 -05005775
Ilya Dryomovd1475432015-06-22 13:24:48 +03005776 rbd_dev = rbd_dev_create(rbdc, spec, rbd_opts);
Ilya Dryomovb51c83c2015-10-15 15:38:57 +02005777 if (!rbd_dev) {
5778 rc = -ENOMEM;
Alex Elderbd4ba652012-10-25 23:34:42 -05005779 goto err_out_client;
Ilya Dryomovb51c83c2015-10-15 15:38:57 +02005780 }
Alex Elderc53d5892012-10-25 23:34:42 -05005781 rbdc = NULL; /* rbd_dev now owns this */
5782 spec = NULL; /* rbd_dev now owns this */
Ilya Dryomovd1475432015-06-22 13:24:48 +03005783 rbd_opts = NULL; /* rbd_dev now owns this */
Yehuda Sadeh602adf42010-08-12 16:11:25 -07005784
Mike Christie0d6d1e9c2016-08-18 18:38:45 +02005785 rbd_dev->config_info = kstrdup(buf, GFP_KERNEL);
5786 if (!rbd_dev->config_info) {
5787 rc = -ENOMEM;
5788 goto err_out_rbd_dev;
5789 }
5790
Ilya Dryomov811c6682016-04-15 16:22:16 +02005791 down_write(&rbd_dev->header_rwsem);
Ilya Dryomov6d69bb532015-10-11 19:38:00 +02005792 rc = rbd_dev_image_probe(rbd_dev, 0);
Mike Christie0d6d1e9c2016-08-18 18:38:45 +02005793 if (rc < 0) {
5794 up_write(&rbd_dev->header_rwsem);
Alex Elderc53d5892012-10-25 23:34:42 -05005795 goto err_out_rbd_dev;
Mike Christie0d6d1e9c2016-08-18 18:38:45 +02005796 }
Alex Elder05fd6f62012-08-29 17:11:07 -05005797
Alex Elder7ce4eef2013-05-06 17:40:33 -05005798 /* If we are mapping a snapshot it must be marked read-only */
Alex Elder7ce4eef2013-05-06 17:40:33 -05005799 if (rbd_dev->spec->snap_id != CEPH_NOSNAP)
Ilya Dryomov9568c932017-10-12 12:35:19 +02005800 rbd_dev->opts->read_only = true;
Alex Elder7ce4eef2013-05-06 17:40:33 -05005801
Alex Elderb536f692013-04-28 23:32:34 -05005802 rc = rbd_dev_device_setup(rbd_dev);
Ilya Dryomovfd22aef2017-04-13 12:17:37 +02005803 if (rc)
Ilya Dryomov8b679ec2017-04-13 12:17:37 +02005804 goto err_out_image_probe;
Alex Elderb536f692013-04-28 23:32:34 -05005805
Ilya Dryomove010dd02017-04-13 12:17:39 +02005806 if (rbd_dev->opts->exclusive) {
5807 rc = rbd_add_acquire_lock(rbd_dev);
5808 if (rc)
5809 goto err_out_device_setup;
Alex Elderb536f692013-04-28 23:32:34 -05005810 }
5811
Ilya Dryomov5769ed02017-04-13 12:17:38 +02005812 /* Everything's ready. Announce the disk to the world. */
5813
5814 rc = device_add(&rbd_dev->dev);
5815 if (rc)
Ilya Dryomove010dd02017-04-13 12:17:39 +02005816 goto err_out_image_lock;
Ilya Dryomov5769ed02017-04-13 12:17:38 +02005817
5818 add_disk(rbd_dev->disk);
5819 /* see rbd_init_disk() */
5820 blk_put_queue(rbd_dev->disk->queue);
5821
5822 spin_lock(&rbd_dev_list_lock);
5823 list_add_tail(&rbd_dev->node, &rbd_dev_list);
5824 spin_unlock(&rbd_dev_list_lock);
5825
5826 pr_info("%s: capacity %llu features 0x%llx\n", rbd_dev->disk->disk_name,
5827 (unsigned long long)get_capacity(rbd_dev->disk) << SECTOR_SHIFT,
5828 rbd_dev->header.features);
Ilya Dryomovdd5ac322015-10-16 17:09:24 +02005829 rc = count;
5830out:
5831 module_put(THIS_MODULE);
5832 return rc;
Alex Elder3abef3b2013-05-13 20:35:37 -05005833
Ilya Dryomove010dd02017-04-13 12:17:39 +02005834err_out_image_lock:
5835 rbd_dev_image_unlock(rbd_dev);
Ilya Dryomov5769ed02017-04-13 12:17:38 +02005836err_out_device_setup:
5837 rbd_dev_device_release(rbd_dev);
Ilya Dryomov8b679ec2017-04-13 12:17:37 +02005838err_out_image_probe:
5839 rbd_dev_image_release(rbd_dev);
Alex Elderc53d5892012-10-25 23:34:42 -05005840err_out_rbd_dev:
5841 rbd_dev_destroy(rbd_dev);
Alex Elderbd4ba652012-10-25 23:34:42 -05005842err_out_client:
Alex Elder9d3997f2012-10-25 23:34:42 -05005843 rbd_put_client(rbdc);
Alex Elder0ddebc02012-10-25 23:34:41 -05005844err_out_args:
Alex Elder859c31d2012-10-25 23:34:42 -05005845 rbd_spec_put(spec);
Ilya Dryomovd1475432015-06-22 13:24:48 +03005846 kfree(rbd_opts);
Ilya Dryomovdd5ac322015-10-16 17:09:24 +02005847 goto out;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07005848}
5849
Ilya Dryomov9b60e702013-12-13 15:28:57 +02005850static ssize_t rbd_add(struct bus_type *bus,
5851 const char *buf,
5852 size_t count)
5853{
5854 if (single_major)
5855 return -EINVAL;
5856
5857 return do_rbd_add(bus, buf, count);
5858}
5859
5860static ssize_t rbd_add_single_major(struct bus_type *bus,
5861 const char *buf,
5862 size_t count)
5863{
5864 return do_rbd_add(bus, buf, count);
5865}
5866
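/*
 * With the single_major module parameter set, the plain "add" and
 * "remove" bus attributes refuse with -EINVAL and the *_single_major
 * variants do the work instead, so every image maps under the one
 * major registered at module load.
 */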
Alex Elder05a46af2013-04-26 15:44:36 -05005867static void rbd_dev_remove_parent(struct rbd_device *rbd_dev)
5868{
Alex Elderad945fc2013-04-26 15:44:36 -05005869 while (rbd_dev->parent) {
Alex Elder05a46af2013-04-26 15:44:36 -05005870 struct rbd_device *first = rbd_dev;
5871 struct rbd_device *second = first->parent;
5872 struct rbd_device *third;
5873
5874 /*
5875		 * Walk down the chain to the parent that has no
5876		 * grandparent and remove it.
5877 */
5878 while (second && (third = second->parent)) {
5879 first = second;
5880 second = third;
5881 }
Alex Elderad945fc2013-04-26 15:44:36 -05005882 rbd_assert(second);
Alex Elder8ad42cd2013-04-28 23:32:34 -05005883 rbd_dev_image_release(second);
Ilya Dryomov8b679ec2017-04-13 12:17:37 +02005884 rbd_dev_destroy(second);
Alex Elderad945fc2013-04-26 15:44:36 -05005885 first->parent = NULL;
5886 first->parent_overlap = 0;
5887
5888 rbd_assert(first->parent_spec);
Alex Elder05a46af2013-04-26 15:44:36 -05005889 rbd_spec_put(first->parent_spec);
5890 first->parent_spec = NULL;
Alex Elder05a46af2013-04-26 15:44:36 -05005891 }
5892}
5893
Ilya Dryomov9b60e702013-12-13 15:28:57 +02005894static ssize_t do_rbd_remove(struct bus_type *bus,
5895 const char *buf,
5896 size_t count)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07005897{
5898 struct rbd_device *rbd_dev = NULL;
Alex Elder751cc0e2013-05-31 15:17:01 -05005899 struct list_head *tmp;
5900 int dev_id;
Mike Christie0276dca2016-08-18 18:38:45 +02005901 char opt_buf[6];
Alex Elder82a442d2013-05-31 17:40:44 -05005902 bool already = false;
Mike Christie0276dca2016-08-18 18:38:45 +02005903 bool force = false;
Alex Elder0d8189e2013-04-27 09:59:30 -05005904 int ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07005905
Mike Christie0276dca2016-08-18 18:38:45 +02005906 dev_id = -1;
5907 opt_buf[0] = '\0';
5908 sscanf(buf, "%d %5s", &dev_id, opt_buf);
5909 if (dev_id < 0) {
5910 pr_err("dev_id out of range\n");
Yehuda Sadeh602adf42010-08-12 16:11:25 -07005911 return -EINVAL;
Mike Christie0276dca2016-08-18 18:38:45 +02005912 }
5913 if (opt_buf[0] != '\0') {
5914 if (!strcmp(opt_buf, "force")) {
5915 force = true;
5916 } else {
5917 pr_err("bad remove option at '%s'\n", opt_buf);
5918 return -EINVAL;
5919 }
5920 }
Yehuda Sadeh602adf42010-08-12 16:11:25 -07005921
Alex Elder751cc0e2013-05-31 15:17:01 -05005922 ret = -ENOENT;
5923 spin_lock(&rbd_dev_list_lock);
5924 list_for_each(tmp, &rbd_dev_list) {
5925 rbd_dev = list_entry(tmp, struct rbd_device, node);
5926 if (rbd_dev->dev_id == dev_id) {
5927 ret = 0;
5928 break;
5929 }
Yehuda Sadeh602adf42010-08-12 16:11:25 -07005930 }
Alex Elder751cc0e2013-05-31 15:17:01 -05005931 if (!ret) {
5932 spin_lock_irq(&rbd_dev->lock);
Mike Christie0276dca2016-08-18 18:38:45 +02005933 if (rbd_dev->open_count && !force)
Alex Elder751cc0e2013-05-31 15:17:01 -05005934 ret = -EBUSY;
5935 else
Alex Elder82a442d2013-05-31 17:40:44 -05005936 already = test_and_set_bit(RBD_DEV_FLAG_REMOVING,
5937 &rbd_dev->flags);
Alex Elder751cc0e2013-05-31 15:17:01 -05005938 spin_unlock_irq(&rbd_dev->lock);
5939 }
5940 spin_unlock(&rbd_dev_list_lock);
Alex Elder82a442d2013-05-31 17:40:44 -05005941 if (ret < 0 || already)
Alex Elder1ba0f1e2013-05-31 15:17:01 -05005942 return ret;
Alex Elder751cc0e2013-05-31 15:17:01 -05005943
Mike Christie0276dca2016-08-18 18:38:45 +02005944 if (force) {
5945 /*
5946 * Prevent new IO from being queued and wait for existing
5947 * IO to complete/fail.
5948 */
5949 blk_mq_freeze_queue(rbd_dev->disk->queue);
5950 blk_set_queue_dying(rbd_dev->disk->queue);
5951 }
5952
Ilya Dryomov5769ed02017-04-13 12:17:38 +02005953 del_gendisk(rbd_dev->disk);
5954 spin_lock(&rbd_dev_list_lock);
5955 list_del_init(&rbd_dev->node);
5956 spin_unlock(&rbd_dev_list_lock);
5957 device_del(&rbd_dev->dev);
Ilya Dryomovfca27062013-12-16 18:02:40 +02005958
Ilya Dryomove010dd02017-04-13 12:17:39 +02005959 rbd_dev_image_unlock(rbd_dev);
Ilya Dryomovdd5ac322015-10-16 17:09:24 +02005960 rbd_dev_device_release(rbd_dev);
Alex Elder8ad42cd2013-04-28 23:32:34 -05005961 rbd_dev_image_release(rbd_dev);
Ilya Dryomov8b679ec2017-04-13 12:17:37 +02005962 rbd_dev_destroy(rbd_dev);
Alex Elder1ba0f1e2013-05-31 15:17:01 -05005963 return count;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07005964}
5965
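/*
 * Usage sketch (device id invented): a plain unmap is
 *
 *   echo 0 > /sys/bus/rbd/remove
 *
 * which fails with -EBUSY while the device is held open; appending
 * "force", as in "echo '0 force'", freezes the queue and marks it
 * dying first, so outstanding and future I/O fails and the mapping
 * can be torn down despite open users.
 */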
Ilya Dryomov9b60e702013-12-13 15:28:57 +02005966static ssize_t rbd_remove(struct bus_type *bus,
5967 const char *buf,
5968 size_t count)
5969{
5970 if (single_major)
5971 return -EINVAL;
5972
5973 return do_rbd_remove(bus, buf, count);
5974}
5975
5976static ssize_t rbd_remove_single_major(struct bus_type *bus,
5977 const char *buf,
5978 size_t count)
5979{
5980 return do_rbd_remove(bus, buf, count);
5981}
5982
Yehuda Sadeh602adf42010-08-12 16:11:25 -07005983/*
5984 * create control files in sysfs
Yehuda Sadehdfc56062010-11-19 14:51:04 -08005985 * /sys/bus/rbd/...
Yehuda Sadeh602adf42010-08-12 16:11:25 -07005986 */
5987static int rbd_sysfs_init(void)
5988{
Yehuda Sadehdfc56062010-11-19 14:51:04 -08005989 int ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07005990
Alex Elderfed4c142012-02-07 12:03:36 -06005991 ret = device_register(&rbd_root_dev);
Alex Elder21079782012-01-24 10:08:36 -06005992 if (ret < 0)
Yehuda Sadehdfc56062010-11-19 14:51:04 -08005993 return ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07005994
Alex Elderfed4c142012-02-07 12:03:36 -06005995 ret = bus_register(&rbd_bus_type);
5996 if (ret < 0)
5997 device_unregister(&rbd_root_dev);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07005998
Yehuda Sadeh602adf42010-08-12 16:11:25 -07005999 return ret;
6000}
6001
6002static void rbd_sysfs_cleanup(void)
6003{
Yehuda Sadehdfc56062010-11-19 14:51:04 -08006004 bus_unregister(&rbd_bus_type);
Alex Elderfed4c142012-02-07 12:03:36 -06006005 device_unregister(&rbd_root_dev);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07006006}
6007
Alex Elder1c2a9df2013-05-01 12:43:03 -05006008static int rbd_slab_init(void)
6009{
6010 rbd_assert(!rbd_img_request_cache);
Geliang Tang03d94402016-03-13 15:17:32 +08006011 rbd_img_request_cache = KMEM_CACHE(rbd_img_request, 0);
Alex Elder868311b2013-05-01 12:43:03 -05006012 if (!rbd_img_request_cache)
6013 return -ENOMEM;
6014
6015 rbd_assert(!rbd_obj_request_cache);
Geliang Tang03d94402016-03-13 15:17:32 +08006016 rbd_obj_request_cache = KMEM_CACHE(rbd_obj_request, 0);
Alex Elder78c2a442013-05-01 12:43:04 -05006017 if (!rbd_obj_request_cache)
6018 goto out_err;
6019
Ilya Dryomov6c696d82017-01-25 18:16:23 +01006020 return 0;
Alex Elder1c2a9df2013-05-01 12:43:03 -05006021
Ilya Dryomov6c696d82017-01-25 18:16:23 +01006022out_err:
Alex Elder868311b2013-05-01 12:43:03 -05006023 kmem_cache_destroy(rbd_img_request_cache);
6024 rbd_img_request_cache = NULL;
Alex Elder1c2a9df2013-05-01 12:43:03 -05006025 return -ENOMEM;
6026}
6027
6028static void rbd_slab_exit(void)
6029{
Alex Elder868311b2013-05-01 12:43:03 -05006030 rbd_assert(rbd_obj_request_cache);
6031 kmem_cache_destroy(rbd_obj_request_cache);
6032 rbd_obj_request_cache = NULL;
6033
Alex Elder1c2a9df2013-05-01 12:43:03 -05006034 rbd_assert(rbd_img_request_cache);
6035 kmem_cache_destroy(rbd_img_request_cache);
6036 rbd_img_request_cache = NULL;
6037}
6038
Alex Eldercc344fa2013-02-19 12:25:56 -06006039static int __init rbd_init(void)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07006040{
6041 int rc;
6042
Alex Elder1e32d342013-01-30 11:13:33 -06006043 if (!libceph_compatible(NULL)) {
6044 rbd_warn(NULL, "libceph incompatibility (quitting)");
Alex Elder1e32d342013-01-30 11:13:33 -06006045 return -EINVAL;
6046 }
Ilya Dryomove1b4d962013-12-13 15:28:57 +02006047
Alex Elder1c2a9df2013-05-01 12:43:03 -05006048 rc = rbd_slab_init();
Yehuda Sadeh602adf42010-08-12 16:11:25 -07006049 if (rc)
6050 return rc;
Ilya Dryomove1b4d962013-12-13 15:28:57 +02006051
Ilya Dryomovf5ee37b2014-10-09 17:06:01 +04006052 /*
6053 * The number of active work items is limited by the number of
Ilya Dryomovf77303b2015-04-22 18:28:13 +03006054 * rbd devices * queue depth, so leave @max_active at default.
Ilya Dryomovf5ee37b2014-10-09 17:06:01 +04006055 */
6056 rbd_wq = alloc_workqueue(RBD_DRV_NAME, WQ_MEM_RECLAIM, 0);
6057 if (!rbd_wq) {
6058 rc = -ENOMEM;
6059 goto err_out_slab;
6060 }
6061
Ilya Dryomov9b60e702013-12-13 15:28:57 +02006062 if (single_major) {
6063 rbd_major = register_blkdev(0, RBD_DRV_NAME);
6064 if (rbd_major < 0) {
6065 rc = rbd_major;
Ilya Dryomovf5ee37b2014-10-09 17:06:01 +04006066 goto err_out_wq;
Ilya Dryomov9b60e702013-12-13 15:28:57 +02006067 }
6068 }
6069
Alex Elder1c2a9df2013-05-01 12:43:03 -05006070 rc = rbd_sysfs_init();
6071 if (rc)
Ilya Dryomov9b60e702013-12-13 15:28:57 +02006072 goto err_out_blkdev;
Alex Elder1c2a9df2013-05-01 12:43:03 -05006073
Ilya Dryomov9b60e702013-12-13 15:28:57 +02006074 if (single_major)
6075 pr_info("loaded (major %d)\n", rbd_major);
6076 else
6077 pr_info("loaded\n");
6078
Ilya Dryomove1b4d962013-12-13 15:28:57 +02006079 return 0;
6080
Ilya Dryomov9b60e702013-12-13 15:28:57 +02006081err_out_blkdev:
6082 if (single_major)
6083 unregister_blkdev(rbd_major, RBD_DRV_NAME);
Ilya Dryomovf5ee37b2014-10-09 17:06:01 +04006084err_out_wq:
6085 destroy_workqueue(rbd_wq);
Ilya Dryomove1b4d962013-12-13 15:28:57 +02006086err_out_slab:
6087 rbd_slab_exit();
Alex Elder1c2a9df2013-05-01 12:43:03 -05006088 return rc;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07006089}
6090
Alex Eldercc344fa2013-02-19 12:25:56 -06006091static void __exit rbd_exit(void)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07006092{
Ilya Dryomovffe312c2014-05-20 15:46:04 +04006093 ida_destroy(&rbd_dev_id_ida);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07006094 rbd_sysfs_cleanup();
Ilya Dryomov9b60e702013-12-13 15:28:57 +02006095 if (single_major)
6096 unregister_blkdev(rbd_major, RBD_DRV_NAME);
Ilya Dryomovf5ee37b2014-10-09 17:06:01 +04006097 destroy_workqueue(rbd_wq);
Alex Elder1c2a9df2013-05-01 12:43:03 -05006098 rbd_slab_exit();
Yehuda Sadeh602adf42010-08-12 16:11:25 -07006099}
6100
6101module_init(rbd_init);
6102module_exit(rbd_exit);
6103
Alex Elderd552c612013-05-31 20:13:09 -05006104MODULE_AUTHOR("Alex Elder <elder@inktank.com>");
Yehuda Sadeh602adf42010-08-12 16:11:25 -07006105MODULE_AUTHOR("Sage Weil <sage@newdream.net>");
6106MODULE_AUTHOR("Yehuda Sadeh <yehuda@hq.newdream.net>");
Yehuda Sadeh602adf42010-08-12 16:11:25 -07006107/* following authorship retained from original osdblk.c */
6108MODULE_AUTHOR("Jeff Garzik <jeff@garzik.org>");
6109
Ilya Dryomov90da2582013-12-13 15:28:56 +02006110MODULE_DESCRIPTION("RADOS Block Device (RBD) driver");
Yehuda Sadeh602adf42010-08-12 16:11:25 -07006111MODULE_LICENSE("GPL");